-
-
Notifications
You must be signed in to change notification settings - Fork 8.9k
Optimization of data initialization for large sparse datasets #11390
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from 28 commits
11d4b63
4589533
3464f8c
e211ab9
9221573
0a793e3
e249a3b
1be6f5d
396f4b3
55a89d7
8a15c70
085627f
b1e714f
70fd6bc
edef9e7
606c537
6f885b0
c0dbd7e
1cb3693
560a67a
0ac338e
61b3878
98ef541
2b090e6
9f5ba75
8cdd7db
15aa65b
58920a0
0b7037a
820b79a
6d553fb
ea5c4fe
7dc15f2
390efd1
3250cc0
a8df33b
0fcd8a7
e1d0ae9
9f1d131
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||||
|---|---|---|---|---|---|---|---|---|
|
|
@@ -13,6 +13,7 @@ | |||||||
| #include <cstdint> // for uint8_t | ||||||||
| #include <limits> | ||||||||
| #include <memory> | ||||||||
| #include <vector> | ||||||||
| #include <type_traits> // for enable_if_t, is_same_v, is_signed_v | ||||||||
|
|
||||||||
| #include "../data/adapter.h" | ||||||||
|
|
@@ -144,40 +145,83 @@ class DenseColumnIter : public Column<BinIdxT> { | |||||||
| * in a column is below the threshold it's classified as dense column. | ||||||||
| */ | ||||||||
| class ColumnMatrix { | ||||||||
| /** | ||||||||
| * @brief A bit set for indicating whether an element in a dense column is missing. | ||||||||
| */ | ||||||||
| /** | ||||||||
| * @brief A bit set for indicating whether an element in a dense column is missing. | ||||||||
| * Access is carefully managed to ensure thread safety during parallel operations. | ||||||||
| */ | ||||||||
| struct MissingIndicator { | ||||||||
| using BitFieldT = LBitField32; | ||||||||
| using T = typename BitFieldT::value_type; | ||||||||
|
|
||||||||
| BitFieldT missing; | ||||||||
| RefResourceView<T> storage; | ||||||||
| // Feature offset padded to allow concurrent access. | ||||||||
| RefResourceView<std::size_t> feature_offsets_padded; | ||||||||
| static_assert(std::is_same_v<T, std::uint32_t>); | ||||||||
|
|
||||||||
| template <typename U> | ||||||||
| [[nodiscard]] std::enable_if_t<!std::is_signed_v<U>, U> static InitValue(bool init) { | ||||||||
| return init ? ~U{0} : U{0}; | ||||||||
| } | ||||||||
|
|
||||||||
| /** | ||||||||
| * @param feature_offsets Offset of the first element for each feature | ||||||||
| * @param type Type of each column (Dense or Sparse). | ||||||||
| */ | ||||||||
| void InitOffsetsPadded(const RefResourceView<std::size_t>& feature_offsets, | ||||||||
| const RefResourceView<ColumnType>& type) { | ||||||||
| if (feature_offsets_padded.size() != feature_offsets.size()) { | ||||||||
| CHECK(feature_offsets_padded.empty()); | ||||||||
| feature_offsets_padded = common::MakeFixedVecWithMalloc(feature_offsets.size(), | ||||||||
| std::size_t{0}); | ||||||||
| } | ||||||||
|
|
||||||||
| /* | ||||||||
| * For the missing indicator, feature offsets are aligned to be a multiple of | ||||||||
| * BitFieldT::kValueSize (4 bytes). | ||||||||
| * This is a critical requirement for thread-safe access to the bitfield. | ||||||||
| * Each word is processed by one thread. | ||||||||
| */ | ||||||||
| for (std::size_t fid = 1; fid < feature_offsets.size(); ++fid) { | ||||||||
| if (type[fid - 1] == ColumnType::kDenseColumn) { | ||||||||
| std::size_t n_rows = feature_offsets[fid] - feature_offsets[fid - 1]; | ||||||||
| std::size_t n_rows_padded = | ||||||||
| DivRoundUp(n_rows, BitFieldT::kValueSize) * BitFieldT::kValueSize; | ||||||||
| feature_offsets_padded[fid] = feature_offsets_padded[fid - 1] + n_rows_padded; | ||||||||
| } else { | ||||||||
| feature_offsets_padded[fid] = feature_offsets_padded[fid - 1]; | ||||||||
| } | ||||||||
| } | ||||||||
| } | ||||||||
|
|
||||||||
| MissingIndicator() = default; | ||||||||
| /** | ||||||||
| * @param n_elements Size of the bit set | ||||||||
| * @param init Initialize the indicator to true or false. | ||||||||
| */ | ||||||||
| MissingIndicator(std::size_t n_elements, bool init) { | ||||||||
| MissingIndicator(const RefResourceView<std::size_t>& feature_offsets, | ||||||||
| const RefResourceView<ColumnType>& type, bool init) { | ||||||||
| this->InitOffsetsPadded(feature_offsets, type); | ||||||||
| size_t n_elements = feature_offsets_padded.back(); | ||||||||
| auto m_size = missing.ComputeStorageSize(n_elements); | ||||||||
| storage = common::MakeFixedVecWithMalloc(m_size, InitValue<T>(init)); | ||||||||
| this->InitView(); | ||||||||
| } | ||||||||
| /** @brief Set the i^th element to be a valid element (instead of missing). */ | ||||||||
| void SetValid(typename LBitField32::index_type i) { missing.Clear(i); } | ||||||||
| /** @brief Set the i^th element corresponding to feature fid | ||||||||
| * to be a valid element (instead of missing). */ | ||||||||
| void SetValid(typename LBitField32::index_type i, std::size_t fid) { | ||||||||
| missing.Clear(feature_offsets_padded[fid] + i); | ||||||||
| } | ||||||||
| /** @brief assign the storage to the view. */ | ||||||||
| void InitView() { | ||||||||
| missing = LBitField32{Span{storage.data(), static_cast<size_t>(storage.size())}}; | ||||||||
| } | ||||||||
|
|
||||||||
| void GrowTo(std::size_t n_elements, bool init) { | ||||||||
| void GrowTo(const RefResourceView<std::size_t>& feature_offsets, | ||||||||
| const RefResourceView<ColumnType>& type, bool init) { | ||||||||
| this->InitOffsetsPadded(feature_offsets, type); | ||||||||
| size_t n_elements = feature_offsets_padded.back(); | ||||||||
|
|
||||||||
| CHECK(storage.Resource()->Type() == ResourceHandler::kMalloc) | ||||||||
| << "[Internal Error]: Cannot grow the vector when external memory is used."; | ||||||||
| auto m_size = missing.ComputeStorageSize(n_elements); | ||||||||
|
|
@@ -195,34 +239,42 @@ class ColumnMatrix { | |||||||
| } | ||||||||
| }; | ||||||||
|
|
||||||||
| void InitStorage(GHistIndexMatrix const& gmat, double sparse_threshold); | ||||||||
| void InitStorage(GHistIndexMatrix const& gmat, double sparse_threshold, int n_threads); | ||||||||
|
|
||||||||
| template <typename ColumnBinT, typename BinT, typename RIdx> | ||||||||
| void SetBinSparse(BinT bin_id, RIdx rid, bst_feature_t fid, ColumnBinT* local_index) { | ||||||||
| ColumnBinT* begin = &local_index[feature_offsets_[fid]]; | ||||||||
| if (type_[fid] == kDenseColumn) { | ||||||||
| ColumnBinT* begin = &local_index[feature_offsets_[fid]]; | ||||||||
| begin[rid] = bin_id - index_base_[fid]; | ||||||||
| // not thread-safe with bit field. | ||||||||
| // FIXME(jiamingy): We can directly assign kMissingId to the index to avoid missing | ||||||||
| // flags. | ||||||||
| missing_.SetValid(feature_offsets_[fid] + rid); | ||||||||
| missing_.SetValid(rid, fid); | ||||||||
| } else { | ||||||||
| ColumnBinT* begin = &local_index[feature_offsets_[fid]]; | ||||||||
| begin[num_nonzeros_[fid]] = bin_id - index_base_[fid]; | ||||||||
| row_ind_[feature_offsets_[fid] + num_nonzeros_[fid]] = rid; | ||||||||
| ++num_nonzeros_[fid]; | ||||||||
| } | ||||||||
| } | ||||||||
|
|
||||||||
| template <typename ColumnBinT, typename BinT, typename RIdx> | ||||||||
| void SetBinSparse(BinT bin_id, RIdx rid, bst_feature_t fid, ColumnBinT* local_index, size_t nnz) { | ||||||||
| ColumnBinT* begin = &local_index[feature_offsets_[fid]]; | ||||||||
| if (type_[fid] == kDenseColumn) { | ||||||||
| begin[rid] = bin_id - index_base_[fid]; | ||||||||
| missing_.SetValid(rid, fid); | ||||||||
| } else { | ||||||||
| begin[nnz] = bin_id - index_base_[fid]; | ||||||||
| row_ind_[feature_offsets_[fid] + nnz] = rid; | ||||||||
| } | ||||||||
| } | ||||||||
|
|
||||||||
| public: | ||||||||
| // get number of features | ||||||||
| [[nodiscard]] bst_feature_t GetNumFeature() const { | ||||||||
| return static_cast<bst_feature_t>(type_.size()); | ||||||||
| } | ||||||||
|
|
||||||||
| ColumnMatrix() = default; | ||||||||
| ColumnMatrix(GHistIndexMatrix const& gmat, double sparse_threshold) { | ||||||||
| this->InitStorage(gmat, sparse_threshold); | ||||||||
| ColumnMatrix(GHistIndexMatrix const& gmat, double sparse_threshold, int n_threads) { | ||||||||
| this->InitStorage(gmat, sparse_threshold, n_threads); | ||||||||
| } | ||||||||
|
|
||||||||
| /** | ||||||||
|
|
@@ -232,7 +284,7 @@ class ColumnMatrix { | |||||||
| void InitFromSparse(SparsePage const& page, const GHistIndexMatrix& gmat, double sparse_threshold, | ||||||||
| int32_t n_threads) { | ||||||||
| auto batch = data::SparsePageAdapterBatch{page.GetView()}; | ||||||||
| this->InitStorage(gmat, sparse_threshold); | ||||||||
| this->InitStorage(gmat, sparse_threshold, n_threads); | ||||||||
| // ignore base row id here as we always have one column matrix for each sparse page. | ||||||||
| this->PushBatch(n_threads, batch, std::numeric_limits<float>::quiet_NaN(), gmat, 0); | ||||||||
| } | ||||||||
|
|
@@ -283,7 +335,7 @@ class ColumnMatrix { | |||||||
| SetIndexNoMissing(base_rowid, gmat.index.data<RowBinIdxT>(), size, n_features, n_threads); | ||||||||
| }); | ||||||||
| } else { | ||||||||
| SetIndexMixedColumns(base_rowid, batch, gmat, missing); | ||||||||
| SetIndexMixedColumns(base_rowid, batch, gmat, missing, n_threads); | ||||||||
| } | ||||||||
| } | ||||||||
|
|
||||||||
|
|
@@ -316,16 +368,21 @@ class ColumnMatrix { | |||||||
| common::Span<const BinIdxType> bin_index = { | ||||||||
| reinterpret_cast<const BinIdxType*>(&index_[feature_offset * bins_type_size_]), | ||||||||
| column_size}; | ||||||||
| /* | ||||||||
| * Pass the pre-calculated starting offset missing_.feature_offsets_padded[fidx] | ||||||||
| * in the bitfield for this specific feature (fidx). | ||||||||
| */ | ||||||||
| return DenseColumnIter<BinIdxType, any_missing>{ | ||||||||
| bin_index, static_cast<bst_bin_t>(index_base_[fidx]), missing_.missing, feature_offset}; | ||||||||
| bin_index, static_cast<bst_bin_t>(index_base_[fidx]), missing_.missing, | ||||||||
| missing_.feature_offsets_padded[fidx]}; | ||||||||
| } | ||||||||
|
|
||||||||
| // all columns are dense column and has no missing value | ||||||||
| // FIXME(jiamingy): We don't need a column matrix if there's no missing value. | ||||||||
| template <typename RowBinIdxT> | ||||||||
| void SetIndexNoMissing(bst_idx_t base_rowid, RowBinIdxT const* row_index, const size_t n_samples, | ||||||||
| const size_t n_features, int32_t n_threads) { | ||||||||
| missing_.GrowTo(feature_offsets_[n_features], false); | ||||||||
| missing_.GrowTo(feature_offsets_, type_, false); | ||||||||
|
|
||||||||
| DispatchBinType(bins_type_size_, [&](auto t) { | ||||||||
| using ColumnBinT = decltype(t); | ||||||||
|
|
@@ -348,11 +405,11 @@ class ColumnMatrix { | |||||||
| * \brief Set column index for both dense and sparse columns | ||||||||
| */ | ||||||||
| template <typename Batch> | ||||||||
| void SetIndexMixedColumns(size_t base_rowid, Batch const& batch, const GHistIndexMatrix& gmat, | ||||||||
| float missing) { | ||||||||
| void SetIndexMixedColumns(bst_idx_t base_rowid, Batch const& batch, const GHistIndexMatrix& gmat, | ||||||||
| float missing, int n_threads) { | ||||||||
| auto n_features = gmat.Features(); | ||||||||
|
|
||||||||
| missing_.GrowTo(feature_offsets_[n_features], true); | ||||||||
| missing_.GrowTo(feature_offsets_, type_, true); | ||||||||
| auto const* row_index = gmat.index.data<std::uint32_t>() + gmat.row_ptr[base_rowid]; | ||||||||
| if (num_nonzeros_.empty()) { | ||||||||
| num_nonzeros_ = common::MakeFixedVecWithMalloc(n_features, std::size_t{0}); | ||||||||
|
|
@@ -366,19 +423,100 @@ class ColumnMatrix { | |||||||
| using ColumnBinT = decltype(t); | ||||||||
| ColumnBinT* local_index = reinterpret_cast<ColumnBinT*>(index_.data()); | ||||||||
| size_t const batch_size = batch.Size(); | ||||||||
| size_t k{0}; | ||||||||
| for (size_t rid = 0; rid < batch_size; ++rid) { | ||||||||
| auto line = batch.GetLine(rid); | ||||||||
| for (size_t i = 0; i < line.Size(); ++i) { | ||||||||
| auto coo = line.GetElement(i); | ||||||||
| if (is_valid(coo)) { | ||||||||
| auto fid = coo.column_idx; | ||||||||
| const uint32_t bin_id = row_index[k]; | ||||||||
| SetBinSparse(bin_id, rid + base_rowid, fid, local_index); | ||||||||
| ++k; | ||||||||
|
|
||||||||
| // Parallel sparse batch processing | ||||||||
|
razdoburdin marked this conversation as resolved.
Outdated
|
||||||||
| dmlc::OMPException exc; | ||||||||
|
trivialfis marked this conversation as resolved.
|
||||||||
| std::vector<size_t> n_elements((n_threads + 1) * n_features, 0); | ||||||||
| std::vector<size_t> k_offsets(n_threads + 1, 0); | ||||||||
|
razdoburdin marked this conversation as resolved.
Outdated
|
||||||||
| size_t block_size = DivRoundUp(batch_size, n_threads); | ||||||||
|
|
||||||||
| /* | ||||||||
| * We use bitfield as a missing indicator. To ensure thread safe access to the bitfield | ||||||||
| * each underlying word of the bitfield should be processed by a single thread. | ||||||||
| * So we need to align the row-blocks. | ||||||||
| */ | ||||||||
| block_size = DivRoundUp(block_size, MissingIndicator::BitFieldT::kValueSize) * | ||||||||
| MissingIndicator::BitFieldT::kValueSize; | ||||||||
|
razdoburdin marked this conversation as resolved.
Outdated
|
||||||||
| /* | ||||||||
| * If base_rowid > 0 we need to shift the block boundaries. | ||||||||
| * Otherwise, two threads may operate on the same word of the bitfield. | ||||||||
| */ | ||||||||
| size_t shift = MissingIndicator::BitFieldT::kValueSize - | ||||||||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't quite understand how this shifting works. Could you please help clarify it? For starters, this should represent the number of samples each thread needs to shift. How is it related to the bit field value size? How is it related to the module? Why set it to 0 when it equals the value size?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
if For instance, if the first batch had 35 rows,
I have modified the comment to make this logic clearer.
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You are talking about the external memory in XGBoost. Is the bitfield shared across multiple batches of data? Otherwise, the
should not be true since there should be a different column matrix for each batch.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
In the original code, the bitfield is allocated for the total number of elements, but each batch uses its own part. I didn't touch this part in my PR.
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Let me do some digging tomorrow.
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. For the two different cases, see:
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Got it. In case of external memory the function call goes the following:
xgboost/src/data/gradient_index.cc Line 122 in e4406da
xgboost/src/common/column_matrix.h Line 287 in e4406da
So, the value of
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Let me do more tests, probably need some cleanup/comments.
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can we postpone this PR a little bit? It's an optimization for data initialization, and I find it quite difficult to understand. I will merge the inference PR.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We can postpone it, but I think it shouldn't be postponed for a long time. Overwise, resolving of merge conflicts will be extremely tricky. |
||||||||
| (base_rowid % MissingIndicator::BitFieldT::kValueSize); | ||||||||
| if (shift == MissingIndicator::BitFieldT::kValueSize) { | ||||||||
| shift = 0; | ||||||||
| } | ||||||||
|
|
||||||||
| // Parallel row processing for thread-local counting. | ||||||||
| #pragma omp parallel num_threads(n_threads) | ||||||||
| { | ||||||||
| exc.Run([&, is_valid]() { | ||||||||
| int tid = omp_get_thread_num(); | ||||||||
| size_t begin = block_size * tid; | ||||||||
| size_t end = std::min(begin + shift + block_size, batch_size); | ||||||||
| // Apply shift for threads > 0 to maintain word alignment across blocks. | ||||||||
| if (tid > 0) { | ||||||||
| begin += shift; | ||||||||
| } | ||||||||
| for (size_t rid = begin; rid < end; ++rid) { | ||||||||
| const auto& line = batch.GetLine(rid); | ||||||||
| for (size_t i = 0; i < line.Size(); ++i) { | ||||||||
| auto coo = line.GetElement(i); | ||||||||
| if (is_valid(coo)) { | ||||||||
| auto fid = coo.column_idx; | ||||||||
| if ((type_[fid] != kDenseColumn)) { | ||||||||
| n_elements[(tid + 1) * n_features + fid] += 1; | ||||||||
| } | ||||||||
| k_offsets[tid + 1] += 1; | ||||||||
| } | ||||||||
| } | ||||||||
| } | ||||||||
| }); | ||||||||
| } | ||||||||
| exc.Rethrow(); | ||||||||
|
|
||||||||
| // Parallel feature processing to aggregate counts & calculate offsets. | ||||||||
|
razdoburdin marked this conversation as resolved.
|
||||||||
| ParallelFor(n_features, n_threads, [&](auto fid) { | ||||||||
| n_elements[fid] += num_nonzeros_[fid]; | ||||||||
| for (int tid = 0; tid < n_threads; ++tid) { | ||||||||
| n_elements[(tid + 1) * n_features + fid] += | ||||||||
| n_elements[tid * n_features + fid]; | ||||||||
| } | ||||||||
| num_nonzeros_[fid] = n_elements[n_threads * n_features + fid]; | ||||||||
| }); | ||||||||
| std::partial_sum(k_offsets.cbegin(), k_offsets.cend(), k_offsets.begin()); | ||||||||
|
|
||||||||
| // Parallel row processing to place data using offsets into sparse structure. | ||||||||
| #pragma omp parallel num_threads(n_threads) | ||||||||
| { | ||||||||
| std::vector<size_t> nnz_offsets(n_features, 0); | ||||||||
|
razdoburdin marked this conversation as resolved.
|
||||||||
| exc.Run([&, is_valid, base_rowid, row_index]() { | ||||||||
| int tid = omp_get_thread_num(); | ||||||||
| size_t begin = block_size * tid; | ||||||||
| size_t end = std::min(begin + shift + block_size, batch_size); | ||||||||
| // Apply shift for threads > 0 to maintain word alignment across blocks. | ||||||||
| if (tid > 0) { | ||||||||
| begin += shift; | ||||||||
| } | ||||||||
|
|
||||||||
| size_t k = 0; | ||||||||
| for (size_t rid = begin; rid < end; ++rid) { | ||||||||
| const auto& line = batch.GetLine(rid); | ||||||||
| for (size_t i = 0; i < line.Size(); ++i) { | ||||||||
| auto coo = line.GetElement(i); | ||||||||
| if (is_valid(coo)) { | ||||||||
| auto fid = coo.column_idx; | ||||||||
| const uint32_t bin_id = row_index[k_offsets[tid] + k]; | ||||||||
| size_t nnz = n_elements[tid * n_features + fid] + nnz_offsets[fid]; | ||||||||
|
razdoburdin marked this conversation as resolved.
|
||||||||
| SetBinSparse(bin_id, rid + base_rowid, fid, local_index, nnz); | ||||||||
| ++k; | ||||||||
| nnz_offsets[fid] += (type_[fid] != kDenseColumn); | ||||||||
| } | ||||||||
| } | ||||||||
| } | ||||||||
| }); | ||||||||
| } | ||||||||
| exc.Rethrow(); | ||||||||
| }); | ||||||||
| } | ||||||||
|
|
||||||||
|
|
@@ -389,7 +527,7 @@ class ColumnMatrix { | |||||||
| void SetIndexMixedColumns(const GHistIndexMatrix& gmat) { | ||||||||
| auto n_features = gmat.Features(); | ||||||||
|
|
||||||||
| missing_ = MissingIndicator{feature_offsets_[n_features], true}; | ||||||||
| missing_ = MissingIndicator{feature_offsets_, type_, true}; | ||||||||
| num_nonzeros_ = common::MakeFixedVecWithMalloc(n_features, std::size_t{0}); | ||||||||
|
|
||||||||
| DispatchBinType(bins_type_size_, [&](auto t) { | ||||||||
|
|
||||||||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is this function still used now that we have a new
SetBinSparse?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The original SetBinSparse is also used