From d3dcf0f31b162f8e0f7ce074ff35e63edfe30c20 Mon Sep 17 00:00:00 2001 From: Ashutosh Kamble Date: Wed, 11 Feb 2026 01:49:27 +0530 Subject: [PATCH 1/2] Fix SketchBatchNumElements test to clamp with kIntMax --- tests/cpp/common/test_hist_util.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/cpp/common/test_hist_util.cu b/tests/cpp/common/test_hist_util.cu index 3ae9229cc9b5..17cfef50960a 100644 --- a/tests/cpp/common/test_hist_util.cu +++ b/tests/cpp/common/test_hist_util.cu @@ -68,7 +68,7 @@ TEST(HistUtil, SketchBatchNumElements) { auto shape = detail::SketchShape{rows, kCols, rows * kCols}; auto batch = detail::SketchBatchNumElements(detail::UnknownSketchNumElements(), shape, device, 256, false, 0); - ASSERT_EQ(batch, avail_elem); + ASSERT_EQ(batch, std::min(avail_elem, kIntMax)); } TEST(HistUtil, DeviceSketchMemory) { From 4e29f4f68d3d7103b9703de6f4ed2f14cf3c3083 Mon Sep 17 00:00:00 2001 From: Ashutosh Kamble Date: Wed, 11 Feb 2026 18:27:12 +0530 Subject: [PATCH 2/2] Apply pre-commit formatting --- .gitattributes | 2 +- .gitignore | 2 +- CONTRIBUTORS.md | 6 +- cmake/RPackageInstall.cmake.in | 2 +- demo/aft_survival/aft_survival_demo.py | 69 ++-- .../aft_survival_demo_with_optuna.py | 106 +++-- demo/aft_survival/aft_survival_viz_demo.py | 1 - demo/c-api/external-memory/README.md | 2 +- demo/dask/cpu_survival.py | 1 - demo/dask/cpu_training.py | 1 - demo/dask/dask_callbacks.py | 5 +- demo/dask/dask_learning_to_rank.py | 1 - demo/dask/forward_logging.py | 1 - demo/dask/gpu_training.py | 1 - demo/dask/sklearn_cpu_training.py | 1 - demo/dask/sklearn_gpu_training.py | 1 - demo/data/regression/machine.names | 25 +- demo/data/regression/mapfeat.py | 22 +- demo/data/regression/mknfold.py | 8 +- demo/guide-python/basic_walkthrough.py | 4 +- demo/guide-python/boost_from_prediction.py | 1 + demo/guide-python/callbacks.py | 3 +- demo/guide-python/cat_in_the_dat.py | 3 +- demo/guide-python/cat_pipeline.py | 3 +- demo/guide-python/categorical.py | 1 - demo/guide-python/continuation.py | 3 +- demo/guide-python/cover_type.py | 4 +- demo/guide-python/cross_validation.py | 1 - demo/guide-python/custom_rmsle.py | 140 ++++--- demo/guide-python/custom_softmax.py | 3 +- demo/guide-python/evals_result.py | 1 + demo/guide-python/external_memory.py | 3 +- demo/guide-python/feature_weights.py | 3 +- demo/guide-python/gamma_regression.py | 13 +- demo/guide-python/generalized_linear_model.py | 1 + demo/guide-python/gpu_tree_shap.py | 4 +- demo/guide-python/individual_trees.py | 3 +- demo/guide-python/learning_to_rank.py | 3 +- .../multioutput_reduced_gradient.py | 7 +- demo/guide-python/multioutput_regression.py | 3 +- demo/guide-python/predict_first_ntree.py | 4 +- demo/guide-python/predict_leaf_indices.py | 1 + demo/guide-python/quantile_data_iterator.py | 1 - demo/guide-python/quantile_regression.py | 3 +- demo/guide-python/sklearn_evals_result.py | 3 +- demo/guide-python/sklearn_examples.py | 3 +- demo/guide-python/sklearn_parallel.py | 3 +- demo/guide-python/spark_estimator_examples.py | 1 - demo/guide-python/update_process.py | 3 +- demo/kaggle-higgs/higgs-cv.py | 38 +- demo/kaggle-higgs/higgs-numpy.py | 55 +-- demo/kaggle-higgs/higgs-pred.py | 41 +- demo/kaggle-higgs/speedtest.py | 67 +-- demo/multiclass_classification/train.py | 30 +- demo/nvflare/horizontal/custom/controller.py | 39 +- demo/nvflare/horizontal/custom/trainer.py | 73 ++-- demo/nvflare/vertical/custom/controller.py | 39 +- demo/nvflare/vertical/custom/trainer.py | 80 ++-- demo/rmm_plugin/README.rst | 2 +- 
demo/rmm_plugin/rmm_mgpu_with_dask.py | 3 +- demo/rmm_plugin/rmm_singlegpu.py | 3 +- dev/query_contributors.py | 84 ++-- doc/.gitignore | 2 +- doc/_static/js/auto_module_index.js | 1 - doc/contrib/consistency.rst | 2 +- doc/contrib/featuremap.rst | 2 +- doc/contrib/git_guide.rst | 1 - doc/contrib/release.rst | 2 +- doc/python/.gitignore | 2 +- doc/python/data_input.rst | 2 +- doc/python/python_api.rst | 2 +- doc/sphinx_util.py | 14 +- doc/tutorials/index.rst | 2 +- doc/tutorials/privacy_preserving.rst | 4 +- doc/tutorials/saving_model.rst | 2 +- jvm-packages/README.md | 2 +- jvm-packages/checkstyle.xml | 2 +- jvm-packages/create_jni.py | 12 +- jvm-packages/xgboost4j-example/LICENSE | 6 +- jvm-packages/xgboost4j/LICENSE | 6 +- .../ml/dmlc/xgboost4j/java/XGBoostJNI.java | 2 +- .../resources/xgboost4j-version.properties | 2 +- ops/script/pypi_variants.py | 2 +- ops/script/release_artifacts.py | 3 +- ops/script/run_clang_tidy.py | 2 +- python-package/.gitignore | 2 +- python-package/xgboost/core.py | 6 +- python-package/xgboost/sklearn.py | 6 +- python-package/xgboost/spark/core.py | 10 +- python-package/xgboost/testing/__init__.py | 7 +- python-package/xgboost/tracker.py | 1 - tests/cpp/plugin/test_sycl_hist_updater.cc | 2 +- tests/cpp/plugin/test_sycl_lambdarank_obj.cc | 1 - tests/cpp/plugin/test_sycl_linalg.cc | 2 +- .../plugin/test_sycl_quantile_hist_builder.cc | 2 +- tests/python-gpu/conftest.py | 1 - tests/python-gpu/load_pickle.py | 3 +- .../test_device_quantile_dmatrix.py | 3 +- tests/python-gpu/test_from_cudf.py | 1 - tests/python-gpu/test_from_cupy.py | 1 - tests/python-gpu/test_gpu_basic_models.py | 1 - tests/python-gpu/test_gpu_callbacks.py | 1 - tests/python-gpu/test_gpu_data_iterator.py | 3 +- tests/python-gpu/test_gpu_demos.py | 1 - tests/python-gpu/test_gpu_eval_metrics.py | 1 - .../test_gpu_interaction_constraints.py | 1 - tests/python-gpu/test_gpu_intercept.py | 1 - tests/python-gpu/test_gpu_linear.py | 3 +- tests/python-gpu/test_gpu_ordinal.py | 1 - tests/python-gpu/test_gpu_pickling.py | 3 +- tests/python-gpu/test_gpu_plotting.py | 1 - tests/python-gpu/test_gpu_prediction.py | 3 +- tests/python-gpu/test_gpu_ranking.py | 1 - .../test_gpu_training_continuation.py | 1 - tests/python-gpu/test_gpu_updaters.py | 3 +- tests/python-gpu/test_gpu_with_sklearn.py | 1 - tests/python-gpu/test_large_input.py | 1 - .../python-gpu/test_monotonic_constraints.py | 1 - tests/python-sycl/test_sycl_prediction.py | 5 +- tests/python-sycl/test_sycl_simple_dask.py | 5 +- .../test_sycl_training_continuation.py | 3 +- tests/python-sycl/test_sycl_updaters.py | 10 +- tests/python-sycl/test_sycl_with_sklearn.py | 6 +- tests/python/generate_models.py | 3 +- tests/python/test_basic.py | 14 +- tests/python/test_basic_models.py | 1 - tests/python/test_callback.py | 1 - tests/python/test_collective.py | 1 - tests/python/test_config.py | 1 - tests/python/test_data_iterator.py | 3 +- tests/python/test_demos.py | 1 - tests/python/test_dmatrix.py | 3 +- tests/python/test_early_stopping.py | 1 - tests/python/test_eval_metrics.py | 1 - tests/python/test_interaction_constraints.py | 1 - tests/python/test_intercept.py | 1 - tests/python/test_linear.py | 76 ++-- tests/python/test_model_compatibility.py | 1 - tests/python/test_model_io.py | 7 +- tests/python/test_monotone_constraints.py | 3 - tests/python/test_openmp.py | 64 +-- tests/python/test_ordinal.py | 1 - tests/python/test_parse_tree.py | 1 - tests/python/test_pickling.py | 1 - tests/python/test_plotting.py | 1 - tests/python/test_predict.py | 3 +- 
tests/python/test_quantile_dmatrix.py | 3 +- tests/python/test_shap.py | 1 - tests/python/test_survival.py | 67 +-- tests/python/test_tracker.py | 2 - tests/python/test_training_continuation.py | 1 - tests/python/test_tree_regularization.py | 3 +- tests/python/test_with_arrow.py | 1 - tests/python/test_with_modin.py | 84 ++-- tests/python/test_with_pandas.py | 1 - tests/python/test_with_polars.py | 1 - tests/python/test_with_scipy.py | 1 - tests/python/test_with_shap.py | 1 - tests/python/test_with_sklearn.py | 391 ++++++++++++------ tests/python/with_omp_limit.py | 3 +- .../test_federated/test_federated.py | 1 - .../test_gpu_federated/test_gpu_federated.py | 1 - .../test_gpu_with_dask/__init__.py | 1 - .../test_gpu_with_dask/conftest.py | 1 - .../test_gpu_with_dask/test_gpu_demos.py | 1 - .../test_gpu_external_memory.py | 3 +- .../test_gpu_with_dask/test_gpu_ranking.py | 1 - .../test_gpu_with_dask/test_gpu_with_dask.py | 8 +- .../test_gpu_with_spark/test_data.py | 1 - .../test_gpu_with_spark/test_gpu_spark.py | 2 - .../test_with_dask/__init__.py | 1 - .../test_with_dask/test_demos.py | 1 - .../test_with_dask/test_external_memory.py | 1 - .../test_with_spark/test_data.py | 1 - 174 files changed, 1127 insertions(+), 869 deletions(-) diff --git a/.gitattributes b/.gitattributes index 5c71e130ed17..a22b3664e535 100644 --- a/.gitattributes +++ b/.gitattributes @@ -15,4 +15,4 @@ *.rst text eol=lf *.md text eol=lf -*.csv text eol=lf \ No newline at end of file +*.csv text eol=lf diff --git a/.gitignore b/.gitignore index 6dfb1212e135..f6bf2b8935df 100644 --- a/.gitignore +++ b/.gitignore @@ -165,4 +165,4 @@ Rplots.pdf # nsys *.nsys-rep -rmm_log.dev* \ No newline at end of file +rmm_log.dev* diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 926ad43fa6f0..b1f076cb3a8b 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -2,9 +2,9 @@ Contributors of DMLC/XGBoost ============================ XGBoost has been developed and used by a group of active community. Everyone is more than welcomed to is a great way to make the project better and more accessible to more users. -Project Management Committee(PMC) +Project Management Committee(PMC) ---------- -The Project Management Committee(PMC) consists group of active committers that moderate the discussion, manage the project release, and proposes new committer/PMC members. +The Project Management Committee(PMC) consists group of active committers that moderate the discussion, manage the project release, and proposes new committer/PMC members. * [Tianqi Chen](https://github.com/tqchen), University of Washington - Tianqi is a Ph.D. student working on large-scale machine learning. He is the creator of the project. @@ -19,7 +19,7 @@ The Project Management Committee(PMC) consists group of active committers that m * [Hyunsu Cho](http://hyunsu-cho.io/), NVIDIA - Hyunsu is the maintainer of the XGBoost Python package. He also manages the Jenkins continuous integration system (https://xgboost-ci.net/). He is the initial author of the CPU 'hist' updater. * [Rory Mitchell](https://github.com/RAMitchell), University of Waikato - - Rory is a Ph.D. student at University of Waikato. He is the original creator of the GPU training algorithms. He improved the CMake build system and continuous integration. + - Rory is a Ph.D. student at University of Waikato. He is the original creator of the GPU training algorithms. He improved the CMake build system and continuous integration. 
* [Hongliang Liu](https://github.com/phunterlau) diff --git a/cmake/RPackageInstall.cmake.in b/cmake/RPackageInstall.cmake.in index bde4c75c726a..cf0c39b4a9ae 100644 --- a/cmake/RPackageInstall.cmake.in +++ b/cmake/RPackageInstall.cmake.in @@ -31,4 +31,4 @@ set(XGB_DEPS_SCRIPT check_call(COMMAND "${LIBR_EXECUTABLE}" -q -e "${XGB_DEPS_SCRIPT}") # Install the XGBoost R package -check_call(COMMAND "${LIBR_EXECUTABLE}" CMD INSTALL --no-multiarch --build "${build_dir}/R-package") \ No newline at end of file +check_call(COMMAND "${LIBR_EXECUTABLE}" CMD INSTALL --no-multiarch --build "${build_dir}/R-package") diff --git a/demo/aft_survival/aft_survival_demo.py b/demo/aft_survival/aft_survival_demo.py index 93359c83510c..f10119884d16 100644 --- a/demo/aft_survival/aft_survival_demo.py +++ b/demo/aft_survival/aft_survival_demo.py @@ -9,54 +9,63 @@ import numpy as np import pandas as pd -from sklearn.model_selection import ShuffleSplit - import xgboost as xgb +from sklearn.model_selection import ShuffleSplit # The Veterans' Administration Lung Cancer Trial # The Statistical Analysis of Failure Time Data by Kalbfleisch J. and Prentice R (1980) CURRENT_DIR = os.path.dirname(__file__) -df = pd.read_csv(os.path.join(CURRENT_DIR, '../data/veterans_lung_cancer.csv')) -print('Training data:') +df = pd.read_csv(os.path.join(CURRENT_DIR, "../data/veterans_lung_cancer.csv")) +print("Training data:") print(df) # Split features and labels -y_lower_bound = df['Survival_label_lower_bound'] -y_upper_bound = df['Survival_label_upper_bound'] -X = df.drop(['Survival_label_lower_bound', 'Survival_label_upper_bound'], axis=1) +y_lower_bound = df["Survival_label_lower_bound"] +y_upper_bound = df["Survival_label_upper_bound"] +X = df.drop(["Survival_label_lower_bound", "Survival_label_upper_bound"], axis=1) # Split data into training and validation sets -rs = ShuffleSplit(n_splits=2, test_size=.7, random_state=0) +rs = ShuffleSplit(n_splits=2, test_size=0.7, random_state=0) train_index, valid_index = next(rs.split(X)) dtrain = xgb.DMatrix(X.values[train_index, :]) -dtrain.set_float_info('label_lower_bound', y_lower_bound[train_index]) -dtrain.set_float_info('label_upper_bound', y_upper_bound[train_index]) +dtrain.set_float_info("label_lower_bound", y_lower_bound[train_index]) +dtrain.set_float_info("label_upper_bound", y_upper_bound[train_index]) dvalid = xgb.DMatrix(X.values[valid_index, :]) -dvalid.set_float_info('label_lower_bound', y_lower_bound[valid_index]) -dvalid.set_float_info('label_upper_bound', y_upper_bound[valid_index]) +dvalid.set_float_info("label_lower_bound", y_lower_bound[valid_index]) +dvalid.set_float_info("label_upper_bound", y_upper_bound[valid_index]) # Train gradient boosted trees using AFT loss and metric -params = {'verbosity': 0, - 'objective': 'survival:aft', - 'eval_metric': 'aft-nloglik', - 'tree_method': 'hist', - 'learning_rate': 0.05, - 'aft_loss_distribution': 'normal', - 'aft_loss_distribution_scale': 1.20, - 'max_depth': 6, - 'lambda': 0.01, - 'alpha': 0.02} -bst = xgb.train(params, dtrain, num_boost_round=10000, - evals=[(dtrain, 'train'), (dvalid, 'valid')], - early_stopping_rounds=50) +params = { + "verbosity": 0, + "objective": "survival:aft", + "eval_metric": "aft-nloglik", + "tree_method": "hist", + "learning_rate": 0.05, + "aft_loss_distribution": "normal", + "aft_loss_distribution_scale": 1.20, + "max_depth": 6, + "lambda": 0.01, + "alpha": 0.02, +} +bst = xgb.train( + params, + dtrain, + num_boost_round=10000, + evals=[(dtrain, "train"), (dvalid, "valid")], + 
early_stopping_rounds=50, +) # Run prediction on the validation set -df = pd.DataFrame({'Label (lower bound)': y_lower_bound[valid_index], - 'Label (upper bound)': y_upper_bound[valid_index], - 'Predicted label': bst.predict(dvalid)}) +df = pd.DataFrame( + { + "Label (lower bound)": y_lower_bound[valid_index], + "Label (upper bound)": y_upper_bound[valid_index], + "Predicted label": bst.predict(dvalid), + } +) print(df) # Show only data points with right-censored labels -print(df[np.isinf(df['Label (upper bound)'])]) +print(df[np.isinf(df["Label (upper bound)"])]) # Save trained model -bst.save_model('aft_model.json') +bst.save_model("aft_model.json") diff --git a/demo/aft_survival/aft_survival_demo_with_optuna.py b/demo/aft_survival/aft_survival_demo_with_optuna.py index 11c1d32f6b32..2451ae94e017 100644 --- a/demo/aft_survival/aft_survival_demo_with_optuna.py +++ b/demo/aft_survival/aft_survival_demo_with_optuna.py @@ -6,78 +6,108 @@ using Optuna to tune hyperparameters """ + import numpy as np import optuna import pandas as pd -from sklearn.model_selection import ShuffleSplit - import xgboost as xgb +from sklearn.model_selection import ShuffleSplit # The Veterans' Administration Lung Cancer Trial # The Statistical Analysis of Failure Time Data by Kalbfleisch J. and Prentice R (1980) -df = pd.read_csv('../data/veterans_lung_cancer.csv') -print('Training data:') +df = pd.read_csv("../data/veterans_lung_cancer.csv") +print("Training data:") print(df) # Split features and labels -y_lower_bound = df['Survival_label_lower_bound'] -y_upper_bound = df['Survival_label_upper_bound'] -X = df.drop(['Survival_label_lower_bound', 'Survival_label_upper_bound'], axis=1) +y_lower_bound = df["Survival_label_lower_bound"] +y_upper_bound = df["Survival_label_upper_bound"] +X = df.drop(["Survival_label_lower_bound", "Survival_label_upper_bound"], axis=1) # Split data into training and validation sets -rs = ShuffleSplit(n_splits=2, test_size=.7, random_state=0) +rs = ShuffleSplit(n_splits=2, test_size=0.7, random_state=0) train_index, valid_index = next(rs.split(X)) dtrain = xgb.DMatrix(X.values[train_index, :]) -dtrain.set_float_info('label_lower_bound', y_lower_bound[train_index]) -dtrain.set_float_info('label_upper_bound', y_upper_bound[train_index]) +dtrain.set_float_info("label_lower_bound", y_lower_bound[train_index]) +dtrain.set_float_info("label_upper_bound", y_upper_bound[train_index]) dvalid = xgb.DMatrix(X.values[valid_index, :]) -dvalid.set_float_info('label_lower_bound', y_lower_bound[valid_index]) -dvalid.set_float_info('label_upper_bound', y_upper_bound[valid_index]) +dvalid.set_float_info("label_lower_bound", y_lower_bound[valid_index]) +dvalid.set_float_info("label_upper_bound", y_upper_bound[valid_index]) # Define hyperparameter search space -base_params = {'verbosity': 0, - 'objective': 'survival:aft', - 'eval_metric': 'aft-nloglik', - 'tree_method': 'hist'} # Hyperparameters common to all trials +base_params = { + "verbosity": 0, + "objective": "survival:aft", + "eval_metric": "aft-nloglik", + "tree_method": "hist", +} # Hyperparameters common to all trials + + def objective(trial): - params = {'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 1.0), - 'aft_loss_distribution': trial.suggest_categorical('aft_loss_distribution', - ['normal', 'logistic', 'extreme']), - 'aft_loss_distribution_scale': trial.suggest_loguniform('aft_loss_distribution_scale', 0.1, 10.0), - 'max_depth': trial.suggest_int('max_depth', 3, 8), - 'lambda': trial.suggest_loguniform('lambda', 1e-8, 1.0), - 
'alpha': trial.suggest_loguniform('alpha', 1e-8, 1.0)} # Search space + params = { + "learning_rate": trial.suggest_loguniform("learning_rate", 0.01, 1.0), + "aft_loss_distribution": trial.suggest_categorical( + "aft_loss_distribution", ["normal", "logistic", "extreme"] + ), + "aft_loss_distribution_scale": trial.suggest_loguniform( + "aft_loss_distribution_scale", 0.1, 10.0 + ), + "max_depth": trial.suggest_int("max_depth", 3, 8), + "lambda": trial.suggest_loguniform("lambda", 1e-8, 1.0), + "alpha": trial.suggest_loguniform("alpha", 1e-8, 1.0), + } # Search space params.update(base_params) - pruning_callback = optuna.integration.XGBoostPruningCallback(trial, 'valid-aft-nloglik') - bst = xgb.train(params, dtrain, num_boost_round=10000, - evals=[(dtrain, 'train'), (dvalid, 'valid')], - early_stopping_rounds=50, verbose_eval=False, callbacks=[pruning_callback]) + pruning_callback = optuna.integration.XGBoostPruningCallback( + trial, "valid-aft-nloglik" + ) + bst = xgb.train( + params, + dtrain, + num_boost_round=10000, + evals=[(dtrain, "train"), (dvalid, "valid")], + early_stopping_rounds=50, + verbose_eval=False, + callbacks=[pruning_callback], + ) if bst.best_iteration >= 25: return bst.best_score else: return np.inf # Reject models with < 25 trees + # Run hyperparameter search -study = optuna.create_study(direction='minimize') +study = optuna.create_study(direction="minimize") study.optimize(objective, n_trials=200) -print('Completed hyperparameter tuning with best aft-nloglik = {}.'.format(study.best_trial.value)) +print( + "Completed hyperparameter tuning with best aft-nloglik = {}.".format( + study.best_trial.value + ) +) params = {} params.update(base_params) params.update(study.best_trial.params) # Re-run training with the best hyperparameter combination -print('Re-running the best trial... params = {}'.format(params)) -bst = xgb.train(params, dtrain, num_boost_round=10000, - evals=[(dtrain, 'train'), (dvalid, 'valid')], - early_stopping_rounds=50) +print("Re-running the best trial... 
params = {}".format(params)) +bst = xgb.train( + params, + dtrain, + num_boost_round=10000, + evals=[(dtrain, "train"), (dvalid, "valid")], + early_stopping_rounds=50, +) # Run prediction on the validation set -df = pd.DataFrame({'Label (lower bound)': y_lower_bound[valid_index], - 'Label (upper bound)': y_upper_bound[valid_index], - 'Predicted label': bst.predict(dvalid)}) +df = pd.DataFrame( + { + "Label (lower bound)": y_lower_bound[valid_index], + "Label (upper bound)": y_upper_bound[valid_index], + "Predicted label": bst.predict(dvalid), + } +) print(df) # Show only data points with right-censored labels -print(df[np.isinf(df['Label (upper bound)'])]) +print(df[np.isinf(df["Label (upper bound)"])]) # Save trained model -bst.save_model('aft_best_model.json') +bst.save_model("aft_best_model.json") diff --git a/demo/aft_survival/aft_survival_viz_demo.py b/demo/aft_survival/aft_survival_viz_demo.py index 6d2279fdbd42..715f76a0dc69 100644 --- a/demo/aft_survival/aft_survival_viz_demo.py +++ b/demo/aft_survival/aft_survival_viz_demo.py @@ -9,7 +9,6 @@ import matplotlib.pyplot as plt import numpy as np - import xgboost as xgb plt.rcParams.update({"font.size": 13}) diff --git a/demo/c-api/external-memory/README.md b/demo/c-api/external-memory/README.md index e578b535ba21..47cb272d8faa 100644 --- a/demo/c-api/external-memory/README.md +++ b/demo/c-api/external-memory/README.md @@ -13,4 +13,4 @@ In the example, we define a custom data iterator with 2 methods: `reset` and `ne its end, and the `reset` method resets iterations. One important detail when using the C API for data iterator is users need to make sure that the data passed into `next` method must be kept in memory until the next iteration or `reset` is called. The external memory -DMatrix is not limited to training, but also valid for other features like prediction. \ No newline at end of file +DMatrix is not limited to training, but also valid for other features like prediction. 
diff --git a/demo/dask/cpu_survival.py b/demo/dask/cpu_survival.py index 44032bab207f..a9d28d67acc9 100644 --- a/demo/dask/cpu_survival.py +++ b/demo/dask/cpu_survival.py @@ -9,7 +9,6 @@ import dask.array as da import dask.dataframe as dd from dask.distributed import Client, LocalCluster - from xgboost import dask as dxgb from xgboost.dask import DaskDMatrix diff --git a/demo/dask/cpu_training.py b/demo/dask/cpu_training.py index b3a389458987..1febf96cc7a1 100644 --- a/demo/dask/cpu_training.py +++ b/demo/dask/cpu_training.py @@ -6,7 +6,6 @@ from dask import array as da from dask.distributed import Client, LocalCluster - from xgboost import dask as dxgb from xgboost.dask import DaskDMatrix diff --git a/demo/dask/dask_callbacks.py b/demo/dask/dask_callbacks.py index 1a15b918a534..8a1d57eef5f7 100644 --- a/demo/dask/dask_callbacks.py +++ b/demo/dask/dask_callbacks.py @@ -6,12 +6,11 @@ from typing import Any import numpy as np +import xgboost as xgb +import xgboost.dask as dxgb from dask.distributed import Client, LocalCluster from dask_ml.datasets import make_regression from dask_ml.model_selection import train_test_split - -import xgboost as xgb -import xgboost.dask as dxgb from xgboost.dask import DaskDMatrix diff --git a/demo/dask/dask_learning_to_rank.py b/demo/dask/dask_learning_to_rank.py index c08450fec56e..9e4e7bff5152 100644 --- a/demo/dask/dask_learning_to_rank.py +++ b/demo/dask/dask_learning_to_rank.py @@ -25,7 +25,6 @@ from dask import dataframe as dd from distributed import Client, LocalCluster, wait from sklearn.datasets import load_svmlight_file - from xgboost import dask as dxgb diff --git a/demo/dask/forward_logging.py b/demo/dask/forward_logging.py index 37189e8a429a..00fd7e624f9e 100644 --- a/demo/dask/forward_logging.py +++ b/demo/dask/forward_logging.py @@ -14,7 +14,6 @@ from dask import array as da from dask_cuda import LocalCUDACluster from distributed import Client - from xgboost import dask as dxgb from xgboost.callback import EvaluationMonitor diff --git a/demo/dask/gpu_training.py b/demo/dask/gpu_training.py index 2de5b245f7d8..9c673ee990d9 100644 --- a/demo/dask/gpu_training.py +++ b/demo/dask/gpu_training.py @@ -9,7 +9,6 @@ from dask import dataframe as dd from dask.distributed import Client from dask_cuda import LocalCUDACluster - from xgboost import dask as dxgb from xgboost.dask import DaskDMatrix diff --git a/demo/dask/sklearn_cpu_training.py b/demo/dask/sklearn_cpu_training.py index 38a53c6ca71c..1825ae851a7f 100644 --- a/demo/dask/sklearn_cpu_training.py +++ b/demo/dask/sklearn_cpu_training.py @@ -5,7 +5,6 @@ from dask import array as da from dask.distributed import Client, LocalCluster - from xgboost import dask as dxgb diff --git a/demo/dask/sklearn_gpu_training.py b/demo/dask/sklearn_gpu_training.py index 56f1be7151c4..3f0d3a70edc2 100644 --- a/demo/dask/sklearn_gpu_training.py +++ b/demo/dask/sklearn_gpu_training.py @@ -9,7 +9,6 @@ # It's recommended to use dask_cuda for GPU assignment from dask_cuda import LocalCUDACluster - from xgboost import dask as dxgb diff --git a/demo/data/regression/machine.names b/demo/data/regression/machine.names index f19a21827976..b087e58fd32d 100644 --- a/demo/data/regression/machine.names +++ b/demo/data/regression/machine.names @@ -1,17 +1,17 @@ -1. Title: Relative CPU Performance Data +1. Title: Relative CPU Performance Data 2. 
Source Information -- Creators: Phillip Ein-Dor and Jacob Feldmesser - -- Ein-Dor: Faculty of Management; Tel Aviv University; Ramat-Aviv; + -- Ein-Dor: Faculty of Management; Tel Aviv University; Ramat-Aviv; Tel Aviv, 69978; Israel - -- Donor: David W. Aha (aha@ics.uci.edu) (714) 856-8779 + -- Donor: David W. Aha (aha@ics.uci.edu) (714) 856-8779 -- Date: October, 1987 - + 3. Past Usage: 1. Ein-Dor and Feldmesser (CACM 4/87, pp 308-317) - -- Results: + -- Results: -- linear regression prediction of relative cpu performance - -- Recorded 34% average deviation from actual values + -- Recorded 34% average deviation from actual values 2. Kibler,D. & Aha,D. (1988). Instance-Based Prediction of Real-Valued Attributes. In Proceedings of the CSCSI (Canadian AI) Conference. @@ -25,16 +25,16 @@ using a linear regression method. See their article (pp 308-313) for more details on how the relative performance values were set. -5. Number of Instances: 209 +5. Number of Instances: 209 -6. Number of Attributes: 10 (6 predictive attributes, 2 non-predictive, +6. Number of Attributes: 10 (6 predictive attributes, 2 non-predictive, 1 goal field, and the linear regression's guess) 7. Attribute Information: - 1. vendor name: 30 - (adviser, amdahl,apollo, basf, bti, burroughs, c.r.d, cambex, cdc, dec, - dg, formation, four-phase, gould, honeywell, hp, ibm, ipl, magnuson, - microdata, nas, ncr, nixdorf, perkin-elmer, prime, siemens, sperry, + 1. vendor name: 30 + (adviser, amdahl,apollo, basf, bti, burroughs, c.r.d, cambex, cdc, dec, + dg, formation, four-phase, gould, honeywell, hp, ibm, ipl, magnuson, + microdata, nas, ncr, nixdorf, perkin-elmer, prime, siemens, sperry, sratus, wang) 2. Model Name: many unique symbols 3. MYCT: machine cycle time in nanoseconds (integer) @@ -69,4 +69,3 @@ Summary Statistics: CHMAX: 0 176 18.2 26.0 0.6052 PRP: 6 1150 105.6 160.8 1.0000 ERP: 15 1238 99.3 154.8 0.9665 - diff --git a/demo/data/regression/mapfeat.py b/demo/data/regression/mapfeat.py index 1e0318e99ebb..01633820551e 100755 --- a/demo/data/regression/mapfeat.py +++ b/demo/data/regression/mapfeat.py @@ -1,33 +1,31 @@ #!/usr/bin/env python3 -fo = open('machine.txt', 'w') +fo = open("machine.txt", "w") cnt = 6 fmap = {} -for l in open('machine.data'): - arr = l.split(',') +for l in open("machine.data"): + arr = l.split(",") fo.write(arr[8]) for i in range(0, 6): - fo.write(' %d:%s' % (i, arr[i + 2])) + fo.write(" %d:%s" % (i, arr[i + 2])) if arr[0] not in fmap: fmap[arr[0]] = cnt cnt += 1 - fo.write(' %d:1' % fmap[arr[0]]) - fo.write('\n') + fo.write(" %d:1" % fmap[arr[0]]) + fo.write("\n") fo.close() # create feature map for machine data -fo = open('featmap.txt', 'w') +fo = open("featmap.txt", "w") # list from machine.names -names = [ - 'vendor', 'MYCT', 'MMIN', 'MMAX', 'CACH', 'CHMIN', 'CHMAX', 'PRP', 'ERP' -] +names = ["vendor", "MYCT", "MMIN", "MMAX", "CACH", "CHMIN", "CHMAX", "PRP", "ERP"] for i in range(0, 6): - fo.write('%d\t%s\tint\n' % (i, names[i + 1])) + fo.write("%d\t%s\tint\n" % (i, names[i + 1])) for v, k in sorted(fmap.items(), key=lambda x: x[1]): - fo.write('%d\tvendor=%s\ti\n' % (k, v)) + fo.write("%d\tvendor=%s\ti\n" % (k, v)) fo.close() diff --git a/demo/data/regression/mknfold.py b/demo/data/regression/mknfold.py index 14b5ab4e9c9b..316680947051 100755 --- a/demo/data/regression/mknfold.py +++ b/demo/data/regression/mknfold.py @@ -4,7 +4,7 @@ import sys if len(sys.argv) < 2: - print('Usage: [nfold = 5]') + print("Usage: [nfold = 5]") exit(0) random.seed(10) @@ -15,9 +15,9 @@ else: nfold = 5 -fi = 
open(sys.argv[1], 'r') -ftr = open(sys.argv[1] + '.train', 'w') -fte = open(sys.argv[1] + '.test', 'w') +fi = open(sys.argv[1], "r") +ftr = open(sys.argv[1] + ".train", "w") +fte = open(sys.argv[1] + ".test", "w") for l in fi: if random.randint(1, nfold) == k: fte.write(l) diff --git a/demo/guide-python/basic_walkthrough.py b/demo/guide-python/basic_walkthrough.py index 90318f5fe150..bb9ec421e5b7 100644 --- a/demo/guide-python/basic_walkthrough.py +++ b/demo/guide-python/basic_walkthrough.py @@ -9,13 +9,13 @@ See :doc:`/python/python_intro` and :doc:`/tutorials/index` for other references. """ + import os import pickle import numpy as np -from sklearn.datasets import load_svmlight_file - import xgboost as xgb +from sklearn.datasets import load_svmlight_file # Make sure the demo knows where to load the data. CURRENT_DIR = os.path.dirname(os.path.abspath(__file__)) diff --git a/demo/guide-python/boost_from_prediction.py b/demo/guide-python/boost_from_prediction.py index 13f91d7c8b73..334ae3b8410d 100644 --- a/demo/guide-python/boost_from_prediction.py +++ b/demo/guide-python/boost_from_prediction.py @@ -2,6 +2,7 @@ Demo for boosting from prediction ================================= """ + import os import xgboost as xgb diff --git a/demo/guide-python/callbacks.py b/demo/guide-python/callbacks.py index 2f8ac5c792d8..795b23fb7495 100644 --- a/demo/guide-python/callbacks.py +++ b/demo/guide-python/callbacks.py @@ -11,12 +11,11 @@ from typing import Dict import numpy as np +import xgboost as xgb from matplotlib import pyplot as plt from sklearn.datasets import load_breast_cancer from sklearn.model_selection import train_test_split -import xgboost as xgb - class Plotting(xgb.callback.TrainingCallback): """Plot evaluation result during training. Only for demonstration purpose as it's diff --git a/demo/guide-python/cat_in_the_dat.py b/demo/guide-python/cat_in_the_dat.py index 2b2d20682d39..47028e0d7d9a 100644 --- a/demo/guide-python/cat_in_the_dat.py +++ b/demo/guide-python/cat_in_the_dat.py @@ -28,11 +28,10 @@ from time import time import pandas as pd +import xgboost as xgb from sklearn.metrics import roc_auc_score from sklearn.model_selection import train_test_split -import xgboost as xgb - def load_cat_in_the_dat() -> tuple[pd.DataFrame, pd.Series]: """Assuming you have already downloaded the data into `input` directory.""" diff --git a/demo/guide-python/cat_pipeline.py b/demo/guide-python/cat_pipeline.py index e4ec2a5cdcae..385b31a738a7 100644 --- a/demo/guide-python/cat_pipeline.py +++ b/demo/guide-python/cat_pipeline.py @@ -23,13 +23,12 @@ import numpy as np import pandas as pd +import xgboost as xgb from sklearn.compose import make_column_selector, make_column_transformer from sklearn.model_selection import train_test_split from sklearn.pipeline import make_pipeline from sklearn.preprocessing import OrdinalEncoder -import xgboost as xgb - def make_example_data() -> Tuple[pd.DataFrame, pd.Series, List[str]]: """Generate data for demo.""" diff --git a/demo/guide-python/categorical.py b/demo/guide-python/categorical.py index d42fb8b7755d..ddf065bc55e6 100644 --- a/demo/guide-python/categorical.py +++ b/demo/guide-python/categorical.py @@ -22,7 +22,6 @@ import numpy as np import pandas as pd - import xgboost as xgb diff --git a/demo/guide-python/continuation.py b/demo/guide-python/continuation.py index e32c486651c8..de4b258fcfee 100644 --- a/demo/guide-python/continuation.py +++ b/demo/guide-python/continuation.py @@ -7,9 +7,8 @@ import pickle import tempfile -from sklearn.datasets import 
load_breast_cancer - import xgboost +from sklearn.datasets import load_breast_cancer def training_continuation(tmpdir: str, use_pickle: bool) -> None: diff --git a/demo/guide-python/cover_type.py b/demo/guide-python/cover_type.py index 20d2a81d0b45..2ef0a4fbde02 100644 --- a/demo/guide-python/cover_type.py +++ b/demo/guide-python/cover_type.py @@ -12,14 +12,14 @@ cupy and cuml. These libraries are not strictly required. """ + import time import cupy as cp +import xgboost as xgb from cuml.model_selection import train_test_split from sklearn.datasets import fetch_covtype -import xgboost as xgb - # Fetch dataset using sklearn X, y = fetch_covtype(return_X_y=True) X = cp.array(X) diff --git a/demo/guide-python/cross_validation.py b/demo/guide-python/cross_validation.py index 27736aa6f406..815216d86d9f 100644 --- a/demo/guide-python/cross_validation.py +++ b/demo/guide-python/cross_validation.py @@ -7,7 +7,6 @@ from typing import Any, Dict, Tuple import numpy as np - import xgboost as xgb # load data in do training diff --git a/demo/guide-python/custom_rmsle.py b/demo/guide-python/custom_rmsle.py index c958b298d1e1..32de4d22be78 100644 --- a/demo/guide-python/custom_rmsle.py +++ b/demo/guide-python/custom_rmsle.py @@ -14,20 +14,20 @@ compare its performance with standard squared error. """ + import argparse from time import time from typing import Dict, List, Tuple import numpy as np -from matplotlib import pyplot as plt - import xgboost as xgb +from matplotlib import pyplot as plt # shape of generated data. kRows = 4096 kCols = 16 -kOutlier = 10000 # mean of generated outliers +kOutlier = 10000 # mean of generated outliers kNumberOfOutliers = 64 kRatio = 0.7 @@ -39,14 +39,14 @@ def generate_data() -> Tuple[xgb.DMatrix, xgb.DMatrix]: - '''Generate data containing outliers.''' + """Generate data containing outliers.""" x = np.random.randn(kRows, kCols) y = np.random.randn(kRows) y += np.abs(np.min(y)) # Create outliers for i in range(0, kNumberOfOutliers): - ind = np.random.randint(0, len(y)-1) + ind = np.random.randint(0, len(y) - 1) y[ind] += np.random.randint(0, kOutlier) train_portion = int(kRows * kRatio) @@ -54,8 +54,8 @@ def generate_data() -> Tuple[xgb.DMatrix, xgb.DMatrix]: # rmsle requires all label be greater than -1. 
assert np.all(y > -1.0) - train_x: np.ndarray = x[: train_portion] - train_y: np.ndarray = y[: train_portion] + train_x: np.ndarray = x[:train_portion] + train_y: np.ndarray = y[:train_portion] dtrain = xgb.DMatrix(train_x, label=train_y) test_x = x[train_portion:] @@ -64,93 +64,101 @@ def generate_data() -> Tuple[xgb.DMatrix, xgb.DMatrix]: return dtrain, dtest -def native_rmse(dtrain: xgb.DMatrix, - dtest: xgb.DMatrix) -> Dict[str, Dict[str, List[float]]]: - '''Train using native implementation of Root Mean Squared Loss.''' - print('Squared Error') +def native_rmse( + dtrain: xgb.DMatrix, dtest: xgb.DMatrix +) -> Dict[str, Dict[str, List[float]]]: + """Train using native implementation of Root Mean Squared Loss.""" + print("Squared Error") squared_error = { - 'objective': 'reg:squarederror', - 'eval_metric': 'rmse', - 'tree_method': 'hist', - 'seed': kSeed + "objective": "reg:squarederror", + "eval_metric": "rmse", + "tree_method": "hist", + "seed": kSeed, } start = time() results: Dict[str, Dict[str, List[float]]] = {} - xgb.train(squared_error, - dtrain=dtrain, - num_boost_round=kBoostRound, - evals=[(dtrain, 'dtrain'), (dtest, 'dtest')], - evals_result=results) - print('Finished Squared Error in:', time() - start, '\n') + xgb.train( + squared_error, + dtrain=dtrain, + num_boost_round=kBoostRound, + evals=[(dtrain, "dtrain"), (dtest, "dtest")], + evals_result=results, + ) + print("Finished Squared Error in:", time() - start, "\n") return results -def native_rmsle(dtrain: xgb.DMatrix, - dtest: xgb.DMatrix) -> Dict[str, Dict[str, List[float]]]: - '''Train using native implementation of Squared Log Error.''' - print('Squared Log Error') +def native_rmsle( + dtrain: xgb.DMatrix, dtest: xgb.DMatrix +) -> Dict[str, Dict[str, List[float]]]: + """Train using native implementation of Squared Log Error.""" + print("Squared Log Error") results: Dict[str, Dict[str, List[float]]] = {} squared_log_error = { - 'objective': 'reg:squaredlogerror', - 'eval_metric': 'rmsle', - 'tree_method': 'hist', - 'seed': kSeed + "objective": "reg:squaredlogerror", + "eval_metric": "rmsle", + "tree_method": "hist", + "seed": kSeed, } start = time() - xgb.train(squared_log_error, - dtrain=dtrain, - num_boost_round=kBoostRound, - evals=[(dtrain, 'dtrain'), (dtest, 'dtest')], - evals_result=results) - print('Finished Squared Log Error in:', time() - start) + xgb.train( + squared_log_error, + dtrain=dtrain, + num_boost_round=kBoostRound, + evals=[(dtrain, "dtrain"), (dtest, "dtest")], + evals_result=results, + ) + print("Finished Squared Log Error in:", time() - start) return results def py_rmsle(dtrain: xgb.DMatrix, dtest: xgb.DMatrix) -> Dict: - '''Train using Python implementation of Squared Log Error.''' + """Train using Python implementation of Squared Log Error.""" + def gradient(predt: np.ndarray, dtrain: xgb.DMatrix) -> np.ndarray: - '''Compute the gradient squared log error.''' + """Compute the gradient squared log error.""" y = dtrain.get_label() return (np.log1p(predt) - np.log1p(y)) / (predt + 1) def hessian(predt: np.ndarray, dtrain: xgb.DMatrix) -> np.ndarray: - '''Compute the hessian for squared log error.''' + """Compute the hessian for squared log error.""" y = dtrain.get_label() - return ((-np.log1p(predt) + np.log1p(y) + 1) / - np.power(predt + 1, 2)) + return (-np.log1p(predt) + np.log1p(y) + 1) / np.power(predt + 1, 2) - def squared_log(predt: np.ndarray, - dtrain: xgb.DMatrix) -> Tuple[np.ndarray, np.ndarray]: - '''Squared Log Error objective. 
A simplified version for RMSLE used as + def squared_log( + predt: np.ndarray, dtrain: xgb.DMatrix + ) -> Tuple[np.ndarray, np.ndarray]: + """Squared Log Error objective. A simplified version for RMSLE used as objective function. :math:`\frac{1}{2}[log(pred + 1) - log(label + 1)]^2` - ''' + """ predt[predt < -1] = -1 + 1e-6 grad = gradient(predt, dtrain) hess = hessian(predt, dtrain) return grad, hess def rmsle(predt: np.ndarray, dtrain: xgb.DMatrix) -> Tuple[str, float]: - ''' Root mean squared log error metric. + """Root mean squared log error metric. :math:`\\sqrt{\frac{1}{N}[log(pred + 1) - log(label + 1)]^2}` - ''' + """ y = dtrain.get_label() predt[predt < -1] = -1 + 1e-6 elements = np.power(np.log1p(y) - np.log1p(predt), 2) - return 'PyRMSLE', float(np.sqrt(np.sum(elements) / len(y))) + return "PyRMSLE", float(np.sqrt(np.sum(elements) / len(y))) results: Dict[str, Dict[str, List[float]]] = {} - xgb.train({'tree_method': 'hist', 'seed': kSeed, - 'disable_default_eval_metric': 1}, - dtrain=dtrain, - num_boost_round=kBoostRound, - obj=squared_log, - custom_metric=rmsle, - evals=[(dtrain, 'dtrain'), (dtest, 'dtest')], - evals_result=results) + xgb.train( + {"tree_method": "hist", "seed": kSeed, "disable_default_eval_metric": 1}, + dtrain=dtrain, + num_boost_round=kBoostRound, + obj=squared_log, + custom_metric=rmsle, + evals=[(dtrain, "dtrain"), (dtest, "dtest")], + evals_result=results, + ) return results @@ -158,7 +166,7 @@ def rmsle(predt: np.ndarray, dtrain: xgb.DMatrix) -> Tuple[str, float]: def plot_history( rmse_evals: Dict[str, Dict], rmsle_evals: Dict[str, Dict], - py_rmsle_evals: Dict[str, Dict] + py_rmsle_evals: Dict[str, Dict], ) -> None: fig, axs = plt.subplots(3, 1) assert isinstance(axs, np.ndarray) @@ -168,16 +176,16 @@ def plot_history( x = np.arange(0, kBoostRound, 1) - ax0.plot(x, rmse_evals['dtrain']['rmse'], label='train-RMSE') - ax0.plot(x, rmse_evals['dtest']['rmse'], label='test-RMSE') + ax0.plot(x, rmse_evals["dtrain"]["rmse"], label="train-RMSE") + ax0.plot(x, rmse_evals["dtest"]["rmse"], label="test-RMSE") ax0.legend() - ax1.plot(x, rmsle_evals['dtrain']['rmsle'], label='train-native-RMSLE') - ax1.plot(x, rmsle_evals['dtest']['rmsle'], label='test-native-RMSLE') + ax1.plot(x, rmsle_evals["dtrain"]["rmsle"], label="train-native-RMSLE") + ax1.plot(x, rmsle_evals["dtest"]["rmsle"], label="test-native-RMSLE") ax1.legend() - ax2.plot(x, py_rmsle_evals['dtrain']['PyRMSLE'], label='train-PyRMSLE') - ax2.plot(x, py_rmsle_evals['dtest']['PyRMSLE'], label='test-PyRMSLE') + ax2.plot(x, py_rmsle_evals["dtrain"]["PyRMSLE"], label="train-PyRMSLE") + ax2.plot(x, py_rmsle_evals["dtest"]["PyRMSLE"], label="test-PyRMSLE") ax2.legend() @@ -194,11 +202,13 @@ def main(args: argparse.Namespace) -> None: if __name__ == "__main__": parser = argparse.ArgumentParser( - description='Arguments for custom RMSLE objective function demo.') + description="Arguments for custom RMSLE objective function demo." 
+ ) parser.add_argument( - '--plot', + "--plot", type=int, default=1, - help='Set to 0 to disable plotting the evaluation history.') + help="Set to 0 to disable plotting the evaluation history.", + ) args = parser.parse_args() main(args) diff --git a/demo/guide-python/custom_softmax.py b/demo/guide-python/custom_softmax.py index 207b38d01f37..082460c39a33 100644 --- a/demo/guide-python/custom_softmax.py +++ b/demo/guide-python/custom_softmax.py @@ -15,9 +15,8 @@ from typing import Dict, Tuple import numpy as np -from matplotlib import pyplot as plt - import xgboost as xgb +from matplotlib import pyplot as plt np.random.seed(1994) diff --git a/demo/guide-python/evals_result.py b/demo/guide-python/evals_result.py index 2ea853a090b1..1de0b19e2835 100644 --- a/demo/guide-python/evals_result.py +++ b/demo/guide-python/evals_result.py @@ -2,6 +2,7 @@ This script demonstrate how to access the eval metrics ====================================================== """ + import os from typing import Any, Dict diff --git a/demo/guide-python/external_memory.py b/demo/guide-python/external_memory.py index 5d57aa646f79..b0e249145996 100644 --- a/demo/guide-python/external_memory.py +++ b/demo/guide-python/external_memory.py @@ -40,9 +40,8 @@ from typing import TYPE_CHECKING, Callable, List, Literal, Tuple import numpy as np -from sklearn.datasets import make_regression - import xgboost +from sklearn.datasets import make_regression if TYPE_CHECKING: from cuda.bindings.runtime import cudaError_t diff --git a/demo/guide-python/feature_weights.py b/demo/guide-python/feature_weights.py index b12edb9415ec..b7687674f4e3 100644 --- a/demo/guide-python/feature_weights.py +++ b/demo/guide-python/feature_weights.py @@ -8,9 +8,8 @@ import argparse import numpy as np -from matplotlib import pyplot as plt - import xgboost +from matplotlib import pyplot as plt def main(args: argparse.Namespace) -> None: diff --git a/demo/guide-python/gamma_regression.py b/demo/guide-python/gamma_regression.py index 74d256990092..f2327eebc051 100644 --- a/demo/guide-python/gamma_regression.py +++ b/demo/guide-python/gamma_regression.py @@ -2,28 +2,31 @@ Demo for gamma regression ========================= """ -import numpy as np +import numpy as np import xgboost as xgb # this script demonstrates how to fit gamma regression model (with log link function) # in xgboost, before running the demo you need to generate the autoclaims dataset # by running gen_autoclaims.R located in xgboost/demo/data. 
-data = np.genfromtxt('../data/autoclaims.csv', delimiter=',') +data = np.genfromtxt("../data/autoclaims.csv", delimiter=",") dtrain = xgb.DMatrix(data[0:4741, 0:34], data[0:4741, 34]) dtest = xgb.DMatrix(data[4741:6773, 0:34], data[4741:6773, 34]) # for gamma regression, we need to set the objective to 'reg:gamma', it also suggests # to set the base_score to a value between 1 to 5 if the number of iteration is small -param = {'objective':'reg:gamma', 'booster':'gbtree', 'base_score':3} +param = {"objective": "reg:gamma", "booster": "gbtree", "base_score": 3} # the rest of settings are the same -watchlist = [(dtest, 'eval'), (dtrain, 'train')] +watchlist = [(dtest, "eval"), (dtrain, "train")] num_round = 30 # training and evaluation bst = xgb.train(param, dtrain, num_round, watchlist) preds = bst.predict(dtest) labels = dtest.get_label() -print('test deviance=%f' % (2 * np.sum((labels - preds) / preds - np.log(labels) + np.log(preds)))) +print( + "test deviance=%f" + % (2 * np.sum((labels - preds) / preds - np.log(labels) + np.log(preds))) +) diff --git a/demo/guide-python/generalized_linear_model.py b/demo/guide-python/generalized_linear_model.py index 3387b1982acb..17d6b12e3709 100644 --- a/demo/guide-python/generalized_linear_model.py +++ b/demo/guide-python/generalized_linear_model.py @@ -2,6 +2,7 @@ Demo for GLM ============ """ + import os import xgboost as xgb diff --git a/demo/guide-python/gpu_tree_shap.py b/demo/guide-python/gpu_tree_shap.py index f0e772ff3cc4..d257411467bb 100644 --- a/demo/guide-python/gpu_tree_shap.py +++ b/demo/guide-python/gpu_tree_shap.py @@ -5,12 +5,12 @@ Demonstrates using GPU acceleration to compute SHAP values for feature importance. """ + from urllib.error import HTTPError import shap -from sklearn.datasets import fetch_california_housing, make_regression - import xgboost as xgb +from sklearn.datasets import fetch_california_housing, make_regression # Fetch dataset using sklearn try: diff --git a/demo/guide-python/individual_trees.py b/demo/guide-python/individual_trees.py index b10fabf64a15..009911a32511 100644 --- a/demo/guide-python/individual_trees.py +++ b/demo/guide-python/individual_trees.py @@ -6,11 +6,10 @@ import os import numpy as np +import xgboost as xgb from scipy.special import logit from sklearn.datasets import load_svmlight_file -import xgboost as xgb - CURRENT_DIR = os.path.dirname(__file__) train = os.path.join(CURRENT_DIR, "../data/agaricus.txt.train") test = os.path.join(CURRENT_DIR, "../data/agaricus.txt.test") diff --git a/demo/guide-python/learning_to_rank.py b/demo/guide-python/learning_to_rank.py index fbc1f44baf50..e07cd51700a9 100644 --- a/demo/guide-python/learning_to_rank.py +++ b/demo/guide-python/learning_to_rank.py @@ -25,9 +25,8 @@ import numpy as np import pandas as pd -from sklearn.datasets import load_svmlight_file - import xgboost as xgb +from sklearn.datasets import load_svmlight_file from xgboost.testing.data import RelDataCV, simulate_clicks, sort_ltr_samples diff --git a/demo/guide-python/multioutput_reduced_gradient.py b/demo/guide-python/multioutput_reduced_gradient.py index 29e828806f7a..e4747b5c8562 100644 --- a/demo/guide-python/multioutput_reduced_gradient.py +++ b/demo/guide-python/multioutput_reduced_gradient.py @@ -18,10 +18,9 @@ from typing import Tuple import numpy as np +import xgboost as xgb from sklearn.base import BaseEstimator from sklearn.datasets import make_regression - -import xgboost as xgb from xgboost.objective import TreeObjective @@ -79,8 +78,8 @@ def split_grad( ) -> Tuple[np.ndarray, 
np.ndarray]: svd = svd_class(self.device) if self.device == "cuda": - grad = grad.get() # type: ignore - hess = hess.get() # type: ignore + grad = grad.get() # type: ignore + hess = hess.get() # type: ignore svd.fit(grad) grad = svd.transform(grad) diff --git a/demo/guide-python/multioutput_regression.py b/demo/guide-python/multioutput_regression.py index 9908251a0496..4c66205945d8 100644 --- a/demo/guide-python/multioutput_regression.py +++ b/demo/guide-python/multioutput_regression.py @@ -22,9 +22,8 @@ import matplotlib import numpy as np -from matplotlib import pyplot as plt - import xgboost as xgb +from matplotlib import pyplot as plt def plot_predt( diff --git a/demo/guide-python/predict_first_ntree.py b/demo/guide-python/predict_first_ntree.py index 312522fc5b6a..69a9b62adaff 100644 --- a/demo/guide-python/predict_first_ntree.py +++ b/demo/guide-python/predict_first_ntree.py @@ -2,12 +2,12 @@ Demo for prediction using number of trees ========================================= """ + import os import numpy as np -from sklearn.datasets import load_svmlight_file - import xgboost as xgb +from sklearn.datasets import load_svmlight_file CURRENT_DIR = os.path.dirname(__file__) train = os.path.join(CURRENT_DIR, "../data/agaricus.txt.train") diff --git a/demo/guide-python/predict_leaf_indices.py b/demo/guide-python/predict_leaf_indices.py index 627619724f96..cb30d4183491 100644 --- a/demo/guide-python/predict_leaf_indices.py +++ b/demo/guide-python/predict_leaf_indices.py @@ -2,6 +2,7 @@ Demo for obtaining leaf index ============================= """ + import os import xgboost as xgb diff --git a/demo/guide-python/quantile_data_iterator.py b/demo/guide-python/quantile_data_iterator.py index 4753d5c5083a..b9ea0250e9f8 100644 --- a/demo/guide-python/quantile_data_iterator.py +++ b/demo/guide-python/quantile_data_iterator.py @@ -25,7 +25,6 @@ import cupy import numpy - import xgboost COLS = 64 diff --git a/demo/guide-python/quantile_regression.py b/demo/guide-python/quantile_regression.py index f331d5a21d3e..f7691fd32bd7 100644 --- a/demo/guide-python/quantile_regression.py +++ b/demo/guide-python/quantile_regression.py @@ -18,9 +18,8 @@ from typing import Dict import numpy as np -from sklearn.model_selection import train_test_split - import xgboost as xgb +from sklearn.model_selection import train_test_split def f(x: np.ndarray) -> np.ndarray: diff --git a/demo/guide-python/sklearn_evals_result.py b/demo/guide-python/sklearn_evals_result.py index 781ab81af722..3ac68e3773c8 100644 --- a/demo/guide-python/sklearn_evals_result.py +++ b/demo/guide-python/sklearn_evals_result.py @@ -4,9 +4,8 @@ """ import numpy as np -from sklearn.datasets import make_hastie_10_2 - import xgboost as xgb +from sklearn.datasets import make_hastie_10_2 X, y = make_hastie_10_2(n_samples=2000, random_state=42) diff --git a/demo/guide-python/sklearn_examples.py b/demo/guide-python/sklearn_examples.py index 4e0392988cb3..c697ce79b6d3 100644 --- a/demo/guide-python/sklearn_examples.py +++ b/demo/guide-python/sklearn_examples.py @@ -14,6 +14,7 @@ from urllib.error import HTTPError import numpy as np +import xgboost as xgb from sklearn.datasets import ( fetch_california_housing, load_digits, @@ -23,8 +24,6 @@ from sklearn.metrics import confusion_matrix, mean_squared_error from sklearn.model_selection import GridSearchCV, KFold, train_test_split -import xgboost as xgb - rng = np.random.RandomState(31337) print("Zeros and Ones from the Digits dataset: binary classification") diff --git a/demo/guide-python/sklearn_parallel.py 
b/demo/guide-python/sklearn_parallel.py index 2f62d2b48ca3..212c2f18f024 100644 --- a/demo/guide-python/sklearn_parallel.py +++ b/demo/guide-python/sklearn_parallel.py @@ -6,11 +6,10 @@ import multiprocessing from urllib.error import HTTPError +import xgboost as xgb from sklearn.datasets import fetch_california_housing, make_regression from sklearn.model_selection import GridSearchCV -import xgboost as xgb - if __name__ == "__main__": print("Parallel Parameter optimization") try: diff --git a/demo/guide-python/spark_estimator_examples.py b/demo/guide-python/spark_estimator_examples.py index 2437d2fd59a2..93844a7f005a 100644 --- a/demo/guide-python/spark_estimator_examples.py +++ b/demo/guide-python/spark_estimator_examples.py @@ -12,7 +12,6 @@ from pyspark.sql import DataFrame, SparkSession from pyspark.sql.functions import rand from sklearn.model_selection import train_test_split - from xgboost.spark import SparkXGBClassifier, SparkXGBRegressor spark = SparkSession.builder.master("local[*]").getOrCreate() diff --git a/demo/guide-python/update_process.py b/demo/guide-python/update_process.py index 2ef6e7fc31c9..70ea6eb99562 100644 --- a/demo/guide-python/update_process.py +++ b/demo/guide-python/update_process.py @@ -10,9 +10,8 @@ from urllib.error import HTTPError import numpy as np -from sklearn.datasets import fetch_california_housing, make_regression - import xgboost as xgb +from sklearn.datasets import fetch_california_housing, make_regression def main() -> None: diff --git a/demo/kaggle-higgs/higgs-cv.py b/demo/kaggle-higgs/higgs-cv.py index 75b8202dfcb4..1a901337d161 100755 --- a/demo/kaggle-higgs/higgs-cv.py +++ b/demo/kaggle-higgs/higgs-cv.py @@ -1,26 +1,32 @@ #!/usr/bin/python import numpy as np - import xgboost as xgb ### load data in do training -train = np.loadtxt('./data/training.csv', delimiter=',', skiprows=1, converters={32: lambda x:int(x=='s'.encode('utf-8')) } ) -label = train[:,32] -data = train[:,1:31] -weight = train[:,31] -dtrain = xgb.DMatrix( data, label=label, missing = -999.0, weight=weight ) -param = {'max_depth':6, 'eta':0.1, 'objective':'binary:logitraw', 'nthread':4} +train = np.loadtxt( + "./data/training.csv", + delimiter=",", + skiprows=1, + converters={32: lambda x: int(x == "s".encode("utf-8"))}, +) +label = train[:, 32] +data = train[:, 1:31] +weight = train[:, 31] +dtrain = xgb.DMatrix(data, label=label, missing=-999.0, weight=weight) +param = {"max_depth": 6, "eta": 0.1, "objective": "binary:logitraw", "nthread": 4} num_round = 120 -print ('running cross validation, with preprocessing function') +print("running cross validation, with preprocessing function") + + # define the preprocessing function # used to return the preprocessed training, test data, and parameter # we can use this to do weight rescale, etc. 
# as a example, we try to set scale_pos_weight def fpreproc(dtrain, dtest, param): label = dtrain.get_label() - ratio = float(np.sum(label == 0)) / np.sum(label==1) - param['scale_pos_weight'] = ratio + ratio = float(np.sum(label == 0)) / np.sum(label == 1) + param["scale_pos_weight"] = ratio wtrain = dtrain.get_weight() wtest = dtest.get_weight() sum_weight = sum(wtrain) + sum(wtest) @@ -30,9 +36,17 @@ def fpreproc(dtrain, dtest, param): dtest.set_weight(wtest) return (dtrain, dtest, param) + # do cross validation, for each fold # the dtrain, dtest, param will be passed into fpreproc # then the return value of fpreproc will be used to generate # results of that fold -xgb.cv(param, dtrain, num_round, nfold=5, - metrics={'ams@0.15', 'auc'}, seed = 0, fpreproc = fpreproc) +xgb.cv( + param, + dtrain, + num_round, + nfold=5, + metrics={"ams@0.15", "auc"}, + seed=0, + fpreproc=fpreproc, +) diff --git a/demo/kaggle-higgs/higgs-numpy.py b/demo/kaggle-higgs/higgs-numpy.py index 41c44c9352bb..f18f13d2c12b 100755 --- a/demo/kaggle-higgs/higgs-numpy.py +++ b/demo/kaggle-higgs/higgs-numpy.py @@ -1,53 +1,60 @@ #!/usr/bin/python # this is the example script to use xgboost to train import numpy as np - import xgboost as xgb test_size = 550000 # path to where the data lies -dpath = 'data' +dpath = "data" # load in training data, directly use numpy -dtrain = np.loadtxt( dpath+'/training.csv', delimiter=',', skiprows=1, converters={32: lambda x:int(x=='s'.encode('utf-8')) } ) -print ('finish loading from csv ') - -label = dtrain[:,32] -data = dtrain[:,1:31] +dtrain = np.loadtxt( + dpath + "/training.csv", + delimiter=",", + skiprows=1, + converters={32: lambda x: int(x == "s".encode("utf-8"))}, +) +print("finish loading from csv ") + +label = dtrain[:, 32] +data = dtrain[:, 1:31] # rescale weight to make it same as test set -weight = dtrain[:,31] * float(test_size) / len(label) +weight = dtrain[:, 31] * float(test_size) / len(label) -sum_wpos = sum( weight[i] for i in range(len(label)) if label[i] == 1.0 ) -sum_wneg = sum( weight[i] for i in range(len(label)) if label[i] == 0.0 ) +sum_wpos = sum(weight[i] for i in range(len(label)) if label[i] == 1.0) +sum_wneg = sum(weight[i] for i in range(len(label)) if label[i] == 0.0) # print weight statistics -print ('weight statistics: wpos=%g, wneg=%g, ratio=%g' % ( sum_wpos, sum_wneg, sum_wneg/sum_wpos )) +print( + "weight statistics: wpos=%g, wneg=%g, ratio=%g" + % (sum_wpos, sum_wneg, sum_wneg / sum_wpos) +) # construct xgboost.DMatrix from numpy array, treat -999.0 as missing value -xgmat = xgb.DMatrix( data, label=label, missing = -999.0, weight=weight ) +xgmat = xgb.DMatrix(data, label=label, missing=-999.0, weight=weight) # setup parameters for xgboost param = {} # use logistic regression loss, use raw prediction before logistic transformation # since we only need the rank -param['objective'] = 'binary:logitraw' +param["objective"] = "binary:logitraw" # scale weight of positive examples -param['scale_pos_weight'] = sum_wneg/sum_wpos -param['eta'] = 0.1 -param['max_depth'] = 6 -param['eval_metric'] = 'auc' -param['nthread'] = 16 +param["scale_pos_weight"] = sum_wneg / sum_wpos +param["eta"] = 0.1 +param["max_depth"] = 6 +param["eval_metric"] = "auc" +param["nthread"] = 16 # you can directly throw param in, though we want to watch multiple metrics here -plst = list(param.items())+[('eval_metric', 'ams@0.15')] +plst = list(param.items()) + [("eval_metric", "ams@0.15")] -watchlist = [ (xgmat,'train') ] +watchlist = [(xgmat, "train")] # boost 120 trees num_round = 
120
-print ('loading data end, start to boost trees')
-bst = xgb.train( plst, xgmat, num_round, watchlist );
+print("loading data end, start to boost trees")
+bst = xgb.train(plst, xgmat, num_round, watchlist)

 # save out model
-bst.save_model('higgs.model')
+bst.save_model("higgs.model")

-print ('finish training')
+print("finish training")
diff --git a/demo/kaggle-higgs/higgs-pred.py b/demo/kaggle-higgs/higgs-pred.py
index c14a8a94c06f..17210de06e92 100755
--- a/demo/kaggle-higgs/higgs-pred.py
+++ b/demo/kaggle-higgs/higgs-pred.py
@@ -1,48 +1,47 @@
 #!/usr/bin/python
 # make prediction
 import numpy as np
-
 import xgboost as xgb

 # path to where the data lies
-dpath = 'data'
+dpath = "data"

-modelfile = 'higgs.model'
-outfile = 'higgs.pred.csv'
+modelfile = "higgs.model"
+outfile = "higgs.pred.csv"
 # make top 15% as positive
 threshold_ratio = 0.15

 # load in training data, directly use numpy
-dtest = np.loadtxt( dpath+'/test.csv', delimiter=',', skiprows=1 )
-data = dtest[:,1:31]
-idx = dtest[:,0]
+dtest = np.loadtxt(dpath + "/test.csv", delimiter=",", skiprows=1)
+data = dtest[:, 1:31]
+idx = dtest[:, 0]

-print ('finish loading from csv ')
-xgmat = xgb.DMatrix( data, missing = -999.0 )
-bst = xgb.Booster({'nthread':16}, model_file = modelfile)
-ypred = bst.predict( xgmat )
+print("finish loading from csv")
+xgmat = xgb.DMatrix(data, missing=-999.0)
+bst = xgb.Booster({"nthread": 16}, model_file=modelfile)
+ypred = bst.predict(xgmat)

-res = [ ( int(idx[i]), ypred[i] ) for i in range(len(ypred)) ]
+res = [(int(idx[i]), ypred[i]) for i in range(len(ypred))]

 rorder = {}
-for k, v in sorted( res, key = lambda x:-x[1] ):
-    rorder[ k ] = len(rorder) + 1
+for k, v in sorted(res, key=lambda x: -x[1]):
+    rorder[k] = len(rorder) + 1

 # write out predictions
-ntop = int( threshold_ratio * len(rorder ) )
-fo = open(outfile, 'w')
+ntop = int(threshold_ratio * len(rorder))
+fo = open(outfile, "w")
 nhit = 0
 ntot = 0
-fo.write('EventId,RankOrder,Class\n')
+fo.write("EventId,RankOrder,Class\n")
 for k, v in res:
     if rorder[k] <= ntop:
-        lb = 's'
+        lb = "s"
         nhit += 1
     else:
-        lb = 'b'
+        lb = "b"
     # change output rank order to follow Kaggle convention
-    fo.write('%s,%d,%s\n' % ( k, len(rorder)+1-rorder[k], lb ) )
+    fo.write("%s,%d,%s\n" % (k, len(rorder) + 1 - rorder[k], lb))
     ntot += 1
 fo.close()
-print ('finished writing into prediction file')
+print("finished writing into prediction file")
diff --git a/demo/kaggle-higgs/speedtest.py b/demo/kaggle-higgs/speedtest.py
index be101e8b5acb..afd63308b217 100755
--- a/demo/kaggle-higgs/speedtest.py
+++ b/demo/kaggle-higgs/speedtest.py
@@ -3,63 +3,70 @@
 import time

 import numpy as np
-from sklearn.ensemble import GradientBoostingClassifier
-
 import xgboost as xgb
+from sklearn.ensemble import GradientBoostingClassifier

 test_size = 550000

 # path to where the data lies
-dpath = 'data'
+dpath = "data"

 # load in training data, directly use numpy
-dtrain = np.loadtxt( dpath+'/training.csv', delimiter=',', skiprows=1, converters={32: lambda x:int(x=='s') } )
-print ('finish loading from csv ')
+dtrain = np.loadtxt(
+    dpath + "/training.csv",
+    delimiter=",",
+    skiprows=1,
+    converters={32: lambda x: int(x == "s")},
+)
+print("finish loading from csv")

-label = dtrain[:,32]
-data = dtrain[:,1:31]
+label = dtrain[:, 32]
+data = dtrain[:, 1:31]
 # rescale weight to make it same as test set
-weight = dtrain[:,31] * float(test_size) / len(label)
+weight = dtrain[:, 31] * float(test_size) / len(label)

-sum_wpos = sum( weight[i] for i in range(len(label)) if label[i] == 1.0 )
-sum_wneg = sum( weight[i] for i in range(len(label)) if label[i] == 0.0 )
+sum_wpos = sum(weight[i] for i in range(len(label)) if label[i] == 1.0)
+sum_wneg = sum(weight[i] for i in range(len(label)) if label[i] == 0.0)

 # print weight statistics
-print ('weight statistics: wpos=%g, wneg=%g, ratio=%g' % ( sum_wpos, sum_wneg, sum_wneg/sum_wpos ))
+print(
+    "weight statistics: wpos=%g, wneg=%g, ratio=%g"
+    % (sum_wpos, sum_wneg, sum_wneg / sum_wpos)
+)

 # construct xgboost.DMatrix from numpy array, treat -999.0 as missing value
-xgmat = xgb.DMatrix( data, label=label, missing = -999.0, weight=weight )
+xgmat = xgb.DMatrix(data, label=label, missing=-999.0, weight=weight)

 # setup parameters for xgboost
 param = {}
 # use logistic regression loss
-param['objective'] = 'binary:logitraw'
+param["objective"] = "binary:logitraw"
 # scale weight of positive examples
-param['scale_pos_weight'] = sum_wneg/sum_wpos
-param['bst:eta'] = 0.1
-param['bst:max_depth'] = 6
-param['eval_metric'] = 'auc'
-param['nthread'] = 4
+param["scale_pos_weight"] = sum_wneg / sum_wpos
+param["bst:eta"] = 0.1
+param["bst:max_depth"] = 6
+param["eval_metric"] = "auc"
+param["nthread"] = 4

-plst = param.items()+[('eval_metric', 'ams@0.15')]
+plst = list(param.items()) + [("eval_metric", "ams@0.15")]

-watchlist = [ (xgmat,'train') ]
+watchlist = [(xgmat, "train")]
 # boost 10 trees
 num_round = 10
-print ('loading data end, start to boost trees')
-print ("training GBM from sklearn")
+print("loading data end, start to boost trees")
+print("training GBM from sklearn")
 tmp = time.time()
 gbm = GradientBoostingClassifier(n_estimators=num_round, max_depth=6, verbose=2)
 gbm.fit(data, label)
-print ("sklearn.GBM costs: %s seconds" % str(time.time() - tmp))
-#raw_input()
-print ("training xgboost")
+print("sklearn.GBM costs: %s seconds" % str(time.time() - tmp))
+# raw_input()
+print("training xgboost")
 threads = [1, 2, 4, 16]
 for i in threads:
-    param['nthread'] = i
+    param["nthread"] = i
     tmp = time.time()
-    plst = param.items()+[('eval_metric', 'ams@0.15')]
-    bst = xgb.train( plst, xgmat, num_round, watchlist );
-    print ("XGBoost with %d thread costs: %s seconds" % (i, str(time.time() - tmp)))
+    plst = list(param.items()) + [("eval_metric", "ams@0.15")]
+    bst = xgb.train(plst, xgmat, num_round, watchlist)
+    print("XGBoost with %d thread costs: %s seconds" % (i, str(time.time() - tmp)))

-print ('finish training')
+print("finish training")
diff --git a/demo/multiclass_classification/train.py b/demo/multiclass_classification/train.py
index a261c20a2a98..e6a4a3e9105b 100755
--- a/demo/multiclass_classification/train.py
+++ b/demo/multiclass_classification/train.py
@@ -3,16 +3,18 @@
 from __future__ import division

 import numpy as np
-
 import xgboost as xgb

 # label need to be 0 to num_class -1
-data = np.loadtxt('./dermatology.data', delimiter=',',
-                  converters={33: lambda x:int(x == '?'), 34: lambda x:int(x) - 1})
+data = np.loadtxt(
+    "./dermatology.data",
+    delimiter=",",
+    converters={33: lambda x: int(x == "?"), 34: lambda x: int(x) - 1},
+)
 sz = data.shape

-train = data[:int(sz[0] * 0.7), :]
-test = data[int(sz[0] * 0.7):, :]
+train = data[: int(sz[0] * 0.7), :]
+test = data[int(sz[0] * 0.7) :, :]

 train_X = train[:, :33]
 train_Y = train[:, 34]
@@ -25,27 +27,27 @@
 # setup parameters for xgboost
 param = {}
 # use softmax multi-class classification
-param['objective'] = 'multi:softmax'
+param["objective"] = "multi:softmax"
 # scale weight of positive examples
-param['eta'] = 0.1
-param['max_depth'] = 6
-param['nthread'] = 4
-param['num_class'] = 6
+param["eta"] = 0.1
+param["max_depth"] = 6
+param["nthread"] = 4
+param["num_class"] = 6

-watchlist = [(xg_train, 'train'), (xg_test, 'test')]
+watchlist = [(xg_train, "train"), (xg_test, "test")]
 num_round = 5
 bst = xgb.train(param, xg_train, num_round, watchlist)
 # get prediction
 pred = bst.predict(xg_test)
 error_rate = np.sum(pred != test_Y) / test_Y.shape[0]
-print('Test error using softmax = {}'.format(error_rate))
+print("Test error using softmax = {}".format(error_rate))

 # do the same thing again, but output probabilities
-param['objective'] = 'multi:softprob'
+param["objective"] = "multi:softprob"
 bst = xgb.train(param, xg_train, num_round, watchlist)
 # Note: this convention has been changed since xgboost-unity
 # get prediction, this is in 1D array, need reshape to (ndata, nclass)
 pred_prob = bst.predict(xg_test).reshape(test_Y.shape[0], 6)
 pred_label = np.argmax(pred_prob, axis=1)
 error_rate = np.sum(pred_label != test_Y) / test_Y.shape[0]
-print('Test error using softprob = {}'.format(error_rate))
+print("Test error using softprob = {}".format(error_rate))
diff --git a/demo/nvflare/horizontal/custom/controller.py b/demo/nvflare/horizontal/custom/controller.py
index dd3e39f46cf3..ad03d09fdef4 100644
--- a/demo/nvflare/horizontal/custom/controller.py
+++ b/demo/nvflare/horizontal/custom/controller.py
@@ -2,8 +2,10 @@
 Example of training controller with NVFlare
 ===========================================
 """
+
 import multiprocessing

+import xgboost.federated
 from nvflare.apis.client import Client
 from nvflare.apis.fl_context import FLContext
 from nvflare.apis.impl.controller import Controller, Task
@@ -11,12 +13,16 @@
 from nvflare.apis.signal import Signal
 from trainer import SupportedTasks

-import xgboost.federated
-

 class XGBoostController(Controller):
-    def __init__(self, port: int, world_size: int, server_key_path: str,
-                 server_cert_path: str, client_cert_path: str):
+    def __init__(
+        self,
+        port: int,
+        world_size: int,
+        server_key_path: str,
+        server_cert_path: str,
+        client_cert_path: str,
+    ):
         """Controller for federated XGBoost.

         Args:
@@ -37,18 +43,31 @@ def __init__(self, port: int, world_size: int, server_key_path: str,
     def start_controller(self, fl_ctx: FLContext):
         self._server = multiprocessing.Process(
             target=xgboost.federated.run_federated_server,
-            args=(self._port, self._world_size, self._server_key_path,
-                  self._server_cert_path, self._client_cert_path))
+            args=(
+                self._port,
+                self._world_size,
+                self._server_key_path,
+                self._server_cert_path,
+                self._client_cert_path,
+            ),
+        )
         self._server.start()

     def stop_controller(self, fl_ctx: FLContext):
         if self._server:
             self._server.terminate()

-    def process_result_of_unknown_task(self, client: Client, task_name: str,
-                                       client_task_id: str, result: Shareable,
-                                       fl_ctx: FLContext):
-        self.log_warning(fl_ctx, f"Unknown task: {task_name} from client {client.name}.")
+    def process_result_of_unknown_task(
+        self,
+        client: Client,
+        task_name: str,
+        client_task_id: str,
+        result: Shareable,
+        fl_ctx: FLContext,
+    ):
+        self.log_warning(
+            fl_ctx, f"Unknown task: {task_name} from client {client.name}."
+        )

     def control_flow(self, abort_signal: Signal, fl_ctx: FLContext):
         self.log_info(fl_ctx, "XGBoost training control flow started.")
diff --git a/demo/nvflare/horizontal/custom/trainer.py b/demo/nvflare/horizontal/custom/trainer.py
index 4f20b2f39f3f..c26311aea79b 100644
--- a/demo/nvflare/horizontal/custom/trainer.py
+++ b/demo/nvflare/horizontal/custom/trainer.py
@@ -1,12 +1,11 @@
 import os

+import xgboost as xgb
 from nvflare.apis.executor import Executor
 from nvflare.apis.fl_constant import FLContextKey, ReturnCode
 from nvflare.apis.fl_context import FLContext
 from nvflare.apis.shareable import Shareable, make_reply
 from nvflare.apis.signal import Signal
-
-import xgboost as xgb
 from xgboost import callback
@@ -15,8 +14,15 @@ class SupportedTasks(object):

 class XGBoostTrainer(Executor):
-    def __init__(self, server_address: str, world_size: int, server_cert_path: str,
-                 client_key_path: str, client_cert_path: str, use_gpus: bool):
+    def __init__(
+        self,
+        server_address: str,
+        world_size: int,
+        server_cert_path: str,
+        client_key_path: str,
+        client_cert_path: str,
+        use_gpus: bool,
+    ):
         """Trainer for federated XGBoost.

         Args:
@@ -34,8 +40,13 @@ def __init__(self, server_address: str, world_size: int, server_cert_path: str,
         self._client_cert_path = client_cert_path
         self._use_gpus = use_gpus

-    def execute(self, task_name: str, shareable: Shareable, fl_ctx: FLContext,
-                abort_signal: Signal) -> Shareable:
+    def execute(
+        self,
+        task_name: str,
+        shareable: Shareable,
+        fl_ctx: FLContext,
+        abort_signal: Signal,
+    ) -> Shareable:
         self.log_info(fl_ctx, f"Executing {task_name}")
         try:
             if task_name == SupportedTasks.TRAIN:
@@ -45,41 +56,53 @@ def execute(self, task_name: str, shareable: Shareable, fl_ctx: FLContext,
                 self.log_error(fl_ctx, f"{task_name} is not a supported task.")
                 return make_reply(ReturnCode.TASK_UNKNOWN)
         except BaseException as e:
-            self.log_exception(fl_ctx,
-                               f"Task {task_name} failed. Exception: {e.__str__()}")
+            self.log_exception(
+                fl_ctx, f"Task {task_name} failed. Exception: {e.__str__()}"
+            )
             return make_reply(ReturnCode.EXECUTION_EXCEPTION)

     def _do_training(self, fl_ctx: FLContext):
         client_name = fl_ctx.get_prop(FLContextKey.CLIENT_NAME)
-        rank = int(client_name.split('-')[1]) - 1
+        rank = int(client_name.split("-")[1]) - 1
         communicator_env = {
-            'xgboost_communicator': 'federated',
-            'federated_server_address': self._server_address,
-            'federated_world_size': self._world_size,
-            'federated_rank': rank,
-            'federated_server_cert': self._server_cert_path,
-            'federated_client_key': self._client_key_path,
-            'federated_client_cert': self._client_cert_path
+            "xgboost_communicator": "federated",
+            "federated_server_address": self._server_address,
+            "federated_world_size": self._world_size,
+            "federated_rank": rank,
+            "federated_server_cert": self._server_cert_path,
+            "federated_client_key": self._client_key_path,
+            "federated_client_cert": self._client_cert_path,
        }
         with xgb.collective.CommunicatorContext(**communicator_env):
             # Load file, file will not be sharded in federated mode.
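+            # In horizontal (row-wise) federated learning each participant
+            # already holds its own rows, so every worker simply reads its
+            # own local copy of the files below; nothing is partitioned here.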
-            dtrain = xgb.DMatrix('agaricus.txt.train?format=libsvm')
-            dtest = xgb.DMatrix('agaricus.txt.test?format=libsvm')
+            dtrain = xgb.DMatrix("agaricus.txt.train?format=libsvm")
+            dtest = xgb.DMatrix("agaricus.txt.test?format=libsvm")

             # Specify parameters via map, definition are same as c++ version
-            param = {'tree_method': 'hist', 'max_depth': 2, 'eta': 1, 'objective': 'binary:logistic'}
+            param = {
+                "tree_method": "hist",
+                "max_depth": 2,
+                "eta": 1,
+                "objective": "binary:logistic",
+            }
             if self._use_gpus:
-                self.log_info(fl_ctx, f'Training with GPU {rank}')
-                param['device'] = f"cuda:{rank}"
+                self.log_info(fl_ctx, f"Training with GPU {rank}")
+                param["device"] = f"cuda:{rank}"

             # Specify validations set to watch performance
-            watchlist = [(dtest, 'eval'), (dtrain, 'train')]
+            watchlist = [(dtest, "eval"), (dtrain, "train")]
             num_round = 20

             # Run training, all the features in training API is available.
-            bst = xgb.train(param, dtrain, num_round, evals=watchlist,
-                            early_stopping_rounds=2, verbose_eval=False,
-                            callbacks=[callback.EvaluationMonitor(rank=rank)])
+            bst = xgb.train(
+                param,
+                dtrain,
+                num_round,
+                evals=watchlist,
+                early_stopping_rounds=2,
+                verbose_eval=False,
+                callbacks=[callback.EvaluationMonitor(rank=rank)],
+            )

             # Save the model.
             workspace = fl_ctx.get_prop(FLContextKey.WORKSPACE_OBJECT)
diff --git a/demo/nvflare/vertical/custom/controller.py b/demo/nvflare/vertical/custom/controller.py
index dd3e39f46cf3..ad03d09fdef4 100644
--- a/demo/nvflare/vertical/custom/controller.py
+++ b/demo/nvflare/vertical/custom/controller.py
@@ -2,8 +2,10 @@
 Example of training controller with NVFlare
 ===========================================
 """
+
 import multiprocessing

+import xgboost.federated
 from nvflare.apis.client import Client
 from nvflare.apis.fl_context import FLContext
 from nvflare.apis.impl.controller import Controller, Task
@@ -11,12 +13,16 @@
 from nvflare.apis.signal import Signal
 from trainer import SupportedTasks

-import xgboost.federated
-

 class XGBoostController(Controller):
-    def __init__(self, port: int, world_size: int, server_key_path: str,
-                 server_cert_path: str, client_cert_path: str):
+    def __init__(
+        self,
+        port: int,
+        world_size: int,
+        server_key_path: str,
+        server_cert_path: str,
+        client_cert_path: str,
+    ):
         """Controller for federated XGBoost.

         Args:
@@ -37,18 +43,31 @@ def __init__(self, port: int, world_size: int, server_key_path: str,
     def start_controller(self, fl_ctx: FLContext):
         self._server = multiprocessing.Process(
             target=xgboost.federated.run_federated_server,
-            args=(self._port, self._world_size, self._server_key_path,
-                  self._server_cert_path, self._client_cert_path))
+            args=(
+                self._port,
+                self._world_size,
+                self._server_key_path,
+                self._server_cert_path,
+                self._client_cert_path,
+            ),
+        )
         self._server.start()

     def stop_controller(self, fl_ctx: FLContext):
         if self._server:
             self._server.terminate()

-    def process_result_of_unknown_task(self, client: Client, task_name: str,
-                                       client_task_id: str, result: Shareable,
-                                       fl_ctx: FLContext):
-        self.log_warning(fl_ctx, f"Unknown task: {task_name} from client {client.name}.")
+    def process_result_of_unknown_task(
+        self,
+        client: Client,
+        task_name: str,
+        client_task_id: str,
+        result: Shareable,
+        fl_ctx: FLContext,
+    ):
+        self.log_warning(
+            fl_ctx, f"Unknown task: {task_name} from client {client.name}."
+        )

     def control_flow(self, abort_signal: Signal, fl_ctx: FLContext):
         self.log_info(fl_ctx, "XGBoost training control flow started.")
diff --git a/demo/nvflare/vertical/custom/trainer.py b/demo/nvflare/vertical/custom/trainer.py
index b6c3855ef10f..d36a85d12650 100644
--- a/demo/nvflare/vertical/custom/trainer.py
+++ b/demo/nvflare/vertical/custom/trainer.py
@@ -1,12 +1,11 @@
 import os

+import xgboost as xgb
 from nvflare.apis.executor import Executor
 from nvflare.apis.fl_constant import FLContextKey, ReturnCode
 from nvflare.apis.fl_context import FLContext
 from nvflare.apis.shareable import Shareable, make_reply
 from nvflare.apis.signal import Signal
-
-import xgboost as xgb
 from xgboost import callback
@@ -15,8 +14,15 @@ class SupportedTasks(object):

 class XGBoostTrainer(Executor):
-    def __init__(self, server_address: str, world_size: int, server_cert_path: str,
-                 client_key_path: str, client_cert_path: str, use_gpus: bool):
+    def __init__(
+        self,
+        server_address: str,
+        world_size: int,
+        server_cert_path: str,
+        client_key_path: str,
+        client_cert_path: str,
+        use_gpus: bool,
+    ):
         """Trainer for federated XGBoost.

         Args:
@@ -34,8 +40,13 @@ def __init__(self, server_address: str, world_size: int, server_cert_path: str,
         self._client_cert_path = client_cert_path
         self._use_gpus = use_gpus

-    def execute(self, task_name: str, shareable: Shareable, fl_ctx: FLContext,
-                abort_signal: Signal) -> Shareable:
+    def execute(
+        self,
+        task_name: str,
+        shareable: Shareable,
+        fl_ctx: FLContext,
+        abort_signal: Signal,
+    ) -> Shareable:
         self.log_info(fl_ctx, f"Executing {task_name}")
         try:
             if task_name == SupportedTasks.TRAIN:
@@ -45,53 +56,58 @@ def execute(self, task_name: str, shareable: Shareable, fl_ctx: FLContext,
                 self.log_error(fl_ctx, f"{task_name} is not a supported task.")
                 return make_reply(ReturnCode.TASK_UNKNOWN)
         except BaseException as e:
-            self.log_exception(fl_ctx,
-                               f"Task {task_name} failed. Exception: {e.__str__()}")
+            self.log_exception(
+                fl_ctx, f"Task {task_name} failed. Exception: {e.__str__()}"
+            )
             return make_reply(ReturnCode.EXECUTION_EXCEPTION)

     def _do_training(self, fl_ctx: FLContext):
         client_name = fl_ctx.get_prop(FLContextKey.CLIENT_NAME)
-        rank = int(client_name.split('-')[1]) - 1
+        rank = int(client_name.split("-")[1]) - 1
         communicator_env = {
-            'xgboost_communicator': 'federated',
-            'federated_server_address': self._server_address,
-            'federated_world_size': self._world_size,
-            'federated_rank': rank,
-            'federated_server_cert': self._server_cert_path,
-            'federated_client_key': self._client_key_path,
-            'federated_client_cert': self._client_cert_path
+            "xgboost_communicator": "federated",
+            "federated_server_address": self._server_address,
+            "federated_world_size": self._world_size,
+            "federated_rank": rank,
+            "federated_server_cert": self._server_cert_path,
+            "federated_client_key": self._client_key_path,
+            "federated_client_cert": self._client_cert_path,
        }
         with xgb.collective.CommunicatorContext(**communicator_env):
             # Load file, file will not be sharded in federated mode.
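+            # In vertical (column-wise) federated learning each party holds a
+            # column slice of the same rows: data_split_mode=1 below marks the
+            # input as column-split, and only rank 0 carries the label column.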
             if rank == 0:
-                label = '&label_column=0'
+                label = "&label_column=0"
             else:
-                label = ''
-            dtrain = xgb.DMatrix(f'higgs.train.csv?format=csv{label}', data_split_mode=1)
-            dtest = xgb.DMatrix(f'higgs.test.csv?format=csv{label}', data_split_mode=1)
+                label = ""
+            dtrain = xgb.DMatrix(
+                f"higgs.train.csv?format=csv{label}", data_split_mode=1
+            )
+            dtest = xgb.DMatrix(f"higgs.test.csv?format=csv{label}", data_split_mode=1)

             # specify parameters via map
             param = {
-                'validate_parameters': True,
-                'eta': 0.1,
-                'gamma': 1.0,
-                'max_depth': 8,
-                'min_child_weight': 100,
-                'tree_method': 'hist',
-                'grow_policy': 'depthwise',
-                'objective': 'binary:logistic',
-                'eval_metric': 'auc',
+                "validate_parameters": True,
+                "eta": 0.1,
+                "gamma": 1.0,
+                "max_depth": 8,
+                "min_child_weight": 100,
+                "tree_method": "hist",
+                "grow_policy": "depthwise",
+                "objective": "binary:logistic",
+                "eval_metric": "auc",
             }
             if self._use_gpus:
-                self.log_info(fl_ctx, f'Training with GPU {rank}')
-                param['device'] = f"cuda:{rank}"
+                self.log_info(fl_ctx, f"Training with GPU {rank}")
+                param["device"] = f"cuda:{rank}"

             # specify validations set to watch performance
             watchlist = [(dtest, "eval"), (dtrain, "train")]
             # number of boosting rounds
             num_round = 10

-            bst = xgb.train(param, dtrain, num_round, evals=watchlist, early_stopping_rounds=2)
+            bst = xgb.train(
+                param, dtrain, num_round, evals=watchlist, early_stopping_rounds=2
+            )

             # Save the model.
             workspace = fl_ctx.get_prop(FLContextKey.WORKSPACE_OBJECT)
diff --git a/demo/rmm_plugin/README.rst b/demo/rmm_plugin/README.rst
index ff15e815818a..942dc0093ddd 100644
--- a/demo/rmm_plugin/README.rst
+++ b/demo/rmm_plugin/README.rst
@@ -75,4 +75,4 @@ The newer NVIDIA platforms like `Grace-Hopper
 have a coherent memory model. Users can use the `SamHeadroomMemoryResource` in the
 latest RMM to utilize system memory for storing data. This can help XGBoost utilize
 memory from the host for GPU computation, but it may reduce performance due to slower CPU memory speed
-and page migration overhead.
\ No newline at end of file
+and page migration overhead.
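For readers trying out the `SamHeadroomMemoryResource` mentioned in the README above, a
minimal sketch of the setup follows. It assumes the resource constructor takes a
headroom size in bytes; check the RMM documentation for the exact signature:

    import rmm
    import xgboost as xgb

    # Keep roughly 2 GiB of device memory free as headroom; further
    # allocations may be backed by coherent system memory on platforms
    # such as Grace-Hopper.
    mr = rmm.mr.SamHeadroomMemoryResource(2 * 1024**3)
    rmm.mr.set_current_device_resource(mr)

    # Route XGBoost's GPU allocations through RMM.
    xgb.set_config(use_rmm=True)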
diff --git a/demo/rmm_plugin/rmm_mgpu_with_dask.py b/demo/rmm_plugin/rmm_mgpu_with_dask.py
index 467827074d24..3c00553842d2 100644
--- a/demo/rmm_plugin/rmm_mgpu_with_dask.py
+++ b/demo/rmm_plugin/rmm_mgpu_with_dask.py
@@ -4,12 +4,11 @@
 """

 import dask
+import xgboost as xgb
 from dask.distributed import Client
 from dask_cuda import LocalCUDACluster
 from sklearn.datasets import make_classification

-import xgboost as xgb
-

 def main(client):
     # Optionally force XGBoost to use RMM for all GPU memory allocation, see ./README.md
diff --git a/demo/rmm_plugin/rmm_singlegpu.py b/demo/rmm_plugin/rmm_singlegpu.py
index a1457406b9dd..cac70ba8107c 100644
--- a/demo/rmm_plugin/rmm_singlegpu.py
+++ b/demo/rmm_plugin/rmm_singlegpu.py
@@ -4,9 +4,8 @@
 """

 import rmm
-from sklearn.datasets import make_classification
-
 import xgboost as xgb
+from sklearn.datasets import make_classification

 # Initialize RMM pool allocator
 rmm.reinitialize(pool_allocator=True)
diff --git a/dev/query_contributors.py b/dev/query_contributors.py
index d57ad3f7c28a..f524af6312d6 100644
--- a/dev/query_contributors.py
+++ b/dev/query_contributors.py
@@ -8,8 +8,10 @@
 from sh.contrib import git

 if len(sys.argv) != 5:
-    print(f'Usage: {sys.argv[0]} [starting commit/tag] [ending commit/tag] [GitHub username] ' +
-          '[GitHub password]')
+    print(
+        f"Usage: {sys.argv[0]} [starting commit/tag] [ending commit/tag] [GitHub username] "
+        + "[GitHub password]"
+    )
     sys.exit(1)

 from_commit = sys.argv[1]
@@ -20,56 +22,72 @@
 contributors = set()
 reviewers = set()

+
 def paginate_request(url, callback):
     r = requests.get(url, auth=(username, password))
-    assert r.status_code == requests.codes.ok, f'Code: {r.status_code}, Text: {r.text}'
+    assert r.status_code == requests.codes.ok, f"Code: {r.status_code}, Text: {r.text}"
     callback(json.loads(r.text))
-    while 'next' in r.links:
-        r = requests.get(r.links['next']['url'], auth=(username, password))
+    while "next" in r.links:
+        r = requests.get(r.links["next"]["url"], auth=(username, password))
         callback(json.loads(r.text))

-for line in git.log(f'{from_commit}..{to_commit}', '--pretty=format:%s', '--reverse', '--first-parent'):
-    m = re.search('\(#([0-9]+)\)$', line.rstrip())
+
+for line in git.log(
+    f"{from_commit}..{to_commit}", "--pretty=format:%s", "--reverse", "--first-parent"
+):
+    m = re.search(r"\(#([0-9]+)\)$", line.rstrip())
     if m:
         pr_id = m.group(1)
-        print(f'PR #{pr_id}')
+        print(f"PR #{pr_id}")

         def process_commit_list(commit_list):
             try:
-                contributors.update([commit['author']['login'] for commit in commit_list])
+                contributors.update(
+                    [commit["author"]["login"] for commit in commit_list]
+                )
             except TypeError:
-                prompt = (f'Error fetching contributors for PR #{pr_id}. Enter it manually, ' +
-                          'as a space-separated list: ')
-                contributors.update(str(input(prompt)).split(' '))
+                prompt = (
+                    f"Error fetching contributors for PR #{pr_id}. Enter it manually, "
+                    + "as a space-separated list: "
+                )
+                contributors.update(str(input(prompt)).split(" "))

+
         def process_review_list(review_list):
-            reviewers.update([x['user']['login'] for x in review_list])
+            reviewers.update([x["user"]["login"] for x in review_list])

+
         def process_comment_list(comment_list):
-            reviewers.update([x['user']['login'] for x in comment_list])
+            reviewers.update([x["user"]["login"] for x in comment_list])

-        paginate_request(f'https://api.github.com/repos/dmlc/xgboost/pulls/{pr_id}/commits',
-                         process_commit_list)
-        paginate_request(f'https://api.github.com/repos/dmlc/xgboost/pulls/{pr_id}/reviews',
-                         process_review_list)
-        paginate_request(f'https://api.github.com/repos/dmlc/xgboost/issues/{pr_id}/comments',
-                         process_comment_list)
+        paginate_request(
+            f"https://api.github.com/repos/dmlc/xgboost/pulls/{pr_id}/commits",
+            process_commit_list,
+        )
+        paginate_request(
+            f"https://api.github.com/repos/dmlc/xgboost/pulls/{pr_id}/reviews",
+            process_review_list,
+        )
+        paginate_request(
+            f"https://api.github.com/repos/dmlc/xgboost/issues/{pr_id}/comments",
+            process_comment_list,
+        )

-print('Contributors: ', end='')
+print("Contributors: ", end="")
 for x in sorted(contributors):
-    r = requests.get(f'https://api.github.com/users/{x}', auth=(username, password))
-    assert r.status_code == requests.codes.ok, f'Code: {r.status_code}, Text: {r.text}'
+    r = requests.get(f"https://api.github.com/users/{x}", auth=(username, password))
+    assert r.status_code == requests.codes.ok, f"Code: {r.status_code}, Text: {r.text}"
     user_info = json.loads(r.text)
-    if user_info['name'] is None:
-        print(f"@{x}, ", end='')
+    if user_info["name"] is None:
+        print(f"@{x}, ", end="")
     else:
-        print(f"{user_info['name']} (@{x}), ", end='')
+        print(f"{user_info['name']} (@{x}), ", end="")

-print('\nReviewers: ', end='')
+print("\nReviewers: ", end="")
 for x in sorted(reviewers):
-    r = requests.get(f'https://api.github.com/users/{x}', auth=(username, password))
-    assert r.status_code == requests.codes.ok, f'Code: {r.status_code}, Text: {r.text}'
+    r = requests.get(f"https://api.github.com/users/{x}", auth=(username, password))
+    assert r.status_code == requests.codes.ok, f"Code: {r.status_code}, Text: {r.text}"
     user_info = json.loads(r.text)
-    if user_info['name'] is None:
-        print(f"@{x}, ", end='')
+    if user_info["name"] is None:
+        print(f"@{x}, ", end="")
     else:
-        print(f"{user_info['name']} (@{x}), ", end='')
-print('')
+        print(f"{user_info['name']} (@{x}), ", end="")
+print("")
diff --git a/doc/.gitignore b/doc/.gitignore
index 5c373a0e4a20..9521f97ef71a 100644
--- a/doc/.gitignore
+++ b/doc/.gitignore
@@ -8,4 +8,4 @@ parser.py
 *.pyc
 web-data
 # generated by doxygen
-tmp
\ No newline at end of file
+tmp
diff --git a/doc/_static/js/auto_module_index.js b/doc/_static/js/auto_module_index.js
index b918ecdc1635..8af0ad7bf266 100644
--- a/doc/_static/js/auto_module_index.js
+++ b/doc/_static/js/auto_module_index.js
@@ -22,4 +22,3 @@ function auto_index(module) {
       li_node.append(html);
     });
 }
-
diff --git a/doc/contrib/consistency.rst b/doc/contrib/consistency.rst
index b74234602e3c..512db5a36c1c 100644
--- a/doc/contrib/consistency.rst
+++ b/doc/contrib/consistency.rst
@@ -68,4 +68,4 @@ Feature Info
 XGBoost accepts data structures that contain meta info about predictors, including the names and types of features. Example inputs are :py:class:`pandas.DataFrame`, R `data.frame`. We have the following heuristics:

 - When the input data structure contains such information, we set the `feature_names` and `feature_types` for `DMatrix` accordingly.
 - When a user provides this information as explicit parameters, the user-provided version should override the one provided by the data structure.
-- When both sources are missing, the `DMatrix` class contain empty info.
\ No newline at end of file
+- When both sources are missing, the `DMatrix` class contains empty info.
diff --git a/doc/contrib/featuremap.rst b/doc/contrib/featuremap.rst
index 66b87129e774..1169d2a5c2cc 100644
--- a/doc/contrib/featuremap.rst
+++ b/doc/contrib/featuremap.rst
@@ -66,4 +66,4 @@ Inference normally doesn't require any special treatment since we are using samp
 *****************
 Language Bindings
 *****************
-We have a list of bindings for various languages. Inside the XGBoost repository, there's Python, R, Java, Scala, and C. All language bindings are built on top of the C version. Some others, like Julia and Rust, have their own repository. For guideline on adding a new binding, please see :doc:`/contrib/consistency`.
\ No newline at end of file
+We have a list of bindings for various languages. Inside the XGBoost repository, there's Python, R, Java, Scala, and C. All language bindings are built on top of the C version. Some others, like Julia and Rust, have their own repository. For guidelines on adding a new binding, please see :doc:`/contrib/consistency`.
diff --git a/doc/contrib/git_guide.rst b/doc/contrib/git_guide.rst
index 5a2c8face164..20925ec6fc45 100644
--- a/doc/contrib/git_guide.rst
+++ b/doc/contrib/git_guide.rst
@@ -73,4 +73,3 @@ What is the consequence of force push
 *************************************
 The previous two tips requires force push, this is because we altered the path of the commits. It is fine to force push to your own fork, as long as the commits changed are only yours.
-
diff --git a/doc/contrib/release.rst b/doc/contrib/release.rst
index 61735341f725..3b99b2d61789 100644
--- a/doc/contrib/release.rst
+++ b/doc/contrib/release.rst
@@ -63,4 +63,4 @@ References

 [1] https://stat.ethz.ch/pipermail/r-package-devel/2022q4/008610.html

-[2] https://github.com/readthedocs/readthedocs.org/issues/12073
\ No newline at end of file
+[2] https://github.com/readthedocs/readthedocs.org/issues/12073
diff --git a/doc/python/.gitignore b/doc/python/.gitignore
index f3097dfc2987..4b63774cfa74 100644
--- a/doc/python/.gitignore
+++ b/doc/python/.gitignore
@@ -2,4 +2,4 @@ examples
 dask-examples
 survival-examples
 gpu-examples
-rmm-examples
\ No newline at end of file
+rmm-examples
diff --git a/doc/python/data_input.rst b/doc/python/data_input.rst
index 8343c1079e44..3eee1aad2735 100644
--- a/doc/python/data_input.rst
+++ b/doc/python/data_input.rst
@@ -83,4 +83,4 @@ Support Matrix
 The polars ``LazyFrame.collect`` supports many configurations, ranging from the choice
 of query engine to type coercion. XGBoost simply uses the default parameter. Please run
 ``collect`` to obtain the ``DataFrame`` before passing it into XGBoost for finer control
-over the behaviour.
\ No newline at end of file
+over the behaviour.
diff --git a/doc/python/python_api.rst b/doc/python/python_api.rst
index 595b7f067a01..772c3febf442 100644
--- a/doc/python/python_api.rst
+++ b/doc/python/python_api.rst
@@ -216,4 +216,4 @@ Collective

 .. automodule:: xgboost.tracker

-.. autoclass:: xgboost.tracker.RabitTracker
\ No newline at end of file
+.. autoclass:: xgboost.tracker.RabitTracker
diff --git a/doc/sphinx_util.py b/doc/sphinx_util.py
index 720cd33e504f..e5848949007c 100644
--- a/doc/sphinx_util.py
+++ b/doc/sphinx_util.py
@@ -1,15 +1,17 @@
 # -*- coding: utf-8 -*-
 """Helper utility function for customization."""
+
 import os
 import subprocess
 import sys

-READTHEDOCS_BUILD = (os.environ.get('READTHEDOCS', None) is not None)
+READTHEDOCS_BUILD = os.environ.get("READTHEDOCS", None) is not None

-if not os.path.exists('web-data'):
-    subprocess.call('rm -rf web-data;' +
-                    'git clone https://github.com/dmlc/web-data', shell = True)
+if not os.path.exists("web-data"):
+    subprocess.call(
+        "rm -rf web-data;" + "git clone https://github.com/dmlc/web-data", shell=True
+    )
 else:
-    subprocess.call('cd web-data; git pull', shell=True)
+    subprocess.call("cd web-data; git pull", shell=True)

-sys.stderr.write('READTHEDOCS=%s\n' % (READTHEDOCS_BUILD))
+sys.stderr.write("READTHEDOCS=%s\n" % (READTHEDOCS_BUILD))
diff --git a/doc/tutorials/index.rst b/doc/tutorials/index.rst
index bf58d7cd7b0a..c3970af5dc31 100644
--- a/doc/tutorials/index.rst
+++ b/doc/tutorials/index.rst
@@ -33,4 +33,4 @@ See `Awesome XGBoost `_ for mo
   custom_metric_obj
   advanced_custom_obj
   intercept
-  privacy_preserving
\ No newline at end of file
+  privacy_preserving
diff --git a/doc/tutorials/privacy_preserving.rst b/doc/tutorials/privacy_preserving.rst
index 132861f7c5b3..6cc4b14ac6cd 100644
--- a/doc/tutorials/privacy_preserving.rst
+++ b/doc/tutorials/privacy_preserving.rst
@@ -53,7 +53,7 @@ To verify model accuracy in encrypted computations, you can run an FHE simulatio

   predictions = classifier.predict(X_test, fhe="simulate")

-This simulation can be used to evaluate the model. The resulting accuracy of this simulation step is representative of the actual FHE execution without having to pay the cost of an actual FHE execution. 
+This simulation can be used to evaluate the model. The resulting accuracy of this simulation step is representative of the actual FHE execution without having to pay the cost of an actual FHE execution.

 When the model is ready, actual Fully Homomorphic Encryption execution can be performed:

@@ -94,4 +94,4 @@ Concrete ML provides a framework for executing privacy-preserving inferences by
 More information and examples are given in the `Concrete ML documentation`_.

 .. _Concrete ML: https://github.com/zama-ai/concrete-ml
-.. _`Concrete ML documentation`: https://docs.zama.ai/concrete-ml
\ No newline at end of file
+.. _`Concrete ML documentation`: https://docs.zama.ai/concrete-ml
diff --git a/doc/tutorials/saving_model.rst b/doc/tutorials/saving_model.rst
index 20aeb210c465..4a398c2e0fa8 100644
--- a/doc/tutorials/saving_model.rst
+++ b/doc/tutorials/saving_model.rst
@@ -299,4 +299,4 @@ Brief History
 - UBJSON has been set to default in 2.1.
 - The old binary format was removed in 3.1.
 - The JSON schema file is no longer maintained and has been removed in 3.2. The underlying
-  schema of the model is not changed.
\ No newline at end of file
+  schema of the model is not changed.
diff --git a/jvm-packages/README.md b/jvm-packages/README.md
index 78f9a5e0f9a1..41b74b72f10e 100644
--- a/jvm-packages/README.md
+++ b/jvm-packages/README.md
@@ -14,4 +14,4 @@ into JVM ecosystem.
 - Run distributed xgboost natively on jvm frameworks such as Apache Flink
   and Apache Spark.

-You can find more about XGBoost on [Documentation](https://xgboost.readthedocs.org/en/stable/jvm/index.html) and [Resource Page](../demo/README.md).
\ No newline at end of file
+You can find more about XGBoost on [Documentation](https://xgboost.readthedocs.org/en/stable/jvm/index.html) and [Resource Page](../demo/README.md).
diff --git a/jvm-packages/checkstyle.xml b/jvm-packages/checkstyle.xml
index ebfd7cd88531..19cb35c4785a 100644
--- a/jvm-packages/checkstyle.xml
+++ b/jvm-packages/checkstyle.xml
@@ -77,7 +77,7 @@

-
+

diff --git a/jvm-packages/create_jni.py b/jvm-packages/create_jni.py
index 5f2ca9f88890..a1eec9870dd4 100755
--- a/jvm-packages/create_jni.py
+++ b/jvm-packages/create_jni.py
@@ -129,7 +129,11 @@ def native_build(cli_args: argparse.Namespace) -> None:
                 run("cmake .. " + " ".join(args + [generator]))
                 break
             except subprocess.CalledProcessError as e:
-                print(f"Failed to build with generator: {generator}", e, flush=True)
+                print(
+                    f"Failed to build with generator: {generator}",
+                    e,
+                    flush=True,
+                )
                 with cd(os.path.pardir):
                     shutil.rmtree(build_dir)
                     maybe_makedirs(build_dir)
@@ -137,7 +141,6 @@ def native_build(cli_args: argparse.Namespace) -> None:
         run("cmake .. " + " ".join(args))
         run("cmake --build . --config Release" + maybe_parallel_build)

-
     print("copying native library", flush=True)
     library_name, os_folder = {
         "Windows": ("xgboost4j.dll", "windows"),
@@ -181,7 +184,10 @@ def native_build(cli_args: argparse.Namespace) -> None:
         maybe_makedirs("xgboost4j-spark-gpu/src/test/resources")
         for file in glob.glob("../demo/data/veterans_lung_cancer.csv"):
             cp(file, "xgboost4j-spark-gpu/src/test/resources")
-        cp("xgboost4j-spark/src/test/resources/rank.train.csv", "xgboost4j-spark-gpu/src/test/resources")
+        cp(
+            "xgboost4j-spark/src/test/resources/rank.train.csv",
+            "xgboost4j-spark-gpu/src/test/resources",
+        )


 if __name__ == "__main__":
diff --git a/jvm-packages/xgboost4j-example/LICENSE b/jvm-packages/xgboost4j-example/LICENSE
index 9a1673be2ed6..f02ba610d0a3 100644
--- a/jvm-packages/xgboost4j-example/LICENSE
+++ b/jvm-packages/xgboost4j-example/LICENSE
@@ -1,10 +1,10 @@
 /*
-Copyright (c) 2014 by Contributors 
+Copyright (c) 2014 by Contributors

 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
- 
+
 http://www.apache.org/licenses/LICENSE-2.0

 Unless required by applicable law or agreed to in writing, software
@@ -12,4 +12,4 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
-*/
\ No newline at end of file
+*/
diff --git a/jvm-packages/xgboost4j/LICENSE b/jvm-packages/xgboost4j/LICENSE
index 9a1673be2ed6..f02ba610d0a3 100644
--- a/jvm-packages/xgboost4j/LICENSE
+++ b/jvm-packages/xgboost4j/LICENSE
@@ -1,10 +1,10 @@
 /*
-Copyright (c) 2014 by Contributors 
+Copyright (c) 2014 by Contributors

 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
- 
+
 http://www.apache.org/licenses/LICENSE-2.0

 Unless required by applicable law or agreed to in writing, software
@@ -12,4 +12,4 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
-*/
\ No newline at end of file
+*/
diff --git a/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/XGBoostJNI.java b/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/XGBoostJNI.java
index 2a87e33686e5..09089c6b5688 100644
--- a/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/XGBoostJNI.java
+++ b/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/XGBoostJNI.java
@@ -89,7 +89,7 @@ public final static native int XGDMatrixCreateFromMatRef(long dataRef, int nrow,

   /**
    * Set the feature information
-   * 
+   *
    * @param handle the DMatrix native address
    * @param field "feature_names" or "feature_types"
    * @param values an array of string
diff --git a/jvm-packages/xgboost4j/src/main/resources/xgboost4j-version.properties b/jvm-packages/xgboost4j/src/main/resources/xgboost4j-version.properties
index e5683df88cb1..defbd48204e4 100644
--- a/jvm-packages/xgboost4j/src/main/resources/xgboost4j-version.properties
+++ b/jvm-packages/xgboost4j/src/main/resources/xgboost4j-version.properties
@@ -1 +1 @@
-version=${project.version}
\ No newline at end of file
+version=${project.version}
diff --git a/ops/script/pypi_variants.py b/ops/script/pypi_variants.py
index a8a46d6823ce..279ff64a8ad2 100644
--- a/ops/script/pypi_variants.py
+++ b/ops/script/pypi_variants.py
@@ -2,8 +2,8 @@

 import argparse
 import os
-import tomllib

+import tomllib
 from packaging.version import Version
 from test_utils import PY_PACKAGE
diff --git a/ops/script/release_artifacts.py b/ops/script/release_artifacts.py
index e6b5598dbc0c..4a76ae19b469 100644
--- a/ops/script/release_artifacts.py
+++ b/ops/script/release_artifacts.py
@@ -17,9 +17,8 @@
 from packaging import version
 from pypi_variants import make_pyproject
 from sh.contrib import git
-from test_utils import PY_PACKAGE
+from test_utils import PY_PACKAGE, DirectoryExcursion
 from test_utils import ROOT as root_path
-from test_utils import DirectoryExcursion

 # S3 bucket hosting the release artifacts
 S3_BUCKET_URL = "https://s3-us-west-2.amazonaws.com/xgboost-nightly-builds"
diff --git a/ops/script/run_clang_tidy.py b/ops/script/run_clang_tidy.py
index 5f9096ab5f9c..e0f2f635d446 100755
--- a/ops/script/run_clang_tidy.py
+++ b/ops/script/run_clang_tidy.py
@@ -237,7 +237,7 @@ def run(self) -> bool:
             passed = False
             print(
                 BAR,
-                "\n" "Command args:",
+                "\nCommand args:",
                 " ".join(args),
                 ", ",
                 "Process return code:",
diff --git a/python-package/.gitignore b/python-package/.gitignore
index d765c67c773e..2ebc5b00be03 100644
--- a/python-package/.gitignore
+++ b/python-package/.gitignore
@@ -1,3 +1,3 @@
 build
 dist
-*.egg*
\ No newline at end of file
+*.egg*
diff --git a/python-package/xgboost/core.py b/python-package/xgboost/core.py
index 419c0dfcfb50..afb192b59583 100644
--- a/python-package/xgboost/core.py
+++ b/python-package/xgboost/core.py
@@ -804,9 +804,9 @@ def _get_categories(
         values = from_array_interface(jvalues)
         pa_offsets = pa.array(offsets).buffers()
         pa_values = pa.array(values).buffers()
-        assert (
-            pa_offsets[0] is None and pa_values[0] is None
-        ), "Should not have null mask."
+        assert pa_offsets[0] is None and pa_values[0] is None, (
+            "Should not have null mask."
+        )
         pa_dict = pa.StringArray.from_buffers(
             len(offsets) - 1, pa_offsets[1], pa_values[1]
         )
diff --git a/python-package/xgboost/sklearn.py b/python-package/xgboost/sklearn.py
index 8349163f5db8..4aadcd623ae1 100644
--- a/python-package/xgboost/sklearn.py
+++ b/python-package/xgboost/sklearn.py
@@ -595,10 +595,12 @@ def get_doc(item: str) -> str:
         return __doc[item]

     def adddoc(cls: TDoc) -> TDoc:
-        doc = ["""
+        doc = [
+            """
 Parameters
 ----------
-"""]
+"""
+        ]
         if extra_parameters:
             doc.append(extra_parameters)
         doc.extend([get_doc(i) for i in items])
diff --git a/python-package/xgboost/spark/core.py b/python-package/xgboost/spark/core.py
index 357ac931941c..8d47a6b58d85 100644
--- a/python-package/xgboost/spark/core.py
+++ b/python-package/xgboost/spark/core.py
@@ -1120,9 +1120,10 @@ def _train_booster(
             _rabit_args = json.loads(messages[0])["rabit_msg"]

         evals_result: Dict[str, Any] = {}
-        with config_context(
-            verbosity=verbosity, use_rmm=use_rmm
-        ), CommunicatorContext(context, **_rabit_args):
+        with (
+            config_context(verbosity=verbosity, use_rmm=use_rmm),
+            CommunicatorContext(context, **_rabit_args),
+        ):
             dtrain, dvalid = create_dmatrix_from_partitions(
                 iterator=pandas_df_iter,
                 feature_cols=feature_prop.features_cols_names,
@@ -1388,8 +1389,7 @@ def _run_on_gpu(self) -> bool:
         if gpu_per_task is None:
             if use_gpu_by_params:
                 get_logger(_LOG_TAG).warning(
-                    "Do the prediction on the CPUs since "
-                    "no gpu configurations are set"
+                    "Do the prediction on the CPUs since no gpu configurations are set"
                 )
             return False
diff --git a/python-package/xgboost/testing/__init__.py b/python-package/xgboost/testing/__init__.py
index 2aeb7b44d603..52f346bad6a5 100644
--- a/python-package/xgboost/testing/__init__.py
+++ b/python-package/xgboost/testing/__init__.py
@@ -71,9 +71,10 @@ def has_ipv6() -> bool:
     if socket.has_ipv6:
         try:
-            with socket.socket(
-                socket.AF_INET6, socket.SOCK_STREAM
-            ) as server, socket.socket(socket.AF_INET6, socket.SOCK_STREAM) as client:
+            with (
+                socket.socket(socket.AF_INET6, socket.SOCK_STREAM) as server,
+                socket.socket(socket.AF_INET6, socket.SOCK_STREAM) as client,
+            ):
                 server.bind(("::1", 0))
                 port = server.getsockname()[1]
                 server.listen()
diff --git a/python-package/xgboost/tracker.py b/python-package/xgboost/tracker.py
index 926c957b28a4..6e39ad89c8ce 100644
--- a/python-package/xgboost/tracker.py
+++ b/python-package/xgboost/tracker.py
@@ -90,7 +90,6 @@ def __init__(  # pylint: disable=too-many-arguments
         sortby: str = "host",
         timeout: int = 0,
     ) -> None:
-
         handle = ctypes.c_void_p()
         if sortby not in ("host", "task"):
             raise ValueError("Expecting either 'host' or 'task' for sortby.")
diff --git a/tests/cpp/plugin/test_sycl_hist_updater.cc b/tests/cpp/plugin/test_sycl_hist_updater.cc
index 69c5047d045c..23812d53350c 100644
--- a/tests/cpp/plugin/test_sycl_hist_updater.cc
+++ b/tests/cpp/plugin/test_sycl_hist_updater.cc
@@ -121,7 +121,7 @@ void TestHistUpdaterSampling(const xgboost::tree::TrainParam& param) {
   GenerateRandomGPairs(qu, gpair.DevicePointer(), num_rows, true);

   updater.TestInitSampling(gpair, &row_indices_0);
-  
+
   size_t n_samples = row_indices_0.Size();
   // Half of gpairs have neg hess
   ASSERT_LT(n_samples, num_rows * 0.5 * param.subsample * 1.2);
diff --git a/tests/cpp/plugin/test_sycl_lambdarank_obj.cc b/tests/cpp/plugin/test_sycl_lambdarank_obj.cc
index 2129d8b3e4c3..2814b937eb20 100644
--- a/tests/cpp/plugin/test_sycl_lambdarank_obj.cc
+++ b/tests/cpp/plugin/test_sycl_lambdarank_obj.cc
@@ -42,4 +42,3 @@ TEST(SyclObjective, LambdaRankMAPGPair) {
   }
 }  // namespace xgboost::obj
-
diff --git a/tests/cpp/plugin/test_sycl_linalg.cc b/tests/cpp/plugin/test_sycl_linalg.cc
index 2827aa34fbb3..fc81c53dea83 100644
--- a/tests/cpp/plugin/test_sycl_linalg.cc
+++ b/tests/cpp/plugin/test_sycl_linalg.cc
@@ -44,4 +44,4 @@ TEST(SyclLinalg, SmallHistogram) {
     ASSERT_EQ(bins.HostVector()[i], cnt);
   }
 }
-}  // namespace xgboost::linalg
\ No newline at end of file
+}  // namespace xgboost::linalg
diff --git a/tests/cpp/plugin/test_sycl_quantile_hist_builder.cc b/tests/cpp/plugin/test_sycl_quantile_hist_builder.cc
index 4bf7bd962750..0c97bf355d5c 100644
--- a/tests/cpp/plugin/test_sycl_quantile_hist_builder.cc
+++ b/tests/cpp/plugin/test_sycl_quantile_hist_builder.cc
@@ -50,6 +50,6 @@ TEST(SyclQuantileHistMaker, JsonIO) {
     auto single_precision_histogram = atoi(get(new_config["sycl_hist_train_param"]["single_precision_histogram"]).c_str());
     ASSERT_EQ(single_precision_histogram, 1);
   }
-  
+
 }
 }  // namespace xgboost::sycl::tree
diff --git a/tests/python-gpu/conftest.py b/tests/python-gpu/conftest.py
index 8ce9aabcb5b2..4d4dcc841462 100644
--- a/tests/python-gpu/conftest.py
+++ b/tests/python-gpu/conftest.py
@@ -1,7 +1,6 @@
 from typing import Any, List

 import pytest
-
 from xgboost import testing as tm
diff --git a/tests/python-gpu/load_pickle.py b/tests/python-gpu/load_pickle.py
index 6853f4b4346d..9655edc2a1c9 100644
--- a/tests/python-gpu/load_pickle.py
+++ b/tests/python-gpu/load_pickle.py
@@ -6,9 +6,8 @@

 import numpy as np
 import pytest
-from test_gpu_pickling import build_dataset, load_pickle, model_path
-
 import xgboost as xgb
+from test_gpu_pickling import build_dataset, load_pickle, model_path


 class TestLoadPickle:
diff --git a/tests/python-gpu/test_device_quantile_dmatrix.py b/tests/python-gpu/test_device_quantile_dmatrix.py
index 9f62b76a7c8a..d7cca13d1118 100644
--- a/tests/python-gpu/test_device_quantile_dmatrix.py
+++ b/tests/python-gpu/test_device_quantile_dmatrix.py
@@ -2,9 +2,8 @@

 import numpy as np
 import pytest
-from hypothesis import given, settings, strategies
-
 import xgboost as xgb
+from hypothesis import given, settings, strategies
 from xgboost import testing as tm
 from xgboost.testing.data import check_inf
 from xgboost.testing.data_iter import run_mixed_sparsity
diff --git a/tests/python-gpu/test_from_cudf.py b/tests/python-gpu/test_from_cudf.py
index 3cd6cbb710c4..8825de7601df 100644
--- a/tests/python-gpu/test_from_cudf.py
+++ b/tests/python-gpu/test_from_cudf.py
@@ -3,7 +3,6 @@

 import numpy as np
 import pytest
-
 import xgboost as xgb
 from xgboost import testing as tm
 from xgboost.compat import is_dataframe
diff --git a/tests/python-gpu/test_from_cupy.py b/tests/python-gpu/test_from_cupy.py
index 4d35b2e407f7..985aedb3c65f 100644
--- a/tests/python-gpu/test_from_cupy.py
+++ b/tests/python-gpu/test_from_cupy.py
@@ -3,7 +3,6 @@

 import numpy as np
 import pytest
-
 import xgboost as xgb
 from xgboost import testing as tm
 from xgboost.testing.data import run_base_margin_info
diff --git a/tests/python-gpu/test_gpu_basic_models.py b/tests/python-gpu/test_gpu_basic_models.py
index b1faf6ffc016..5b3293591971 100644
--- a/tests/python-gpu/test_gpu_basic_models.py
+++ b/tests/python-gpu/test_gpu_basic_models.py
@@ -3,7 +3,6 @@

 import numpy as np
 import pytest
-
 import xgboost as xgb
 from xgboost import testing as tm
 from xgboost.testing.basic_models import run_custom_objective
diff --git a/tests/python-gpu/test_gpu_callbacks.py b/tests/python-gpu/test_gpu_callbacks.py
index 6f1f0081bf56..269b24310c59 100644
--- a/tests/python-gpu/test_gpu_callbacks.py
+++ b/tests/python-gpu/test_gpu_callbacks.py
@@ -1,5 +1,4 @@
 import pytest
-
 from xgboost import testing as tm
 from xgboost.testing.callbacks import (
     run_eta_decay,
diff --git a/tests/python-gpu/test_gpu_data_iterator.py b/tests/python-gpu/test_gpu_data_iterator.py
index 33c9409743f0..08859bd49a88 100644
--- a/tests/python-gpu/test_gpu_data_iterator.py
+++ b/tests/python-gpu/test_gpu_data_iterator.py
@@ -2,9 +2,8 @@

 import numpy as np
 import pytest
-from hypothesis import given, settings, strategies
-
 import xgboost as xgb
+from hypothesis import given, settings, strategies
 from xgboost import testing as tm
 from xgboost.testing import no_cupy
 from xgboost.testing.data_iter import check_invalid_cat_batches, check_uneven_sizes
diff --git a/tests/python-gpu/test_gpu_demos.py b/tests/python-gpu/test_gpu_demos.py
index a7bfb778d5f4..9fa276befeff 100644
--- a/tests/python-gpu/test_gpu_demos.py
+++ b/tests/python-gpu/test_gpu_demos.py
@@ -2,7 +2,6 @@
 import subprocess

 import pytest
-
 from xgboost import testing as tm

 DEMO_DIR = tm.demo_dir(__file__)
diff --git a/tests/python-gpu/test_gpu_eval_metrics.py b/tests/python-gpu/test_gpu_eval_metrics.py
index 2f3d05f36b4b..f8f98ca0a084 100644
--- a/tests/python-gpu/test_gpu_eval_metrics.py
+++ b/tests/python-gpu/test_gpu_eval_metrics.py
@@ -1,7 +1,6 @@
 import json

 import pytest
-
 import xgboost
 from xgboost import testing as tm
 from xgboost.testing.metrics import (
diff --git a/tests/python-gpu/test_gpu_interaction_constraints.py b/tests/python-gpu/test_gpu_interaction_constraints.py
index 2c22fe91b12c..77a186926075 100644
--- a/tests/python-gpu/test_gpu_interaction_constraints.py
+++ b/tests/python-gpu/test_gpu_interaction_constraints.py
@@ -1,7 +1,6 @@
 import numpy as np
 import pandas as pd
 import pytest
-
 import xgboost as xgb
 from xgboost.testing.interaction_constraints import (
     run_interaction_constraints,
diff --git a/tests/python-gpu/test_gpu_intercept.py b/tests/python-gpu/test_gpu_intercept.py
index c5acc681b3d6..0be9d2bb8a71 100644
--- a/tests/python-gpu/test_gpu_intercept.py
+++ b/tests/python-gpu/test_gpu_intercept.py
@@ -1,7 +1,6 @@
 from itertools import product

 import pytest
-
 from xgboost.testing.intercept import (
     run_adaptive,
     run_exp_family,
diff --git a/tests/python-gpu/test_gpu_linear.py b/tests/python-gpu/test_gpu_linear.py
index ace1238488ac..abff812d066a 100644
--- a/tests/python-gpu/test_gpu_linear.py
+++ b/tests/python-gpu/test_gpu_linear.py
@@ -1,9 +1,8 @@
 from typing import Any, Dict

 import pytest
-from hypothesis import assume, given, note, settings, strategies
-
 import xgboost as xgb
+from hypothesis import assume, given, note, settings, strategies
 from xgboost import testing as tm

 pytestmark = tm.timeout(10)
diff --git a/tests/python-gpu/test_gpu_ordinal.py b/tests/python-gpu/test_gpu_ordinal.py
index 1451d70bf674..1d683a1b0c2c 100644
--- a/tests/python-gpu/test_gpu_ordinal.py
+++ b/tests/python-gpu/test_gpu_ordinal.py
@@ -4,7 +4,6 @@

 import numpy as np
 import pytest
-
 import xgboost as xgb
 from xgboost import testing as tm
 from xgboost.testing.data import make_categorical
diff --git a/tests/python-gpu/test_gpu_pickling.py b/tests/python-gpu/test_gpu_pickling.py
index f4219388255d..1b69be117c64 100644
--- a/tests/python-gpu/test_gpu_pickling.py
+++ b/tests/python-gpu/test_gpu_pickling.py
@@ -7,7 +7,6 @@

 import numpy as np
 import pytest
-
 import xgboost as xgb
 from xgboost import XGBClassifier
 from xgboost import testing as tm
@@ -102,7 +101,7 @@ def test_wrap_gpu_id(self) -> None:
         env.update(cuda_environment)
         args = self.args_template.copy()
         args.append(
-            "./tests/python-gpu/" "load_pickle.py::TestLoadPickle::test_wrap_gpu_id"
+            "./tests/python-gpu/load_pickle.py::TestLoadPickle::test_wrap_gpu_id"
         )
         status = subprocess.call(args, env=env)
         assert status == 0
diff --git a/tests/python-gpu/test_gpu_plotting.py b/tests/python-gpu/test_gpu_plotting.py
index af29dc0201e3..f77934042d4e 100644
--- a/tests/python-gpu/test_gpu_plotting.py
+++ b/tests/python-gpu/test_gpu_plotting.py
@@ -1,5 +1,4 @@
 import pytest
-
 from xgboost import testing as tm
diff --git a/tests/python-gpu/test_gpu_prediction.py b/tests/python-gpu/test_gpu_prediction.py
index 9d59e514cafc..06850f576a29 100644
--- a/tests/python-gpu/test_gpu_prediction.py
+++ b/tests/python-gpu/test_gpu_prediction.py
@@ -4,10 +4,9 @@

 import numpy as np
 import pytest
+import xgboost as xgb
 from hypothesis import assume, given, settings, strategies
 from hypothesis.extra.pandas import column, data_frames, range_indexes
-
-import xgboost as xgb
 from xgboost import testing as tm
 from xgboost.testing.predict import run_base_margin_vs_base_score, run_predict_leaf
diff --git a/tests/python-gpu/test_gpu_ranking.py b/tests/python-gpu/test_gpu_ranking.py
index 4b284bf4c9f0..bd92c8dd5c01 100644
--- a/tests/python-gpu/test_gpu_ranking.py
+++ b/tests/python-gpu/test_gpu_ranking.py
@@ -2,7 +2,6 @@

 import numpy as np
 import pytest
-
 import xgboost
 from xgboost import testing as tm
 from xgboost.testing.ranking import run_normalization, run_score_normalization
diff --git a/tests/python-gpu/test_gpu_training_continuation.py b/tests/python-gpu/test_gpu_training_continuation.py
index 6f948890dfae..7d0ef2f7a637 100644
--- a/tests/python-gpu/test_gpu_training_continuation.py
+++ b/tests/python-gpu/test_gpu_training_continuation.py
@@ -1,6 +1,5 @@
 import numpy as np
 import pytest
-
 from xgboost.testing.continuation import run_training_continuation_model_output

 rng = np.random.RandomState(1994)
diff --git a/tests/python-gpu/test_gpu_updaters.py b/tests/python-gpu/test_gpu_updaters.py
index a13ebf1c3c1d..89187adc3ba4 100644
--- a/tests/python-gpu/test_gpu_updaters.py
+++ b/tests/python-gpu/test_gpu_updaters.py
@@ -2,9 +2,8 @@

 import numpy as np
 import pytest
-from hypothesis import assume, given, note, settings, strategies
-
 import xgboost as xgb
+from hypothesis import assume, given, note, settings, strategies
 from xgboost import testing as tm
 from xgboost.testing.params import (
     cat_parameter_strategy,
diff --git a/tests/python-gpu/test_gpu_with_sklearn.py b/tests/python-gpu/test_gpu_with_sklearn.py
index 6d892e6b0efd..3e832701bca9 100644
--- a/tests/python-gpu/test_gpu_with_sklearn.py
+++ b/tests/python-gpu/test_gpu_with_sklearn.py
@@ -7,7 +7,6 @@

 import numpy as np
 import pytest
-
 import xgboost as xgb
 from xgboost import testing as tm
 from xgboost.testing.ranking import run_ranking_categorical, run_ranking_qid_df
diff --git a/tests/python-gpu/test_large_input.py b/tests/python-gpu/test_large_input.py
index 0ec203621ae8..366360369d55 100644
--- a/tests/python-gpu/test_large_input.py
+++ b/tests/python-gpu/test_large_input.py
@@ -1,7 +1,6 @@
 import cupy as cp
 import numpy as np
 import pytest
-
 import xgboost as xgb
diff --git a/tests/python-gpu/test_monotonic_constraints.py b/tests/python-gpu/test_monotonic_constraints.py
index baf64621059b..e8ba36cdf8c8 100644
--- a/tests/python-gpu/test_monotonic_constraints.py
+++ b/tests/python-gpu/test_monotonic_constraints.py
@@ -1,6 +1,5 @@
 import numpy as np
 import pytest
-
 import xgboost as xgb
 from xgboost import testing as tm
 from xgboost.testing.monotone_constraints import is_correctly_constrained, training_dset
diff --git a/tests/python-sycl/test_sycl_prediction.py b/tests/python-sycl/test_sycl_prediction.py
index 06167c6c02db..76c5dbf3bce2 100644
--- a/tests/python-sycl/test_sycl_prediction.py
+++ b/tests/python-sycl/test_sycl_prediction.py
@@ -1,11 +1,10 @@
 import sys
 import unittest

-import pytest
 import numpy as np
+import pytest
 import xgboost as xgb
-from hypothesis import given, strategies, assume, settings, note
-
+from hypothesis import assume, given, note, settings, strategies
 from xgboost import testing as tm

 rng = np.random.RandomState(1994)
diff --git a/tests/python-sycl/test_sycl_simple_dask.py b/tests/python-sycl/test_sycl_simple_dask.py
index 2d302573ecd1..6de69ff67fe1 100644
--- a/tests/python-sycl/test_sycl_simple_dask.py
+++ b/tests/python-sycl/test_sycl_simple_dask.py
@@ -1,8 +1,7 @@
-from xgboost import dask as dxgb
-from xgboost import testing as tm
-
 import dask.array as da
 import dask.distributed
+from xgboost import dask as dxgb
+from xgboost import testing as tm


 def train_result(client, param, dtrain, num_rounds):
diff --git a/tests/python-sycl/test_sycl_training_continuation.py b/tests/python-sycl/test_sycl_training_continuation.py
index 71d5965600e7..bfb72e1f03d2 100644
--- a/tests/python-sycl/test_sycl_training_continuation.py
+++ b/tests/python-sycl/test_sycl_training_continuation.py
@@ -1,6 +1,7 @@
+import json
+
 import numpy as np
 import xgboost as xgb
-import json

 rng = np.random.RandomState(1994)
diff --git a/tests/python-sycl/test_sycl_updaters.py b/tests/python-sycl/test_sycl_updaters.py
index 57ca8d783bd7..9e0d30be4339 100644
--- a/tests/python-sycl/test_sycl_updaters.py
+++ b/tests/python-sycl/test_sycl_updaters.py
@@ -1,11 +1,11 @@
-import numpy as np
 import gc
+import os
+import sys
+
+import numpy as np
 import pytest
 import xgboost as xgb
-from hypothesis import given, strategies, assume, settings, note
-
-import sys
-import os
+from hypothesis import assume, given, note, settings, strategies

 # sys.path.append("tests/python")
 # import testing as tm
diff --git a/tests/python-sycl/test_sycl_with_sklearn.py b/tests/python-sycl/test_sycl_with_sklearn.py
index a17d384e405e..915cdea43c61 100644
--- a/tests/python-sycl/test_sycl_with_sklearn.py
+++ b/tests/python-sycl/test_sycl_with_sklearn.py
@@ -1,8 +1,8 @@
-import xgboost as xgb
-import pytest
 import sys
-import numpy as np

+import numpy as np
+import pytest
+import xgboost as xgb
 from xgboost import testing as tm

 pytestmark = pytest.mark.skipif(**tm.no_sklearn())
diff --git a/tests/python/generate_models.py b/tests/python/generate_models.py
index 57e0f9ed26e0..ebe82efe2418 100644
--- a/tests/python/generate_models.py
+++ b/tests/python/generate_models.py
@@ -1,9 +1,8 @@
 import os

 import numpy as np
-from sklearn.datasets import make_classification
-
 import xgboost
+from sklearn.datasets import make_classification
 from xgboost.testing import make_categorical, make_ltr

 kRounds = 4
diff --git a/tests/python/test_basic.py b/tests/python/test_basic.py
index 8cfbdc47d8ec..b57ca93a2b19 100644
--- a/tests/python/test_basic.py
+++ b/tests/python/test_basic.py
@@ -6,7 +6,6 @@

 import numpy as np
 import pytest
-
 import xgboost as xgb
 from xgboost import testing as tm
 from xgboost.core import _parse_version
@@ -131,14 +130,15 @@ def test_dump(self):
         # number of feature importances should == number of features
         dump1 = bst.get_dump()
         assert len(dump1) == 1, "Expected only 1 tree to be dumped."
-        len(
-            dump1[0].splitlines()
-        ) == 3, "Expected 1 root and 2 leaves - 3 lines in dump."
+        assert len(dump1[0].splitlines()) == 3, (
+            "Expected 1 root and 2 leaves - 3 lines in dump."
+        )

         dump2 = bst.get_dump(with_stats=True)
-        assert (
-            dump2[0].count("\n") == 3
-        ), "Expected 1 root and 2 leaves - 3 lines in dump."
+        assert dump2[0].count("\n") == 3, (
+            "Expected 1 root and 2 leaves - 3 lines in dump."
+        )

         msg = "Expected more info when with_stats=True is given."
         assert dump2[0].find("\n") > dump1[0].find("\n"), msg
diff --git a/tests/python/test_basic_models.py b/tests/python/test_basic_models.py
index 88d275a6d2ee..2de7aa96ad60 100644
--- a/tests/python/test_basic_models.py
+++ b/tests/python/test_basic_models.py
@@ -4,7 +4,6 @@

 import numpy as np
 import pytest
-
 import xgboost as xgb
 from xgboost import testing as tm
 from xgboost.core import Integer
diff --git a/tests/python/test_callback.py b/tests/python/test_callback.py
index 248111b78017..6c80c3fc06d7 100644
--- a/tests/python/test_callback.py
+++ b/tests/python/test_callback.py
@@ -5,7 +5,6 @@

 import numpy as np
 import pytest
-
 import xgboost as xgb
 from xgboost import testing as tm
 from xgboost.testing.callbacks import (
diff --git a/tests/python/test_collective.py b/tests/python/test_collective.py
index 1204c0faf8c9..e333c3d2f65c 100644
--- a/tests/python/test_collective.py
+++ b/tests/python/test_collective.py
@@ -3,7 +3,6 @@

 import numpy as np
 import pytest
-
 import xgboost as xgb
 from xgboost import RabitTracker, build_info, federated
 from xgboost import testing as tm
diff --git a/tests/python/test_config.py b/tests/python/test_config.py
index bc8da24b6f6e..588faad4d0f4 100644
--- a/tests/python/test_config.py
+++ b/tests/python/test_config.py
@@ -2,7 +2,6 @@
 from concurrent.futures import ThreadPoolExecutor

 import pytest
-
 import xgboost as xgb
diff --git a/tests/python/test_data_iterator.py b/tests/python/test_data_iterator.py
index b6692f4feae3..bae80a88a8cf 100644
--- a/tests/python/test_data_iterator.py
+++ b/tests/python/test_data_iterator.py
@@ -5,10 +5,9 @@

 import numpy as np
 import pytest
+import xgboost as xgb
 from hypothesis import given, settings, strategies
 from scipy.sparse import csr_matrix
-
-import xgboost as xgb
 from xgboost import testing as tm
 from xgboost.data import SingleBatchInternalIter as SingleBatch
 from xgboost.testing import IteratorForTest, make_batches, non_increasing
diff --git a/tests/python/test_demos.py b/tests/python/test_demos.py
index 7d4e1f132a72..29af9e77fff1 100644
--- a/tests/python/test_demos.py
+++ b/tests/python/test_demos.py
@@ -4,7 +4,6 @@
 import tempfile

 import pytest
-
 import xgboost
 from xgboost import testing as tm
diff --git a/tests/python/test_dmatrix.py b/tests/python/test_dmatrix.py
index 874f0527bd20..cb02e87175b9 100644
--- a/tests/python/test_dmatrix.py
+++ b/tests/python/test_dmatrix.py
@@ -6,10 +6,9 @@

 import numpy as np
 import pytest
 import scipy.sparse
+import xgboost as xgb
 from hypothesis import given, settings, strategies
 from scipy.sparse import csr_matrix, rand
-
-import xgboost as xgb
 from xgboost import testing as tm
 from xgboost.core import DataSplitMode
 from xgboost.testing.data import np_dtypes, run_base_margin_info
diff --git a/tests/python/test_early_stopping.py b/tests/python/test_early_stopping.py
index fbb163d69658..386df16d732b 100644
--- a/tests/python/test_early_stopping.py
+++ b/tests/python/test_early_stopping.py
@@ -2,7 +2,6 @@

 import numpy as np
 import pytest
-
 import xgboost as xgb
 from xgboost import testing as tm
 from xgboost.testing.updater import get_basescore
diff --git a/tests/python/test_eval_metrics.py b/tests/python/test_eval_metrics.py
index 406d4258bd0c..2e7ee7d13315 100644
--- a/tests/python/test_eval_metrics.py
+++ b/tests/python/test_eval_metrics.py
@@ -1,6 +1,5 @@
 import numpy as np
 import pytest
-
 import xgboost as xgb
 from xgboost import testing as tm
 from xgboost.testing.metrics import (
diff --git a/tests/python/test_interaction_constraints.py b/tests/python/test_interaction_constraints.py
index 6c95730a8ff4..fdf72acbb254 100644
--- a/tests/python/test_interaction_constraints.py
+++ b/tests/python/test_interaction_constraints.py
@@ -1,5 +1,4 @@
 import pytest
-
 from xgboost import testing as tm
 from xgboost.testing.interaction_constraints import (
     run_interaction_constraints,
diff --git a/tests/python/test_intercept.py b/tests/python/test_intercept.py
index 77f1bdb3f484..ebd97dffe4d7 100644
--- a/tests/python/test_intercept.py
+++ b/tests/python/test_intercept.py
@@ -1,7 +1,6 @@
 from itertools import product

 import pytest
-
 from xgboost.testing.intercept import (
     run_adaptive,
     run_exp_family,
diff --git a/tests/python/test_linear.py b/tests/python/test_linear.py
index b3d573537d7f..cc9bfb286865 100644
--- a/tests/python/test_linear.py
+++ b/tests/python/test_linear.py
@@ -1,25 +1,29 @@
 from typing import Dict

-from hypothesis import given, note, settings, strategies
-
 import xgboost as xgb
+from hypothesis import given, note, settings, strategies
 from xgboost import testing as tm

 pytestmark = tm.timeout(20)

-parameter_strategy = strategies.fixed_dictionaries({
-    'booster': strategies.just('gblinear'),
-    'eta': strategies.floats(0.01, 0.25),
-    'tolerance': strategies.floats(1e-5, 1e-2),
-    'nthread': strategies.integers(1, 4),
-})
+parameter_strategy = strategies.fixed_dictionaries(
+    {
+        "booster": strategies.just("gblinear"),
+        "eta": strategies.floats(0.01, 0.25),
+        "tolerance": strategies.floats(1e-5, 1e-2),
+        "nthread": strategies.integers(1, 4),
+    }
+)

-coord_strategy = strategies.fixed_dictionaries({
-    'feature_selector': strategies.sampled_from(['cyclic', 'shuffle',
-                                                 'greedy', 'thrifty']),
-    'top_k': strategies.integers(1, 10),
-})
+coord_strategy = strategies.fixed_dictionaries(
+    {
+        "feature_selector": strategies.sampled_from(
+            ["cyclic", "shuffle", "greedy", "thrifty"]
+        ),
+        "top_k": strategies.integers(1, 10),
+    }
+)


 def train_result(param: dict, dmat: xgb.DMatrix, num_rounds: int) -> Dict[str, Dict]:
@@ -40,14 +44,16 @@ class TestLinear:
         parameter_strategy,
         strategies.integers(10, 50),
         tm.make_dataset_strategy(),
-        coord_strategy
+        coord_strategy,
     )
     @settings(deadline=None, max_examples=20, print_blob=True)
     def test_coordinate(self, param, num_rounds, dataset, coord_param):
-        param['updater'] = 'coord_descent'
+        param["updater"] = "coord_descent"
         param.update(coord_param)
         param = dataset.set_params(param)
-        result = train_result(param, dataset.get_dmat(), num_rounds)['train'][dataset.metric]
+        result = train_result(param, dataset.get_dmat(), num_rounds)["train"][
+            dataset.metric
+        ]
         note(result)
         assert tm.non_increasing(result, 5e-4)

@@ -60,27 +66,31 @@ def test_coordinate(self, param, num_rounds, dataset, coord_param):
         tm.make_dataset_strategy(),
         coord_strategy,
         strategies.floats(1e-5, 0.8),
-        strategies.floats(1e-5, 0.8)
+        strategies.floats(1e-5, 0.8),
     )
     @settings(deadline=None, max_examples=20, print_blob=True)
-    def test_coordinate_regularised(self, param, num_rounds, dataset, coord_param, alpha, lambd):
-        param['updater'] = 'coord_descent'
-        param['alpha'] = alpha
-
param['lambda'] = lambd + def test_coordinate_regularised( + self, param, num_rounds, dataset, coord_param, alpha, lambd + ): + param["updater"] = "coord_descent" + param["alpha"] = alpha + param["lambda"] = lambd param.update(coord_param) param = dataset.set_params(param) - result = train_result(param, dataset.get_dmat(), num_rounds)['train'][dataset.metric] + result = train_result(param, dataset.get_dmat(), num_rounds)["train"][ + dataset.metric + ] note(result) assert tm.non_increasing([result[0], result[-1]]) - @given( - parameter_strategy, strategies.integers(10, 50), tm.make_dataset_strategy() - ) + @given(parameter_strategy, strategies.integers(10, 50), tm.make_dataset_strategy()) @settings(deadline=None, max_examples=20, print_blob=True) def test_shotgun(self, param, num_rounds, dataset): - param['updater'] = 'shotgun' + param["updater"] = "shotgun" param = dataset.set_params(param) - result = train_result(param, dataset.get_dmat(), num_rounds)['train'][dataset.metric] + result = train_result(param, dataset.get_dmat(), num_rounds)["train"][ + dataset.metric + ] note(result) # shotgun is non-deterministic, so we relax the test by only using first and last # iteration. @@ -95,14 +105,16 @@ def test_shotgun(self, param, num_rounds, dataset): strategies.integers(10, 50), tm.make_dataset_strategy(), strategies.floats(1e-5, 1.0), - strategies.floats(1e-5, 1.0) + strategies.floats(1e-5, 1.0), ) @settings(deadline=None, max_examples=20, print_blob=True) def test_shotgun_regularised(self, param, num_rounds, dataset, alpha, lambd): - param['updater'] = 'shotgun' - param['alpha'] = alpha - param['lambda'] = lambd + param["updater"] = "shotgun" + param["alpha"] = alpha + param["lambda"] = lambd param = dataset.set_params(param) - result = train_result(param, dataset.get_dmat(), num_rounds)['train'][dataset.metric] + result = train_result(param, dataset.get_dmat(), num_rounds)["train"][ + dataset.metric + ] note(result) assert tm.non_increasing([result[0], result[-1]]) diff --git a/tests/python/test_model_compatibility.py b/tests/python/test_model_compatibility.py index 719a35562a40..fa0c9fdc87fc 100644 --- a/tests/python/test_model_compatibility.py +++ b/tests/python/test_model_compatibility.py @@ -8,7 +8,6 @@ import generate_models as gm import pytest - import xgboost from xgboost import testing as tm from xgboost.testing.updater import get_basescore diff --git a/tests/python/test_model_io.py b/tests/python/test_model_io.py index 5d9979c26729..58ad01281f29 100644 --- a/tests/python/test_model_io.py +++ b/tests/python/test_model_io.py @@ -8,7 +8,6 @@ import numpy as np import pytest - import xgboost as xgb from xgboost import testing as tm @@ -149,9 +148,9 @@ def test_with_pathlib(self) -> None: def dump_assertions(dump: List[str]) -> None: """Assertions for the expected dump from Booster""" assert len(dump) == 1, "Exepcted only 1 tree to be dumped." - assert ( - len(dump[0].splitlines()) == 3 - ), "Expected 1 root and 2 leaves - 3 lines." + assert len(dump[0].splitlines()) == 3, ( + "Expected 1 root and 2 leaves - 3 lines." 
+ ) # load the model again using Path bst2 = xgb.Booster(model_file=save_path) diff --git a/tests/python/test_monotone_constraints.py b/tests/python/test_monotone_constraints.py index 312d363c4c91..49086208e9be 100644 --- a/tests/python/test_monotone_constraints.py +++ b/tests/python/test_monotone_constraints.py @@ -2,7 +2,6 @@ import numpy as np import pytest - import xgboost as xgb from xgboost import testing as tm from xgboost.testing.monotone_constraints import training_dset, x, y @@ -48,7 +47,6 @@ def is_correctly_constrained(learner, feature_names=None): class TestMonotoneConstraints: def test_monotone_constraints_for_exact_tree_method(self) -> None: - # first check monotonicity for the 'exact' tree method params_for_constrained_exact_method = { "tree_method": "exact", @@ -85,7 +83,6 @@ def test_monotone_constraints_tuple(self) -> None: @pytest.mark.parametrize("format", [dict, list]) def test_monotone_constraints_feature_names(self, format: Type) -> None: - # next check monotonicity when initializing monotone_constraints by feature names params = { "tree_method": "hist", diff --git a/tests/python/test_openmp.py b/tests/python/test_openmp.py index 4710426b0e52..af6ef4b7ad3c 100644 --- a/tests/python/test_openmp.py +++ b/tests/python/test_openmp.py @@ -4,7 +4,6 @@ import numpy as np import pytest - import xgboost as xgb from xgboost import testing as tm @@ -15,65 +14,67 @@ class TestOMP: def test_omp(self): dtrain, dtest = tm.load_agaricus(__file__) - param = {'booster': 'gbtree', - 'objective': 'binary:logistic', - 'grow_policy': 'depthwise', - 'tree_method': 'hist', - 'eval_metric': 'error', - 'max_depth': 5, - 'min_child_weight': 0} - - watchlist = [(dtest, 'eval'), (dtrain, 'train')] + param = { + "booster": "gbtree", + "objective": "binary:logistic", + "grow_policy": "depthwise", + "tree_method": "hist", + "eval_metric": "error", + "max_depth": 5, + "min_child_weight": 0, + } + + watchlist = [(dtest, "eval"), (dtrain, "train")] num_round = 5 def run_trial(): res = {} bst = xgb.train(param, dtrain, num_round, watchlist, evals_result=res) - metrics = [res['train']['error'][-1], res['eval']['error'][-1]] + metrics = [res["train"]["error"][-1], res["eval"]["error"][-1]] preds = bst.predict(dtest) return metrics, preds def consist_test(title, n): auc, pred = run_trial() - for i in range(n-1): + for i in range(n - 1): auc2, pred2 = run_trial() try: assert auc == auc2 assert np.array_equal(pred, pred2) except Exception as e: - print('-------test %s failed, num_trial: %d-------' % (title, i)) + print("-------test %s failed, num_trial: %d-------" % (title, i)) raise e auc, pred = auc2, pred2 return auc, pred - print('test approx ...') - param['tree_method'] = 'approx' + print("test approx ...") + param["tree_method"] = "approx" n_trials = 10 - param['nthread'] = 1 - auc_1, pred_1 = consist_test('approx_thread_1', n_trials) + param["nthread"] = 1 + auc_1, pred_1 = consist_test("approx_thread_1", n_trials) - param['nthread'] = 2 - auc_2, pred_2 = consist_test('approx_thread_2', n_trials) + param["nthread"] = 2 + auc_2, pred_2 = consist_test("approx_thread_2", n_trials) - param['nthread'] = 3 - auc_3, pred_3 = consist_test('approx_thread_3', n_trials) + param["nthread"] = 3 + auc_3, pred_3 = consist_test("approx_thread_3", n_trials) assert auc_1 == auc_2 == auc_3 assert np.array_equal(auc_1, auc_2) assert np.array_equal(auc_1, auc_3) - print('test hist ...') - param['tree_method'] = 'hist' + print("test hist ...") + param["tree_method"] = "hist" - param['nthread'] = 1 - auc_1, pred_1 = 
consist_test('hist_thread_1', n_trials) + param["nthread"] = 1 + auc_1, pred_1 = consist_test("hist_thread_1", n_trials) - param['nthread'] = 2 - auc_2, pred_2 = consist_test('hist_thread_2', n_trials) + param["nthread"] = 2 + auc_2, pred_2 = consist_test("hist_thread_2", n_trials) - param['nthread'] = 3 - auc_3, pred_3 = consist_test('hist_thread_3', n_trials) + param["nthread"] = 3 + auc_3, pred_3 = consist_test("hist_thread_3", n_trials) assert auc_1 == auc_2 == auc_3 assert np.array_equal(auc_1, auc_2) @@ -83,9 +84,8 @@ def consist_test(title, n): @pytest.mark.timeout(30) def test_with_omp_thread_limit(self): args = [ - "python", os.path.join( - os.path.dirname(tm.normpath(__file__)), "with_omp_limit.py" - ) + "python", + os.path.join(os.path.dirname(tm.normpath(__file__)), "with_omp_limit.py"), ] results = [] with tempfile.TemporaryDirectory() as tmpdir: diff --git a/tests/python/test_ordinal.py b/tests/python/test_ordinal.py index 3e76a8b37eee..cf0193d946ed 100644 --- a/tests/python/test_ordinal.py +++ b/tests/python/test_ordinal.py @@ -1,5 +1,4 @@ import pytest - from xgboost import testing as tm from xgboost.testing.ordinal import ( run_cat_container, diff --git a/tests/python/test_parse_tree.py b/tests/python/test_parse_tree.py index 0155acca1d79..00002c8bb33f 100644 --- a/tests/python/test_parse_tree.py +++ b/tests/python/test_parse_tree.py @@ -1,6 +1,5 @@ import numpy as np import pytest - import xgboost as xgb from xgboost import testing as tm from xgboost.testing.parse_tree import ( diff --git a/tests/python/test_pickling.py b/tests/python/test_pickling.py index 198c7f0866fd..48d03d039dce 100644 --- a/tests/python/test_pickling.py +++ b/tests/python/test_pickling.py @@ -3,7 +3,6 @@ import pickle import numpy as np - import xgboost as xgb kRows = 100 diff --git a/tests/python/test_plotting.py b/tests/python/test_plotting.py index ddcf9c571116..9decef5b9555 100644 --- a/tests/python/test_plotting.py +++ b/tests/python/test_plotting.py @@ -1,6 +1,5 @@ import numpy as np import pytest - import xgboost as xgb from xgboost import testing as tm from xgboost.testing.plotting import run_categorical diff --git a/tests/python/test_predict.py b/tests/python/test_predict.py index 5994d653a796..55b7f052892f 100644 --- a/tests/python/test_predict.py +++ b/tests/python/test_predict.py @@ -6,9 +6,8 @@ import numpy as np import pandas as pd import pytest -from scipy import sparse - import xgboost as xgb +from scipy import sparse from xgboost import testing as tm from xgboost.testing.data import get_california_housing, np_dtypes, pd_dtypes from xgboost.testing.predict import run_base_margin_vs_base_score, run_predict_leaf diff --git a/tests/python/test_quantile_dmatrix.py b/tests/python/test_quantile_dmatrix.py index 7883a1417cd1..40e387d3f896 100644 --- a/tests/python/test_quantile_dmatrix.py +++ b/tests/python/test_quantile_dmatrix.py @@ -2,10 +2,9 @@ import numpy as np import pytest +import xgboost as xgb from hypothesis import given, settings, strategies from scipy import sparse - -import xgboost as xgb from xgboost.testing import ( IteratorForTest, make_batches, diff --git a/tests/python/test_shap.py b/tests/python/test_shap.py index 097298f07d68..3517a142b61a 100644 --- a/tests/python/test_shap.py +++ b/tests/python/test_shap.py @@ -3,7 +3,6 @@ import numpy as np import scipy.special - import xgboost as xgb from xgboost import testing as tm diff --git a/tests/python/test_survival.py b/tests/python/test_survival.py index e5ca30fffd07..1d2297e376e9 100644 --- a/tests/python/test_survival.py 
+++ b/tests/python/test_survival.py @@ -4,7 +4,6 @@ import numpy as np import pytest - import xgboost as xgb from xgboost import testing as tm @@ -56,7 +55,7 @@ def run(evals: Optional[list]) -> None: def test_aft_survival_toy_data( - toy_data: Tuple[xgb.DMatrix, np.ndarray, np.ndarray] + toy_data: Tuple[xgb.DMatrix, np.ndarray, np.ndarray], ) -> None: # See demo/aft_survival/aft_survival_viz_demo.py X = np.array([1, 2, 3, 4, 5]).reshape((-1, 1)) @@ -125,44 +124,56 @@ def test_aft_empty_dmatrix(): y_lower, y_upper = np.array([]), np.array([]) dtrain = xgb.DMatrix(X) dtrain.set_info(label_lower_bound=y_lower, label_upper_bound=y_upper) - bst = xgb.train({'objective': 'survival:aft', 'tree_method': 'hist'}, - dtrain, num_boost_round=2, evals=[(dtrain, 'train')]) + bst = xgb.train( + {"objective": "survival:aft", "tree_method": "hist"}, + dtrain, + num_boost_round=2, + evals=[(dtrain, "train")], + ) @pytest.mark.skipif(**tm.no_pandas()) def test_aft_survival_demo_data(): import pandas as pd - df = pd.read_csv(os.path.join(dpath, 'veterans_lung_cancer.csv')) - y_lower_bound = df['Survival_label_lower_bound'] - y_upper_bound = df['Survival_label_upper_bound'] - X = df.drop(['Survival_label_lower_bound', 'Survival_label_upper_bound'], axis=1) + df = pd.read_csv(os.path.join(dpath, "veterans_lung_cancer.csv")) + + y_lower_bound = df["Survival_label_lower_bound"] + y_upper_bound = df["Survival_label_upper_bound"] + X = df.drop(["Survival_label_lower_bound", "Survival_label_upper_bound"], axis=1) dtrain = xgb.DMatrix(X) - dtrain.set_float_info('label_lower_bound', y_lower_bound) - dtrain.set_float_info('label_upper_bound', y_upper_bound) - - base_params = {'verbosity': 0, - 'objective': 'survival:aft', - 'eval_metric': 'aft-nloglik', - 'tree_method': 'hist', - 'learning_rate': 0.05, - 'aft_loss_distribution_scale': 1.20, - 'max_depth': 6, - 'lambda': 0.01, - 'alpha': 0.02} + dtrain.set_float_info("label_lower_bound", y_lower_bound) + dtrain.set_float_info("label_upper_bound", y_upper_bound) + + base_params = { + "verbosity": 0, + "objective": "survival:aft", + "eval_metric": "aft-nloglik", + "tree_method": "hist", + "learning_rate": 0.05, + "aft_loss_distribution_scale": 1.20, + "max_depth": 6, + "lambda": 0.01, + "alpha": 0.02, + } nloglik_rec = {} - dists = ['normal', 'logistic', 'extreme'] + dists = ["normal", "logistic", "extreme"] for dist in dists: params = base_params - params.update({'aft_loss_distribution': dist}) + params.update({"aft_loss_distribution": dist}) evals_result = {} - bst = xgb.train(params, dtrain, num_boost_round=500, evals=[(dtrain, 'train')], - evals_result=evals_result) - nloglik_rec[dist] = evals_result['train']['aft-nloglik'] + bst = xgb.train( + params, + dtrain, + num_boost_round=500, + evals=[(dtrain, "train")], + evals_result=evals_result, + ) + nloglik_rec[dist] = evals_result["train"]["aft-nloglik"] # AFT metric (negative log likelihood) improve monotonically assert all(p >= q for p, q in zip(nloglik_rec[dist], nloglik_rec[dist][:1])) # For this data, normal distribution works the best - assert nloglik_rec['normal'][-1] < 4.9 - assert nloglik_rec['logistic'][-1] > 4.9 - assert nloglik_rec['extreme'][-1] > 4.9 + assert nloglik_rec["normal"][-1] < 4.9 + assert nloglik_rec["logistic"][-1] > 4.9 + assert nloglik_rec["extreme"][-1] > 4.9 diff --git a/tests/python/test_tracker.py b/tests/python/test_tracker.py index 1183656ddae4..0e7811ee07c9 100644 --- a/tests/python/test_tracker.py +++ b/tests/python/test_tracker.py @@ -6,7 +6,6 @@ import numpy as np import pytest 
from hypothesis import HealthCheck, given, settings, strategies - from xgboost import RabitTracker, collective from xgboost import testing as tm @@ -160,7 +159,6 @@ def test_broadcast(): @pytest.mark.skipif(**tm.no_dask()) def test_rank_assignment() -> None: from distributed import Client, LocalCluster - from xgboost import dask as dxgb from xgboost.testing.dask import get_rabit_args diff --git a/tests/python/test_training_continuation.py b/tests/python/test_training_continuation.py index 9e9f37ea51a8..c9360018a069 100644 --- a/tests/python/test_training_continuation.py +++ b/tests/python/test_training_continuation.py @@ -3,7 +3,6 @@ import numpy as np import pytest - import xgboost as xgb from xgboost import testing as tm from xgboost.testing.continuation import run_training_continuation_model_output diff --git a/tests/python/test_tree_regularization.py b/tests/python/test_tree_regularization.py index c5bace3b61bb..4a78299da9fd 100644 --- a/tests/python/test_tree_regularization.py +++ b/tests/python/test_tree_regularization.py @@ -1,7 +1,6 @@ import numpy as np -from numpy.testing import assert_approx_equal - import xgboost as xgb +from numpy.testing import assert_approx_equal train_data = xgb.DMatrix(np.array([[1]]), label=np.array([1])) diff --git a/tests/python/test_with_arrow.py b/tests/python/test_with_arrow.py index 3cfc07296f65..fdab62d67353 100644 --- a/tests/python/test_with_arrow.py +++ b/tests/python/test_with_arrow.py @@ -2,7 +2,6 @@ import numpy as np import pytest - import xgboost as xgb from xgboost import testing as tm from xgboost.compat import is_dataframe diff --git a/tests/python/test_with_modin.py b/tests/python/test_with_modin.py index ea1ed691b820..245726c05fe3 100644 --- a/tests/python/test_with_modin.py +++ b/tests/python/test_with_modin.py @@ -1,7 +1,6 @@ import numpy as np import pandas as pd import pytest - import xgboost as xgb from xgboost import testing as tm from xgboost.compat import is_dataframe @@ -25,60 +24,60 @@ def test_type_check() -> None: class TestModin: @pytest.mark.xfail def test_modin(self) -> None: - df = md.DataFrame([[1, 2., True], [2, 3., False]], - columns=['a', 'b', 'c']) + df = md.DataFrame([[1, 2.0, True], [2, 3.0, False]], columns=["a", "b", "c"]) dm = xgb.DMatrix(df, label=md.Series([1, 2])) - assert dm.feature_names == ['a', 'b', 'c'] - assert dm.feature_types == ['int', 'float', 'i'] + assert dm.feature_names == ["a", "b", "c"] + assert dm.feature_types == ["int", "float", "i"] assert dm.num_row() == 2 assert dm.num_col() == 3 np.testing.assert_array_equal(dm.get_label(), np.array([1, 2])) # overwrite feature_names and feature_types - dm = xgb.DMatrix(df, label=md.Series([1, 2]), - feature_names=['x', 'y', 'z'], - feature_types=['q', 'q', 'q']) - assert dm.feature_names == ['x', 'y', 'z'] - assert dm.feature_types == ['q', 'q', 'q'] + dm = xgb.DMatrix( + df, + label=md.Series([1, 2]), + feature_names=["x", "y", "z"], + feature_types=["q", "q", "q"], + ) + assert dm.feature_names == ["x", "y", "z"] + assert dm.feature_types == ["q", "q", "q"] assert dm.num_row() == 2 assert dm.num_col() == 3 # incorrect dtypes - df = md.DataFrame([[1, 2., 'x'], [2, 3., 'y']], - columns=['a', 'b', 'c']) + df = md.DataFrame([[1, 2.0, "x"], [2, 3.0, "y"]], columns=["a", "b", "c"]) with pytest.raises(ValueError): xgb.DMatrix(df) # numeric columns - df = md.DataFrame([[1, 2., True], [2, 3., False]]) + df = md.DataFrame([[1, 2.0, True], [2, 3.0, False]]) dm = xgb.DMatrix(df, label=md.Series([1, 2])) - assert dm.feature_names == ['0', '1', '2'] - assert 
dm.feature_types == ['int', 'float', 'i'] + assert dm.feature_names == ["0", "1", "2"] + assert dm.feature_types == ["int", "float", "i"] assert dm.num_row() == 2 assert dm.num_col() == 3 np.testing.assert_array_equal(dm.get_label(), np.array([1, 2])) - df = md.DataFrame([[1, 2., 1], [2, 3., 1]], columns=[4, 5, 6]) + df = md.DataFrame([[1, 2.0, 1], [2, 3.0, 1]], columns=[4, 5, 6]) dm = xgb.DMatrix(df, label=md.Series([1, 2])) - assert dm.feature_names == ['4', '5', '6'] - assert dm.feature_types == ['int', 'float', 'int'] + assert dm.feature_names == ["4", "5", "6"] + assert dm.feature_types == ["int", "float", "int"] assert dm.num_row() == 2 assert dm.num_col() == 3 - df = md.DataFrame({'A': ['X', 'Y', 'Z'], 'B': [1, 2, 3]}) + df = md.DataFrame({"A": ["X", "Y", "Z"], "B": [1, 2, 3]}) dummies = md.get_dummies(df) # B A_X A_Y A_Z # 0 1 1 0 0 # 1 2 0 1 0 # 2 3 0 0 1 - result, _, _ = xgb.data._transform_pandas_df(dummies, - enable_categorical=False) - exp = np.array([[1., 1., 0., 0.], - [2., 0., 1., 0.], - [3., 0., 0., 1.]]).T + result, _, _ = xgb.data._transform_pandas_df(dummies, enable_categorical=False) + exp = np.array( + [[1.0, 1.0, 0.0, 0.0], [2.0, 0.0, 1.0, 0.0], [3.0, 0.0, 0.0, 1.0]] + ).T np.testing.assert_array_equal(result.columns, exp) dm = xgb.DMatrix(dummies) - assert dm.feature_names == ['B', 'A_X', 'A_Y', 'A_Z'] + assert dm.feature_names == ["B", "A_X", "A_Y", "A_Z"] if int(pd.__version__[0]) >= 2: assert dm.feature_types == ["int", "i", "i", "i"] else: @@ -87,10 +86,10 @@ def test_modin(self) -> None: assert dm.num_row() == 3 assert dm.num_col() == 4 - df = md.DataFrame({'A=1': [1, 2, 3], 'A=2': [4, 5, 6]}) + df = md.DataFrame({"A=1": [1, 2, 3], "A=2": [4, 5, 6]}) dm = xgb.DMatrix(df) - assert dm.feature_names == ['A=1', 'A=2'] - assert dm.feature_types == ['int', 'int'] + assert dm.feature_names == ["A=1", "A=2"] + assert dm.feature_types == ["int", "int"] assert dm.num_row() == 3 assert dm.num_col() == 2 @@ -98,23 +97,26 @@ def test_modin(self) -> None: dm_int = xgb.DMatrix(df_int) df_range = md.DataFrame([[1, 1.1], [2, 2.2]], columns=range(9, 11, 1)) dm_range = xgb.DMatrix(df_range) - assert dm_int.feature_names == ['9', '10'] # assert not "9 " + assert dm_int.feature_names == ["9", "10"] # assert not "9 " assert dm_int.feature_names == dm_range.feature_names # test MultiIndex as columns df = md.DataFrame( - [ - (1, 2, 3, 4, 5, 6), - (6, 5, 4, 3, 2, 1) - ], - columns=md.MultiIndex.from_tuples(( - ('a', 1), ('a', 2), ('a', 3), - ('b', 1), ('b', 2), ('b', 3), - )) + [(1, 2, 3, 4, 5, 6), (6, 5, 4, 3, 2, 1)], + columns=md.MultiIndex.from_tuples( + ( + ("a", 1), + ("a", 2), + ("a", 3), + ("b", 1), + ("b", 2), + ("b", 3), + ) + ), ) dm = xgb.DMatrix(df) - assert dm.feature_names == ['a 1', 'a 2', 'a 3', 'b 1', 'b 2', 'b 3'] - assert dm.feature_types == ['int', 'int', 'int', 'int', 'int', 'int'] + assert dm.feature_names == ["a 1", "a 2", "a 3", "b 1", "b 2", "b 3"] + assert dm.feature_types == ["int", "int", "int", "int", "int", "int"] assert dm.num_row() == 2 assert dm.num_col() == 6 @@ -130,9 +132,7 @@ def test_modin_label(self): xgb.data._transform_pandas_df(df, False, None, None, "label") df = md.DataFrame({"A": np.array([1, 2, 3], dtype=int)}) - result, _, _ = xgb.data._transform_pandas_df( - df, False, None, None, "label" - ) + result, _, _ = xgb.data._transform_pandas_df(df, False, None, None, "label") np.testing.assert_array_equal( np.stack(result.columns, axis=1), np.array([[1.0], [2.0], [3.0]], dtype=float), diff --git a/tests/python/test_with_pandas.py 
b/tests/python/test_with_pandas.py index 5a3e28c44222..e9e894bc8d89 100644 --- a/tests/python/test_with_pandas.py +++ b/tests/python/test_with_pandas.py @@ -2,7 +2,6 @@ import numpy as np import pytest - import xgboost as xgb from xgboost import testing as tm from xgboost.compat import is_dataframe diff --git a/tests/python/test_with_polars.py b/tests/python/test_with_polars.py index c3686448f8a0..08341f1c1e85 100644 --- a/tests/python/test_with_polars.py +++ b/tests/python/test_with_polars.py @@ -7,7 +7,6 @@ import numpy as np import pytest - import xgboost as xgb from xgboost.compat import is_dataframe diff --git a/tests/python/test_with_scipy.py b/tests/python/test_with_scipy.py index 3990c4b0580e..cb3db13c9b9d 100644 --- a/tests/python/test_with_scipy.py +++ b/tests/python/test_with_scipy.py @@ -5,7 +5,6 @@ import numpy as np import pytest import scipy.sparse - import xgboost as xgb from xgboost.testing.utils import predictor_equal diff --git a/tests/python/test_with_shap.py b/tests/python/test_with_shap.py index 1aeb56f7a54f..f10e4d4e61ee 100644 --- a/tests/python/test_with_shap.py +++ b/tests/python/test_with_shap.py @@ -1,6 +1,5 @@ import numpy as np import pytest - import xgboost as xgb from xgboost.testing.data import get_california_housing diff --git a/tests/python/test_with_sklearn.py b/tests/python/test_with_sklearn.py index 936d22c0200e..67236677e92c 100644 --- a/tests/python/test_with_sklearn.py +++ b/tests/python/test_with_sklearn.py @@ -8,9 +8,8 @@ import numpy as np import pytest -from sklearn.utils.estimator_checks import parametrize_with_checks - import xgboost as xgb +from sklearn.utils.estimator_checks import parametrize_with_checks from xgboost import testing as tm from xgboost.testing.data import get_california_housing from xgboost.testing.ranking import run_ranking_categorical, run_ranking_qid_df @@ -33,17 +32,18 @@ def test_binary_classification(): from sklearn.model_selection import KFold digits = load_digits(n_class=2) - y = digits['target'] - X = digits['data'] + y = digits["target"] + X = digits["data"] kf = KFold(n_splits=2, shuffle=True, random_state=rng) for cls in (xgb.XGBClassifier, xgb.XGBRFClassifier): for train_index, test_index in kf.split(X, y): - clf = cls(random_state=42, eval_metric=['auc', 'logloss']) + clf = cls(random_state=42, eval_metric=["auc", "logloss"]) xgb_model = clf.fit(X[train_index], y[train_index]) preds = xgb_model.predict(X[test_index]) labels = y[test_index] - err = sum(1 for i in range(len(preds)) - if int(preds[i] > 0.5) != labels[i]) / float(len(preds)) + err = sum( + 1 for i in range(len(preds)) if int(preds[i] > 0.5) != labels[i] + ) / float(len(preds)) assert err < 0.1 @@ -232,13 +232,12 @@ def test_stacking_regression(): X, y = load_diabetes(return_X_y=True) estimators = [ - ('gbm', xgb.sklearn.XGBRegressor(objective='reg:squarederror')), - ('lr', RidgeCV()) + ("gbm", xgb.sklearn.XGBRegressor(objective="reg:squarederror")), + ("lr", RidgeCV()), ] reg = StackingRegressor( estimators=estimators, - final_estimator=RandomForestRegressor(n_estimators=10, - random_state=42) + final_estimator=RandomForestRegressor(n_estimators=10, random_state=42), ) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42) @@ -256,9 +255,8 @@ def test_stacking_classification(): X, y = load_iris(return_X_y=True) estimators = [ - ('gbm', xgb.sklearn.XGBClassifier()), - ('svr', make_pipeline(StandardScaler(), - LinearSVC(random_state=42))) + ("gbm", xgb.sklearn.XGBClassifier()), + ("svr", make_pipeline(StandardScaler(), 
LinearSVC(random_state=42))), ] clf = StackingClassifier( estimators=estimators, final_estimator=LogisticRegression() @@ -284,26 +282,89 @@ def test_feature_importances_weight(): base_score=0.5, ).fit(X, y) - exp = np.array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.00833333, 0., - 0., 0., 0., 0., 0., 0., 0., 0.025, 0.14166667, 0., 0., 0., - 0., 0., 0., 0.00833333, 0.25833333, 0., 0., 0., 0., - 0.03333334, 0.03333334, 0., 0.32499999, 0., 0., 0., 0., - 0.05, 0.06666667, 0., 0., 0., 0., 0., 0., 0., 0.04166667, - 0., 0., 0., 0., 0., 0., 0., 0.00833333, 0., 0., 0., 0., - 0.], dtype=np.float32) + exp = np.array( + [ + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.00833333, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.025, + 0.14166667, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.00833333, + 0.25833333, + 0.0, + 0.0, + 0.0, + 0.0, + 0.03333334, + 0.03333334, + 0.0, + 0.32499999, + 0.0, + 0.0, + 0.0, + 0.0, + 0.05, + 0.06666667, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.04166667, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.00833333, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + ], + dtype=np.float32, + ) np.testing.assert_almost_equal(xgb_model.feature_importances_, exp) # numeric columns import pandas as pd - y = pd.Series(digits['target']) - X = pd.DataFrame(digits['data']) + + y = pd.Series(digits["target"]) + X = pd.DataFrame(digits["data"]) xgb_model = xgb.XGBClassifier( random_state=0, tree_method="exact", learning_rate=0.1, - base_score=.5, - importance_type="weight" + base_score=0.5, + importance_type="weight", ).fit(X, y) np.testing.assert_almost_equal(xgb_model.feature_importances_, exp) @@ -312,7 +373,7 @@ def test_feature_importances_weight(): tree_method="exact", learning_rate=0.1, importance_type="weight", - base_score=.5, + base_score=0.5, ).fit(X, y) np.testing.assert_almost_equal(xgb_model.feature_importances_, exp) @@ -349,30 +410,93 @@ def test_feature_importances_gain(): from sklearn.datasets import load_digits digits = load_digits(n_class=2) - y = digits['target'] - X = digits['data'] + y = digits["target"] + X = digits["data"] xgb_model = xgb.XGBClassifier( - random_state=0, tree_method="exact", + random_state=0, + tree_method="exact", learning_rate=0.1, importance_type="gain", base_score=0.5, ).fit(X, y) - exp = np.array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., - 0.00326159, 0., 0., 0., 0., 0., 0., 0., 0., - 0.00297238, 0.00988034, 0., 0., 0., 0., 0., 0., - 0.03512521, 0.41123885, 0., 0., 0., 0., - 0.01326332, 0.00160674, 0., 0.4206952, 0., 0., 0., - 0., 0.00616747, 0.01237546, 0., 0., 0., 0., 0., - 0., 0., 0.08240705, 0., 0., 0., 0., 0., 0., 0., - 0.00100649, 0., 0., 0., 0., 0.], dtype=np.float32) + exp = np.array( + [ + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.00326159, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.00297238, + 0.00988034, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.03512521, + 0.41123885, + 0.0, + 0.0, + 0.0, + 0.0, + 0.01326332, + 0.00160674, + 0.0, + 0.4206952, + 0.0, + 0.0, + 0.0, + 0.0, + 0.00616747, + 0.01237546, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.08240705, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.00100649, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + ], + dtype=np.float32, + ) np.testing.assert_almost_equal(xgb_model.feature_importances_, exp) # numeric columns import pandas as pd - y = pd.Series(digits['target']) - X = pd.DataFrame(digits['data']) + + y = pd.Series(digits["target"]) + X = pd.DataFrame(digits["data"]) xgb_model 
= xgb.XGBClassifier( random_state=0, tree_method="exact", @@ -400,9 +524,10 @@ def test_feature_importances_gain(): def test_select_feature(): from sklearn.datasets import load_digits from sklearn.feature_selection import SelectFromModel + digits = load_digits(n_class=2) - y = digits['target'] - X = digits['data'] + y = digits["target"] + X = digits["data"] cls = xgb.XGBClassifier() cls.fit(X, y) selector = SelectFromModel(cls, prefit=True, max_features=1) @@ -551,11 +676,7 @@ def dummy_objective(y_true, y_preds): raise XGBCustomObjectiveException() xgb_model = xgb.XGBClassifier(objective=dummy_objective) - np.testing.assert_raises( - XGBCustomObjectiveException, - xgb_model.fit, - X, y - ) + np.testing.assert_raises(XGBCustomObjectiveException, xgb_model.fit, X, y) cls = xgb.XGBClassifier(n_estimators=1) cls.fit(X, y) @@ -578,8 +699,9 @@ def run_sklearn_api(booster, error, n_est): from sklearn.model_selection import train_test_split iris = load_iris() - tr_d, te_d, tr_l, te_l = train_test_split(iris.data, iris.target, - train_size=120, test_size=0.2) + tr_d, te_d, tr_l, te_l = train_test_split( + iris.data, iris.target, train_size=120, test_size=0.2 + ) classifier = xgb.XGBClassifier(booster=booster, n_estimators=n_est) classifier.fit(tr_d, tr_l) @@ -606,16 +728,17 @@ def test_sklearn_plotting(): classifier.fit(iris.data, iris.target) import matplotlib - matplotlib.use('Agg') + + matplotlib.use("Agg") from graphviz import Source from matplotlib.axes import Axes ax = xgb.plot_importance(classifier) assert isinstance(ax, Axes) - assert ax.get_title() == 'Feature importance' - assert ax.get_xlabel() == 'Importance score' - assert ax.get_ylabel() == 'Features' + assert ax.get_title() == "Feature importance" + assert ax.get_xlabel() == "Importance score" + assert ax.get_ylabel() == "Features" assert len(ax.patches) == 4 g = xgb.to_graphviz(classifier, num_trees=0) @@ -631,29 +754,43 @@ def test_sklearn_nfolds_cv(): from sklearn.model_selection import StratifiedKFold digits = load_digits(n_class=3) - X = digits['data'] - y = digits['target'] + X = digits["data"] + y = digits["target"] dm = xgb.DMatrix(X, label=y) params = { - 'max_depth': 2, - 'eta': 1, - 'verbosity': 0, - 'objective': - 'multi:softprob', - 'num_class': 3 + "max_depth": 2, + "eta": 1, + "verbosity": 0, + "objective": "multi:softprob", + "num_class": 3, } seed = 2016 nfolds = 5 skf = StratifiedKFold(n_splits=nfolds, shuffle=True, random_state=seed) - cv1 = xgb.cv(params, dm, num_boost_round=10, nfold=nfolds, - seed=seed, as_pandas=True) - cv2 = xgb.cv(params, dm, num_boost_round=10, nfold=nfolds, - folds=skf, seed=seed, as_pandas=True) - cv3 = xgb.cv(params, dm, num_boost_round=10, nfold=nfolds, - stratified=True, seed=seed, as_pandas=True) + cv1 = xgb.cv( + params, dm, num_boost_round=10, nfold=nfolds, seed=seed, as_pandas=True + ) + cv2 = xgb.cv( + params, + dm, + num_boost_round=10, + nfold=nfolds, + folds=skf, + seed=seed, + as_pandas=True, + ) + cv3 = xgb.cv( + params, + dm, + num_boost_round=10, + nfold=nfolds, + stratified=True, + seed=seed, + as_pandas=True, + ) assert cv1.shape[0] == cv2.shape[0] and cv2.shape[0] == cv3.shape[0] assert cv2.iloc[-1, 0] == cv3.iloc[-1, 0] @@ -687,26 +824,26 @@ def test_split_value_histograms(): def test_sklearn_random_state(): clf = xgb.XGBClassifier(random_state=402) - assert clf.get_xgb_params()['random_state'] == 402 + assert clf.get_xgb_params()["random_state"] == 402 clf = xgb.XGBClassifier(random_state=401) - assert clf.get_xgb_params()['random_state'] == 401 + assert 
clf.get_xgb_params()["random_state"] == 401 random_state = np.random.RandomState(seed=403) clf = xgb.XGBClassifier(random_state=random_state) - assert isinstance(clf.get_xgb_params()['random_state'], int) + assert isinstance(clf.get_xgb_params()["random_state"], int) random_state = np.random.default_rng(seed=404) clf = xgb.XGBClassifier(random_state=random_state) - assert isinstance(clf.get_xgb_params()['random_state'], int) + assert isinstance(clf.get_xgb_params()["random_state"], int) def test_sklearn_n_jobs(): clf = xgb.XGBClassifier(n_jobs=1) - assert clf.get_xgb_params()['n_jobs'] == 1 + assert clf.get_xgb_params()["n_jobs"] == 1 clf = xgb.XGBClassifier(n_jobs=2) - assert clf.get_xgb_params()['n_jobs'] == 2 + assert clf.get_xgb_params()["n_jobs"] == 2 def test_parameters_access(): @@ -806,7 +943,7 @@ def test_get_params_works_as_expected(): def test_kwargs_error(): - params = {'updater': 'grow_gpu_hist', 'subsample': .5, 'n_jobs': -1} + params = {"updater": "grow_gpu_hist", "subsample": 0.5, "n_jobs": -1} with pytest.raises(TypeError): clf = xgb.XGBClassifier(n_jobs=1000, **params) assert isinstance(clf, xgb.XGBClassifier) @@ -935,42 +1072,54 @@ def test_RFECV(): # Regression X, y = load_diabetes(return_X_y=True) - bst = xgb.XGBRegressor(booster='gblinear', learning_rate=0.1, - n_estimators=10, - objective='reg:squarederror', - random_state=0, verbosity=0) - rfecv = RFECV( - estimator=bst, step=1, cv=3, scoring='neg_mean_squared_error') + bst = xgb.XGBRegressor( + booster="gblinear", + learning_rate=0.1, + n_estimators=10, + objective="reg:squarederror", + random_state=0, + verbosity=0, + ) + rfecv = RFECV(estimator=bst, step=1, cv=3, scoring="neg_mean_squared_error") rfecv.fit(X, y) # Binary classification X, y = load_breast_cancer(return_X_y=True) - bst = xgb.XGBClassifier(booster='gblinear', learning_rate=0.1, - n_estimators=10, - objective='binary:logistic', - random_state=0, verbosity=0) - rfecv = RFECV(estimator=bst, step=0.5, cv=3, scoring='roc_auc') + bst = xgb.XGBClassifier( + booster="gblinear", + learning_rate=0.1, + n_estimators=10, + objective="binary:logistic", + random_state=0, + verbosity=0, + ) + rfecv = RFECV(estimator=bst, step=0.5, cv=3, scoring="roc_auc") rfecv.fit(X, y) # Multi-class classification X, y = load_iris(return_X_y=True) - bst = xgb.XGBClassifier(base_score=0.4, booster='gblinear', - learning_rate=0.1, - n_estimators=10, - objective='multi:softprob', - random_state=0, reg_alpha=0.001, reg_lambda=0.01, - scale_pos_weight=0.5, verbosity=0) - rfecv = RFECV(estimator=bst, step=0.5, cv=3, scoring='neg_log_loss') + bst = xgb.XGBClassifier( + base_score=0.4, + booster="gblinear", + learning_rate=0.1, + n_estimators=10, + objective="multi:softprob", + random_state=0, + reg_alpha=0.001, + reg_lambda=0.01, + scale_pos_weight=0.5, + verbosity=0, + ) + rfecv = RFECV(estimator=bst, step=0.5, cv=3, scoring="neg_log_loss") rfecv.fit(X, y) - X[0:4, :] = np.nan # verify scikit_learn doesn't throw with nan + X[0:4, :] = np.nan # verify scikit_learn doesn't throw with nan reg = xgb.XGBRegressor() rfecv = RFECV(estimator=reg) rfecv.fit(X, y) cls = xgb.XGBClassifier() - rfecv = RFECV(estimator=cls, step=0.5, cv=3, - scoring='neg_mean_squared_error') + rfecv = RFECV(estimator=cls, step=0.5, cv=3, scoring="neg_mean_squared_error") rfecv.fit(X, y) @@ -979,13 +1128,12 @@ def test_XGBClassifier_resume(): from sklearn.metrics import log_loss with tempfile.TemporaryDirectory() as tempdir: - model1_path = os.path.join(tempdir, 'test_XGBClassifier.model') - model1_booster_path = 
os.path.join(tempdir, 'test_XGBClassifier.booster') + model1_path = os.path.join(tempdir, "test_XGBClassifier.model") + model1_booster_path = os.path.join(tempdir, "test_XGBClassifier.booster") X, Y = load_breast_cancer(return_X_y=True) - model1 = xgb.XGBClassifier( - learning_rate=0.3, random_state=0, n_estimators=8) + model1 = xgb.XGBClassifier(learning_rate=0.3, random_state=0, n_estimators=8) model1.fit(X, Y) pred1 = model1.predict(X) @@ -1049,6 +1197,7 @@ def test_parameter_validation(): def test_deprecate_position_arg(): from sklearn.datasets import load_digits + X, y = load_digits(return_X_y=True, n_class=2) w = np.random.default_rng(0).uniform(size=y.size) with pytest.warns(FutureWarning): @@ -1064,7 +1213,7 @@ def test_deprecate_position_arg(): model.fit(X, y, w) with pytest.warns(FutureWarning): - xgb.XGBRanker('rank:ndcg', learning_rate=0.1) + xgb.XGBRanker("rank:ndcg", learning_rate=0.1) model = xgb.XGBRanker(n_estimators=1) group = np.repeat(1, X.shape[0]) with pytest.warns(FutureWarning): @@ -1167,9 +1316,9 @@ def test_feature_weights(tree_method): assert poly_increasing[0] > 0.08 assert poly_decreasing[0] < -0.08 - reg = xgb.XGBRegressor(feature_weights=np.ones((kCols, ))) + reg = xgb.XGBRegressor(feature_weights=np.ones((kCols,))) with pytest.raises(ValueError, match="Use the one in"): - reg.fit(X, y, feature_weights=np.ones((kCols, ))) + reg.fit(X, y, feature_weights=np.ones((kCols,))) @pytest.mark.parametrize("tree_method", ["hist", "approx", "exact"]) @@ -1320,7 +1469,7 @@ def test_evaluation_metric(): n_estimators=n_estimators, ) reg.fit(X, y, eval_set=[(X, y)]) - lines = out.getvalue().strip().split('\n') + lines = out.getvalue().strip().split("\n") assert len(lines) == n_estimators for line in lines: @@ -1339,7 +1488,7 @@ def merror(y_true: np.ndarray, predt: np.ndarray): tree_method="hist", eval_metric=merror, n_estimators=16, - objective="multi:softmax" + objective="multi:softmax", ) clf.fit(X, y, eval_set=[(X, y)]) custom = clf.evals_result() @@ -1348,19 +1497,18 @@ def merror(y_true: np.ndarray, predt: np.ndarray): tree_method="hist", eval_metric="merror", n_estimators=16, - objective="multi:softmax" + objective="multi:softmax", ) clf.fit(X, y, eval_set=[(X, y)]) internal = clf.evals_result() np.testing.assert_allclose( - custom["validation_0"]["merror"], - internal["validation_0"]["merror"], - atol=1e-6 + custom["validation_0"]["merror"], internal["validation_0"]["merror"], atol=1e-6 ) clf = xgb.XGBRFClassifier( - tree_method="hist", n_estimators=16, + tree_method="hist", + n_estimators=16, objective=tm.softprob_obj(10), eval_metric=merror, ) @@ -1411,24 +1559,34 @@ def test_weighted_evaluation_metric(): n_estimators=16, objective="binary:logistic", ) - clf.fit(X_train, y_train, sample_weight=weights_train, eval_set=[(X_test, y_test)], - sample_weight_eval_set=[weights_eval_set]) + clf.fit( + X_train, + y_train, + sample_weight=weights_train, + eval_set=[(X_test, y_test)], + sample_weight_eval_set=[weights_eval_set], + ) custom = clf.evals_result() clf = xgb.XGBClassifier( tree_method="hist", eval_metric="logloss", n_estimators=16, - objective="binary:logistic" + objective="binary:logistic", + ) + clf.fit( + X_train, + y_train, + sample_weight=weights_train, + eval_set=[(X_test, y_test)], + sample_weight_eval_set=[weights_eval_set], ) - clf.fit(X_train, y_train, sample_weight=weights_train, eval_set=[(X_test, y_test)], - sample_weight_eval_set=[weights_eval_set]) internal = clf.evals_result() np.testing.assert_allclose( custom["validation_0"]["log_loss"], 
internal["validation_0"]["logloss"], - atol=1e-6 + atol=1e-6, ) @@ -1468,7 +1626,6 @@ def test_tags() -> None: # the try-excepts in this test should be removed once xgboost's # minimum supported scikit-learn version is at least 1.6 def test_sklearn_tags(): - def _assert_has_xgbmodel_tags(tags): # values set by XGBModel.__sklearn_tags__() assert tags.non_deterministic is False @@ -1506,7 +1663,9 @@ def _assert_has_xgbmodel_tags(tags): # only the exact error we expected to be raised should be raised assert bool(re.search(r"__sklearn_tags__.* should not be called", str(err))) - for rnk in [xgb.XGBRanker(),]: + for rnk in [ + xgb.XGBRanker(), + ]: try: # if no AttributeError was thrown, we must be using scikit-learn>=1.6, # and so the actual effects of __sklearn_tags__() should be tested diff --git a/tests/python/with_omp_limit.py b/tests/python/with_omp_limit.py index 856914e96a3a..b72f49b6f8b2 100644 --- a/tests/python/with_omp_limit.py +++ b/tests/python/with_omp_limit.py @@ -1,10 +1,9 @@ import sys +import xgboost as xgb from sklearn.datasets import make_classification from sklearn.metrics import roc_auc_score -import xgboost as xgb - def run_omp(output_path: str): X, y = make_classification( diff --git a/tests/test_distributed/test_federated/test_federated.py b/tests/test_distributed/test_federated/test_federated.py index 460b1b2206c9..5c3338e2594e 100644 --- a/tests/test_distributed/test_federated/test_federated.py +++ b/tests/test_distributed/test_federated/test_federated.py @@ -1,5 +1,4 @@ import pytest - from xgboost.testing.federated import run_federated_learning diff --git a/tests/test_distributed/test_gpu_federated/test_gpu_federated.py b/tests/test_distributed/test_gpu_federated/test_gpu_federated.py index c366a743f45f..2424a443c3ec 100644 --- a/tests/test_distributed/test_gpu_federated/test_gpu_federated.py +++ b/tests/test_distributed/test_gpu_federated/test_gpu_federated.py @@ -1,5 +1,4 @@ import pytest - from xgboost.testing.federated import run_federated_learning diff --git a/tests/test_distributed/test_gpu_with_dask/__init__.py b/tests/test_distributed/test_gpu_with_dask/__init__.py index 8b137891791f..e69de29bb2d1 100644 --- a/tests/test_distributed/test_gpu_with_dask/__init__.py +++ b/tests/test_distributed/test_gpu_with_dask/__init__.py @@ -1 +0,0 @@ - diff --git a/tests/test_distributed/test_gpu_with_dask/conftest.py b/tests/test_distributed/test_gpu_with_dask/conftest.py index a066461303d3..b3c8e2896cca 100644 --- a/tests/test_distributed/test_gpu_with_dask/conftest.py +++ b/tests/test_distributed/test_gpu_with_dask/conftest.py @@ -1,7 +1,6 @@ from typing import Any, Generator, Sequence import pytest - from xgboost import testing as tm diff --git a/tests/test_distributed/test_gpu_with_dask/test_gpu_demos.py b/tests/test_distributed/test_gpu_with_dask/test_gpu_demos.py index 848321ae4613..be14f8366aac 100644 --- a/tests/test_distributed/test_gpu_with_dask/test_gpu_demos.py +++ b/tests/test_distributed/test_gpu_with_dask/test_gpu_demos.py @@ -2,7 +2,6 @@ import subprocess import pytest - from xgboost import testing as tm pytestmark = [ diff --git a/tests/test_distributed/test_gpu_with_dask/test_gpu_external_memory.py b/tests/test_distributed/test_gpu_with_dask/test_gpu_external_memory.py index 2e790f41e633..20eb8e3c9a80 100644 --- a/tests/test_distributed/test_gpu_with_dask/test_gpu_external_memory.py +++ b/tests/test_distributed/test_gpu_with_dask/test_gpu_external_memory.py @@ -4,10 +4,9 @@ from typing import Any import pytest +import xgboost as xgb from dask_cuda 
import LocalCUDACluster from distributed import Client - -import xgboost as xgb from xgboost import collective as coll from xgboost import testing as tm from xgboost.testing.dask import check_external_memory, get_rabit_args diff --git a/tests/test_distributed/test_gpu_with_dask/test_gpu_ranking.py b/tests/test_distributed/test_gpu_with_dask/test_gpu_ranking.py index 0e2013e4933e..3b0385a595c8 100644 --- a/tests/test_distributed/test_gpu_with_dask/test_gpu_ranking.py +++ b/tests/test_distributed/test_gpu_with_dask/test_gpu_ranking.py @@ -3,7 +3,6 @@ import dask import pytest from distributed import Client - from xgboost import testing as tm from xgboost.testing import dask as dtm diff --git a/tests/test_distributed/test_gpu_with_dask/test_gpu_with_dask.py b/tests/test_distributed/test_gpu_with_dask/test_gpu_with_dask.py index d1ec4e4f7444..4c6e9071f6b2 100644 --- a/tests/test_distributed/test_gpu_with_dask/test_gpu_with_dask.py +++ b/tests/test_distributed/test_gpu_with_dask/test_gpu_with_dask.py @@ -8,11 +8,10 @@ import numpy as np import pytest +import xgboost as xgb from hypothesis import given, note, settings, strategies from hypothesis._settings import duration from packaging.version import parse as parse_version - -import xgboost as xgb from xgboost import testing as tm from xgboost.collective import CommunicatorContext from xgboost.testing.dask import get_rabit_args, make_categorical, run_recode @@ -20,9 +19,6 @@ from ..test_with_dask.test_with_dask import ( generate_array, -) -from ..test_with_dask.test_with_dask import kCols as random_cols -from ..test_with_dask.test_with_dask import ( run_auc, run_boost_from_prediction, run_boost_from_prediction_multi_class, @@ -34,6 +30,7 @@ run_tree_stats, suppress, ) +from ..test_with_dask.test_with_dask import kCols as random_cols pytestmark = [ pytest.mark.skipif(**tm.no_dask()), @@ -48,7 +45,6 @@ from dask import array as da from dask.distributed import Client from dask_cuda import LocalCUDACluster - from xgboost import dask as dxgb from xgboost.testing.dask import check_init_estimation, check_uneven_nan diff --git a/tests/test_distributed/test_gpu_with_spark/test_data.py b/tests/test_distributed/test_gpu_with_spark/test_data.py index c2e068a87672..7abca13ba6b0 100644 --- a/tests/test_distributed/test_gpu_with_spark/test_data.py +++ b/tests/test_distributed/test_gpu_with_spark/test_data.py @@ -1,5 +1,4 @@ import pytest - from xgboost import testing as tm pytestmark = [ diff --git a/tests/test_distributed/test_gpu_with_spark/test_gpu_spark.py b/tests/test_distributed/test_gpu_with_spark/test_gpu_spark.py index 9a506f2c4b6f..7ac7bea99479 100644 --- a/tests/test_distributed/test_gpu_with_spark/test_gpu_spark.py +++ b/tests/test_distributed/test_gpu_with_spark/test_gpu_spark.py @@ -5,7 +5,6 @@ import numpy as np import pytest import sklearn - from xgboost import testing as tm pytestmark = [ @@ -16,7 +15,6 @@ from pyspark.ml.linalg import Vectors from pyspark.ml.tuning import CrossValidator, ParamGridBuilder from pyspark.sql import SparkSession - from xgboost.spark import SparkXGBClassifier, SparkXGBRegressor, SparkXGBRegressorModel gpu_discovery_script_path = "tests/test_distributed/test_gpu_with_spark/discover_gpu.sh" diff --git a/tests/test_distributed/test_with_dask/__init__.py b/tests/test_distributed/test_with_dask/__init__.py index 8b137891791f..e69de29bb2d1 100644 --- a/tests/test_distributed/test_with_dask/__init__.py +++ b/tests/test_distributed/test_with_dask/__init__.py @@ -1 +0,0 @@ - diff --git 
a/tests/test_distributed/test_with_dask/test_demos.py b/tests/test_distributed/test_with_dask/test_demos.py index cbcd6322bfb1..0d96a9ef36ea 100644 --- a/tests/test_distributed/test_with_dask/test_demos.py +++ b/tests/test_distributed/test_with_dask/test_demos.py @@ -2,7 +2,6 @@ import subprocess import pytest - from xgboost import testing as tm diff --git a/tests/test_distributed/test_with_dask/test_external_memory.py b/tests/test_distributed/test_with_dask/test_external_memory.py index ccd5740618e7..d66bbfaf8de1 100644 --- a/tests/test_distributed/test_with_dask/test_external_memory.py +++ b/tests/test_distributed/test_with_dask/test_external_memory.py @@ -3,7 +3,6 @@ import pytest from distributed import Client, Scheduler, Worker from distributed.utils_test import gen_cluster - from xgboost import testing as tm from xgboost.testing.dask import check_external_memory, get_rabit_args diff --git a/tests/test_distributed/test_with_spark/test_data.py b/tests/test_distributed/test_with_spark/test_data.py index 3f88f47b7445..8ee9cfa3b36a 100644 --- a/tests/test_distributed/test_with_spark/test_data.py +++ b/tests/test_distributed/test_with_spark/test_data.py @@ -3,7 +3,6 @@ import numpy as np import pandas as pd import pytest - from xgboost import testing as tm pytestmark = [pytest.mark.skipif(**tm.no_spark())]