diff --git a/README.md b/README.md
index 47fbfd952..c7c1cf7f6 100644
--- a/README.md
+++ b/README.md
@@ -30,22 +30,23 @@ Where does `linfa` stand right now? [Are we learning yet?](http://www.arewelearningyet.com/)
 | Name | Purpose | Status | Category | Notes |
 | :--- | :--- | :---| :--- | :---|
+| [bayes](algorithms/linfa-bayes/) | Naive Bayes | Tested | Supervised learning | Contains Bernoulli, Gaussian and Multinomial Naive Bayes |
 | [clustering](algorithms/linfa-clustering/) | Data clustering | Tested / Benchmarked | Unsupervised learning | Clustering of unlabeled data; contains K-Means, Gaussian-Mixture-Model, DBSCAN and OPTICS |
-| [kernel](algorithms/linfa-kernel/) | Kernel methods for data transformation | Tested | Pre-processing | Maps feature vector into higher-dimensional space|
-| [linear](algorithms/linfa-linear/) | Linear regression | Tested | Partial fit | Contains Ordinary Least Squares (OLS), Generalized Linear Models (GLM) |
+| [ensemble](algorithms/linfa-ensemble/) | Ensemble methods | Tested | Supervised learning | Contains bagging |
 | [elasticnet](algorithms/linfa-elasticnet/) | Elastic Net | Tested | Supervised learning | Linear regression with elastic net constraints |
-| [logistic](algorithms/linfa-logistic/) | Logistic regression | Tested | Partial fit | Builds two-class logistic regression models
-| [reduction](algorithms/linfa-reduction/) | Dimensionality reduction | Tested | Pre-processing | Diffusion mapping, Principal Component Analysis (PCA), Random projections |
-| [trees](algorithms/linfa-trees/) | Decision trees | Tested / Benchmarked | Supervised learning | Linear decision trees
-| [svm](algorithms/linfa-svm/) | Support Vector Machines | Tested | Supervised learning | Classification or regression analysis of labeled datasets |
+| [ftrl](algorithms/linfa-ftrl/) | Follow The Regularized Leader - proximal | Tested / Benchmarked | Partial fit | Contains L1 and L2 regularization. Possible incremental update |
 | [hierarchical](algorithms/linfa-hierarchical/) | Agglomerative hierarchical clustering | Tested | Unsupervised learning | Cluster and build hierarchy of clusters |
-| [bayes](algorithms/linfa-bayes/) | Naive Bayes | Tested | Supervised learning | Contains Gaussian Naive Bayes |
 | [ica](algorithms/linfa-ica/) | Independent component analysis | Tested | Unsupervised learning | Contains FastICA implementation |
-| [pls](algorithms/linfa-pls/) | Partial Least Squares | Tested | Supervised learning | Contains PLS estimators for dimensionality reduction and regression |
-| [tsne](algorithms/linfa-tsne/) | Dimensionality reduction| Tested | Unsupervised learning | Contains exact solution and Barnes-Hut approximation t-SNE |
-| [preprocessing](algorithms/linfa-preprocessing/) |Normalization & Vectorization| Tested / Benchmarked | Pre-processing | Contains data normalization/whitening and count vectorization/tf-idf |
+| [kernel](algorithms/linfa-kernel/) | Kernel methods for data transformation | Tested | Pre-processing | Maps feature vector into higher-dimensional space |
+| [linear](algorithms/linfa-linear/) | Linear regression | Tested | Partial fit | Contains Ordinary Least Squares (OLS), Generalized Linear Models (GLM) |
+| [logistic](algorithms/linfa-logistic/) | Logistic regression | Tested | Partial fit | Builds two-class logistic regression models |
 | [nn](algorithms/linfa-nn/) | Nearest Neighbours & Distances | Tested / Benchmarked | Pre-processing | Spatial index structures and distance functions |
-| [ftrl](algorithms/linfa-ftrl/) | Follow The Regularized Leader - proximal | Tested / Benchmarked | Partial fit | Contains L1 and L2 regularization. Possible incremental update |
+| [pls](algorithms/linfa-pls/) | Partial Least Squares | Tested | Supervised learning | Contains PLS estimators for dimensionality reduction and regression |
+| [preprocessing](algorithms/linfa-preprocessing/) | Normalization & Vectorization | Tested / Benchmarked | Pre-processing | Contains data normalization/whitening and count vectorization/tf-idf |
+| [reduction](algorithms/linfa-reduction/) | Dimensionality reduction | Tested | Pre-processing | Diffusion mapping, Principal Component Analysis (PCA), Random projections |
+| [svm](algorithms/linfa-svm/) | Support Vector Machines | Tested | Supervised learning | Classification or regression analysis of labeled datasets |
+| [trees](algorithms/linfa-trees/) | Decision trees | Tested / Benchmarked | Supervised learning | Linear decision trees |
+| [tsne](algorithms/linfa-tsne/) | Dimensionality reduction | Tested | Unsupervised learning | Contains exact solution and Barnes-Hut approximation t-SNE |
 
 We believe that only a significant community effort can nurture, build, and sustain a machine learning ecosystem in Rust - there is no other way forward.
diff --git a/algorithms/linfa-ensemble/Cargo.toml b/algorithms/linfa-ensemble/Cargo.toml
new file mode 100644
index 000000000..0b2ce3453
--- /dev/null
+++ b/algorithms/linfa-ensemble/Cargo.toml
@@ -0,0 +1,40 @@
+[package]
+name = "linfa-ensemble"
+version = "0.7.0"
+edition = "2018"
+authors = [
+    "James Knight",
+    "James Kay",
+]
+description = "A general method for creating ensemble classifiers"
+license = "MIT/Apache-2.0"
+
+repository = "https://github.com/rust-ml/linfa"
+readme = "README.md"
+
+keywords = ["machine-learning", "linfa", "ensemble"]
+categories = ["algorithms", "mathematics", "science"]
+
+[features]
+default = []
+serde = ["serde_crate", "ndarray/serde"]
+
+[dependencies.serde_crate]
+package = "serde"
+optional = true
+version = "1.0"
+default-features = false
+features = ["std", "derive"]
+
+[dependencies]
+ndarray = { version = "0.15", features = ["rayon", "approx"] }
+ndarray-rand = "0.14"
+rand = "0.8.5"
+
+linfa = { version = "0.7.1", path = "../.." }
+linfa-trees = { version = "0.7.1", path = "../linfa-trees" }
+
+[dev-dependencies]
+linfa-datasets = { version = "0.7.1", path = "../../datasets/", features = [
+    "iris",
+] }
diff --git a/algorithms/linfa-ensemble/README.md b/algorithms/linfa-ensemble/README.md
new file mode 100644
index 000000000..599bc1fdf
--- /dev/null
+++ b/algorithms/linfa-ensemble/README.md
@@ -0,0 +1,21 @@
+# Ensemble Learning
+
+`linfa-ensemble` provides pure Rust implementations of Ensemble Learning algorithms for the Linfa toolkit.
+
+## The Big Picture
+
+`linfa-ensemble` is a crate in the [`linfa`](https://crates.io/crates/linfa) ecosystem, an effort to create a toolkit for classical Machine Learning implemented in pure Rust, akin to Python's `scikit-learn`.
+
+## Current state
+
+`linfa-ensemble` currently provides an implementation of bootstrap aggregation (bagging) for other classifiers provided in linfa.
+
+## Examples
+
+You can find examples in the `examples/` directory.
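+A minimal usage sketch (the hyperparameter values here are illustrative; see `examples/bagging_iris.rs` for the complete program):
+
+```rust
+use linfa::prelude::{Fit, Predict};
+use linfa_ensemble::EnsembleLearnerParams;
+use linfa_trees::DecisionTree;
+
+// Fit a bagged ensemble of 50 decision trees, each trained on a
+// 70% bootstrap sample of the training data
+let (train, test) = linfa_datasets::iris().split_with_ratio(0.8);
+let model = EnsembleLearnerParams::new(DecisionTree::params())
+    .ensemble_size(50)
+    .bootstrap_proportion(0.7)
+    .fit(&train)
+    .unwrap();
+let predictions = model.predict(&test);
+```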
+
+To run the full bootstrap aggregation example over an ensemble of decision trees (a random forest), use:
+
+```bash
+$ cargo run --example bagging_iris --release
+```
diff --git a/algorithms/linfa-ensemble/examples/bagging_iris.rs b/algorithms/linfa-ensemble/examples/bagging_iris.rs
new file mode 100644
index 000000000..373ae0817
--- /dev/null
+++ b/algorithms/linfa-ensemble/examples/bagging_iris.rs
@@ -0,0 +1,35 @@
+use linfa::prelude::{Fit, Predict, ToConfusionMatrix};
+use linfa_ensemble::EnsembleLearnerParams;
+use linfa_trees::DecisionTree;
+use ndarray_rand::rand::SeedableRng;
+use rand::rngs::SmallRng;
+
+fn main() {
+    // Number of models in the ensemble
+    let ensemble_size = 100;
+    // Proportion of training data given to each model
+    let bootstrap_proportion = 0.7;
+
+    // Load dataset
+    let mut rng = SmallRng::seed_from_u64(42);
+    let (train, test) = linfa_datasets::iris()
+        .shuffle(&mut rng)
+        .split_with_ratio(0.8);
+
+    // Train ensemble learner model
+    let model = EnsembleLearnerParams::new(DecisionTree::params())
+        .ensemble_size(ensemble_size)
+        .bootstrap_proportion(bootstrap_proportion)
+        .fit(&train)
+        .unwrap();
+
+    // Return highest ranking predictions
+    let final_predictions_ensemble = model.predict(&test);
+    println!("Final Predictions: \n{:?}", final_predictions_ensemble);
+
+    let cm = final_predictions_ensemble.confusion_matrix(&test).unwrap();
+
+    println!("{:?}", cm);
+    println!("Test accuracy: {} \n with default Decision Tree params, \n Ensemble Size: {},\n Bootstrap Proportion: {}",
+             100.0 * cm.accuracy(), ensemble_size, bootstrap_proportion);
+}
diff --git a/algorithms/linfa-ensemble/src/algorithm.rs b/algorithms/linfa-ensemble/src/algorithm.rs
new file mode 100644
index 000000000..86cd17f50
--- /dev/null
+++ b/algorithms/linfa-ensemble/src/algorithm.rs
@@ -0,0 +1,103 @@
+use crate::EnsembleLearnerValidParams;
+use linfa::{
+    dataset::{AsTargets, AsTargetsMut, FromTargetArrayOwned, Records},
+    error::Error,
+    traits::*,
+    DatasetBase,
+};
+use ndarray::{Array2, Axis, Zip};
+use rand::Rng;
+use std::{cmp::Eq, collections::HashMap, hash::Hash};
+
+pub struct EnsembleLearner<M> {
+    pub models: Vec<M>,
+}
+
+impl<M> EnsembleLearner<M> {
+    // Generates prediction iterator returning predictions from each model
+    pub fn generate_predictions<'b, R: Records, T>(
+        &'b self,
+        x: &'b R,
+    ) -> impl Iterator<Item = T> + 'b
+    where
+        M: Predict<&'b R, T>,
+    {
+        self.models.iter().map(move |m| m.predict(x))
+    }
+}
+
+impl<F: Clone, T, M> PredictInplace<Array2<F>, T> for EnsembleLearner<M>
+where
+    M: PredictInplace<Array2<F>, T>,
+    <T as AsTargets>::Elem: Copy + Eq + Hash + std::fmt::Debug,
+    T: AsTargets + AsTargetsMut<Elem = <T as AsTargets>::Elem>,
+{
+    fn predict_inplace(&self, x: &Array2<F>, y: &mut T) {
+        let y_array = y.as_targets();
+        assert_eq!(
+            x.nrows(),
+            y_array.len_of(Axis(0)),
+            "The number of data points must match the number of outputs."
+        );
+
+        let predictions = self.generate_predictions(x);
+
+        // prediction map has same shape as y_array, but the elements are maps
+        let mut prediction_maps = y_array.map(|_| HashMap::new());
+
+        for prediction in predictions {
+            let p_arr = prediction.as_targets();
+            assert_eq!(p_arr.shape(), y_array.shape());
+            // Insert each prediction value into the corresponding map
+            Zip::from(&mut prediction_maps)
+                .and(&p_arr)
+                .for_each(|map, val| *map.entry(*val).or_insert(0) += 1);
+        }
+
+        // For each prediction, pick the result with the highest number of votes
+        let agg_preds = prediction_maps.map(|map| map.iter().max_by_key(|(_, v)| **v).unwrap().0);
+        let mut y_array = y.as_targets_mut();
+        for (y, pred) in y_array.iter_mut().zip(agg_preds.iter()) {
+            *y = **pred
+        }
+    }
+
+    fn default_target(&self, x: &Array2<F>) -> T {
+        self.models[0].default_target(x)
+    }
+}
+
+impl<D, T, P: Fit<Array2<D>, T::Owned, Error>, R: Rng + Clone> Fit<Array2<D>, T, Error>
+    for EnsembleLearnerValidParams<P, R>
+where
+    D: Clone,
+    T: FromTargetArrayOwned,
+    T::Elem: Copy + Eq + Hash,
+    T::Owned: AsTargets,
+{
+    type Object = EnsembleLearner<P::Object>;
+
+    fn fit(
+        &self,
+        dataset: &DatasetBase<Array2<D>, T>,
+    ) -> core::result::Result<Self::Object, Error> {
+        let mut models = Vec::new();
+        let mut rng = self.rng.clone();
+
+        // Size of each bootstrap sample, as a proportion of the training set
+        let dataset_size =
+            ((dataset.records.nrows() as f64) * self.bootstrap_proportion).ceil() as usize;
+
+        let iter = dataset.bootstrap_samples(dataset_size, &mut rng);
+
+        // Train one model per bootstrap sample until the ensemble is full
+        for train in iter {
+            let model = self.model_params.fit(&train).unwrap();
+            models.push(model);
+
+            if models.len() == self.ensemble_size {
+                break;
+            }
+        }
+
+        Ok(EnsembleLearner { models })
+    }
+}
diff --git a/algorithms/linfa-ensemble/src/hyperparams.rs b/algorithms/linfa-ensemble/src/hyperparams.rs
new file mode 100644
index 000000000..1e2c5471a
--- /dev/null
+++ b/algorithms/linfa-ensemble/src/hyperparams.rs
@@ -0,0 +1,73 @@
+use linfa::{
+    error::{Error, Result},
+    ParamGuard,
+};
+use rand::rngs::ThreadRng;
+use rand::Rng;
+
+#[derive(Clone, Copy, Debug, PartialEq)]
+pub struct EnsembleLearnerValidParams<P, R> {
+    /// The number of models in the ensemble
+    pub ensemble_size: usize,
+    /// The proportion of the total number of training samples that should be given to each model for training
+    pub bootstrap_proportion: f64,
+    /// The model parameters for the base model
+    pub model_params: P,
+    pub rng: R,
+}
+
+#[derive(Clone, Copy, Debug, PartialEq)]
+pub struct EnsembleLearnerParams<P, R>(EnsembleLearnerValidParams<P, R>);
+
+impl<P> EnsembleLearnerParams<P, ThreadRng> {
+    pub fn new(model_params: P) -> EnsembleLearnerParams<P, ThreadRng> {
+        Self::new_fixed_rng(model_params, rand::thread_rng())
+    }
+}
+
+impl<P, R: Rng + Clone> EnsembleLearnerParams<P, R> {
+    pub fn new_fixed_rng(model_params: P, rng: R) -> EnsembleLearnerParams<P, R> {
+        Self(EnsembleLearnerValidParams {
+            ensemble_size: 1,
+            bootstrap_proportion: 1.0,
+            model_params,
+            rng,
+        })
+    }
+
+    pub fn ensemble_size(mut self, size: usize) -> Self {
+        self.0.ensemble_size = size;
+        self
+    }
+
+    pub fn bootstrap_proportion(mut self, proportion: f64) -> Self {
+        self.0.bootstrap_proportion = proportion;
+        self
+    }
+}
+
+impl<P, R> ParamGuard for EnsembleLearnerParams<P, R> {
+    type Checked = EnsembleLearnerValidParams<P, R>;
+    type Error = Error;
+
+    fn check_ref(&self) -> Result<&Self::Checked> {
+        if self.0.bootstrap_proportion > 1.0 || self.0.bootstrap_proportion <= 0.0 {
+            Err(Error::Parameters(format!(
+                "Bootstrap proportion should be greater than zero and less than or equal to one, but was {}",
+                self.0.bootstrap_proportion
+            )))
+        } else if self.0.ensemble_size < 1 {
+            Err(Error::Parameters(format!(
+                "Ensemble size should be at least one, but was {}",
+                self.0.ensemble_size
+            )))
+        } else {
+            Ok(&self.0)
+        }
+    }
+
+    fn check(self) -> Result<Self::Checked> {
+        self.check_ref()?;
+        Ok(self.0)
+    }
+}
diff --git a/algorithms/linfa-ensemble/src/lib.rs b/algorithms/linfa-ensemble/src/lib.rs
new file mode 100644
index 000000000..fa0e62007
--- /dev/null
+++ b/algorithms/linfa-ensemble/src/lib.rs
@@ -0,0 +1,77 @@
+//! # Ensemble Learning Algorithms
+//!
+//! Ensemble methods combine the predictions of several base estimators built with a given
+//! learning algorithm in order to improve generalizability / robustness over a single estimator.
+//!
+//! ## Bootstrap Aggregation (aka Bagging)
+//!
+//! A typical example of an ensemble method is bootstrap aggregation, which combines the
+//! predictions of several decision trees (see `linfa-trees`), each trained on a different
+//! bootstrap sample of the training dataset.
+//!
+//! ## Reference
+//!
+//! * [Scikit-Learn User Guide](https://scikit-learn.org/stable/modules/ensemble.html)
+//!
+//! ## Example
+//!
+//! This example shows how to train a bagging model using 100 decision trees,
+//! each trained on 70% of the training data (bootstrap sampling).
+//!
+//! ```no_run
+//! use linfa::prelude::{Fit, Predict};
+//! use linfa_ensemble::EnsembleLearnerParams;
+//! use linfa_trees::DecisionTree;
+//! use ndarray_rand::rand::SeedableRng;
+//! use rand::rngs::SmallRng;
+//!
+//! // Load Iris dataset
+//! let mut rng = SmallRng::seed_from_u64(42);
+//! let (train, test) = linfa_datasets::iris()
+//!     .shuffle(&mut rng)
+//!     .split_with_ratio(0.8);
+//!
+//! // Train the model on the iris dataset
+//! let bagging_model = EnsembleLearnerParams::new(DecisionTree::params())
+//!     .ensemble_size(100)
+//!     .bootstrap_proportion(0.7)
+//!     .fit(&train)
+//!     .unwrap();
+//!
+//! // Make predictions on the test set
+//! let predictions = bagging_model.predict(&test);
+//! ```
+//!
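+//! ## Reproducibility
+//!
+//! `EnsembleLearnerParams::new` draws bootstrap samples with `rand::thread_rng()`, so
+//! results vary between runs. For deterministic ensembles, a seeded RNG can be supplied
+//! via `new_fixed_rng` instead; a minimal sketch (the seed value is arbitrary):
+//!
+//! ```no_run
+//! use linfa::prelude::Fit;
+//! use linfa_ensemble::EnsembleLearnerParams;
+//! use linfa_trees::DecisionTree;
+//! use ndarray_rand::rand::SeedableRng;
+//! use rand::rngs::SmallRng;
+//!
+//! // Seeded RNG makes both bootstrap sampling and the resulting ensemble reproducible
+//! let rng = SmallRng::seed_from_u64(42);
+//! let (train, _test) = linfa_datasets::iris().split_with_ratio(0.8);
+//! let deterministic_model = EnsembleLearnerParams::new_fixed_rng(DecisionTree::params(), rng)
+//!     .ensemble_size(100)
+//!     .bootstrap_proportion(0.7)
+//!     .fit(&train)
+//!     .unwrap();
+//! ```
+//!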
+mod algorithm;
+mod hyperparams;
+
+pub use algorithm::*;
+pub use hyperparams::*;
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use linfa::prelude::{Fit, Predict, ToConfusionMatrix};
+    use linfa_trees::DecisionTree;
+    use ndarray_rand::rand::SeedableRng;
+    use rand::rngs::SmallRng;
+
+    #[test]
+    fn test_ensemble_learner_accuracy_on_iris_dataset() {
+        let mut rng = SmallRng::seed_from_u64(42);
+        let (train, test) = linfa_datasets::iris()
+            .shuffle(&mut rng)
+            .split_with_ratio(0.8);
+
+        let model = EnsembleLearnerParams::new(DecisionTree::params())
+            .ensemble_size(100)
+            .bootstrap_proportion(0.7)
+            .fit(&train)
+            .unwrap();
+
+        let predictions = model.predict(&test);
+
+        let cm = predictions.confusion_matrix(&test).unwrap();
+        let acc = cm.accuracy();
+        assert!(acc >= 0.9, "Expected accuracy of at least 90%, got {}", acc);
+    }
+}
diff --git a/src/dataset/impl_dataset.rs b/src/dataset/impl_dataset.rs
index a43944057..81202932a 100644
--- a/src/dataset/impl_dataset.rs
+++ b/src/dataset/impl_dataset.rs
@@ -2,7 +2,7 @@ use super::{
     super::traits::{Predict, PredictInplace},
     iter::{ChunksIter, DatasetIter, Iter},
     AsSingleTargets, AsTargets, AsTargetsMut, CountedTargets, Dataset, DatasetBase, DatasetView,
-    Float, FromTargetArray, Label, Labels, Records, Result, TargetDim,
+    Float, FromTargetArray, FromTargetArrayOwned, Label, Labels, Records, Result, TargetDim,
 };
 use crate::traits::Fit;
 use ndarray::{concatenate, prelude::*, Data, DataMut, Dimension};
@@ -457,7 +457,7 @@ where
 impl<'b, F: Clone, E: Copy + 'b, D, T> DatasetBase<ArrayBase<D, Ix2>, T>
 where
     D: Data<Elem = F>,
-    T: FromTargetArray<'b, Elem = E>,
+    T: FromTargetArrayOwned<Elem = E>,
     T::Owned: AsTargets,
 {
@@ -480,7 +480,7 @@ /// Apply bootstrapping for samples and features
         &'b self,
         sample_feature_size: (usize, usize),
         rng: &'b mut R,
-    ) -> impl Iterator<Item = DatasetBase<Array2<F>, <T as FromTargetArray<'b>>::Owned>> + 'b {
+    ) -> impl Iterator<Item = DatasetBase<Array2<F>, T::Owned>> + 'b {
         std::iter::repeat(()).map(move |_| {
             // sample with replacement
             let indices = (0..sample_feature_size.0)
@@ -520,7 +520,7 @@
         &'b self,
         num_samples: usize,
         rng: &'b mut R,
-    ) -> impl Iterator<Item = DatasetBase<Array2<F>, <T as FromTargetArray<'b>>::Owned>> + 'b {
+    ) -> impl Iterator<Item = DatasetBase<Array2<F>, T::Owned>> + 'b {
         std::iter::repeat(()).map(move |_| {
             // sample with replacement
             let indices = (0..num_samples)
@@ -554,7 +554,7 @@
         &'b self,
         num_features: usize,
         rng: &'b mut R,
-    ) -> impl Iterator<Item = DatasetBase<Array2<F>, <T as FromTargetArray<'b>>::Owned>> + 'b {
+    ) -> impl Iterator<Item = DatasetBase<Array2<F>, T::Owned>> + 'b {
         std::iter::repeat(()).map(move |_| {
             let targets = T::new_targets(self.as_targets().to_owned());
diff --git a/src/dataset/impl_targets.rs b/src/dataset/impl_targets.rs
index e0fdfc7fd..e732c250e 100644
--- a/src/dataset/impl_targets.rs
+++ b/src/dataset/impl_targets.rs
@@ -2,8 +2,8 @@ use std::collections::HashMap;
 
 use super::{
     AsMultiTargets, AsMultiTargetsMut, AsProbabilities, AsSingleTargets, AsSingleTargetsMut,
-    AsTargets, AsTargetsMut, CountedTargets, DatasetBase, FromTargetArray, Label, Labels, Pr,
-    TargetDim,
+    AsTargets, AsTargetsMut, CountedTargets, DatasetBase, FromTargetArray, FromTargetArrayOwned,
+    Label, Labels, Pr, TargetDim,
 };
 use ndarray::{
     Array, Array1, Array2, ArrayBase, ArrayView, ArrayViewMut, Axis, CowArray, Data, DataMut,
@@ -25,14 +25,17 @@ impl<L, S: Data<Elem = L>, I: TargetDim> AsTargets for ArrayBase<S, I> {
 impl<T: AsTargets<Ix = Ix1>> AsSingleTargets for T {}
 impl<T: AsTargets<Ix = Ix2>> AsMultiTargets for T {}
 
-impl<'a, L: Clone + 'a, S: Data<Elem = L>, I: TargetDim> FromTargetArray<'a> for ArrayBase<S, I> {
+impl<L: Clone, S: Data<Elem = L>, I: TargetDim> FromTargetArrayOwned for ArrayBase<S, I> {
     type Owned = ArrayBase<OwnedRepr<L>, I>;
-    type View = ArrayBase<ViewRepr<&'a L>, I>;
 
     /// Returns an owned representation of the target array
     fn new_targets(targets: Array<L, I>) -> Self::Owned {
         targets
     }
+}
+
+impl<'a, L: Clone + 'a, S: Data<Elem = L>, I: TargetDim> FromTargetArray<'a> for ArrayBase<S, I> {
+    type View = ArrayBase<ViewRepr<&'a L>, I>;
 
     /// Returns a reference to the target array
     fn new_targets_view(targets: ArrayView<'a, L, I>) -> Self::View {
@@ -79,23 +82,28 @@ impl<L: Label, T: AsTargetsMut<Elem = L>> AsTargetsMut for CountedTargets<L, T>
     }
 }
 
-impl<'a, L: Label + 'a, T> FromTargetArray<'a> for CountedTargets<L, T>
+impl<L: Label, T> FromTargetArrayOwned for CountedTargets<L, T>
 where
-    T: FromTargetArray<'a, Elem = L>,
+    T: FromTargetArrayOwned<Elem = L>,
     T::Owned: Labels<Elem = L>,
-    T::View: Labels<Elem = L> + AsTargets<Elem = L>,
 {
     type Owned = CountedTargets<L, T::Owned>;
-    type View = CountedTargets<L, T::View>;
 
     fn new_targets(targets: Array<L, T::Ix>) -> Self::Owned {
         let targets = T::new_targets(targets);
-
         CountedTargets {
             labels: targets.label_count(),
             targets,
         }
     }
+}
+
+impl<'a, L: Label + 'a, T> FromTargetArray<'a> for CountedTargets<L, T>
+where
+    T: FromTargetArray<'a, Elem = L>,
+    T::View: Labels<Elem = L>,
+{
+    type View = CountedTargets<L, T::View>;
 
     fn new_targets_view(targets: ArrayView<'a, L, T::Ix>) -> Self::View {
         let targets = T::new_targets_view(targets);
diff --git a/src/dataset/mod.rs b/src/dataset/mod.rs
index 2f275a553..9ab3c2a5a 100644
--- a/src/dataset/mod.rs
+++ b/src/dataset/mod.rs
@@ -262,17 +262,22 @@ pub trait AsMultiTargets: AsTargets {
     }
 }
 
+pub trait FromTargetArrayOwned: AsTargets {
+    type Owned;
+
+    /// Create self object from new target array
+    fn new_targets(targets: Array<Self::Elem, Self::Ix>) -> Self::Owned;
+}
+
 /// Helper trait to construct counted labels
 ///
 /// This is implemented for objects which can act as targets and created from a target matrix. For
 /// targets represented as `ndarray` matrix this is identity, for counted labels, i.e.
 /// `TargetsWithLabels`, it creates the corresponding wrapper struct.
 pub trait FromTargetArray<'a>: AsTargets {
-    type Owned;
     type View;
 
     /// Create self object from new target array
-    fn new_targets(targets: Array<Self::Elem, Self::Ix>) -> Self::Owned;
     fn new_targets_view(targets: ArrayView<'a, Self::Elem, Self::Ix>) -> Self::View;
 }
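
Not part of the patch: a minimal sketch of how the reworked bootstrap API is consumed after this change. `bootstrap_samples` now only requires the new lifetime-free `FromTargetArrayOwned` bound, so each yielded dataset owns its targets; the sample size and seed below are arbitrary.

```rust
use ndarray_rand::rand::SeedableRng;
use rand::rngs::SmallRng;

fn main() {
    let dataset = linfa_datasets::iris();
    let mut rng = SmallRng::seed_from_u64(42);

    // `bootstrap_samples` yields an endless iterator of owned datasets drawn
    // with replacement; `take` bounds it to three samples.
    for sample in dataset.bootstrap_samples(100, &mut rng).take(3) {
        println!("bootstrap sample with {} rows", sample.records.nrows());
    }
}
```

Moving `Owned`/`new_targets` out of `FromTargetArray<'a>` appears to be what lets `EnsembleLearnerValidParams::fit` in `linfa-ensemble` hold these owned samples without dragging the view lifetime `'a` into its trait bounds.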