From 6b6ecfcafb81d5d1a16f7d1502d1c5fbdbed4ba7 Mon Sep 17 00:00:00 2001 From: Munir Hiabu Date: Fri, 8 Aug 2025 15:02:27 +0200 Subject: [PATCH 1/3] feat: enhance RandomPlantedForest with new parameters (max_candidates, split_decay_rate, delete_leaves ) and different behaviour of split_try --- src/include/cpf.hpp | 13 +- src/include/rpf.hpp | 32 +++- src/lib/cpf.cpp | 169 ++++++++++++++++++--- src/lib/rpf.cpp | 354 ++++++++++++++++++++++++++------------------ 4 files changed, 394 insertions(+), 174 deletions(-) diff --git a/src/include/cpf.hpp b/src/include/cpf.hpp index 1c913ce..b1ce853 100644 --- a/src/include/cpf.hpp +++ b/src/include/cpf.hpp @@ -9,7 +9,7 @@ class ClassificationRPF : public RandomPlantedForest public: using RandomPlantedForest::calcOptimalSplit; ClassificationRPF(const NumericMatrix &samples_Y, const NumericMatrix &samples_X, - const String loss = "L2", const NumericVector parameters = {1, 50, 30, 10, 0.4, 0, 0, 0, 0, 0, 0.1}); + const String loss = "L2", const NumericVector parameters = {1, 50, 30, 10, 0.4, 0, 0, 0, 0, 0.1, 0, 0.1, 50,1}); void set_parameters(StringVector keys, NumericVector values); ~ClassificationRPF(){}; @@ -33,9 +33,12 @@ class ClassificationRPF : public RandomPlantedForest void (ClassificationRPF::*calcLoss)(Split &); void create_tree_family(std::vector initial_leaves, size_t n) override; void fit() override; - Split calcOptimalSplit(const std::vector> &Y, const std::vector> &X, - std::multimap> &possible_splits, TreeFamily &curr_family, - std::vector> &weights); + Split calcOptimalSplit( + const std::vector>& Y, + const std::vector>& X, + std::vector& possible_splits, + TreeFamily& curr_family, + std::vector>& weights) ; void L1_loss(Split &split); void median_loss(Split &split); void logit_loss(Split &split); @@ -47,4 +50,4 @@ class ClassificationRPF : public RandomPlantedForest void exponential_loss_3(Split &split); }; -#endif \ No newline at end of file +#endif diff --git a/src/include/rpf.hpp b/src/include/rpf.hpp index 53e8d13..4957746 100644 --- a/src/include/rpf.hpp +++ b/src/include/rpf.hpp @@ -10,7 +10,7 @@ class RandomPlantedForest public: RandomPlantedForest(const NumericMatrix &samples_Y, const NumericMatrix &samples_X, - const NumericVector parameters = {1, 50, 30, 10, 0.4, 0, 0, 0, 0}); + const NumericVector parameters = {1, 50, 30, 10, 0.4, 0, 0, 0, 0, 0.1, 50,1}); RandomPlantedForest(){}; void set_data(const NumericMatrix &samples_Y, const NumericMatrix &samples_X); NumericMatrix predict_matrix(const NumericMatrix &X, const NumericVector components = {0}); @@ -26,7 +26,7 @@ class RandomPlantedForest List get_model(); virtual ~RandomPlantedForest(){}; bool is_purified(); - + protected: double MSE_vec(const NumericVector &Y_predicted, const NumericVector &Y_true); std::vector> X; /**< Nested vector feature samples of size (sample_size x feature_size) */ @@ -53,8 +53,30 @@ class RandomPlantedForest void L2_loss(Split &split); virtual void fit(); virtual void create_tree_family(std::vector initial_leaves, size_t n); - virtual Split calcOptimalSplit(const std::vector> &Y, const std::vector> &X, - std::multimap> &possible_splits, TreeFamily &curr_family); + struct SplitCandidate; + // overload possibleExists for your vector of SplitCandidate + static bool possibleExists( + int dim, + const std::vector& possible_splits, + const std::set& resulting_dims + ); + virtual Split calcOptimalSplit(const std::vector> &Y, + const std::vector> &X, + std::vector &possible_splits, + TreeFamily &curr_family); + // exponential‐decay rate for split age + 
double split_decay_rate_; + size_t max_candidates_; + // track each split candidate and how long it’s sat unchosen + struct SplitCandidate { + int dim; + std::shared_ptr tree; + double age; + // single ctor with default age + SplitCandidate(int d, std::shared_ptr t, double a = 0.0) + : dim(d), tree(std::move(t)), age(a) {} + }; + bool delete_leaves; }; -#endif // RPF_HPP \ No newline at end of file +#endif // RPF_HPP diff --git a/src/lib/cpf.cpp b/src/lib/cpf.cpp index f810700..9ae3c8b 100644 --- a/src/lib/cpf.cpp +++ b/src/lib/cpf.cpp @@ -1,6 +1,10 @@ #include "cpf.hpp" - +#include +#include +#include +#include +#include // ----------------- rpf subclass for classification ----------------- @@ -589,7 +593,11 @@ void ClassificationRPF::exponential_loss_3(Split &split) // constructor with parameters split_try, t_try, purify_forest, deterministic, nthreads ClassificationRPF::ClassificationRPF(const NumericMatrix &samples_Y, const NumericMatrix &samples_X, const String loss, const NumericVector parameters) - : RandomPlantedForest{} + : RandomPlantedForest( + samples_Y, + samples_X, + parameters[Rcpp::Range(0, 11)] + ) { // Ensure correct Rcpp RNG state @@ -653,7 +661,7 @@ ClassificationRPF::ClassificationRPF(const NumericMatrix &samples_Y, const Numer this->loss = LossType::L2; this->calcLoss = &ClassificationRPF::L2_loss; } - if (pars.size() != 11) + if (pars.size() != 14) { Rcout << "Wrong number of parameters - set to default." << std::endl; this->max_interaction = 1; @@ -665,6 +673,9 @@ ClassificationRPF::ClassificationRPF(const NumericMatrix &samples_Y, const Numer this->deterministic = 0; this->nthreads = 1; this->cross_validate = 0; + this->split_decay_rate_ = 0.1; + this->max_candidates_ = 50; + this->delete_leaves = 1; this->delta = 0.1; this->epsilon = 0; } @@ -679,8 +690,11 @@ ClassificationRPF::ClassificationRPF(const NumericMatrix &samples_Y, const Numer this->deterministic = pars[6]; this->nthreads = pars[7]; this->cross_validate = pars[8]; - this->delta = pars[9]; - this->epsilon = pars[10]; + this->split_decay_rate_ = pars[9]; + this->max_candidates_ = static_cast(pars[10]); + this->delete_leaves = pars[11]; + this->delta = pars[12]; + this->epsilon = pars[13]; } // set data and data related members @@ -689,10 +703,11 @@ ClassificationRPF::ClassificationRPF(const NumericMatrix &samples_Y, const Numer // determine optimal split Split ClassificationRPF::calcOptimalSplit(const std::vector> &Y, const std::vector> &X, - std::multimap> &possible_splits, TreeFamily &curr_family, std::vector> &weights) + std::vector &possible_splits, TreeFamily &curr_family, std::vector> &weights) { Split curr_split, min_split; + min_split.min_sum = std::numeric_limits::infinity(); curr_split.Y = &Y; curr_split.W = &weights; std::set tree_dims; @@ -703,14 +718,42 @@ Split ClassificationRPF::calcOptimalSplit(const std::vector> // sample possible splits unsigned int n_candidates = ceil(t_try * possible_splits.size()); // number of candidates that will be considered - std::vector split_candidates(possible_splits.size()); - std::iota(split_candidates.begin(), split_candidates.end(), 0); // consecutive indices of possible candidates + std::vector split_candidates; - if (!deterministic) - { - shuffle_vector(split_candidates.begin(), - split_candidates.end()); // shuffle for random order - } + // 1) Build weights = exp(-decay_rate * age) + std::vector weights_vec(possible_splits.size()); + for (size_t i = 0; i < possible_splits.size(); ++i) { + weights_vec[i] = std::exp(-split_decay_rate_ * 
possible_splits[i].age); + } + + // 2) Sample n_candidates indices *without* replacement + std::vector sample_idxs; + sample_idxs.reserve(n_candidates); + + if (!deterministic) { + // fully-qualified random_device: + std::mt19937 gen{ std::random_device{}() }; + std::discrete_distribution dist(weights_vec.begin(), weights_vec.end()); + std::vector used(possible_splits.size(), false); + + // draw until we have n_candidates distinct picks + while (sample_idxs.size() < n_candidates) { + size_t idx = dist(gen); + if (!used[idx]) { + used[idx] = true; + sample_idxs.push_back(idx); + } + } + } else { + // deterministic fallback: first n_candidates + for (size_t i = 0; i < n_candidates && i < possible_splits.size(); ++i) + sample_idxs.push_back(i); + } + + split_candidates = sample_idxs; + + // track which one gave us the best split + size_t chosen_idx = std::numeric_limits::max(); // consider a fraction of possible splits while (n < n_candidates) @@ -724,11 +767,11 @@ Split ClassificationRPF::calcOptimalSplit(const std::vector> auto candidate = possible_splits.begin(); std::advance(candidate, split_candidates[n]); // get random split candidate without replacement - k = candidate->first - 1; // split dim of candidate, converted to index starting at 0 + k = candidate->dim - 1; // split dim of candidate, converted to index starting at 0 leaf_size = n_leaves[k]; // Test if splitting in the tree w.r.t. the coordinate "k" is an element of candidate tree - tree_dims = candidate->second->split_dims; + tree_dims = candidate->tree->split_dims; tree_dims.erase(k + 1); tree_dims.erase(0); @@ -737,7 +780,7 @@ Split ClassificationRPF::calcOptimalSplit(const std::vector> curr_trees.push_back(curr_family[std::set{0}]); if (curr_family.find(tree_dims) != curr_family.end()) curr_trees.push_back(curr_family[tree_dims]); - if (curr_family.find(candidate->second->split_dims) != curr_family.end()) + if (curr_family.find(candidate->tree->split_dims) != curr_family.end()) // go through all trees in current family for (auto &curr_tree : curr_trees) @@ -748,7 +791,7 @@ Split ClassificationRPF::calcOptimalSplit(const std::vector> continue; // go through all leaves of current tree - for (auto &leaf : curr_tree->leaves) + /* for (auto &leaf : curr_tree->leaves) { std::vector tot_sum(value_size, 0); @@ -853,14 +896,84 @@ Split ClassificationRPF::calcOptimalSplit(const std::vector> min_split.leaf_index = &leaf; min_split.split_coordinate = k + 1; min_split.split_point = sample_point; + chosen_idx = split_candidates[n]; } } + } */ + + // new: exactly split_try random (leaf, cut‐point) trials per tree + for (size_t trial = 0; trial < split_try; ++trial) { + // 1) pick a random leaf + int leaf_idx = static_cast(R::runif(0, curr_tree->leaves.size())); + auto &leaf = curr_tree->leaves[leaf_idx]; + + // 2) collect unique feature values in that leaf + std::vector unique_samples(leaf.individuals.size()); + for (size_t i = 0; i < leaf.individuals.size(); ++i) + unique_samples[i] = X[leaf.individuals[i]][k]; + std::sort(unique_samples.begin(), unique_samples.end()); + unique_samples.erase( + std::unique(unique_samples.begin(), unique_samples.end()), + unique_samples.end() + ); + + // if too few distinct values, retry this trial + if (unique_samples.size() < 2 * leaf_size) { + --trial; + continue; } + + // 3) pick one random cut‐point in [leaf_size, unique_samples.size()-leaf_size) + int s_idx = static_cast( + R::runif(leaf_size, unique_samples.size() - leaf_size) + ); + double sample_point = unique_samples[s_idx]; + + // 4) partition 
individuals and accumulate sums + curr_split.I_s.clear(); + curr_split.I_b.clear(); + curr_split.I_s.reserve(leaf.individuals.size()); + curr_split.I_b.reserve(leaf.individuals.size()); + curr_split.sum_s.assign(value_size, 0); + curr_split.sum_b.assign(value_size, 0); + + for (int ind : leaf.individuals) { + if (X[ind][k] < sample_point) { + curr_split.I_s.push_back(ind); + curr_split.sum_s += Y[ind]; + } else { + curr_split.I_b.push_back(ind); + curr_split.sum_b += Y[ind]; + } + } + + // 5) compute your L1/L2/logit/etc. loss + (this->*ClassificationRPF::calcLoss)(curr_split); + + // 6) update best‐so‐far split + if (curr_split.min_sum < min_split.min_sum) { + min_split = curr_split; + min_split.tree_index = curr_tree; + min_split.leaf_index = &leaf; + min_split.split_coordinate = k + 1; + min_split.split_point = sample_point; + chosen_idx = split_candidates[n]; + } + } + } ++n; } + for (size_t idx : split_candidates) { + if (idx == chosen_idx) { + possible_splits[idx].age = 0.0; // reset for the winner + } else { + possible_splits[idx].age += 1.0; // age everyone else + } + } + return min_split; } @@ -871,12 +984,16 @@ void ClassificationRPF::create_tree_family(std::vector initial_leaves, siz curr_family.insert(std::make_pair(std::set{0}, std::make_shared(DecisionTree(std::set{0}, initial_leaves)))); // save tree with one leaf in the beginning // store possible splits in map with splitting variable as key and pointer to resulting tree - std::multimap> possible_splits; + std::vector possible_splits; for (int feature_dim = 1; feature_dim <= feature_size; ++feature_dim) { // add pointer to resulting tree with split dimension as key curr_family.insert(std::make_pair(std::set{feature_dim}, std::make_shared(DecisionTree(std::set{feature_dim})))); - possible_splits.insert(std::make_pair(feature_dim, curr_family[std::set{0}])); + possible_splits.emplace_back( + SplitCandidate{ feature_dim, + curr_family[{0}], + /*age=*/0 } + ); } // sample data points with replacement @@ -969,12 +1086,12 @@ void ClassificationRPF::create_tree_family(std::vector initial_leaves, siz // update possible_splits if not already existing if (found_tree) { // if yes add pointer - possible_splits.insert(std::make_pair(feature_dim, found_tree)); + possible_splits.emplace_back(SplitCandidate{feature_dim, found_tree, 0}); } else { // if not create new tree curr_family.insert(std::make_pair(curr_dims, std::make_shared(DecisionTree(curr_dims)))); - possible_splits.insert(std::make_pair(feature_dim, curr_family[curr_dims])); + possible_splits.emplace_back(SplitCandidate{feature_dim, curr_family[curr_dims], 0}); } } } @@ -1328,7 +1445,7 @@ void ClassificationRPF::create_tree_family(std::vector initial_leaves, siz std::shared_ptr found_tree = treeExists(resulting_dims, curr_family); // determine which tree is modified - if (curr_split.tree_index->split_dims.count(curr_split.split_coordinate)) + if ((curr_split.tree_index->split_dims.count(curr_split.split_coordinate))&& delete_leaves) { // if split variable is already in tree to be split // change values { @@ -1529,6 +1646,14 @@ void ClassificationRPF::set_parameters(StringVector keys, NumericVector values) { this->epsilon = values[i]; } + else if (keys[i] == "split_decay_rate") + { + this->split_decay_rate_ = values[i]; + } + else if (keys[i] == "max_candidates") + { + this->max_candidates_ = static_cast(values[i]); + } else { Rcout << "Unkown parameter key '" << keys[i] << "' ." 
<< std::endl; diff --git a/src/lib/rpf.cpp b/src/lib/rpf.cpp index 085df8c..b31ada9 100644 --- a/src/lib/rpf.cpp +++ b/src/lib/rpf.cpp @@ -1,4 +1,23 @@ #include "rpf.hpp" +#include +#include +#include +#include // for std::min, std::max +#include + + +bool RandomPlantedForest::possibleExists( + int dim, + const std::vector& possible_splits, + const std::set& resulting_dims) +{ + for (const auto& c : possible_splits) { + if (c.dim == dim && c.tree->split_dims == resulting_dims) + return true; + } + return false; +} + bool RandomPlantedForest::is_purified() @@ -31,21 +50,12 @@ RandomPlantedForest::RandomPlantedForest(const NumericMatrix &samples_Y, const N // initialize class members std::vector pars = to_std_vec(parameters); - if (pars.size() != 9) + if (pars.size() != 12) { - Rcout << "Wrong number of parameters - set to default." << std::endl; - this->max_interaction = 1; - this->n_trees = 50; - this->n_splits = 30; - this->split_try = 10; - this->t_try = 0.4; - this->purify_forest = 0; - this->deterministic = 0; - this->nthreads = 1; - this->cross_validate = 0; + Rcpp::stop("RandomPlantedForest requires 12 parameters, got %d", pars.size()); } else - { + { this->max_interaction = pars[0]; this->n_trees = pars[1]; this->n_splits = pars[2]; @@ -55,6 +65,10 @@ RandomPlantedForest::RandomPlantedForest(const NumericMatrix &samples_Y, const N this->deterministic = pars[6]; this->nthreads = pars[7]; this->cross_validate = pars[8]; + this->split_decay_rate_ = pars[9]; + this->max_candidates_ = static_cast(pars[10]); + this->delete_leaves = pars[11]; + } // set data and data related members @@ -63,177 +77,223 @@ RandomPlantedForest::RandomPlantedForest(const NumericMatrix &samples_Y, const N // determine optimal split Split RandomPlantedForest::calcOptimalSplit(const std::vector> &Y, const std::vector> &X, - std::multimap> &possible_splits, TreeFamily &curr_family) + std::vector &possible_splits, TreeFamily &curr_family) { Split curr_split, min_split; + min_split.min_sum = std::numeric_limits::infinity(); curr_split.Y = &Y; std::set tree_dims; std::vector unique_samples; - int k; - unsigned int n = 0; - double leaf_size, sample_point; + double leaf_size; - // sample possible splits - unsigned int n_candidates = ceil(t_try * possible_splits.size()); // number of candidates that will be considered - std::vector split_candidates(possible_splits.size()); - std::iota(split_candidates.begin(), split_candidates.end(), 0); // consecutive indices of possible candidates - if (!deterministic) - { - shuffle_vector(split_candidates.begin(), split_candidates.end()); // shuffle for random order - } + // raw count based on t_try + unsigned int raw_candidates = static_cast( + std::ceil(t_try * possible_splits.size())); + // never try more than available splits or max_candidates_ + unsigned int upper = std::min( + max_candidates_, possible_splits.size()); + // clamp into [1, upper] + unsigned int n_candidates = std::max( + 1u, std::min(raw_candidates, upper)); - // consider a fraction of possible splits - while (n < n_candidates) - { - if (possible_splits.empty()) - break; - if (split_candidates[n] >= 0 && (size_t)split_candidates[n] >= possible_splits.size()) - continue; + // 1) Build weights = exp(-decay_rate * age) + std::vector weights(possible_splits.size()); + for (size_t i = 0; i < possible_splits.size(); ++i) { + weights[i] = std::exp(-split_decay_rate_ * possible_splits[i].age); + } + + // 2) Sample n_candidates indices without replacement + // 2) Sample n_candidates indices without replacement via 
discrete_distribution + std::vector sample_idxs; + sample_idxs.reserve(n_candidates); + + if (!deterministic) { + // set up RNG and discrete distribution over our weights + std::mt19937 gen(std::random_device{}()); + std::discrete_distribution dist(weights.begin(), weights.end()); + std::vector used(possible_splits.size(), false); + + // draw until we have n_candidates _distinct_ indices + while (sample_idxs.size() < n_candidates) { + size_t idx = dist(gen); + if (!used[idx]) { + used[idx] = true; + sample_idxs.push_back(idx); + } + } + } else { + // deterministically take the first n_candidates indices: + for (size_t i = 0; i < n_candidates && i < possible_splits.size(); ++i) + sample_idxs.push_back(i); + } - auto candidate = possible_splits.begin(); - std::advance(candidate, split_candidates[n]); // get random split candidate without replacement - k = candidate->first - 1; // split dim of current candidate, converted to index starting at 0 - leaf_size = n_leaves[k]; + // 3) Evaluate each sampled candidate + int best_idx = -1; + for (size_t idx : sample_idxs) { + // reproduce exactly your old body, but with `idx` in place of `n`: + auto candidate_it = possible_splits.begin(); + std::advance(candidate_it, idx); + int k = candidate_it->dim - 1; + leaf_size = n_leaves[k]; // Test if splitting in the current tree w.r.t. the coordinate "k" is an element of candidate tree - tree_dims = candidate->second->split_dims; + tree_dims = candidate_it->tree->split_dims; tree_dims.erase(k + 1); tree_dims.erase(0); std::vector> curr_trees; - if (tree_dims.size() == 0) - curr_trees.push_back(curr_family[std::set{0}]); + if (tree_dims.empty()) + curr_trees.push_back(curr_family[{0}]); if (curr_family.find(tree_dims) != curr_family.end()) curr_trees.push_back(curr_family[tree_dims]); - if (curr_family.find(candidate->second->split_dims) != curr_family.end()) - curr_trees.push_back(curr_family[candidate->second->split_dims]); + if (curr_family.find(candidate_it->tree->split_dims) != curr_family.end()) + curr_trees.push_back(curr_family[candidate_it->tree->split_dims]); + + for (auto &curr_tree : curr_trees) { + if (curr_tree->leaves.empty()) continue; + // new: exactly split_try random (leaf, cut‐point) trials per tree + for (size_t trial = 0; trial < split_try; ++trial) { + // 1) pick a random leaf + int leaf_idx = static_cast(R::runif(0, curr_tree->leaves.size())); + auto &leaf = curr_tree->leaves[leaf_idx]; + + // 2) collect unique feature values within that leaf + std::vector unique_samples(leaf.individuals.size()); + for (size_t i = 0; i < leaf.individuals.size(); ++i) + unique_samples[i] = X[leaf.individuals[i]][k]; + std::sort(unique_samples.begin(), unique_samples.end()); + unique_samples.erase( + std::unique(unique_samples.begin(), unique_samples.end()), + unique_samples.end() + ); + + // if too few distinct values, retry this trial + if (unique_samples.size() < 2 * leaf_size) { + --trial; + continue; + } - // go through all trees in current family - for (auto &curr_tree : curr_trees) - { + // 3) pick one random cut‐point in [leaf_size, n - leaf_size) + int s_idx = static_cast( + R::runif(leaf_size, unique_samples.size() - leaf_size) + ); + double sample_point = unique_samples[s_idx]; + + // 4) partition and compute L2 loss + curr_split.I_s.clear(); + curr_split.I_b.clear(); + curr_split.I_s.reserve(leaf.individuals.size()); + curr_split.I_b.reserve(leaf.individuals.size()); + curr_split.sum_s.assign(value_size, 0); + curr_split.sum_b.assign(value_size, 0); + + for (int ind : leaf.individuals) { + if 
(X[ind][k] < sample_point) { + curr_split.I_s.push_back(ind); + curr_split.sum_s += Y[ind]; + } else { + curr_split.I_b.push_back(ind); + curr_split.sum_b += Y[ind]; + } + } - // skip if tree has no leaves - if (curr_tree->leaves.size() == 0) - continue; + L2_loss(curr_split); - // go through all leaves of current tree - for (auto &leaf : curr_tree->leaves) - { + // 5) update best split if improved + if (curr_split.min_sum < min_split.min_sum) { + min_split = curr_split; + min_split.tree_index = curr_tree; + min_split.leaf_index = &leaf; + min_split.split_coordinate = k + 1; + min_split.split_point = sample_point; + best_idx = idx; + } + } - std::vector tot_sum(value_size, 0); - // extract sample points according to individuals from X and Y - unique_samples = std::vector(leaf.individuals.size()); - for (unsigned int i = 0; i < leaf.individuals.size(); ++i) - { +/* for (auto &leaf : curr_tree->leaves) { + std::vector tot_sum(value_size, 0); + unique_samples.resize(leaf.individuals.size()); + for (size_t i = 0; i < leaf.individuals.size(); ++i) unique_samples[i] = X[leaf.individuals[i]][k]; - } std::sort(unique_samples.begin(), unique_samples.end()); unique_samples.erase(std::unique(unique_samples.begin(), unique_samples.end()), unique_samples.end()); + if (unique_samples.size() < 2 * leaf_size) continue; - // check if number of sample points is within limit - if (unique_samples.size() < 2 * leaf_size) - continue; - - // consider split_try-number of samples std::vector samples; - if (deterministic) - { // sequential samples if deterministic - samples = std::vector(std::min((int)unique_samples.size() - 1, 9)); + if (deterministic) { + samples.resize(std::min((int)unique_samples.size() - 1, 9)); std::iota(samples.begin(), samples.end(), 1); - } - else - { // randomly picked samples otherwise - samples = std::vector(split_try); + } else { + samples.resize(split_try); for (size_t i = 0; i < samples.size(); ++i) samples[i] = R::runif(leaf_size, unique_samples.size() - leaf_size); std::sort(samples.begin(), samples.end()); } - // go through samples - for (size_t sample_pos = 0; sample_pos < samples.size(); ++sample_pos) - { - - // get samplepoint - sample_point = unique_samples[samples[sample_pos]]; - - // clear current split - { - curr_split.I_s.clear(); - curr_split.I_b.clear(); - curr_split.I_s.reserve(leaf.individuals.size()); - curr_split.I_b.reserve(leaf.individuals.size()); - curr_split.M_s = std::vector(value_size, 0); - curr_split.M_b = std::vector(value_size, 0); - } - - // get samples greater/smaller than samplepoint - if (sample_pos == 0) - { - curr_split.sum_s = std::vector(value_size, 0); - curr_split.sum_b = std::vector(value_size, 0); - - for (int individual : leaf.individuals) - { - if (X[individual][k] < sample_point) - { - curr_split.I_s.push_back(individual); - curr_split.sum_s += Y[individual]; - } - else - { - curr_split.I_b.push_back(individual); - curr_split.sum_b += Y[individual]; + for (size_t si = 0; si < samples.size(); ++si) { + sample_point = unique_samples[samples[si]]; + curr_split.I_s.clear(); curr_split.I_b.clear(); + curr_split.I_s.reserve(leaf.individuals.size()); + curr_split.I_b.reserve(leaf.individuals.size()); + curr_split.M_s.assign(value_size, 0); + curr_split.M_b.assign(value_size, 0); + + if (si == 0) { + curr_split.sum_s.assign(value_size, 0); + curr_split.sum_b.assign(value_size, 0); + for (int ind : leaf.individuals) { + if (X[ind][k] < sample_point) { + curr_split.I_s.push_back(ind); + curr_split.sum_s += Y[ind]; + } else { + 
curr_split.I_b.push_back(ind); + curr_split.sum_b += Y[ind]; } } - tot_sum = curr_split.sum_s + curr_split.sum_b; - } - else - { - - for (int individual : leaf.individuals) - { - if (X[individual][k] < sample_point) - { - if (X[individual][k] >= unique_samples[samples[sample_pos - 1]]) - { - curr_split.sum_s += Y[individual]; - } - curr_split.I_s.push_back(individual); - } - else - { - curr_split.I_b.push_back(individual); + } else { + for (int ind : leaf.individuals) { + if (X[ind][k] < sample_point) { + if (X[ind][k] >= unique_samples[samples[si - 1]]) + curr_split.sum_s += Y[ind]; + curr_split.I_s.push_back(ind); + } else { + curr_split.I_b.push_back(ind); } } - curr_split.sum_b = tot_sum - curr_split.sum_s; } - // accumulate squared mean and get mean L2_loss(curr_split); - // update split if squared sum is smaller - if (curr_split.min_sum < min_split.min_sum) - { + if (curr_split.min_sum < min_split.min_sum) { min_split = curr_split; - min_split.tree_index = curr_tree; - min_split.leaf_index = &leaf; + min_split.tree_index = curr_tree; + min_split.leaf_index = &leaf; min_split.split_coordinate = k + 1; - min_split.split_point = sample_point; + min_split.split_point = sample_point; + best_idx = idx; } } - } + } */ } - - ++n; } + + // 4) Age update: only sampled splits age; chosen resets to zero +for (size_t idx : sample_idxs) { + if ((int)idx != best_idx) + possible_splits[idx].age += 1.0; + else + possible_splits[idx].age = 0.0; +} + return min_split; + } void RandomPlantedForest::set_data(const NumericMatrix &samples_Y, const NumericMatrix &samples_X) @@ -293,12 +353,11 @@ void RandomPlantedForest::create_tree_family(std::vector initial_leaves, s TreeFamily curr_family; curr_family.insert(std::make_pair(std::set{0}, std::make_shared(DecisionTree(std::set{0}, initial_leaves)))); // save tree with one leaf in the beginning // store possible splits in map with splitting variable as key and pointer to resulting tree - std::multimap> possible_splits; - for (int feature_dim = 1; feature_dim <= feature_size; ++feature_dim) - { - // add pointer to resulting tree with split dimension as key - curr_family.insert(std::make_pair(std::set{feature_dim}, std::make_shared(DecisionTree(std::set{feature_dim})))); - possible_splits.insert(std::make_pair(feature_dim, curr_family[std::set{feature_dim}])); + std::vector possible_splits; + for (int feature_dim = 1; feature_dim <= feature_size; ++feature_dim) { + auto treePtr = std::make_shared(DecisionTree({feature_dim})); + curr_family.insert({{feature_dim}, treePtr}); + possible_splits.emplace_back(feature_dim, treePtr); } // sample data points with replacement @@ -334,7 +393,7 @@ void RandomPlantedForest::create_tree_family(std::vector initial_leaves, s // find optimal split curr_split = calcOptimalSplit(samples_Y, samples_X, possible_splits, curr_family); - + // continue only if we get a significant result if (!std::isinf(curr_split.min_sum)) { @@ -363,12 +422,13 @@ void RandomPlantedForest::create_tree_family(std::vector initial_leaves, s // update possible_splits if not already existing if (found_tree) { // if yes add pointer - possible_splits.insert(std::make_pair(feature_dim, found_tree)); + possible_splits.emplace_back(feature_dim, found_tree); } else { // if not create new tree curr_family.insert(std::make_pair(curr_dims, std::make_shared(DecisionTree(curr_dims)))); - possible_splits.insert(std::make_pair(feature_dim, curr_family[curr_dims])); + possible_splits.emplace_back(feature_dim, curr_family[curr_dims]); + } } @@ -413,7 +473,7 @@ void 
RandomPlantedForest::create_tree_family(std::vector initial_leaves, s std::shared_ptr found_tree = treeExists(resulting_dims, curr_family); // determine which tree is modified - if (curr_split.tree_index->split_dims.count(curr_split.split_coordinate)) + if ((curr_split.tree_index->split_dims.count(curr_split.split_coordinate))&& delete_leaves) { // if split variable is already in tree to be split // change values { @@ -423,8 +483,8 @@ void RandomPlantedForest::create_tree_family(std::vector initial_leaves, s *curr_split.leaf_index = leaf_b; // replace old interval curr_split.tree_index->leaves.push_back(leaf_s); // add new leaf } - else - { // otherwise + else // we keep the parent leaf and create two new children + { found_tree->leaves.push_back(leaf_s); // append new leaves found_tree->leaves.push_back(leaf_b); } @@ -1986,6 +2046,7 @@ void RandomPlantedForest::print() void RandomPlantedForest::get_parameters() { Rcout << "Parameters: n_trees=" << n_trees << ", n_splits=" << n_splits << ", max_interaction=" << max_interaction << ", t_try=" << t_try + << ", split_decay_rate=" << split_decay_rate_<< ", max_candidates=" << max_candidates_ << ", split_try=" << split_try << ", purified=" << purified << ", deterministic=" << deterministic << ", nthreads=" << nthreads << ", feature_size=" << feature_size << ", sample_size=" << sample_size << std::endl; } @@ -2040,10 +2101,19 @@ void RandomPlantedForest::set_parameters(StringVector keys, NumericVector values { this->cross_validate = values[i]; } + else if (keys[i] == "split_decay_rate") + { + this->split_decay_rate_ = values[i]; + } + else if (keys[i] == "max_candidates") + { + this->max_candidates_ = static_cast(values[i]); + } else { Rcout << "Unkown parameter key '" << keys[i] << "' ." << std::endl; } + } this->fit(); } From fd2a969390d12f2f5d11981fe564a4b94cd2251f Mon Sep 17 00:00:00 2001 From: Munir Hiabu Date: Fri, 8 Aug 2025 15:37:10 +0200 Subject: [PATCH 2/3] bug-fix: max_candidates was not not doing anything for classification. --- src/lib/cpf.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/lib/cpf.cpp b/src/lib/cpf.cpp index 9ae3c8b..16e98e5 100644 --- a/src/lib/cpf.cpp +++ b/src/lib/cpf.cpp @@ -714,10 +714,12 @@ Split ClassificationRPF::calcOptimalSplit(const std::vector> std::vector unique_samples; int k; unsigned int n = 0; - double leaf_size, sample_point; + double leaf_size; // sample possible splits - unsigned int n_candidates = ceil(t_try * possible_splits.size()); // number of candidates that will be considered + unsigned int raw_candidates = static_cast(std::ceil(t_try * possible_splits.size())); + unsigned int upper = std::min(max_candidates_, possible_splits.size()); + unsigned int n_candidates = std::max(1u, std::min(raw_candidates, upper)); std::vector split_candidates; // 1) Build weights = exp(-decay_rate * age) From af74ec0a6082b9918ee3ed8b0453596311e37371 Mon Sep 17 00:00:00 2001 From: Munir Hiabu Date: Fri, 8 Aug 2025 16:00:37 +0200 Subject: [PATCH 3/3] r-file rpf.R has not been updated (actual part of the previous commit) --- R/rpf.R | 56 +++++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 39 insertions(+), 17 deletions(-) diff --git a/R/rpf.R b/R/rpf.R index a331656..97d9a3b 100644 --- a/R/rpf.R +++ b/R/rpf.R @@ -29,6 +29,9 @@ #' @param epsilon `[0.1]`: Only used if loss = `"logit"` or `"exponential"`. #' Proportion of class membership is truncated to be smaller 1-epsilon when calculating #' the fit in a leaf. 
+#' @param split_decay_rate `[0.1]`: Exponential decay factor λ for aging split-candidates. A candidate’s weight is `exp(−λ * age)`. +#' @param max_candidates `[50]`: Maximum number of split‐candidates to sample at each node (will be clamped to [1, #possible_splits]). +#' @param delete_leaves `[1]`: Whether parents should be deleted if split is an existing coordinate #' @param ... (Unused). #' #' @return Object of class `"rpf"` with model object contained in `$fit`. @@ -63,14 +66,16 @@ rpf.default <- function(x, ...) { #' @export #' @rdname rpf rpf.data.frame <- function(x, y, max_interaction = 1, ntrees = 50, splits = 30, - split_try = 10, t_try = 0.4, deterministic = FALSE, + split_try = 10, t_try = 0.4, split_decay_rate = 0.1, + max_candidates = 50, delete_leaves = 1, + deterministic = FALSE, nthreads = 1, purify = FALSE, cv = FALSE, loss = "L2", delta = 0, epsilon = 0.1, ...) { blueprint <- hardhat::default_xy_blueprint(intercept = FALSE) processed <- hardhat::mold(x, y, blueprint = blueprint) rpf_bridge( processed, max_interaction, ntrees, splits, - split_try, t_try, deterministic, + split_try, t_try, split_decay_rate, max_candidates, delete_leaves, deterministic, nthreads, purify, cv, loss, delta, epsilon ) @@ -80,14 +85,16 @@ rpf.data.frame <- function(x, y, max_interaction = 1, ntrees = 50, splits = 30, #' @export #' @rdname rpf rpf.matrix <- function(x, y, max_interaction = 1, ntrees = 50, splits = 30, - split_try = 10, t_try = 0.4, deterministic = FALSE, + split_try = 10, t_try = 0.4, split_decay_rate = 0.1, + max_candidates = 50, delete_leaves = 1, + deterministic = FALSE, nthreads = 1, purify = FALSE, cv = FALSE, loss = "L2", delta = 0, epsilon = 0.1, ...) { blueprint <- hardhat::default_xy_blueprint(intercept = FALSE) processed <- hardhat::mold(x, y, blueprint = blueprint) rpf_bridge( processed, max_interaction, ntrees, splits, - split_try, t_try, deterministic, + split_try, t_try, split_decay_rate, max_candidates, delete_leaves, deterministic, nthreads, purify, cv, loss, delta, epsilon )} @@ -96,14 +103,16 @@ rpf.matrix <- function(x, y, max_interaction = 1, ntrees = 50, splits = 30, #' @export #' @rdname rpf rpf.formula <- function(formula, data, max_interaction = 1, ntrees = 50, splits = 30, - split_try = 10, t_try = 0.4, deterministic = FALSE, + split_try = 10, t_try = 0.4, split_decay_rate = 0.1, + max_candidates = 50, delete_leaves = 1, + deterministic = FALSE, nthreads = 1, purify = FALSE, cv = FALSE, loss = "L2", delta = 0, epsilon = 0.1, ...) { blueprint <- hardhat::default_formula_blueprint(intercept = FALSE, indicators = "none") processed <- hardhat::mold(formula, data, blueprint = blueprint) rpf_bridge( processed, max_interaction, ntrees, splits, - split_try, t_try, deterministic, + split_try, t_try, split_decay_rate, max_candidates, delete_leaves, deterministic, nthreads, purify, cv, loss, delta, epsilon ) @@ -113,14 +122,16 @@ rpf.formula <- function(formula, data, max_interaction = 1, ntrees = 50, splits #' @export #' @rdname rpf rpf.recipe <- function(x, data, max_interaction = 1, ntrees = 50, splits = 30, - split_try = 10, t_try = 0.4, deterministic = FALSE, + split_try = 10, t_try = 0.4, split_decay_rate = 0.1, + max_candidates = 50, delete_leaves = 1, + deterministic = FALSE, nthreads = 1, purify = FALSE, cv = FALSE, loss = "L2", delta = 0, epsilon = 0.1, ...) 
{ blueprint <- hardhat::default_recipe_blueprint(intercept = FALSE) processed <- hardhat::mold(x, data, blueprint = blueprint) rpf_bridge( processed, max_interaction, ntrees, splits, - split_try, t_try, deterministic, + split_try, t_try, split_decay_rate, max_candidates, delete_leaves, deterministic, nthreads, purify, cv, loss, delta, epsilon ) @@ -131,7 +142,9 @@ rpf.recipe <- function(x, data, max_interaction = 1, ntrees = 50, splits = 30, #' @param processed Output of `hardhat::mold` from respective rpf methods #' @importFrom hardhat validate_outcomes_are_univariate rpf_bridge <- function(processed, max_interaction = 1, ntrees = 50, splits = 30, - split_try = 10, t_try = 0.4, deterministic = FALSE, + split_try = 10, t_try = 0.4, split_decay_rate = 0.1, + max_candidates = 50, delete_leaves = 1, + deterministic = FALSE, nthreads = 1, purify = FALSE, cv = FALSE, loss = "L2", delta = 0, epsilon = 0.1) { hardhat::validate_outcomes_are_univariate(processed$outcomes) @@ -141,7 +154,7 @@ rpf_bridge <- function(processed, max_interaction = 1, ntrees = 50, splits = 30, # Check arguments checkmate::assert_int(max_interaction, lower = 0) - + # rewrite max_interaction so 0 -> "maximum", e.g. ncol(X) if (max_interaction == 0) { max_interaction <- p @@ -156,10 +169,13 @@ rpf_bridge <- function(processed, max_interaction = 1, ntrees = 50, splits = 30, checkmate::assert_int(ntrees, lower = 1) checkmate::assert_int(splits, lower = 1) checkmate::assert_int(split_try, lower = 1) - + checkmate::assert_int(max_candidates, lower = 1) + checkmate::assert_number(t_try, lower = 0, upper = 1) checkmate::assert_number(delta, lower = 0, upper = 1) checkmate::assert_number(epsilon, lower = 0, upper = 1) + checkmate::assert_number(split_decay_rate, lower = 0) + # "median" loss is implemented but discarded loss_functions <- switch(outcomes$mode, @@ -172,12 +188,14 @@ rpf_bridge <- function(processed, max_interaction = 1, ntrees = 50, splits = 30, checkmate::assert_int(nthreads, lower = 1L) checkmate::assert_flag(purify) checkmate::assert_flag(cv) + checkmate::assert_flag(delete_leaves) + fit <- rpf_impl( Y = outcomes$outcomes, X = predictors$predictors_matrix, mode = outcomes$mode, max_interaction = max_interaction, ntrees = ntrees, splits = splits, - split_try = split_try, t_try = t_try, deterministic = deterministic, + split_try = split_try, t_try = t_try, split_decay_rate = split_decay_rate, max_candidates = max_candidates, delete_leaves=delete_leaves, deterministic = deterministic, nthreads = nthreads, purify = purify, cv = cv, loss = loss, delta = delta, epsilon = epsilon ) @@ -195,7 +213,11 @@ rpf_bridge <- function(processed, max_interaction = 1, ntrees = 50, splits = 30, ntrees = ntrees, max_interaction = max_interaction, splits = splits, - split_try = split_try, t_try = t_try, + split_try = split_try, + t_try = t_try, + split_decay_rate = split_decay_rate, + max_candidates = max_candidates, + delete_leaves = delete_leaves, delta = delta, epsilon = epsilon, deterministic = deterministic, nthreads = nthreads, purify = purify, cv = cv @@ -217,7 +239,7 @@ new_rpf <- function(fit, blueprint, ...) 
{ # Main fitting function and interface to C++ implementation rpf_impl <- function(Y, X, mode = c("regression", "classification"), max_interaction = 1, ntrees = 50, splits = 30, split_try = 10, t_try = 0.4, - deterministic = FALSE, nthreads = 1, purify = FALSE, cv = FALSE, + deterministic = FALSE, nthreads = 1, purify = FALSE, cv = FALSE, split_decay_rate = 0.1, max_candidates = 50, delete_leaves = 1, loss = "L2", delta = 0, epsilon = 0.1) { # Final input validation, should be superfluous checkmate::assert_matrix(X, mode = "numeric", any.missing = FALSE) @@ -226,12 +248,12 @@ rpf_impl <- function(Y, X, mode = c("regression", "classification"), if (mode == "classification") { fit <- new(ClassificationRPF, Y, X, loss, c( max_interaction, ntrees, splits, split_try, t_try, - purify, deterministic, nthreads, cv, delta, epsilon + purify, deterministic, nthreads, cv, split_decay_rate, max_candidates, delete_leaves, delta, epsilon )) } else if (mode == "regression") { fit <- new(RandomPlantedForest, Y, X, c( - max_interaction, ntrees, splits, split_try, t_try, - purify, deterministic, nthreads, cv + max_interaction, ntrees, splits, split_try, t_try, + purify, deterministic, nthreads, cv, split_decay_rate, max_candidates, delete_leaves )) }
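
Usage note (not part of the patch): the sketch below shows how the three new knobs surface in the R API after this series. It is illustrative only -- the call signature is taken from the rpf.R hunks above, the package and data names are placeholders, and predict() is assumed to behave as before. Per the C++ changes, candidate splits are weighted by exp(-split_decay_rate * age), at most max_candidates of them are sampled per splitting step, and delete_leaves toggles whether the parent leaf is replaced when the split coordinate already exists in the tree.

    # Hypothetical end-to-end call against this branch (assumes the package
    # builds and exports rpf()/predict() unchanged apart from the new arguments).
    library(randomPlantedForest)

    x <- mtcars[, c("wt", "hp", "disp")]   # placeholder predictors
    y <- mtcars$mpg                        # placeholder outcome

    fit <- rpf(
      x, y,
      max_interaction  = 2,
      ntrees           = 50,
      splits           = 30,
      split_try        = 10,    # now: number of random (leaf, cut-point) trials per candidate tree
      t_try            = 0.4,
      split_decay_rate = 0.1,   # candidate weight = exp(-0.1 * age); 0 keeps uniform sampling
      max_candidates   = 50,    # cap on sampled split candidates per splitting step
      delete_leaves    = TRUE   # TRUE reproduces the previous behaviour of replacing the parent leaf
    )

    head(predict(fit, x))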