From 1482164a05eea32bbbcd1b638776d4e151dcd9ce Mon Sep 17 00:00:00 2001 From: Jinyang Liu Date: Fri, 19 Apr 2024 17:34:22 +0200 Subject: [PATCH 01/16] Move purify method into separate file --- src/include/rpf.hpp | 2 + src/lib/rpf.cpp | 1052 +-------------------------------------- src/lib/rpf_purify.cpp | 1058 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 1062 insertions(+), 1050 deletions(-) create mode 100644 src/lib/rpf_purify.cpp diff --git a/src/include/rpf.hpp b/src/include/rpf.hpp index 53e8d13..b87327d 100644 --- a/src/include/rpf.hpp +++ b/src/include/rpf.hpp @@ -55,6 +55,8 @@ class RandomPlantedForest virtual void create_tree_family(std::vector initial_leaves, size_t n); virtual Split calcOptimalSplit(const std::vector> &Y, const std::vector> &X, std::multimap> &possible_splits, TreeFamily &curr_family); + + std::vector> get_lim_list(const TreeFamily &curr_family); }; #endif // RPF_HPP \ No newline at end of file diff --git a/src/lib/rpf.cpp b/src/lib/rpf.cpp index 085df8c..7bc168e 100644 --- a/src/lib/rpf.cpp +++ b/src/lib/rpf.cpp @@ -636,7 +636,8 @@ std::vector RandomPlantedForest::predict_single(const std::vector X[std::max(0, dim - 1)] || leaf.intervals[std::max(0, dim - 1)].second == upper_bounds[std::max(0, dim - 1)]))) + if (!((leaf.intervals[std::max(0, dim - 1)].first <= X[std::max(0, dim - 1)] || leaf.intervals[std::max(0, dim - 1)].first == lower_bounds[std::max(0, dim - 1)]) + && (leaf.intervals[std::max(0, dim - 1)].second > X[std::max(0, dim - 1)] || leaf.intervals[std::max(0, dim - 1)].second == upper_bounds[std::max(0, dim - 1)]))) { valid = false; } @@ -899,1055 +900,6 @@ double RandomPlantedForest::MSE(const NumericMatrix &Y_predicted, const NumericM return sum / Y_size; } -void RandomPlantedForest::purify_1() -{ - - // go through all n_trees families - for (auto &curr_family : this->tree_families) - { - - // recap maximum number of dimensions of current family - unsigned int curr_max = 0; - for (auto tree : curr_family) - { - if (tree.first.size() > curr_max) - curr_max = tree.first.size(); - } - - while (curr_max >= 1) - { - - // go through split dimensions of all trees - auto keys = getKeys(curr_family); - std::vector>::reverse_iterator key = keys.rbegin(); - while (key != keys.rend()) - { - - auto &curr_tree = curr_family[(*key)]; - std::set curr_dims = curr_tree->split_dims; - - // check if number of dims same as current max_interaction - if (curr_dims.size() == curr_max) - { - - // go through feature dims - for (int feature_dim = 1; feature_dim <= feature_size; ++feature_dim) - { - - // continue only if dim in current tree - if (curr_tree->split_dims.count(feature_dim) != 0) - { - - std::set tree_dims = curr_tree->split_dims; - tree_dims.erase(tree_dims.find(feature_dim)); // remove current feature dim from current tree - - // check if tree with dimensions exists, if not create - std::shared_ptr tree = treeExists(tree_dims, curr_family); - if (curr_max == 1) - { - tree = curr_family[std::set{0}]; - } - else - { - if (!tree) - { - curr_family.insert(std::make_pair(tree_dims, std::make_shared(DecisionTree(tree_dims)))); - tree = curr_family[tree_dims]; - } - } - - // go through leaves of current tree - int n_leaves = curr_tree->leaves.size(); - for (int l = 0; l < n_leaves; ++l) - { - auto &curr_leaf = curr_tree->leaves[l]; - - double multiplier = (curr_leaf.intervals[feature_dim - 1].second - curr_leaf.intervals[feature_dim - 1].first) / (upper_bounds[feature_dim - 1] - lower_bounds[feature_dim - 1]); - - // new leaf including intervals and value - Leaf new_leaf = curr_leaf; // initialize intervals with first leaf - new_leaf.intervals[feature_dim - 1].first = lower_bounds[feature_dim - 1]; - new_leaf.intervals[feature_dim - 1].second = upper_bounds[feature_dim - 1]; - for (size_t i = 0; i < value_size; ++i) - new_leaf.value[i] = -curr_leaf.value[i] * multiplier; // update value of new leaf - - // append new leaf - if (!leafExists(new_leaf.intervals, curr_tree)) - curr_tree->leaves.push_back(new_leaf); - for (size_t i = 0; i < value_size; ++i) - new_leaf.value[i] = curr_leaf.value[i] * multiplier; // update value of new leaf - if (!leafExists(new_leaf.intervals, tree)) - tree->leaves.push_back(new_leaf); - } - } - } - } - key++; - } - - // update currently considered dimension size - --curr_max; - } - } - - purified = true; -} - -void RandomPlantedForest::purify_2() -{ - - // go through all n_trees families - for (auto &curr_family : this->tree_families) - { - - // lim_list is a list giving for each variable all interval end-points - std::vector> lim_list(feature_size); - - // go through all variables of the component - for (int curr_dim = 1; curr_dim <= feature_size; ++curr_dim) - { - std::vector bounds; - - // go through trees of family - for (const auto &curr_tree : curr_family) - { - - // consider only relevant trees that have current dimension as variable - if (!curr_tree.first.count(curr_dim)) - continue; - - // go through leaves of tree - for (const auto &curr_leaf : curr_tree.second->leaves) - { - // get interval ends of variable - bounds.push_back(curr_leaf.intervals[curr_dim - 1].second); - } - } - std::sort(bounds.begin(), bounds.end()); - bounds.erase(std::unique(bounds.begin(), bounds.end()), bounds.end()); - lim_list[curr_dim - 1] = bounds; - } - - // initialize values and individuals for each tree in family - std::vector grids(curr_family.size() - 1); - std::vector> individuals(curr_family.size() - 1); - std::vector>> values(curr_family.size() - 1); - std::vector> variables(curr_family.size() - 1); - - // ------------- setup finer grid ------------- - - int tree_index = 0; - for (const auto &curr_tree : curr_family) - { - - if (curr_tree.first == std::set{0}) - continue; // ignore null tree - - // fill space with dimensions - std::vector dimensions; - for (const auto &dim : curr_tree.first) - { - dimensions.push_back(lim_list[dim - 1].size() - 1); // size - 1 ? - } - - // setup grid for leaf indices - auto grid = grid::NDGrid(dimensions); - - // initialize data for current tree - grids[tree_index] = grid; - individuals[tree_index] = utils::Matrix(dimensions, 0); - values[tree_index] = utils::Matrix>(dimensions, std::vector(value_size, 0)); // changed - variables[tree_index] = curr_tree.first; - - // fill grid points with individuals and values - while (!grid.nextPoint()) - { - - std::vector gridPoint = grid.getPoint(); - - bool in_leaf = true; - - // go through sample points to sum up individuals - for (const auto &feature_vec : X) - { - int dim_index = 0; - in_leaf = true; - for (const auto &dim : curr_tree.first) - { - double val = feature_vec[dim - 1]; - if (!((val >= lim_list[dim - 1][gridPoint[dim_index]]) && (val < lim_list[dim - 1][gridPoint[dim_index] + 1]))) - in_leaf = false; - ++dim_index; - } - - // consider individuals only if all in - if (in_leaf) - individuals[tree_index][gridPoint] += 1; - } - - // go through leaves of tree to sum up values - for (const auto &leaf : curr_tree.second->get_leaves()) - { - - in_leaf = true; - int dim_index = 0; - for (const auto &dim : curr_tree.first) - { - // consider values only if all in - if (!((leaf.intervals[dim - 1].first <= lim_list[dim - 1][gridPoint[dim_index]]) && (leaf.intervals[dim - 1].second >= lim_list[dim - 1][gridPoint[dim_index] + 1]))) - in_leaf = false; - ++dim_index; - } - - // sum up values - if (in_leaf) - values[tree_index][gridPoint] += leaf.value; // todo: multiclass - } - } - - ++tree_index; - } - - // ------------- create new trees ------------- - - // insert null tree - grids.insert(grids.begin(), grid::NDGrid()); - values.insert(values.begin(), utils::Matrix>(std::vector{1}, std::vector(value_size, 0))); - individuals.insert(individuals.begin(), utils::Matrix(std::vector{1})); - variables.insert(variables.begin(), std::set{0}); - - // recap maximum number of dimensions of current family - unsigned int curr_max = 0; - for (const auto &tree : curr_family) - { - if (tree.first.size() > curr_max) - curr_max = tree.first.size(); - } - - auto keys = getKeys(curr_family); - while (curr_max > 1) - { - - // go through split dimensions of all trees - for (std::vector>::reverse_iterator key = keys.rbegin(); key != keys.rend(); ++key) - { - - auto &curr_tree = curr_family[(*key)]; - std::set curr_dims = curr_tree->split_dims; - - // check if number of dims same as current max_interaction - if (curr_dims.size() == curr_max) - { - - // go through feature dims - int dim_index = 0; - for (int feature_dim = 1; feature_dim <= feature_size; ++feature_dim) - { - - // continue only if dim in current tree - if (curr_tree->split_dims.count(feature_dim) != 0) - { - - std::set tree_dims = curr_tree->split_dims; - tree_dims.erase(tree_dims.find(feature_dim)); // remove current feature dim from current tree - - // check if tree with dimensions exists, if not create - std::shared_ptr tree = treeExists(tree_dims, curr_family); - if (!tree) - { - - // get index of old and new tree - auto old_tree_index = std::distance(std::begin(curr_family), curr_family.find(curr_tree->get_split_dims())); - curr_family.insert(std::make_pair(tree_dims, std::make_shared(DecisionTree(tree_dims)))); - auto tree_index = std::distance(std::begin(curr_family), curr_family.find(tree_dims)); - - // remove matrix dimension of respective variable - std::vector matrix_dimensions = values[old_tree_index].dims; - matrix_dimensions.erase(matrix_dimensions.begin() + dim_index); - - // initialize data for new tree - auto grid = grid::NDGrid(matrix_dimensions); - grids.insert(grids.begin() + tree_index, grid); - values.insert(values.begin() + tree_index, utils::Matrix>(matrix_dimensions, std::vector(0, value_size))); - individuals.insert(individuals.begin() + tree_index, utils::Matrix(matrix_dimensions)); - variables.insert(variables.begin() + tree_index, tree_dims); - - // fill individuals of new trees - while (!grid.nextPoint()) - { - - std::vector gridPoint = grid.getPoint(); - bool in_leaf = true; - - // go through sample points to sum up individuals - for (const auto &feature_vec : X) - { - int dim_index = 0; - in_leaf = true; - for (const auto &dim : tree_dims) - { - double val = feature_vec[dim - 1]; - if (!((val >= lim_list[dim - 1][gridPoint[dim_index]]) && (val < lim_list[dim - 1][gridPoint[dim_index] + 1]))) - in_leaf = false; - ++dim_index; - } - - // consider individuals only if all in - if (in_leaf) - individuals[tree_index][gridPoint] += 1; - } - } - } - - dim_index++; - } - } - } - } - - // update currently considered dimension size - --curr_max; - } - - // ------------- purify ------------- - - // measure tolerance and number of iterations - std::vector tol(curr_family.size(), 1); - int iter; - - // iterate backwards through tree family - int curr_tree_index = curr_family.size() - 1; - for (TreeFamily::reverse_iterator curr_tree = curr_family.rbegin(); curr_tree != curr_family.rend(); ++curr_tree) - { - iter = 0; - std::set curr_dims = curr_tree->second->get_split_dims(); - - // do not purify null - if (curr_dims == std::set{0}) - continue; - - // repeat until tolerance small enough and (?) maximum number of iterations reached - while ((tol[curr_tree_index] > 0.00000000001) && (iter < 100)) - { - - // go through feature dims - int curr_dim_index = 0; - for (const auto &feature_dim : curr_dims) - { - - // get tree that has same variables as curr_tree minus j-variable - std::set tree_dims = curr_dims; - tree_dims.erase(tree_dims.find(feature_dim)); - int tree_index = 0; // if tree not exist, set to null tree - if (curr_family.find(tree_dims) != curr_family.end()) - tree_index = std::distance(std::begin(curr_family), curr_family.find(tree_dims)) - 1; - - // update values - if (grids[curr_tree_index].dimensions.size() == 1) - { // one dimensional case - - int sum_ind = 0; - std::vector avg(value_size, 0); - - // get sum of individuals - for (int i = 0; i < individuals[curr_tree_index].n_entries; ++i) - { - std::vector tmp{i}; - sum_ind += individuals[curr_tree_index][tmp]; - } - if (sum_ind == 0) - continue; - - // calc avg - for (int i = 0; i < individuals[curr_tree_index].n_entries; ++i) - { - std::vector tmp{i}; - avg += (individuals[curr_tree_index][tmp] * values[curr_tree_index][tmp]) / sum_ind; - } - - // update values of one dimensional and null tree - for (int i = 0; i < values[curr_tree_index].n_entries; ++i) - { - std::vector tmp{i}; - values[curr_tree_index][tmp] -= avg; - } - std::vector tmp{0}; - values[tree_index][tmp] += avg; - } - else - { // higher dimensional case - - // setup new grid without dimension j - std::vector new_dimensions = grids[curr_tree_index].dimensions; - int j_dim = new_dimensions[curr_dim_index]; - new_dimensions.erase(new_dimensions.begin() + curr_dim_index); - grid::NDGrid grid = grid::NDGrid(new_dimensions); - - // go through values without dimension j - while (!grid.nextPoint()) - { - auto gridPoint = grid.getPoint(); - gridPoint.push_back(0); - - int sum_ind = 0; - std::vector avg(value_size, 0); - - // go through slice to sum up individuals - for (int j = 0; j < j_dim; ++j) - { - gridPoint.back() = j; - - // get sum of individuals - sum_ind += individuals[curr_tree_index][gridPoint]; - } - - // go through slice to calc avg - for (int j = 0; j < j_dim; ++j) - { - gridPoint.back() = j; - - // calc avg - avg += (individuals[curr_tree_index][gridPoint] * values[curr_tree_index][gridPoint]) / sum_ind; - } - - // go through slice to update values - for (int j = 0; j < j_dim; ++j) - { - gridPoint.back() = j; - - // update values of current slice - values[curr_tree_index][gridPoint] -= avg; - } - - // update lower dimensional tree - gridPoint.pop_back(); - values[tree_index][gridPoint] += avg; - } - } - - ++curr_dim_index; - } - - // update tolerance - if (variables[curr_tree_index].size() == 1) - { - tol[curr_tree_index] = 1; // todo - } - else - { - tol[curr_tree_index] = 1; - } - - ++iter; - } - - --curr_tree_index; - } - - // ------------- attach to rpf class ------------- - - // fill with new trees - for (size_t tree_index = 0; tree_index < variables.size(); ++tree_index) - { - LeafGrid curr_gridLeaf; - curr_gridLeaf.grid = grids[tree_index]; - curr_gridLeaf.individuals = individuals[tree_index]; - curr_gridLeaf.lim_list = lim_list; - curr_gridLeaf.values = values[tree_index]; - curr_family[variables[tree_index]]->GridLeaves = curr_gridLeaf; - } - } - - purified = true; -} - -void RandomPlantedForest::purify_3() -{ - - // go through all n_trees families - for (auto &curr_family : this->tree_families) - { - - // lim_list is a list giving for each variable all interval end-points - std::vector> lim_list(feature_size); - - // go through all variables of the component - for (int curr_dim = 1; curr_dim <= feature_size; ++curr_dim) - { - std::vector bounds; - - // go through trees of family - for (const auto &curr_tree : curr_family) - { - - // consider only relevant trees that have current dimension as variable - if (!curr_tree.first.count(curr_dim)) - continue; - - // go through leaves of tree - for (const auto &curr_leaf : curr_tree.second->leaves) - { - // get interval ends of variable - bounds.push_back(curr_leaf.intervals[curr_dim - 1].first); - bounds.push_back(curr_leaf.intervals[curr_dim - 1].second); - } - } - std::sort(bounds.begin(), bounds.end()); - bounds.erase(std::unique(bounds.begin(), bounds.end()), bounds.end()); - // int i_last = bounds.size()-1; - // double bibi = bounds[i_last] + 0.0001; - // bounds[i_last] = bounds[i_last] + 0.0001; - lim_list[curr_dim - 1] = bounds; - } - - // initialize values and individuals for each tree in family - std::vector grids(curr_family.size() - 1); - std::vector> individuals(curr_family.size() - 1); - std::vector>> values(curr_family.size() - 1); - std::vector>> values_old(curr_family.size() - 1); - std::vector> variables(curr_family.size() - 1); - - // ------------- setup finer grid ------------- - - int tree_index = 0; - for (const auto &curr_tree : curr_family) - { - - if (curr_tree.first == std::set{0}) - { - - // values[tree_index] = rpf::Matrix>(dimensions, std::vector(value_size, 0)); // changed - continue; // ignore null tree - } - - // fill space with dimensions - std::vector dimensions; - for (const auto &dim : curr_tree.first) - { - dimensions.push_back(lim_list[dim - 1].size()); // size - 1 ? WICHTIG - } - - // setup grid for leaf indices - auto grid = grid::NDGrid(dimensions); - - // initialize data for current tree - grids[tree_index] = grid; - individuals[tree_index] = utils::Matrix(dimensions, 0); - values[tree_index] = utils::Matrix>(dimensions, std::vector(value_size, 0)); // changed - values_old[tree_index] = utils::Matrix>(dimensions, std::vector(value_size, 0)); // changed - variables[tree_index] = curr_tree.first; - - // fill grid points with individuals and values - while (!grid.nextPoint()) - { - - std::vector gridPoint = grid.getPoint(); - - bool in_leaf = true; - - // go through sample points to sum up individuals - for (const auto &feature_vec : X) - { - int dim_index = 0; - in_leaf = true; - for (const auto &dim : curr_tree.first) - { - double val = feature_vec[dim - 1]; - if (!((val >= lim_list[dim - 1][gridPoint[dim_index]]) && (val < lim_list[dim - 1][gridPoint[dim_index] + 1]))) - in_leaf = false; - ++dim_index; - } - - // consider individuals only if all in - if (in_leaf) - individuals[tree_index][gridPoint] += 1; - } - - // go through leaves of tree to sum up values - for (const auto &leaf : curr_tree.second->get_leaves()) - { - - in_leaf = true; - int dim_index = 0; - for (const auto &dim : curr_tree.first) - { - // consider values only if all in - if (!((leaf.intervals[dim - 1].first <= lim_list[dim - 1][gridPoint[dim_index]]) && (leaf.intervals[dim - 1].second >= lim_list[dim - 1][gridPoint[dim_index] + 1]))) - in_leaf = false; - ++dim_index; - } - - // sum up values - if (in_leaf) - { - - values[tree_index][gridPoint] += leaf.value; // todo: multiclass - values_old[tree_index][gridPoint] += leaf.value; // todo: multiclass - } - } - } - - ++tree_index; - } - - // Rcout << variables.size(); - // for(int i = 0; i>(std::vector{1}, std::vector(value_size, 0))); - values_old.insert(values_old.begin(), utils::Matrix>(std::vector{1}, std::vector(value_size, 0))); - individuals.insert(individuals.begin(), utils::Matrix(std::vector{1})); - variables.insert(variables.begin(), std::set{0}); - - // recap maximum number of dimensions of current family - unsigned int curr_max = curr_family.rbegin()->first.size(); - - while (curr_max > 1) - { - - auto keys = getKeys(curr_family); - // go through split dimensions of all trees - for (std::vector>::reverse_iterator key = keys.rbegin(); key != keys.rend(); ++key) - { - auto &curr_tree = curr_family[(*key)]; - std::set curr_dims = curr_tree->split_dims; - // check if number of dims same as current max_interaction - if (curr_dims.size() == curr_max) - { - // go through feature dims - int dim_index = 0; - for (int feature_dim = 1; feature_dim <= feature_size; ++feature_dim) - { - // continue only if dim in current tree - if (curr_tree->split_dims.count(feature_dim) != 0) - { - std::set tree_dims = curr_tree->split_dims; - tree_dims.erase(tree_dims.find(feature_dim)); // remove current feature dim from current tree - // check if tree with dimensions exists, if not create - std::shared_ptr tree = treeExists(tree_dims, curr_family); - if (!tree) - { - // get index of old and new tree - auto old_tree_index = std::distance(std::begin(curr_family), curr_family.find(curr_tree->get_split_dims())); - curr_family.insert(std::make_pair(tree_dims, std::make_shared(DecisionTree(tree_dims)))); - auto tree_index = std::distance(std::begin(curr_family), curr_family.find(tree_dims)); - // remove matrix dimension of respective variable - std::vector matrix_dimensions = values[old_tree_index].dims; - // std::vector matrix_dimensions = values_old[old_tree_index].dims; - - // Rcout << typeof(matrix_dimensions.begin()) << std::endl; - - matrix_dimensions.erase(matrix_dimensions.begin() + dim_index); - // initialize data for new tree - auto grid = grid::NDGrid(matrix_dimensions); - grids.insert(grids.begin() + tree_index, grid); - values.insert(values.begin() + tree_index, utils::Matrix>(matrix_dimensions, std::vector(value_size, 0))); - values_old.insert(values_old.begin() + tree_index, utils::Matrix>(matrix_dimensions, std::vector(value_size, 0))); - individuals.insert(individuals.begin() + tree_index, utils::Matrix(matrix_dimensions)); - variables.insert(variables.begin() + tree_index, tree_dims); - // fill individuals of new trees - while (!grid.nextPoint()) - { - std::vector gridPoint = grid.getPoint(); - bool in_leaf = true; - // go through sample points to sum up individuals - for (const auto &feature_vec : X) - { - int dim_index2 = 0; - in_leaf = true; - for (const auto &dim : tree_dims) - { - double val = feature_vec[dim - 1]; - if (!((val >= lim_list[dim - 1][gridPoint[dim_index2]]) && (val < lim_list[dim - 1][gridPoint[dim_index2] + 1]))) - in_leaf = false; - ++dim_index2; - } - // consider individuals only if all in - if (in_leaf) - individuals[tree_index][gridPoint] += 1; - } - } - } - dim_index++; - } - } - } - } - // update currently considered dimension size - --curr_max; - } - - // Rcout << std::endl; - // Rcout << std::endl; - // Rcout << std::endl; - // - // for(int i = 0; i curr_dims = *tree_t; - // do not purify null - if (curr_dims == std::set{0}) - continue; - // Rcout << std::endl << tree_index_t << " - T: "; - // Rcout << "tree_t:"; - // for(auto dim: curr_dims) Rcout << dim << ", "; - // Rcout << std::endl; - - auto grid = grids[tree_index_t]; - // Rcout << "Grid dimensions of T: "; - // for(auto dim: grid.dimensions) Rcout << dim << ", "; - // Rcout << std::endl; - // go through subtrees of t - int tree_index_u = variables.size(); - for (auto tree_u = variables.rbegin(); tree_u != variables.rend(); ++tree_u) - { - --tree_index_u; - // j_dims = dims of t without u - std::set j_dims = curr_dims; - if (tree_u->size() > curr_dims.size()) - continue; - // check if subset - bool subset = true; - for (const auto dim : *tree_u) - { - if (tree_t->count(dim) == 0) - { - subset = false; - break; - } - j_dims.erase(dim); - } - if (!subset) - continue; - - // Rcout << "Hello"; - // Rcout << " " << tree_index_u << " - U: "; - // for(auto dim: *tree_u) Rcout << dim << ", "; - // Rcout << std::endl; - // Rcout << " Individuals: "; - - double tot_sum = 0; - grid = grids[tree_index_u]; - while (!grid.nextPoint()) - { - auto gridPoint = grid.getPoint(); - // Rcout << individuals[tree_index_u][gridPoint] << ", "; - tot_sum += individuals[tree_index_u][gridPoint]; - } - // Rcout << "Total sum: " << tot_sum << std::endl; - // Rcout << std::endl; - - grid = grids[tree_index_u]; - // Rcout << " Grid dimensions of U: "; - // for(auto dim: grid.dimensions) Rcout << dim << ", "; - // Rcout << std::endl; - - // Rcout<< "j_dims: "< update(value_size, 0); - - if (j_dims.size() == 0) - { - - // grid = grids[tree_index_u]; - while (!grid.nextPoint()) - { - auto gridPoint_i = grid.getPoint(); - // Rcout << " " << "i: "; - // for(auto p: gridPoint_i) Rcout << p << ", "; - // Rcout << std::endl << " "; - double curr_sum = individuals[tree_index_u][gridPoint_i]; - // Rcout << ", Current Sum: " << curr_sum << std::endl; - // Rcout << std::endl << " " << "i, j: "; - update += (curr_sum / tot_sum) * values_old[tree_index_t][gridPoint_i]; - // Rcout << std::endl; - } - - int tree_index_s = variables.size(); - for (auto tree_s = variables.rbegin(); tree_s != variables.rend(); ++tree_s) - { - - // Rcout << "tree_s:"; - // for(auto dim: *tree_s) Rcout << dim << ", "; - // Rcout << std::endl; - - --tree_index_s; - if (*tree_s == std::set{0}) - { - - auto gridPoint_0 = std::vector{0}; - values[tree_index_s][gridPoint_0] += update; - // Rcout << std::endl; - //} - - /* - for(auto tree_0: curr_family){ - - if(tree_0.first == std::set{0}){ - - Rcout << tree_0.first.size(); - std::vector leaf_index(tree_0.first.size(), 0); - std::vector leaf_index(tree_0.second->GridLeaves.values.size(), 0); - - int Test = tree_0.second->GridLeaves.values.size(); - Rcout << Test; - tree_0.second->GridLeaves.values[leaf_index] += update; - } - } - */ - } - else - { - - // check if S subset of T - - bool subset = true; - for (const auto dim : *tree_s) - { - if (tree_t->count(dim) == 0) - { - subset = false; - break; - } - } - if (!subset) - continue; - - // Rcout << pow(-1, (*tree_s).size()) << std::endl; - - auto grid_k = grids[tree_index_s]; - while (!grid_k.nextPoint()) - { - auto gridPoint_k = grid_k.getPoint(); - // - // if((*tree_s).size()>2){ - // Rcout << std::endl << " " << "j, k: "; - // for(auto p: gridPoint_k) Rcout << p << ", "; - // Rcout << std::endl; - // } - // - // Rcout << pow(-1, (*tree_s).size()) * update << std::endl; - values[tree_index_s][gridPoint_k] += pow(-1, (*tree_s).size()) * update; - } - } - } - // Rcout << std::endl; - } - else - { - - std::vector j_sizes(j_dims.size(), 0); - for (size_t j = 0; j < j_dims.size(); ++j) - { - auto tmp = j_dims.begin(); - std::advance(tmp, j); - int j_index = std::distance(variables[tree_index_t].begin(), variables[tree_index_t].find(*tmp)); - j_sizes[j] = grids[tree_index_t].dimensions[j_index]; - } - - // Rcout<<"Hello 1"; - - grid::NDGrid grid_j = grid::NDGrid(j_sizes); - while (!grid_j.nextPoint()) - { - - std::vector update(value_size, 0); - auto gridPoint_j = grid_j.getPoint(); - // Rcout << " " << "j: "; - // for(auto p: gridPoint_j) Rcout << p << ", "; - // Rcout << std::endl; - // calc update - grid = grids[tree_index_u]; - while (!grid.nextPoint()) - { - auto gridPoint_i = grid.getPoint(); - // Rcout << " " << "i: "; - // for(auto p: gridPoint_i) Rcout << p << ", "; - // Rcout << std::endl << " "; - double curr_sum = individuals[tree_index_u][gridPoint_i]; - // Rcout << ", Current Sum: " << curr_sum << std::endl; - std::vector gridPoint_ij(tree_t->size(), 0); - for (size_t j = 0; j < gridPoint_j.size(); ++j) - { - auto j_dim = j_dims.begin(); - std::advance(j_dim, j); - int j_index = std::distance(variables[tree_index_t].begin(), variables[tree_index_t].find(*j_dim)); - // Rcout << " j_dim=" << *j_dim << ", j_index=" << j_index; - gridPoint_ij[j_index] = gridPoint_j[j]; - } - for (size_t i = 0; i < gridPoint_i.size(); ++i) - { - auto i_dim = tree_u->begin(); - std::advance(i_dim, i); - int i_index = std::distance(variables[tree_index_t].begin(), variables[tree_index_t].find(*i_dim)); - // Rcout << " i_dim=" << *i_dim << ", i_index=" << i_index; - gridPoint_ij[i_index] = gridPoint_i[i]; - } - // Rcout << std::endl << " " << "i, j: "; - // for(auto p: gridPoint_ij) Rcout << p << ", "; - // Rcout << std::endl; - update += (curr_sum / tot_sum) * values_old[tree_index_t][gridPoint_ij]; - // Rcout << std::endl; - } - - // Rcout << "Hello_2"; - // update trees - int tree_index_s = variables.size(); - for (auto tree_s = variables.rbegin(); tree_s != variables.rend(); ++tree_s) - { - --tree_index_s; - // check if T\U=j_dims subset of S and S subset of T - bool subset = true; - for (const auto dim : j_dims) - { - if (tree_s->count(dim) == 0) - { - subset = false; - break; - } - } - for (const auto dim : *tree_s) - { - if (tree_t->count(dim) == 0) - { - subset = false; - break; - } - } - if (!subset) - continue; - // Rcout << " " << "S: "; - // for(auto dim: *tree_s) Rcout << dim << ", "; - // Rcout << std::endl; - // S cap U - std::set k_dims = *tree_s; - std::set k_dims_h1 = *tree_s; - std::set k_dims_h2 = *tree_u; - for (const auto dim : *tree_u) - k_dims.insert(dim); - for (const auto dim : *tree_s) - k_dims_h2.erase(dim); - for (const auto dim : *tree_u) - k_dims_h1.erase(dim); - for (const auto dim : k_dims_h1) - k_dims.erase(dim); - for (const auto dim : k_dims_h2) - k_dims.erase(dim); - - // std::set k_dims = *tree_s; - // for(const auto dim: *tree_t) k_dims.erase(dim); - // for(const auto dim: *tree_u) k_dims.insert(dim); - - // Rcout << " " << "k_dims: "; - // for(auto dim: k_dims) Rcout << dim << ", "; - // Rcout << std::endl; - - if (k_dims.size() == 0) - { - - values[tree_index_s][gridPoint_j] += pow(-1, (*tree_s).size() - j_dims.size()) * update; - } - else - { - - // Rcout <<"k_dims :"; - // for(auto dim: k_dims) Rcout << dim << ", "; - // Rcout << std::endl; - - std::vector k_sizes(k_dims.size(), 0); - for (size_t k = 0; k < k_dims.size(); ++k) - { - auto tmp = k_dims.begin(); - std::advance(tmp, k); - int k_index = std::distance(variables[tree_index_t].begin(), variables[tree_index_t].find(*tmp)); - k_sizes[k] = grids[tree_index_t].dimensions[k_index]; - } - // Rcout << " " << "k_sizes: "; - // for(auto dim: k_sizes) Rcout << dim << ", "; - // Rcout << std::endl; - grid::NDGrid grid_k = grid::NDGrid(k_sizes); - while (!grid_k.nextPoint()) - { - auto gridPoint_k = grid_k.getPoint(); - // Rcout << " " << "k: "; - // for(auto p: gridPoint_k) Rcout << p << ", "; - // Rcout << std::endl << " "; - std::vector gridPoint_jk(tree_s->size(), 0); - for (size_t j = 0; j < gridPoint_j.size(); ++j) - { - auto j_dim = j_dims.begin(); - std::advance(j_dim, j); - int j_index = std::distance(variables[tree_index_s].begin(), variables[tree_index_s].find(*j_dim)); - // Rcout << " j_dim=" << *j_dim << ", j_index=" << j_index; - gridPoint_jk[j_index] = gridPoint_j[j]; - } - for (size_t k = 0; k < gridPoint_k.size(); ++k) - { - auto k_dim = k_dims.begin(); - std::advance(k_dim, k); - int k_index = std::distance(variables[tree_index_s].begin(), variables[tree_index_s].find(*k_dim)); - // Rcout << " k_dim=" << *k_dim << ", k_index=" << k_index; - gridPoint_jk[k_index] = gridPoint_k[k]; - } - // Rcout << std::endl << " " << "j, k: "; - // for(auto p: gridPoint_jk) Rcout << p << ", "; - // Rcout << std::endl; - - // Rcout << pow(-1, (*tree_s).size() - j_dims.size()) * update[0]; - values[tree_index_s][gridPoint_jk] += pow(-1, (*tree_s).size() - j_dims.size()) * update; - } - } - } - } - } - } - --tree_index_t; - } - - // ------------- attach to rpf class ------------- - - // fill with new trees - for (size_t tree_index = 0; tree_index < variables.size(); ++tree_index) - { - LeafGrid curr_gridLeaf; - curr_gridLeaf.grid = grids[tree_index]; - curr_gridLeaf.individuals = individuals[tree_index]; - curr_gridLeaf.lim_list = lim_list; - curr_gridLeaf.values = values[tree_index]; - curr_family[variables[tree_index]]->GridLeaves = curr_gridLeaf; - } - } - - purified = true; -} - void RandomPlantedForest::print() { for (int n = 0; n < n_trees; ++n) diff --git a/src/lib/rpf_purify.cpp b/src/lib/rpf_purify.cpp new file mode 100644 index 0000000..17df13b --- /dev/null +++ b/src/lib/rpf_purify.cpp @@ -0,0 +1,1058 @@ +#include "rpf.hpp" + + +void RandomPlantedForest::purify_3() +{ + + // go through all n_trees families + for (auto &curr_family : this->tree_families) + { + + // lim_list is a list giving for each variable all interval end-points + std::vector> lim_list = get_lim_list(curr_family); + + // initialize values and individuals for each tree in family + std::vector grids(curr_family.size() - 1); + std::vector> individuals(curr_family.size() - 1); + std::vector>> values(curr_family.size() - 1); + std::vector>> values_old(curr_family.size() - 1); + std::vector> variables(curr_family.size() - 1); + + // ------------- setup finer grid ------------- + + int tree_index = 0; + for (const auto &curr_tree : curr_family) + { + + if (curr_tree.first == std::set{0}) + { + + // values[tree_index] = rpf::Matrix>(dimensions, std::vector(value_size, 0)); // changed + continue; // ignore null tree + } + + // fill space with dimensions + std::vector dimensions; + for (const auto &dim : curr_tree.first) + { + dimensions.push_back(lim_list[dim - 1].size()); // size - 1 ? WICHTIG + } + + // setup grid for leaf indices + auto grid = grid::NDGrid(dimensions); + + // initialize data for current tree + grids[tree_index] = grid; + individuals[tree_index] = utils::Matrix(dimensions, 0); + values[tree_index] = utils::Matrix>(dimensions, std::vector(value_size, 0)); // changed + values_old[tree_index] = utils::Matrix>(dimensions, std::vector(value_size, 0)); // changed + variables[tree_index] = curr_tree.first; + + // fill grid points with individuals and values + while (!grid.nextPoint()) + { + + std::vector gridPoint = grid.getPoint(); + + bool in_leaf = true; + + // go through sample points to sum up individuals + for (const auto &feature_vec : X) + { + int dim_index = 0; + in_leaf = true; + for (const auto &dim : curr_tree.first) + { + double val = feature_vec[dim - 1]; + if (!((val >= lim_list[dim - 1][gridPoint[dim_index]]) && (val < lim_list[dim - 1][gridPoint[dim_index] + 1]))) + in_leaf = false; //Break here? + ++dim_index; + } + + // consider individuals only if all in + if (in_leaf) + individuals[tree_index][gridPoint] += 1; + } + + // go through leaves of tree to sum up values + for (const auto &leaf : curr_tree.second->get_leaves()) + { + + in_leaf = true; + int dim_index = 0; + for (const auto &dim : curr_tree.first) + { + // consider values only if all in + if (!((leaf.intervals[dim - 1].first <= lim_list[dim - 1][gridPoint[dim_index]]) && (leaf.intervals[dim - 1].second >= lim_list[dim - 1][gridPoint[dim_index] + 1]))) + in_leaf = false; // Break here? + ++dim_index; + } + + // sum up values + if (in_leaf) + { + + values[tree_index][gridPoint] += leaf.value; // todo: multiclass + values_old[tree_index][gridPoint] += leaf.value; // todo: multiclass + } + } + } + + ++tree_index; + } + + // Rcout << variables.size(); + // for(int i = 0; i>(std::vector{1}, std::vector(value_size, 0))); + values_old.insert(values_old.begin(), utils::Matrix>(std::vector{1}, std::vector(value_size, 0))); + individuals.insert(individuals.begin(), utils::Matrix(std::vector{1})); + variables.insert(variables.begin(), std::set{0}); + + // recap maximum number of dimensions of current family + unsigned int curr_max = curr_family.rbegin()->first.size(); + + while (curr_max > 1) + { + + auto keys = getKeys(curr_family); + // go through split dimensions of all trees + for (std::vector>::reverse_iterator key = keys.rbegin(); key != keys.rend(); ++key) + { + auto &curr_tree = curr_family[(*key)]; + std::set curr_dims = curr_tree->split_dims; + // check if number of dims same as current max_interaction + if (curr_dims.size() == curr_max) + { + // go through feature dims + int dim_index = 0; + for (int feature_dim = 1; feature_dim <= feature_size; ++feature_dim) + { + // continue only if dim in current tree + if (curr_tree->split_dims.count(feature_dim) != 0) + { + std::set tree_dims = curr_tree->split_dims; + tree_dims.erase(tree_dims.find(feature_dim)); // remove current feature dim from current tree + // check if tree with dimensions exists, if not create + std::shared_ptr tree = treeExists(tree_dims, curr_family); + if (!tree) + { + // get index of old and new tree + auto old_tree_index = std::distance(std::begin(curr_family), curr_family.find(curr_tree->get_split_dims())); + curr_family.insert(std::make_pair(tree_dims, std::make_shared(DecisionTree(tree_dims)))); + auto tree_index = std::distance(std::begin(curr_family), curr_family.find(tree_dims)); + // remove matrix dimension of respective variable + std::vector matrix_dimensions = values[old_tree_index].dims; + // std::vector matrix_dimensions = values_old[old_tree_index].dims; + + // Rcout << typeof(matrix_dimensions.begin()) << std::endl; + + matrix_dimensions.erase(matrix_dimensions.begin() + dim_index); + // initialize data for new tree + auto grid = grid::NDGrid(matrix_dimensions); + grids.insert(grids.begin() + tree_index, grid); + values.insert(values.begin() + tree_index, utils::Matrix>(matrix_dimensions, std::vector(value_size, 0))); + values_old.insert(values_old.begin() + tree_index, utils::Matrix>(matrix_dimensions, std::vector(value_size, 0))); + individuals.insert(individuals.begin() + tree_index, utils::Matrix(matrix_dimensions)); + variables.insert(variables.begin() + tree_index, tree_dims); + // fill individuals of new trees + while (!grid.nextPoint()) + { + std::vector gridPoint = grid.getPoint(); + bool in_leaf = true; + // go through sample points to sum up individuals + for (const auto &feature_vec : X) + { + int dim_index2 = 0; + in_leaf = true; + for (const auto &dim : tree_dims) + { + double val = feature_vec[dim - 1]; + if (!((val >= lim_list[dim - 1][gridPoint[dim_index2]]) && (val < lim_list[dim - 1][gridPoint[dim_index2] + 1]))) + in_leaf = false; + ++dim_index2; + } + // consider individuals only if all in + if (in_leaf) + individuals[tree_index][gridPoint] += 1; + } + } + } + dim_index++; + } + } + } + } + // update currently considered dimension size + --curr_max; + } + + // Rcout << std::endl; + // Rcout << std::endl; + // Rcout << std::endl; + // + // for(int i = 0; i curr_dims = *tree_t; + // do not purify null + if (curr_dims == std::set{0}) + continue; + // Rcout << std::endl << tree_index_t << " - T: "; + // Rcout << "tree_t:"; + // for(auto dim: curr_dims) Rcout << dim << ", "; + // Rcout << std::endl; + + auto grid = grids[tree_index_t]; + // Rcout << "Grid dimensions of T: "; + // for(auto dim: grid.dimensions) Rcout << dim << ", "; + // Rcout << std::endl; + // go through subtrees of t + int tree_index_u = variables.size(); + for (auto tree_u = variables.rbegin(); tree_u != variables.rend(); ++tree_u) + { + --tree_index_u; + // j_dims = dims of t without u + std::set j_dims = curr_dims; + if (tree_u->size() > curr_dims.size()) + continue; + // check if subset + bool subset = true; + for (const auto dim : *tree_u) + { + if (tree_t->count(dim) == 0) + { + subset = false; + break; + } + j_dims.erase(dim); + } + if (!subset) + continue; + + // Rcout << "Hello"; + // Rcout << " " << tree_index_u << " - U: "; + // for(auto dim: *tree_u) Rcout << dim << ", "; + // Rcout << std::endl; + // Rcout << " Individuals: "; + + double tot_sum = 0; + grid = grids[tree_index_u]; + while (!grid.nextPoint()) + { + auto gridPoint = grid.getPoint(); + // Rcout << individuals[tree_index_u][gridPoint] << ", "; + tot_sum += individuals[tree_index_u][gridPoint]; + } + // Rcout << "Total sum: " << tot_sum << std::endl; + // Rcout << std::endl; + + // Rcout << " Grid dimensions of U: "; + // for(auto dim: grid.dimensions) Rcout << dim << ", "; + // Rcout << std::endl; + + // Rcout<< "j_dims: "< update(value_size, 0); + + if (j_dims.size() == 0) + { + + // grid = grids[tree_index_u]; + while (!grid.nextPoint()) + { + auto gridPoint_i = grid.getPoint(); + // Rcout << " " << "i: "; + // for(auto p: gridPoint_i) Rcout << p << ", "; + // Rcout << std::endl << " "; + double curr_sum = individuals[tree_index_u][gridPoint_i]; + // Rcout << ", Current Sum: " << curr_sum << std::endl; + // Rcout << std::endl << " " << "i, j: "; + update += (curr_sum / tot_sum) * values_old[tree_index_t][gridPoint_i]; + // Rcout << std::endl; + } + + int tree_index_s = variables.size(); + for (auto tree_s = variables.rbegin(); tree_s != variables.rend(); ++tree_s) + { + + // Rcout << "tree_s:"; + // for(auto dim: *tree_s) Rcout << dim << ", "; + // Rcout << std::endl; + + --tree_index_s; + if (*tree_s == std::set{0}) + { + + auto gridPoint_0 = std::vector{0}; + values[tree_index_s][gridPoint_0] += update; + // Rcout << std::endl; + //} + + /* + for(auto tree_0: curr_family){ + + if(tree_0.first == std::set{0}){ + + Rcout << tree_0.first.size(); + std::vector leaf_index(tree_0.first.size(), 0); + std::vector leaf_index(tree_0.second->GridLeaves.values.size(), 0); + + int Test = tree_0.second->GridLeaves.values.size(); + Rcout << Test; + tree_0.second->GridLeaves.values[leaf_index] += update; + } + } + */ + } + else + { + + // check if S subset of T + + bool subset = true; + for (const auto dim : *tree_s) + { + if (tree_t->count(dim) == 0) + { + subset = false; + break; + } + } + if (!subset) + continue; + + // Rcout << pow(-1, (*tree_s).size()) << std::endl; + + auto grid_k = grids[tree_index_s]; + while (!grid_k.nextPoint()) + { + auto gridPoint_k = grid_k.getPoint(); + // + // if((*tree_s).size()>2){ + // Rcout << std::endl << " " << "j, k: "; + // for(auto p: gridPoint_k) Rcout << p << ", "; + // Rcout << std::endl; + // } + // + // Rcout << pow(-1, (*tree_s).size()) * update << std::endl; + values[tree_index_s][gridPoint_k] += pow(-1, (*tree_s).size()) * update; + } + } + } + // Rcout << std::endl; + } + else + { + + std::vector j_sizes(j_dims.size(), 0); + for (size_t j = 0; j < j_dims.size(); ++j) + { + auto tmp = j_dims.begin(); + std::advance(tmp, j); + int j_index = std::distance(variables[tree_index_t].begin(), variables[tree_index_t].find(*tmp)); + j_sizes[j] = grids[tree_index_t].dimensions[j_index]; + } + + // Rcout<<"Hello 1"; + + grid::NDGrid grid_j = grid::NDGrid(j_sizes); + while (!grid_j.nextPoint()) // looping over every x_{i, T\U} essentially + { + + std::vector update(value_size, 0); + auto gridPoint_j = grid_j.getPoint(); + // Rcout << " " << "j: "; + // for(auto p: gridPoint_j) Rcout << p << ", "; + // Rcout << std::endl; + // calc update + grid = grids[tree_index_u]; + while (!grid.nextPoint()) + { + auto gridPoint_i = grid.getPoint(); + // Rcout << " " << "i: "; + // for(auto p: gridPoint_i) Rcout << p << ", "; + // Rcout << std::endl << " "; + double curr_sum = individuals[tree_index_u][gridPoint_i]; + // Rcout << ", Current Sum: " << curr_sum << std::endl; + std::vector gridPoint_ij(tree_t->size(), 0); + for (size_t j = 0; j < gridPoint_j.size(); ++j) + { + auto j_dim = j_dims.begin(); // to keep: T\U + std::advance(j_dim, j); + int j_index = std::distance(variables[tree_index_t].begin(), variables[tree_index_t].find(*j_dim)); + // Rcout << " j_dim=" << *j_dim << ", j_index=" << j_index; + gridPoint_ij[j_index] = gridPoint_j[j]; + } + for (size_t i = 0; i < gridPoint_i.size(); ++i) + { + auto i_dim = tree_u->begin(); // to marginalize: U + std::advance(i_dim, i); + int i_index = std::distance(variables[tree_index_t].begin(), variables[tree_index_t].find(*i_dim)); + // Rcout << " i_dim=" << *i_dim << ", i_index=" << i_index; + gridPoint_ij[i_index] = gridPoint_i[i]; + } + // Rcout << std::endl << " " << "i, j: "; + // for(auto p: gridPoint_ij) Rcout << p << ", "; + // Rcout << std::endl; + update += (curr_sum / tot_sum) * values_old[tree_index_t][gridPoint_ij]; + // Rcout << std::endl; + } + + // Rcout << "Hello_2"; + // update trees + int tree_index_s = variables.size(); + for (auto tree_s = variables.rbegin(); tree_s != variables.rend(); ++tree_s) + { + --tree_index_s; + // check if T\U=j_dims subset of S and S subset of T + bool subset = true; + for (const auto dim : j_dims) + { + if (tree_s->count(dim) == 0) + { + subset = false; + break; + } + } + for (const auto dim : *tree_s) + { + if (tree_t->count(dim) == 0) + { + subset = false; + break; + } + } + if (!subset) + continue; + // Rcout << " " << "S: "; + // for(auto dim: *tree_s) Rcout << dim << ", "; + // Rcout << std::endl; + // S cap U + std::set k_dims = *tree_s; + std::set k_dims_h1 = *tree_s; + std::set k_dims_h2 = *tree_u; + for (const auto dim : *tree_u) + k_dims.insert(dim); + for (const auto dim : *tree_s) + k_dims_h2.erase(dim); + for (const auto dim : *tree_u) + k_dims_h1.erase(dim); + for (const auto dim : k_dims_h1) + k_dims.erase(dim); + for (const auto dim : k_dims_h2) + k_dims.erase(dim); + + // std::set k_dims = *tree_s; + // for(const auto dim: *tree_t) k_dims.erase(dim); + // for(const auto dim: *tree_u) k_dims.insert(dim); + + // Rcout << " " << "k_dims: "; + // for(auto dim: k_dims) Rcout << dim << ", "; + // Rcout << std::endl; + + if (k_dims.size() == 0) + { + + values[tree_index_s][gridPoint_j] += pow(-1, (*tree_s).size() - j_dims.size()) * update; + } + else + { + + // Rcout <<"k_dims :"; + // for(auto dim: k_dims) Rcout << dim << ", "; + // Rcout << std::endl; + + std::vector k_sizes(k_dims.size(), 0); + for (size_t k = 0; k < k_dims.size(); ++k) + { + auto tmp = k_dims.begin(); + std::advance(tmp, k); + int k_index = std::distance(variables[tree_index_t].begin(), variables[tree_index_t].find(*tmp)); + k_sizes[k] = grids[tree_index_t].dimensions[k_index]; + } + // Rcout << " " << "k_sizes: "; + // for(auto dim: k_sizes) Rcout << dim << ", "; + // Rcout << std::endl; + grid::NDGrid grid_k = grid::NDGrid(k_sizes); + while (!grid_k.nextPoint()) + { + auto gridPoint_k = grid_k.getPoint(); + // Rcout << " " << "k: "; + // for(auto p: gridPoint_k) Rcout << p << ", "; + // Rcout << std::endl << " "; + std::vector gridPoint_jk(tree_s->size(), 0); + for (size_t j = 0; j < gridPoint_j.size(); ++j) + { + auto j_dim = j_dims.begin(); + std::advance(j_dim, j); + int j_index = std::distance(variables[tree_index_s].begin(), variables[tree_index_s].find(*j_dim)); + // Rcout << " j_dim=" << *j_dim << ", j_index=" << j_index; + gridPoint_jk[j_index] = gridPoint_j[j]; + } + for (size_t k = 0; k < gridPoint_k.size(); ++k) + { + auto k_dim = k_dims.begin(); + std::advance(k_dim, k); + int k_index = std::distance(variables[tree_index_s].begin(), variables[tree_index_s].find(*k_dim)); + // Rcout << " k_dim=" << *k_dim << ", k_index=" << k_index; + gridPoint_jk[k_index] = gridPoint_k[k]; + } + // Rcout << std::endl << " " << "j, k: "; + // for(auto p: gridPoint_jk) Rcout << p << ", "; + // Rcout << std::endl; + + // Rcout << pow(-1, (*tree_s).size() - j_dims.size()) * update[0]; + values[tree_index_s][gridPoint_jk] += pow(-1, (*tree_s).size() - j_dims.size()) * update; + } + } + } + } + } + } + --tree_index_t; + } + + // ------------- attach to rpf class ------------- + + // fill with new trees + for (size_t tree_index = 0; tree_index < variables.size(); ++tree_index) + { + LeafGrid curr_gridLeaf; + curr_gridLeaf.grid = grids[tree_index]; + curr_gridLeaf.individuals = individuals[tree_index]; + curr_gridLeaf.lim_list = lim_list; + curr_gridLeaf.values = values[tree_index]; + curr_family[variables[tree_index]]->GridLeaves = curr_gridLeaf; + } + } + + purified = true; +} + +std::vector> RandomPlantedForest::get_lim_list(const TreeFamily &curr_family) { + // lim_list is a list giving for each variable all interval end-points + std::vector> lim_list(feature_size); + + // go through all variables of the component + for (int curr_dim = 1; curr_dim <= feature_size; ++curr_dim) + { + std::vector bounds; + + // go through trees of family + for (const auto &curr_tree : curr_family) + { + + // consider only relevant trees that have current dimension as variable + if (!curr_tree.first.count(curr_dim)) + continue; + + // go through leaves of tree + for (const auto &curr_leaf : curr_tree.second->leaves) + { + // get interval ends of variable + bounds.push_back(curr_leaf.intervals[curr_dim - 1].first); + bounds.push_back(curr_leaf.intervals[curr_dim - 1].second); + } + } + std::sort(bounds.begin(), bounds.end()); + bounds.erase(std::unique(bounds.begin(), bounds.end()), bounds.end()); + // int i_last = bounds.size()-1; + // double bibi = bounds[i_last] + 0.0001; + // bounds[i_last] = bounds[i_last] + 0.0001; + lim_list[curr_dim - 1] = bounds; + } + + return lim_list; +} + + +void RandomPlantedForest::purify_1() +{ + + // go through all n_trees families + for (auto &curr_family : this->tree_families) + { + + // recap maximum number of dimensions of current family + unsigned int curr_max = 0; + for (auto tree : curr_family) + { + if (tree.first.size() > curr_max) + curr_max = tree.first.size(); + } + + while (curr_max >= 1) + { + + // go through split dimensions of all trees + auto keys = getKeys(curr_family); + std::vector>::reverse_iterator key = keys.rbegin(); + while (key != keys.rend()) + { + + auto &curr_tree = curr_family[(*key)]; + std::set curr_dims = curr_tree->split_dims; + + // check if number of dims same as current max_interaction + if (curr_dims.size() == curr_max) + { + + // go through feature dims + for (int feature_dim = 1; feature_dim <= feature_size; ++feature_dim) + { + + // continue only if dim in current tree + if (curr_tree->split_dims.count(feature_dim) != 0) + { + + std::set tree_dims = curr_tree->split_dims; + tree_dims.erase(tree_dims.find(feature_dim)); // remove current feature dim from current tree + + // check if tree with dimensions exists, if not create + std::shared_ptr tree = treeExists(tree_dims, curr_family); + if (curr_max == 1) + { + tree = curr_family[std::set{0}]; + } + else + { + if (!tree) + { + curr_family.insert(std::make_pair(tree_dims, std::make_shared(DecisionTree(tree_dims)))); + tree = curr_family[tree_dims]; + } + } + + // go through leaves of current tree + int n_leaves = curr_tree->leaves.size(); + for (int l = 0; l < n_leaves; ++l) + { + auto &curr_leaf = curr_tree->leaves[l]; + + double multiplier = (curr_leaf.intervals[feature_dim - 1].second - curr_leaf.intervals[feature_dim - 1].first) / (upper_bounds[feature_dim - 1] - lower_bounds[feature_dim - 1]); + + // new leaf including intervals and value + Leaf new_leaf = curr_leaf; // initialize intervals with first leaf + new_leaf.intervals[feature_dim - 1].first = lower_bounds[feature_dim - 1]; + new_leaf.intervals[feature_dim - 1].second = upper_bounds[feature_dim - 1]; + for (size_t i = 0; i < value_size; ++i) + new_leaf.value[i] = -curr_leaf.value[i] * multiplier; // update value of new leaf + + // append new leaf + if (!leafExists(new_leaf.intervals, curr_tree)) + curr_tree->leaves.push_back(new_leaf); + for (size_t i = 0; i < value_size; ++i) + new_leaf.value[i] = curr_leaf.value[i] * multiplier; // update value of new leaf + if (!leafExists(new_leaf.intervals, tree)) + tree->leaves.push_back(new_leaf); + } + } + } + } + key++; + } + + // update currently considered dimension size + --curr_max; + } + } + + purified = true; +} + +void RandomPlantedForest::purify_2() +{ + + // go through all n_trees families + for (auto &curr_family : this->tree_families) + { + + // lim_list is a list giving for each variable all interval end-points + std::vector> lim_list(feature_size); + + // go through all variables of the component + for (int curr_dim = 1; curr_dim <= feature_size; ++curr_dim) + { + std::vector bounds; + + // go through trees of family + for (const auto &curr_tree : curr_family) + { + + // consider only relevant trees that have current dimension as variable + if (!curr_tree.first.count(curr_dim)) + continue; + + // go through leaves of tree + for (const auto &curr_leaf : curr_tree.second->leaves) + { + // get interval ends of variable + bounds.push_back(curr_leaf.intervals[curr_dim - 1].second); + } + } + std::sort(bounds.begin(), bounds.end()); + bounds.erase(std::unique(bounds.begin(), bounds.end()), bounds.end()); + lim_list[curr_dim - 1] = bounds; + } + + // initialize values and individuals for each tree in family + std::vector grids(curr_family.size() - 1); + std::vector> individuals(curr_family.size() - 1); + std::vector>> values(curr_family.size() - 1); + std::vector> variables(curr_family.size() - 1); + + // ------------- setup finer grid ------------- + + int tree_index = 0; + for (const auto &curr_tree : curr_family) + { + + if (curr_tree.first == std::set{0}) + continue; // ignore null tree + + // fill space with dimensions + std::vector dimensions; + for (const auto &dim : curr_tree.first) + { + dimensions.push_back(lim_list[dim - 1].size() - 1); // size - 1 ? + } + + // setup grid for leaf indices + auto grid = grid::NDGrid(dimensions); + + // initialize data for current tree + grids[tree_index] = grid; + individuals[tree_index] = utils::Matrix(dimensions, 0); + values[tree_index] = utils::Matrix>(dimensions, std::vector(value_size, 0)); // changed + variables[tree_index] = curr_tree.first; + + // fill grid points with individuals and values + while (!grid.nextPoint()) + { + + std::vector gridPoint = grid.getPoint(); + + bool in_leaf = true; + + // go through sample points to sum up individuals + for (const auto &feature_vec : X) + { + int dim_index = 0; + in_leaf = true; + for (const auto &dim : curr_tree.first) + { + double val = feature_vec[dim - 1]; + if (!((val >= lim_list[dim - 1][gridPoint[dim_index]]) && (val < lim_list[dim - 1][gridPoint[dim_index] + 1]))) + in_leaf = false; + ++dim_index; + } + + // consider individuals only if all in + if (in_leaf) + individuals[tree_index][gridPoint] += 1; + } + + // go through leaves of tree to sum up values + for (const auto &leaf : curr_tree.second->get_leaves()) + { + + in_leaf = true; + int dim_index = 0; + for (const auto &dim : curr_tree.first) + { + // consider values only if all in + if (!((leaf.intervals[dim - 1].first <= lim_list[dim - 1][gridPoint[dim_index]]) && (leaf.intervals[dim - 1].second >= lim_list[dim - 1][gridPoint[dim_index] + 1]))) + in_leaf = false; + ++dim_index; + } + + // sum up values + if (in_leaf) + values[tree_index][gridPoint] += leaf.value; // todo: multiclass + } + } + + ++tree_index; + } + + // ------------- create new trees ------------- + + // insert null tree + grids.insert(grids.begin(), grid::NDGrid()); + values.insert(values.begin(), utils::Matrix>(std::vector{1}, std::vector(value_size, 0))); + individuals.insert(individuals.begin(), utils::Matrix(std::vector{1})); + variables.insert(variables.begin(), std::set{0}); + + // recap maximum number of dimensions of current family + unsigned int curr_max = 0; + for (const auto &tree : curr_family) + { + if (tree.first.size() > curr_max) + curr_max = tree.first.size(); + } + + auto keys = getKeys(curr_family); + while (curr_max > 1) + { + + // go through split dimensions of all trees + for (std::vector>::reverse_iterator key = keys.rbegin(); key != keys.rend(); ++key) + { + + auto &curr_tree = curr_family[(*key)]; + std::set curr_dims = curr_tree->split_dims; + + // check if number of dims same as current max_interaction + if (curr_dims.size() == curr_max) + { + + // go through feature dims + int dim_index = 0; + for (int feature_dim = 1; feature_dim <= feature_size; ++feature_dim) + { + + // continue only if dim in current tree + if (curr_tree->split_dims.count(feature_dim) != 0) + { + + std::set tree_dims = curr_tree->split_dims; + tree_dims.erase(tree_dims.find(feature_dim)); // remove current feature dim from current tree + + // check if tree with dimensions exists, if not create + std::shared_ptr tree = treeExists(tree_dims, curr_family); + if (!tree) + { + + // get index of old and new tree + auto old_tree_index = std::distance(std::begin(curr_family), curr_family.find(curr_tree->get_split_dims())); + curr_family.insert(std::make_pair(tree_dims, std::make_shared(DecisionTree(tree_dims)))); + auto tree_index = std::distance(std::begin(curr_family), curr_family.find(tree_dims)); + + // remove matrix dimension of respective variable + std::vector matrix_dimensions = values[old_tree_index].dims; + matrix_dimensions.erase(matrix_dimensions.begin() + dim_index); + + // initialize data for new tree + auto grid = grid::NDGrid(matrix_dimensions); + grids.insert(grids.begin() + tree_index, grid); + values.insert(values.begin() + tree_index, utils::Matrix>(matrix_dimensions, std::vector(0, value_size))); + individuals.insert(individuals.begin() + tree_index, utils::Matrix(matrix_dimensions)); + variables.insert(variables.begin() + tree_index, tree_dims); + + // fill individuals of new trees + while (!grid.nextPoint()) + { + + std::vector gridPoint = grid.getPoint(); + bool in_leaf = true; + + // go through sample points to sum up individuals + for (const auto &feature_vec : X) + { + int dim_index = 0; + in_leaf = true; + for (const auto &dim : tree_dims) + { + double val = feature_vec[dim - 1]; + if (!((val >= lim_list[dim - 1][gridPoint[dim_index]]) && (val < lim_list[dim - 1][gridPoint[dim_index] + 1]))) + in_leaf = false; + ++dim_index; + } + + // consider individuals only if all in + if (in_leaf) + individuals[tree_index][gridPoint] += 1; + } + } + } + + dim_index++; + } + } + } + } + + // update currently considered dimension size + --curr_max; + } + + // ------------- purify ------------- + + // measure tolerance and number of iterations + std::vector tol(curr_family.size(), 1); + int iter; + + // iterate backwards through tree family + int curr_tree_index = curr_family.size() - 1; + for (TreeFamily::reverse_iterator curr_tree = curr_family.rbegin(); curr_tree != curr_family.rend(); ++curr_tree) + { + iter = 0; + std::set curr_dims = curr_tree->second->get_split_dims(); + + // do not purify null + if (curr_dims == std::set{0}) + continue; + + // repeat until tolerance small enough and (?) maximum number of iterations reached + while ((tol[curr_tree_index] > 0.00000000001) && (iter < 100)) + { + + // go through feature dims + int curr_dim_index = 0; + for (const auto &feature_dim : curr_dims) + { + + // get tree that has same variables as curr_tree minus j-variable + std::set tree_dims = curr_dims; + tree_dims.erase(tree_dims.find(feature_dim)); + int tree_index = 0; // if tree not exist, set to null tree + if (curr_family.find(tree_dims) != curr_family.end()) + tree_index = std::distance(std::begin(curr_family), curr_family.find(tree_dims)) - 1; + + // update values + if (grids[curr_tree_index].dimensions.size() == 1) + { // one dimensional case + + int sum_ind = 0; + std::vector avg(value_size, 0); + + // get sum of individuals + for (int i = 0; i < individuals[curr_tree_index].n_entries; ++i) + { + std::vector tmp{i}; + sum_ind += individuals[curr_tree_index][tmp]; + } + if (sum_ind == 0) + continue; + + // calc avg + for (int i = 0; i < individuals[curr_tree_index].n_entries; ++i) + { + std::vector tmp{i}; + avg += (individuals[curr_tree_index][tmp] * values[curr_tree_index][tmp]) / sum_ind; + } + + // update values of one dimensional and null tree + for (int i = 0; i < values[curr_tree_index].n_entries; ++i) + { + std::vector tmp{i}; + values[curr_tree_index][tmp] -= avg; + } + std::vector tmp{0}; + values[tree_index][tmp] += avg; + } + else + { // higher dimensional case + + // setup new grid without dimension j + std::vector new_dimensions = grids[curr_tree_index].dimensions; + int j_dim = new_dimensions[curr_dim_index]; + new_dimensions.erase(new_dimensions.begin() + curr_dim_index); + grid::NDGrid grid = grid::NDGrid(new_dimensions); + + // go through values without dimension j + while (!grid.nextPoint()) + { + auto gridPoint = grid.getPoint(); + gridPoint.push_back(0); + + int sum_ind = 0; + std::vector avg(value_size, 0); + + // go through slice to sum up individuals + for (int j = 0; j < j_dim; ++j) + { + gridPoint.back() = j; + + // get sum of individuals + sum_ind += individuals[curr_tree_index][gridPoint]; + } + + // go through slice to calc avg + for (int j = 0; j < j_dim; ++j) + { + gridPoint.back() = j; + + // calc avg + avg += (individuals[curr_tree_index][gridPoint] * values[curr_tree_index][gridPoint]) / sum_ind; + } + + // go through slice to update values + for (int j = 0; j < j_dim; ++j) + { + gridPoint.back() = j; + + // update values of current slice + values[curr_tree_index][gridPoint] -= avg; + } + + // update lower dimensional tree + gridPoint.pop_back(); + values[tree_index][gridPoint] += avg; + } + } + + ++curr_dim_index; + } + + // update tolerance + if (variables[curr_tree_index].size() == 1) + { + tol[curr_tree_index] = 1; // todo + } + else + { + tol[curr_tree_index] = 1; + } + + ++iter; + } + + --curr_tree_index; + } + + // ------------- attach to rpf class ------------- + + // fill with new trees + for (size_t tree_index = 0; tree_index < variables.size(); ++tree_index) + { + LeafGrid curr_gridLeaf; + curr_gridLeaf.grid = grids[tree_index]; + curr_gridLeaf.individuals = individuals[tree_index]; + curr_gridLeaf.lim_list = lim_list; + curr_gridLeaf.values = values[tree_index]; + curr_family[variables[tree_index]]->GridLeaves = curr_gridLeaf; + } + } + + purified = true; +} From cddb3d21b436b5b5beb2235d71e0fbae9e7c337e Mon Sep 17 00:00:00 2001 From: Jinyang Liu Date: Fri, 19 Apr 2024 17:38:17 +0200 Subject: [PATCH 02/16] First sum contributions, then weigh by total sum --- src/lib/rpf_purify.cpp | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/src/lib/rpf_purify.cpp b/src/lib/rpf_purify.cpp index 17df13b..cfa3764 100644 --- a/src/lib/rpf_purify.cpp +++ b/src/lib/rpf_purify.cpp @@ -266,12 +266,12 @@ void RandomPlantedForest::purify_3() double tot_sum = 0; grid = grids[tree_index_u]; - while (!grid.nextPoint()) - { - auto gridPoint = grid.getPoint(); - // Rcout << individuals[tree_index_u][gridPoint] << ", "; - tot_sum += individuals[tree_index_u][gridPoint]; - } + // while (!grid.nextPoint()) + // { + // auto gridPoint = grid.getPoint(); + // // Rcout << individuals[tree_index_u][gridPoint] << ", "; + // tot_sum += individuals[tree_index_u][gridPoint]; + // } // Rcout << "Total sum: " << tot_sum << std::endl; // Rcout << std::endl; @@ -285,7 +285,6 @@ void RandomPlantedForest::purify_3() if (j_dims.size() == 0) { - // grid = grids[tree_index_u]; while (!grid.nextPoint()) { @@ -296,9 +295,11 @@ void RandomPlantedForest::purify_3() double curr_sum = individuals[tree_index_u][gridPoint_i]; // Rcout << ", Current Sum: " << curr_sum << std::endl; // Rcout << std::endl << " " << "i, j: "; - update += (curr_sum / tot_sum) * values_old[tree_index_t][gridPoint_i]; + update += curr_sum * values_old[tree_index_t][gridPoint_i]; + tot_sum += individuals[tree_index_u][gridPoint_i]; // Rcout << std::endl; } + update /= tot_sum; int tree_index_s = variables.size(); for (auto tree_s = variables.rbegin(); tree_s != variables.rend(); ++tree_s) From a6e0e9303315f46e80b911e4d1e69d555d34fc92 Mon Sep 17 00:00:00 2001 From: Jinyang Liu Date: Fri, 19 Apr 2024 18:01:05 +0200 Subject: [PATCH 03/16] Do not extrapolate --- src/lib/rpf_purify.cpp | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/lib/rpf_purify.cpp b/src/lib/rpf_purify.cpp index cfa3764..bff8535 100644 --- a/src/lib/rpf_purify.cpp +++ b/src/lib/rpf_purify.cpp @@ -296,7 +296,7 @@ void RandomPlantedForest::purify_3() // Rcout << ", Current Sum: " << curr_sum << std::endl; // Rcout << std::endl << " " << "i, j: "; update += curr_sum * values_old[tree_index_t][gridPoint_i]; - tot_sum += individuals[tree_index_u][gridPoint_i]; + tot_sum += curr_sum; // Rcout << std::endl; } update /= tot_sum; @@ -421,12 +421,18 @@ void RandomPlantedForest::purify_3() // Rcout << " i_dim=" << *i_dim << ", i_index=" << i_index; gridPoint_ij[i_index] = gridPoint_i[i]; } + + if (individuals[tree_index_t][gridPoint_ij] == 0) { + continue; // Skip non-support areas + } // Rcout << std::endl << " " << "i, j: "; // for(auto p: gridPoint_ij) Rcout << p << ", "; // Rcout << std::endl; - update += (curr_sum / tot_sum) * values_old[tree_index_t][gridPoint_ij]; + update += curr_sum * values_old[tree_index_t][gridPoint_ij]; + tot_sum += curr_sum; // Rcout << std::endl; } + update /= tot_sum; // Rcout << "Hello_2"; // update trees From 752a64ba51ad23b59e8fc2737e5cf5dc5e142f6a Mon Sep 17 00:00:00 2001 From: Jinyang Liu Date: Fri, 19 Apr 2024 19:08:26 +0200 Subject: [PATCH 04/16] Update Makevars --- src/Makevars | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Makevars b/src/Makevars index 71ae982..eaefccc 100644 --- a/src/Makevars +++ b/src/Makevars @@ -1,4 +1,4 @@ -SOURCES=lib/cpf.cpp lib/grid.cpp lib/helper.cpp lib/rpf.cpp lib/trees.cpp randomPlantedForest.cpp RcppExports.cpp +SOURCES=lib/cpf.cpp lib/grid.cpp lib/helper.cpp lib/rpf.cpp lib/trees.cpp lib/rpf_purify.cpp randomPlantedForest.cpp RcppExports.cpp OBJECTS = $(SOURCES:.cpp=.o) From 380ece5fe4043ae49c89d7c916ff3e5477d97a09 Mon Sep 17 00:00:00 2001 From: Jinyang Liu Date: Mon, 22 Apr 2024 12:58:13 +0200 Subject: [PATCH 05/16] Split new and old purification method --- src/Makevars | 2 +- src/include/rpf.hpp | 1 + src/lib/rpf_purify.cpp | 36 +- src/lib/rpf_purify_no_extrapolation.cpp | 565 ++++++++++++++++++++++++ src/randomPlantedForest.cpp | 1 + 5 files changed, 583 insertions(+), 22 deletions(-) create mode 100644 src/lib/rpf_purify_no_extrapolation.cpp diff --git a/src/Makevars b/src/Makevars index eaefccc..ca5e0c2 100644 --- a/src/Makevars +++ b/src/Makevars @@ -1,4 +1,4 @@ -SOURCES=lib/cpf.cpp lib/grid.cpp lib/helper.cpp lib/rpf.cpp lib/trees.cpp lib/rpf_purify.cpp randomPlantedForest.cpp RcppExports.cpp +SOURCES=lib/cpf.cpp lib/grid.cpp lib/helper.cpp lib/rpf.cpp lib/trees.cpp lib/rpf_purify.cpp lib/rpf_purify_no_extrapolation.cpp randomPlantedForest.cpp RcppExports.cpp OBJECTS = $(SOURCES:.cpp=.o) diff --git a/src/include/rpf.hpp b/src/include/rpf.hpp index b87327d..f5ea5f0 100644 --- a/src/include/rpf.hpp +++ b/src/include/rpf.hpp @@ -18,6 +18,7 @@ class RandomPlantedForest void purify_1(); void purify_2(); void purify_3(); + void purify_no_extrapolation(); void print(); void cross_validation(int n_sets = 4, IntegerVector splits = {5, 50}, NumericVector t_tries = {0.2, 0.5, 0.7, 0.9}, IntegerVector split_tries = {1, 2, 5, 10}); double MSE(const NumericMatrix &Y_predicted, const NumericMatrix &Y_true); diff --git a/src/lib/rpf_purify.cpp b/src/lib/rpf_purify.cpp index bff8535..a7165af 100644 --- a/src/lib/rpf_purify.cpp +++ b/src/lib/rpf_purify.cpp @@ -65,7 +65,7 @@ void RandomPlantedForest::purify_3() { double val = feature_vec[dim - 1]; if (!((val >= lim_list[dim - 1][gridPoint[dim_index]]) && (val < lim_list[dim - 1][gridPoint[dim_index] + 1]))) - in_leaf = false; //Break here? + in_leaf = false; ++dim_index; } @@ -84,7 +84,7 @@ void RandomPlantedForest::purify_3() { // consider values only if all in if (!((leaf.intervals[dim - 1].first <= lim_list[dim - 1][gridPoint[dim_index]]) && (leaf.intervals[dim - 1].second >= lim_list[dim - 1][gridPoint[dim_index] + 1]))) - in_leaf = false; // Break here? + in_leaf = false; ++dim_index; } @@ -266,15 +266,16 @@ void RandomPlantedForest::purify_3() double tot_sum = 0; grid = grids[tree_index_u]; - // while (!grid.nextPoint()) - // { - // auto gridPoint = grid.getPoint(); - // // Rcout << individuals[tree_index_u][gridPoint] << ", "; - // tot_sum += individuals[tree_index_u][gridPoint]; - // } + while (!grid.nextPoint()) + { + auto gridPoint = grid.getPoint(); + // Rcout << individuals[tree_index_u][gridPoint] << ", "; + tot_sum += individuals[tree_index_u][gridPoint]; + } // Rcout << "Total sum: " << tot_sum << std::endl; // Rcout << std::endl; + grid = grids[tree_index_u]; // Rcout << " Grid dimensions of U: "; // for(auto dim: grid.dimensions) Rcout << dim << ", "; // Rcout << std::endl; @@ -285,6 +286,7 @@ void RandomPlantedForest::purify_3() if (j_dims.size() == 0) { + // grid = grids[tree_index_u]; while (!grid.nextPoint()) { @@ -295,11 +297,9 @@ void RandomPlantedForest::purify_3() double curr_sum = individuals[tree_index_u][gridPoint_i]; // Rcout << ", Current Sum: " << curr_sum << std::endl; // Rcout << std::endl << " " << "i, j: "; - update += curr_sum * values_old[tree_index_t][gridPoint_i]; - tot_sum += curr_sum; + update += (curr_sum / tot_sum) * values_old[tree_index_t][gridPoint_i]; // Rcout << std::endl; } - update /= tot_sum; int tree_index_s = variables.size(); for (auto tree_s = variables.rbegin(); tree_s != variables.rend(); ++tree_s) @@ -386,7 +386,7 @@ void RandomPlantedForest::purify_3() // Rcout<<"Hello 1"; grid::NDGrid grid_j = grid::NDGrid(j_sizes); - while (!grid_j.nextPoint()) // looping over every x_{i, T\U} essentially + while (!grid_j.nextPoint()) { std::vector update(value_size, 0); @@ -407,7 +407,7 @@ void RandomPlantedForest::purify_3() std::vector gridPoint_ij(tree_t->size(), 0); for (size_t j = 0; j < gridPoint_j.size(); ++j) { - auto j_dim = j_dims.begin(); // to keep: T\U + auto j_dim = j_dims.begin(); std::advance(j_dim, j); int j_index = std::distance(variables[tree_index_t].begin(), variables[tree_index_t].find(*j_dim)); // Rcout << " j_dim=" << *j_dim << ", j_index=" << j_index; @@ -415,24 +415,18 @@ void RandomPlantedForest::purify_3() } for (size_t i = 0; i < gridPoint_i.size(); ++i) { - auto i_dim = tree_u->begin(); // to marginalize: U + auto i_dim = tree_u->begin(); std::advance(i_dim, i); int i_index = std::distance(variables[tree_index_t].begin(), variables[tree_index_t].find(*i_dim)); // Rcout << " i_dim=" << *i_dim << ", i_index=" << i_index; gridPoint_ij[i_index] = gridPoint_i[i]; } - - if (individuals[tree_index_t][gridPoint_ij] == 0) { - continue; // Skip non-support areas - } // Rcout << std::endl << " " << "i, j: "; // for(auto p: gridPoint_ij) Rcout << p << ", "; // Rcout << std::endl; - update += curr_sum * values_old[tree_index_t][gridPoint_ij]; - tot_sum += curr_sum; + update += (curr_sum / tot_sum) * values_old[tree_index_t][gridPoint_ij]; // Rcout << std::endl; } - update /= tot_sum; // Rcout << "Hello_2"; // update trees diff --git a/src/lib/rpf_purify_no_extrapolation.cpp b/src/lib/rpf_purify_no_extrapolation.cpp new file mode 100644 index 0000000..f107d9c --- /dev/null +++ b/src/lib/rpf_purify_no_extrapolation.cpp @@ -0,0 +1,565 @@ +#include "rpf.hpp" + +void RandomPlantedForest::purify_no_extrapolation() +{ + + // go through all n_trees families + for (auto &curr_family : this->tree_families) + { + + // lim_list is a list giving for each variable all interval end-points + std::vector> lim_list = get_lim_list(curr_family); + + // initialize values and individuals for each tree in family + std::vector grids(curr_family.size() - 1); + std::vector> individuals(curr_family.size() - 1); + std::vector>> values(curr_family.size() - 1); + std::vector>> values_old(curr_family.size() - 1); + std::vector> variables(curr_family.size() - 1); + + // ------------- setup finer grid ------------- + + int tree_index = 0; + for (const auto &curr_tree : curr_family) + { + + if (curr_tree.first == std::set{0}) + { + + // values[tree_index] = rpf::Matrix>(dimensions, std::vector(value_size, 0)); // changed + continue; // ignore null tree + } + + // fill space with dimensions + std::vector dimensions; + for (const auto &dim : curr_tree.first) + { + dimensions.push_back(lim_list[dim - 1].size()); // size - 1 ? WICHTIG + } + + // setup grid for leaf indices + auto grid = grid::NDGrid(dimensions); + + // initialize data for current tree + grids[tree_index] = grid; + individuals[tree_index] = utils::Matrix(dimensions, 0); + values[tree_index] = utils::Matrix>(dimensions, std::vector(value_size, 0)); // changed + values_old[tree_index] = utils::Matrix>(dimensions, std::vector(value_size, 0)); // changed + variables[tree_index] = curr_tree.first; + + // fill grid points with individuals and values + while (!grid.nextPoint()) + { + + std::vector gridPoint = grid.getPoint(); + + bool in_leaf = true; + + // go through sample points to sum up individuals + for (const auto &feature_vec : X) + { + int dim_index = 0; + in_leaf = true; + for (const auto &dim : curr_tree.first) + { + double val = feature_vec[dim - 1]; + if (!((val >= lim_list[dim - 1][gridPoint[dim_index]]) && (val < lim_list[dim - 1][gridPoint[dim_index] + 1]))) + in_leaf = false; //Break here? + ++dim_index; + } + + // consider individuals only if all in + if (in_leaf) + individuals[tree_index][gridPoint] += 1; + } + + // go through leaves of tree to sum up values + for (const auto &leaf : curr_tree.second->get_leaves()) + { + + in_leaf = true; + int dim_index = 0; + for (const auto &dim : curr_tree.first) + { + // consider values only if all in + if (!((leaf.intervals[dim - 1].first <= lim_list[dim - 1][gridPoint[dim_index]]) && (leaf.intervals[dim - 1].second >= lim_list[dim - 1][gridPoint[dim_index] + 1]))) + in_leaf = false; // Break here? + ++dim_index; + } + + // sum up values + if (in_leaf) + { + + values[tree_index][gridPoint] += leaf.value; // todo: multiclass + values_old[tree_index][gridPoint] += leaf.value; // todo: multiclass + } + } + } + + ++tree_index; + } + + // Rcout << variables.size(); + // for(int i = 0; i>(std::vector{1}, std::vector(value_size, 0))); + values_old.insert(values_old.begin(), utils::Matrix>(std::vector{1}, std::vector(value_size, 0))); + individuals.insert(individuals.begin(), utils::Matrix(std::vector{1})); + variables.insert(variables.begin(), std::set{0}); + + // recap maximum number of dimensions of current family + unsigned int curr_max = curr_family.rbegin()->first.size(); + + while (curr_max > 1) + { + + auto keys = getKeys(curr_family); + // go through split dimensions of all trees + for (std::vector>::reverse_iterator key = keys.rbegin(); key != keys.rend(); ++key) + { + auto &curr_tree = curr_family[(*key)]; + std::set curr_dims = curr_tree->split_dims; + // check if number of dims same as current max_interaction + if (curr_dims.size() == curr_max) + { + // go through feature dims + int dim_index = 0; + for (int feature_dim = 1; feature_dim <= feature_size; ++feature_dim) + { + // continue only if dim in current tree + if (curr_tree->split_dims.count(feature_dim) != 0) + { + std::set tree_dims = curr_tree->split_dims; + tree_dims.erase(tree_dims.find(feature_dim)); // remove current feature dim from current tree + // check if tree with dimensions exists, if not create + std::shared_ptr tree = treeExists(tree_dims, curr_family); + if (!tree) + { + // get index of old and new tree + auto old_tree_index = std::distance(std::begin(curr_family), curr_family.find(curr_tree->get_split_dims())); + curr_family.insert(std::make_pair(tree_dims, std::make_shared(DecisionTree(tree_dims)))); + auto tree_index = std::distance(std::begin(curr_family), curr_family.find(tree_dims)); + // remove matrix dimension of respective variable + std::vector matrix_dimensions = values[old_tree_index].dims; + // std::vector matrix_dimensions = values_old[old_tree_index].dims; + + // Rcout << typeof(matrix_dimensions.begin()) << std::endl; + + matrix_dimensions.erase(matrix_dimensions.begin() + dim_index); + // initialize data for new tree + auto grid = grid::NDGrid(matrix_dimensions); + grids.insert(grids.begin() + tree_index, grid); + values.insert(values.begin() + tree_index, utils::Matrix>(matrix_dimensions, std::vector(value_size, 0))); + values_old.insert(values_old.begin() + tree_index, utils::Matrix>(matrix_dimensions, std::vector(value_size, 0))); + individuals.insert(individuals.begin() + tree_index, utils::Matrix(matrix_dimensions)); + variables.insert(variables.begin() + tree_index, tree_dims); + // fill individuals of new trees + while (!grid.nextPoint()) + { + std::vector gridPoint = grid.getPoint(); + bool in_leaf = true; + // go through sample points to sum up individuals + for (const auto &feature_vec : X) + { + int dim_index2 = 0; + in_leaf = true; + for (const auto &dim : tree_dims) + { + double val = feature_vec[dim - 1]; + if (!((val >= lim_list[dim - 1][gridPoint[dim_index2]]) && (val < lim_list[dim - 1][gridPoint[dim_index2] + 1]))) + in_leaf = false; + ++dim_index2; + } + // consider individuals only if all in + if (in_leaf) + individuals[tree_index][gridPoint] += 1; + } + } + } + dim_index++; + } + } + } + } + // update currently considered dimension size + --curr_max; + } + + // Rcout << std::endl; + // Rcout << std::endl; + // Rcout << std::endl; + // + // for(int i = 0; i curr_dims = *tree_t; + // do not purify null + if (curr_dims == std::set{0}) + continue; + // Rcout << std::endl << tree_index_t << " - T: "; + // Rcout << "tree_t:"; + // for(auto dim: curr_dims) Rcout << dim << ", "; + // Rcout << std::endl; + + auto grid = grids[tree_index_t]; + // Rcout << "Grid dimensions of T: "; + // for(auto dim: grid.dimensions) Rcout << dim << ", "; + // Rcout << std::endl; + // go through subtrees of t + int tree_index_u = variables.size(); + for (auto tree_u = variables.rbegin(); tree_u != variables.rend(); ++tree_u) + { + --tree_index_u; + // j_dims = dims of t without u + std::set j_dims = curr_dims; + if (tree_u->size() > curr_dims.size()) + continue; + // check if subset + bool subset = true; + for (const auto dim : *tree_u) + { + if (tree_t->count(dim) == 0) + { + subset = false; + break; + } + j_dims.erase(dim); + } + if (!subset) + continue; + + // Rcout << "Hello"; + // Rcout << " " << tree_index_u << " - U: "; + // for(auto dim: *tree_u) Rcout << dim << ", "; + // Rcout << std::endl; + // Rcout << " Individuals: "; + + double tot_sum = 0; + grid = grids[tree_index_u]; + // while (!grid.nextPoint()) + // { + // auto gridPoint = grid.getPoint(); + // // Rcout << individuals[tree_index_u][gridPoint] << ", "; + // tot_sum += individuals[tree_index_u][gridPoint]; + // } + // Rcout << "Total sum: " << tot_sum << std::endl; + // Rcout << std::endl; + + // Rcout << " Grid dimensions of U: "; + // for(auto dim: grid.dimensions) Rcout << dim << ", "; + // Rcout << std::endl; + + // Rcout<< "j_dims: "< update(value_size, 0); + + if (j_dims.size() == 0) + { + // grid = grids[tree_index_u]; + while (!grid.nextPoint()) + { + auto gridPoint_i = grid.getPoint(); + // Rcout << " " << "i: "; + // for(auto p: gridPoint_i) Rcout << p << ", "; + // Rcout << std::endl << " "; + double curr_sum = individuals[tree_index_u][gridPoint_i]; + // Rcout << ", Current Sum: " << curr_sum << std::endl; + // Rcout << std::endl << " " << "i, j: "; + update += curr_sum * values_old[tree_index_t][gridPoint_i]; + tot_sum += curr_sum; + // Rcout << std::endl; + } + update /= tot_sum; + + int tree_index_s = variables.size(); + for (auto tree_s = variables.rbegin(); tree_s != variables.rend(); ++tree_s) + { + + // Rcout << "tree_s:"; + // for(auto dim: *tree_s) Rcout << dim << ", "; + // Rcout << std::endl; + + --tree_index_s; + if (*tree_s == std::set{0}) + { + + auto gridPoint_0 = std::vector{0}; + values[tree_index_s][gridPoint_0] += update; + // Rcout << std::endl; + //} + + /* + for(auto tree_0: curr_family){ + + if(tree_0.first == std::set{0}){ + + Rcout << tree_0.first.size(); + std::vector leaf_index(tree_0.first.size(), 0); + std::vector leaf_index(tree_0.second->GridLeaves.values.size(), 0); + + int Test = tree_0.second->GridLeaves.values.size(); + Rcout << Test; + tree_0.second->GridLeaves.values[leaf_index] += update; + } + } + */ + } + else + { + + // check if S subset of T + + bool subset = true; + for (const auto dim : *tree_s) + { + if (tree_t->count(dim) == 0) + { + subset = false; + break; + } + } + if (!subset) + continue; + + // Rcout << pow(-1, (*tree_s).size()) << std::endl; + + auto grid_k = grids[tree_index_s]; + while (!grid_k.nextPoint()) + { + auto gridPoint_k = grid_k.getPoint(); + // + // if((*tree_s).size()>2){ + // Rcout << std::endl << " " << "j, k: "; + // for(auto p: gridPoint_k) Rcout << p << ", "; + // Rcout << std::endl; + // } + // + // Rcout << pow(-1, (*tree_s).size()) * update << std::endl; + values[tree_index_s][gridPoint_k] += pow(-1, (*tree_s).size()) * update; + } + } + } + // Rcout << std::endl; + } + else + { + + std::vector j_sizes(j_dims.size(), 0); + for (size_t j = 0; j < j_dims.size(); ++j) + { + auto tmp = j_dims.begin(); + std::advance(tmp, j); + int j_index = std::distance(variables[tree_index_t].begin(), variables[tree_index_t].find(*tmp)); + j_sizes[j] = grids[tree_index_t].dimensions[j_index]; + } + + // Rcout<<"Hello 1"; + + grid::NDGrid grid_j = grid::NDGrid(j_sizes); + while (!grid_j.nextPoint()) // looping over every x_{i, T\U} essentially + { + + std::vector update(value_size, 0); + auto gridPoint_j = grid_j.getPoint(); + // Rcout << " " << "j: "; + // for(auto p: gridPoint_j) Rcout << p << ", "; + // Rcout << std::endl; + // calc update + grid = grids[tree_index_u]; + while (!grid.nextPoint()) + { + auto gridPoint_i = grid.getPoint(); + // Rcout << " " << "i: "; + // for(auto p: gridPoint_i) Rcout << p << ", "; + // Rcout << std::endl << " "; + double curr_sum = individuals[tree_index_u][gridPoint_i]; + // Rcout << ", Current Sum: " << curr_sum << std::endl; + std::vector gridPoint_ij(tree_t->size(), 0); + for (size_t j = 0; j < gridPoint_j.size(); ++j) + { + auto j_dim = j_dims.begin(); // to keep: T\U + std::advance(j_dim, j); + int j_index = std::distance(variables[tree_index_t].begin(), variables[tree_index_t].find(*j_dim)); + // Rcout << " j_dim=" << *j_dim << ", j_index=" << j_index; + gridPoint_ij[j_index] = gridPoint_j[j]; + } + for (size_t i = 0; i < gridPoint_i.size(); ++i) + { + auto i_dim = tree_u->begin(); // to marginalize: U + std::advance(i_dim, i); + int i_index = std::distance(variables[tree_index_t].begin(), variables[tree_index_t].find(*i_dim)); + // Rcout << " i_dim=" << *i_dim << ", i_index=" << i_index; + gridPoint_ij[i_index] = gridPoint_i[i]; + } + + if (individuals[tree_index_t][gridPoint_ij] == 0) { + continue; // Skip non-support areas + } + // Rcout << std::endl << " " << "i, j: "; + // for(auto p: gridPoint_ij) Rcout << p << ", "; + // Rcout << std::endl; + update += curr_sum * values_old[tree_index_t][gridPoint_ij]; + tot_sum += curr_sum; + // Rcout << std::endl; + } + update /= tot_sum; + + // Rcout << "Hello_2"; + // update trees + int tree_index_s = variables.size(); + for (auto tree_s = variables.rbegin(); tree_s != variables.rend(); ++tree_s) + { + --tree_index_s; + // check if T\U=j_dims subset of S and S subset of T + bool subset = true; + for (const auto dim : j_dims) + { + if (tree_s->count(dim) == 0) + { + subset = false; + break; + } + } + for (const auto dim : *tree_s) + { + if (tree_t->count(dim) == 0) + { + subset = false; + break; + } + } + if (!subset) + continue; + // Rcout << " " << "S: "; + // for(auto dim: *tree_s) Rcout << dim << ", "; + // Rcout << std::endl; + // S cap U + std::set k_dims = *tree_s; + std::set k_dims_h1 = *tree_s; + std::set k_dims_h2 = *tree_u; + for (const auto dim : *tree_u) + k_dims.insert(dim); + for (const auto dim : *tree_s) + k_dims_h2.erase(dim); + for (const auto dim : *tree_u) + k_dims_h1.erase(dim); + for (const auto dim : k_dims_h1) + k_dims.erase(dim); + for (const auto dim : k_dims_h2) + k_dims.erase(dim); + + // std::set k_dims = *tree_s; + // for(const auto dim: *tree_t) k_dims.erase(dim); + // for(const auto dim: *tree_u) k_dims.insert(dim); + + // Rcout << " " << "k_dims: "; + // for(auto dim: k_dims) Rcout << dim << ", "; + // Rcout << std::endl; + + if (k_dims.size() == 0) + { + + values[tree_index_s][gridPoint_j] += pow(-1, (*tree_s).size() - j_dims.size()) * update; + } + else + { + + // Rcout <<"k_dims :"; + // for(auto dim: k_dims) Rcout << dim << ", "; + // Rcout << std::endl; + + std::vector k_sizes(k_dims.size(), 0); + for (size_t k = 0; k < k_dims.size(); ++k) + { + auto tmp = k_dims.begin(); + std::advance(tmp, k); + int k_index = std::distance(variables[tree_index_t].begin(), variables[tree_index_t].find(*tmp)); + k_sizes[k] = grids[tree_index_t].dimensions[k_index]; + } + // Rcout << " " << "k_sizes: "; + // for(auto dim: k_sizes) Rcout << dim << ", "; + // Rcout << std::endl; + grid::NDGrid grid_k = grid::NDGrid(k_sizes); + while (!grid_k.nextPoint()) + { + auto gridPoint_k = grid_k.getPoint(); + // Rcout << " " << "k: "; + // for(auto p: gridPoint_k) Rcout << p << ", "; + // Rcout << std::endl << " "; + std::vector gridPoint_jk(tree_s->size(), 0); + for (size_t j = 0; j < gridPoint_j.size(); ++j) + { + auto j_dim = j_dims.begin(); + std::advance(j_dim, j); + int j_index = std::distance(variables[tree_index_s].begin(), variables[tree_index_s].find(*j_dim)); + // Rcout << " j_dim=" << *j_dim << ", j_index=" << j_index; + gridPoint_jk[j_index] = gridPoint_j[j]; + } + for (size_t k = 0; k < gridPoint_k.size(); ++k) + { + auto k_dim = k_dims.begin(); + std::advance(k_dim, k); + int k_index = std::distance(variables[tree_index_s].begin(), variables[tree_index_s].find(*k_dim)); + // Rcout << " k_dim=" << *k_dim << ", k_index=" << k_index; + gridPoint_jk[k_index] = gridPoint_k[k]; + } + // Rcout << std::endl << " " << "j, k: "; + // for(auto p: gridPoint_jk) Rcout << p << ", "; + // Rcout << std::endl; + + // Rcout << pow(-1, (*tree_s).size() - j_dims.size()) * update[0]; + values[tree_index_s][gridPoint_jk] += pow(-1, (*tree_s).size() - j_dims.size()) * update; + } + } + } + } + } + } + --tree_index_t; + } + + // ------------- attach to rpf class ------------- + + // fill with new trees + for (size_t tree_index = 0; tree_index < variables.size(); ++tree_index) + { + LeafGrid curr_gridLeaf; + curr_gridLeaf.grid = grids[tree_index]; + curr_gridLeaf.individuals = individuals[tree_index]; + curr_gridLeaf.lim_list = lim_list; + curr_gridLeaf.values = values[tree_index]; + curr_family[variables[tree_index]]->GridLeaves = curr_gridLeaf; + } + } + + purified = true; +} diff --git a/src/randomPlantedForest.cpp b/src/randomPlantedForest.cpp index 9fd3630..23ac219 100644 --- a/src/randomPlantedForest.cpp +++ b/src/randomPlantedForest.cpp @@ -15,6 +15,7 @@ RCPP_MODULE(mod_rpf) .method("predict_vector", &RandomPlantedForest::predict_vector) .method("MSE", &RandomPlantedForest::MSE) .method("purify", &RandomPlantedForest::purify_3) + .method("purify_new", &RandomPlantedForest::purify_no_extrapolation) .method("print", &RandomPlantedForest::print) .method("get_parameters", &RandomPlantedForest::get_parameters) .method("set_parameters", &RandomPlantedForest::set_parameters) From faa95895531297a037adb004414044c4f52a9d2e Mon Sep 17 00:00:00 2001 From: MHiabu <46436255+MHiabu@users.noreply.github.com> Date: Mon, 6 May 2024 11:15:12 +0200 Subject: [PATCH 06/16] adding purify1 and purify2 as additional arguments --- src/randomPlantedForest.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/randomPlantedForest.cpp b/src/randomPlantedForest.cpp index 23ac219..e57d727 100644 --- a/src/randomPlantedForest.cpp +++ b/src/randomPlantedForest.cpp @@ -15,6 +15,8 @@ RCPP_MODULE(mod_rpf) .method("predict_vector", &RandomPlantedForest::predict_vector) .method("MSE", &RandomPlantedForest::MSE) .method("purify", &RandomPlantedForest::purify_3) + .method("purify1", &RandomPlantedForest::purify_1) + .method("purify2", &RandomPlantedForest::purify_2) .method("purify_new", &RandomPlantedForest::purify_no_extrapolation) .method("print", &RandomPlantedForest::print) .method("get_parameters", &RandomPlantedForest::get_parameters) From 7ea401c422184d113bf1ca1b278cbfca75b435a3 Mon Sep 17 00:00:00 2001 From: Munir Hiabu Date: Thu, 16 May 2024 16:28:17 +0200 Subject: [PATCH 07/16] added ANOVA purification (purify_2) --- src/include/rpf.hpp | 2 + src/lib/rpf_purify.cpp | 869 ++++++++++++++++++++++++++++++------ src/randomPlantedForest.cpp | 1 + 3 files changed, 739 insertions(+), 133 deletions(-) diff --git a/src/include/rpf.hpp b/src/include/rpf.hpp index f5ea5f0..cc59ca4 100644 --- a/src/include/rpf.hpp +++ b/src/include/rpf.hpp @@ -17,7 +17,9 @@ class RandomPlantedForest NumericMatrix predict_vector(const NumericVector &X, const NumericVector components = {0}); void purify_1(); void purify_2(); + void purify_22(); void purify_3(); + void purify_3_new(); void purify_no_extrapolation(); void print(); void cross_validation(int n_sets = 4, IntegerVector splits = {5, 50}, NumericVector t_tries = {0.2, 0.5, 0.7, 0.9}, IntegerVector split_tries = {1, 2, 5, 10}); diff --git a/src/lib/rpf_purify.cpp b/src/lib/rpf_purify.cpp index a7165af..726e8f0 100644 --- a/src/lib/rpf_purify.cpp +++ b/src/lib/rpf_purify.cpp @@ -1,6 +1,8 @@ #include "rpf.hpp" + + void RandomPlantedForest::purify_3() { @@ -275,7 +277,567 @@ void RandomPlantedForest::purify_3() // Rcout << "Total sum: " << tot_sum << std::endl; // Rcout << std::endl; - grid = grids[tree_index_u]; + grid = grids[tree_index_u]; + // Rcout << " Grid dimensions of U: "; + // for(auto dim: grid.dimensions) Rcout << dim << ", "; + // Rcout << std::endl; + + // Rcout<< "j_dims: "< update(value_size, 0); + + if (j_dims.size() == 0) + { + + // grid = grids[tree_index_u]; + while (!grid.nextPoint()) + { + auto gridPoint_i = grid.getPoint(); + // Rcout << " " << "i: "; + // for(auto p: gridPoint_i) Rcout << p << ", "; + // Rcout << std::endl << " "; + double curr_sum = individuals[tree_index_u][gridPoint_i]; + // Rcout << ", Current Sum: " << curr_sum << std::endl; + // Rcout << std::endl << " " << "i, j: "; + update += (curr_sum / tot_sum) * values_old[tree_index_t][gridPoint_i]; + // Rcout << std::endl; + } + + int tree_index_s = variables.size(); + for (auto tree_s = variables.rbegin(); tree_s != variables.rend(); ++tree_s) + { + + // Rcout << "tree_s:"; + // for(auto dim: *tree_s) Rcout << dim << ", "; + // Rcout << std::endl; + + --tree_index_s; + if (*tree_s == std::set{0}) + { + + auto gridPoint_0 = std::vector{0}; + values[tree_index_s][gridPoint_0] += update; + // Rcout << std::endl; + //} + + /* + for(auto tree_0: curr_family){ + + if(tree_0.first == std::set{0}){ + + Rcout << tree_0.first.size(); + std::vector leaf_index(tree_0.first.size(), 0); + std::vector leaf_index(tree_0.second->GridLeaves.values.size(), 0); + + int Test = tree_0.second->GridLeaves.values.size(); + Rcout << Test; + tree_0.second->GridLeaves.values[leaf_index] += update; + } + } + */ + } + else + { + + // check if S subset of T + + bool subset = true; + for (const auto dim : *tree_s) + { + if (tree_t->count(dim) == 0) + { + subset = false; + break; + } + } + if (!subset) + continue; + + // Rcout << pow(-1, (*tree_s).size()) << std::endl; + + auto grid_k = grids[tree_index_s]; + while (!grid_k.nextPoint()) + { + auto gridPoint_k = grid_k.getPoint(); + // + // if((*tree_s).size()>2){ + // Rcout << std::endl << " " << "j, k: "; + // for(auto p: gridPoint_k) Rcout << p << ", "; + // Rcout << std::endl; + // } + // + // Rcout << pow(-1, (*tree_s).size()) * update << std::endl; + values[tree_index_s][gridPoint_k] += pow(-1, (*tree_s).size()) * update; + } + } + } + // Rcout << std::endl; + } + else + { + + std::vector j_sizes(j_dims.size(), 0); + for (size_t j = 0; j < j_dims.size(); ++j) + { + auto tmp = j_dims.begin(); + std::advance(tmp, j); + int j_index = std::distance(variables[tree_index_t].begin(), variables[tree_index_t].find(*tmp)); + j_sizes[j] = grids[tree_index_t].dimensions[j_index]; + } + + // Rcout<<"Hello 1"; + + grid::NDGrid grid_j = grid::NDGrid(j_sizes); + while (!grid_j.nextPoint()) + { + + std::vector update(value_size, 0); + auto gridPoint_j = grid_j.getPoint(); + // Rcout << " " << "j: "; + // for(auto p: gridPoint_j) Rcout << p << ", "; + // Rcout << std::endl; + // calc update + grid = grids[tree_index_u]; + while (!grid.nextPoint()) + { + auto gridPoint_i = grid.getPoint(); + // Rcout << " " << "i: "; + // for(auto p: gridPoint_i) Rcout << p << ", "; + // Rcout << std::endl << " "; + double curr_sum = individuals[tree_index_u][gridPoint_i]; + // Rcout << ", Current Sum: " << curr_sum << std::endl; + std::vector gridPoint_ij(tree_t->size(), 0); + for (size_t j = 0; j < gridPoint_j.size(); ++j) + { + auto j_dim = j_dims.begin(); + std::advance(j_dim, j); + int j_index = std::distance(variables[tree_index_t].begin(), variables[tree_index_t].find(*j_dim)); + // Rcout << " j_dim=" << *j_dim << ", j_index=" << j_index; + gridPoint_ij[j_index] = gridPoint_j[j]; + } + for (size_t i = 0; i < gridPoint_i.size(); ++i) + { + auto i_dim = tree_u->begin(); + std::advance(i_dim, i); + int i_index = std::distance(variables[tree_index_t].begin(), variables[tree_index_t].find(*i_dim)); + // Rcout << " i_dim=" << *i_dim << ", i_index=" << i_index; + gridPoint_ij[i_index] = gridPoint_i[i]; + } + // Rcout << std::endl << " " << "i, j: "; + // for(auto p: gridPoint_ij) Rcout << p << ", "; + // Rcout << std::endl; + update += (curr_sum / tot_sum) * values_old[tree_index_t][gridPoint_ij]; + // Rcout << std::endl; + } + + // Rcout << "Hello_2"; + // update trees + int tree_index_s = variables.size(); + for (auto tree_s = variables.rbegin(); tree_s != variables.rend(); ++tree_s) + { + --tree_index_s; + // check if T\U=j_dims subset of S and S subset of T + bool subset = true; + for (const auto dim : j_dims) + { + if (tree_s->count(dim) == 0) + { + subset = false; + break; + } + } + for (const auto dim : *tree_s) + { + if (tree_t->count(dim) == 0) + { + subset = false; + break; + } + } + if (!subset) + continue; + // Rcout << " " << "S: "; + // for(auto dim: *tree_s) Rcout << dim << ", "; + // Rcout << std::endl; + // S cap U + std::set k_dims = *tree_s; + std::set k_dims_h1 = *tree_s; + std::set k_dims_h2 = *tree_u; + for (const auto dim : *tree_u) + k_dims.insert(dim); + for (const auto dim : *tree_s) + k_dims_h2.erase(dim); + for (const auto dim : *tree_u) + k_dims_h1.erase(dim); + for (const auto dim : k_dims_h1) + k_dims.erase(dim); + for (const auto dim : k_dims_h2) + k_dims.erase(dim); + + // std::set k_dims = *tree_s; + // for(const auto dim: *tree_t) k_dims.erase(dim); + // for(const auto dim: *tree_u) k_dims.insert(dim); + + // Rcout << " " << "k_dims: "; + // for(auto dim: k_dims) Rcout << dim << ", "; + // Rcout << std::endl; + + if (k_dims.size() == 0) + { + + values[tree_index_s][gridPoint_j] += pow(-1, (*tree_s).size() - j_dims.size()) * update; + } + else + { + + // Rcout <<"k_dims :"; + // for(auto dim: k_dims) Rcout << dim << ", "; + // Rcout << std::endl; + + std::vector k_sizes(k_dims.size(), 0); + for (size_t k = 0; k < k_dims.size(); ++k) + { + auto tmp = k_dims.begin(); + std::advance(tmp, k); + int k_index = std::distance(variables[tree_index_t].begin(), variables[tree_index_t].find(*tmp)); + k_sizes[k] = grids[tree_index_t].dimensions[k_index]; + } + // Rcout << " " << "k_sizes: "; + // for(auto dim: k_sizes) Rcout << dim << ", "; + // Rcout << std::endl; + grid::NDGrid grid_k = grid::NDGrid(k_sizes); + while (!grid_k.nextPoint()) + { + auto gridPoint_k = grid_k.getPoint(); + // Rcout << " " << "k: "; + // for(auto p: gridPoint_k) Rcout << p << ", "; + // Rcout << std::endl << " "; + std::vector gridPoint_jk(tree_s->size(), 0); + for (size_t j = 0; j < gridPoint_j.size(); ++j) + { + auto j_dim = j_dims.begin(); + std::advance(j_dim, j); + int j_index = std::distance(variables[tree_index_s].begin(), variables[tree_index_s].find(*j_dim)); + // Rcout << " j_dim=" << *j_dim << ", j_index=" << j_index; + gridPoint_jk[j_index] = gridPoint_j[j]; + } + for (size_t k = 0; k < gridPoint_k.size(); ++k) + { + auto k_dim = k_dims.begin(); + std::advance(k_dim, k); + int k_index = std::distance(variables[tree_index_s].begin(), variables[tree_index_s].find(*k_dim)); + // Rcout << " k_dim=" << *k_dim << ", k_index=" << k_index; + gridPoint_jk[k_index] = gridPoint_k[k]; + } + // Rcout << std::endl << " " << "j, k: "; + // for(auto p: gridPoint_jk) Rcout << p << ", "; + // Rcout << std::endl; + + // Rcout << pow(-1, (*tree_s).size() - j_dims.size()) * update[0]; + values[tree_index_s][gridPoint_jk] += pow(-1, (*tree_s).size() - j_dims.size()) * update; + } + } + } + } + } + } + --tree_index_t; + } + + // ------------- attach to rpf class ------------- + + // fill with new trees + for (size_t tree_index = 0; tree_index < variables.size(); ++tree_index) + { + LeafGrid curr_gridLeaf; + curr_gridLeaf.grid = grids[tree_index]; + curr_gridLeaf.individuals = individuals[tree_index]; + curr_gridLeaf.lim_list = lim_list; + curr_gridLeaf.values = values[tree_index]; + curr_family[variables[tree_index]]->GridLeaves = curr_gridLeaf; + } + } + + purified = true; +} + + + + +void RandomPlantedForest::purify_3_new() +{ + + // go through all n_trees families + for (auto &curr_family : this->tree_families) + { + + // lim_list is a list giving for each variable all interval end-points + std::vector> lim_list = get_lim_list(curr_family); + + // initialize values and individuals for each tree in family + std::vector grids(curr_family.size() - 1); + std::vector> individuals(curr_family.size() - 1); + std::vector>> values(curr_family.size() - 1); + std::vector>> values_old(curr_family.size() - 1); + std::vector> variables(curr_family.size() - 1); + + // ------------- setup finer grid ------------- + + int tree_index = 0; + for (const auto &curr_tree : curr_family) + { + + if (curr_tree.first == std::set{0}) + { + + // values[tree_index] = rpf::Matrix>(dimensions, std::vector(value_size, 0)); // changed + continue; // ignore null tree + } + + // fill space with dimensions + std::vector dimensions; + for (const auto &dim : curr_tree.first) + { + dimensions.push_back(lim_list[dim - 1].size()); // size - 1 ? WICHTIG + } + + // setup grid for leaf indices + auto grid = grid::NDGrid(dimensions); + + // initialize data for current tree + grids[tree_index] = grid; + individuals[tree_index] = utils::Matrix(dimensions, 0); + values[tree_index] = utils::Matrix>(dimensions, std::vector(value_size, 0)); // changed + values_old[tree_index] = utils::Matrix>(dimensions, std::vector(value_size, 0)); // changed + variables[tree_index] = curr_tree.first; + + // fill grid points with individuals and values + while (!grid.nextPoint()) + { + + std::vector gridPoint = grid.getPoint(); + + bool in_leaf = true; + + // go through sample points to sum up individuals + for (const auto &feature_vec : X) + { + int dim_index = 0; + in_leaf = true; + for (const auto &dim : curr_tree.first) + { + double val = feature_vec[dim - 1]; + if (!((val >= lim_list[dim - 1][gridPoint[dim_index]]) && (val < lim_list[dim - 1][gridPoint[dim_index] + 1]))) + in_leaf = false; + ++dim_index; + } + + // consider individuals only if all in + if (in_leaf) + individuals[tree_index][gridPoint] += 1; + } + + // go through leaves of tree to sum up values + for (const auto &leaf : curr_tree.second->get_leaves()) + { + + in_leaf = true; + int dim_index = 0; + for (const auto &dim : curr_tree.first) + { + // consider values only if all in + if (!((leaf.intervals[dim - 1].first <= lim_list[dim - 1][gridPoint[dim_index]]) && (leaf.intervals[dim - 1].second >= lim_list[dim - 1][gridPoint[dim_index] + 1]))) + in_leaf = false; + ++dim_index; + } + + // sum up values + if (in_leaf) + { + + values[tree_index][gridPoint] += leaf.value; // todo: multiclass + values_old[tree_index][gridPoint] += leaf.value; // todo: multiclass + } + } + } + + ++tree_index; + } + + // Rcout << variables.size(); + // for(int i = 0; i>(std::vector{1}, std::vector(value_size, 0))); + values_old.insert(values_old.begin(), utils::Matrix>(std::vector{1}, std::vector(value_size, 0))); + individuals.insert(individuals.begin(), utils::Matrix(std::vector{1})); + variables.insert(variables.begin(), std::set{0}); + + // recap maximum number of dimensions of current family + unsigned int curr_max = curr_family.rbegin()->first.size(); + + while (curr_max > 1) + { + + auto keys = getKeys(curr_family); + // go through split dimensions of all trees + for (std::vector>::reverse_iterator key = keys.rbegin(); key != keys.rend(); ++key) + { + auto &curr_tree = curr_family[(*key)]; + std::set curr_dims = curr_tree->split_dims; + // check if number of dims same as current max_interaction + if (curr_dims.size() == curr_max) + { + // go through feature dims + int dim_index = 0; + for (int feature_dim = 1; feature_dim <= feature_size; ++feature_dim) + { + // continue only if dim in current tree + if (curr_tree->split_dims.count(feature_dim) != 0) + { + std::set tree_dims = curr_tree->split_dims; + tree_dims.erase(tree_dims.find(feature_dim)); // remove current feature dim from current tree + // check if tree with dimensions exists, if not create + std::shared_ptr tree = treeExists(tree_dims, curr_family); + if (!tree) + { + // get index of old and new tree + auto old_tree_index = std::distance(std::begin(curr_family), curr_family.find(curr_tree->get_split_dims())); + curr_family.insert(std::make_pair(tree_dims, std::make_shared(DecisionTree(tree_dims)))); + auto tree_index = std::distance(std::begin(curr_family), curr_family.find(tree_dims)); + // remove matrix dimension of respective variable + std::vector matrix_dimensions = values[old_tree_index].dims; + // std::vector matrix_dimensions = values_old[old_tree_index].dims; + + // Rcout << typeof(matrix_dimensions.begin()) << std::endl; + + matrix_dimensions.erase(matrix_dimensions.begin() + dim_index); + // initialize data for new tree + auto grid = grid::NDGrid(matrix_dimensions); + grids.insert(grids.begin() + tree_index, grid); + values.insert(values.begin() + tree_index, utils::Matrix>(matrix_dimensions, std::vector(value_size, 0))); + values_old.insert(values_old.begin() + tree_index, utils::Matrix>(matrix_dimensions, std::vector(value_size, 0))); + individuals.insert(individuals.begin() + tree_index, utils::Matrix(matrix_dimensions)); + variables.insert(variables.begin() + tree_index, tree_dims); + // fill individuals of new trees + while (!grid.nextPoint()) + { + std::vector gridPoint = grid.getPoint(); + bool in_leaf = true; + // go through sample points to sum up individuals + for (const auto &feature_vec : X) + { + int dim_index2 = 0; + in_leaf = true; + for (const auto &dim : tree_dims) + { + double val = feature_vec[dim - 1]; + if (!((val >= lim_list[dim - 1][gridPoint[dim_index2]]) && (val < lim_list[dim - 1][gridPoint[dim_index2] + 1]))) + in_leaf = false; + ++dim_index2; + } + // consider individuals only if all in + if (in_leaf) + individuals[tree_index][gridPoint] += 1; + } + } + } + dim_index++; + } + } + } + } + // update currently considered dimension size + --curr_max; + } + + // Rcout << std::endl; + // Rcout << std::endl; + // Rcout << std::endl; + // + // for(int i = 0; i curr_dims = *tree_t; + // do not purify null + if (curr_dims == std::set{0}) + continue; + // Rcout << std::endl << tree_index_t << " - T: "; + // Rcout << "tree_t:"; + // for(auto dim: curr_dims) Rcout << dim << ", "; + // Rcout << std::endl; + + auto grid = grids[tree_index_t]; + // Rcout << "Grid dimensions of T: "; + // for(auto dim: grid.dimensions) Rcout << dim << ", "; + // Rcout << std::endl; + // go through subtrees of t + int tree_index_u = variables.size(); + for (auto tree_u = variables.rbegin(); tree_u != variables.rend(); ++tree_u) + { + --tree_index_u; + // j_dims = dims of t without u + std::set j_dims = curr_dims; + if (tree_u->size() > curr_dims.size()) + continue; + // check if subset + bool subset = true; + for (const auto dim : *tree_u) + { + if (tree_t->count(dim) == 0) + { + subset = false; + break; + } + j_dims.erase(dim); + } + if (!subset) + continue; + + // Rcout << "Hello"; + // Rcout << " " << tree_index_u << " - U: "; + // for(auto dim: *tree_u) Rcout << dim << ", "; + // Rcout << std::endl; + // Rcout << " Individuals: "; + + double tot_sum = 0; + grid = grids[tree_index_u]; + while (!grid.nextPoint()) + { + auto gridPoint = grid.getPoint(); + // Rcout << individuals[tree_index_u][gridPoint] << ", "; + tot_sum += individuals[tree_index_u][gridPoint]; + } + // Rcout << "Total sum: " << tot_sum << std::endl; + // Rcout << std::endl; + // Rcout << " Grid dimensions of U: "; // for(auto dim: grid.dimensions) Rcout << dim << ", "; // Rcout << std::endl; @@ -459,19 +1021,10 @@ void RandomPlantedForest::purify_3() // Rcout << std::endl; // S cap U std::set k_dims = *tree_s; - std::set k_dims_h1 = *tree_s; - std::set k_dims_h2 = *tree_u; - for (const auto dim : *tree_u) - k_dims.insert(dim); - for (const auto dim : *tree_s) - k_dims_h2.erase(dim); - for (const auto dim : *tree_u) - k_dims_h1.erase(dim); - for (const auto dim : k_dims_h1) - k_dims.erase(dim); - for (const auto dim : k_dims_h2) - k_dims.erase(dim); + for (const auto dim : j_dims) + k_dims.erase(dim); + // std::set k_dims = *tree_s; // for(const auto dim: *tree_t) k_dims.erase(dim); // for(const auto dim: *tree_u) k_dims.insert(dim); @@ -559,6 +1112,8 @@ void RandomPlantedForest::purify_3() purified = true; } + + std::vector> RandomPlantedForest::get_lim_list(const TreeFamily &curr_family) { // lim_list is a list giving for each variable all interval end-points std::vector> lim_list(feature_size); @@ -693,42 +1248,19 @@ void RandomPlantedForest::purify_1() void RandomPlantedForest::purify_2() { + // go through all n_trees families for (auto &curr_family : this->tree_families) { // lim_list is a list giving for each variable all interval end-points - std::vector> lim_list(feature_size); - - // go through all variables of the component - for (int curr_dim = 1; curr_dim <= feature_size; ++curr_dim) - { - std::vector bounds; - - // go through trees of family - for (const auto &curr_tree : curr_family) - { - - // consider only relevant trees that have current dimension as variable - if (!curr_tree.first.count(curr_dim)) - continue; - - // go through leaves of tree - for (const auto &curr_leaf : curr_tree.second->leaves) - { - // get interval ends of variable - bounds.push_back(curr_leaf.intervals[curr_dim - 1].second); - } - } - std::sort(bounds.begin(), bounds.end()); - bounds.erase(std::unique(bounds.begin(), bounds.end()), bounds.end()); - lim_list[curr_dim - 1] = bounds; - } + std::vector> lim_list = get_lim_list(curr_family); // initialize values and individuals for each tree in family std::vector grids(curr_family.size() - 1); std::vector> individuals(curr_family.size() - 1); std::vector>> values(curr_family.size() - 1); + std::vector>> values_old(curr_family.size() - 1); std::vector> variables(curr_family.size() - 1); // ------------- setup finer grid ------------- @@ -738,13 +1270,17 @@ void RandomPlantedForest::purify_2() { if (curr_tree.first == std::set{0}) + { + + // values[tree_index] = rpf::Matrix>(dimensions, std::vector(value_size, 0)); // changed continue; // ignore null tree + } // fill space with dimensions std::vector dimensions; for (const auto &dim : curr_tree.first) { - dimensions.push_back(lim_list[dim - 1].size() - 1); // size - 1 ? + dimensions.push_back(lim_list[dim - 1].size()); // size - 1 ? WICHTIG } // setup grid for leaf indices @@ -753,7 +1289,8 @@ void RandomPlantedForest::purify_2() // initialize data for current tree grids[tree_index] = grid; individuals[tree_index] = utils::Matrix(dimensions, 0); - values[tree_index] = utils::Matrix>(dimensions, std::vector(value_size, 0)); // changed + values[tree_index] = utils::Matrix>(dimensions, std::vector(value_size, 0)); // changed + values_old[tree_index] = utils::Matrix>(dimensions, std::vector(value_size, 0)); // changed variables[tree_index] = curr_tree.first; // fill grid points with individuals and values @@ -798,114 +1335,135 @@ void RandomPlantedForest::purify_2() // sum up values if (in_leaf) - values[tree_index][gridPoint] += leaf.value; // todo: multiclass + { + + values[tree_index][gridPoint] += leaf.value; // todo: multiclass + values_old[tree_index][gridPoint] += leaf.value; // todo: multiclass + } } } ++tree_index; } + // Rcout << variables.size(); + // for(int i = 0; i>(std::vector{1}, std::vector(value_size, 0))); + values_old.insert(values_old.begin(), utils::Matrix>(std::vector{1}, std::vector(value_size, 0))); individuals.insert(individuals.begin(), utils::Matrix(std::vector{1})); variables.insert(variables.begin(), std::set{0}); // recap maximum number of dimensions of current family - unsigned int curr_max = 0; - for (const auto &tree : curr_family) - { - if (tree.first.size() > curr_max) - curr_max = tree.first.size(); - } + unsigned int curr_max = curr_family.rbegin()->first.size(); - auto keys = getKeys(curr_family); while (curr_max > 1) { + auto keys = getKeys(curr_family); // go through split dimensions of all trees for (std::vector>::reverse_iterator key = keys.rbegin(); key != keys.rend(); ++key) { - auto &curr_tree = curr_family[(*key)]; std::set curr_dims = curr_tree->split_dims; - // check if number of dims same as current max_interaction if (curr_dims.size() == curr_max) { - // go through feature dims int dim_index = 0; for (int feature_dim = 1; feature_dim <= feature_size; ++feature_dim) { - // continue only if dim in current tree if (curr_tree->split_dims.count(feature_dim) != 0) { - std::set tree_dims = curr_tree->split_dims; tree_dims.erase(tree_dims.find(feature_dim)); // remove current feature dim from current tree - // check if tree with dimensions exists, if not create std::shared_ptr tree = treeExists(tree_dims, curr_family); if (!tree) { - // get index of old and new tree auto old_tree_index = std::distance(std::begin(curr_family), curr_family.find(curr_tree->get_split_dims())); curr_family.insert(std::make_pair(tree_dims, std::make_shared(DecisionTree(tree_dims)))); auto tree_index = std::distance(std::begin(curr_family), curr_family.find(tree_dims)); - // remove matrix dimension of respective variable std::vector matrix_dimensions = values[old_tree_index].dims; - matrix_dimensions.erase(matrix_dimensions.begin() + dim_index); + // std::vector matrix_dimensions = values_old[old_tree_index].dims; + + // Rcout << typeof(matrix_dimensions.begin()) << std::endl; + matrix_dimensions.erase(matrix_dimensions.begin() + dim_index); // initialize data for new tree auto grid = grid::NDGrid(matrix_dimensions); grids.insert(grids.begin() + tree_index, grid); - values.insert(values.begin() + tree_index, utils::Matrix>(matrix_dimensions, std::vector(0, value_size))); + values.insert(values.begin() + tree_index, utils::Matrix>(matrix_dimensions, std::vector(value_size, 0))); + values_old.insert(values_old.begin() + tree_index, utils::Matrix>(matrix_dimensions, std::vector(value_size, 0))); individuals.insert(individuals.begin() + tree_index, utils::Matrix(matrix_dimensions)); variables.insert(variables.begin() + tree_index, tree_dims); - // fill individuals of new trees while (!grid.nextPoint()) { - std::vector gridPoint = grid.getPoint(); bool in_leaf = true; - // go through sample points to sum up individuals for (const auto &feature_vec : X) { - int dim_index = 0; + int dim_index2 = 0; in_leaf = true; for (const auto &dim : tree_dims) { double val = feature_vec[dim - 1]; - if (!((val >= lim_list[dim - 1][gridPoint[dim_index]]) && (val < lim_list[dim - 1][gridPoint[dim_index] + 1]))) + if (!((val >= lim_list[dim - 1][gridPoint[dim_index2]]) && (val < lim_list[dim - 1][gridPoint[dim_index2] + 1]))) in_leaf = false; - ++dim_index; + ++dim_index2; } - // consider individuals only if all in if (in_leaf) individuals[tree_index][gridPoint] += 1; } } } - dim_index++; } } } } - // update currently considered dimension size --curr_max; } + // Rcout << std::endl; + // Rcout << std::endl; + // Rcout << std::endl; + // + // for(int i = 0; i curr_dims = curr_tree->second->get_split_dims(); + std::set curr_dims = *tree_t; + // do not purify null if (curr_dims == std::set{0}) continue; + // Rcout << std::endl << tree_index_t << " - T: "; + // Rcout << "tree_t:"; + // for(auto dim: curr_dims) Rcout << dim << ", "; + // Rcout << std::endl; + + auto grid = grids[tree_index_t]; // repeat until tolerance small enough and (?) maximum number of iterations reached - while ((tol[curr_tree_index] > 0.00000000001) && (iter < 100)) + while ((tol[tree_index_t] > 0.00000000001) && (iter < 100)) { + tol[tree_index_t] = 0; // go through feature dims int curr_dim_index = 0; - for (const auto &feature_dim : curr_dims) + for (const auto j : curr_dims) { // get tree that has same variables as curr_tree minus j-variable - std::set tree_dims = curr_dims; - tree_dims.erase(tree_dims.find(feature_dim)); - int tree_index = 0; // if tree not exist, set to null tree - if (curr_family.find(tree_dims) != curr_family.end()) - tree_index = std::distance(std::begin(curr_family), curr_family.find(tree_dims)) - 1; + std::set tree_dims_minusj = curr_dims; + tree_dims_minusj.erase(tree_dims_minusj.find(j)); + int tree_index_minusj = 0; // if tree not exist, set to null tree + if (curr_family.find(tree_dims_minusj) != curr_family.end()) + tree_index_minusj = std::distance(std::begin(curr_family), curr_family.find(tree_dims_minusj)) ; // update values - if (grids[curr_tree_index].dimensions.size() == 1) - { // one dimensional case + if (grids[tree_index_t].dimensions.size() == 1) + { // one dimensional case (tree_t one dimensional) int sum_ind = 0; std::vector avg(value_size, 0); // get sum of individuals - for (int i = 0; i < individuals[curr_tree_index].n_entries; ++i) + for (int i = 0; i < individuals[tree_index_t].n_entries; ++i) { std::vector tmp{i}; - sum_ind += individuals[curr_tree_index][tmp]; + sum_ind += individuals[tree_index_t][tmp]; } if (sum_ind == 0) continue; // calc avg - for (int i = 0; i < individuals[curr_tree_index].n_entries; ++i) + for (int i = 0; i < individuals[tree_index_t].n_entries; ++i) { std::vector tmp{i}; - avg += (individuals[curr_tree_index][tmp] * values[curr_tree_index][tmp]) / sum_ind; + avg += (individuals[tree_index_t][tmp] * values[tree_index_t][tmp]) / sum_ind; } // update values of one dimensional and null tree - for (int i = 0; i < values[curr_tree_index].n_entries; ++i) + for (int i = 0; i < values[tree_index_t].n_entries; ++i) { std::vector tmp{i}; - values[curr_tree_index][tmp] -= avg; + values[tree_index_t][tmp] -= avg; } std::vector tmp{0}; - values[tree_index][tmp] += avg; + values[tree_index_minusj][tmp] += avg; } else - { // higher dimensional case + { // higher dimensional case (tree_t dimension >1) // setup new grid without dimension j - std::vector new_dimensions = grids[curr_tree_index].dimensions; - int j_dim = new_dimensions[curr_dim_index]; - new_dimensions.erase(new_dimensions.begin() + curr_dim_index); - grid::NDGrid grid = grid::NDGrid(new_dimensions); + std::vector new_dimensions = grids[tree_index_t].dimensions; + int index_j_in_t = std::distance(variables[tree_index_t].begin(), variables[tree_index_t].find(j)); + std::vector j_dim; + j_dim.push_back(new_dimensions[index_j_in_t]); + new_dimensions.erase(new_dimensions.begin() + index_j_in_t); + + grid::NDGrid grid_minusj = grid::NDGrid(new_dimensions); + grid::NDGrid grid_minusj2 = grid::NDGrid(grids[tree_index_minusj]); // go through values without dimension j - while (!grid.nextPoint()) + while (!grid_minusj.nextPoint()) { - auto gridPoint = grid.getPoint(); - gridPoint.push_back(0); - int sum_ind = 0; + std::vector update(value_size, 0); + auto gridPoint_minusj = grid_minusj.getPoint(); + // Rcout << " " << "j: "; + // for(auto p: gridPoint_j) Rcout << p << ", "; + // Rcout << std::endl; + // calc update + + int tree_index_j = std::distance(variables[tree_index_t].begin(), variables[tree_index_t].find(j)); + + + grid::NDGrid grid_j = grid::NDGrid(j_dim); + + double sum_ind = 0; std::vector avg(value_size, 0); - // go through slice to sum up individuals - for (int j = 0; j < j_dim; ++j) + while (!grid_j.nextPoint()) { - gridPoint.back() = j; - - // get sum of individuals - sum_ind += individuals[curr_tree_index][gridPoint]; + auto gridPoint_j = grid_j.getPoint(); + // Rcout << " " << "i: "; + // for(auto p: gridPoint_i) Rcout << p << ", "; + // Rcout << std::endl << " "; + std::vector gridPoint_t(tree_t->size(), 0); + + gridPoint_t[tree_index_j] = gridPoint_j[0]; + + for (size_t minusj = 0; minusj < gridPoint_minusj.size(); ++minusj) + { + auto minusj_dim = tree_dims_minusj.begin(); + std::advance(minusj_dim, minusj); + int minusj_index = std::distance(variables[tree_index_t].begin(), variables[tree_index_t].find(*minusj_dim)); + // Rcout << " i_dim=" << *i_dim << ", i_index=" << i_index; + gridPoint_t[minusj_index] = gridPoint_minusj[minusj]; + } + double sum_curr = individuals[tree_index_t][gridPoint_t]; + sum_ind += sum_curr; + avg += (sum_curr * values[tree_index_t][gridPoint_t][0]); } + + if (sum_ind !=0) { + avg = avg/sum_ind; + } + else + { continue; + } + double avgtol = avg[0]; + tol[tree_index_t] = std::max(std::fabs(avgtol), tol[tree_index_t]); + values[tree_index_minusj][gridPoint_minusj] += avg; - // go through slice to calc avg - for (int j = 0; j < j_dim; ++j) - { - gridPoint.back() = j; - - // calc avg - avg += (individuals[curr_tree_index][gridPoint] * values[curr_tree_index][gridPoint]) / sum_ind; - } + // update values for t-tree + grid_j = grid::NDGrid(j_dim); - // go through slice to update values - for (int j = 0; j < j_dim; ++j) + while (!grid_j.nextPoint()) { - gridPoint.back() = j; - - // update values of current slice - values[curr_tree_index][gridPoint] -= avg; + auto gridPoint_j = grid_j.getPoint(); + // Rcout << " " << "i: "; + // for(auto p: gridPoint_i) Rcout << p << ", "; + // Rcout << std::endl << " "; + std::vector gridPoint_t(tree_t->size(), 0); + + gridPoint_t[tree_index_j] = gridPoint_j[0]; + + for (size_t minusj = 0; minusj < gridPoint_minusj.size(); ++minusj) + { + auto minusj_dim = tree_dims_minusj.begin(); + std::advance(minusj_dim, minusj); + int minusj_index = std::distance(variables[tree_index_t].begin(), variables[tree_index_t].find(*minusj_dim)); + // Rcout << " i_dim=" << *i_dim << ", i_index=" << i_index; + gridPoint_t[minusj_index] = gridPoint_minusj[minusj]; + } + + values[tree_index_t][gridPoint_t] -= avg; } + - // update lower dimensional tree - gridPoint.pop_back(); - values[tree_index][gridPoint] += avg; - } - } + } // end of minusj loop + } // end of loop for higher dim t case ++curr_dim_index; - } - // update tolerance - if (variables[curr_tree_index].size() == 1) - { - tol[curr_tree_index] = 1; // todo - } - else - { - tol[curr_tree_index] = 1; - } + } // end of j loop + ++iter; - } + } // finished tol loop - --curr_tree_index; + --tree_index_t; } + // ------------- attach to rpf class ------------- // fill with new trees diff --git a/src/randomPlantedForest.cpp b/src/randomPlantedForest.cpp index e57d727..68d54f2 100644 --- a/src/randomPlantedForest.cpp +++ b/src/randomPlantedForest.cpp @@ -15,6 +15,7 @@ RCPP_MODULE(mod_rpf) .method("predict_vector", &RandomPlantedForest::predict_vector) .method("MSE", &RandomPlantedForest::MSE) .method("purify", &RandomPlantedForest::purify_3) + .method("purify3_new", &RandomPlantedForest::purify_3_new) .method("purify1", &RandomPlantedForest::purify_1) .method("purify2", &RandomPlantedForest::purify_2) .method("purify_new", &RandomPlantedForest::purify_no_extrapolation) From e3e8d39b03b1bac8811114331b95b1ed55c36e92 Mon Sep 17 00:00:00 2001 From: Jinyang Liu Date: Sun, 23 Jun 2024 18:29:06 +0200 Subject: [PATCH 08/16] Purify2 now modifies existing leaves --- src/include/helper.hpp | 2 +- src/lib/rpf.cpp | 3 +- src/lib/rpf_purify_anova.cpp | 457 +++++++++++++++++++++++++++++++++++ 3 files changed, 460 insertions(+), 2 deletions(-) create mode 100644 src/lib/rpf_purify_anova.cpp diff --git a/src/include/helper.hpp b/src/include/helper.hpp index c80cbfd..b19b09f 100644 --- a/src/include/helper.hpp +++ b/src/include/helper.hpp @@ -59,7 +59,7 @@ namespace utils if (n_entries != entries.size()) throw std::invalid_argument("Invalid matrix size."); } - T &operator[](std::vector &indices) + T &operator[](const std::vector& indices) { if (indices.size() != dims.size()) throw std::invalid_argument("Invalid number of indices."); diff --git a/src/lib/rpf.cpp b/src/lib/rpf.cpp index 7bc168e..9835cfe 100644 --- a/src/lib/rpf.cpp +++ b/src/lib/rpf.cpp @@ -757,7 +757,8 @@ std::vector RandomPlantedForest::predict_single(const std::vectorGridLeaves.values[leaf_index]; + const auto to_add = tree.second->GridLeaves.values[leaf_index]; + total_res += to_add; } } } diff --git a/src/lib/rpf_purify_anova.cpp b/src/lib/rpf_purify_anova.cpp new file mode 100644 index 0000000..af6c125 --- /dev/null +++ b/src/lib/rpf_purify_anova.cpp @@ -0,0 +1,457 @@ +#include "rpf.hpp" + +void RandomPlantedForest::purify_2() +{ + + // go through all n_trees families + for (auto &curr_family : this->tree_families) + { + + // lim_list is a list giving for each variable all interval end-points + std::vector> lim_list = get_lim_list(curr_family); + + // initialize values and individuals for each tree in family + std::vector grids(curr_family.size() - 1); + std::vector> individuals(curr_family.size() - 1); + std::vector>> values(curr_family.size() - 1); + std::vector>> values_old(curr_family.size() - 1); + std::vector> variables(curr_family.size() - 1); + + // ------------- setup finer grid ------------- + + int tree_index = 0; + for (const auto &curr_tree : curr_family) + { + + if (curr_tree.first == std::set{0}) + { + + // values[tree_index] = rpf::Matrix>(dimensions, std::vector(value_size, 0)); // changed + continue; // ignore null tree + } + + // fill space with dimensions + std::vector dimensions; + for (const auto &dim : curr_tree.first) + { + dimensions.push_back(lim_list[dim - 1].size()); // size - 1 ? WICHTIG + } + + // setup grid for leaf indices + auto grid = grid::NDGrid(dimensions); + + // initialize data for current tree + grids[tree_index] = grid; + individuals[tree_index] = utils::Matrix(dimensions, 0); + values[tree_index] = utils::Matrix>(dimensions, std::vector(value_size, 0)); // changed + values_old[tree_index] = utils::Matrix>(dimensions, std::vector(value_size, 0)); // changed + variables[tree_index] = curr_tree.first; + + // fill grid points with individuals and values + while (!grid.nextPoint()) + { + + std::vector gridPoint = grid.getPoint(); + + bool in_leaf = true; + + // go through sample points to sum up individuals + for (const auto &feature_vec : X) + { + int dim_index = 0; + in_leaf = true; + for (const auto &dim : curr_tree.first) + { + double val = feature_vec[dim - 1]; + if (!((val >= lim_list[dim - 1][gridPoint[dim_index]]) && (val < lim_list[dim - 1][gridPoint[dim_index] + 1]))) + in_leaf = false; + ++dim_index; + } + + // consider individuals only if all in + if (in_leaf) + individuals[tree_index][gridPoint] += 1; + } + + // go through leaves of tree to sum up values + for (const auto &leaf : curr_tree.second->get_leaves()) + { + + in_leaf = true; + int dim_index = 0; + for (const auto &dim : curr_tree.first) + { + // consider values only if all in + if (!((leaf.intervals[dim - 1].first <= lim_list[dim - 1][gridPoint[dim_index]]) && (leaf.intervals[dim - 1].second >= lim_list[dim - 1][gridPoint[dim_index] + 1]))) + in_leaf = false; + ++dim_index; + } + + // sum up values + if (in_leaf) + { + + values[tree_index][gridPoint] += leaf.value; // todo: multiclass + values_old[tree_index][gridPoint] += leaf.value; // todo: multiclass + } + } + } + + ++tree_index; + } + + // Rcout << variables.size(); + // for(int i = 0; i>(std::vector{1}, std::vector(value_size, 0))); + values_old.insert(values_old.begin(), utils::Matrix>(std::vector{1}, std::vector(value_size, 0))); + individuals.insert(individuals.begin(), utils::Matrix(std::vector{1})); + variables.insert(variables.begin(), std::set{0}); + + // recap maximum number of dimensions of current family + unsigned int curr_max = curr_family.rbegin()->first.size(); + + while (curr_max > 1) + { + + auto keys = getKeys(curr_family); + // go through split dimensions of all trees + for (std::vector>::reverse_iterator key = keys.rbegin(); key != keys.rend(); ++key) + { + auto &curr_tree = curr_family[(*key)]; + std::set curr_dims = curr_tree->split_dims; + // check if number of dims same as current max_interaction + if (curr_dims.size() == curr_max) + { + // go through feature dims + int dim_index = 0; + for (int feature_dim = 1; feature_dim <= feature_size; ++feature_dim) + { + // continue only if dim in current tree + if (curr_tree->split_dims.count(feature_dim) != 0) + { + std::set tree_dims = curr_tree->split_dims; + tree_dims.erase(tree_dims.find(feature_dim)); // remove current feature dim from current tree + // check if tree with dimensions exists, if not create + std::shared_ptr tree = treeExists(tree_dims, curr_family); + if (!tree) + { + // get index of old and new tree + auto old_tree_index = std::distance(std::begin(curr_family), curr_family.find(curr_tree->get_split_dims())); + curr_family.insert(std::make_pair(tree_dims, std::make_shared(DecisionTree(tree_dims)))); + auto tree_index = std::distance(std::begin(curr_family), curr_family.find(tree_dims)); + // remove matrix dimension of respective variable + std::vector matrix_dimensions = values[old_tree_index].dims; + // std::vector matrix_dimensions = values_old[old_tree_index].dims; + + // Rcout << typeof(matrix_dimensions.begin()) << std::endl; + + matrix_dimensions.erase(matrix_dimensions.begin() + dim_index); + // initialize data for new tree + auto grid = grid::NDGrid(matrix_dimensions); + grids.insert(grids.begin() + tree_index, grid); + values.insert(values.begin() + tree_index, utils::Matrix>(matrix_dimensions, std::vector(value_size, 0))); + values_old.insert(values_old.begin() + tree_index, utils::Matrix>(matrix_dimensions, std::vector(value_size, 0))); + individuals.insert(individuals.begin() + tree_index, utils::Matrix(matrix_dimensions)); + variables.insert(variables.begin() + tree_index, tree_dims); + // fill individuals of new trees + while (!grid.nextPoint()) + { + std::vector gridPoint = grid.getPoint(); + bool in_leaf = true; + // go through sample points to sum up individuals + for (const auto &feature_vec : X) + { + int dim_index2 = 0; + in_leaf = true; + for (const auto &dim : tree_dims) + { + double val = feature_vec[dim - 1]; + const double lower = lim_list[dim - 1][gridPoint[dim_index2]]; + const double upper = lim_list[dim - 1][gridPoint[dim_index2] + 1]; + if (!((val >= lower) && (val < upper))) + in_leaf = false; + ++dim_index2; + } + // consider individuals only if all in + if (in_leaf) + individuals[tree_index][gridPoint] += 1; + } + } + } + dim_index++; + } + } + } + } + // update currently considered dimension size + --curr_max; + } + + // Rcout << std::endl; + // Rcout << std::endl; + // Rcout << std::endl; + // + // for(int i = 0; i tol(curr_family.size(), 1); + int iter; + + // iterate backwards through tree family + int tree_index_t = curr_family.size() - 1; + + for (auto tree_t = variables.rbegin(); tree_t != variables.rend(); ++tree_t) + { + iter = 0; + std::set curr_dims = *tree_t; + + // do not purify null + if (curr_dims == std::set{0}) + continue; + // Rcout << std::endl << tree_index_t << " - T: "; + // Rcout << "tree_t:"; + // for(auto dim: curr_dims) Rcout << dim << ", "; + // Rcout << std::endl; + + auto grid = grids[tree_index_t]; + + // repeat until tolerance small enough and (?) maximum number of iterations reached + while ((tol[tree_index_t] > 0.00000000001) && (iter < 100)) + { + tol[tree_index_t] = 0; + + // go through feature dims + int curr_dim_index = 0; + for (const auto j : curr_dims) + { + + // get tree that has same variables as curr_tree minus j-variable + std::set tree_dims_minusj = curr_dims; + tree_dims_minusj.erase(tree_dims_minusj.find(j)); + int tree_index_minusj = 0; // if tree not exist, set to null tree + if (curr_family.find(tree_dims_minusj) != curr_family.end()) + tree_index_minusj = std::distance(std::begin(curr_family), curr_family.find(tree_dims_minusj)); + + // update values + if (grids[tree_index_t].dimensions.size() == 1) + { // one dimensional case (tree_t one dimensional) + + int sum_ind = 0; + std::vector avg(value_size, 0); + + // get sum of individuals + for (int i = 0; i < individuals[tree_index_t].n_entries; ++i) + { + std::vector tmp{i}; + sum_ind += individuals[tree_index_t][tmp]; + } + if (sum_ind == 0) + continue; + + // calc avg + for (int i = 0; i < individuals[tree_index_t].n_entries; ++i) + { + std::vector tmp{i}; + avg += (individuals[tree_index_t][tmp] * values[tree_index_t][tmp]) / sum_ind; + } + + // update values of one dimensional and null tree + for (int i = 0; i < values[tree_index_t].n_entries; ++i) + { + std::vector tmp{i}; + values[tree_index_t][tmp] -= avg; + } + std::vector tmp{0}; + values[tree_index_minusj][tmp] += avg; + } + else + { // higher dimensional case (tree_t dimension >1) + + // setup new grid without dimension j + std::vector new_dimensions = grids[tree_index_t].dimensions; + int index_j_in_t = std::distance(variables[tree_index_t].begin(), variables[tree_index_t].find(j)); + std::vector j_dim; + j_dim.push_back(new_dimensions[index_j_in_t]); + new_dimensions.erase(new_dimensions.begin() + index_j_in_t); + + grid::NDGrid grid_minusj = grid::NDGrid(new_dimensions); + grid::NDGrid grid_minusj2 = grid::NDGrid(grids[tree_index_minusj]); + + // go through values without dimension j + while (!grid_minusj.nextPoint()) + { + + std::vector update(value_size, 0); + auto gridPoint_minusj = grid_minusj.getPoint(); + // Rcout << " " << "j: "; + // for(auto p: gridPoint_j) Rcout << p << ", "; + // Rcout << std::endl; + // calc update + + int tree_index_j = std::distance(variables[tree_index_t].begin(), variables[tree_index_t].find(j)); + + grid::NDGrid grid_j = grid::NDGrid(j_dim); + + double sum_ind = 0; + std::vector avg(value_size, 0); + + while (!grid_j.nextPoint()) + { + auto gridPoint_j = grid_j.getPoint(); + // Rcout << " " << "i: "; + // for(auto p: gridPoint_i) Rcout << p << ", "; + // Rcout << std::endl << " "; + std::vector gridPoint_t(tree_t->size(), 0); + + gridPoint_t[tree_index_j] = gridPoint_j[0]; + + for (size_t minusj = 0; minusj < gridPoint_minusj.size(); ++minusj) + { + auto minusj_dim = tree_dims_minusj.begin(); + std::advance(minusj_dim, minusj); + int minusj_index = std::distance(variables[tree_index_t].begin(), variables[tree_index_t].find(*minusj_dim)); + // Rcout << " i_dim=" << *i_dim << ", i_index=" << i_index; + gridPoint_t[minusj_index] = gridPoint_minusj[minusj]; + } + double sum_curr = individuals[tree_index_t][gridPoint_t]; + sum_ind += sum_curr; + avg += (sum_curr * values[tree_index_t][gridPoint_t][0]); + } + + if (sum_ind != 0) + { + avg = avg / sum_ind; + } + else + { + continue; + } + double avgtol = avg[0]; + tol[tree_index_t] = std::max(std::fabs(avgtol), tol[tree_index_t]); + values[tree_index_minusj][gridPoint_minusj] += avg; + + // update values for t-tree + grid_j = grid::NDGrid(j_dim); + + while (!grid_j.nextPoint()) + { + auto gridPoint_j = grid_j.getPoint(); + // Rcout << " " << "i: "; + // for(auto p: gridPoint_i) Rcout << p << ", "; + // Rcout << std::endl << " "; + std::vector gridPoint_t(tree_t->size(), 0); + + gridPoint_t[tree_index_j] = gridPoint_j[0]; + + for (size_t minusj = 0; minusj < gridPoint_minusj.size(); ++minusj) + { + auto minusj_dim = tree_dims_minusj.begin(); + std::advance(minusj_dim, minusj); + int minusj_index = std::distance(variables[tree_index_t].begin(), variables[tree_index_t].find(*minusj_dim)); + // Rcout << " i_dim=" << *i_dim << ", i_index=" << i_index; + gridPoint_t[minusj_index] = gridPoint_minusj[minusj]; + } + + values[tree_index_t][gridPoint_t] -= avg; + } + + } // end of minusj loop + + } // end of loop for higher dim t case + ++curr_dim_index; + + } // end of j loop + + ++iter; + } // finished tol loop + + --tree_index_t; + } + + // ------------- attach to rpf class ------------- + + // fill with new trees + for (size_t tree_index = 0; tree_index < variables.size(); ++tree_index) + { + LeafGrid curr_gridLeaf; + curr_gridLeaf.grid = grids[tree_index]; + curr_gridLeaf.individuals = individuals[tree_index]; + curr_gridLeaf.lim_list = lim_list; + curr_gridLeaf.values = values[tree_index]; + auto &curr_tree = curr_family[variables[tree_index]]; + curr_tree->GridLeaves = curr_gridLeaf; + + auto &tree_dims = variables[tree_index]; + + if (tree_dims == std::set{0}) { + curr_tree->leaves[0].value = curr_gridLeaf.values[{0}]; + continue; + } + curr_tree->leaves.clear(); + + while (!curr_gridLeaf.grid.nextPoint()) { + auto grid_point = curr_gridLeaf.grid.getPoint(); + + bool not_end_leaf = true; + Leaf new_leaf; + { + new_leaf.value = curr_gridLeaf.values[grid_point]; + // initialize interval with split interval + int dim_idx = 0; + for (int dim = 1; dim <= feature_size; ++dim) { + const int gp = grid_point[dim_idx]; + if (gp >= lim_list[dim-1].size()-1){ + not_end_leaf = false; + break; + } + + double lower, upper; + if (tree_dims.count(dim) == 0) { + lower = lower_bounds[dim-1]; + upper = upper_bounds[dim-1]; + } else { + lower = lim_list[dim-1][gp]; + upper = lim_list[dim-1][gp + 1]; + ++dim_idx; + } + + new_leaf.intervals.push_back(Interval{lower, upper}); + + } + } + + if (not_end_leaf) + curr_tree->leaves.push_back(new_leaf); + } + } + } + + purified = true; // debug +} From 64c6a1f3f6354fc6505d4f0688a839d045afff99 Mon Sep 17 00:00:00 2001 From: Jinyang Liu Date: Tue, 25 Jun 2024 13:32:25 +0200 Subject: [PATCH 09/16] Update makevars --- src/Makevars | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Makevars b/src/Makevars index ca5e0c2..17a0e98 100644 --- a/src/Makevars +++ b/src/Makevars @@ -1,4 +1,4 @@ -SOURCES=lib/cpf.cpp lib/grid.cpp lib/helper.cpp lib/rpf.cpp lib/trees.cpp lib/rpf_purify.cpp lib/rpf_purify_no_extrapolation.cpp randomPlantedForest.cpp RcppExports.cpp +SOURCES=lib/cpf.cpp lib/grid.cpp lib/helper.cpp lib/rpf.cpp lib/trees.cpp lib/rpf_purify_no_extrapolation.cpp lib/rpf_purify_anova.cpp lib/rpf_purify.cpp randomPlantedForest.cpp RcppExports.cpp OBJECTS = $(SOURCES:.cpp=.o) From 1c8e3e1f08e5084949c7cd24fa8ea237de7b5303 Mon Sep 17 00:00:00 2001 From: Jinyang Liu Date: Tue, 25 Jun 2024 13:35:01 +0200 Subject: [PATCH 10/16] Delete old purify --- src/lib/rpf_purify.cpp | 681 ++++++++--------------------------------- 1 file changed, 128 insertions(+), 553 deletions(-) diff --git a/src/lib/rpf_purify.cpp b/src/lib/rpf_purify.cpp index 726e8f0..6411021 100644 --- a/src/lib/rpf_purify.cpp +++ b/src/lib/rpf_purify.cpp @@ -1,8 +1,5 @@ #include "rpf.hpp" - - - void RandomPlantedForest::purify_3() { @@ -12,7 +9,7 @@ void RandomPlantedForest::purify_3() // lim_list is a list giving for each variable all interval end-points std::vector> lim_list = get_lim_list(curr_family); - + // initialize values and individuals for each tree in family std::vector grids(curr_family.size() - 1); std::vector> individuals(curr_family.size() - 1); @@ -561,8 +558,135 @@ void RandomPlantedForest::purify_3() purified = true; } +std::vector> RandomPlantedForest::get_lim_list(const TreeFamily &curr_family) { + // lim_list is a list giving for each variable all interval end-points + std::vector> lim_list(feature_size); + + // go through all variables of the component + for (int curr_dim = 1; curr_dim <= feature_size; ++curr_dim) + { + std::vector bounds; + + // go through trees of family + for (const auto &curr_tree : curr_family) + { + + // consider only relevant trees that have current dimension as variable + if (!curr_tree.first.count(curr_dim)) + continue; + + // go through leaves of tree + for (const auto &curr_leaf : curr_tree.second->leaves) + { + // get interval ends of variable + bounds.push_back(curr_leaf.intervals[curr_dim - 1].first); + bounds.push_back(curr_leaf.intervals[curr_dim - 1].second); + } + } + std::sort(bounds.begin(), bounds.end()); + bounds.erase(std::unique(bounds.begin(), bounds.end()), bounds.end()); + // int i_last = bounds.size()-1; + // double bibi = bounds[i_last] + 0.0001; + // bounds[i_last] = bounds[i_last] + 0.0001; + lim_list[curr_dim - 1] = bounds; + } + + return lim_list; +} + +void RandomPlantedForest::purify_1() +{ + + // go through all n_trees families + for (auto &curr_family : this->tree_families) + { + + // recap maximum number of dimensions of current family + unsigned int curr_max = 0; + for (auto tree : curr_family) + { + if (tree.first.size() > curr_max) + curr_max = tree.first.size(); + } + + while (curr_max >= 1) + { + + // go through split dimensions of all trees + auto keys = getKeys(curr_family); + std::vector>::reverse_iterator key = keys.rbegin(); + while (key != keys.rend()) + { + + auto &curr_tree = curr_family[(*key)]; + std::set curr_dims = curr_tree->split_dims; + + // check if number of dims same as current max_interaction + if (curr_dims.size() == curr_max) + { + + // go through feature dims + for (int feature_dim = 1; feature_dim <= feature_size; ++feature_dim) + { + + // continue only if dim in current tree + if (curr_tree->split_dims.count(feature_dim) != 0) + { + + std::set tree_dims = curr_tree->split_dims; + tree_dims.erase(tree_dims.find(feature_dim)); // remove current feature dim from current tree + + // check if tree with dimensions exists, if not create + std::shared_ptr tree = treeExists(tree_dims, curr_family); + if (curr_max == 1) + { + tree = curr_family[std::set{0}]; + } + else + { + if (!tree) + { + curr_family.insert(std::make_pair(tree_dims, std::make_shared(DecisionTree(tree_dims)))); + tree = curr_family[tree_dims]; + } + } + // go through leaves of current tree + int n_leaves = curr_tree->leaves.size(); + for (int l = 0; l < n_leaves; ++l) + { + auto &curr_leaf = curr_tree->leaves[l]; + + double multiplier = (curr_leaf.intervals[feature_dim - 1].second - curr_leaf.intervals[feature_dim - 1].first) / (upper_bounds[feature_dim - 1] - lower_bounds[feature_dim - 1]); + + // new leaf including intervals and value + Leaf new_leaf = curr_leaf; // initialize intervals with first leaf + new_leaf.intervals[feature_dim - 1].first = lower_bounds[feature_dim - 1]; + new_leaf.intervals[feature_dim - 1].second = upper_bounds[feature_dim - 1]; + for (size_t i = 0; i < value_size; ++i) + new_leaf.value[i] = -curr_leaf.value[i] * multiplier; // update value of new leaf + + // append new leaf + if (!leafExists(new_leaf.intervals, curr_tree)) + curr_tree->leaves.push_back(new_leaf); + for (size_t i = 0; i < value_size; ++i) + new_leaf.value[i] = curr_leaf.value[i] * multiplier; // update value of new leaf + if (!leafExists(new_leaf.intervals, tree)) + tree->leaves.push_back(new_leaf); + } + } + } + } + key++; + } + + // update currently considered dimension size + --curr_max; + } + } + purified = true; +} void RandomPlantedForest::purify_3_new() { @@ -1111,552 +1235,3 @@ void RandomPlantedForest::purify_3_new() purified = true; } - - - -std::vector> RandomPlantedForest::get_lim_list(const TreeFamily &curr_family) { - // lim_list is a list giving for each variable all interval end-points - std::vector> lim_list(feature_size); - - // go through all variables of the component - for (int curr_dim = 1; curr_dim <= feature_size; ++curr_dim) - { - std::vector bounds; - - // go through trees of family - for (const auto &curr_tree : curr_family) - { - - // consider only relevant trees that have current dimension as variable - if (!curr_tree.first.count(curr_dim)) - continue; - - // go through leaves of tree - for (const auto &curr_leaf : curr_tree.second->leaves) - { - // get interval ends of variable - bounds.push_back(curr_leaf.intervals[curr_dim - 1].first); - bounds.push_back(curr_leaf.intervals[curr_dim - 1].second); - } - } - std::sort(bounds.begin(), bounds.end()); - bounds.erase(std::unique(bounds.begin(), bounds.end()), bounds.end()); - // int i_last = bounds.size()-1; - // double bibi = bounds[i_last] + 0.0001; - // bounds[i_last] = bounds[i_last] + 0.0001; - lim_list[curr_dim - 1] = bounds; - } - - return lim_list; -} - - -void RandomPlantedForest::purify_1() -{ - - // go through all n_trees families - for (auto &curr_family : this->tree_families) - { - - // recap maximum number of dimensions of current family - unsigned int curr_max = 0; - for (auto tree : curr_family) - { - if (tree.first.size() > curr_max) - curr_max = tree.first.size(); - } - - while (curr_max >= 1) - { - - // go through split dimensions of all trees - auto keys = getKeys(curr_family); - std::vector>::reverse_iterator key = keys.rbegin(); - while (key != keys.rend()) - { - - auto &curr_tree = curr_family[(*key)]; - std::set curr_dims = curr_tree->split_dims; - - // check if number of dims same as current max_interaction - if (curr_dims.size() == curr_max) - { - - // go through feature dims - for (int feature_dim = 1; feature_dim <= feature_size; ++feature_dim) - { - - // continue only if dim in current tree - if (curr_tree->split_dims.count(feature_dim) != 0) - { - - std::set tree_dims = curr_tree->split_dims; - tree_dims.erase(tree_dims.find(feature_dim)); // remove current feature dim from current tree - - // check if tree with dimensions exists, if not create - std::shared_ptr tree = treeExists(tree_dims, curr_family); - if (curr_max == 1) - { - tree = curr_family[std::set{0}]; - } - else - { - if (!tree) - { - curr_family.insert(std::make_pair(tree_dims, std::make_shared(DecisionTree(tree_dims)))); - tree = curr_family[tree_dims]; - } - } - - // go through leaves of current tree - int n_leaves = curr_tree->leaves.size(); - for (int l = 0; l < n_leaves; ++l) - { - auto &curr_leaf = curr_tree->leaves[l]; - - double multiplier = (curr_leaf.intervals[feature_dim - 1].second - curr_leaf.intervals[feature_dim - 1].first) / (upper_bounds[feature_dim - 1] - lower_bounds[feature_dim - 1]); - - // new leaf including intervals and value - Leaf new_leaf = curr_leaf; // initialize intervals with first leaf - new_leaf.intervals[feature_dim - 1].first = lower_bounds[feature_dim - 1]; - new_leaf.intervals[feature_dim - 1].second = upper_bounds[feature_dim - 1]; - for (size_t i = 0; i < value_size; ++i) - new_leaf.value[i] = -curr_leaf.value[i] * multiplier; // update value of new leaf - - // append new leaf - if (!leafExists(new_leaf.intervals, curr_tree)) - curr_tree->leaves.push_back(new_leaf); - for (size_t i = 0; i < value_size; ++i) - new_leaf.value[i] = curr_leaf.value[i] * multiplier; // update value of new leaf - if (!leafExists(new_leaf.intervals, tree)) - tree->leaves.push_back(new_leaf); - } - } - } - } - key++; - } - - // update currently considered dimension size - --curr_max; - } - } - - purified = true; -} - -void RandomPlantedForest::purify_2() -{ - - - // go through all n_trees families - for (auto &curr_family : this->tree_families) - { - - // lim_list is a list giving for each variable all interval end-points - std::vector> lim_list = get_lim_list(curr_family); - - // initialize values and individuals for each tree in family - std::vector grids(curr_family.size() - 1); - std::vector> individuals(curr_family.size() - 1); - std::vector>> values(curr_family.size() - 1); - std::vector>> values_old(curr_family.size() - 1); - std::vector> variables(curr_family.size() - 1); - - // ------------- setup finer grid ------------- - - int tree_index = 0; - for (const auto &curr_tree : curr_family) - { - - if (curr_tree.first == std::set{0}) - { - - // values[tree_index] = rpf::Matrix>(dimensions, std::vector(value_size, 0)); // changed - continue; // ignore null tree - } - - // fill space with dimensions - std::vector dimensions; - for (const auto &dim : curr_tree.first) - { - dimensions.push_back(lim_list[dim - 1].size()); // size - 1 ? WICHTIG - } - - // setup grid for leaf indices - auto grid = grid::NDGrid(dimensions); - - // initialize data for current tree - grids[tree_index] = grid; - individuals[tree_index] = utils::Matrix(dimensions, 0); - values[tree_index] = utils::Matrix>(dimensions, std::vector(value_size, 0)); // changed - values_old[tree_index] = utils::Matrix>(dimensions, std::vector(value_size, 0)); // changed - variables[tree_index] = curr_tree.first; - - // fill grid points with individuals and values - while (!grid.nextPoint()) - { - - std::vector gridPoint = grid.getPoint(); - - bool in_leaf = true; - - // go through sample points to sum up individuals - for (const auto &feature_vec : X) - { - int dim_index = 0; - in_leaf = true; - for (const auto &dim : curr_tree.first) - { - double val = feature_vec[dim - 1]; - if (!((val >= lim_list[dim - 1][gridPoint[dim_index]]) && (val < lim_list[dim - 1][gridPoint[dim_index] + 1]))) - in_leaf = false; - ++dim_index; - } - - // consider individuals only if all in - if (in_leaf) - individuals[tree_index][gridPoint] += 1; - } - - // go through leaves of tree to sum up values - for (const auto &leaf : curr_tree.second->get_leaves()) - { - - in_leaf = true; - int dim_index = 0; - for (const auto &dim : curr_tree.first) - { - // consider values only if all in - if (!((leaf.intervals[dim - 1].first <= lim_list[dim - 1][gridPoint[dim_index]]) && (leaf.intervals[dim - 1].second >= lim_list[dim - 1][gridPoint[dim_index] + 1]))) - in_leaf = false; - ++dim_index; - } - - // sum up values - if (in_leaf) - { - - values[tree_index][gridPoint] += leaf.value; // todo: multiclass - values_old[tree_index][gridPoint] += leaf.value; // todo: multiclass - } - } - } - - ++tree_index; - } - - // Rcout << variables.size(); - // for(int i = 0; i>(std::vector{1}, std::vector(value_size, 0))); - values_old.insert(values_old.begin(), utils::Matrix>(std::vector{1}, std::vector(value_size, 0))); - individuals.insert(individuals.begin(), utils::Matrix(std::vector{1})); - variables.insert(variables.begin(), std::set{0}); - - // recap maximum number of dimensions of current family - unsigned int curr_max = curr_family.rbegin()->first.size(); - - while (curr_max > 1) - { - - auto keys = getKeys(curr_family); - // go through split dimensions of all trees - for (std::vector>::reverse_iterator key = keys.rbegin(); key != keys.rend(); ++key) - { - auto &curr_tree = curr_family[(*key)]; - std::set curr_dims = curr_tree->split_dims; - // check if number of dims same as current max_interaction - if (curr_dims.size() == curr_max) - { - // go through feature dims - int dim_index = 0; - for (int feature_dim = 1; feature_dim <= feature_size; ++feature_dim) - { - // continue only if dim in current tree - if (curr_tree->split_dims.count(feature_dim) != 0) - { - std::set tree_dims = curr_tree->split_dims; - tree_dims.erase(tree_dims.find(feature_dim)); // remove current feature dim from current tree - // check if tree with dimensions exists, if not create - std::shared_ptr tree = treeExists(tree_dims, curr_family); - if (!tree) - { - // get index of old and new tree - auto old_tree_index = std::distance(std::begin(curr_family), curr_family.find(curr_tree->get_split_dims())); - curr_family.insert(std::make_pair(tree_dims, std::make_shared(DecisionTree(tree_dims)))); - auto tree_index = std::distance(std::begin(curr_family), curr_family.find(tree_dims)); - // remove matrix dimension of respective variable - std::vector matrix_dimensions = values[old_tree_index].dims; - // std::vector matrix_dimensions = values_old[old_tree_index].dims; - - // Rcout << typeof(matrix_dimensions.begin()) << std::endl; - - matrix_dimensions.erase(matrix_dimensions.begin() + dim_index); - // initialize data for new tree - auto grid = grid::NDGrid(matrix_dimensions); - grids.insert(grids.begin() + tree_index, grid); - values.insert(values.begin() + tree_index, utils::Matrix>(matrix_dimensions, std::vector(value_size, 0))); - values_old.insert(values_old.begin() + tree_index, utils::Matrix>(matrix_dimensions, std::vector(value_size, 0))); - individuals.insert(individuals.begin() + tree_index, utils::Matrix(matrix_dimensions)); - variables.insert(variables.begin() + tree_index, tree_dims); - // fill individuals of new trees - while (!grid.nextPoint()) - { - std::vector gridPoint = grid.getPoint(); - bool in_leaf = true; - // go through sample points to sum up individuals - for (const auto &feature_vec : X) - { - int dim_index2 = 0; - in_leaf = true; - for (const auto &dim : tree_dims) - { - double val = feature_vec[dim - 1]; - if (!((val >= lim_list[dim - 1][gridPoint[dim_index2]]) && (val < lim_list[dim - 1][gridPoint[dim_index2] + 1]))) - in_leaf = false; - ++dim_index2; - } - // consider individuals only if all in - if (in_leaf) - individuals[tree_index][gridPoint] += 1; - } - } - } - dim_index++; - } - } - } - } - // update currently considered dimension size - --curr_max; - } - - // Rcout << std::endl; - // Rcout << std::endl; - // Rcout << std::endl; - // - // for(int i = 0; i tol(curr_family.size(), 1); - int iter; - - // iterate backwards through tree family - int tree_index_t = curr_family.size() - 1; - - for (auto tree_t = variables.rbegin(); tree_t != variables.rend(); ++tree_t) - { - iter = 0; - std::set curr_dims = *tree_t; - - - // do not purify null - if (curr_dims == std::set{0}) - continue; - // Rcout << std::endl << tree_index_t << " - T: "; - // Rcout << "tree_t:"; - // for(auto dim: curr_dims) Rcout << dim << ", "; - // Rcout << std::endl; - - auto grid = grids[tree_index_t]; - - // repeat until tolerance small enough and (?) maximum number of iterations reached - while ((tol[tree_index_t] > 0.00000000001) && (iter < 100)) - { - tol[tree_index_t] = 0; - - // go through feature dims - int curr_dim_index = 0; - for (const auto j : curr_dims) - { - - // get tree that has same variables as curr_tree minus j-variable - std::set tree_dims_minusj = curr_dims; - tree_dims_minusj.erase(tree_dims_minusj.find(j)); - int tree_index_minusj = 0; // if tree not exist, set to null tree - if (curr_family.find(tree_dims_minusj) != curr_family.end()) - tree_index_minusj = std::distance(std::begin(curr_family), curr_family.find(tree_dims_minusj)) ; - - // update values - if (grids[tree_index_t].dimensions.size() == 1) - { // one dimensional case (tree_t one dimensional) - - int sum_ind = 0; - std::vector avg(value_size, 0); - - // get sum of individuals - for (int i = 0; i < individuals[tree_index_t].n_entries; ++i) - { - std::vector tmp{i}; - sum_ind += individuals[tree_index_t][tmp]; - } - if (sum_ind == 0) - continue; - - // calc avg - for (int i = 0; i < individuals[tree_index_t].n_entries; ++i) - { - std::vector tmp{i}; - avg += (individuals[tree_index_t][tmp] * values[tree_index_t][tmp]) / sum_ind; - } - - // update values of one dimensional and null tree - for (int i = 0; i < values[tree_index_t].n_entries; ++i) - { - std::vector tmp{i}; - values[tree_index_t][tmp] -= avg; - } - std::vector tmp{0}; - values[tree_index_minusj][tmp] += avg; - } - else - { // higher dimensional case (tree_t dimension >1) - - // setup new grid without dimension j - std::vector new_dimensions = grids[tree_index_t].dimensions; - int index_j_in_t = std::distance(variables[tree_index_t].begin(), variables[tree_index_t].find(j)); - std::vector j_dim; - j_dim.push_back(new_dimensions[index_j_in_t]); - new_dimensions.erase(new_dimensions.begin() + index_j_in_t); - - grid::NDGrid grid_minusj = grid::NDGrid(new_dimensions); - grid::NDGrid grid_minusj2 = grid::NDGrid(grids[tree_index_minusj]); - - // go through values without dimension j - while (!grid_minusj.nextPoint()) - { - - std::vector update(value_size, 0); - auto gridPoint_minusj = grid_minusj.getPoint(); - // Rcout << " " << "j: "; - // for(auto p: gridPoint_j) Rcout << p << ", "; - // Rcout << std::endl; - // calc update - - int tree_index_j = std::distance(variables[tree_index_t].begin(), variables[tree_index_t].find(j)); - - - grid::NDGrid grid_j = grid::NDGrid(j_dim); - - double sum_ind = 0; - std::vector avg(value_size, 0); - - while (!grid_j.nextPoint()) - { - auto gridPoint_j = grid_j.getPoint(); - // Rcout << " " << "i: "; - // for(auto p: gridPoint_i) Rcout << p << ", "; - // Rcout << std::endl << " "; - std::vector gridPoint_t(tree_t->size(), 0); - - gridPoint_t[tree_index_j] = gridPoint_j[0]; - - for (size_t minusj = 0; minusj < gridPoint_minusj.size(); ++minusj) - { - auto minusj_dim = tree_dims_minusj.begin(); - std::advance(minusj_dim, minusj); - int minusj_index = std::distance(variables[tree_index_t].begin(), variables[tree_index_t].find(*minusj_dim)); - // Rcout << " i_dim=" << *i_dim << ", i_index=" << i_index; - gridPoint_t[minusj_index] = gridPoint_minusj[minusj]; - } - double sum_curr = individuals[tree_index_t][gridPoint_t]; - sum_ind += sum_curr; - avg += (sum_curr * values[tree_index_t][gridPoint_t][0]); - } - - if (sum_ind !=0) { - avg = avg/sum_ind; - } - else - { continue; - } - double avgtol = avg[0]; - tol[tree_index_t] = std::max(std::fabs(avgtol), tol[tree_index_t]); - values[tree_index_minusj][gridPoint_minusj] += avg; - - // update values for t-tree - grid_j = grid::NDGrid(j_dim); - - while (!grid_j.nextPoint()) - { - auto gridPoint_j = grid_j.getPoint(); - // Rcout << " " << "i: "; - // for(auto p: gridPoint_i) Rcout << p << ", "; - // Rcout << std::endl << " "; - std::vector gridPoint_t(tree_t->size(), 0); - - gridPoint_t[tree_index_j] = gridPoint_j[0]; - - for (size_t minusj = 0; minusj < gridPoint_minusj.size(); ++minusj) - { - auto minusj_dim = tree_dims_minusj.begin(); - std::advance(minusj_dim, minusj); - int minusj_index = std::distance(variables[tree_index_t].begin(), variables[tree_index_t].find(*minusj_dim)); - // Rcout << " i_dim=" << *i_dim << ", i_index=" << i_index; - gridPoint_t[minusj_index] = gridPoint_minusj[minusj]; - } - - values[tree_index_t][gridPoint_t] -= avg; - } - - - } // end of minusj loop - - } // end of loop for higher dim t case - ++curr_dim_index; - - } // end of j loop - - - ++iter; - } // finished tol loop - - --tree_index_t; - } - - - // ------------- attach to rpf class ------------- - - // fill with new trees - for (size_t tree_index = 0; tree_index < variables.size(); ++tree_index) - { - LeafGrid curr_gridLeaf; - curr_gridLeaf.grid = grids[tree_index]; - curr_gridLeaf.individuals = individuals[tree_index]; - curr_gridLeaf.lim_list = lim_list; - curr_gridLeaf.values = values[tree_index]; - curr_family[variables[tree_index]]->GridLeaves = curr_gridLeaf; - } - } - - purified = true; -} From 0a619abd08c3f64fd5e7fe8eda399e761a171f10 Mon Sep 17 00:00:00 2001 From: Jinyang Liu Date: Tue, 25 Jun 2024 15:56:20 +0200 Subject: [PATCH 11/16] Fix bug: do not stop if is not end leaf --- src/lib/rpf_purify_anova.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/lib/rpf_purify_anova.cpp b/src/lib/rpf_purify_anova.cpp index af6c125..5dfa6a0 100644 --- a/src/lib/rpf_purify_anova.cpp +++ b/src/lib/rpf_purify_anova.cpp @@ -427,16 +427,16 @@ void RandomPlantedForest::purify_2() int dim_idx = 0; for (int dim = 1; dim <= feature_size; ++dim) { const int gp = grid_point[dim_idx]; - if (gp >= lim_list[dim-1].size()-1){ - not_end_leaf = false; - break; - } double lower, upper; if (tree_dims.count(dim) == 0) { lower = lower_bounds[dim-1]; upper = upper_bounds[dim-1]; } else { + if (gp >= lim_list[dim-1].size()-1){ + not_end_leaf = false; + break; + } lower = lim_list[dim-1][gp]; upper = lim_list[dim-1][gp + 1]; ++dim_idx; From 28a639d1869bcdc0a859e4dc2623258b82eca111 Mon Sep 17 00:00:00 2001 From: Jinyang Liu Date: Wed, 26 Jun 2024 10:52:36 +0200 Subject: [PATCH 12/16] Rename purify anova --- src/include/rpf.hpp | 2 +- src/lib/rpf_purify_anova.cpp | 6 +++--- src/randomPlantedForest.cpp | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/include/rpf.hpp b/src/include/rpf.hpp index cc59ca4..6f8c93b 100644 --- a/src/include/rpf.hpp +++ b/src/include/rpf.hpp @@ -16,7 +16,7 @@ class RandomPlantedForest NumericMatrix predict_matrix(const NumericMatrix &X, const NumericVector components = {0}); NumericMatrix predict_vector(const NumericVector &X, const NumericVector components = {0}); void purify_1(); - void purify_2(); + void purify_anova(); void purify_22(); void purify_3(); void purify_3_new(); diff --git a/src/lib/rpf_purify_anova.cpp b/src/lib/rpf_purify_anova.cpp index 5dfa6a0..ab5ad13 100644 --- a/src/lib/rpf_purify_anova.cpp +++ b/src/lib/rpf_purify_anova.cpp @@ -1,6 +1,6 @@ #include "rpf.hpp" -void RandomPlantedForest::purify_2() +void RandomPlantedForest::purify_anova() { // go through all n_trees families @@ -426,13 +426,13 @@ void RandomPlantedForest::purify_2() // initialize interval with split interval int dim_idx = 0; for (int dim = 1; dim <= feature_size; ++dim) { - const int gp = grid_point[dim_idx]; - double lower, upper; + if (tree_dims.count(dim) == 0) { lower = lower_bounds[dim-1]; upper = upper_bounds[dim-1]; } else { + const int gp = grid_point[dim_idx]; if (gp >= lim_list[dim-1].size()-1){ not_end_leaf = false; break; diff --git a/src/randomPlantedForest.cpp b/src/randomPlantedForest.cpp index 68d54f2..fcc0289 100644 --- a/src/randomPlantedForest.cpp +++ b/src/randomPlantedForest.cpp @@ -17,7 +17,7 @@ RCPP_MODULE(mod_rpf) .method("purify", &RandomPlantedForest::purify_3) .method("purify3_new", &RandomPlantedForest::purify_3_new) .method("purify1", &RandomPlantedForest::purify_1) - .method("purify2", &RandomPlantedForest::purify_2) + .method("purify_anova", &RandomPlantedForest::purify_anova) .method("purify_new", &RandomPlantedForest::purify_no_extrapolation) .method("print", &RandomPlantedForest::print) .method("get_parameters", &RandomPlantedForest::get_parameters) From abf898411185d89fee5285cb5e24a16a49b7e1ed Mon Sep 17 00:00:00 2001 From: Jinyang Liu Date: Wed, 26 Jun 2024 10:53:26 +0200 Subject: [PATCH 13/16] Split predict up --- src/include/rpf.hpp | 2 + src/lib/rpf.cpp | 287 ++++++++++++++++++++++---------------------- 2 files changed, 148 insertions(+), 141 deletions(-) diff --git a/src/include/rpf.hpp b/src/include/rpf.hpp index 6f8c93b..15f8207 100644 --- a/src/include/rpf.hpp +++ b/src/include/rpf.hpp @@ -53,6 +53,8 @@ class RandomPlantedForest std::vector lower_bounds; std::vector tree_families; /**< random planted forest containing result */ std::vector predict_single(const std::vector &X, std::set component_index); + std::vector predict_single_grid(const std::vector &X, std::set component_index); + std::vector predict_single_no_grid(const std::vector &X, std::set component_index); void L2_loss(Split &split); virtual void fit(); virtual void create_tree_family(std::vector initial_leaves, size_t n); diff --git a/src/lib/rpf.cpp b/src/lib/rpf.cpp index 9835cfe..d322d1e 100644 --- a/src/lib/rpf.cpp +++ b/src/lib/rpf.cpp @@ -1,6 +1,5 @@ #include "rpf.hpp" - bool RandomPlantedForest::is_purified() { return purified; @@ -619,207 +618,213 @@ void RandomPlantedForest::cross_validation(int n_sets, IntegerVector splits, Num // predict single feature vector std::vector RandomPlantedForest::predict_single(const std::vector &X, std::set component_index) { + // const auto& res_grid = predict_single_grid(X, component_index); + // const auto& res_no_grid = predict_single_no_grid(X, component_index); + + return purified ? predict_single_grid(X, component_index) : predict_single_no_grid(X, component_index); + // return false ? res_grid: res_no_grid; +} +std::vector RandomPlantedForest::predict_single_grid(const std::vector &X, std::set component_index) +{ std::vector total_res = std::vector(value_size, 0); - if (!purified) + if (component_index == std::set{-1}) { - // consider all components - if (component_index == std::set{0}) + for (auto &tree_family : this->tree_families) { - for (auto &tree_family : this->tree_families) + for (auto &tree : tree_family) { - for (auto &tree : tree_family) + std::vector leaf_index(tree.first.size(), -1); + // add value of null tree + if (tree.first == std::set{0}) { - for (auto &leaf : tree.second->leaves) - { - bool valid = true; - for (auto &dim : tree.first) - { - if (!((leaf.intervals[std::max(0, dim - 1)].first <= X[std::max(0, dim - 1)] || leaf.intervals[std::max(0, dim - 1)].first == lower_bounds[std::max(0, dim - 1)]) - && (leaf.intervals[std::max(0, dim - 1)].second > X[std::max(0, dim - 1)] || leaf.intervals[std::max(0, dim - 1)].second == upper_bounds[std::max(0, dim - 1)]))) - { - valid = false; - } - } - if (valid) - { - // Rcout << leaf.value[0] << "\n"; - total_res += leaf.value; - } - } + // Rcout << tree.first.size() ; + leaf_index = std::vector(tree.first.size(), 0); + total_res += tree.second->GridLeaves.values[leaf_index]; } } } - else - { // choose components for prediction - for (auto &tree_family : this->tree_families) + } + else if (component_index == std::set{0}) + { + for (auto &tree_family : this->tree_families) + { + for (auto &tree : tree_family) { - for (auto &tree : tree_family) + std::vector leaf_index(tree.first.size(), -1); + + // add value of null tree + if (tree.first == std::set{0}) { - // only consider trees with same dimensions as component_index - if (tree.first != component_index) - continue; + // Rcout << tree.first.size() ; + leaf_index = std::vector(tree.first.size(), 0); + } + else + { - std::vector dims; - for (auto dim : tree.first) + // go through limits of grid + for (size_t dim_index = 0; dim_index < tree.first.size(); ++dim_index) { - dims.push_back(dim); - } + // get dim at dim_index + int dim = 0; + { + auto dim_pnt = tree.first.begin(); + std::advance(dim_pnt, dim_index); + dim = *dim_pnt; + --dim; // transform into index + } - for (auto &leaf : tree.second->leaves) - { - bool valid = true; - for (unsigned int i = 0; i < dims.size(); ++i) + auto bounds = tree.second->GridLeaves.lim_list[dim]; + for (double bound : bounds) { - int dim = dims[i]; + // check if sample in leaf at dimension + if (X[dim] < bound) + break; // changed - if (!((leaf.intervals[std::max(0, dim - 1)].first <= X[i] || leaf.intervals[std::max(0, dim - 1)].first == lower_bounds[std::max(0, dim - 1)]) && (leaf.intervals[std::max(0, dim - 1)].second > X[i] || leaf.intervals[std::max(0, dim - 1)].second == upper_bounds[std::max(0, dim - 1)]))) - { - valid = false; - } + // if no interval smaller, set to end of bounds, otherwise set to leaf index + leaf_index[dim_index] = std::min(leaf_index[dim_index] + 1, (int)bounds.size() - 2); } - if (valid) - total_res += leaf.value; } } + + // if interval of first leaf smaller smaller + for (int &index : leaf_index) + index = std::max(0, index); + + const std::vector to_add = tree.second->GridLeaves.values[leaf_index]; + // Rcout << "grid: " << to_add[0] << "\n"; + total_res += to_add; } } } else { - if (component_index == std::set{-1}) + + for (auto &tree_family : this->tree_families) { - for (auto &tree_family : this->tree_families) + for (auto &tree : tree_family) { - for (auto &tree : tree_family) - { - std::vector leaf_index(tree.first.size(), -1); - // add value of null tree - if (tree.first == std::set{0}) - { - // Rcout << tree.first.size() ; - leaf_index = std::vector(tree.first.size(), 0); - total_res += tree.second->GridLeaves.values[leaf_index]; - } + // only consider trees with same dimensions as component_index + if (tree.first != component_index) + continue; + + std::vector leaf_index(tree.first.size(), -1); + + // add value of null tree + if (tree.first == std::set{0}) + { + leaf_index = std::vector(tree.first.size(), 0); } - } - } - else if (component_index == std::set{0}) - { - for (auto &tree_family : this->tree_families) - { - for (auto &tree : tree_family) + else { - std::vector leaf_index(tree.first.size(), -1); - // add value of null tree - if (tree.first == std::set{0}) + // go through limits of grid + for (size_t dim_index = 0; dim_index < tree.first.size(); ++dim_index) { - - // Rcout << tree.first.size() ; - leaf_index = std::vector(tree.first.size(), 0); - } - else - { - - // go through limits of grid - for (size_t dim_index = 0; dim_index < tree.first.size(); ++dim_index) + // get dim at dim_index + int dim = 0; { - // get dim at dim_index - int dim = 0; - { - auto dim_pnt = tree.first.begin(); - std::advance(dim_pnt, dim_index); - dim = *dim_pnt; - --dim; // transform into index - } + auto dim_pnt = tree.first.begin(); + std::advance(dim_pnt, dim_index); + dim = *dim_pnt; + --dim; // transform into index + } - auto bounds = tree.second->GridLeaves.lim_list[dim]; - for (double bound : bounds) - { + auto bounds = tree.second->GridLeaves.lim_list[dim]; + for (double bound : bounds) + { - // check if sample in leaf at dimension - if (X[dim] < bound) - break; // changed + // check if sample in leaf at dimension + if (X[dim_index] < bound) + break; // changed - // if no interval smaller, set to end of bounds, otherwise set to leaf index - leaf_index[dim_index] = std::min(leaf_index[dim_index] + 1, (int)bounds.size() - 2); - } + // if no interval smaller, set to end of bounds, otherwise set to leaf index + leaf_index[dim_index] = std::min(leaf_index[dim_index] + 1, (int)bounds.size() - 2); } } + } - // if interval of first leaf smaller smaller - for (int &index : leaf_index) - index = std::max(0, index); + // if interval of first leaf smaller smaller + for (int &index : leaf_index) + index = std::max(0, index); - const auto to_add = tree.second->GridLeaves.values[leaf_index]; - total_res += to_add; - } + total_res += tree.second->GridLeaves.values[leaf_index]; } } - else - { + } + return total_res / n_trees; +} - for (auto &tree_family : this->tree_families) +std::vector RandomPlantedForest::predict_single_no_grid(const std::vector &X, std::set component_index) +{ + std::vector total_res = std::vector(value_size, 0); + + if (component_index == std::set{0}) + { + for (auto &tree_family : this->tree_families) + { + for (auto &tree : tree_family) { - for (auto &tree : tree_family) + for (auto &leaf : tree.second->leaves) { - - // only consider trees with same dimensions as component_index - if (tree.first != component_index) - continue; - - std::vector leaf_index(tree.first.size(), -1); - - // add value of null tree - if (tree.first == std::set{0}) + bool valid = true; + for (auto &dim : tree.first) { - leaf_index = std::vector(tree.first.size(), 0); + if (!((leaf.intervals[std::max(0, dim - 1)].first <= X[std::max(0, dim - 1)] || leaf.intervals[std::max(0, dim - 1)].first == lower_bounds[std::max(0, dim - 1)]) && (leaf.intervals[std::max(0, dim - 1)].second > X[std::max(0, dim - 1)] || leaf.intervals[std::max(0, dim - 1)].second == upper_bounds[std::max(0, dim - 1)]))) + { + valid = false; + } } - else + if (valid) { + // Rcout << "no grid: " << leaf.value[0] << "\n"; + total_res += leaf.value; + } + } + } + } + } + else + { // choose components for prediction + for (auto &tree_family : this->tree_families) + { + for (auto &tree : tree_family) + { - // go through limits of grid - for (size_t dim_index = 0; dim_index < tree.first.size(); ++dim_index) - { - // get dim at dim_index - int dim = 0; - { - auto dim_pnt = tree.first.begin(); - std::advance(dim_pnt, dim_index); - dim = *dim_pnt; - --dim; // transform into index - } + // only consider trees with same dimensions as component_index + if (tree.first != component_index) + continue; - auto bounds = tree.second->GridLeaves.lim_list[dim]; - for (double bound : bounds) - { + std::vector dims; + for (auto dim : tree.first) + { + dims.push_back(dim); + } - // check if sample in leaf at dimension - if (X[dim_index] < bound) - break; // changed + for (auto &leaf : tree.second->leaves) + { + bool valid = true; + for (unsigned int i = 0; i < dims.size(); ++i) + { - // if no interval smaller, set to end of bounds, otherwise set to leaf index - leaf_index[dim_index] = std::min(leaf_index[dim_index] + 1, (int)bounds.size() - 2); - } + int dim = dims[i]; + + if (!((leaf.intervals[std::max(0, dim - 1)].first <= X[i] || leaf.intervals[std::max(0, dim - 1)].first == lower_bounds[std::max(0, dim - 1)]) && (leaf.intervals[std::max(0, dim - 1)].second > X[i] || leaf.intervals[std::max(0, dim - 1)].second == upper_bounds[std::max(0, dim - 1)]))) + { + valid = false; } } - - // if interval of first leaf smaller smaller - for (int &index : leaf_index) - index = std::max(0, index); - - total_res += tree.second->GridLeaves.values[leaf_index]; + if (valid) + total_res += leaf.value; } } } } - return total_res / n_trees; } From 50f9b321bc9af144f7b2b61b84fdea8421f41dbb Mon Sep 17 00:00:00 2001 From: Jinyang Liu Date: Wed, 26 Jun 2024 10:54:52 +0200 Subject: [PATCH 14/16] Add purify within bounds from existing grid --- src/include/rpf.hpp | 1 + src/lib/rpf_purify_no_extrapolation_grid.cpp | 384 +++++++++++++++++++ 2 files changed, 385 insertions(+) create mode 100644 src/lib/rpf_purify_no_extrapolation_grid.cpp diff --git a/src/include/rpf.hpp b/src/include/rpf.hpp index 15f8207..9647c6d 100644 --- a/src/include/rpf.hpp +++ b/src/include/rpf.hpp @@ -21,6 +21,7 @@ class RandomPlantedForest void purify_3(); void purify_3_new(); void purify_no_extrapolation(); + void purify_no_extrapolation_existing_grid(); void print(); void cross_validation(int n_sets = 4, IntegerVector splits = {5, 50}, NumericVector t_tries = {0.2, 0.5, 0.7, 0.9}, IntegerVector split_tries = {1, 2, 5, 10}); double MSE(const NumericMatrix &Y_predicted, const NumericMatrix &Y_true); diff --git a/src/lib/rpf_purify_no_extrapolation_grid.cpp b/src/lib/rpf_purify_no_extrapolation_grid.cpp new file mode 100644 index 0000000..ac981ce --- /dev/null +++ b/src/lib/rpf_purify_no_extrapolation_grid.cpp @@ -0,0 +1,384 @@ +#include "rpf.hpp" + +void RandomPlantedForest::purify_no_extrapolation_existing_grid() +{ + + // go through all n_trees families + for (auto &curr_family : this->tree_families) + { + + // lim_list is a list giving for each variable all interval end-points + std::vector> lim_list = get_lim_list(curr_family); + + // initialize values and individuals for each tree in family + std::vector grids; + std::vector> individuals; + std::vector>> values; + std::vector>> values_old; + std::vector> variables; + + // ------------- setup finer grid ------------- + + int tree_index = 0; + for (const auto &curr_tree : curr_family) + { + grids.push_back(curr_tree.second->GridLeaves.grid); + individuals.push_back(curr_tree.second->GridLeaves.individuals); + values.push_back(curr_tree.second->GridLeaves.values); + values_old.push_back(curr_tree.second->GridLeaves.values); + variables.push_back(curr_tree.first); + ++tree_index; + } + + + + // recap maximum number of dimensions of current family + unsigned int curr_max = curr_family.rbegin()->first.size(); + + // ------------- purify ------------- + // iterate backwards through tree family + int tree_index_t = curr_family.size() - 1; + for (auto tree_t = variables.rbegin(); tree_t != variables.rend(); ++tree_t) + { + std::set curr_dims = *tree_t; + // do not purify null + if (curr_dims == std::set{0}) + continue; + // Rcout << std::endl << tree_index_t << " - T: "; + // Rcout << "tree_t:"; + // for(auto dim: curr_dims) Rcout << dim << ", "; + // Rcout << std::endl; + + auto grid = grids[tree_index_t]; + // Rcout << "Grid dimensions of T: "; + // for(auto dim: grid.dimensions) Rcout << dim << ", "; + // Rcout << std::endl; + // go through subtrees of t + int tree_index_u = variables.size(); + for (auto tree_u = variables.rbegin(); tree_u != variables.rend(); ++tree_u) + { + --tree_index_u; + // j_dims = dims of t without u + std::set j_dims = curr_dims; + if (tree_u->size() > curr_dims.size()) + continue; + // check if subset + bool subset = true; + for (const auto dim : *tree_u) + { + if (tree_t->count(dim) == 0) + { + subset = false; + break; + } + j_dims.erase(dim); + } + if (!subset) + continue; + + // Rcout << "Hello"; + // Rcout << " " << tree_index_u << " - U: "; + // for(auto dim: *tree_u) Rcout << dim << ", "; + // Rcout << std::endl; + // Rcout << " Individuals: "; + + double tot_sum = 0; + grid = grids[tree_index_u]; + // while (!grid.nextPoint()) + // { + // auto gridPoint = grid.getPoint(); + // // Rcout << individuals[tree_index_u][gridPoint] << ", "; + // tot_sum += individuals[tree_index_u][gridPoint]; + // } + // Rcout << "Total sum: " << tot_sum << std::endl; + // Rcout << std::endl; + + // Rcout << " Grid dimensions of U: "; + // for(auto dim: grid.dimensions) Rcout << dim << ", "; + // Rcout << std::endl; + + // Rcout<< "j_dims: "< update(value_size, 0); + + if (j_dims.size() == 0) + { + // grid = grids[tree_index_u]; + while (!grid.nextPoint()) + { + auto gridPoint_i = grid.getPoint(); + // Rcout << " " << "i: "; + // for(auto p: gridPoint_i) Rcout << p << ", "; + // Rcout << std::endl << " "; + double curr_sum = individuals[tree_index_u][gridPoint_i]; + // Rcout << ", Current Sum: " << curr_sum << std::endl; + // Rcout << std::endl << " " << "i, j: "; + update += curr_sum * values_old[tree_index_t][gridPoint_i]; + tot_sum += curr_sum; + // Rcout << std::endl; + } + update /= tot_sum; + + int tree_index_s = variables.size(); + for (auto tree_s = variables.rbegin(); tree_s != variables.rend(); ++tree_s) + { + + // Rcout << "tree_s:"; + // for(auto dim: *tree_s) Rcout << dim << ", "; + // Rcout << std::endl; + + --tree_index_s; + if (*tree_s == std::set{0}) + { + + auto gridPoint_0 = std::vector{0}; + values[tree_index_s][gridPoint_0] += update; + // Rcout << std::endl; + //} + + /* + for(auto tree_0: curr_family){ + + if(tree_0.first == std::set{0}){ + + Rcout << tree_0.first.size(); + std::vector leaf_index(tree_0.first.size(), 0); + std::vector leaf_index(tree_0.second->GridLeaves.values.size(), 0); + + int Test = tree_0.second->GridLeaves.values.size(); + Rcout << Test; + tree_0.second->GridLeaves.values[leaf_index] += update; + } + } + */ + } + else + { + + // check if S subset of T + + bool subset = true; + for (const auto dim : *tree_s) + { + if (tree_t->count(dim) == 0) + { + subset = false; + break; + } + } + if (!subset) + continue; + + // Rcout << pow(-1, (*tree_s).size()) << std::endl; + + auto grid_k = grids[tree_index_s]; + while (!grid_k.nextPoint()) + { + auto gridPoint_k = grid_k.getPoint(); + // + // if((*tree_s).size()>2){ + // Rcout << std::endl << " " << "j, k: "; + // for(auto p: gridPoint_k) Rcout << p << ", "; + // Rcout << std::endl; + // } + // + // Rcout << pow(-1, (*tree_s).size()) * update << std::endl; + values[tree_index_s][gridPoint_k] += pow(-1, (*tree_s).size()) * update; + } + } + } + // Rcout << std::endl; + } + else + { + + std::vector j_sizes(j_dims.size(), 0); + for (size_t j = 0; j < j_dims.size(); ++j) + { + auto tmp = j_dims.begin(); + std::advance(tmp, j); + int j_index = std::distance(variables[tree_index_t].begin(), variables[tree_index_t].find(*tmp)); + j_sizes[j] = grids[tree_index_t].dimensions[j_index]; + } + + // Rcout<<"Hello 1"; + + grid::NDGrid grid_j = grid::NDGrid(j_sizes); + while (!grid_j.nextPoint()) // looping over every x_{i, T\U} essentially + { + + std::vector update(value_size, 0); + auto gridPoint_j = grid_j.getPoint(); + // Rcout << " " << "j: "; + // for(auto p: gridPoint_j) Rcout << p << ", "; + // Rcout << std::endl; + // calc update + grid = grids[tree_index_u]; + while (!grid.nextPoint()) + { + auto gridPoint_i = grid.getPoint(); + // Rcout << " " << "i: "; + // for(auto p: gridPoint_i) Rcout << p << ", "; + // Rcout << std::endl << " "; + double curr_sum = individuals[tree_index_u][gridPoint_i]; + // Rcout << ", Current Sum: " << curr_sum << std::endl; + std::vector gridPoint_ij(tree_t->size(), 0); + for (size_t j = 0; j < gridPoint_j.size(); ++j) + { + auto j_dim = j_dims.begin(); // to keep: T\U + std::advance(j_dim, j); + int j_index = std::distance(variables[tree_index_t].begin(), variables[tree_index_t].find(*j_dim)); + // Rcout << " j_dim=" << *j_dim << ", j_index=" << j_index; + gridPoint_ij[j_index] = gridPoint_j[j]; + } + for (size_t i = 0; i < gridPoint_i.size(); ++i) + { + auto i_dim = tree_u->begin(); // to marginalize: U + std::advance(i_dim, i); + int i_index = std::distance(variables[tree_index_t].begin(), variables[tree_index_t].find(*i_dim)); + // Rcout << " i_dim=" << *i_dim << ", i_index=" << i_index; + gridPoint_ij[i_index] = gridPoint_i[i]; + } + + if (individuals[tree_index_t][gridPoint_ij] == 0) { + continue; // Skip non-support areas + } + // Rcout << std::endl << " " << "i, j: "; + // for(auto p: gridPoint_ij) Rcout << p << ", "; + // Rcout << std::endl; + update += curr_sum * values_old[tree_index_t][gridPoint_ij]; + tot_sum += curr_sum; + // Rcout << std::endl; + } + update /= tot_sum; + + // Rcout << "Hello_2"; + // update trees + int tree_index_s = variables.size(); + for (auto tree_s = variables.rbegin(); tree_s != variables.rend(); ++tree_s) + { + --tree_index_s; + // check if T\U=j_dims subset of S and S subset of T + bool subset = true; + for (const auto dim : j_dims) + { + if (tree_s->count(dim) == 0) + { + subset = false; + break; + } + } + for (const auto dim : *tree_s) + { + if (tree_t->count(dim) == 0) + { + subset = false; + break; + } + } + if (!subset) + continue; + // Rcout << " " << "S: "; + // for(auto dim: *tree_s) Rcout << dim << ", "; + // Rcout << std::endl; + // S cap U + std::set k_dims = *tree_s; + std::set k_dims_h1 = *tree_s; + std::set k_dims_h2 = *tree_u; + for (const auto dim : *tree_u) + k_dims.insert(dim); + for (const auto dim : *tree_s) + k_dims_h2.erase(dim); + for (const auto dim : *tree_u) + k_dims_h1.erase(dim); + for (const auto dim : k_dims_h1) + k_dims.erase(dim); + for (const auto dim : k_dims_h2) + k_dims.erase(dim); + + // std::set k_dims = *tree_s; + // for(const auto dim: *tree_t) k_dims.erase(dim); + // for(const auto dim: *tree_u) k_dims.insert(dim); + + // Rcout << " " << "k_dims: "; + // for(auto dim: k_dims) Rcout << dim << ", "; + // Rcout << std::endl; + + if (k_dims.size() == 0) + { + + values[tree_index_s][gridPoint_j] += pow(-1, (*tree_s).size() - j_dims.size()) * update; + } + else + { + + // Rcout <<"k_dims :"; + // for(auto dim: k_dims) Rcout << dim << ", "; + // Rcout << std::endl; + + std::vector k_sizes(k_dims.size(), 0); + for (size_t k = 0; k < k_dims.size(); ++k) + { + auto tmp = k_dims.begin(); + std::advance(tmp, k); + int k_index = std::distance(variables[tree_index_t].begin(), variables[tree_index_t].find(*tmp)); + k_sizes[k] = grids[tree_index_t].dimensions[k_index]; + } + // Rcout << " " << "k_sizes: "; + // for(auto dim: k_sizes) Rcout << dim << ", "; + // Rcout << std::endl; + grid::NDGrid grid_k = grid::NDGrid(k_sizes); + while (!grid_k.nextPoint()) + { + auto gridPoint_k = grid_k.getPoint(); + // Rcout << " " << "k: "; + // for(auto p: gridPoint_k) Rcout << p << ", "; + // Rcout << std::endl << " "; + std::vector gridPoint_jk(tree_s->size(), 0); + for (size_t j = 0; j < gridPoint_j.size(); ++j) + { + auto j_dim = j_dims.begin(); + std::advance(j_dim, j); + int j_index = std::distance(variables[tree_index_s].begin(), variables[tree_index_s].find(*j_dim)); + // Rcout << " j_dim=" << *j_dim << ", j_index=" << j_index; + gridPoint_jk[j_index] = gridPoint_j[j]; + } + for (size_t k = 0; k < gridPoint_k.size(); ++k) + { + auto k_dim = k_dims.begin(); + std::advance(k_dim, k); + int k_index = std::distance(variables[tree_index_s].begin(), variables[tree_index_s].find(*k_dim)); + // Rcout << " k_dim=" << *k_dim << ", k_index=" << k_index; + gridPoint_jk[k_index] = gridPoint_k[k]; + } + // Rcout << std::endl << " " << "j, k: "; + // for(auto p: gridPoint_jk) Rcout << p << ", "; + // Rcout << std::endl; + + // Rcout << pow(-1, (*tree_s).size() - j_dims.size()) * update[0]; + values[tree_index_s][gridPoint_jk] += pow(-1, (*tree_s).size() - j_dims.size()) * update; + } + } + } + } + } + } + --tree_index_t; + } + + // ------------- attach to rpf class ------------- + + // fill with new trees + for (size_t tree_index = 0; tree_index < variables.size(); ++tree_index) + { + LeafGrid curr_gridLeaf; + curr_gridLeaf.grid = grids[tree_index]; + curr_gridLeaf.individuals = individuals[tree_index]; + curr_gridLeaf.lim_list = lim_list; + curr_gridLeaf.values = values[tree_index]; + curr_family[variables[tree_index]]->GridLeaves = curr_gridLeaf; + } + } + + purified = true; +} From bdd0a3c4d9d524272af9878f9ee80b72a98e084c Mon Sep 17 00:00:00 2001 From: Jinyang Liu Date: Wed, 26 Jun 2024 11:02:37 +0200 Subject: [PATCH 15/16] Add helper function for purify ANOVA + marginalize in bounds --- src/Makevars | 4 +++- src/include/rpf.hpp | 1 + src/lib/rpf_purify.cpp | 5 +++++ src/randomPlantedForest.cpp | 3 ++- 4 files changed, 11 insertions(+), 2 deletions(-) diff --git a/src/Makevars b/src/Makevars index 17a0e98..fb89948 100644 --- a/src/Makevars +++ b/src/Makevars @@ -1,4 +1,6 @@ -SOURCES=lib/cpf.cpp lib/grid.cpp lib/helper.cpp lib/rpf.cpp lib/trees.cpp lib/rpf_purify_no_extrapolation.cpp lib/rpf_purify_anova.cpp lib/rpf_purify.cpp randomPlantedForest.cpp RcppExports.cpp +SOURCES=lib/cpf.cpp lib/grid.cpp lib/helper.cpp lib/rpf.cpp lib/trees.cpp \ + lib/rpf_purify_no_extrapolation.cpp lib/rpf_purify_no_extrapolation_grid.cpp \ + lib/rpf_purify_anova.cpp lib/rpf_purify.cpp randomPlantedForest.cpp RcppExports.cpp OBJECTS = $(SOURCES:.cpp=.o) diff --git a/src/include/rpf.hpp b/src/include/rpf.hpp index 9647c6d..e608865 100644 --- a/src/include/rpf.hpp +++ b/src/include/rpf.hpp @@ -20,6 +20,7 @@ class RandomPlantedForest void purify_22(); void purify_3(); void purify_3_new(); + void purify_anova_and_marginalize(); void purify_no_extrapolation(); void purify_no_extrapolation_existing_grid(); void print(); diff --git a/src/lib/rpf_purify.cpp b/src/lib/rpf_purify.cpp index 6411021..63efe0a 100644 --- a/src/lib/rpf_purify.cpp +++ b/src/lib/rpf_purify.cpp @@ -1235,3 +1235,8 @@ void RandomPlantedForest::purify_3_new() purified = true; } + +void RandomPlantedForest::purify_anova_and_marginalize() { + purify_anova(); + purify_no_extrapolation_existing_grid(); +} \ No newline at end of file diff --git a/src/randomPlantedForest.cpp b/src/randomPlantedForest.cpp index fcc0289..6904edf 100644 --- a/src/randomPlantedForest.cpp +++ b/src/randomPlantedForest.cpp @@ -18,7 +18,8 @@ RCPP_MODULE(mod_rpf) .method("purify3_new", &RandomPlantedForest::purify_3_new) .method("purify1", &RandomPlantedForest::purify_1) .method("purify_anova", &RandomPlantedForest::purify_anova) - .method("purify_new", &RandomPlantedForest::purify_no_extrapolation) + .method("purify_anova_and_marginalize", &RandomPlantedForest::purify_anova_and_marginalize) + .method("purify_no_extrapolation", &RandomPlantedForest::purify_no_extrapolation) .method("print", &RandomPlantedForest::print) .method("get_parameters", &RandomPlantedForest::get_parameters) .method("set_parameters", &RandomPlantedForest::set_parameters) From 9fa3c33878c6012a9154d1ee222c94bd305ff422 Mon Sep 17 00:00:00 2001 From: Jinyang Liu Date: Wed, 26 Jun 2024 13:02:48 +0200 Subject: [PATCH 16/16] Add method for purify with existing grid --- src/randomPlantedForest.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/randomPlantedForest.cpp b/src/randomPlantedForest.cpp index 6904edf..5bb8388 100644 --- a/src/randomPlantedForest.cpp +++ b/src/randomPlantedForest.cpp @@ -20,6 +20,7 @@ RCPP_MODULE(mod_rpf) .method("purify_anova", &RandomPlantedForest::purify_anova) .method("purify_anova_and_marginalize", &RandomPlantedForest::purify_anova_and_marginalize) .method("purify_no_extrapolation", &RandomPlantedForest::purify_no_extrapolation) + .method("purify_no_extrapolation_existing_grid", &RandomPlantedForest::purify_no_extrapolation_existing_grid) .method("print", &RandomPlantedForest::print) .method("get_parameters", &RandomPlantedForest::get_parameters) .method("set_parameters", &RandomPlantedForest::set_parameters)