Evovest
diff --git a/‎Project.toml‎
Lines changed: 1 addition & 1 deletion b/‎Project.toml‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎benchmarks/YEAR-bench.jl‎ ‎benchmarks/YEAR-regressor.jl‎benchmarks/YEAR-bench.jl renamed to benchmarks/YEAR-regressor.jl
Lines changed: 23 additions & 32 deletions b/‎benchmarks/YEAR-bench.jl‎ ‎benchmarks/YEAR-regressor.jl‎benchmarks/YEAR-bench.jl renamed to benchmarks/YEAR-regressor.jl
Lines changed: 23 additions & 32 deletions
diff --git a/‎benchmarks/boston.jl‎
Lines changed: 43 additions & 0 deletions b/‎benchmarks/boston.jl‎
Lines changed: 43 additions & 0 deletions
diff --git a/‎benchmarks/results/regressor-cpu.csv‎
Lines changed: 12 additions & 12 deletions b/‎benchmarks/results/regressor-cpu.csv‎
Lines changed: 12 additions & 12 deletions
diff --git a/‎benchmarks/results/regressor-gpu.csv‎
Lines changed: 12 additions & 12 deletions b/‎benchmarks/results/regressor-gpu.csv‎
Lines changed: 12 additions & 12 deletions
diff --git a/‎blog/cred/.gitignore‎
Lines changed: 1 addition & 0 deletions b/‎blog/cred/.gitignore‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎blog/cred/README.jl‎
Lines changed: 182 additions & 0 deletions b/‎blog/cred/README.jl‎
Lines changed: 182 additions & 0 deletions
@@ -1,7 +1,7 @@
 name = "EvoTrees"
 uuid = "f6006082-12f8-11e9-0c9c-0d5d367ab1e5"
 authors = ["jeremiedb <[email protected]>"]
-version = "0.17.2"
+version = "0.18.0"
 
 [deps]
 BSON = "fbb218c0-5317-5bc6-957e-2ee96dd4b1f0"
 
@@ -1,15 +1,15 @@
-using Revise
 using CSV
 using DataFrames
-using EvoTrees
 using StatsBase: sample, tiedrank
 using Statistics
 using Random: seed!
+using EvoTrees
+using EvoTrees: fit
 
 using AWS: AWSCredentials, AWSConfig, @service
 @service S3
 aws_creds = AWSCredentials(ENV["AWS_ACCESS_KEY_ID_JDB"], ENV["AWS_SECRET_ACCESS_KEY_JDB"])
-aws_config = AWSConfig(; creds = aws_creds, region = "ca-central-1")
+aws_config = AWSConfig(; creds=aws_creds, region="ca-central-1")
 
 path = "share/data/year/year.csv"
 raw = S3.get_object(
@@ -18,7 +18,7 @@ raw = S3.get_object(
     Dict("response-content-type" => "application/octet-stream");
     aws_config,
 )
-df = DataFrame(CSV.File(raw, header = false))
+df = DataFrame(CSV.File(raw, header=false))
 
 path = "share/data/year/year-train-idx.txt"
 raw = S3.get_object(
@@ -27,7 +27,7 @@ raw = S3.get_object(
     Dict("response-content-type" => "application/octet-stream");
     aws_config,
 )
-train_idx = DataFrame(CSV.File(raw, header = false))[:, 1] .+ 1
+train_idx = DataFrame(CSV.File(raw, header=false))[:, 1] .+ 1
 
 path = "share/data/year/year-eval-idx.txt"
 raw = S3.get_object(
@@ -36,50 +36,41 @@ raw = S3.get_object(
     Dict("response-content-type" => "application/octet-stream");
     aws_config,
 )
-eval_idx = DataFrame(CSV.File(raw, header = false))[:, 1] .+ 1
+eval_idx = DataFrame(CSV.File(raw, header=false))[:, 1] .+ 1
 
 X = df[:, 2:end]
 Y_raw = Float64.(df[:, 1])
 Y = (Y_raw .- mean(Y_raw)) ./ std(Y_raw)
 
-function percent_rank(x::AbstractVector{T}) where {T}
-    return tiedrank(x) / (length(x) + 1)
-end
-
-transform!(X, names(X) .=> percent_rank .=> names(X))
-X = collect(Matrix{Float32}(X))
-Y = Float32.(Y)
-
 x_tot, y_tot = X[1:(end-51630), :], Y[1:(end-51630)]
-x_test, y_test = X[(end-51630+1):end, :], Y[(end-51630+1):end]
-x_train, x_eval = x_tot[train_idx, :], x_tot[eval_idx, :]
+x_test, y_test = Matrix(X[(end-51630+1):end, :]), Y[(end-51630+1):end]
+x_train, x_eval = Matrix(x_tot[train_idx, :]), Matrix(x_tot[eval_idx, :])
 y_train, y_eval = y_tot[train_idx], y_tot[eval_idx]
 
 config = EvoTreeRegressor(
-    T = Float32,
-    nrounds = 1200,
-    loss = :linear,
-    eta = 0.1,
-    nbins = 128,
-    min_weight = 4,
-    max_depth = 7,
-    lambda = 0,
-    gamma = 0,
-    rowsample = 0.8,
-    colsample=0.8,
+    nrounds=3000,
+    loss=:cred_std,
+    metric=:mse,
+    eta=0.1,
+    nbins=32,
+    min_weight=1,
+    max_depth=7,
+    lambda=0,
+    L2=0,
+    gamma=0,
+    rowsample=0.5,
+    colsample=0.9,
+    early_stopping_rounds=50,
 )
 
 # @time m = fit_evotree(config; x_train, y_train, print_every_n=25);
-@time m, logger = fit_evotree(
+@time m = fit(
     config;
     x_train,
     y_train,
     x_eval,
     y_eval,
-    early_stopping_rounds = 100,
-    print_every_n = 10,
-    metric = :mse,
-    return_logger = true,
+    print_every_n=100,
 );
 p_evo = m(x_test);
 mean((p_evo .- y_test) .^ 2) * std(Y_raw)^2
@@ -0,0 +1,43 @@
+
+using EvoTrees
+using MLDatasets
+using DataFrames
+using Statistics: mean
+using CategoricalArrays
+using Random
+
+df = MLDatasets.BostonHousing().dataframe
+Random.seed!(123)
+
+train_ratio = 0.8
+train_indices = randperm(nrow(df))[1:Int(round(train_ratio * nrow(df)))]
+
+train_data = df[train_indices, :]
+eval_data = df[setdiff(1:nrow(df), train_indices), :]
+
+x_train, y_train = Matrix(train_data[:, Not(:MEDV)]), train_data[:, :MEDV]
+x_eval, y_eval = Matrix(eval_data[:, Not(:MEDV)]), eval_data[:, :MEDV]
+
+config = EvoTreeRegressor(
+    loss=:mse,
+    metric=:mse,
+    nrounds=1,
+    early_stopping_rounds=10,
+    eta=0.1,
+    max_depth=2,
+    lambda=0.0,
+    L2=0.0,
+    rowsample=0.9,
+    colsample=0.9)
+
+model_mse = EvoTrees.fit(config;  # fit on the train split; the eval split is passed for metric tracking
+    x_train, y_train,
+    x_eval, y_eval,
+    print_every_n=1)
+
+pred_train = model_mse(x_train)  # the fitted model is bound to `model_mse` and is called on a feature matrix
+pred_eval = model_mse(x_eval)
+
+mean(abs.(pred_train .- y_train))  # mean absolute error on the train split
+mean(abs.(pred_eval .- y_eval))  # mean absolute error on the eval split
+
@@ -1,13 +1,13 @@
 device,nobs,nfeats,max_depth,train_evo,train_xgb,infer_evo,infer_xgb
-cpu,100000,10,6,0.339105894,0.642794662,0.045756982,0.02628743
-cpu,100000,10,11,1.279537507,1.073844805,0.085824892,0.06088243
-cpu,100000,100,6,0.825591006,1.521080299,0.068610545,0.14405631
-cpu,100000,100,11,4.875921204,3.93826447,0.11966227,0.168092254
-cpu,1000000,10,6,2.310057563,6.782713955,0.295144245,0.283113086
-cpu,1000000,10,11,5.079728577,8.015394605,0.802753618,0.60897193
-cpu,1000000,100,6,5.724386557,13.513077202,0.688739903,1.272025185
-cpu,1000000,100,11,18.003480355,21.454809233,1.247011838,1.657717943
-cpu,10000000,10,6,27.055199606,85.252937661,2.921450187,2.888122252
-cpu,10000000,10,11,52.143569851,111.505335039,6.18255143,6.202632593
-cpu,10000000,100,6,83.326695985,144.605970885,6.047807335,14.620566726
-cpu,10000000,100,11,194.955017106,182.237153757,11.50293827,17.660819455
+cpu,100000,10,6,0.330517856,0.62067627,0.045004986,0.044798794
+cpu,100000,10,11,1.337956436,1.105285991,0.086839406,0.061570974
+cpu,100000,100,6,0.828781594,1.363081129,0.106595691,0.119703591
+cpu,100000,100,11,4.941785747,3.435747012,0.122107048,0.166959767
+cpu,1000000,10,6,2.314222299,6.57856163,0.3913027,0.364978734
+cpu,1000000,10,11,5.170780341,8.45243535,0.611472906,0.612859723
+cpu,1000000,100,6,5.6716359,14.418231971,0.721386688,1.295978265
+cpu,1000000,100,11,18.040254949,18.531543442,1.360270762,1.75281179
+cpu,10000000,10,6,25.582933653,78.774728198,2.748972478,2.744420644
+cpu,10000000,10,11,51.265034576,112.372616748,6.100088688,6.337031412
+cpu,10000000,100,6,81.37971803,146.266650929,5.952960637,14.103855381
+cpu,10000000,100,11,190.794016029,189.25733363,11.847299848,18.792787023
@@ -1,13 +1,13 @@
 device,nobs,nfeats,max_depth,train_evo,train_xgb,infer_evo,infer_xgb
-gpu,100000,10,6,1.270319267,0.285433309,0.010683488,0.012045765
-gpu,100000,10,11,15.401192763,1.308309359,0.011901549,0.016735511
-gpu,100000,100,6,1.656181006,0.617481512,0.034038548,0.106893283
-gpu,100000,100,11,19.647128327,3.209314346,0.038993915,0.162476102
-gpu,1000000,10,6,2.033918292,0.955982504,0.051210817,0.131093957
-gpu,1000000,10,11,23.490242119,2.714125028,0.059398531,0.144796451
-gpu,1000000,100,6,3.424353046,2.866238074,0.307580376,1.342028138
-gpu,1000000,100,11,30.398456011,7.88853843,0.352188155,1.651248449
-gpu,10000000,10,6,7.552837802,7.424535739,0.457648127,1.604205225
-gpu,10000000,10,11,39.834112089,13.51496456,0.577825194,1.763046
-gpu,10000000,100,6,21.76585393,28.380947932,3.282138258,14.587210604
-gpu,10000000,100,11,66.83786749,53.762559553,3.620799047,17.43789161
+gpu,100000,10,6,1.262735022,0.319433525,0.010457942,0.012211455
+gpu,100000,10,11,15.66936406,1.494096649,0.01337129,0.017512698
+gpu,100000,100,6,1.756429163,0.675154915,0.034648695,0.148641248
+gpu,100000,100,11,20.447355358,3.821046349,0.038901134,0.162287052
+gpu,1000000,10,6,2.215962749,1.049401344,0.05478112,0.134286529
+gpu,1000000,10,11,24.254557497,3.112351903,0.061374392,0.157115342
+gpu,1000000,100,6,3.635739525,3.228633649,0.307356514,1.361765356
+gpu,1000000,100,11,31.102936915,8.530664774,0.312302753,1.61460587
+gpu,10000000,10,6,8.384827155,7.755134961,0.457555156,1.626715379
+gpu,10000000,10,11,42.062736926,13.615783394,0.58097717,1.723417395
+gpu,10000000,100,6,21.687369289,28.868658021,3.237098864,14.680655122
+gpu,10000000,100,11,68.618695095,57.845418449,3.440632989,16.870168538
@@ -0,0 +1 @@
+# assets/
@@ -0,0 +1,182 @@
+# # Exploring a credibility-based approach for tree-gain estimation
+
+include(joinpath(@__DIR__, "utils.jl")); #hide
+
+#=
+> The motivation for this experiment was to explore an alternative to the gradient-based gain measure by integrating the volatility of split candidates to identify the best node split.
+=#
+
+#=
+
+## Review of key gradient-based MSE characteristics
+
+The figures below illustrate the behavior of the vanilla gradient-based approach using a mean-squared error (MSE) loss.
+The 2 colors represent the observations belonging to the left and right children.
+
+Key observations:
+- **the gain is invariant to the volatility**: the top vs bottom figures differ only by the std dev of the observations.
+    The associated gain is identical, which is aligned with the gradient-based approach to gain: the gain matches the reduction in the MSE, which is identical regardless of the dispersion. It's strictly driven by their mean.
+- **the gain scales linearly with the number of observations**: the left vs right figures contrast different numbers of observations (100 vs 1k), and show that the gain is directly proportional.
+- **the gain scales quadratically with the spread**: moving from a spread of 1.0 to 0.1 between the 2nd and 3rd row results in a drop by 100x of the gain: from 50.0 to 0.5.
+=#
+
+loss = :mse#hide
+f = get_dist_figure(; loss, nobs=100, spread=1.0, sd=1.0)#hide
+save(joinpath(@__DIR__, "assets", "dist-mse-1A.png"), f)#hide
+f = get_dist_figure(; loss, nobs=1_000, spread=1.0, sd=1.0)#hide
+save(joinpath(@__DIR__, "assets", "dist-mse-1B.png"), f)#hide
+f = get_dist_figure(; loss, nobs=100, spread=1.0, sd=0.1)#hide
+save(joinpath(@__DIR__, "assets", "dist-mse-2A.png"), f)#hide
+f = get_dist_figure(; loss, nobs=1_000, spread=1.0, sd=0.1)#hide
+save(joinpath(@__DIR__, "assets", "dist-mse-2B.png"), f);#hide
+f = get_dist_figure(; loss, nobs=100, spread=0.1, sd=0.1)#hide
+save(joinpath(@__DIR__, "assets", "dist-mse-3A.png"), f)#hide
+f = get_dist_figure(; loss, nobs=1_000, spread=0.1, sd=0.1)#hide
+save(joinpath(@__DIR__, "assets", "dist-mse-3B.png"), f);#hide
+
+#=
+| ![](assets/dist-mse-1A.png) | ![](assets/dist-mse-1B.png) |
+|:----------------------:|:----------------------:|
+| ![](assets/dist-mse-2A.png) | ![](assets/dist-mse-3A.png) |
+=#
+
+#=
+## Credibility-based gains
+=#
+
+#=
+The idea is for *gain* to reflect varying uncertainty levels for observations associated to each of the tree-split candidates. 
+For tree-split candidates with an identical spread, the intuition is that candidates with a lower volatility, all other things being equal, should be preferred.
+The original inspiration comes from credibility theory, a foundational notion in actuarial science with direct connections to mixed-effects models and Bayesian theory.
+The key concept is that the credibility associated with a set of observations is driven by the relative effect of 2 components:
+ - **Variance of the Hypothetical Means (VHM)**: if large differences between candidates means are expected, a greater credibility is assigned.
+ - **Expected Value of the Process Variance (EVPV)**: if the data generation process of a given candidate has a large volatility, a smaller credibility is assigned.
+The Bühlmann credibility states that the optimal linear posterior estimator of a group mean is:
+ - `Z * X̄ + (1 - Z) * μ`, where `Z` is the credibility factor, `X̄` the group mean and `μ` the population mean.
+=#
+
+#=
+This approach results in a shift of perspective in how the gain is derived. 
+The classical gradient-based approach is about deriving a second-order approximation of the loss curve for a tree-split candidate.
+The gain corresponds to the reduction in this approximated loss by taking the prediction that minimises the quadratic loss curve.
+The credibility-based approach is loss-function agnostic, and views the gain as the total absolute change in the credibility-adjusted predicted value.
+For example, if a child has a mean residual of *2.0*, credibility of 0.5 and 100 observations, the resulting gain is: `2.0 * 0.5 * 100 = 100.0`, where `2.0 * 0.5` corresponds to the credibility-adjusted prediction.
+
+VHM is estimated as the square of the mean of the spread between observed values and predictions: 
+- `VHM = E²[X] = mean(y - p)²`
+
+EVPV is estimated as the variance of the observations. This value can be derived from the aggregation of the first and second moment of the individual observations: 
+- `EVPV = E[(x - μ)²] = E[X²] - E²[X]`
+=#
+
+#=
+## Credibility-based losses in EvoTrees
+Two credibility-based losses are supported with `EvoTreeRegressor`:
+ - **cred_var**: `VHM / (VHM + EVPV)`
+ - **cred_std**: `sqrt(VHM) / (sqrt(VHM) + sqrt(EVPV))`
+=#
+
+
+#=
+Just like the gradient-based MSE loss, the gain grows linearly with the number of observations, all other things being equal.
+However, a smaller volatility results in an increased gain, as shown in 2nd vs 1st row. 
+=#
+
+loss = :cred_std#hide
+f = get_dist_figure(; loss, nobs=100, spread=1.0, sd=1.0)#hide
+save(joinpath(@__DIR__, "assets", "dist-cred_std-1A.png"), f);#hide
+f = get_dist_figure(; loss, nobs=1_000, spread=1.0, sd=1.0)#hide
+save(joinpath(@__DIR__, "assets", "dist-cred_std-1B.png"), f);#hide
+f = get_dist_figure(; loss, nobs=100, spread=1.0, sd=0.1)#hide
+save(joinpath(@__DIR__, "assets", "dist-cred_std-2A.png"), f);#hide
+f = get_dist_figure(; loss, nobs=1_000, spread=1.0, sd=0.1)#hide
+save(joinpath(@__DIR__, "assets", "dist-cred_std-2B.png"), f);#hide
+f = get_dist_figure(; loss, nobs=100, spread=0.1, sd=0.1)#hide
+save(joinpath(@__DIR__, "assets", "dist-cred_std-3A.png"), f);#hide
+f = get_dist_figure(; loss, nobs=1_000, spread=0.1, sd=0.1)#hide
+save(joinpath(@__DIR__, "assets", "dist-cred_std-3B.png"), f);#hide
+
+#=
+| ![](assets/dist-cred_std-1A.png) | ![](assets/dist-cred_std-1B.png) |
+|:----------------------:|:----------------------:|
+| ![](assets/dist-cred_std-2A.png) | ![](assets/dist-cred_std-3A.png) |
+=#
+
+# ### Simulation grid
+
+#=
+The charts below show the associated credibility and gain for a given node split candidate for various spreads and standard deviations.
+=#
+
+nobs = 1000
+sd_list = [0.01, 0.05, 0.1, 0.2, 0.5, 1, 2, 5]
+spread_list = [0.01, 0.05, 0.1, 0.2, 0.5, 1]
+metric_name = "cred"#hide
+f = get_cred_figureB(; metric_name, loss=:cred_std, nobs, sd_list, spread_list)#hide
+save(joinpath(@__DIR__, "assets", "heatmap-$metric_name-cred_std.png"), f);#hide
+metric_name = "gain"#hide
+f = get_cred_figureB(; metric_name, loss=:cred_std, nobs, sd_list, spread_list)#hide
+save(joinpath(@__DIR__, "assets", "heatmap-$metric_name-cred_std.png"), f);#hide
+#=
+| ![](assets/heatmap-cred-cred_std.png) | ![](assets/heatmap-gain-cred_std.png) |
+|:----------------------:|:----------------------:|
+=#
+
+# ### Illustration of a different split decision between `cred_std` and `mse`
+
+#=
+Despite both `mse` and `cred_std` resulting in the same prediction, which matches the mean of the observations, the associated gain differs due to the volatility penalty.
+
+The following illustrates a minimal scenario of 2 features, each with only 2 levels. 
+=#
+
+#=
+| ![](assets/dist-mse-cred-x1.png) | ![](assets/dist-mse-cred-x2.png) |
+|:----------------------:|:----------------------:|
+=#
+
+#=
+```julia
+config = EvoTreeRegressor(loss=:mse, nrounds=1, max_depth=2)
+model_mse = EvoTrees.fit(config, dtrain; target_name="y")
+
+EvoTrees.Tree{EvoTrees.MSE, 1}
+ - feat: [2, 0, 0]
+ - cond_bin: UInt8[0x01, 0x00, 0x00]
+ - gain: Float32[12113.845, 0.0, 0.0]
+ - pred: Float32[0.0 -0.017858343 0.3391479]
+ - split: Bool[1, 0, 0]
+```
+=#
+
+#=
+```julia
+config = EvoTreeRegressor(loss=:cred_std, nrounds=1, max_depth=2)
+model_std = EvoTrees.fit(config, dtrain; target_name="y")
+
+EvoTrees.Tree{EvoTrees.CredStd, 1}
+ - feat: [1, 0, 0]
+ - cond_bin: UInt8[0x02, 0x00, 0x00]
+ - gain: Float32[8859.706, 0.0, 0.0]
+ - pred: Float32[0.0 0.07375729 -0.07375729]
+ - split: Bool[1, 0, 0]
+```
+=#
+
+#=
+## Benchmarks
+
+From [MLBenchmarks.jl](https://github.com/Evovest/MLBenchmarks.jl).
+
+| **model** | **metric** | **mse** | **cred_var** | **cred_std** |
+|:---------:|:----------:|:-------:|:------------:|:------------:|
+| boston    | mse        | 6.3     | 5.95         | 5.43         |
+| boston    | gini       | 0.945   | 0.947        | 0.952        |
+| year      | mse        | 74.9    | 74.6         | 74.2         |
+| year      | gini       | 0.662   | 0.664        | 0.661        |
+| msrank    | mse        | 0.55    | 0.551        | 0.549        |
+| msrank    | ndcg       | 0.511   | 0.509        | 0.51         |
+| yahoo     | mse        | 0.565   | 0.589        | 0.568        |
+| yahoo     | ndcg       | 0.795   | 0.787        | 0.794        |
+
+=#