-
Notifications
You must be signed in to change notification settings - Fork 120
Closed
Description
The problem
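I'm having trouble predicting with a trained workflow: `augment()` fails in `step_relevel()` because the recipe cannot see the outcome variable (`suspicious_label`) in the new data, even though that column is present in the data I pass in.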
Reproducible example
library(tidymodels)
library(tidyverse)
train_datas1 <- readRDS(file = "~/traindata.rds")
test_datas1 <- readRDS(file = "~/testdata.rds")
Best_test_resultss1prauc <- readRDS(file = "~/lastfit.rds")
str(train_datas1)
#> tibble [6,752 × 12] (S3: tbl_df/tbl/data.frame)
#> $ id : int [1:6752] 1 2 3 4 5 7 8 9 12 13 ...
#> $ type : chr [1:6752] "private" "private" "private" "private" ...
#> $ age : num [1:6752] 12 11 4 9 15 18 12 14 20 23 ...
#> $ postal_code : Factor w/ 9506 levels "10001","10003",..: 7236 6383 5752 663 6203 860 3561 7594 1348 1441 ...
#> $ suspicious_label: Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
#> $ tx_cnt : int [1:6752] NA 2 1 2 3 1 2 1 1 1 ...
#> $ tx_lavg : num [1:6752] NA 2.41 2.08 2.35 2.56 ...
#> $ tx_lmed : num [1:6752] NA 2.41 2.08 2.35 2.26 ...
#> $ tx_liqr : num [1:6752] NA 2.25 -4 1.88 2.55 ...
#> $ tx_lmin : num [1:6752] NA 1.9 2.08 2.17 1.96 ...
#> $ tx_lmax : num [1:6752] NA 2.64 2.08 2.48 2.91 ...
#> $ case_wts : imp_wts [1:6752] 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
str(test_datas1)
#> tibble [1,689 × 11] (S3: tbl_df/tbl/data.frame)
#> $ id : int [1:1689] 14 25 29 39 42 54 69 75 79 82 ...
#> $ type : chr [1:1689] "private" "private" "private" "private" ...
#> $ age : num [1:1689] 15 22 31 41 42 59 75 76 68 64 ...
#> $ postal_code : Factor w/ 9506 levels "10001","10003",..: 4225 3127 9118 3312 1089 2153 2611 7621 1544 1661 ...
#> $ suspicious_label: Factor w/ 2 levels "0","1": 2 2 1 1 1 1 1 1 1 1 ...
#> $ tx_cnt : int [1:1689] 1 1 NA NA 1 NA 2 1 1 1 ...
#> $ tx_lavg : num [1:1689] 3.02 3.13 NA NA 2.23 ...
#> $ tx_lmed : num [1:1689] 3.02 3.13 NA NA 2.23 ...
#> $ tx_liqr : num [1:1689] -4 -4 NA NA -4 ...
#> $ tx_lmin : num [1:1689] 3.02 3.13 NA NA 2.23 ...
#> $ tx_lmax : num [1:1689] 3.02 3.13 NA NA 2.23 ...
last_wf <- extract_workflow(Best_test_resultss1prauc)
last_wf
#> ══ Workflow [trained] ══════════════════════════════════════════════════════════
#> Preprocessor: Recipe
#> Model: rand_forest()
#>
#> ── Preprocessor ────────────────────────────────────────────────────────────────
#> 1 Recipe Step
#>
#> • step_relevel()
#>
#> ── Model ───────────────────────────────────────────────────────────────────────
#> $predictions
#> 1 0
#> [1,] 1.813933e-01 0.818606749
#> [2,] 1.152768e-02 0.988472318
#> [3,] 8.259202e-03 0.991740798
#> [4,] 1.741788e-02 0.982582117
#> [5,] 3.853423e-02 0.961465768
#> [6,] 2.495565e-03 0.997504435
#> [7,] 1.629475e-03 0.998370525
#> [8,] 5.008395e-03 0.994991605
#> [9,] 5.815574e-03 0.994184426
#> [10,] 4.645134e-03 0.995354866
#> [11,] 1.803659e-01 0.819634126
#> [12,] 4.513234e-01 0.548676645
#> [13,] 5.721453e-03 0.994278547
#> [14,] 1.169564e-03 0.998830436
#> [15,] 2.941641e-02 0.970583594
#> [16,] 3.207482e-01 0.679251754
#> [17,] 1.554592e-02 0.984454079
#> [18,] 7.659054e-02 0.923409464
#> [19,] 1.413417e-03 0.998586583
#> [20,] 2.512597e-01 0.748740254
#> [21,] 1.828849e-02 0.981711514
#> [22,] 1.268135e-01 0.873186537
#> [23,] 4.473315e-03 0.995526685
#> [24,] 3.522791e-01 0.647720877
#> [25,] 2.339271e-02 0.976607291
#> [26,] 1.547133e-01 0.845286722
#> [27,] 0.000000e+00 1.000000000
#> [28,] 8.447518e-04 0.999155248
#> [29,] 9.812423e-04 0.999018758
#> [30,] 1.182108e-02 0.988178915
#> [31,] 1.263573e-02 0.987364271
#> [32,] 7.612188e-03 0.992387812
#> [33,] 5.325998e-02 0.946740019
#> [34,] 6.468801e-04 0.999353120
#> [35,] 1.268726e-02 0.987312740
#> [36,] 2.347993e-02 0.976520072
#> [37,] 0.000000e+00 1.000000000
#> [38,] 2.525561e-03 0.997474439
#> [39,] 4.114922e-02 0.958850779
#> [40,] 5.698457e-04 0.999430154
#> [41,] 1.034058e-03 0.998965942
#> [42,] 1.409088e-02 0.985909125
#> [43,] 2.765180e-04 0.999723482
#> [44,] 2.462403e-02 0.975375973
#> [45,] 8.341663e-03 0.991658337
#> [46,] 6.063251e-02 0.939367491
#> [47,] 5.689365e-03 0.994310635
#> [48,] 1.514053e-02 0.984859471
#>
#> ...
#> and 958857 more lines.
preproc <- last_wf %>% extract_preprocessor()
preproc
#>
#> ── Recipe ──────────────────────────────────────────────────────────────────────
#>
#> ── Inputs
#> Number of variables by role
#> outcome: 1
#> predictor: 9
#> case_weights: 1
#> ID: 1
#>
#> ── Operations
#> • Re-order factor level to ref_level for: suspicious_label
baked_train <- preproc %>% prep() %>% bake(new_data = train_datas1)
baked_test <- preproc %>% prep() %>% bake(new_data = test_datas1)
str(baked_train)
#> tibble [6,752 × 12] (S3: tbl_df/tbl/data.frame)
#> $ id : int [1:6752] 1 2 3 4 5 7 8 9 12 13 ...
#> $ type : Factor w/ 3 levels "corporate","private",..: 2 2 2 2 2 2 2 2 2 2 ...
#> $ age : num [1:6752] 12 11 4 9 15 18 12 14 20 23 ...
#> $ postal_code : Factor w/ 9506 levels "10001","10003",..: 7236 6383 5752 663 6203 860 3561 7594 1348 1441 ...
#> $ tx_cnt : int [1:6752] NA 2 1 2 3 1 2 1 1 1 ...
#> $ tx_lavg : num [1:6752] NA 2.41 2.08 2.35 2.56 ...
#> $ tx_lmed : num [1:6752] NA 2.41 2.08 2.35 2.26 ...
#> $ tx_liqr : num [1:6752] NA 2.25 -4 1.88 2.55 ...
#> $ tx_lmin : num [1:6752] NA 1.9 2.08 2.17 1.96 ...
#> $ tx_lmax : num [1:6752] NA 2.64 2.08 2.48 2.91 ...
#> $ case_wts : imp_wts [1:6752] 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
#> $ suspicious_label: Factor w/ 2 levels "1","0": 2 2 2 2 2 2 2 2 2 2 ...
str(baked_test)
#> tibble [1,689 × 11] (S3: tbl_df/tbl/data.frame)
#> $ id : int [1:1689] 14 25 29 39 42 54 69 75 79 82 ...
#> $ type : Factor w/ 3 levels "corporate","private",..: 2 2 2 2 2 2 2 2 2 2 ...
#> $ age : num [1:1689] 15 22 31 41 42 59 75 76 68 64 ...
#> $ postal_code : Factor w/ 9506 levels "10001","10003",..: 4225 3127 9118 3312 1089 2153 2611 7621 1544 1661 ...
#> $ tx_cnt : int [1:1689] 1 1 NA NA 1 NA 2 1 1 1 ...
#> $ tx_lavg : num [1:1689] 3.02 3.13 NA NA 2.23 ...
#> $ tx_lmed : num [1:1689] 3.02 3.13 NA NA 2.23 ...
#> $ tx_liqr : num [1:1689] -4 -4 NA NA -4 ...
#> $ tx_lmin : num [1:1689] 3.02 3.13 NA NA 2.23 ...
#> $ tx_lmax : num [1:1689] 3.02 3.13 NA NA 2.23 ...
#> $ suspicious_label: Factor w/ 2 levels "1","0": 1 1 2 2 2 2 2 2 2 2 ...
augment(last_wf, new_data = train_datas1)
#> Error in `step_relevel()`:
#> ! The following required column is missing from `new_data`:
#> suspicious_label.
<sup>Created on 2025-09-28 with [reprex v2.1.1](https://reprex.tidyverse.org)</sup>
Backtrace:
<error/rlang_error>
Error in `step_relevel()`:
! The following required column is missing from `new_data`: suspicious_label.
---
Backtrace:
▆
1. ├─generics::augment(last_wf, new_data = train_datas1)
2. └─workflows:::augment.workflow(last_wf, new_data = train_datas1)
3. ├─hardhat::forge(new_data, blueprint = mold$blueprint, outcomes = outcomes)
4. └─hardhat:::forge.data.frame(...)
5. ├─hardhat::run_forge(blueprint, new_data = new_data, outcomes = outcomes)
6. └─hardhat:::run_forge.default_recipe_blueprint(...)
7. └─hardhat:::forge_recipe_default_process(...)
8. ├─recipes::bake(object = rec, new_data = new_data)
9. └─recipes:::bake.recipe(object = rec, new_data = new_data)
10. ├─recipes::bake(step, new_data = new_data)
11. └─recipes:::bake.step_relevel(step, new_data = new_data)
12. └─recipes::check_new_data(col_names, object, new_data)
Run rlang::last_trace(drop = FALSE) to see 2 hidden frames.
Session Info:
R version 4.5.1 (2025-06-13)
Platform: x86_64-apple-darwin20
Running under: macOS Sequoia 15.7
Matrix products: default
BLAS: /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/4.5-x86_64/Resources/lib/libRlapack.dylib; LAPACK version 3.12.1
locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
time zone: Europe/Stockholm
tzcode source: internal
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] lubridate_1.9.4 forcats_1.0.0 stringr_1.5.2 readr_2.1.5 tibble_3.3.0 tidyverse_2.0.0 yardstick_1.3.2 workflowsets_1.1.1 workflows_1.3.0 tune_2.0.0 tidyr_1.3.1
[12] tailor_0.1.0 rsample_1.3.1 recipes_1.3.1 purrr_1.1.0 parsnip_1.3.3 modeldata_1.5.1 infer_1.0.9 ggplot2_4.0.0 dplyr_1.1.4 dials_1.4.2 scales_1.4.0
[23] broom_1.0.10 tidymodels_1.4.1
loaded via a namespace (and not attached):
[1] tidyselect_1.2.1 timeDate_4041.110 farver_2.1.2 S7_0.2.0 fastmap_1.2.0 reprex_2.1.1 digest_0.6.37 rpart_4.1.24 timechange_0.3.0 lifecycle_1.0.4
[11] survival_3.8-3 processx_3.8.6 magrittr_2.0.4 compiler_4.5.1 rlang_1.1.6 tools_4.5.1 utf8_1.2.6 yaml_2.3.10 data.table_1.17.8 knitr_1.50
[21] DiceDesign_1.10 RColorBrewer_1.1-3 withr_3.0.2 nnet_7.3-20 grid_4.5.1 future_1.67.0 globals_0.18.0 MASS_7.3-65 cli_3.6.5 crayon_1.5.3
[31] rmarkdown_2.29 generics_0.1.4 rstudioapi_0.17.1 future.apply_1.20.0 tzdb_0.5.0 splines_4.5.1 parallel_4.5.1 vctrs_0.6.5 hardhat_1.4.2 Matrix_1.7-4
[41] callr_3.7.6 hms_1.1.3 listenv_0.9.1 clipr_0.8.0 gower_1.0.2 glue_1.8.0 parallelly_1.45.1 codetools_0.2-20 ps_1.9.1 stringi_1.8.7
[51] gtable_0.3.6 GPfit_1.0-9 pillar_1.11.1 furrr_0.3.1 htmltools_0.5.8.1 ipred_0.9-15 lava_1.8.1 R6_2.6.1 lhs_1.2.0 evaluate_1.0.5
[61] lattice_0.22-7 backports_1.5.0 class_7.3-23 Rcpp_1.1.0 prodlim_2025.04.28 xfun_0.53 fs_1.6.6 pkgconfig_2.0.3
Best regards,
Mariusz
Metadata
Metadata
Assignees
Labels
No labels