Skip to content

Error "required column is missing from new_data" despite column being present #1527

@marioem

Description

@marioem

The problem

I'm having trouble predicting with a trained workflow: the recipe reports that the outcome variable is missing from the new data, even though the column is present.

Reproducible example

Attachments.zip

library(tidymodels)
library(tidyverse)

train_datas1 <- readRDS(file = "~/traindata.rds")
test_datas1 <- readRDS(file = "~/testdata.rds")

Best_test_resultss1prauc <- readRDS(file = "~/lastfit.rds")

str(train_datas1)
#> tibble [6,752 × 12] (S3: tbl_df/tbl/data.frame)
#>  $ id              : int [1:6752] 1 2 3 4 5 7 8 9 12 13 ...
#>  $ type            : chr [1:6752] "private" "private" "private" "private" ...
#>  $ age             : num [1:6752] 12 11 4 9 15 18 12 14 20 23 ...
#>  $ postal_code     : Factor w/ 9506 levels "10001","10003",..: 7236 6383 5752 663 6203 860 3561 7594 1348 1441 ...
#>  $ suspicious_label: Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
#>  $ tx_cnt          : int [1:6752] NA 2 1 2 3 1 2 1 1 1 ...
#>  $ tx_lavg         : num [1:6752] NA 2.41 2.08 2.35 2.56 ...
#>  $ tx_lmed         : num [1:6752] NA 2.41 2.08 2.35 2.26 ...
#>  $ tx_liqr         : num [1:6752] NA 2.25 -4 1.88 2.55 ...
#>  $ tx_lmin         : num [1:6752] NA 1.9 2.08 2.17 1.96 ...
#>  $ tx_lmax         : num [1:6752] NA 2.64 2.08 2.48 2.91 ...
#>  $ case_wts        : imp_wts [1:6752] 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
str(test_datas1)
#> tibble [1,689 × 11] (S3: tbl_df/tbl/data.frame)
#>  $ id              : int [1:1689] 14 25 29 39 42 54 69 75 79 82 ...
#>  $ type            : chr [1:1689] "private" "private" "private" "private" ...
#>  $ age             : num [1:1689] 15 22 31 41 42 59 75 76 68 64 ...
#>  $ postal_code     : Factor w/ 9506 levels "10001","10003",..: 4225 3127 9118 3312 1089 2153 2611 7621 1544 1661 ...
#>  $ suspicious_label: Factor w/ 2 levels "0","1": 2 2 1 1 1 1 1 1 1 1 ...
#>  $ tx_cnt          : int [1:1689] 1 1 NA NA 1 NA 2 1 1 1 ...
#>  $ tx_lavg         : num [1:1689] 3.02 3.13 NA NA 2.23 ...
#>  $ tx_lmed         : num [1:1689] 3.02 3.13 NA NA 2.23 ...
#>  $ tx_liqr         : num [1:1689] -4 -4 NA NA -4 ...
#>  $ tx_lmin         : num [1:1689] 3.02 3.13 NA NA 2.23 ...
#>  $ tx_lmax         : num [1:1689] 3.02 3.13 NA NA 2.23 ...
last_wf <- extract_workflow(Best_test_resultss1prauc)
last_wf
#> ══ Workflow [trained] ══════════════════════════════════════════════════════════
#> Preprocessor: Recipe
#> Model: rand_forest()
#> 
#> ── Preprocessor ────────────────────────────────────────────────────────────────
#> 1 Recipe Step
#> 
#> • step_relevel()
#> 
#> ── Model ───────────────────────────────────────────────────────────────────────
#> $predictions
#>                    1           0
#>    [1,] 1.813933e-01 0.818606749
#>    [2,] 1.152768e-02 0.988472318
#>    [3,] 8.259202e-03 0.991740798
#>    [4,] 1.741788e-02 0.982582117
#>    [5,] 3.853423e-02 0.961465768
#>    [6,] 2.495565e-03 0.997504435
#>    [7,] 1.629475e-03 0.998370525
#>    [8,] 5.008395e-03 0.994991605
#>    [9,] 5.815574e-03 0.994184426
#>   [10,] 4.645134e-03 0.995354866
#>   [11,] 1.803659e-01 0.819634126
#>   [12,] 4.513234e-01 0.548676645
#>   [13,] 5.721453e-03 0.994278547
#>   [14,] 1.169564e-03 0.998830436
#>   [15,] 2.941641e-02 0.970583594
#>   [16,] 3.207482e-01 0.679251754
#>   [17,] 1.554592e-02 0.984454079
#>   [18,] 7.659054e-02 0.923409464
#>   [19,] 1.413417e-03 0.998586583
#>   [20,] 2.512597e-01 0.748740254
#>   [21,] 1.828849e-02 0.981711514
#>   [22,] 1.268135e-01 0.873186537
#>   [23,] 4.473315e-03 0.995526685
#>   [24,] 3.522791e-01 0.647720877
#>   [25,] 2.339271e-02 0.976607291
#>   [26,] 1.547133e-01 0.845286722
#>   [27,] 0.000000e+00 1.000000000
#>   [28,] 8.447518e-04 0.999155248
#>   [29,] 9.812423e-04 0.999018758
#>   [30,] 1.182108e-02 0.988178915
#>   [31,] 1.263573e-02 0.987364271
#>   [32,] 7.612188e-03 0.992387812
#>   [33,] 5.325998e-02 0.946740019
#>   [34,] 6.468801e-04 0.999353120
#>   [35,] 1.268726e-02 0.987312740
#>   [36,] 2.347993e-02 0.976520072
#>   [37,] 0.000000e+00 1.000000000
#>   [38,] 2.525561e-03 0.997474439
#>   [39,] 4.114922e-02 0.958850779
#>   [40,] 5.698457e-04 0.999430154
#>   [41,] 1.034058e-03 0.998965942
#>   [42,] 1.409088e-02 0.985909125
#>   [43,] 2.765180e-04 0.999723482
#>   [44,] 2.462403e-02 0.975375973
#>   [45,] 8.341663e-03 0.991658337
#>   [46,] 6.063251e-02 0.939367491
#>   [47,] 5.689365e-03 0.994310635
#>   [48,] 1.514053e-02 0.984859471
#> 
#> ...
#> and 958857 more lines.
preproc <- last_wf %>% extract_preprocessor()
preproc
#> 
#> ── Recipe ──────────────────────────────────────────────────────────────────────
#> 
#> ── Inputs
#> Number of variables by role
#> outcome:      1
#> predictor:    9
#> case_weights: 1
#> ID:           1
#> 
#> ── Operations
#> • Re-order factor level to ref_level for: suspicious_label

baked_train <- preproc %>% prep() %>% bake(new_data = train_datas1)
baked_test <- preproc %>% prep() %>% bake(new_data = test_datas1)

str(baked_train)
#> tibble [6,752 × 12] (S3: tbl_df/tbl/data.frame)
#>  $ id              : int [1:6752] 1 2 3 4 5 7 8 9 12 13 ...
#>  $ type            : Factor w/ 3 levels "corporate","private",..: 2 2 2 2 2 2 2 2 2 2 ...
#>  $ age             : num [1:6752] 12 11 4 9 15 18 12 14 20 23 ...
#>  $ postal_code     : Factor w/ 9506 levels "10001","10003",..: 7236 6383 5752 663 6203 860 3561 7594 1348 1441 ...
#>  $ tx_cnt          : int [1:6752] NA 2 1 2 3 1 2 1 1 1 ...
#>  $ tx_lavg         : num [1:6752] NA 2.41 2.08 2.35 2.56 ...
#>  $ tx_lmed         : num [1:6752] NA 2.41 2.08 2.35 2.26 ...
#>  $ tx_liqr         : num [1:6752] NA 2.25 -4 1.88 2.55 ...
#>  $ tx_lmin         : num [1:6752] NA 1.9 2.08 2.17 1.96 ...
#>  $ tx_lmax         : num [1:6752] NA 2.64 2.08 2.48 2.91 ...
#>  $ case_wts        : imp_wts [1:6752] 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
#>  $ suspicious_label: Factor w/ 2 levels "1","0": 2 2 2 2 2 2 2 2 2 2 ...
str(baked_test)
#> tibble [1,689 × 11] (S3: tbl_df/tbl/data.frame)
#>  $ id              : int [1:1689] 14 25 29 39 42 54 69 75 79 82 ...
#>  $ type            : Factor w/ 3 levels "corporate","private",..: 2 2 2 2 2 2 2 2 2 2 ...
#>  $ age             : num [1:1689] 15 22 31 41 42 59 75 76 68 64 ...
#>  $ postal_code     : Factor w/ 9506 levels "10001","10003",..: 4225 3127 9118 3312 1089 2153 2611 7621 1544 1661 ...
#>  $ tx_cnt          : int [1:1689] 1 1 NA NA 1 NA 2 1 1 1 ...
#>  $ tx_lavg         : num [1:1689] 3.02 3.13 NA NA 2.23 ...
#>  $ tx_lmed         : num [1:1689] 3.02 3.13 NA NA 2.23 ...
#>  $ tx_liqr         : num [1:1689] -4 -4 NA NA -4 ...
#>  $ tx_lmin         : num [1:1689] 3.02 3.13 NA NA 2.23 ...
#>  $ tx_lmax         : num [1:1689] 3.02 3.13 NA NA 2.23 ...
#>  $ suspicious_label: Factor w/ 2 levels "1","0": 1 1 2 2 2 2 2 2 2 2 ...

augment(last_wf, new_data = train_datas1) 
#> Error in `step_relevel()`:
#> ! The following required column is missing from `new_data`:
#>   suspicious_label.

<sup>Created on 2025-09-28 with [reprex v2.1.1](https://reprex.tidyverse.org)</sup>

Backtrace:

<error/rlang_error>
Error in `step_relevel()`:
! The following required column is missing from `new_data`: suspicious_label.
---
Backtrace:
     ▆
  1. ├─generics::augment(last_wf, new_data = train_datas1)
  2. └─workflows:::augment.workflow(last_wf, new_data = train_datas1)
  3.   ├─hardhat::forge(new_data, blueprint = mold$blueprint, outcomes = outcomes)
  4.   └─hardhat:::forge.data.frame(...)
  5.     ├─hardhat::run_forge(blueprint, new_data = new_data, outcomes = outcomes)
  6.     └─hardhat:::run_forge.default_recipe_blueprint(...)
  7.       └─hardhat:::forge_recipe_default_process(...)
  8.         ├─recipes::bake(object = rec, new_data = new_data)
  9.         └─recipes:::bake.recipe(object = rec, new_data = new_data)
 10.           ├─recipes::bake(step, new_data = new_data)
 11.           └─recipes:::bake.step_relevel(step, new_data = new_data)
 12.             └─recipes::check_new_data(col_names, object, new_data)
Run rlang::last_trace(drop = FALSE) to see 2 hidden frames.

Session Info:

R version 4.5.1 (2025-06-13)
Platform: x86_64-apple-darwin20
Running under: macOS Sequoia 15.7

Matrix products: default
BLAS:   /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib 
LAPACK: /Library/Frameworks/R.framework/Versions/4.5-x86_64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.1

locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

time zone: Europe/Stockholm
tzcode source: internal

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] lubridate_1.9.4    forcats_1.0.0      stringr_1.5.2      readr_2.1.5        tibble_3.3.0       tidyverse_2.0.0    yardstick_1.3.2    workflowsets_1.1.1 workflows_1.3.0    tune_2.0.0         tidyr_1.3.1       
[12] tailor_0.1.0       rsample_1.3.1      recipes_1.3.1      purrr_1.1.0        parsnip_1.3.3      modeldata_1.5.1    infer_1.0.9        ggplot2_4.0.0      dplyr_1.1.4        dials_1.4.2        scales_1.4.0      
[23] broom_1.0.10       tidymodels_1.4.1  

loaded via a namespace (and not attached):
 [1] tidyselect_1.2.1    timeDate_4041.110   farver_2.1.2        S7_0.2.0            fastmap_1.2.0       reprex_2.1.1        digest_0.6.37       rpart_4.1.24        timechange_0.3.0    lifecycle_1.0.4    
[11] survival_3.8-3      processx_3.8.6      magrittr_2.0.4      compiler_4.5.1      rlang_1.1.6         tools_4.5.1         utf8_1.2.6          yaml_2.3.10         data.table_1.17.8   knitr_1.50         
[21] DiceDesign_1.10     RColorBrewer_1.1-3  withr_3.0.2         nnet_7.3-20         grid_4.5.1          future_1.67.0       globals_0.18.0      MASS_7.3-65         cli_3.6.5           crayon_1.5.3       
[31] rmarkdown_2.29      generics_0.1.4      rstudioapi_0.17.1   future.apply_1.20.0 tzdb_0.5.0          splines_4.5.1       parallel_4.5.1      vctrs_0.6.5         hardhat_1.4.2       Matrix_1.7-4       
[41] callr_3.7.6         hms_1.1.3           listenv_0.9.1       clipr_0.8.0         gower_1.0.2         glue_1.8.0          parallelly_1.45.1   codetools_0.2-20    ps_1.9.1            stringi_1.8.7      
[51] gtable_0.3.6        GPfit_1.0-9         pillar_1.11.1       furrr_0.3.1         htmltools_0.5.8.1   ipred_0.9-15        lava_1.8.1          R6_2.6.1            lhs_1.2.0           evaluate_1.0.5     
[61] lattice_0.22-7      backports_1.5.0     class_7.3-23        Rcpp_1.1.0          prodlim_2025.04.28  xfun_0.53           fs_1.6.6            pkgconfig_2.0.3   

Best regards,

Mariusz

Attachments.zip

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions