Skip to content

Commit

Permalink
add rest of checks
Browse files Browse the repository at this point in the history
  • Loading branch information
EmilHvitfeldt committed Nov 10, 2024
1 parent 8161b20 commit 11d4ece
Show file tree
Hide file tree
Showing 24 changed files with 343 additions and 8 deletions.
18 changes: 11 additions & 7 deletions R/tokenfilter.R
Original file line number Diff line number Diff line change
Expand Up @@ -100,13 +100,6 @@ step_tokenfilter <-
res = NULL,
skip = FALSE,
id = rand_id("tokenfilter")) {
if (percentage && (max_times > 1 | max_times < 0 |
min_times > 1 | min_times < 0)) {
cli::cli_abort(
"{.arg max_times} and {.arg min_times} should be in the interval [0, 1]."
)
}

add_step(
recipe,
step_tokenfilter_new(
Expand Down Expand Up @@ -150,6 +143,17 @@ step_tokenfilter_new <-
prep.step_tokenfilter <- function(x, training, info = NULL, ...) {
col_names <- recipes_eval_select(x$terms, training, info)

check_bool(x$percentage, arg = "percentage")
if (x$percentage) {
check_number_decimal(x$max_times, min = 0, max = 1, arg = "max_times")
check_number_decimal(x$min_times, min = 0, max = 1, arg = "min_times")
} else {
check_number_whole(x$max_times, min = 0, allow_infinite = TRUE, arg = "max_times")
check_number_whole(x$min_times, min = 0, arg = "min_times")
}
check_number_whole(x$max_tokens, min = 0, arg = "max_tokens")
check_function(x$filter_fun, allow_null = TRUE, arg = "filter_fun")

check_type(training[, col_names], types = "tokenlist")

retain_words <- list()
Expand Down
4 changes: 4 additions & 0 deletions R/tokenize.R
Original file line number Diff line number Diff line change
Expand Up @@ -285,6 +285,10 @@ step_tokenize_new <-
prep.step_tokenize <- function(x, training, info = NULL, ...) {
col_names <- recipes_eval_select(x$terms, training, info)

check_string(x$token, arg = "token")
check_string(x$engine, arg = "engine")
check_function(x$custom_token, allow_null = TRUE, arg = "custom_token")

training <- factor_to_text(training, col_names)

check_type(training[, col_names], types = c("string", "factor", "ordered"))
Expand Down
2 changes: 2 additions & 0 deletions R/tokenize_bpe.R
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,8 @@ step_tokenize_bpe_new <-
prep.step_tokenize_bpe <- function(x, training, info = NULL, ...) {
col_names <- recipes_eval_select(x$terms, training, info)

check_number_whole(x$vocabulary_size, min = 0, arg = "vocabulary_size")

training <- factor_to_text(training, col_names)

check_type(training[, col_names], types = c("string", "factor", "ordered"))
Expand Down
2 changes: 2 additions & 0 deletions R/tokenize_sentencepiece.R
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,8 @@ step_tokenize_sentencepiece_new <-
prep.step_tokenize_sentencepiece <- function(x, training, info = NULL, ...) {
col_names <- recipes_eval_select(x$terms, training, info)

check_number_whole(x$vocabulary_size, min = 0, arg = "vocabulary_size")

training <- factor_to_text(training, col_names)

check_type(training[, col_names], types = c("string", "factor", "ordered"))
Expand Down
3 changes: 3 additions & 0 deletions R/tokenize_wordpiece.R
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,9 @@ step_tokenize_wordpiece_new <-
prep.step_tokenize_wordpiece <- function(x, training, info = NULL, ...) {
col_names <- recipes_eval_select(x$terms, training, info)

check_string(x$unk_token, arg = "unk_token")
check_number_whole(x$max_chars, min = 0, arg = "max_chars")

training <- factor_to_text(training, col_names)

check_type(training[, col_names], types = c("string", "factor", "ordered"))
Expand Down
2 changes: 2 additions & 0 deletions R/tokenmerge.R
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,8 @@ step_tokenmerge_new <-
prep.step_tokenmerge <- function(x, training, info = NULL, ...) {
col_names <- recipes_eval_select(x$terms, training, info)

check_string(x$prefix, arg = "prefix")

check_type(training[, col_names], types = "tokenlist")

step_tokenmerge_new(
Expand Down
2 changes: 2 additions & 0 deletions R/untokenize.R
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,8 @@ step_untokenize_new <-
prep.step_untokenize <- function(x, training, info = NULL, ...) {
col_names <- recipes_eval_select(x$terms, training, info)

check_string(x$sep, arg = "sep")

check_type(training[, col_names], types = "tokenlist")

step_untokenize_new(
Expand Down
5 changes: 4 additions & 1 deletion R/word_embeddings.R
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,7 @@ step_word_embeddings <- function(recipe,
)
}

aggregation <- match.arg(aggregation)
aggregation <- rlang::arg_match(aggregation)

add_step(
recipe,
Expand Down Expand Up @@ -160,6 +160,9 @@ step_word_embeddings_new <- function(terms, role, trained, columns, embeddings,
prep.step_word_embeddings <- function(x, training, info = NULL, ...) {
col_names <- recipes_eval_select(x$terms, training, info)

check_number_decimal(x$aggregation_default, arg = "aggregation_default")
check_string(x$prefix, arg = "prefix")

check_type(training[, col_names], types = "tokenlist")

step_word_embeddings_new(
Expand Down
67 changes: 67 additions & 0 deletions tests/testthat/_snaps/tokenfilter.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,73 @@
* Tokenization for: text | Trained
* Text filtering for: text | Trained

# bad args

Code
recipe(~., data = mtcars) %>% step_tokenfilter(percentage = "yes") %>% prep()
Condition
Error in `step_tokenfilter()`:
Caused by error in `prep()`:
! `percentage` must be `TRUE` or `FALSE`, not the string "yes".

---

Code
recipe(~., data = mtcars) %>% step_tokenfilter(max_tokens = -4) %>% prep()
Condition
Error in `step_tokenfilter()`:
Caused by error in `prep()`:
! `max_tokens` must be a whole number larger than or equal to 0, not the number -4.

---

Code
recipe(~., data = mtcars) %>% step_tokenfilter(filter_fun = -4) %>% prep()
Condition
Error in `step_tokenfilter()`:
Caused by error in `prep()`:
! `filter_fun` must be a function or `NULL`, not the number -4.

---

Code
recipe(~., data = mtcars) %>% step_tokenfilter(percentage = TRUE, max_times = 2) %>%
prep()
Condition
Error in `step_tokenfilter()`:
Caused by error in `prep()`:
! `max_times` must be a number between 0 and 1, not the number 2.

---

Code
recipe(~., data = mtcars) %>% step_tokenfilter(percentage = TRUE, min_times = 2) %>%
prep()
Condition
Error in `step_tokenfilter()`:
Caused by error in `prep()`:
! `min_times` must be a number between 0 and 1, not the number 2.

---

Code
recipe(~., data = mtcars) %>% step_tokenfilter(percentage = FALSE, max_times = -
1) %>% prep()
Condition
Error in `step_tokenfilter()`:
Caused by error in `prep()`:
! `max_times` must be a whole number larger than or equal to 0, not the number -1.

---

Code
recipe(~., data = mtcars) %>% step_tokenfilter(percentage = FALSE, min_times = -
1) %>% prep()
Condition
Error in `step_tokenfilter()`:
Caused by error in `prep()`:
! `min_times` must be a whole number larger than or equal to 0, not the number -1.

# bake method errors when needed non-standard role columns are missing

Code
Expand Down
27 changes: 27 additions & 0 deletions tests/testthat/_snaps/tokenize.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,33 @@
Caused by error in `prep()`:
! The `engine` argument is not valid.

# bad args

Code
recipe(~., data = mtcars) %>% step_tokenize(token = letters) %>% prep()
Condition
Error in `step_tokenize()`:
Caused by error in `prep()`:
! `token` must be a single string, not a character vector.

---

Code
recipe(~., data = mtcars) %>% step_tokenize(engine = letters) %>% prep()
Condition
Error in `step_tokenize()`:
Caused by error in `prep()`:
! `engine` must be a single string, not a character vector.

---

Code
recipe(~., data = mtcars) %>% step_tokenize(custom_token = "yes") %>% prep()
Condition
Error in `step_tokenize()`:
Caused by error in `prep()`:
! `custom_token` must be a function or `NULL`, not the string "yes".

# bake method errors when needed non-standard role columns are missing

Code
Expand Down
9 changes: 9 additions & 0 deletions tests/testthat/_snaps/tokenize_bpe.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,12 @@
# bad args

Code
recipe(~., data = mtcars) %>% step_tokenize_bpe(vocabulary_size = -4) %>% prep()
Condition
Error in `step_tokenize_bpe()`:
Caused by error in `prep()`:
! `vocabulary_size` must be a whole number larger than or equal to 0, not the number -4.

# bake method errors when needed non-standard role columns are missing

Code
Expand Down
10 changes: 10 additions & 0 deletions tests/testthat/_snaps/tokenize_sentencepiece.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,16 @@
Caused by error in `prep()`:
! The `vocabulary_size` of 10 is too small for column `text1` which has a unique character count of 23.

# bad args

Code
recipe(~., data = mtcars) %>% step_tokenize_sentencepiece(vocabulary_size = -4) %>%
prep()
Condition
Error in `step_tokenize_sentencepiece()`:
Caused by error in `prep()`:
! `vocabulary_size` must be a whole number larger than or equal to 0, not the number -4.

# bake method errors when needed non-standard role columns are missing

Code
Expand Down
18 changes: 18 additions & 0 deletions tests/testthat/_snaps/tokenize_wordpiece.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,21 @@
# bad args

Code
recipe(~., data = mtcars) %>% step_tokenize_wordpiece(unk_token = 0) %>% prep()
Condition
Error in `step_tokenize_wordpiece()`:
Caused by error in `prep()`:
! `unk_token` must be a single string, not the number 0.

---

Code
recipe(~., data = mtcars) %>% step_tokenize_wordpiece(max_chars = -4) %>% prep()
Condition
Error in `step_tokenize_wordpiece()`:
Caused by error in `prep()`:
! `max_chars` must be a whole number larger than or equal to 0, not the number -4.

# bake method errors when needed non-standard role columns are missing

Code
Expand Down
9 changes: 9 additions & 0 deletions tests/testthat/_snaps/tokenmerge.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,15 @@
! Name collision occurred. The following variable names already exist:
* `tokenmerge`

# bad args

Code
recipe(~., data = mtcars) %>% step_tokenmerge(prefix = NULL) %>% prep()
Condition
Error in `step_tokenmerge()`:
Caused by error in `prep()`:
! `prefix` must be a single string, not `NULL`.

# bake method errors when needed non-standard role columns are missing

Code
Expand Down
9 changes: 9 additions & 0 deletions tests/testthat/_snaps/untokenize.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,12 @@
# bad args

Code
recipe(~., data = mtcars) %>% step_untokenize(sep = 0) %>% prep()
Condition
Error in `step_untokenize()`:
Caused by error in `prep()`:
! `sep` must be a single string, not the number 0.

# bake method errors when needed non-standard role columns are missing

Code
Expand Down
26 changes: 26 additions & 0 deletions tests/testthat/_snaps/word_embeddings.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,32 @@
! Name collision occurred. The following variable names already exist:
* `wordembed_text_d1`

# bad args

Code
recipe(~., data = mtcars) %>% step_word_embeddings(aggregation = "wrong") %>%
prep()
Condition
Error in `step_word_embeddings()`:
! argument "embeddings" is missing, with no default

---

Code
recipe(~., data = mtcars) %>% step_word_embeddings(aggregation_default = "yes") %>%
prep()
Condition
Error in `step_word_embeddings()`:
! argument "embeddings" is missing, with no default

---

Code
recipe(~., data = mtcars) %>% step_word_embeddings(prefix = NULL) %>% prep()
Condition
Error in `step_word_embeddings()`:
! argument "embeddings" is missing, with no default

# bake method errors when needed non-standard role columns are missing

Code
Expand Down
46 changes: 46 additions & 0 deletions tests/testthat/test-tokenfilter.R
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,52 @@ test_that("tunable", {
)
})

# Argument validation for step_tokenfilter(). Each case adds the step with a
# single invalid argument to a recipe on `mtcars`, calls prep(), and snapshots
# the resulting error. The expressions are recorded verbatim in the snapshot
# file, so changing their text requires regenerating the snapshots.
test_that("bad args", {
# `percentage` is validated as a single TRUE/FALSE; a string must error.
expect_snapshot(
error = TRUE,
recipe(~., data = mtcars) %>%
step_tokenfilter(percentage = "yes") %>%
prep()
)
# `max_tokens` must be a whole number >= 0; a negative value must error.
expect_snapshot(
error = TRUE,
recipe(~., data = mtcars) %>%
step_tokenfilter(max_tokens = -4) %>%
prep()
)
# `filter_fun` must be a function or NULL; a number must error.
expect_snapshot(
error = TRUE,
recipe(~., data = mtcars) %>%
step_tokenfilter(filter_fun = -4) %>%
prep()
)
# With `percentage = TRUE`, `max_times` must be a number between 0 and 1.
expect_snapshot(
error = TRUE,
recipe(~., data = mtcars) %>%
step_tokenfilter(percentage = TRUE, max_times = 2) %>%
prep()
)
# With `percentage = TRUE`, `min_times` must be a number between 0 and 1.
expect_snapshot(
error = TRUE,
recipe(~., data = mtcars) %>%
step_tokenfilter(percentage = TRUE, min_times = 2) %>%
prep()
)
# With `percentage = FALSE`, `max_times` must be a whole number >= 0.
expect_snapshot(
error = TRUE,
recipe(~., data = mtcars) %>%
step_tokenfilter(percentage = FALSE, max_times = -1) %>%
prep()
)
# With `percentage = FALSE`, `min_times` must be a whole number >= 0.
expect_snapshot(
error = TRUE,
recipe(~., data = mtcars) %>%
step_tokenfilter(percentage = FALSE, min_times = -1) %>%
prep()
)
})


# Infrastructure ---------------------------------------------------------------

test_that("bake method errors when needed non-standard role columns are missing", {
Expand Down
Loading

0 comments on commit 11d4ece

Please sign in to comment.