From 5be75f1483055ba2bdbf204a96d4bc09240e65a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kirill=20M=C3=BCller?= Date: Sun, 3 Nov 2024 07:38:12 +0100 Subject: [PATCH 1/2] feat: Client-side warning on join explosion --- R/dplyr.R | 2 + R/full_join.R | 2 +- R/inner_join.R | 2 +- R/join.R | 64 ++++++++++++++++++++++++ R/left_join.R | 2 +- R/right_join.R | 2 +- tests/testthat/_snaps/dplyr-join-rows.md | 33 ++++++++++++ tests/testthat/_snaps/dplyr-join.md | 9 ++++ tests/testthat/test-dplyr-join-rows.R | 1 - tests/testthat/test-dplyr-join.R | 1 - tools/00-funs.R | 6 +-- 11 files changed, 113 insertions(+), 11 deletions(-) diff --git a/R/dplyr.R b/R/dplyr.R index dd1129355..7d4fd7c7e 100644 --- a/R/dplyr.R +++ b/R/dplyr.R @@ -80,9 +80,11 @@ rows_check_y_unmatched <- dplyr$rows_check_y_unmatched rows_df_in_place <- dplyr$rows_df_in_place rowwise_df <- dplyr$rowwise_df slice_rows <- dplyr$slice_rows +stop_join <- dplyr$stop_join summarise_build <- dplyr$summarise_build summarise_cols <- dplyr$summarise_cols summarise_deprecate_variable_size <- dplyr$summarise_deprecate_variable_size the <- dplyr$the tick_if_needed <- dplyr$tick_if_needed +warn_join <- dplyr$warn_join warn_join_cross_by <- dplyr$warn_join_cross_by diff --git a/R/full_join.R b/R/full_join.R index 31bb03154..efd769ccb 100644 --- a/R/full_join.R +++ b/R/full_join.R @@ -17,7 +17,7 @@ full_join.duckplyr_df <- function(x, y, by = NULL, copy = FALSE, suffix = c(".x" "No implicit cross joins for {.code full_join()}" = is_cross_by(by), "{.arg multiple} not supported" = !identical(multiple, "all"), { - out <- rel_join_impl(x, y, by, "full", na_matches, suffix, keep, error_call) + out <- rel_join_impl(x, y, by, "full", na_matches, suffix, keep, relationship, error_call) return(out) } ) diff --git a/R/inner_join.R b/R/inner_join.R index 24fbbb71c..d09883942 100644 --- a/R/inner_join.R +++ b/R/inner_join.R @@ -19,7 +19,7 @@ inner_join.duckplyr_df <- function(x, y, by = NULL, copy = FALSE, suffix = c(".x "{.arg multiple} not supported" = !identical(multiple, "all"), "{.arg unmatched} not supported" = !identical(unmatched, "drop"), { - out <- rel_join_impl(x, y, by, "inner", na_matches, suffix, keep, error_call) + out <- rel_join_impl(x, y, by, "inner", na_matches, suffix, keep, relationship, error_call) return(out) } ) diff --git a/R/join.R b/R/join.R index 42dde6927..ceaf65581 100644 --- a/R/join.R +++ b/R/join.R @@ -6,6 +6,7 @@ rel_join_impl <- function( na_matches, suffix = c(".x", ".y"), keep = NULL, + relationship = NULL, error_call = caller_env() ) { mutating <- !(join %in% c("semi", "anti")) @@ -25,6 +26,10 @@ rel_join_impl <- function( by <- as_join_by(by, error_call = error_call) } + if (mutating) { + check_relationship(relationship, x, y, by, error_call = error_call) + } + x_by <- by$x y_by <- by$y x_rel <- duckdb_rel_from_df(x) @@ -136,3 +141,62 @@ rel_join_impl <- function( return(out) } + +check_relationship <- function(relationship, x, y, by, error_call) { + if (is_null(relationship)) { + # FIXME: Determine behavior based on option + if (!is_key(x, by$x) && !is_key(y, by$y)) { + warn_join( + message = c( + "Detected an unexpected many-to-many relationship between `x` and `y`.", + i = paste0( + "If a many-to-many relationship is expected, ", + "set `relationship = \"many-to-many\"` to silence this warning." + ) + ), + class = "dplyr_warning_join_relationship_many_to_many", + call = error_call + ) + } + return() + } + + if (relationship %in% c("one-to-many", "one-to-one")) { + if (!is_key(x, by$x)) { + stop_join( + message = c( + glue("Each row in `{x_name}` must match at most 1 row in `{y_name}`."), + ), + class = paste0("dplyr_error_join_relationship_", gsub("-", "_", relationship)), + call = error_call + ) + } + } + + if (relationship %in% c("many-to-one", "one-to-one")) { + if (!is_key(y, by$y)) { + stop_join( + message = c( + glue("Each row in `{y_name}` must match at most 1 row in `{x_name}`."), + ), + class = paste0("dplyr_error_join_relationship_", gsub("-", "_", relationship)), + call = error_call + ) + } + } +} + +is_key <- function(x, cols) { + local_options(duckdb.materialize_message = FALSE) + + rows <- + x %>% + # FIXME: Why does this materialize + # as_duckplyr_tibble() %>% + summarize(.by = c(!!!syms(cols)), `___n` = n()) %>% + filter(`___n` > 1L) %>% + head(1L) %>% + nrow() + + rows == 0 +} diff --git a/R/left_join.R b/R/left_join.R index 2d0b0d31d..eb52b8327 100644 --- a/R/left_join.R +++ b/R/left_join.R @@ -20,7 +20,7 @@ left_join.duckplyr_df <- function(x, y, by = NULL, copy = FALSE, suffix = c(".x" "{.arg multiple} not supported" = !identical(multiple, "all"), "{.arg unmatched} not supported" = !identical(unmatched, "drop"), { - out <- rel_join_impl(x, y, by, "left", na_matches, suffix, keep, error_call) + out <- rel_join_impl(x, y, by, "left", na_matches, suffix, keep, relationship, error_call) return(out) } ) diff --git a/R/right_join.R b/R/right_join.R index ae8073839..c1572febf 100644 --- a/R/right_join.R +++ b/R/right_join.R @@ -19,7 +19,7 @@ right_join.duckplyr_df <- function(x, y, by = NULL, copy = FALSE, suffix = c(".x "{.arg multiple} not supported" = !identical(multiple, "all"), "{.arg unmatched} not supported" = !identical(unmatched, "drop"), { - out <- rel_join_impl(x, y, by, "right", na_matches, suffix, keep, error_call) + out <- rel_join_impl(x, y, by, "right", na_matches, suffix, keep, relationship, error_call) return(out) } ) diff --git a/tests/testthat/_snaps/dplyr-join-rows.md b/tests/testthat/_snaps/dplyr-join-rows.md index 07d31a943..f478fd51f 100644 --- a/tests/testthat/_snaps/dplyr-join-rows.md +++ b/tests/testthat/_snaps/dplyr-join-rows.md @@ -65,6 +65,39 @@ ! Each row in `x` must match at most 1 row in `y`. i Row 1 of `x` matches multiple rows in `y`. +# join_rows() gives meaningful many-to-many warnings + + Code + join_rows(c(1, 1), c(1, 1)) + Condition + Warning: + Detected an unexpected many-to-many relationship between `x` and `y`. + i Row 1 of `x` matches multiple rows in `y`. + i Row 1 of `y` matches multiple rows in `x`. + i If a many-to-many relationship is expected, set `relationship = "many-to-many"` to silence this warning. + Output + $x + [1] 1 1 2 2 + + $y + [1] 1 2 1 2 + + +--- + + Code + duckplyr_left_join(df, df, by = join_by(x)) + Condition + Warning in `duckplyr_left_join()`: + Detected an unexpected many-to-many relationship between `x` and `y`. + i If a many-to-many relationship is expected, set `relationship = "many-to-many"` to silence this warning. + Output + x + 1 1 + 2 1 + 3 1 + 4 1 + # join_rows() gives meaningful error message on unmatched rows Code diff --git a/tests/testthat/_snaps/dplyr-join.md b/tests/testthat/_snaps/dplyr-join.md index 73111ec5a..28ebdbb85 100644 --- a/tests/testthat/_snaps/dplyr-join.md +++ b/tests/testthat/_snaps/dplyr-join.md @@ -50,6 +50,15 @@ Error: ! `na_matches` must be one of "na" or "never", not "foo". +# mutating joins trigger many-to-many warning + + Code + out <- duckplyr_left_join(df, df, join_by(x)) + Condition + Warning in `duckplyr_left_join()`: + Detected an unexpected many-to-many relationship between `x` and `y`. + i If a many-to-many relationship is expected, set `relationship = "many-to-many"` to silence this warning. + # mutating joins compute common columns Code diff --git a/tests/testthat/test-dplyr-join-rows.R b/tests/testthat/test-dplyr-join-rows.R index 016df9b5d..d4ae8b4d4 100644 --- a/tests/testthat/test-dplyr-join-rows.R +++ b/tests/testthat/test-dplyr-join-rows.R @@ -190,7 +190,6 @@ test_that("join_rows() gives meaningful many-to-one errors", { }) test_that("join_rows() gives meaningful many-to-many warnings", { - skip("TODO duckdb") expect_snapshot({ join_rows(c(1, 1), c(1, 1)) }) diff --git a/tests/testthat/test-dplyr-join.R b/tests/testthat/test-dplyr-join.R index 8e047639b..429e7f906 100644 --- a/tests/testthat/test-dplyr-join.R +++ b/tests/testthat/test-dplyr-join.R @@ -353,7 +353,6 @@ test_that("join_filter() validates arguments", { }) test_that("mutating joins trigger many-to-many warning", { - skip("TODO duckdb") df <- tibble(x = c(1, 1)) expect_snapshot(out <- duckplyr_left_join(df, df, join_by(x))) }) diff --git a/tools/00-funs.R b/tools/00-funs.R index fd43dab5d..4f4bfc94d 100644 --- a/tools/00-funs.R +++ b/tools/00-funs.R @@ -237,14 +237,10 @@ duckplyr_tests <- head(n = -1, list( NULL ), "test-join-rows.R" = c( - "join_rows() gives meaningful many-to-many warnings", NULL ), "test-join.R" = c( - "mutating joins trigger multiple match warning", - "mutating joins don't trigger multiple match warning when called indirectly", - - "mutating joins trigger many-to-many warning", + # FIXME: How to detect an indirect call? "mutating joins don't trigger many-to-many warning when called indirectly", NULL ), From 3296a221e5d2bd82dac9ab317aaacbe6dd93cdca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kirill=20M=C3=BCller?= Date: Sun, 26 Jan 2025 07:30:15 +0100 Subject: [PATCH 2/2] Squashed commit of the following: MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit db5d405f508af70a3abfda7d8fed0e56510156d2 Author: Kirill Müller Date: Sun Jan 26 06:54:20 2025 +0100 Sync commit 7bd1872e95563cc7df2dca20e14e56f741b82ffa Author: Kirill Müller Date: Fri Jan 24 21:23:24 2025 +0100 Remove dplyr compat code commit 8d6a573227b2462e4a9b15b45ea64e9aad56532c Author: Kirill Müller Date: Wed Jan 22 16:55:00 2025 +0100 fix: Avoid forwarding `is.na()` to `is.nan()` to support non-numeric data --- R/relational-duckdb.R | 10 +--------- tests/testthat/test-rel_api.R | 12 ++++++------ tools/tpch-raw-oo/13.R | 2 +- tools/tpch-raw/13.R | 2 +- 4 files changed, 9 insertions(+), 17 deletions(-) diff --git a/R/relational-duckdb.R b/R/relational-duckdb.R index ed1db951c..431e80c06 100644 --- a/R/relational-duckdb.R +++ b/R/relational-duckdb.R @@ -33,7 +33,7 @@ duckplyr_macros <- c( # "___divide" = "(x, y) AS CASE WHEN y = 0 THEN CASE WHEN x = 0 THEN CAST('NaN' AS double) WHEN x > 0 THEN CAST('+Infinity' AS double) ELSE CAST('-Infinity' AS double) END ELSE CAST(x AS double) / y END", # - "is.na" = "(x) AS (x IS NULL OR isnan(x))", + "is.na" = "(x) AS (x IS NULL)", "n" = "() AS CAST(COUNT(*) AS int32)", # "___log10" = "(x) AS CASE WHEN x < 0 THEN CAST('NaN' AS double) WHEN x = 0 THEN CAST('-Inf' AS double) ELSE log10(x) END", @@ -180,14 +180,6 @@ check_df_for_rel <- function(df) { if (!identical(df_attrib, roundtrip_attrib)) { cli::cli_abort("Attributes are lost during conversion. Affected column: {.var {names(df)[[i]]}}.") } - # Always check roundtrip for timestamp columns - # duckdb uses microsecond precision only, this is in some cases - # less than R does - if (inherits(df[[i]], "POSIXct")) { - if (!identical(df[[i]], roundtrip[[i]])) { - cli::cli_abort("Imperfect roundtrip. Affected column: {.var {names(df)[[i]]}}.") - } - } } } diff --git a/tests/testthat/test-rel_api.R b/tests/testthat/test-rel_api.R index c57f6363a..7be8dacce 100644 --- a/tests/testthat/test-rel_api.R +++ b/tests/testthat/test-rel_api.R @@ -9490,7 +9490,7 @@ test_that("relational mutate(d = a %in% NA_real_) order-preserving", { drv <- duckdb::duckdb() con <- DBI::dbConnect(drv) experimental <- FALSE - invisible(DBI::dbExecute(con, 'CREATE MACRO "is.na"(x) AS (x IS NULL OR isnan(x))')) + invisible(DBI::dbExecute(con, 'CREATE MACRO "is.na"(x) AS (x IS NULL)')) df1 <- data.frame(a = seq(1, 6, by = 1), b = 2, g = c(1L, 2L, 2L, 3L, 3L, 3L)) "mutate" @@ -9634,7 +9634,7 @@ test_that("relational mutate(d = NA_real_, e = is.na(d)) order-preserving", { drv <- duckdb::duckdb() con <- DBI::dbConnect(drv) experimental <- FALSE - invisible(DBI::dbExecute(con, 'CREATE MACRO "is.na"(x) AS (x IS NULL OR isnan(x))')) + invisible(DBI::dbExecute(con, 'CREATE MACRO "is.na"(x) AS (x IS NULL)')) df1 <- data.frame(a = seq(1, 6, by = 1), b = 2, g = c(1L, 2L, 2L, 3L, 3L, 3L)) "mutate" @@ -9721,7 +9721,7 @@ test_that("relational mutate(d = NaN, e = is.na(d)) order-preserving", { drv <- duckdb::duckdb() con <- DBI::dbConnect(drv) experimental <- FALSE - invisible(DBI::dbExecute(con, 'CREATE MACRO "is.na"(x) AS (x IS NULL OR isnan(x))')) + invisible(DBI::dbExecute(con, 'CREATE MACRO "is.na"(x) AS (x IS NULL)')) df1 <- data.frame(a = seq(1, 6, by = 1), b = 2, g = c(1L, 2L, 2L, 3L, 3L, 3L)) "mutate" @@ -13951,7 +13951,7 @@ test_that("relational mutate(d = a %in% NA_real_) order-enforcing", { drv <- duckdb::duckdb() con <- DBI::dbConnect(drv) experimental <- FALSE - invisible(DBI::dbExecute(con, 'CREATE MACRO "is.na"(x) AS (x IS NULL OR isnan(x))')) + invisible(DBI::dbExecute(con, 'CREATE MACRO "is.na"(x) AS (x IS NULL)')) df1 <- data.frame(a = seq(1, 6, by = 1), b = 2, g = c(1L, 2L, 2L, 3L, 3L, 3L)) "mutate" @@ -14110,7 +14110,7 @@ test_that("relational mutate(d = NA_real_, e = is.na(d)) order-enforcing", { drv <- duckdb::duckdb() con <- DBI::dbConnect(drv) experimental <- FALSE - invisible(DBI::dbExecute(con, 'CREATE MACRO "is.na"(x) AS (x IS NULL OR isnan(x))')) + invisible(DBI::dbExecute(con, 'CREATE MACRO "is.na"(x) AS (x IS NULL)')) df1 <- data.frame(a = seq(1, 6, by = 1), b = 2, g = c(1L, 2L, 2L, 3L, 3L, 3L)) "mutate" @@ -14202,7 +14202,7 @@ test_that("relational mutate(d = NaN, e = is.na(d)) order-enforcing", { drv <- duckdb::duckdb() con <- DBI::dbConnect(drv) experimental <- FALSE - invisible(DBI::dbExecute(con, 'CREATE MACRO "is.na"(x) AS (x IS NULL OR isnan(x))')) + invisible(DBI::dbExecute(con, 'CREATE MACRO "is.na"(x) AS (x IS NULL)')) df1 <- data.frame(a = seq(1, 6, by = 1), b = 2, g = c(1L, 2L, 2L, 3L, 3L, 3L)) "mutate" diff --git a/tools/tpch-raw-oo/13.R b/tools/tpch-raw-oo/13.R index be0651a03..ad9de90ec 100644 --- a/tools/tpch-raw-oo/13.R +++ b/tools/tpch-raw-oo/13.R @@ -18,7 +18,7 @@ invisible( 'CREATE MACRO "if_else"(test, yes, no) AS (CASE WHEN test IS NULL THEN NULL ELSE CASE WHEN test THEN yes ELSE no END END)' ) ) -invisible(DBI::dbExecute(con, 'CREATE MACRO "is.na"(x) AS (x IS NULL OR isnan(x))')) +invisible(DBI::dbExecute(con, 'CREATE MACRO "is.na"(x) AS (x IS NULL)')) invisible(DBI::dbExecute(con, 'CREATE MACRO "n"() AS CAST(COUNT(*) AS int32)')) df1 <- orders "filter" diff --git a/tools/tpch-raw/13.R b/tools/tpch-raw/13.R index 699889767..8d4064f6a 100644 --- a/tools/tpch-raw/13.R +++ b/tools/tpch-raw/13.R @@ -18,7 +18,7 @@ invisible( 'CREATE MACRO "if_else"(test, yes, no) AS (CASE WHEN test IS NULL THEN NULL ELSE CASE WHEN test THEN yes ELSE no END END)' ) ) -invisible(DBI::dbExecute(con, 'CREATE MACRO "is.na"(x) AS (x IS NULL OR isnan(x))')) +invisible(DBI::dbExecute(con, 'CREATE MACRO "is.na"(x) AS (x IS NULL)')) invisible(DBI::dbExecute(con, 'CREATE MACRO "n"() AS CAST(COUNT(*) AS int32)')) df1 <- orders "filter"