Skip to content

Catch anyDuplicated() equivalents from dplyr, data.table #2795

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 14 commits into from
Jun 5, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@
* `boolean_arithmetic_linter()` finds many more cases like `sum(x | y) == 0` where the total of a known-logical vector is compared to 0 (#1580, @MichaelChirico).
* `expect_lint()` has a new argument `ignore_order` (default `FALSE`), which, if `TRUE`, allows the `checks=` to be provided in arbitary order vs. how `lint()` produces them (@MichaelChirico).
* `undesirable_function_linter()` accepts unnamed entries, treating them as undesirable functions, e.g. `undesirable_function_linter("sum")` (#2536, @MichaelChirico).
* `any_duplicated_linter()` is extended to recognize some usages from {dplyr} and {data.table} that could be replaced by `anyDuplicated()`, e.g. `n_distinct(col) == n()` or `uniqueN(col) == .N` (#2482, @MichaelChirico).

### New linters

Expand Down
109 changes: 89 additions & 20 deletions R/any_duplicated_linter.R
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,11 @@
#' linters = any_duplicated_linter()
#' )
#'
#' lint(
#' text = "DT[, uniqueN(col) == .N]",
#' linters = any_duplicated_linter()
#' )
#'
#' # okay
#' lint(
#' text = "anyDuplicated(x)",
Expand All @@ -30,12 +35,17 @@
#' linters = any_duplicated_linter()
#' )
#'
#' lint(
#' text = "anyDuplicated(DT, by = 'col') == 0L",
#' linters = any_duplicated_linter()
#' )
#'
#' @evalRd rd_tags("any_duplicated_linter")
#' @seealso [linters] for a complete list of linters available in lintr.
#' @export
any_duplicated_linter <- function() {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should the docs and examples be expanded as well?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure, good idea. I actually am not sure what a "canonical" (non-contrived) example using n_distinct() would look like TBH, so omitted for now.

any_duplicated_xpath <- "
following-sibling::expr[1][expr[1][SYMBOL_FUNCTION_CALL[text() = 'duplicated']]]
following-sibling::expr[1][expr[1]/SYMBOL_FUNCTION_CALL[text() = 'duplicated']]
/parent::expr[
count(expr) = 2
or (count(expr) = 3 and SYMBOL_SUB[text() = 'na.rm'])
Expand All @@ -56,32 +66,70 @@ any_duplicated_linter <- function() {
//{ c('EQ', 'NE', 'GT', 'LT') }
/parent::expr
/expr[
expr[1][SYMBOL_FUNCTION_CALL[text() = 'length']]
and expr[expr[1][
expr[1]/SYMBOL_FUNCTION_CALL[text() = 'length']
and expr/expr[1][
SYMBOL_FUNCTION_CALL[text() = 'unique']
and (
following-sibling::expr =
parent::expr
/parent::expr
/parent::expr
/expr
/expr[1][SYMBOL_FUNCTION_CALL[text()= 'length']]
/following-sibling::expr
or
following-sibling::expr[OP-DOLLAR or LBB]/expr[1] =
/parent::expr
/parent::expr
/expr
/expr[1][SYMBOL_FUNCTION_CALL[text() = 'length']]
/following-sibling::expr
or following-sibling::expr[OP-DOLLAR or LBB]/expr[1] =
parent::expr
/parent::expr
/parent::expr
/expr
/expr[1][SYMBOL_FUNCTION_CALL[text() = 'nrow']]
/following-sibling::expr
or parent::expr
/parent::expr
/parent::expr
/expr
/expr[1][SYMBOL_FUNCTION_CALL[text()= 'nrow']]
/following-sibling::expr
/expr[
SYMBOL[text() = '.N']
or (expr/SYMBOL_FUNCTION_CALL[text() = 'n'] and count(expr) = 1)
]
)
]]
]
]
")
length_unique_xpath <- paste(length_unique_xpath_parts, collapse = " | ")

distinct_xpath <- glue("
//{ c('EQ', 'NE', 'GT', 'LT') }
/parent::expr
/expr[
expr[1][
SYMBOL_FUNCTION_CALL[text() = 'uniqueN' or text() = 'n_distinct']
and (
following-sibling::expr =
parent::expr
/parent::expr
/expr
/expr[1][SYMBOL_FUNCTION_CALL[text() = 'length' or text() = 'nrow']]
/following-sibling::expr
or following-sibling::expr[OP-DOLLAR or LBB]/expr[1] =
parent::expr
/parent::expr
/expr
/expr[1][SYMBOL_FUNCTION_CALL[text() = 'nrow']]
/following-sibling::expr
or parent::expr
/parent::expr
/expr[
SYMBOL[text() = '.N']
or (expr/SYMBOL_FUNCTION_CALL[text() = 'n'] and count(expr) = 1)
]
)
]
]
")

uses_nrow_xpath <- "./parent::expr/expr/expr[1]/SYMBOL_FUNCTION_CALL[text() = 'nrow']"
uses_dtn_xpath <- "./parent::expr/expr/SYMBOL[text() = '.N']"
uses_dplyr_xpath <- "./parent::expr/expr/expr[1]/SYMBOL_FUNCTION_CALL[text() = 'n']"

Linter(linter_level = "expression", function(source_expression) {
xml <- source_expression$xml_parsed_content
Expand All @@ -96,18 +144,39 @@ any_duplicated_linter <- function() {
)

length_unique_expr <- xml_find_all(xml, length_unique_xpath)
lint_message <- ifelse(
is.na(xml_find_first(length_unique_expr, uses_nrow_xpath)),
"anyDuplicated(x) == 0L is better than length(unique(x)) == length(x).",
length_unique_lint_message <- character(length(length_unique_expr))
length_unique_lint_message[] <- "anyDuplicated(x) == 0L is better than length(unique(x)) == length(x)."
length_unique_lint_message[!is.na(xml_find_first(length_unique_expr, uses_nrow_xpath))] <-
"anyDuplicated(DF$col) == 0L is better than length(unique(DF$col)) == nrow(DF)"
)
length_unique_lint_message[!is.na(xml_find_first(length_unique_expr, uses_dtn_xpath))] <-
"anyDuplicated(x) == 0L is better than length(unique(x)) == .N"
length_unique_lint_message[!is.na(xml_find_first(length_unique_expr, uses_dplyr_xpath))] <-
"anyDuplicated(x) == 0L is better than length(unique(x)) == n()."
length_unique_lints <- xml_nodes_to_lints(
length_unique_expr,
source_expression = source_expression,
lint_message = lint_message,
lint_message = length_unique_lint_message,
type = "warning"
)

distinct_expr <- xml_find_all(xml, distinct_xpath)
distinct_lint_message_fmt <- character(length(distinct_expr))
distinct_lint_message_fmt[] <- "anyDuplicated(x) == 0L is better than %s(x) == length(x)."
distinct_lint_message_fmt[!is.na(xml_find_first(distinct_expr, uses_nrow_xpath))] <-
"anyDuplicated(DF$col) == 0L is better than %s(DF$col) == nrow(DF)"
distinct_lint_message_fmt[!is.na(xml_find_first(distinct_expr, uses_dtn_xpath))] <-
"anyDuplicated(x) == 0L is better than %s(x) == .N"
distinct_lint_message_fmt[!is.na(xml_find_first(distinct_expr, uses_dplyr_xpath))] <-
"anyDuplicated(x) == 0L is better than %s(x) == n()."

distinct_lint_message <- sprintf(distinct_lint_message_fmt, xp_call_name(distinct_expr))
distinct_lints <- xml_nodes_to_lints(
distinct_expr,
source_expression = source_expression,
lint_message = distinct_lint_message,
type = "warning"
)

c(any_duplicated_lints, length_unique_lints)
c(any_duplicated_lints, length_unique_lints, distinct_lints)
})
}
10 changes: 10 additions & 0 deletions man/any_duplicated_linter.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

35 changes: 28 additions & 7 deletions tests/testthat/test-any_duplicated_linter.R
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
test_that("any_duplicated_linter skips allowed usages", {
linter <- any_duplicated_linter()

expect_lint("x <- any(y)", NULL, linter)
expect_lint("y <- duplicated(z)", NULL, linter)
expect_no_lint("x <- any(y)", linter)
expect_no_lint("y <- duplicated(z)", linter)

# extended usage of any is not covered
expect_lint("any(duplicated(y), b)", NULL, linter)
expect_lint("any(b, duplicated(y))", NULL, linter)
expect_no_lint("any(duplicated(y), b)", linter)
expect_no_lint("any(b, duplicated(y))", linter)
})

test_that("any_duplicated_linter blocks simple disallowed usages", {
Expand All @@ -28,10 +28,10 @@ test_that("any_duplicated_linter catches length(unique()) equivalencies too", {

# non-matches
## different variable
expect_lint("length(unique(x)) == length(y)", NULL, linter)
expect_no_lint("length(unique(x)) == length(y)", linter)
## different table
expect_lint("length(unique(DF$x)) == nrow(DT)", NULL, linter)
expect_lint("length(unique(l1$DF$x)) == nrow(l2$DF)", NULL, linter)
expect_no_lint("length(unique(DF$x)) == nrow(DT)", linter)
expect_no_lint("length(unique(l1$DF$x)) == nrow(l2$DF)", linter)

# lintable usage
expect_lint("length(unique(x)) == length(x)", lint_msg_x, linter)
Expand All @@ -51,6 +51,27 @@ test_that("any_duplicated_linter catches length(unique()) equivalencies too", {
expect_lint("length(x) > length(unique(x))", lint_msg_x, linter)
})

test_that("dplyr & data.table equivalents are also linted", {
linter <- any_duplicated_linter()

expect_no_lint("uniqueN(x) == nrow(y)", linter)
expect_no_lint("n_distinct(x) == nrow(y)", linter)
expect_no_lint("x[, length(unique(col)) == .N()]", linter)
# some other n function, not dplyr::n
expect_no_lint("x %>% summarize(length(unique(col)) == n(2))", linter)
expect_no_lint("x %>% summarize(length(unique(col)) == n)", linter)

expect_lint("uniqueN(x) == nrow(x)", rex::rex("uniqueN(DF$col) == nrow(DF)"), linter)
expect_lint("data.table::uniqueN(x) == nrow(x)", rex::rex("uniqueN(DF$col) == nrow(DF)"), linter)
expect_lint("x[, length(unique(col)) == .N]", rex::rex("length(unique(x)) == .N"), linter)
expect_lint("x[, uniqueN(col) == .N]", rex::rex("uniqueN(x) == .N"), linter)

expect_lint("n_distinct(x) == nrow(x)", rex::rex("n_distinct(DF$col) == nrow(DF)"), linter)
expect_lint("dplyr::n_distinct(x) == nrow(x)", rex::rex("n_distinct(DF$col) == nrow(DF)"), linter)
expect_lint("x %>% summarize(length(unique(col)) == n())", rex::rex("length(unique(x)) == n()"), linter)
expect_lint("x %>% summarize(n_distinct(col) == n())", rex::rex("n_distinct(x) == n()"), linter)
})

test_that("any_duplicated_linter catches expression with two types of lint", {
linter <- any_duplicated_linter()
lint_msg <- rex::rex("anyDuplicated(DF$col) == 0L is better than length(unique(DF$col)) == nrow(DF)")
Expand Down
Loading