Skip to content

Catch anyDuplicated() equivalents from dplyr, data.table #2795

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 9 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
* Setting `exclusions` supports globs like `knitr*` to exclude files/directories with a pattern (#1554, @MichaelChirico).
* `object_name_linter()` and `object_length_linter()` apply to objects assigned with `assign()` or generics created with `setGeneric()` (#1665, @MichaelChirico).
* `object_usage_linter()` gains argument `interpret_extensions` to govern which false positive-prone common syntaxes should be checked for used objects (#1472, @MichaelChirico). Currently `"glue"` (renamed from earlier argument `interpret_glue`) and `"rlang"` are supported. The latter newly covers usage of the `.env` pronoun like `.env$key`, where `key` was previously missed as being a used variable.
* `any_duplicated_linter()` is extended to recognize some usages from {dplyr} and {data.table} that could be replaced by `anyDuplicated()`, e.g. `n_distinct(col) == n()` or `uniqueN(col) == .N` (#2482, @MichaelChirico).

### New linters

Expand Down
99 changes: 79 additions & 20 deletions R/any_duplicated_linter.R
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@
#' @export
any_duplicated_linter <- function() {
any_duplicated_xpath <- "
following-sibling::expr[1][expr[1][SYMBOL_FUNCTION_CALL[text() = 'duplicated']]]
following-sibling::expr[1][expr[1]/SYMBOL_FUNCTION_CALL[text() = 'duplicated']]
/parent::expr[
count(expr) = 2
or (count(expr) = 3 and SYMBOL_SUB[text() = 'na.rm'])
Expand All @@ -56,32 +56,70 @@ any_duplicated_linter <- function() {
//{ c('EQ', 'NE', 'GT', 'LT') }
/parent::expr
/expr[
expr[1][SYMBOL_FUNCTION_CALL[text() = 'length']]
and expr[expr[1][
expr[1]/SYMBOL_FUNCTION_CALL[text() = 'length']
and expr/expr[1][
SYMBOL_FUNCTION_CALL[text() = 'unique']
and (
following-sibling::expr =
parent::expr
/parent::expr
/parent::expr
/expr
/expr[1][SYMBOL_FUNCTION_CALL[text()= 'length']]
/following-sibling::expr
or
following-sibling::expr[OP-DOLLAR or LBB]/expr[1] =
/parent::expr
/parent::expr
/expr
/expr[1][SYMBOL_FUNCTION_CALL[text() = 'length']]
/following-sibling::expr
or following-sibling::expr[OP-DOLLAR or LBB]/expr[1] =
parent::expr
/parent::expr
/parent::expr
/expr
/expr[1][SYMBOL_FUNCTION_CALL[text() = 'nrow']]
/following-sibling::expr
or parent::expr
/parent::expr
/parent::expr
/expr
/expr[1][SYMBOL_FUNCTION_CALL[text()= 'nrow']]
/following-sibling::expr
/expr[
SYMBOL[text() = '.N']
or (expr/SYMBOL_FUNCTION_CALL[text() = 'n'] and count(expr) = 1)
]
)
]]
]
]
")
length_unique_xpath <- paste(length_unique_xpath_parts, collapse = " | ")

distinct_xpath <- glue("
//{ c('EQ', 'NE', 'GT', 'LT') }
/parent::expr
/expr[
expr[1][
SYMBOL_FUNCTION_CALL[text() = 'uniqueN' or text() = 'n_distinct']
and (
following-sibling::expr =
parent::expr
/parent::expr
/expr
/expr[1][SYMBOL_FUNCTION_CALL[text() = 'length' or text() = 'nrow']]
/following-sibling::expr
or following-sibling::expr[OP-DOLLAR or LBB]/expr[1] =
parent::expr
/parent::expr
/expr
/expr[1][SYMBOL_FUNCTION_CALL[text() = 'nrow']]
/following-sibling::expr
or parent::expr
/parent::expr
/expr[
SYMBOL[text() = '.N']
or (expr/SYMBOL_FUNCTION_CALL[text() = 'n'] and count(expr) = 1)
]
)
]
]
")

uses_nrow_xpath <- "./parent::expr/expr/expr[1]/SYMBOL_FUNCTION_CALL[text() = 'nrow']"
uses_dtn_xpath <- "./parent::expr/expr/SYMBOL[text() = '.N']"
uses_dplyr_xpath <- "./parent::expr/expr/expr[1]/SYMBOL_FUNCTION_CALL[text() = 'n']"

Linter(linter_level = "expression", function(source_expression) {
xml <- source_expression$xml_parsed_content
Expand All @@ -96,18 +134,39 @@ any_duplicated_linter <- function() {
)

length_unique_expr <- xml_find_all(xml, length_unique_xpath)
lint_message <- ifelse(
is.na(xml_find_first(length_unique_expr, uses_nrow_xpath)),
"anyDuplicated(x) == 0L is better than length(unique(x)) == length(x).",
length_unique_lint_message <- character(length(length_unique_expr))
length_unique_lint_message[] <- "anyDuplicated(x) == 0L is better than length(unique(x)) == length(x)."
length_unique_lint_message[!is.na(xml_find_first(length_unique_expr, uses_nrow_xpath))] <-
"anyDuplicated(DF$col) == 0L is better than length(unique(DF$col)) == nrow(DF)"
)
length_unique_lint_message[!is.na(xml_find_first(length_unique_expr, uses_dtn_xpath))] <-
"anyDuplicated(x) == 0L is better than length(unique(x)) == .N"
length_unique_lint_message[!is.na(xml_find_first(length_unique_expr, uses_dplyr_xpath))] <-
"anyDuplicated(x) == 0L is better than length(unique(x)) == n()."
length_unique_lints <- xml_nodes_to_lints(
length_unique_expr,
source_expression = source_expression,
lint_message = lint_message,
lint_message = length_unique_lint_message,
type = "warning"
)

distinct_expr <- xml_find_all(xml, distinct_xpath)
distinct_lint_message_fmt <- character(length(distinct_expr))
distinct_lint_message_fmt[] <- "anyDuplicated(x) == 0L is better than %s(x) == length(x)."
distinct_lint_message_fmt[!is.na(xml_find_first(distinct_expr, uses_nrow_xpath))] <-
"anyDuplicated(DF$col) == 0L is better than %s(DF$col) == nrow(DF)"
distinct_lint_message_fmt[!is.na(xml_find_first(distinct_expr, uses_dtn_xpath))] <-
"anyDuplicated(x) == 0L is better than %s(x) == .N"
distinct_lint_message_fmt[!is.na(xml_find_first(distinct_expr, uses_dplyr_xpath))] <-
"anyDuplicated(x) == 0L is better than %s(x) == n()."

distinct_lint_message <- sprintf(distinct_lint_message_fmt, xp_call_name(distinct_expr))
distinct_lints <- xml_nodes_to_lints(
distinct_expr,
source_expression = source_expression,
lint_message = distinct_lint_message,
type = "warning"
)

c(any_duplicated_lints, length_unique_lints)
c(any_duplicated_lints, length_unique_lints, distinct_lints)
})
}
35 changes: 28 additions & 7 deletions tests/testthat/test-any_duplicated_linter.R
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
test_that("any_duplicated_linter skips allowed usages", {
linter <- any_duplicated_linter()

expect_lint("x <- any(y)", NULL, linter)
expect_lint("y <- duplicated(z)", NULL, linter)
expect_no_lint("x <- any(y)", linter)
expect_no_lint("y <- duplicated(z)", linter)

# extended usage of any is not covered
expect_lint("any(duplicated(y), b)", NULL, linter)
expect_lint("any(b, duplicated(y))", NULL, linter)
expect_no_lint("any(duplicated(y), b)", linter)
expect_no_lint("any(b, duplicated(y))", linter)
})

test_that("any_duplicated_linter blocks simple disallowed usages", {
Expand All @@ -28,10 +28,10 @@ test_that("any_duplicated_linter catches length(unique()) equivalencies too", {

# non-matches
## different variable
expect_lint("length(unique(x)) == length(y)", NULL, linter)
expect_no_lint("length(unique(x)) == length(y)", linter)
## different table
expect_lint("length(unique(DF$x)) == nrow(DT)", NULL, linter)
expect_lint("length(unique(l1$DF$x)) == nrow(l2$DF)", NULL, linter)
expect_no_lint("length(unique(DF$x)) == nrow(DT)", linter)
expect_no_lint("length(unique(l1$DF$x)) == nrow(l2$DF)", linter)

# lintable usage
expect_lint("length(unique(x)) == length(x)", lint_msg_x, linter)
Expand All @@ -51,6 +51,27 @@ test_that("any_duplicated_linter catches length(unique()) equivalencies too", {
expect_lint("length(x) > length(unique(x))", lint_msg_x, linter)
})

test_that("dplyr & data.table equivalents are also linted", {
linter <- any_duplicated_linter()

expect_no_lint("uniqueN(x) == nrow(y)", linter)
expect_no_lint("n_distinct(x) == nrow(y)", linter)
expect_no_lint("x[, length(unique(col)) == .N()]", linter)
# some other n function, not dplyr::n
expect_no_lint("x %>% summarize(length(unique(col)) == n(2))", linter)
expect_no_lint("x %>% summarize(length(unique(col)) == n)", linter)

expect_lint("uniqueN(x) == nrow(x)", rex::rex("uniqueN(DF$col) == nrow(DF)"), linter)
expect_lint("data.table::uniqueN(x) == nrow(x)", rex::rex("uniqueN(DF$col) == nrow(DF)"), linter)
expect_lint("x[, length(unique(col)) == .N]", rex::rex("length(unique(x)) == .N"), linter)
expect_lint("x[, uniqueN(col) == .N]", rex::rex("uniqueN(x) == .N"), linter)

expect_lint("n_distinct(x) == nrow(x)", rex::rex("n_distinct(DF$col) == nrow(DF)"), linter)
expect_lint("dplyr::n_distinct(x) == nrow(x)", rex::rex("n_distinct(DF$col) == nrow(DF)"), linter)
expect_lint("x %>% summarize(length(unique(col)) == n())", rex::rex("length(unique(x)) == n()"), linter)
expect_lint("x %>% summarize(n_distinct(col) == n())", rex::rex("n_distinct(x) == n()"), linter)
})

test_that("any_duplicated_linter catches expression with two types of lint", {
linter <- any_duplicated_linter()
lint_msg <- rex::rex("anyDuplicated(DF$col) == 0L is better than length(unique(DF$col)) == nrow(DF)")
Expand Down
Loading