Skip to content

Add step to do deduplication #1444

@EmilHvitfeldt

Description

@EmilHvitfeldt

Possible new name: `step_select_unique`.

# Attach tidymodels; the `ames` data used below is presumably made available
# by it -- TODO confirm.
library(tidymodels)

# Print the data for a quick look.
ames

# Optional: uncomment to add synthetic duplicate columns (an exact copy, a
# rescaled integer recoding, and a shuffled integer recoding) for testing.
#ames$Lot_Frontage_dup <- ames$Lot_Frontage
#ames$MS_SubClass_dup <- as.integer(ames$MS_SubClass) * 4
#ames$MS_Zoning_dup <- sample(length(levels(ames$MS_Zoning)))[as.integer(ames$MS_Zoning)]

# Every unordered pair of column names: a 2-row matrix, one pair per column.
col_comb <- combn(x = names(ames), m = 2)

# Start with no detected duplicates.
dups <- NULL
dups_ind <- NULL

#' Test whether two vectors are one-to-one recodings of each other
#'
#' Cross-tabulates `x` against `y` and checks that every distinct value of
#' `x` co-occurs with exactly one distinct value of `y`, and vice versa.
#' Missing values are counted as a value of their own, so a column that is
#' `NA` where the other column varies is not reported as a duplicate.
#'
#' @param x,y Vectors (or factors) of the same length.
#' @return `TRUE` if the non-empty cells of the cross-tabulation form a
#'   one-to-one mapping, `FALSE` otherwise.
identical_cross <- function(x, y) {
  # `useNA = "ifany"` keeps rows with missing values in the table; the
  # default would silently drop them before the comparison.
  res <- table(x, y, useNA = "ifany") != 0
  # At most one occupied cell per column and per row means a one-to-one map.
  all(colSums(res) < 2) && all(rowSums(res) < 2)
}

# Time the pairwise scan; it runs `identical_cross()` once per column pair.
tictoc::tic()
# One logical per pair: TRUE when the two columns map one-to-one onto each
# other. `vapply()` avoids growing the result vectors inside a loop.
is_dup <- vapply(
  seq_len(ncol(col_comb)),
  function(i) {
    cols <- col_comb[, i]
    identical_cross(ames[[cols[[1]]]], ames[[cols[[2]]]])
  },
  logical(1)
)
tictoc::toc()

# Positions of the flagged pairs in `col_comb`, and the first column name of
# each flagged pair.
dups_ind <- which(is_dup)
dups <- col_comb[1, dups_ind]

length(dups)

dups

dups_ind[1]
# `drop = FALSE` keeps the 2-row matrix layout even when only one pair matched.
col_comb[, dups_ind, drop = FALSE]

# Occupancy table for one specific pair, kept for manual inspection.
res <- table(ames$MS_SubClass, ames$MS_Zoning) != 0

Metadata

Metadata

Assignees

No one assigned

    Labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions