Skip to content

Request non-equal join: closest() supports dual-sided inequality <> #6992

@zhjx19

Description

@zhjx19

For example, I now have the following Table 1 and Table 2. I want to find a value2 from both sides that is closest to the value based on matching the IDs. However, closest() only supports an inequality involving one of: >, >=, <, or <=.

library(purrr)
library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union
df1 = tibble(
  ID = c("NO1", "NO1", "NO2", "NO3", "NO3", "NO3", "NO4", "NO5", 
         "NO5", "NO5", "NO5", "NO6", "NO7", "NO7", "NO7", "NO7", 
         "NO7", "NO8", "NO9", "NO9"), 
  value = c(11, 25, 22, 24, 29, 1.75, 26, 27, 55, 44, 2.1, 31, 
         35, 33, 410, 21, 1.44, 80, 75, 52))
df1
#> # A tibble: 20 × 2
#>    ID    value2
#>    <chr>  <dbl>
#>  1 NO1    11   
#>  2 NO1    25   
#>  3 NO2    22   
#>  4 NO3    24   
#>  5 NO3    29   
#>  6 NO3     1.75
#>  7 NO4    26   
#>  8 NO5    27   
#>  9 NO5    55   
#> 10 NO5    44   
#> 11 NO5     2.1 
#> 12 NO6    31   
#> 13 NO7    35   
#> 14 NO7    33   
#> 15 NO7   410   
#> 16 NO7    21   
#> 17 NO7     1.44
#> 18 NO8    80   
#> 19 NO9    75   
#> 20 NO9    52
df2 = tibble(
  编号 = c("NO1", "NO1", "NO2", "NO3", "NO3", "NO4", "NO5", "NO5", 
         "NO5", "NO5", "NO6", "NO7", "NO7", "NO7", "NO8", "NO9", "NO9"), 
  数值2 = c(11.2, 24.6, 21.6, 22, 29.5, 26.1, 26.9, 26.4, 54.3,
            42.1, 30.2, 34.7, 31.2, 20.59, 79.8, 74.2, 50.9))
df2
#> # A tibble: 17 × 2
#>    编号  数值2
#>    <chr> <dbl>
#>  1 NO1    11.2
#>  2 NO1    24.6
#>  3 NO2    21.6
#>  4 NO3    22  
#>  5 NO3    29.5
#>  6 NO4    26.1
#>  7 NO5    26.9
#>  8 NO5    26.4
#>  9 NO5    54.3
#> 10 NO5    42.1
#> 11 NO6    30.2
#> 12 NO7    34.7
#> 13 NO7    31.2
#> 14 NO7    20.6
#> 15 NO8    79.8
#> 16 NO9    74.2
#> 17 NO9    50.9

The manual method I can think of right now is:

df1 %>%
  left_join(df2, join_by(编号), relationship = "many-to-many") %>%
  group_nest(编号, 数值) %>%
  mutate(数值2 = map2_dbl(数值, data, \(x, y) y$数值2[which.min(abs(y$数值2 - x))]),
         误差 = abs(数值 - 数值2)) %>%
  select(-data)
#> # A tibble: 20 × 4
#>    编号    数值 数值2    误差
#>    <chr>  <dbl> <dbl>   <dbl>
#>  1 NO1    11     11.2   0.200
#>  2 NO1    25     24.6   0.400
#>  3 NO2    22     21.6   0.400
#>  4 NO3     1.75  22    20.2  
#>  5 NO3    24     22     2    
#>  6 NO3    29     29.5   0.5  
#>  7 NO4    26     26.1   0.100
#>  8 NO5     2.1   26.4  24.3  
#>  9 NO5    27     26.9   0.100
#> 10 NO5    44     42.1   1.90 
#> 11 NO5    55     54.3   0.700
#> 12 NO6    31     30.2   0.800
#> 13 NO7     1.44  20.6  19.2  
#> 14 NO7    21     20.6   0.41 
#> 15 NO7    33     34.7   1.70 
#> 16 NO7    35     34.7   0.300
#> 17 NO7   410     34.7 375.   
#> 18 NO8    80     79.8   0.200
#> 19 NO9    52     50.9   1.10 
#> 20 NO9    75     74.2   0.800

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions