Match the closest observations from one dataset to a key dataset.

Similar to survival::neardate, but chooses the closest match in either direction, optionally restricted to an asymmetric range around the key value.

e_match_closest_in_range(
  dat_to_match,
  id_vars_to_match,
  val_var_to_match,
  dat_key,
  id_vars_key,
  val_var_key,
  diff_lower = -Inf,
  diff_upper = +Inf,
  sw_criteria = c("closest", "minimum", "maximum")[1],
  sw_return_key_vars = FALSE
)

Arguments

dat_to_match: data to match to the key dataset
id_vars_to_match: associated ID variables in data to match
val_var_to_match: associated value variable in data to match
dat_key: key dataset
id_vars_key: ID variables in key dataset
val_var_key: value variable to determine closeness in key dataset
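diff_lower: lower limit for the difference (value to match minus key value); negative values allow matches below the key value, and -Inf (default) sets no lower limit
diff_upper: upper limit for the difference (value to match minus key value); positive values allow matches above the key value, and +Inf (default) sets no upper limit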
sw_criteria: criterion for choosing among candidate matches (useful when the range limits diff_lower and diff_upper are used): "closest" to the key value, or the "minimum" or "maximum" value within the range.
sw_return_key_vars: TRUE/FALSE; if TRUE, also return the key value (val_var_key__) and the difference from it (val_diff__), for use in matching when there are multiple records per ID

Value

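dat_to_match restricted to only those unique observations that are closest to the key data (or the minimum or maximum within range, per sw_criteria); IDs that are absent from the key data or have no observation within range are not returned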

Details

Can also be used to match closest within a range of dates in the future by setting diff_lower and diff_upper to be positive numbers, e.g., 5 and 7.

Examples


set.seed(1)

dat_key <-
  tidyr::expand_grid(
    key1 = c("a", "b", "c")
  , key2 = c("x", "y")
  ) %>%
  dplyr::mutate(
    value = 1:dplyr::n()
  )

dat_to_match <-
  tidyr::expand_grid(
    key1_m = c("a", "b")      # no "c"
  , key2_m = c("x", "y", "z") # added "z"
  ) %>%
  dplyr::slice(
    sample.int(n = 2*3, size = 4 * 2*3, replace = TRUE) # produce multiple per obs
  ) %>%
  dplyr::mutate(
    value_m = runif(n = dplyr::n(), min = -5, max = 10)
  , other1  = rnorm(dplyr::n())
  , other2  = rnorm(dplyr::n())
  ) %>%
  dplyr::arrange(
    key1_m, key2_m
  )

dat_to_match_sub <-
  e_match_closest_in_range(
    dat_to_match      = dat_to_match
  , id_vars_to_match  = c("key1_m", "key2_m")
  , val_var_to_match  = "value_m"
  , dat_key           = dat_key
  , id_vars_key       = c("key1"  , "key2"  )
  , val_var_key       = "value"
  , diff_lower        = -Inf
  , diff_upper        = +Inf
  )

dat_key          %>% print()
#> # A tibble: 6 × 3
#>   key1  key2  value
#>   <chr> <chr> <int>
#> 1 a     x         1
#> 2 a     y         2
#> 3 b     x         3
#> 4 b     y         4
#> 5 c     x         5
#> 6 c     y         6
dat_to_match     %>% print(n = Inf)
#> # A tibble: 24 × 5
#>    key1_m key2_m value_m  other1  other2
#>    <chr>  <chr>    <dbl>   <dbl>   <dbl>
#>  1 a      x        8.05  -0.156   0.398 
#>  2 a      x        2.23  -0.478   0.341 
#>  3 a      x        5.86  -0.394   2.40  
#>  4 a      x        6.84   0.557   1.47  
#>  5 a      x        5.98   0.365   0.476 
#>  6 a      x        5.39   0.769  -0.710 
#>  7 a      y        3.99   0.418  -1.13  
#>  8 a      y        5.03  -0.0538 -1.04  
#>  9 a      y        4.71   0.763   0.0280
#> 10 a      y        2.95   0.697  -1.80  
#> 11 a      z       -2.21  -0.103   1.98  
#> 12 a      z        6.91  -1.38    0.570 
#> 13 a      z       -3.38  -0.415  -0.135 
#> 14 b      x        0.105 -1.47   -0.612 
#> 15 b      y        2.40   1.36    1.43  
#> 16 b      y        1.17  -0.0593 -0.0392
#> 17 b      y        7.31   1.10    0.690 
#> 18 b      y       -4.65  -0.689   0.153 
#> 19 b      y        2.16  -0.707   2.17  
#> 20 b      y        7.92   0.881  -0.934 
#> 21 b      z        7.41   0.388  -0.367 
#> 22 b      z        6.74  -0.165  -0.743 
#> 23 b      z        3.30  -0.253   0.189 
#> 24 b      z        2.16  -0.112   0.611 
dat_to_match_sub %>% print()
#> # A tibble: 4 × 5
#>   key1_m key2_m value_m other1 other2
#>   <chr>  <chr>    <dbl>  <dbl>  <dbl>
#> 1 a      x        2.23  -0.478  0.341
#> 2 a      y        2.95   0.697 -1.80 
#> 3 b      x        0.105 -1.47  -0.612
#> 4 b      y        2.40   1.36   1.43 


# within specified range
e_match_closest_in_range(
  dat_to_match      = dat_to_match
, id_vars_to_match  = c("key1_m", "key2_m")
, val_var_to_match  = "value_m"
, dat_key           = dat_key
, id_vars_key       = c("key1"  , "key2"  )
, val_var_key       = "value"
, diff_lower        = -2
, diff_upper        = +4
, sw_return_key_vars = TRUE
)
#> # A tibble: 3 × 7
#>   key1_m key2_m value_m other1 other2 val_var_key__ val_diff__
#>   <chr>  <chr>    <dbl>  <dbl>  <dbl>         <int>      <dbl>
#> 1 a      x         2.23 -0.478  0.341             1      1.23 
#> 2 a      y         2.95  0.697 -1.80              2      0.946
#> 3 b      y         2.40  1.36   1.43              4     -1.60 

# within specified range, maximum value
e_match_closest_in_range(
  dat_to_match      = dat_to_match
, id_vars_to_match  = c("key1_m", "key2_m")
, val_var_to_match  = "value_m"
, dat_key           = dat_key
, id_vars_key       = c("key1"  , "key2"  )
, val_var_key       = "value"
, diff_lower        = -2
, diff_upper        = +4
, sw_criteria       = "maximum"
, sw_return_key_vars = TRUE
)
#> # A tibble: 3 × 7
#>   key1_m key2_m value_m  other1 other2 val_var_key__ val_diff__
#>   <chr>  <chr>    <dbl>   <dbl>  <dbl>         <int>      <dbl>
#> 1 a      x         2.23 -0.478   0.341             1       1.23
#> 2 a      y         5.03 -0.0538 -1.04              2       3.03
#> 3 b      y         7.92  0.881  -0.934             4       3.92

# within specified range, minimum value
e_match_closest_in_range(
  dat_to_match      = dat_to_match
, id_vars_to_match  = c("key1_m", "key2_m")
, val_var_to_match  = "value_m"
, dat_key           = dat_key
, id_vars_key       = c("key1"  , "key2"  )
, val_var_key       = "value"
, diff_lower        = -2
, diff_upper        = +4
, sw_criteria       = "minimum"
, sw_return_key_vars = TRUE
)
#> # A tibble: 3 × 7
#>   key1_m key2_m value_m other1 other2 val_var_key__ val_diff__
#>   <chr>  <chr>    <dbl>  <dbl>  <dbl>         <int>      <dbl>
#> 1 a      x         2.23 -0.478  0.341             1      1.23 
#> 2 a      y         2.95  0.697 -1.80              2      0.946
#> 3 b      y         2.16 -0.707  2.17              4     -1.84