R/e_match_closest_in_range.R
e_match_closest_in_range.Rd
Similar to survival::neardate(), but chooses the closest match in either direction,
optionally restricted to an asymmetric range around the key value.
e_match_closest_in_range(
dat_to_match,
id_vars_to_match,
val_var_to_match,
dat_key,
id_vars_key,
val_var_key,
diff_lower = -Inf,
diff_upper = +Inf,
sw_criteria = c("closest", "minimum", "maximum")[1],
sw_return_key_vars = FALSE
)
data to match to the key dataset
associated ID variables in data to match
associated value variable in data to match
key dataset
ID variables in key dataset
value variable to determine closeness in key dataset
lower bound on the difference (value to match minus key value); e.g., -2 means a match may be at most 2 below the key value
upper bound on the difference (value to match minus key value); e.g., +4 means a match may be at most 4 above the key value
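criterion for choosing among candidate matches (most useful when the range limits diff_lower and diff_upper are set):
"closest" keeps the value with the smallest absolute difference from the key value,
"minimum" keeps the smallest value in range, and "maximum" keeps the largest value in range.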
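TRUE/FALSE: also return the matched key value and the difference from it (columns val_var_key__ and val_diff__), for use in matching when there are multiple records per ID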
dat_to_match restricted to only those unique observations that are closest to the key data
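Can also be used to find the closest match within a window of future dates by setting both diff_lower and diff_upper
to positive numbers; e.g., diff_lower = 5 and diff_upper = 7
restricts matches to values between 5 and 7 units after the key value.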
# Fix the RNG seed so the randomly generated example data are reproducible.
set.seed(1)
# Key dataset: every combination of key1 and key2 (3 x 2 = 6 rows),
# with value numbered 1 to 6 in row order.
dat_key <-
tidyr::expand_grid(
key1 = c("a", "b", "c")
, key2 = c("x", "y")
) %>%
dplyr::mutate(
value = 1:dplyr::n()
)
# Data to match: ID combinations that only partly overlap the key dataset,
# resampled so that IDs repeat, each row with a random value to compare
# against the key value plus two unrelated columns.
dat_to_match <-
tidyr::expand_grid(
key1_m = c("a", "b") # no "c", although dat_key has key1 = "c"
, key2_m = c("x", "y", "z") # added "z", which is not a key2 value in dat_key
) %>%
dplyr::slice(
sample.int(n = 2*3, size = 4 * 2*3, replace = TRUE) # draw 24 row indices from the 6 combinations, with replacement, to produce multiple rows per ID
) %>%
dplyr::mutate(
# value compared against dat_key$value: uniform between -5 and 10
value_m = runif(n = dplyr::n(), min = -5, max = 10)
# extra columns that play no role in the matching
, other1 = rnorm(dplyr::n())
, other2 = rnorm(dplyr::n())
) %>%
# sort by ID so repeated IDs print together
dplyr::arrange(
key1_m, key2_m
)
# Unrestricted match: with the range (-Inf, +Inf) no candidate is excluded by
# its difference from the key value, and sw_criteria is left at its default
# ("closest" per the usage above).
dat_to_match_sub <-
e_match_closest_in_range(
dat_to_match = dat_to_match
, id_vars_to_match = c("key1_m", "key2_m")
, val_var_to_match = "value_m"
, dat_key = dat_key
, id_vars_key = c("key1" , "key2" )
, val_var_key = "value"
, diff_lower = -Inf
, diff_upper = +Inf
)
# Key dataset: one value per (key1, key2) combination.
dat_key %>% print()
#> # A tibble: 6 × 3
#> key1 key2 value
#> <chr> <chr> <int>
#> 1 a x 1
#> 2 a y 2
#> 3 b x 3
#> 4 b y 4
#> 5 c x 5
#> 6 c y 6
# Data to match, with n = Inf so that every row is printed.
dat_to_match %>% print(n = Inf)
#> # A tibble: 24 × 5
#> key1_m key2_m value_m other1 other2
#> <chr> <chr> <dbl> <dbl> <dbl>
#> 1 a x 8.05 -0.156 0.398
#> 2 a x 2.23 -0.478 0.341
#> 3 a x 5.86 -0.394 2.40
#> 4 a x 6.84 0.557 1.47
#> 5 a x 5.98 0.365 0.476
#> 6 a x 5.39 0.769 -0.710
#> 7 a y 3.99 0.418 -1.13
#> 8 a y 5.03 -0.0538 -1.04
#> 9 a y 4.71 0.763 0.0280
#> 10 a y 2.95 0.697 -1.80
#> 11 a z -2.21 -0.103 1.98
#> 12 a z 6.91 -1.38 0.570
#> 13 a z -3.38 -0.415 -0.135
#> 14 b x 0.105 -1.47 -0.612
#> 15 b y 2.40 1.36 1.43
#> 16 b y 1.17 -0.0593 -0.0392
#> 17 b y 7.31 1.10 0.690
#> 18 b y -4.65 -0.689 0.153
#> 19 b y 2.16 -0.707 2.17
#> 20 b y 7.92 0.881 -0.934
#> 21 b z 7.41 0.388 -0.367
#> 22 b z 6.74 -0.165 -0.743
#> 23 b z 3.30 -0.253 0.189
#> 24 b z 2.16 -0.112 0.611
# Result of the unrestricted match.
dat_to_match_sub %>% print()
#> # A tibble: 4 × 5
#> key1_m key2_m value_m other1 other2
#> <chr> <chr> <dbl> <dbl> <dbl>
#> 1 a x 2.23 -0.478 0.341
#> 2 a y 2.95 0.697 -1.80
#> 3 b x 0.105 -1.47 -0.612
#> 4 b y 2.40 1.36 1.43
# Within a specified range: keep only matches whose difference
# (value_m - value) lies between -2 and +4, and also return the matched key
# value and the difference (sw_return_key_vars = TRUE).
e_match_closest_in_range(
dat_to_match = dat_to_match
, id_vars_to_match = c("key1_m", "key2_m")
, val_var_to_match = "value_m"
, dat_key = dat_key
, id_vars_key = c("key1" , "key2" )
, val_var_key = "value"
, diff_lower = -2
, diff_upper = +4
, sw_return_key_vars = TRUE
)
#> # A tibble: 3 × 7
#> key1_m key2_m value_m other1 other2 val_var_key__ val_diff__
#> <chr> <chr> <dbl> <dbl> <dbl> <int> <dbl>
#> 1 a x 2.23 -0.478 0.341 1 1.23
#> 2 a y 2.95 0.697 -1.80 2 0.946
#> 3 b y 2.40 1.36 1.43 4 -1.60
# Within the same range (-2 to +4), but keep the maximum in-range value_m
# instead of the one closest to the key value.
e_match_closest_in_range(
dat_to_match = dat_to_match
, id_vars_to_match = c("key1_m", "key2_m")
, val_var_to_match = "value_m"
, dat_key = dat_key
, id_vars_key = c("key1" , "key2" )
, val_var_key = "value"
, diff_lower = -2
, diff_upper = +4
, sw_criteria = "maximum"
, sw_return_key_vars = TRUE
)
#> # A tibble: 3 × 7
#> key1_m key2_m value_m other1 other2 val_var_key__ val_diff__
#> <chr> <chr> <dbl> <dbl> <dbl> <int> <dbl>
#> 1 a x 2.23 -0.478 0.341 1 1.23
#> 2 a y 5.03 -0.0538 -1.04 2 3.03
#> 3 b y 7.92 0.881 -0.934 4 3.92
# Within the same range (-2 to +4), but keep the minimum in-range value_m
# instead of the one closest to the key value.
e_match_closest_in_range(
dat_to_match = dat_to_match
, id_vars_to_match = c("key1_m", "key2_m")
, val_var_to_match = "value_m"
, dat_key = dat_key
, id_vars_key = c("key1" , "key2" )
, val_var_key = "value"
, diff_lower = -2
, diff_upper = +4
, sw_criteria = "minimum"
, sw_return_key_vars = TRUE
)
#> # A tibble: 3 × 7
#> key1_m key2_m value_m other1 other2 val_var_key__ val_diff__
#> <chr> <chr> <dbl> <dbl> <dbl> <int> <dbl>
#> 1 a x 2.23 -0.478 0.341 1 1.23
#> 2 a y 2.95 0.697 -1.80 2 0.946
#> 3 b y 2.16 -0.707 2.17 4 -1.84