Split a (set of) item-listing columns into indicator columns — e_split_list_columns_into_indicator

Commonly used for comorbidities or prescription lists within a single (or multiple) column(s). Takes a column where items are separated by puncutation (,.;/|) and creates separate columns with indicators. Can treat counts of items >1 as 1 to simplify summary tables (for example, multiple items coded as "other").

e_split_list_columns_into_indicator_columns(
  dat_this,
  var_names_items = NULL,
  item_delimiters = ",.;/|",
  code_other_below_freq = 5,
  label_other = "other",
  indicator_col_prefix = "item_",
  sw_data_or_summary = c("data", "summary")[1],
  sw_replace_GT1_with_1 = FALSE,
  sw_print_unique = TRUE
)

Arguments

dat_this: entire data.frame or tibble, will return with additional indicator columns
var_names_items: column names with lists of items
item_delimiters: delimiter(s) that separate items within a single column
code_other_below_freq: replace item name with label_other if total frequency for an item is less than this value
label_other: label for the "other" category
indicator_col_prefix: prefix for the new indicator columns
sw_data_or_summary: return data with indicator columns or return summary tables of frequencies of items
sw_replace_GT1_with_1: T/F, to replace "greater than 1" counts with an indicator of 1 (to interpret as "at least 1")
sw_print_unique: T/F, print list of items before and after replacing with "other"

Value

dat_this from sw_data_or_summary, either the data with additional indicator columns; or a list of summary tables of frequencies of items

Examples

dat_ex <-
  dplyr::tibble(
    col1 =
      c(
        NA, "", "a", "A, B  ,C", "b", "D. c ;    d"
      , "x  ;Y", "ab/0|J;1;1", 2, "other", 1
      , "a a a", "1,a a a", "2,a a a"
      )
  , col2 = LETTERS[1:length(col1)]
  ) %>%
  dplyr::mutate(
    ID = 1:dplyr::n()
  ) %>%
  dplyr::select(
    ID
  , tidyselect::everything()
  )
dat_ex %>% print(n = Inf)
#> # A tibble: 14 × 3
#>       ID col1          col2 
#>    <int> <chr>         <chr>
#>  1     1  NA           A    
#>  2     2 ""            B    
#>  3     3 "a"           C    
#>  4     4 "A, B  ,C"    D    
#>  5     5 "b"           E    
#>  6     6 "D. c ;    d" F    
#>  7     7 "x  ;Y"       G    
#>  8     8 "ab/0|J;1;1"  H    
#>  9     9 "2"           I    
#> 10    10 "other"       J    
#> 11    11 "1"           K    
#> 12    12 "a a a"       L    
#> 13    13 "1,a a a"     M    
#> 14    14 "2,a a a"     N    

# return data
dat_ex_out <-
  e_split_list_columns_into_indicator_columns(
    dat_this              = dat_ex
  , var_names_items       = c("col1", "col2")
  , item_delimiters       = ",.;/|"
  , code_other_below_freq = 2
  , label_other           = "other"
  , indicator_col_prefix  = "item_"
  , sw_data_or_summary    = "data"
  , sw_replace_GT1_with_1 = FALSE
  , sw_print_unique       = TRUE
  )
#> [1] "Unique items: 22"
#> # A tibble: 22 × 2
#>    items  freq
#>    <chr> <int>
#>  1 1         4
#>  2 a         3
#>  3 a a a     3
#>  4 b         3
#>  5 c         3
#>  6 d         3
#>  7 2         2
#>  8 j         2
#>  9 0         1
#> 10 ab        1
#> 11 e         1
#> 12 f         1
#> 13 g         1
#> 14 h         1
#> 15 i         1
#> 16 k         1
#> 17 l         1
#> 18 m         1
#> 19 n         1
#> 20 other     1
#> 21 x         1
#> 22 y         1
#> [1] "Coding items with frequencies less than 2 to 'other'"
#> Note: erikmisc::e_split_list_columns_into_indicator_columns: Other category `other` already appears in data.
#> [1] "Unique items: 9 with other category `other`"
#> # A tibble: 9 × 2
#>   items  freq
#>   <chr> <int>
#> 1 other    14
#> 2 1         4
#> 3 a         3
#> 4 a a a     3
#> 5 b         3
#> 6 c         3
#> 7 d         3
#> 8 2         2
#> 9 j         2
dat_ex_out %>% print(n = Inf)
#> # A tibble: 14 × 12
#>       ID col1    col2  item_…¹ item_1 item_a item_…² item_b item_c item_d item_2
#>    <int> <chr>   <chr>   <dbl>  <dbl>  <dbl>   <dbl>  <dbl>  <dbl>  <dbl>  <dbl>
#>  1     1  NA     A           0      0      1       0      0      0      0      0
#>  2     2 ""      B           0      0      0       0      1      0      0      0
#>  3     3 "a"     C           0      0      1       0      0      1      0      0
#>  4     4 "A, B … D           0      0      1       0      1      1      1      0
#>  5     5 "b"     E           1      0      0       0      1      0      0      0
#>  6     6 "D. c … F           1      0      0       0      0      1      2      0
#>  7     7 "x  ;Y" G           3      0      0       0      0      0      0      0
#>  8     8 "ab/0|… H           3      2      0       0      0      0      0      0
#>  9     9 "2"     I           1      0      0       0      0      0      0      1
#> 10    10 "other" J           1      0      0       0      0      0      0      0
#> 11    11 "1"     K           1      1      0       0      0      0      0      0
#> 12    12 "a a a" L           1      0      0       1      0      0      0      0
#> 13    13 "1,a a… M           1      1      0       1      0      0      0      0
#> 14    14 "2,a a… N           1      0      0       1      0      0      0      1
#> # … with 1 more variable: item_j <dbl>, and abbreviated variable names
#> #   ¹item_other, ²`item_a a a`

# return summary
dat_ex_sum <-
  e_split_list_columns_into_indicator_columns(
    dat_this              = dat_ex
  , var_names_items       = c("col1", "col2")
  , item_delimiters       = ",.;/|"
  , code_other_below_freq = 2
  , label_other           = "other"
  , indicator_col_prefix  = "item_"
  , sw_data_or_summary    = "summary"
  , sw_replace_GT1_with_1 = FALSE
  , sw_print_unique       = FALSE
  )
#> Note: erikmisc::e_split_list_columns_into_indicator_columns: Other category `other` already appears in data.
dat_ex_sum
#> $tab_items_unique
#> # A tibble: 22 × 2
#>    items  freq
#>    <chr> <int>
#>  1 1         4
#>  2 a         3
#>  3 a a a     3
#>  4 b         3
#>  5 c         3
#>  6 d         3
#>  7 2         2
#>  8 j         2
#>  9 0         1
#> 10 ab        1
#> # … with 12 more rows
#> 
#> $tab_items_unique_other
#> # A tibble: 9 × 2
#>   items  freq
#>   <chr> <int>
#> 1 other    14
#> 2 1         4
#> 3 a         3
#> 4 a a a     3
#> 5 b         3
#> 6 c         3
#> 7 d         3
#> 8 2         2
#> 9 j         2
#>