Create readr column specification using regular expression matching

Allows you to define one regular expression per desired column specification object. Every column whose name matches a regular expression is assigned the corresponding column specification.

Usage

cols_regex(..., .col_names, .default = readr::col_character())

Arguments

...: Named arguments where the names are (Perl-compatible) regular expressions and the values are column objects created by col_*(), or their abbreviated character names (as described in the col_types parameter of readr::read_delim()). Dynamic dots are supported.
.col_names: Column names against which the regular expressions in ... are matched.
.default: Any named columns not matched by any of the regular expressions in ... will be read with this column type.

Value

A column specification.

Details

The main limitation of cols_regex() is that it needs to know the input dataset's full set of column names (.col_names) in advance. dsv_colnames() can help to determine them. See the examples for further details.

Examples

library(magrittr)

# for some hypothetical CSV data column names like these...
col_names <- c("VAR1_Text",
               "VAR2_Text",
               "VAR3_Text_Other",
               "VAR1_Code_R1",
               "VAR2_Code_R2",
               "HAS_R1_Lag",
               "HAS_R2_Lag",
               "GARBAGEX67",
               "GARBAGEY09")

# ...a column spec could be created concisely as follows:
col_regex <- list("_Text(_|$)" = "c",
                  "_Code(_|$)" = "i",
                  "^GARBAGE"   = readr::col_skip())

pal::cols_regex(.col_names = col_names,
                !!!col_regex,
                .default = "l")
#> cols(
#>   .default = col_logical(),
#>   VAR1_Text = col_character(),
#>   VAR2_Text = col_character(),
#>   VAR3_Text_Other = col_character(),
#>   VAR1_Code_R1 = col_integer(),
#>   VAR2_Code_R2 = col_integer(),
#>   GARBAGEX67 = col_skip(),
#>   GARBAGEY09 = col_skip()
#> )

# we can parse some real data:
url <- "https://salim_b.gitlab.io/misc/Kantonsratswahl_Zuerich_2019_Ergebnisse_Gemeinden.csv"

raw_data <-
  httr2::request(url) |>
  httr2::req_perform() |>
  httr2::resp_body_string()

col_spec <- pal::cols_regex("^(Gemeindenamen|Liste|Wahlkreis)$" = "c",
                            "(?i)anteil" = "d",
                            .default = "i",
                            .col_names = pal::dsv_colnames(raw_data))
print(col_spec)
#> cols(
#>   .default = col_integer(),
#>   Gemeindenamen = col_character(),
#>   Liste = col_character(),
#>   Wahlkreis = col_character(),
#>   Stimmenanteil = col_double(),
#>   `+/- (Stimmenanteil)` = col_double(),
#>   Wähleranteil = col_double(),
#>   `+/- (Wähleranteil)` = col_double(),
#>   `Stimmenanteil 2015` = col_double(),
#>   `Wähleranteil 2015` = col_double()
#> )

readr::read_csv(file = raw_data,
                col_types = col_spec)
#> # A tibble: 1,770 × 15
#>    Gemeindenamen `BFS-Nr.` `Listen-Nr.` Liste `Wahlkreis-Nr.` Wahlkreis            Stimmen Stimmenanteil `+/- (Stimmenanteil)` Stimmenzusatz Wähler Wähleranteil
#>    <chr>             <int>        <int> <chr>           <int> <chr>                  <int>         <dbl>                 <dbl>         <int>  <int>        <dbl>
#>  1 Adlikon              21            1 SVP                16 Wahlkreis XVI, Ande…     468         56.0                  -6.21             8    117        56.0 
#>  2 Adlikon              21            2 SP                 16 Wahlkreis XVI, Ande…      39          4.67                 -0.19             0     10         4.67
#>  3 Adlikon              21            3 FDP                16 Wahlkreis XVI, Ande…     106         12.7                   1.73             0     27        12.7 
#>  4 Adlikon              21            4 GLP                16 Wahlkreis XVI, Ande…      36          4.31                  2.94             0      9         4.31
#>  5 Adlikon              21            5 Grüne              16 Wahlkreis XVI, Ande…      54          6.46                  0.86             0     14         6.46
#>  6 Adlikon              21            6 CVP                16 Wahlkreis XVI, Ande…       6          0.72                 -0.03             0      2         0.72
#>  7 Adlikon              21            7 EVP                16 Wahlkreis XVI, Ande…      52          6.22                  3.61             0     13         6.22
#>  8 Adlikon              21            8 AL                 16 Wahlkreis XVI, Ande…       7          0.84                 -0.28             0      2         0.84
#>  9 Adlikon              21            9 BDP                16 Wahlkreis XVI, Ande…      45          5.38                 -1.46             4     11         5.38
#> 10 Adlikon              21           10 EDU                16 Wahlkreis XVI, Ande…      23          2.75                 -0.98             0      6         2.75
#> # ℹ 1,760 more rows
#> # ℹ 3 more variables: `+/- (Wähleranteil)` <dbl>, `Stimmenanteil 2015` <dbl>, `Wähleranteil 2015` <dbl>

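# we can also achieve basically the same in a more concise way without having to rely on
# `pal::dsv_colnames()`, by first reading all columns as character and then converting them
# via `readr::type_convert()`: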
readr::read_csv(file = url,
                col_types = list(.default = "c")) %>%
  readr::type_convert(col_types = pal::cols_regex("^(Gemeindenamen|Liste|Wahlkreis)$" = "c",
                                                  "(?i)anteil" = "d",
                                                  .default = "i",
                                                  .col_names = colnames(.)))
#> # A tibble: 1,770 × 15
#>    Gemeindenamen `BFS-Nr.` `Listen-Nr.` Liste `Wahlkreis-Nr.` Wahlkreis            Stimmen Stimmenanteil `+/- (Stimmenanteil)` Stimmenzusatz Wähler Wähleranteil
#>    <chr>             <int>        <int> <chr>           <int> <chr>                  <int>         <dbl>                 <dbl>         <int>  <int>        <dbl>
#>  1 Adlikon              21            1 SVP                16 Wahlkreis XVI, Ande…     468         56.0                  -6.21             8    117        56.0 
#>  2 Adlikon              21            2 SP                 16 Wahlkreis XVI, Ande…      39          4.67                 -0.19             0     10         4.67
#>  3 Adlikon              21            3 FDP                16 Wahlkreis XVI, Ande…     106         12.7                   1.73             0     27        12.7 
#>  4 Adlikon              21            4 GLP                16 Wahlkreis XVI, Ande…      36          4.31                  2.94             0      9         4.31
#>  5 Adlikon              21            5 Grüne              16 Wahlkreis XVI, Ande…      54          6.46                  0.86             0     14         6.46
#>  6 Adlikon              21            6 CVP                16 Wahlkreis XVI, Ande…       6          0.72                 -0.03             0      2         0.72
#>  7 Adlikon              21            7 EVP                16 Wahlkreis XVI, Ande…      52          6.22                  3.61             0     13         6.22
#>  8 Adlikon              21            8 AL                 16 Wahlkreis XVI, Ande…       7          0.84                 -0.28             0      2         0.84
#>  9 Adlikon              21            9 BDP                16 Wahlkreis XVI, Ande…      45          5.38                 -1.46             4     11         5.38
#> 10 Adlikon              21           10 EDU                16 Wahlkreis XVI, Ande…      23          2.75                 -0.98             0      6         2.75
#> # ℹ 1,760 more rows
#> # ℹ 3 more variables: `+/- (Wähleranteil)` <dbl>, `Stimmenanteil 2015` <dbl>, `Wähleranteil 2015` <dbl>