Skip to contents

Lets you build a column specification from regular expressions: each regular expression is paired with a column specification object, which is applied to every column whose name matches that expression.

Usage

cols_regex(..., .col_names, .default = readr::col_character())

Arguments

...

Named arguments where the names are (Perl-compatible) regular expressions and the values are column objects created by col_*(), or their abbreviated character names (as described in the col_types parameter of readr::read_delim()). Dynamic dots are supported.

.col_names

Column names against which the regular expressions in ... are matched.

.default

Any named columns not matched by any of the regular expressions in ... will be read with this column type.

Examples

library(magrittr)

# Suppose a (hypothetical) CSV file ships with column names such as these...
example_names <- c(
  "VAR1_Text", "VAR2_Text", "VAR3_Text_Other",
  "VAR1_Code_R1", "VAR2_Code_R2",
  "HAS_R1_Lag", "HAS_R2_Lag",
  "GARBAGEX67", "GARBAGEY09"
)

# ...then one regular expression per column type is enough to describe the
# whole spec. Values may be abbreviated type names or `readr::col_*()` objects.
type_by_pattern <- list(
  "_Text(_|$)" = "c",
  "_Code(_|$)" = "i",
  "^GARBAGE" = readr::col_skip()
)

# The regex/type pairs are spliced into `...` with `!!!` (dynamic dots);
# the `HAS_*` columns match no pattern and therefore fall back to `.default`.
pal::cols_regex(
  .col_names = example_names,
  !!!type_by_pattern,
  .default = "l"
)
#> cols(
#>   .default = col_logical(),
#>   VAR1_Text = col_character(),
#>   VAR2_Text = col_character(),
#>   VAR3_Text_Other = col_character(),
#>   VAR1_Code_R1 = col_integer(),
#>   VAR2_Code_R2 = col_integer(),
#>   GARBAGEX67 = col_skip(),
#>   GARBAGEY09 = col_skip()
#> )

# we can parse some real data:
# (`httr::stop_for_status()` turns an HTTP error response into an R error, so
# that an HTML error page is never silently parsed as if it were CSV data)
raw_data <-
  httr::GET(paste0("http://www.web.statistik.zh.ch/ogd/data/",
                   "KANTON_ZUERICH_nrw_2019_listen_ergebnisse_gemeinde.csv")) |>
  httr::stop_for_status() |>
  httr::content(as = "text",
                encoding = "UTF-8")

# column types are derived from the header row of the downloaded text
readr::read_csv(file = raw_data,
                col_types = pal::cols_regex("^(Gemeindenamen|Partei)$" = "c",
                                            "(?i)anteil" = "d",
                                            .default = "i",
                                            .col_names = pal::dsv_colnames(raw_data)))

# an alternative way to process the same data using `readr::type_convert()`:
# (everything is read as character first; `.` in `colnames(.)` refers to the
# data frame piped in by magrittr's `%>%`)
readr::read_csv(file = raw_data,
                col_types = list(.default = "c")) %>%
  readr::type_convert(col_types = pal::cols_regex("^(Gemeindenamen|Partei)$" = "c",
                                                  "(?i)anteil" = "d",
                                                  .default = "i",
                                                  .col_names = colnames(.)))

# NOTE: the output recorded when this page was rendered is intentionally not
# reproduced here. At that time the server answered with an HTML
# "404 Not Found" page, which was parsed as CSV and yielded a single column of
# `NA`s plus parsing warnings. With the status check above, that situation now
# raises an error at the download step instead.