Create readr column specification using regular expression matching
Source:R/pal.gen.R
cols_regex.Rd
Allows you to define a regular expression for each desired column specification object; each expression is matched against the column names.
Usage
cols_regex(..., .col_names, .default = readr::col_character())
Arguments
- ...
Named arguments where the names are (Perl-compatible) regular expressions and the values are column objects created by
col_*()
, or their abbreviated character names (as described in the col_types
parameter of readr::read_delim()
). Dynamic dots are supported.
- .col_names
Column names which should be matched by
...
.
- .default
Any named columns not matched by any of the regular expressions in
...
will be read with this column type.
Examples
library(magrittr)
# for some hypothetical CSV data column names like these...
col_names <- c("VAR1_Text",
"VAR2_Text",
"VAR3_Text_Other",
"VAR1_Code_R1",
"VAR2_Code_R2",
"HAS_R1_Lag",
"HAS_R2_Lag",
"GARBAGEX67",
"GARBAGEY09")
# ...a column spec could be created concisely as follows:
col_regex <- list("_Text(_|$)" = "c",
"_Code(_|$)" = "i",
"^GARBAGE" = readr::col_skip())
pal::cols_regex(.col_names = col_names,
!!!col_regex,
.default = "l")
#> cols(
#> .default = col_logical(),
#> VAR1_Text = col_character(),
#> VAR2_Text = col_character(),
#> VAR3_Text_Other = col_character(),
#> VAR1_Code_R1 = col_integer(),
#> VAR2_Code_R2 = col_integer(),
#> GARBAGEX67 = col_skip(),
#> GARBAGEY09 = col_skip()
#> )
# we can parse some real data:
raw_data <-
httr::GET(paste0("http://www.web.statistik.zh.ch/ogd/data/",
"KANTON_ZUERICH_nrw_2019_listen_ergebnisse_gemeinde.csv")) |>
httr::content(as = "text",
encoding = "UTF-8")
readr::read_csv(file = raw_data,
col_types = pal::cols_regex("^(Gemeindenamen|Partei)$" = "c",
"(?i)anteil" = "d",
.default = "i",
.col_names = pal::dsv_colnames(raw_data)))
#> Warning: One or more parsing issues, call `problems()` on your data frame for details, e.g.:
#> dat <- vroom(...)
#> problems(dat)
#> # A tibble: 6 × 1
#> `<!DOCTYPE HTML PUBLIC "-//IETF//DTD HTML 2.0//EN">`
#> <int>
#> 1 NA
#> 2 NA
#> 3 NA
#> 4 NA
#> 5 NA
#> 6 NA
# an alternative way to process the same data using `readr::type_convert()`:
readr::read_csv(file = raw_data,
col_types = list(.default = "c")) %>%
readr::type_convert(col_types = pal::cols_regex("^(Gemeindenamen|Partei)$" = "c",
"(?i)anteil" = "d",
.default = "i",
.col_names = colnames(.)))
#> Warning: [0, 1]: expected no trailing characters, but got '<html><head>'
#> Warning: [1, 1]: expected no trailing characters, but got '<title>404 Not Found</title>'
#> Warning: [2, 1]: expected no trailing characters, but got '</head><body>'
#> Warning: [3, 1]: expected no trailing characters, but got '<h1>Not Found</h1>'
#> Warning: [4, 1]: expected no trailing characters, but got '<p>The requested URL was not found on this server.</p>'
#> Warning: [5, 1]: expected no trailing characters, but got '</body></html>'
#> # A tibble: 6 × 1
#> `<!DOCTYPE HTML PUBLIC "-//IETF//DTD HTML 2.0//EN">`
#> <int>
#> 1 NA
#> 2 NA
#> 3 NA
#> 4 NA
#> 5 NA
#> 6 NA