No lab this Friday (Fall break!)
Internal course evaluation open
library(nycflights13)
test.case.1 <- flights %>%
transmute(across(1:4, list(log = log, log2 = log2)))
test.case.2 <- flights %>%
transmute(across(1:4, list(log = log, log.2.base = ~ log(.x, base = 2))))
test.case.3 <- flights %>%
transmute(across(1:4, list(log = log, log.2.base = function(.x) log(.x, base = 2))))
all(test.case.1 == test.case.2, na.rm = TRUE)
## [1] TRUE
all(test.case.1 == test.case.3, na.rm = TRUE)
## [1] TRUE
The stringr package, by Hadley Wickham, provides utilities for handling strings.
It is included in the tidyverse.
library("tidyverse")
# load htmlwidgets
library(htmlwidgets)
Main functions:
- str_detect(string, pattern): Detects the presence or absence of a pattern in a string.
- str_locate(string, pattern): Locates the first position of a pattern and returns a matrix with start and end.
- str_extract(string, pattern): Extracts the text corresponding to the first match.
- str_match(string, pattern): Extracts capture groups formed by () from the first match.
- str_split(string, pattern): Splits a string into pieces and returns a list of character vectors.
- str_replace(string, pattern, replacement): Replaces the first matched pattern and returns a character vector.
Variants with an _all suffix match every occurrence of the pattern in a given string, not just the first.
Most functions are vectorized.
Strings are enclosed by double quotes or single quotes:
string1 <- "This is a string"
string2 <- 'If I want to include a "quote" inside a string, I use single quotes'Literal single or double quote:
double_quote <- "\"" # or '"'
single_quote <- '\'' # or "'"Printed representation:
x <- c("\"", "\\")
x
## [1] "\"" "\\"
vs cat:
cat(x)
## " \
vs writeLines():
writeLines(x)
## "
## \
Other special characters include "\n" (new line) and "\t" (tab). See the help page
?"'"
for a complete list.
cat("a\"b")
## a"b
cat("a\tb")
## a b
cat("a\nb")
## a
## bUnicode
x <- "\u00b5"
x
## [1] "µ"Character vector (vector of strings):
c("one", "two", "three")
## [1] "one" "two" "three"Length of a single string:
str_length("R for data science")
## [1] 18Lengths of a character vector:
str_length(c("a", "R for data science", NA))
## [1] 1 18 NARead str_c’s documentation
Combine two or more strings
str_c("x", "y")
## [1] "xy"
str_c("x", "y", "z")
## [1] "xyz"Separator:
str_c("x", "y", sep = ", ")
## [1] "x, y"str_c() is vectorised:
str_c("prefix-", c("a", "b", "c"), "-suffix")
## [1] "prefix-a-suffix" "prefix-b-suffix" "prefix-c-suffix"Objects of length 0 are silently dropped:
name <- "Hadley"
time_of_day <- "morning"
birthday <- FALSE
str_c(
"Good ", time_of_day, " ", name,
if (birthday) " and HAPPY BIRTHDAY",
"."
)
## [1] "Good morning Hadley."Combine a vector of strings:
str_c(c("x", "y", "z"))
## [1] "x" "y" "z"
str_c(c("x", "y", "z"), collapse = ", ")
## [1] "x, y, z"str_sub’s documentation
By position:
str_sub("Apple", 1, 3)
## [1] "App"
x <- c("Apple", "Banana", "Pear")
str_sub(x, 1, 3)
## [1] "App" "Ban" "Pea"Negative numbers count backwards from end:
str_sub(x, -3, -1)
## [1] "ple" "ana" "ear"Out of range:
str_sub("a", 1, 5)
## [1] "a"
str_sub("a", 2, 5)
## [1] ""Assignment to a substring:
str_sub(x, 1, 1) <- str_to_lower(str_sub(x, 1, 1))
x
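## [1] "apple" "banana" "pear"
str_view() highlights the matches in each string. Since stringr 1.5.0 it shows all
matches (earlier versions showed only the first), and str_view_all() is deprecated in its favour.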
Match exact strings:
x <- c("apple", "banana", "pear")
str_view(x, "an")
## [2] │ b<an><an>a
str_view_all(x, "an")
## Warning: `str_view_all()` was deprecated in stringr 1.5.0.
## ℹ Please use `str_view()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## [1] │ apple
## [2] │ b<an><an>a
## [3] │ pear
The pattern . matches any character apart from a newline:
str_view(x, ".a.")
## [2] │ <ban>ana
## [3] │ p<ear>
Regex escapes live on top of regular string escapes, so
two levels of escaping are needed.
To match a literal .:
# doesn't work: "\." is not a valid escape in an R string literal, so the code fails to parse
str_view(c("abc", "a.c", "bef"), "a\.c")
## Error: '\.' is an unrecognized escape in character string (<text>:2:37)
# regular expression needs double escape
str_view(c("abc", "a.c", "bef"), "a\\.c")
## [2] │ <a.c>
To match a literal \:
str_view("a\\b", "\\\\")
## [1] │ a<\>b
For a list of typographical symbols and punctuation marks, see Wikipedia.
^ matches the start of the string:
x <- c("apple", "banana", "pear")
str_view(x, "^a")
## [1] │ <a>pple
$ matches the end of the string:
str_view(x, "a$")
## [2] │ banan<a>
To force a regular expression to match only a complete string, anchor it with both ^ and $:
x <- c("apple pie", "apple", "apple cake")
str_view(x, "^apple$")
## [2] │ <apple>
[abc]: matches a, b, or c. Same as (a|b|c).
[^abc]: matches anything except a, b, or c.
str_view(c("grey", "gray"), "gr(e|a)y")
## [1] │ <grey>
## [2] │ <gray>
str_view(c("grey", "gray"), "gr[ea]y")
## [1] │ <grey>
## [2] │ <gray>
Repetition quantifiers:
?: 0 or 1
+: 1 or more
*: 0 or more
x <- "1888 is the longest year in Roman numerals: MDCCCLXXXVIII"
# match either C or CC, being greedy here
str_view(x, "CC?")
## [1] │ 1888 is the longest year in Roman numerals: MD<CC><C>LXXXVIII
# greedy matches
str_view(x, "CC+")
## [1] │ 1888 is the longest year in Roman numerals: MD<CCC>LXXXVIII
# greedy matches
str_view(x, 'C[LX]+')
## [1] │ 1888 is the longest year in Roman numerals: MDCC<CLXXX>VIII
Specify the number of matches:
{n}: exactly n
{n,}: n or more
{,m}: at most m
{n,m}: between n and m
str_view(x, "C{2}")
## [1] │ 1888 is the longest year in Roman numerals: MD<CC>CLXXXVIII
# greedy matches
str_view(x, "C{2,}")
## [1] │ 1888 is the longest year in Roman numerals: MD<CCC>LXXXVIII
# greedy matches
str_view(x, "C{2,3}")
## [1] │ 1888 is the longest year in Roman numerals: MD<CCC>LXXXVIII
Greedy (default) vs lazy (put ? after the repetition):
# lazy matches
str_view(x, 'C{2,3}?')
## [1] │ 1888 is the longest year in Roman numerals: MD<CC>CLXXXVIII
# lazy matches
str_view(x, 'C[LX]+?')
## [1] │ 1888 is the longest year in Roman numerals: MDCC<CL>XXXVIII
What went wrong here?
text = "<div class='main'> <div> <a href='here.pdf'>Here!</a> </div> </div>"
str_extract(text, "<div>.*</div>")
## [1] "<div> <a href='here.pdf'>Here!</a> </div> </div>"
If we add ? after a quantifier, the matching will be lazy (find the shortest possible match, not the longest).
str_extract(text, "<div>.*?</div>")
## [1] "<div> <a href='here.pdf'>Here!</a> </div>"
fruit is a character vector pre-defined in the stringr package:
fruit
## [1] "apple" "apricot" "avocado"
## [4] "banana" "bell pepper" "bilberry"
## [7] "blackberry" "blackcurrant" "blood orange"
## [10] "blueberry" "boysenberry" "breadfruit"
## [13] "canary melon" "cantaloupe" "cherimoya"
## [16] "cherry" "chili pepper" "clementine"
## [19] "cloudberry" "coconut" "cranberry"
## [22] "cucumber" "currant" "damson"
## [25] "date" "dragonfruit" "durian"
## [28] "eggplant" "elderberry" "feijoa"
## [31] "fig" "goji berry" "gooseberry"
## [34] "grape" "grapefruit" "guava"
## [37] "honeydew" "huckleberry" "jackfruit"
## [40] "jambul" "jujube" "kiwi fruit"
## [43] "kumquat" "lemon" "lime"
## [46] "loquat" "lychee" "mandarine"
## [49] "mango" "mulberry" "nectarine"
## [52] "nut" "olive" "orange"
## [55] "pamelo" "papaya" "passionfruit"
## [58] "peach" "pear" "persimmon"
## [61] "physalis" "pineapple" "plum"
## [64] "pomegranate" "pomelo" "purple mangosteen"
## [67] "quince" "raisin" "rambutan"
## [70] "raspberry" "redcurrant" "rock melon"
## [73] "salal berry" "satsuma" "star fruit"
## [76] "strawberry" "tamarillo" "tangerine"
## [79] "ugli fruit" "watermelon"
Parentheses define groups, which can be back-referenced as \1, \2, …
# only show matched strings
str_view(fruit, "(..)\\1", match = TRUE)
## [4] │ b<anan>a
## [20] │ <coco>nut
## [22] │ <cucu>mber
## [41] │ <juju>be
## [56] │ <papa>ya
## [73] │ s<alal> berryx <- c("apple", "banana", "pear")
str_detect(x, "e")
## [1] TRUE FALSE TRUE
The vector words contains about 1000 commonly used words:
length(words)
## [1] 980
head(words)
## [1] "a" "able" "about" "absolute" "accept" "account"# How many common words start with t?
sum(str_detect(words, "^t"))
## [1] 65
# What proportion of common words end with a vowel?
mean(str_detect(words, "[aeiou]$"))
## [1] 0.2765306Find words that end with x:
words[str_detect(words, "x$")]
## [1] "box" "sex" "six" "tax"
same as
str_subset(words, "x$")
## [1] "box" "sex" "six" "tax"Filter a data frame:
df <- tibble(
word = words,
i = seq_along(word)
)
df %>%
filter(str_detect(words, "x$"))
## # A tibble: 4 × 2
## word i
## <chr> <int>
## 1 box 108
## 2 sex 747
## 3 six 772
## 4 tax 841str_count() tells how many matches are found:
x <- c("apple", "banana", "pear")
str_count(x, "a")
## [1] 1 3 1
# On average, how many vowels per word?
mean(str_count(words, "[aeiou]"))
## [1] 1.991837Matches never overlap:
str_count("abababa", "aba")
## [1] 2
str_view_all("abababa", "aba")
## [1] │ <aba>b<aba>Mutate a data frame:
df %>%
mutate(
vowels = str_count(word, "[aeiou]"),
consonants = str_count(word, "[^aeiou]")
)
## # A tibble: 980 × 4
## word i vowels consonants
## <chr> <int> <int> <int>
## 1 a 1 1 0
## 2 able 2 2 2
## 3 about 3 3 2
## 4 absolute 4 4 4
## 5 accept 5 2 4
## 6 account 6 3 4
## 7 achieve 7 4 3
## 8 across 8 2 4
## 9 act 9 1 2
## 10 active 10 3 3
## # ℹ 970 more rowssentences is a collection of 720 phrases:
length(sentences)
## [1] 720
head(sentences)
## [1] "The birch canoe slid on the smooth planks."
## [2] "Glue the sheet to the dark blue background."
## [3] "It's easy to tell the depth of a well."
## [4] "These days a chicken leg is a rare dish."
## [5] "Rice is often served in round bowls."
## [6] "The juice of lemons makes fine punch."Suppose we want to find all sentences that contain a colour.
Create a collection of colours:
(colours <- c("red", "orange", "yellow", "green", "blue", "purple"))
## [1] "red" "orange" "yellow" "green" "blue" "purple"
(colour_match <- str_c(colours, collapse = "|"))
## [1] "red|orange|yellow|green|blue|purple"Select the sentences that contain a colour, and then extract the colour to figure out which one it is:
(has_colour <- str_subset(sentences, colour_match))
## [1] "Glue the sheet to the dark blue background."
## [2] "Two blue fish swam in the tank."
## [3] "The colt reared and threw the tall rider."
## [4] "The wide road shimmered in the hot sun."
## [5] "See the cat glaring at the scared mouse."
## [6] "A wisp of cloud hung in the blue air."
## [7] "Leaves turn brown and yellow in the fall."
## [8] "He ordered peach pie with ice cream."
## [9] "Pure bred poodles have curls."
## [10] "The spot on the blotter was made by green ink."
## [11] "Mud was spattered on the front of his white shirt."
## [12] "The sofa cushion is red and of light weight."
## [13] "The sky that morning was clear and bright blue."
## [14] "Torn scraps littered the stone floor."
## [15] "The doctor cured him with these pills."
## [16] "The new girl was fired today at noon."
## [17] "The third act was dull and tired the players."
## [18] "A blue crane is a tall wading bird."
## [19] "Live wires should be kept covered."
## [20] "It is hard to erase blue or red ink."
## [21] "The wreck occurred by the bank on Main Street."
## [22] "The lamp shone with a steady green flame."
## [23] "The box is held by a bright red snapper."
## [24] "The prince ordered his head chopped off."
## [25] "The houses are built of red clay bricks."
## [26] "The red tape bound the smuggled food."
## [27] "Nine men were hired to dig the ruins."
## [28] "The flint sputtered and lit a pine torch."
## [29] "Hedge apples may stain your hands green."
## [30] "The old pan was covered with hard fudge."
## [31] "The plant grew large and green in the window."
## [32] "The store walls were lined with colored frocks."
## [33] "The purple tie was ten years old."
## [34] "Bathe and relax in the cool green grass."
## [35] "The clan gathered on each dull night."
## [36] "The lake sparkled in the red hot sun."
## [37] "Mark the spot with a sign painted red."
## [38] "Smoke poured out of every crack."
## [39] "Serve the hot rum to the tired heroes."
## [40] "The couch cover and hall drapes were blue."
## [41] "He offered proof in the form of a large chart."
## [42] "A man in a blue sweater sat at the desk."
## [43] "A sip of tea revives his tired friend."
## [44] "The door was barred, locked, and bolted as well."
## [45] "A thick coat of black paint covered all."
## [46] "The small red neon lamp went out."
## [47] "Paint the sockets in the wall dull green."
## [48] "Wake and rise, and step into the green outdoors."
## [49] "The green light in the brown box flickered."
## [50] "He put his last cartridge into the gun and fired."
## [51] "The ram scared the school children off."
## [52] "Tear a thin sheet from the yellow pad."
## [53] "Dimes showered down from all sides."
## [54] "The sky in the west is tinged with orange red."
## [55] "The red paper brightened the dim stage."
## [56] "The hail pattered on the burnt brown grass."
## [57] "The big red apple fell to the ground."
(matches <- str_extract(has_colour, colour_match))
## [1] "blue" "blue" "red" "red" "red" "blue" "yellow" "red"
## [9] "red" "green" "red" "red" "blue" "red" "red" "red"
## [17] "red" "blue" "red" "blue" "red" "green" "red" "red"
## [25] "red" "red" "red" "red" "green" "red" "green" "red"
## [33] "purple" "green" "red" "red" "red" "red" "red" "blue"
## [41] "red" "blue" "red" "red" "red" "red" "green" "green"
## [49] "green" "red" "red" "yellow" "red" "orange" "red" "red"
## [57] "red"
str_extract() only extracts the first match.
more <- sentences[str_count(sentences, colour_match) > 1]
str_view_all(more, colour_match)
## [1] │ It is hard to erase <blue> or <red> ink.
## [2] │ The <green> light in the brown box flicke<red>.
## [3] │ The sky in the west is tinged with <orange> <red>.str_extract_all() extracts all matches:
str_extract_all(more, colour_match)
## [[1]]
## [1] "blue" "red"
##
## [[2]]
## [1] "green" "red"
##
## [[3]]
## [1] "orange" "red"
Setting simplify = TRUE in str_extract_all() returns a matrix, with short matches
padded with empty strings to the same length as the longest:
str_extract_all(more, colour_match, simplify = TRUE)
## [,1] [,2]
## [1,] "blue" "red"
## [2,] "green" "red"
## [3,] "orange" "red"
x <- c("a", "a b", "a b c")
str_extract_all(x, "[a-z]", simplify = TRUE)
## [,1] [,2] [,3]
## [1,] "a" "" ""
## [2,] "a" "b" ""
## [3,] "a" "b" "c"str_extract() gives us the complete match:
# why does "([^ ]+)" match a word?
noun <- "(a|the) ([^ ]+)"
has_noun <- sentences %>%
str_subset(noun) %>%
head(10)
has_noun %>%
str_extract(noun)
## [1] "the smooth" "the sheet" "the depth" "a chicken" "the parked"
## [6] "the sun" "the huge" "the ball" "the woman" "a helps"str_match() gives each individual component:
has_noun %>%
str_match(noun)
## [,1] [,2] [,3]
## [1,] "the smooth" "the" "smooth"
## [2,] "the sheet" "the" "sheet"
## [3,] "the depth" "the" "depth"
## [4,] "a chicken" "a" "chicken"
## [5,] "the parked" "the" "parked"
## [6,] "the sun" "the" "sun"
## [7,] "the huge" "the" "huge"
## [8,] "the ball" "the" "ball"
## [9,] "the woman" "the" "woman"
## [10,] "a helps" "a" "helps"tidyr::extract() works with tibble:
tibble(sentence = sentences) %>%
tidyr::extract(
sentence, c("article", "noun"), "(a|the) ([^ ]+)",
remove = FALSE
)
## # A tibble: 720 × 3
## sentence article noun
## <chr> <chr> <chr>
## 1 The birch canoe slid on the smooth planks. the smooth
## 2 Glue the sheet to the dark blue background. the sheet
## 3 It's easy to tell the depth of a well. the depth
## 4 These days a chicken leg is a rare dish. a chicken
## 5 Rice is often served in round bowls. <NA> <NA>
## 6 The juice of lemons makes fine punch. <NA> <NA>
## 7 The box was thrown beside the parked truck. the parked
## 8 The hogs were fed chopped corn and garbage. <NA> <NA>
## 9 Four hours of steady work faced us. <NA> <NA>
## 10 A large size in stockings is hard to sell. <NA> <NA>
## # ℹ 710 more rowsReplace the first match:
x <- c("apple", "pear", "banana")
str_replace(x, "[aeiou]", "-")
## [1] "-pple" "p-ar" "b-nana"Replace all matches:
str_replace_all(x, "[aeiou]", "-")
## [1] "-ppl-" "p--r" "b-n-n-"Multiple replacement:
x <- c("1 house", "2 cars", "3 people")
str_replace_all(x, c("1" = "one", "2" = "two", "3" = "three"))
## [1] "one house" "two cars" "three people"Back-reference:
# flip the order of the second and third words
sentences %>%
str_replace("([^ ]+) ([^ ]+) ([^ ]+)", "\\1 \\3 \\2") %>%
head(5)
## [1] "The canoe birch slid on the smooth planks."
## [2] "Glue sheet the to the dark blue background."
## [3] "It's to easy tell the depth of a well."
## [4] "These a days chicken leg is a rare dish."
## [5] "Rice often is served in round bowls."Split a string up into pieces:
sentences %>%
head(5) %>%
str_split(" ")
## [[1]]
## [1] "The" "birch" "canoe" "slid" "on" "the" "smooth"
## [8] "planks."
##
## [[2]]
## [1] "Glue" "the" "sheet" "to" "the"
## [6] "dark" "blue" "background."
##
## [[3]]
## [1] "It's" "easy" "to" "tell" "the" "depth" "of" "a" "well."
##
## [[4]]
## [1] "These" "days" "a" "chicken" "leg" "is" "a"
## [8] "rare" "dish."
##
## [[5]]
## [1] "Rice" "is" "often" "served" "in" "round" "bowls."Use simplify = TRUE to return a matrix:
sentences %>%
head(5) %>%
str_split(" ", simplify = TRUE)
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8]
## [1,] "The" "birch" "canoe" "slid" "on" "the" "smooth" "planks."
## [2,] "Glue" "the" "sheet" "to" "the" "dark" "blue" "background."
## [3,] "It's" "easy" "to" "tell" "the" "depth" "of" "a"
## [4,] "These" "days" "a" "chicken" "leg" "is" "a" "rare"
## [5,] "Rice" "is" "often" "served" "in" "round" "bowls." ""
## [,9]
## [1,] ""
## [2,] ""
## [3,] "well."
## [4,] "dish."
## [5,] ""
A number of built-in convenience classes of characters:
.: Any character except new line \n.
\s: White space.
\S: Not white space.
\d: Digit (0-9).
\D: Not digit.
\w: Word character (A-Z, a-z, 0-9, or _).
\W: Not a word character.
Example: How to match a telephone number with the form (###) ###-####?
text = c("apple", "(219) 733-8965", "(329) 293-8753")
str_detect(text, "(\d\d\d) \d\d\d-\d\d\d\d")
## Error: '\d' is an unrecognized escape in character string (<text>:2:21)
text = c("apple", "(219) 733-8965", "(329) 293-8753")
str_detect(text, "(\\d\\d\\d) \\d\\d\\d-\\d\\d\\d\\d")
## [1] FALSE FALSE FALSE
str_detect(text, "\\(\\d\\d\\d\\) \\d\\d\\d-\\d\\d\\d\\d")
## [1] FALSE TRUE TRUE
[abc]: List (a or b or c)
[^abc]: Excluded list (not a or b or c)
[a-q]: Range: lower-case letter from a to q
[A-Q]: Range: upper-case letter from A to Q
[0-7]: Digit from 0 to 7
text = c("apple", "(219) 733-8965", "(329) 293-8753")
str_replace_all(text, "[aeiou]", "") # strip all vowels
## [1] "ppl" "(219) 733-8965" "(329) 293-8753"
str_replace_all(text, "[13579]", "*")
## [1] "apple" "(2**) ***-8*6*" "(*2*) 2**-8***"
str_replace_all(text, "[1-5a-ep]", "*")
## [1] "***l*" "(**9) 7**-896*" "(**9) *9*-87**"