Song of the Day

Main Ideas

Coming Up

Lecture Notes and Exercises

In addition to the tidyverse, we will use the stringr package.

library(tidyverse)
library(stringr)

stringr provides tools to work with character strings. Functions in stringr have consistent, memorable names.

Preliminaries

Character strings in R are defined by double quotation marks. These can include numbers, letters, punctuation, whitespace, etc.

string1 <- "STA 199 is my favorite class"
string1
## [1] "STA 199 is my favorite class"

You can combine character strings in a vector.

string2 <- c("STA 199", "Data Science", "Duke")
string2
## [1] "STA 199"      "Data Science" "Duke"

Question: What if we want to include a quotation in a string? Why doesn’t the code below work?

string3 <- "I said "Hello" to my class"

To include a double quote in a string, escape it using a backslash. Try it now in the code chunk below and name your string string4.

string4 <- "I said \"Hello\" to my class"

If you want to include an actual backslash, escape it as shown below. This may seem tedious but it will be important later.

string5 <- "\\"

The function writeLines() shows the content of the strings not including escapes. Try it for string1, string2, string4, and string5 in the code chunk below. (string3 cannot be printed: the code that defines it produces an error, so it was never created.)

writeLines(string1)
## STA 199 is my favorite class
writeLines(string2)
## STA 199
## Data Science
## Duke
writeLines(string4)
## I said "Hello" to my class
writeLines(string5)
## \

U.S. States

To demonstrate the basic functions from stringr we will use a vector of all 50 U.S. states.

states <- c("alabama", "alaska", "arizona", "arkansas", "california", 
            "colorado", "connecticut", "delaware", "florida", "georgia", 
            "hawaii", "idaho", "illinois", "indiana", "iowa", "kansas", 
            "kentucky", "louisiana", "maine", "maryland", "massachusetts", 
            "michigan", "minnesota", "mississippi", "missouri", "montana", 
            "nebraska", "nevada", "new hampshire", "new jersey", 
            "new mexico", "new york", "north carolina", "north dakota", "ohio", 
            "oklahoma", "oregon", "pennsylvania", "rhode island",
            "south carolina", "south dakota", "tennessee", "texas", "utah", 
            "vermont", "virginia", "washington", "west virginia", "wisconsin",
            "wyoming")

str_length()

Given a string, return the number of characters.

string1
## [1] "STA 199 is my favorite class"
str_length(string1)
## [1] 28

Given a vector of strings, return the number of characters in each string.

str_length(states)
##  [1]  7  6  7  8 10  8 11  8  7  7  6  5  8  7  4  6  8  9  5  8 13  8  9 11  8
## [26]  7  8  6 13 10 10  8 14 12  4  8  6 12 12 14 12  9  5  4  7  8 10 13  9  7

str_c()

Combine two (or more) strings.

str_c("STA 199", "is", "my", "favorite", "class")
## [1] "STA 199ismyfavoriteclass"

Use sep to specify how the strings are separated.

str_c("STA 199", "is", "my", "favorite", "class", sep = " ")
## [1] "STA 199 is my favorite class"

str_to_lower() and str_to_upper()

Convert the case of a string from lower to upper or vice versa.

str_to_upper(states)
##  [1] "ALABAMA"        "ALASKA"         "ARIZONA"        "ARKANSAS"      
##  [5] "CALIFORNIA"     "COLORADO"       "CONNECTICUT"    "DELAWARE"      
##  [9] "FLORIDA"        "GEORGIA"        "HAWAII"         "IDAHO"         
## [13] "ILLINOIS"       "INDIANA"        "IOWA"           "KANSAS"        
## [17] "KENTUCKY"       "LOUISIANA"      "MAINE"          "MARYLAND"      
## [21] "MASSACHUSETTS"  "MICHIGAN"       "MINNESOTA"      "MISSISSIPPI"   
## [25] "MISSOURI"       "MONTANA"        "NEBRASKA"       "NEVADA"        
## [29] "NEW HAMPSHIRE"  "NEW JERSEY"     "NEW MEXICO"     "NEW YORK"      
## [33] "NORTH CAROLINA" "NORTH DAKOTA"   "OHIO"           "OKLAHOMA"      
## [37] "OREGON"         "PENNSYLVANIA"   "RHODE ISLAND"   "SOUTH CAROLINA"
## [41] "SOUTH DAKOTA"   "TENNESSEE"      "TEXAS"          "UTAH"          
## [45] "VERMONT"        "VIRGINIA"       "WASHINGTON"     "WEST VIRGINIA" 
## [49] "WISCONSIN"      "WYOMING"

str_sub()

Extract parts of a string from start to end, inclusive.

str_sub(states, 1, 4)
##  [1] "alab" "alas" "ariz" "arka" "cali" "colo" "conn" "dela" "flor" "geor"
## [11] "hawa" "idah" "illi" "indi" "iowa" "kans" "kent" "loui" "main" "mary"
## [21] "mass" "mich" "minn" "miss" "miss" "mont" "nebr" "neva" "new " "new "
## [31] "new " "new " "nort" "nort" "ohio" "okla" "oreg" "penn" "rhod" "sout"
## [41] "sout" "tenn" "texa" "utah" "verm" "virg" "wash" "west" "wisc" "wyom"
str_sub(states, -4, -1)
##  [1] "bama" "aska" "zona" "nsas" "rnia" "rado" "icut" "ware" "rida" "rgia"
## [11] "waii" "daho" "nois" "iana" "iowa" "nsas" "ucky" "iana" "aine" "land"
## [21] "etts" "igan" "sota" "ippi" "ouri" "tana" "aska" "vada" "hire" "rsey"
## [31] "xico" "york" "lina" "kota" "ohio" "homa" "egon" "ania" "land" "lina"
## [41] "kota" "ssee" "exas" "utah" "mont" "inia" "gton" "inia" "nsin" "ming"

Practice: Combine str_sub() and str_to_upper() to capitalize each state (you can ignore two word states).

str_sub(states, 1, 1) <- str_to_upper(str_sub(states, 1, 1))
states
##  [1] "Alabama"        "Alaska"         "Arizona"        "Arkansas"      
##  [5] "California"     "Colorado"       "Connecticut"    "Delaware"      
##  [9] "Florida"        "Georgia"        "Hawaii"         "Idaho"         
## [13] "Illinois"       "Indiana"        "Iowa"           "Kansas"        
## [17] "Kentucky"       "Louisiana"      "Maine"          "Maryland"      
## [21] "Massachusetts"  "Michigan"       "Minnesota"      "Mississippi"   
## [25] "Missouri"       "Montana"        "Nebraska"       "Nevada"        
## [29] "New hampshire"  "New jersey"     "New mexico"     "New york"      
## [33] "North carolina" "North dakota"   "Ohio"           "Oklahoma"      
## [37] "Oregon"         "Pennsylvania"   "Rhode island"   "South carolina"
## [41] "South dakota"   "Tennessee"      "Texas"          "Utah"          
## [45] "Vermont"        "Virginia"       "Washington"     "West virginia" 
## [49] "Wisconsin"      "Wyoming"

str_sort()

Sort a character vector. Below we sort in decreasing alphabetical order.

str_sort(states, decreasing = TRUE)
##  [1] "Wyoming"        "Wisconsin"      "West virginia"  "Washington"    
##  [5] "Virginia"       "Vermont"        "Utah"           "Texas"         
##  [9] "Tennessee"      "South dakota"   "South carolina" "Rhode island"  
## [13] "Pennsylvania"   "Oregon"         "Oklahoma"       "Ohio"          
## [17] "North dakota"   "North carolina" "New york"       "New mexico"    
## [21] "New jersey"     "New hampshire"  "Nevada"         "Nebraska"      
## [25] "Montana"        "Missouri"       "Mississippi"    "Minnesota"     
## [29] "Michigan"       "Massachusetts"  "Maryland"       "Maine"         
## [33] "Louisiana"      "Kentucky"       "Kansas"         "Iowa"          
## [37] "Indiana"        "Illinois"       "Idaho"          "Hawaii"        
## [41] "Georgia"        "Florida"        "Delaware"       "Connecticut"   
## [45] "Colorado"       "California"     "Arkansas"       "Arizona"       
## [49] "Alaska"         "Alabama"

Regular Expressions

A regular expression is a sequence of characters that allows you to describe string patterns. We use them to search for patterns.

Examples of usage include the following data science tasks:

To demonstrate regular expressions, we will use a vector of the states bordering North Carolina.

nc_states <- c("North Carolina", "South Carolina", "Virginia", "Tennessee", 
               "Georgia")

Basic Match

We can match exactly using a basic match.

str_view_all(nc_states, "in")

We can match any character using .

str_view_all(nc_states, ".a")

Question: What if we want to match a period .?

Escape it using a backslash: \. is the regular expression.

But we represent regular expressions using strings, and \ is also an escape symbol in strings.

Escape again!

To create the regular expression \., use the string "\\."

str_view_all(c("a.c", "abc", "def"), "a\\.c")

Anchors

Match the start of a string using ^.

str_view_all(nc_states, "^G")

Match the end of a string using $.

str_view_all(nc_states, "a$")

str_detect()

Determine whether each element of a character vector matches a pattern.

nc_states
## [1] "North Carolina" "South Carolina" "Virginia"       "Tennessee"     
## [5] "Georgia"
str_detect(nc_states, "a")
## [1]  TRUE  TRUE  TRUE FALSE  TRUE

str_subset()

Keep only the elements of a character vector that match a pattern.

nc_states
## [1] "North Carolina" "South Carolina" "Virginia"       "Tennessee"     
## [5] "Georgia"
str_subset(nc_states, "e$")
## [1] "Tennessee"

str_count()

Determine how many matches there are in a string.

nc_states
## [1] "North Carolina" "South Carolina" "Virginia"       "Tennessee"     
## [5] "Georgia"
str_count(nc_states, "a")
## [1] 2 2 1 0 1

str_replace() and str_replace_all()

Replace matches with new strings.

str_replace(nc_states, "a", "-")
## [1] "North C-rolina" "South C-rolina" "Virgini-"       "Tennessee"     
## [5] "Georgi-"

Use str_replace_all() to replace all matches with new strings.

str_replace_all(nc_states, "a", "-")
## [1] "North C-rolin-" "South C-rolin-" "Virgini-"       "Tennessee"     
## [5] "Georgi-"

Many Matches

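The regular expressions below match more than one character.

\d matches any digit.
\s matches any whitespace character (e.g., space, tab, newline).
[abc] matches a, b, or c.
[^abc] matches anything except a, b, or c.

Remember these are regular expressions! To match digits you’ll need to escape the \, so use "\\d", not "\d".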

Practice

To practice manipulating strings we will use question and answer data from two recent seasons (2008 - 2009) of the television game show Jeopardy!.

jeopardy <- read_csv("data/questions.csv")
glimpse(jeopardy)
## Rows: 40,865
## Columns: 5
## $ category <chr> "OLD FOLKS IN THEIR 30s", "MOVIES & TV", "A STATE OF COLLEGE…
## $ value    <dbl> 200, 200, 200, 200, 200, 200, 400, 400, 400, 400, 400, 400, …
## $ question <chr> "goop.com is a lifestyles website from this Oscar-winning ac…
## $ answer   <chr> "Gwyneth Paltrow", "Jay Leno", "Texas", "a pride", "a bunny …
## $ year     <dbl> 2009, 2009, 2009, 2009, 2009, 2009, 2009, 2009, 2009, 2009, …
  1. Use a single code pipeline and a function from stringr to return all rows where the answer contains the word “Durham”
jeopardy %>%
  filter(str_detect(answer, "Durham"))
## # A tibble: 3 x 5
##   category     value question                                   answer      year
##   <chr>        <dbl> <chr>                                      <chr>      <dbl>
## 1 BULL          2000 "\"Bull City\", this place's nickname, is… Durham      2009
## 2 BASEBRAWL     1000 "In 1995 10 players were ejected for a br… the Durha…  2009
## 3 MOVIES BY Q…   800 "Crash: \"Man, that ball got out of here … Bull Durh…  2009
  1. Use a single code pipeline and stringr to find the length of all of the answers, sort by decreasing length, and return the five longest answers.
# Measure every answer, keep only the answer and its length, and show the
# five longest answers.
jeopardy %>%
  transmute(answer, answer_length = str_length(answer)) %>%
  arrange(desc(answer_length)) %>%
  head(5)
## # A tibble: 5 x 2
##   answer                                                           answer_length
##   <chr>                                                                    <int>
## 1 a microphone & the masks of comedy & tragedy (a TV set, a movie…            86
## 2 hiding your light under a bushel (keeping your light underneath…            82
## 3 International Talk Like a Pirate Day (National Talk Like a Pira…            79
## 4 (any of) the (St. Louis) Rams, the Oakland Raiders, or the San …            77
## 5 to take the number that's between 3 and 5 (averaging the 2 midd…            74
  1. What answer has the most digits?
# Count the digit characters in each answer and list the three answers
# that contain the most.
jeopardy %>%
  select(answer) %>%
  mutate(answer_digits = str_count(answer, "\\d")) %>%
  arrange(desc(answer_digits)) %>%
  head(3)
## # A tibble: 3 x 2
##   answer         answer_digits
##   <chr>                  <int>
## 1 1939 (or 1942)             8
## 2 1952 & 1956                8
## 3 867-5309                   7
  1. Return all rows where the category has a period.
jeopardy %>%
  filter(str_detect(category, "\\."))
## # A tibble: 1,249 x 5
##    category      value question                                answer       year
##    <chr>         <dbl> <chr>                                   <chr>       <dbl>
##  1 I LOVE L.A. …   400 "Kobe called it \"idiotic criticism\" … Shaquille …  2009
##  2 I LOVE L.A. …   800 "A wizard at passing the ball, this La… Magic John…  2009
##  3 I LOVE L.A. …  1200 "This Laker giant was nicknamed \"The … Wilt Chamb…  2009
##  4 I LOVE L.A. …  1600 "This Hall-of-Fame guard & former Lake… Jerry West   2009
##  5 I LOVE L.A. …  2000 "This flashy Lakers forward was nickna… James Wort…  2009
##  6 IT'S AN L.A.…   200 "Wanna live in this city, 90210? in Ju… Beverly Hi…  2009
##  7 IT'S AN L.A.…   400 "Originally the letters in this landma… the Hollyw…  2009
##  8 IT'S AN L.A.…   600 "Good times are Bruin in this district… Westwood     2009
##  9 IT'S AN L.A.…   800 "You can hit the Comedy Store, House o… Sunset Str…  2009
## 10 IT'S AN L.A.…  1000 "Originally called \"Nuestro Pueblo\" … the Watts …  2009
## # … with 1,239 more rows
  1. Using a single code pipeline, return all rows where the question contains a (numeric) year between 1800 and 1999
# Keep questions that mention a four-digit year from 1800 to 1999.
# The lookarounds (?<!\d) and (?!\d) require that the four digits are not
# part of a longer run of digits, so numbers such as "21850" or "18005" are
# not mistaken for years, while "1959," and "1980s" still match.
# NOTE: because this pattern is stricter than the plain "1[89]\\d\\d", the
# number of rows returned can be lower than the count printed with the
# looser pattern.
jeopardy %>%
  filter(str_detect(question, "(?<!\\d)1[89]\\d{2}(?!\\d)")) %>%
  select(question)
## # A tibble: 6,749 x 1
##    question                                                                     
##    <chr>                                                                        
##  1 "During the War Of 1812, this \"Rip Van Winkle\" author wrote biographies of…
##  2 "(<a href=\"http://www.j-archive.com/media/2009-05-08_DJ_28.jpg\" target=\"_…
##  3 "He reviewed films & TV for the New Republic before his first book, \"Goodby…
##  4 "While he was in Spain in 1959, he wrote \"The Dangerous Summer\", a story a…
##  5 "In 1884 she moved to Red Cloud, Nebraska & later fictionalized it as the to…
##  6 "1980: \"Regular Folks\""                                                    
##  7 "In 1986 Mexico scored as the first country to host this international sport…
##  8 "1932: \"Magnificent Inn\""                                                  
##  9 "1976: \"A Single Colorado Mountain\""                                       
## 10 "1954: \"Dockside\""                                                         
## # … with 6,739 more rows
  1. Using a single code pipeline, return all rows with answers that begin with three vowels.
# Keep answers whose first three characters are all vowels (either case).
# ^ anchors the match at the start of the string and {3} repeats the vowel
# character class exactly three times.
jeopardy %>%
  filter(str_detect(answer, "^[AEIOUaeiou]{3}")) %>%
  select(answer)
## # A tibble: 7 x 1
##   answer   
##   <chr>    
## 1 Ouija    
## 2 AAA      
## 3 Aeolus   
## 4 Aeon Flux
## 5 Aeolus   
## 6 aioli    
## 7 Ouija
  1. Using a single code pipeline, return all answers that end with ugh but not ough.
# Keep answers that end in "ugh" but not in "ough".
# The negative lookbehind (?<!o) only checks that the character before "ugh"
# is not an "o"; unlike "[^o]ugh$" it does not require a preceding character
# to exist, so an answer that is exactly "ugh" is also returned.
jeopardy %>%
  filter(str_detect(answer, "(?<!o)ugh$")) %>%
  select(answer)
## # A tibble: 5 x 1
##   answer         
##   <chr>          
## 1 (Rush) Limbaugh
## 2 laugh          
## 3 Evelyn Waugh   
## 4 Rush Limbaugh  
## 5 Rush Limbaugh
  1. Use a single code pipeline to create a new variable prop_vowel that is the proportion of all letters in each answer that are vowels. What is the highest? Lowest?
# For every answer, count the vowels and the letters and compute the share
# of letters that are vowels. Restrict to answers with more than five
# letters, then show the three highest and the three lowest shares.
jeopardy %>%
  transmute(
    answer,
    vowels = str_count(answer, "[AEIOUaeiou]"),
    letters = str_count(answer, "[[:alpha:]]"),
    prop_vowel = vowels / letters
  ) %>%
  filter(letters > 5, !is.na(prop_vowel)) %>%
  arrange(desc(prop_vowel)) %>%
  # first three rows (highest) followed by the last three rows (lowest)
  slice(c(1:3, (n() - 2):n()))
## # A tibble: 6 x 4
##   answer         vowels letters prop_vowel
##   <chr>           <int>   <int>      <dbl>
## 1 a queue             5       6      0.833
## 2 a lei & a lee       6       8      0.75 
## 3 queue / cue         6       8      0.75 
## 4 Lynyrd Skynyrd      0      13      0    
## 5 Lynyrd Skynyrd      0      13      0    
## 6 rhythms             0       7      0

Additional Resources