Chapter 15 String Pattern Matching

What You’ll Learn:

  • Regular expressions basics
  • grep family functions
  • Pattern matching errors
  • Replacement operations
  • Common regex pitfalls

Key Errors Covered: 12+ pattern matching errors

Difficulty: ⭐⭐⭐ Advanced

15.1 Introduction

Pattern matching in strings is powerful but error-prone:

# Try to match a pattern
text <- c("file1.txt", "file2.csv", "file3.txt")
grep(".", text)  # Expect to find the dots
#> [1] 1 2 3
# But . in regex means "any character"!
grep(".", text)  # Matches everything!
#> [1] 1 2 3

Let’s master pattern matching and avoid regex pitfalls.

15.2 Pattern Matching Basics

💡 Key Insight: grep Family Functions

texts <- c("apple", "banana", "apricot", "cherry")

# grep: return indices of matches
grep("ap", texts)
#> [1] 1 3
texts[grep("ap", texts)]
#> [1] "apple"   "apricot"

# grepl: return logical vector
grepl("ap", texts)
#> [1]  TRUE FALSE  TRUE FALSE

# sub: replace first match
sub("a", "X", texts)
#> [1] "Xpple"   "bXnana"  "Xpricot" "cherry"

# gsub: replace all matches
gsub("a", "X", texts)
#> [1] "Xpple"   "bXnXnX"  "Xpricot" "cherry"

# regexpr: position of first match
regexpr("a", texts)
#> [1]  1  2  1 -1
#> attr(,"match.length")
#> [1]  1  1  1 -1
#> attr(,"index.type")
#> [1] "chars"
#> attr(,"useBytes")
#> [1] TRUE

# gregexpr: positions of all matches
gregexpr("a", texts)
#> [[1]]
#> [1] 1
#> attr(,"match.length")
#> [1] 1
#> attr(,"index.type")
#> [1] "chars"
#> attr(,"useBytes")
#> [1] TRUE
#> 
#> [[2]]
#> [1] 2 4 6
#> attr(,"match.length")
#> [1] 1 1 1
#> attr(,"index.type")
#> [1] "chars"
#> attr(,"useBytes")
#> [1] TRUE
#> 
#> [[3]]
#> [1] 1
#> attr(,"match.length")
#> [1] 1
#> attr(,"index.type")
#> [1] "chars"
#> attr(,"useBytes")
#> [1] TRUE
#> 
#> [[4]]
#> [1] -1
#> attr(,"match.length")
#> [1] -1
#> attr(,"index.type")
#> [1] "chars"
#> attr(,"useBytes")
#> [1] TRUE

Key differences:

  • grep() → indices
  • grepl() → TRUE/FALSE
  • sub() → replace first
  • gsub() → replace all
  • *expr() → positions

15.3 Error #1: invalid regular expression

⭐⭐⭐ ADVANCED 🔤 SYNTAX

15.3.1 The Error

# Unmatched bracket
grep("[abc", c("a", "b", "c"))
#> Warning in grep("[abc", c("a", "b", "c")): TRE pattern compilation error
#> 'Missing ']''
#> Error in grep("[abc", c("a", "b", "c")): invalid regular expression '[abc', reason 'Missing ']''

🔴 ERROR

Error in grep("[abc", c("a", "b", "c")) : 
  invalid regular expression '[abc', reason 'Missing ']''

15.3.2 What It Means

Your regular expression has invalid syntax.

15.3.3 Common Invalid Patterns

# Unmatched brackets
grep("[abc", "test")
#> Warning in grep("[abc", "test"): TRE pattern compilation error 'Missing ']''
#> Error in grep("[abc", "test"): invalid regular expression '[abc', reason 'Missing ']''
grep("abc]", "test")
#> integer(0)

# Unmatched parentheses
grep("(abc", "test")
#> Warning in grep("(abc", "test"): TRE pattern compilation error 'Missing ')''
#> Error in grep("(abc", "test"): invalid regular expression '(abc', reason 'Missing ')''
grep("abc)", "test")
#> integer(0)

# Invalid repetition
grep("a{2,1}", "test")  # max < min
#> Warning in grep("a{2,1}", "test"): TRE pattern compilation error 'Invalid
#> contents of {}'
#> Error in grep("a{2,1}", "test"): invalid regular expression 'a{2,1}', reason 'Invalid contents of {}'

# Trailing backslash
grep("test\\", "test")
#> Warning in grep("test\\", "test"): TRE pattern compilation error 'Trailing
#> backslash'
#> Error in grep("test\\", "test"): invalid regular expression 'test\', reason 'Trailing backslash'

# Invalid escape
grep("\\k", "test")  # \k is not a defined escape, yet TRE accepts it without error
#> integer(0)

15.3.4 Solutions

SOLUTION 1: Escape Special Characters

# To match literal special characters, escape them
special_chars <- c(".", "*", "+", "?", "[", "]", "(", ")", 
                   "{", "}", "^", "$", "|", "\\")

# Match literal dot
grep("\\.", c("file.txt", "file_txt"))
#> [1] 1

# Match literal bracket
grep("\\[", c("[test]", "test"))
#> [1] 1

# Match literal backslash
grep("\\\\", c("C:\\path", "C:/path"))
#> [1] 1

SOLUTION 2: Use fixed = TRUE for Literals

# When you want literal matching, not regex
grep(".", c("file.txt", "file_txt"), fixed = TRUE)
#> [1] 1

# Works with all special characters
grep("[abc]", c("[abc]", "abc"), fixed = TRUE)
#> [1] 1

# Much simpler for file extensions
grep(".txt", c("file.txt", "file.csv"), fixed = TRUE)
#> [1] 1

SOLUTION 3: Validate Pattern First

# Report whether `pattern` compiles as a regular expression.
# Returns TRUE when grep() accepts the pattern; otherwise emits the
# underlying reason via message() and returns FALSE.
is_valid_regex <- function(pattern) {
  report_failure <- function(err) {
    message("Invalid regex: ", conditionMessage(err))
    FALSE
  }

  tryCatch(
    {
      # The match result is irrelevant; only a compilation error matters here.
      grep(pattern, "test")
      TRUE
    },
    error = report_failure
  )
}

# Test
is_valid_regex("[abc")    # FALSE
#> Warning in grep(pattern, "test"): TRE pattern compilation error 'Missing ']''
#> Invalid regex: invalid regular expression '[abc', reason 'Missing ']''
#> [1] FALSE
is_valid_regex("[abc]")   # TRUE
#> [1] TRUE

15.4 Regular Expression Special Characters

💡 Key Insight: Regex Special Characters

texts <- c("abc", "a.c", "a*c", "aXc", "ac", "abbc")

# . = any single character
grep("a.c", texts, value = TRUE)
#> [1] "abc" "a.c" "a*c" "aXc"

# * = zero or more of previous
grep("ab*c", texts, value = TRUE)
#> [1] "abc"  "ac"   "abbc"

# + = one or more of previous
grep("ab+c", texts, value = TRUE)
#> [1] "abc"  "abbc"

# ? = zero or one of previous
grep("ab?c", texts, value = TRUE)
#> [1] "abc" "ac"

# ^ = start of string
grep("^a", texts, value = TRUE)
#> [1] "abc"  "a.c"  "a*c"  "aXc"  "ac"   "abbc"

# $ = end of string
grep("c$", texts, value = TRUE)
#> [1] "abc"  "a.c"  "a*c"  "aXc"  "ac"   "abbc"

# [abc] = any of a, b, or c
grep("a[bX]c", texts, value = TRUE)
#> [1] "abc" "aXc"

# [^abc] = anything except a, b, or c
grep("a[^b]c", texts, value = TRUE)
#> [1] "a.c" "a*c" "aXc"

# | = or
grep("a|c", texts, value = TRUE)
#> [1] "abc"  "a.c"  "a*c"  "aXc"  "ac"   "abbc"

# {n} = exactly n
grep("b{2}", texts, value = TRUE)
#> [1] "abbc"

# {n,} = n or more
grep("b{1,}", texts, value = TRUE)
#> [1] "abc"  "abbc"

# {n,m} = between n and m
grep("b{1,2}", texts, value = TRUE)
#> [1] "abc"  "abbc"

To match literal special characters, escape with \\:

# Match literal dot
grep("\\.", c("a.b", "aXb"), value = TRUE)
#> [1] "a.b"

# Match literal asterisk
grep("\\*", c("a*b", "aXb"), value = TRUE)
#> [1] "a*b"

15.5 Error #2: Pattern Matches Everything/Nothing

⭐⭐ INTERMEDIATE 🧠 LOGIC

15.5.1 The Problem

# Want to find files with dots
files <- c("file1.txt", "file2.csv", "README")

# But . matches any character!
grep(".", files)  # Matches all 3!
#> [1] 1 2 3

# Want to find emails
emails <- c("test@email.com", "notanemail", "another@test.org")

# But simple pattern matches too much
grep("@", emails, value = TRUE)  # OK so far...
#> [1] "test@email.com"   "another@test.org"
grep(".*@.*", emails, value = TRUE)  # Same two matches: the .* on each side adds nothing
#> [1] "test@email.com"   "another@test.org"

15.5.2 Common Pattern Mistakes

texts <- c("abc", "def", "xyz")

# .* matches everything (zero or more any character)
grep(".*", texts)  # All match!
#> [1] 1 2 3

# Wrong escaping
grep(".", texts)   # All match (. is any character)
#> [1] 1 2 3
grep("\\.", texts) # None match (no literal dots)
#> integer(0)

# Too greedy
text <- "value=123&other=456"
sub("=.*", "", text)  # Removes too much! "value"
#> [1] "value"

15.5.3 Solutions

SOLUTION 1: Be Specific

files <- c("file1.txt", "file2.csv", "README")

# Match literal dot
grep("\\.", files, value = TRUE)
#> [1] "file1.txt" "file2.csv"

# Match specific extension
grep("\\.txt$", files, value = TRUE)
#> [1] "file1.txt"

# Match email pattern
emails <- c("test@email.com", "notanemail", "another@test.org")
grep("[A-Za-z0-9.]+@[A-Za-z0-9.]+\\.[A-Za-z]{2,}", emails, value = TRUE)
#> [1] "test@email.com"   "another@test.org"

SOLUTION 2: Use Anchors

texts <- c("apple", "pineapple", "application")

# Without anchor: matches all
grep("app", texts, value = TRUE)
#> [1] "apple"       "pineapple"   "application"

# With ^: only at start
grep("^app", texts, value = TRUE)
#> [1] "apple"       "application"

# With $: only at end
grep("app$", texts, value = TRUE)
#> character(0)

# Exact match
grep("^apple$", texts, value = TRUE)
#> [1] "apple"

SOLUTION 3: Use Non-greedy Matching

text <- "value=123&other=456"

# Greedy: .* runs to the LAST "&" (this text has only one, so the result is the same)
sub("=.*&", "=X&", text)  # "value=X&other=456"
#> [1] "value=X&other=456"

# Non-greedy (in Perl regex): *? or +?
sub("=.*?&", "=X&", text, perl = TRUE)  # "value=X&other=456"
#> [1] "value=X&other=456"

# Alternative: use negated character class
sub("=[^&]*&", "=X&", text)  # "value=X&other=456"
#> [1] "value=X&other=456"

15.6 stringr: Modern String Operations

🎯 Best Practice: Use stringr

library(stringr)

texts <- c("apple", "banana", "apricot")

# Detect pattern (like grepl)
str_detect(texts, "ap")
#> [1]  TRUE FALSE  TRUE

# Which match (like grep)
str_which(texts, "ap")
#> [1] 1 3

# Extract matches
str_subset(texts, "ap")
#> [1] "apple"   "apricot"

# Count matches
str_count(texts, "a")
#> [1] 1 3 1

# Extract pattern
str_extract(texts, "ap")
#> [1] "ap" NA   "ap"
str_extract_all(texts, "a")
#> [[1]]
#> [1] "a"
#> 
#> [[2]]
#> [1] "a" "a" "a"
#> 
#> [[3]]
#> [1] "a"

# Replace
str_replace(texts, "a", "X")      # First match
#> [1] "Xpple"   "bXnana"  "Xpricot"
str_replace_all(texts, "a", "X")  # All matches
#> [1] "Xpple"   "bXnXnX"  "Xpricot"

# Remove pattern
str_remove(texts, "ap")           # First match
#> [1] "ple"    "banana" "ricot"
str_remove_all(texts, "a")        # All matches
#> [1] "pple"   "bnn"    "pricot"

# Split
str_split("a-b-c", "-")
#> [[1]]
#> [1] "a" "b" "c"
str_split("a-b-c", "-", simplify = TRUE)
#>      [,1] [,2] [,3]
#> [1,] "a"  "b"  "c"

# Better error messages
str_detect(texts, "[invalid")  # Clearer error
#> Error in stri_detect_regex(string, pattern, negate = negate, opts_regex = opts(pattern)): Missing closing bracket on a bracket expression. (U_REGEX_MISSING_CLOSE_BRACKET, context=`[invalid`)

15.7 Common Regex Patterns

🎯 Best Practice: Useful Patterns

library(stringr)

# Digits
texts <- c("abc123", "def456", "xyz")
str_extract_all(texts, "\\d+")  # One or more digits
#> [[1]]
#> [1] "123"
#> 
#> [[2]]
#> [1] "456"
#> 
#> [[3]]
#> character(0)

# Non-digits
str_extract_all(texts, "\\D+")  # One or more non-digits
#> [[1]]
#> [1] "abc"
#> 
#> [[2]]
#> [1] "def"
#> 
#> [[3]]
#> [1] "xyz"

# Word characters (letters, digits, underscore)
str_extract_all("hello_world123", "\\w+")
#> [[1]]
#> [1] "hello_world123"

# Whitespace
str_detect("hello world", "\\s")
#> [1] TRUE

# Email (simple)
email_pattern <- "[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Za-z]{2,}"
str_detect("test@email.com", email_pattern)
#> [1] TRUE

# Phone (US)
phone_pattern <- "\\d{3}-\\d{3}-\\d{4}"
str_detect("123-456-7890", phone_pattern)
#> [1] TRUE

# URL (simple)
url_pattern <- "https?://[A-Za-z0-9.-]+"
str_extract("Visit http://example.com", url_pattern)
#> [1] "http://example.com"

# Extract numbers
text <- "Price: $19.99"
str_extract(text, "\\d+\\.?\\d*")
#> [1] "19.99"

# Extract words
text <- "hello world, how are you?"
str_extract_all(text, "\\w+")
#> [[1]]
#> [1] "hello" "world" "how"   "are"   "you"

15.8 Replacement Operations

💡 Key Insight: Replacement Strategies

library(stringr)

texts <- c("apple", "banana", "apricot")

# Simple replacement
str_replace(texts, "a", "X")      # First 'a'
#> [1] "Xpple"   "bXnana"  "Xpricot"
str_replace_all(texts, "a", "X")  # All 'a's
#> [1] "Xpple"   "bXnXnX"  "Xpricot"

# Using captured groups
str_replace("John Smith", "(\\w+) (\\w+)", "\\2, \\1")
#> [1] "Smith, John"

# Multiple replacements
text <- "I have 3 cats and 2 dogs"
str_replace_all(text, c("cats" = "birds", "dogs" = "fish"))
#> [1] "I have 3 birds and 2 fish"

# Conditional replacement
str_replace_all("hello", "l+", "L")  # Multiple l's to one L
#> [1] "heLo"

# Remove pattern
str_remove("Price: $19.99", "\\$")
#> [1] "Price: 19.99"
str_remove_all("a-b-c-d", "-")
#> [1] "abcd"

# Case-insensitive
str_replace("Hello", regex("hello", ignore_case = TRUE), "Hi")
#> [1] "Hi"

15.9 Splitting Strings

⚠️ Common Pitfall: strsplit() Returns List

text <- "a,b,c"

# Returns a LIST
result <- strsplit(text, ",")
class(result)  # "list"
#> [1] "list"
result         # List of 1 element
#> [[1]]
#> [1] "a" "b" "c"

# To get vector, extract first element
result[[1]]
#> [1] "a" "b" "c"

# With multiple strings
texts <- c("a,b,c", "d,e,f")
result <- strsplit(texts, ",")
result         # List of 2 elements
#> [[1]]
#> [1] "a" "b" "c"
#> 
#> [[2]]
#> [1] "d" "e" "f"

# To get all values as vector
unlist(result)
#> [1] "a" "b" "c" "d" "e" "f"

# stringr alternative (also returns list)
str_split(text, ",")
#> [[1]]
#> [1] "a" "b" "c"

# But can simplify
str_split(text, ",", simplify = TRUE)  # Matrix
#>      [,1] [,2] [,3]
#> [1,] "a"  "b"  "c"

# Or use specific extraction
str_split_fixed(text, ",", n = 3)  # Fixed number of pieces
#>      [,1] [,2] [,3]
#> [1,] "a"  "b"  "c"

15.10 Error #3: 'replacement' is not a character vector

⭐ BEGINNER 🔢 TYPE

15.10.1 The Error

texts <- c("price: 10", "price: 20")
sub("price: ", 100, texts)
#> [1] "10010" "10020"

🔴 ERROR

Error in sub("price: ", 100, texts) : 
  invalid 'replacement' argument

15.10.2 What It Means

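The replacement value must be a character string. Note that, as the output above shows, base R's sub() silently coerces a numeric replacement such as 100 with as.character() instead of failing; the "invalid 'replacement' argument" error appears only when the replacement cannot be coerced to a single string (for example NULL). stringr's str_replace() is stricter and rejects a numeric replacement outright.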

15.10.3 Solutions

SOLUTION: Convert Replacement to Character

texts <- c("price: 10", "price: 20")

# Convert to character
sub("price: ", as.character(100), texts)
#> [1] "10010" "10020"

# Or use paste
sub("price: ", paste0("$", 100), texts)
#> [1] "$10010" "$10020"

# With stringr: no auto-conversion — a numeric replacement is rejected
library(stringr)
str_replace(texts, "price: ", 100)  # Error: wrap the number in as.character()
#> Error in `str_replace()`:
#> ! `replacement` must be a character vector, not the number 100.

15.11 Case-Insensitive Matching

🎯 Best Practice: Ignore Case

texts <- c("Apple", "banana", "CHERRY")

# Base R: use ignore.case
grep("apple", texts, ignore.case = TRUE, value = TRUE)
#> [1] "Apple"

# Or convert to same case first
grep("apple", tolower(texts), value = TRUE)
#> [1] "apple"

# stringr: use regex() with ignore_case
library(stringr)
str_subset(texts, regex("apple", ignore_case = TRUE))
#> [1] "Apple"

# In replacement
str_replace(texts, regex("apple", ignore_case = TRUE), "Orange")
#> [1] "Orange" "banana" "CHERRY"

15.12 Unicode and Locales

⚠️ Platform Issue: Locale-Dependent Matching

# Character classes depend on locale
texts <- c("café", "naïve", "résumé")

# May behave differently on different systems
grep("[[:alpha:]]+", texts, value = TRUE)
#> [1] "café"   "naïve"  "résumé"

# Safer: specify UTF-8
Sys.setlocale("LC_CTYPE", "en_US.UTF-8")
#> [1] "en_US.UTF-8"

# Or use Unicode escapes (single backslash: R itself decodes \u00e9 to "é";
# "\\u00e9" would hand the regex engine a literal backslash-u and match nothing)
grep("caf\u00e9", texts, value = TRUE)
#> [1] "café"

# stringr handles Unicode better
library(stringr)
str_detect(texts, "é")  # More consistent across platforms
#> [1]  TRUE FALSE  TRUE

15.13 Extracting Patterns

🎯 Best Practice: Pattern Extraction

library(stringr)

# Extract all numbers
text <- "I have 3 cats, 2 dogs, and 15 fish"
str_extract_all(text, "\\d+")
#> [[1]]
#> [1] "3"  "2"  "15"

# Extract email addresses
text <- "Contact: john@example.com or jane@test.org"
str_extract_all(text, "[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Za-z]{2,}")
#> [[1]]
#> [1] "john@example.com" "jane@test.org"

# Extract with groups
text <- "John Smith, Age: 30"
str_match(text, "(\\w+) (\\w+), Age: (\\d+)")
#>      [,1]                  [,2]   [,3]    [,4]
#> [1,] "John Smith, Age: 30" "John" "Smith" "30"

# Named groups (requires stringr)
str_match(text, "(?<first>\\w+) (?<last>\\w+), Age: (?<age>\\d+)")
#>                            first  last    age 
#> [1,] "John Smith, Age: 30" "John" "Smith" "30"

# Extract between delimiters
text <- "The value is [123] and the code is [ABC]"
str_extract_all(text, "\\[([^\\]]+)\\]")
#> [[1]]
#> [1] "[123]" "[ABC]"
str_match_all(text, "\\[([^\\]]+)\\]")
#> [[1]]
#>      [,1]    [,2] 
#> [1,] "[123]" "123"
#> [2,] "[ABC]" "ABC"

15.14 Debugging Regex

🎯 Best Practice: Test and Debug Patterns

# Test patterns incrementally
text <- "test@email.com"

# Build up pattern piece by piece
str_detect(text, "\\w+")           # Any word chars
#> [1] TRUE
str_detect(text, "\\w+@")          # Word chars + @
#> [1] TRUE
str_detect(text, "\\w+@\\w+")      # Add domain start
#> [1] TRUE
str_detect(text, "\\w+@\\w+\\.")   # Add dot
#> [1] TRUE
str_detect(text, "\\w+@\\w+\\.\\w+")  # Add extension
#> [1] TRUE

# Use str_view() to visualize (if available)
# str_view(text, "\\w+@\\w+\\.\\w+")

# Test on multiple examples
test_cases <- c(
  "valid@email.com",
  "invalid",
  "no@domain",
  "missing.at.sign.com"
)

pattern <- "\\w+@\\w+\\.\\w+"
data.frame(
  text = test_cases,
  matches = str_detect(test_cases, pattern)
)
#>                  text matches
#> 1     valid@email.com    TRUE
#> 2             invalid   FALSE
#> 3           no@domain   FALSE
#> 4 missing.at.sign.com   FALSE

15.15 Summary

Key Takeaways:

  1. Escape special characters - Use \\ or fixed = TRUE
  2. . matches any character - Use \\. for literal dot
  3. Use anchors - ^ for start, $ for end
  4. stringr is easier - Better errors and consistency
  5. Test patterns incrementally - Build complex patterns step by step
  6. strsplit() returns list - Extract with [[1]] or use simplify = TRUE
  7. Replacement must be character - Convert numbers with as.character()

Quick Reference:

Error Cause Fix
invalid regular expression Syntax error in pattern Check brackets, escape specials
Matches everything/nothing Wrong pattern Test incrementally, use anchors
replacement not character Numeric replacement as.character() first
Different results by platform Locale/encoding Use stringr, specify UTF-8

Essential Patterns:

# Special characters
.     # Any character
*     # Zero or more
+     # One or more
?     # Zero or one
^     # Start of string
$     # End of string
|     # Or
[]    # Character class
()    # Group

# Character classes
\\d   # Digit
\\D   # Non-digit
\\w   # Word character
\\W   # Non-word
\\s   # Whitespace
\\S   # Non-whitespace

# Quantifiers
{n}   # Exactly n
{n,}  # n or more
{n,m} # Between n and m

grep Family:

grep(pattern, x)        # Indices
grepl(pattern, x)       # Logical
sub(pattern, repl, x)   # Replace first
gsub(pattern, repl, x)  # Replace all

# stringr alternatives (recommended)
str_detect(x, pattern)
str_which(x, pattern)
str_subset(x, pattern)
str_replace(x, pattern, replacement)
str_replace_all(x, pattern, replacement)
str_extract(x, pattern)
str_extract_all(x, pattern)

Best Practices:

# ✅ Good
grep("\\.", files)                    # Escape special chars
str_detect(text, "^pattern$")         # Use anchors
str_replace_all(text, "a", "X")      # Use stringr
fixed = TRUE                          # For literal matching

# ❌ Avoid
grep(".", files)                      # Matches everything
grep(unvalidated_pattern, text)       # No error checking
sub() with unescaped specials         # Unexpected matches
Platform-dependent locale assumptions # Inconsistent results

15.16 Exercises

📝 Exercise 1: Pattern Building

Build patterns to match:

  1. Valid email addresses
  2. Phone numbers (format: XXX-XXX-XXXX)
  3. Dates (format: YYYY-MM-DD)
  4. URLs starting with http:// or https://

📝 Exercise 2: Text Extraction

From: "Price: $19.99, Quantity: 5 units"

Extract:

  1. The price (numeric only)
  2. The quantity (number only)
  3. Both in a named vector

📝 Exercise 3: Safe Pattern Matching

Write safe_grep(pattern, x) that:

  1. Validates pattern first
  2. Provides helpful errors
  3. Returns indices with option for values
  4. Handles empty inputs

📝 Exercise 4: Text Cleaning

Write clean_identifiers(x) that:

  1. Removes special characters
  2. Converts spaces to underscores
  3. Converts to lowercase
  4. Ensures valid R variable names

15.17 Exercise Answers

Click to see answers

Exercise 1:

library(stringr)

# 1. Email pattern
email_pattern <- "[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Za-z]{2,}"

test_emails <- c(
  "valid@email.com",
  "also.valid@test.co.uk",
  "invalid",
  "missing@domain"
)
str_subset(test_emails, email_pattern)
#> [1] "valid@email.com"       "also.valid@test.co.uk"

# 2. Phone pattern (XXX-XXX-XXXX)
phone_pattern <- "\\d{3}-\\d{3}-\\d{4}"

test_phones <- c(
  "123-456-7890",
  "555-1234",
  "not a phone",
  "123-456-789"
)
str_subset(test_phones, phone_pattern)
#> [1] "123-456-7890"

# 3. Date pattern (YYYY-MM-DD)
date_pattern <- "\\d{4}-\\d{2}-\\d{2}"

test_dates <- c(
  "2024-01-15",
  "2024-1-5",
  "not a date",
  "2024/01/15"
)
str_subset(test_dates, date_pattern)
#> [1] "2024-01-15"

# 4. URL pattern
url_pattern <- "https?://[A-Za-z0-9.-]+(/[A-Za-z0-9._~:/?#\\[\\]@!$&'()*+,;=-]*)?"

test_urls <- c(
  "http://example.com",
  "https://test.org/path",
  "not a url",
  "ftp://wrong.com"
)
str_subset(test_urls, url_pattern)
#> [1] "http://example.com"    "https://test.org/path"

Exercise 2:

text <- "Price: $19.99, Quantity: 5 units"

# 1. Extract price
price <- str_extract(text, "\\d+\\.\\d+")
as.numeric(price)
#> [1] 19.99

# 2. Extract quantity
quantity <- str_extract(text, "Quantity: (\\d+)")
quantity <- str_extract(quantity, "\\d+")
as.numeric(quantity)
#> [1] 5

# 3. Both in named vector
# Pull the price and the quantity out of `text` and return them together as
# a named numeric vector c(price = ..., quantity = ...).
extract_both <- function(text) {
  # Lookbehinds anchor each number to its label without including the label
  # in the extracted match.
  price_chr <- str_extract(text, "(?<=\\$)\\d+\\.\\d+")
  quantity_chr <- str_extract(text, "(?<=Quantity: )\\d+")

  c(price = as.numeric(price_chr), quantity = as.numeric(quantity_chr))
}

extract_both(text)
#>    price quantity 
#>    19.99     5.00

# Alternative: using str_match with groups
pattern <- "Price: \\$(\\d+\\.\\d+), Quantity: (\\d+)"
matches <- str_match(text, pattern)
c(
  price = as.numeric(matches[, 2]),
  quantity = as.numeric(matches[, 3])
)
#>    price quantity 
#>    19.99     5.00

Exercise 3:

# Search `x` for a regular expression with up-front validation.
#
# Arguments:
#   pattern      Regular expression to search for.
#   x            Vector to search; non-character input is converted with
#                as.character().
#   value        If TRUE return the matching elements, otherwise their indices.
#   ignore.case  Passed through to grep().
#
# Returns the grep() result: integer indices, or the matching strings when
# value = TRUE. An empty `x` yields integer(0) / character(0).
# Stops with a checklist of common mistakes when the pattern does not compile.
# Status is reported through message().
safe_grep <- function(pattern, x, value = FALSE, ignore.case = FALSE) {
  # Validate the pattern first, so a broken pattern is reported even when
  # there is nothing to search.
  pattern_valid <- tryCatch({
    grep(pattern, "test")
    TRUE
  }, error = function(e) {
    FALSE
  })

  if (!pattern_valid) {
    stop("Invalid regular expression pattern: '", pattern, "'\n",
         "Check for:\n",
         "  - Unmatched brackets: [ ] ( )\n",
         "  - Invalid escapes\n",
         "  - Unescaped special characters: . * + ? ^ $")
  }

  # Nothing to search: return an empty result of the type the caller asked for
  if (length(x) == 0) {
    message("Input vector is empty")
    return(if (value) character(0) else integer(0))
  }

  if (!is.character(x)) {
    message("Converting input to character")
    x <- as.character(x)
  }

  # Perform grep
  result <- grep(pattern, x, value = value, ignore.case = ignore.case)

  # length(result) is the match count for both indices and values
  message("Found ", length(result), " match(es) out of ", length(x), " elements")

  result
}

# Test
safe_grep("ap", c("apple", "banana", "apricot"))
#> Found 2 match(es) out of 3 elements
#> [1] 1 3
safe_grep("ap", c("apple", "banana", "apricot"), value = TRUE)
#> Found 2 match(es) out of 3 elements
#> [1] "apple"   "apricot"
safe_grep("[invalid", c("test"))  # Clear error
#> Warning in grep(pattern, "test"): TRE pattern compilation error 'Missing ']''
#> Error in safe_grep("[invalid", c("test")): Invalid regular expression pattern: '[invalid'
#> Check for:
#>   - Unmatched brackets: [ ] ( )
#>   - Invalid escapes
#>   - Unescaped special characters: . * + ? ^ $

Exercise 4:

# Turn arbitrary labels into lowercase, syntactically valid, unique R names.
#
# Arguments:
#   x  Vector of labels; non-character input is converted with as.character().
#
# Returns a character vector the same length as `x`. Each cleaned name is
# built from a-z, 0-9 and "_" only; make.names() may append a "." suffix to
# resolve reserved words or duplicates.
clean_identifiers <- function(x) {
  # Base R only: calling library() inside a function would attach a package
  # in the caller's session as a side effect.
  if (!is.character(x)) {
    x <- as.character(x)
  }

  # Trim the ends, lowercase, then collapse each whitespace run to one "_"
  x <- tolower(trimws(x))
  x <- gsub("\\s+", "_", x)

  # Remove non-alphanumeric except underscore
  x <- gsub("[^a-z0-9_]", "", x)

  # A name may not start with a digit or "_". Prefix a lowercase "x" here so
  # make.names() does not prepend its uppercase "X" later.
  x <- sub("^([0-9_])", "x\\1", x)

  # Ensure not empty
  x[x == ""] <- "var"

  # Final guard: repairs anything still invalid (e.g. reserved words) and
  # de-duplicates
  make.names(x, unique = TRUE)
}

# Test
messy <- c("My Variable!", "123 Start", "test@#$", "  spaces  ", "")
clean_identifiers(messy)
#> [1] "my_variable" "x123_start"  "test"        "spaces"      "var"