Chapter 24 Advanced dplyr Patterns
What You’ll Learn:
- Programming with dplyr
- Tidy evaluation
- Dynamic column selection
- Row-wise operations
- Complex aggregations
Key Errors Covered: 12+ advanced dplyr errors
Difficulty: ⭐⭐⭐ Advanced
24.1 Introduction
Advanced dplyr enables programmatic data manipulation:
library(dplyr)
library(rlang)
# Function with column name as string
summarize_col <- function(data, col_name) {
data %>%
summarize(
mean = mean(.data[[col_name]], na.rm = TRUE),
sd = sd(.data[[col_name]], na.rm = TRUE)
)
}
summarize_col(mtcars, "mpg")
#> mean sd
#> 1 20.09062 6.026948Let’s master advanced patterns!
24.2 Tidy Evaluation Basics
💡 Key Insight: .data and .env Pronouns
# .data pronoun for data frame columns
mtcars %>%
summarize(avg = mean(.data$mpg))
#> avg
#> 1 20.09062
# .env pronoun for environment variables
threshold <- 20
mtcars %>%
filter(.data$mpg > .env$threshold)
#> mpg cyl disp hp drat wt qsec vs am gear carb cyl_factor
#> Mazda RX4 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4 6
#> Mazda RX4 Wag 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4 6
#> Datsun 710 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1 4
#> Hornet 4 Drive 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1 6
#> Merc 240D 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2 4
#> Merc 230 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2 4
#> Fiat 128 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1 4
#> Honda Civic 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2 4
#> [ reached 'max' / getOption("max.print") -- omitted 6 rows ]
# In functions
filter_by_value <- function(data, col, value) {
data %>%
filter(.data[[col]] > value)
}
filter_by_value(mtcars, "mpg", 20)
#> mpg cyl disp hp drat wt qsec vs am gear carb cyl_factor
#> Mazda RX4 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4 6
#> Mazda RX4 Wag 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4 6
#> Datsun 710 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1 4
#> Hornet 4 Drive 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1 6
#> Merc 240D 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2 4
#> Merc 230 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2 4
#> Fiat 128 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1 4
#> Honda Civic 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2 4
#> [ reached 'max' / getOption("max.print") -- omitted 6 rows ]
# Dynamic column creation
create_column <- function(data, new_col, value) {
data %>%
mutate("{new_col}" := value)
}
create_column(mtcars, "test", 42) %>%
select(mpg, test)
#> Error in select(., mpg, test): unused arguments (mpg, test)24.3 Error #1: Cannot use object of type ‘symbol’
⭐⭐⭐ ADVANCED 🔮 TIDY-EVAL
24.3.1 The Error
my_function <- function(data, col) {
data %>%
summarize(mean = mean(col))
}
my_function(mtcars, mpg)
#> Warning: There was 1 warning in `summarize()`.
#> ℹ In argument: `mean = mean(col)`.
#> Caused by warning in `mean.default()`:
#> ! argument is not numeric or logical: returning NA
#> mean
#> 1 NA🔴 ERROR
Error in mean(col) : object 'mpg' not found
24.3.3 Solutions
✅ SOLUTION 1: Use {{ }} (Embrace)
my_function <- function(data, col) {
data %>%
summarize(mean = mean({{ col }}))
}
my_function(mtcars, mpg)
#> mean
#> 1 20.09062
# Works with multiple operations
my_filter_summarize <- function(data, filter_col, summary_col) {
data %>%
filter({{ filter_col }} > 4) %>%
summarize(
mean = mean({{ summary_col }}),
n = n()
)
}
my_filter_summarize(mtcars, cyl, mpg)
#> mean n
#> 1 16.64762 21✅ SOLUTION 2: Use .data[[]] for Strings
my_function2 <- function(data, col_name) {
data %>%
summarize(mean = mean(.data[[col_name]]))
}
my_function2(mtcars, "mpg")
#> mean
#> 1 20.09062
# Dynamic column names
summarize_multiple <- function(data, cols) {
data %>%
summarize(
across(all_of(cols), mean, .names = "mean_{.col}")
)
}
summarize_multiple(mtcars, c("mpg", "hp", "wt"))
#> mean_mpg mean_hp mean_wt
#> 1 20.09062 146.6875 3.2172524.4 rowwise() Operations
💡 Key Insight: Row-wise Operations
# Calculate row means
data <- tibble(
id = 1:3,
x = c(1, 2, 3),
y = c(4, 5, 6),
z = c(7, 8, 9)
)
# Wrong: column mean
data %>%
mutate(mean = mean(c(x, y, z)))
#> # A tibble: 3 × 5
#> id x y z mean
#> <int> <dbl> <dbl> <dbl> <dbl>
#> 1 1 1 4 7 5
#> 2 2 2 5 8 5
#> 3 3 3 6 9 5
# Right: use rowwise()
data %>%
rowwise() %>%
mutate(mean = mean(c(x, y, z))) %>%
ungroup()
#> # A tibble: 3 × 5
#> id x y z mean
#> <int> <dbl> <dbl> <dbl> <dbl>
#> 1 1 1 4 7 4
#> 2 2 2 5 8 5
#> 3 3 3 6 9 6
# Or use c_across()
data %>%
rowwise() %>%
mutate(mean = mean(c_across(x:z))) %>%
ungroup()
#> # A tibble: 3 × 5
#> id x y z mean
#> <int> <dbl> <dbl> <dbl> <dbl>
#> 1 1 1 4 7 4
#> 2 2 2 5 8 5
#> 3 3 3 6 9 6
# Complex row operations
data %>%
rowwise() %>%
mutate(
total = sum(c_across(x:z)),
min_val = min(c_across(x:z)),
max_val = max(c_across(x:z))
) %>%
ungroup()
#> # A tibble: 3 × 7
#> id x y z total min_val max_val
#> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 1 4 7 12 1 7
#> 2 2 2 5 8 15 2 8
#> 3 3 3 6 9 18 3 924.5 across() Advanced Usage
🎯 Best Practice: across() Patterns
# Multiple functions
mtcars %>%
summarize(
across(c(mpg, hp, wt),
list(mean = mean, sd = sd, min = min, max = max),
.names = "{.col}_{.fn}")
)
#> mpg_mean mpg_sd mpg_min mpg_max hp_mean hp_sd hp_min hp_max wt_mean
#> 1 20.09062 6.026948 10.4 33.9 146.6875 68.56287 52 335 3.21725
#> wt_sd wt_min wt_max
#> 1 0.9784574 1.513 5.424
# With where()
mtcars %>%
summarize(
across(where(is.numeric) & !c(vs, am),
mean,
.names = "avg_{.col}")
)
#> avg_mpg avg_cyl avg_disp avg_hp avg_drat avg_wt avg_qsec avg_gear
#> 1 20.09062 6.1875 230.7219 146.6875 3.596563 3.21725 17.84875 3.6875
#> avg_carb
#> 1 2.8125
# In mutate with formula
mtcars %>%
mutate(
across(c(mpg, hp),
~ . / mean(.),
.names = "{.col}_scaled")
) %>%
select(mpg, mpg_scaled, hp, hp_scaled)
#> Error in select(., mpg, mpg_scaled, hp, hp_scaled): unused arguments (mpg, mpg_scaled, hp, hp_scaled)
# Conditional transformation
mtcars %>%
mutate(
across(where(is.numeric),
~ if_else(. < median(.), "Low", "High"),
.names = "{.col}_cat")
) %>%
select(mpg, mpg_cat, hp, hp_cat)
#> Error in select(., mpg, mpg_cat, hp, hp_cat): unused arguments (mpg, mpg_cat, hp, hp_cat)24.6 if_any() and if_all()
💡 Key Insight: Filter with Multiple Conditions
# if_any: TRUE if ANY condition met
mtcars %>%
filter(if_any(c(mpg, hp), ~ . > 100)) %>%
select(mpg, hp) %>%
head()
#> Error in select(., mpg, hp): unused arguments (mpg, hp)
# if_all: TRUE if ALL conditions met
mtcars %>%
filter(if_all(c(mpg, hp), ~ . > 100)) %>%
select(mpg, hp)
#> Error in select(., mpg, hp): unused arguments (mpg, hp)
# With where()
mtcars %>%
filter(if_any(where(is.numeric), ~ . > 200)) %>%
select(where(function(x) any(x > 200)))
#> Error in select(., where(function(x) any(x > 200))): unused argument (where(function(x) any(x > 200)))24.7 Summary
Key Takeaways:
- Use {{ }} - For passing column names to functions
- Use .data[[]] - For string column names
- rowwise() - For row-by-row operations
- across() - Apply functions to multiple columns
- if_any() / if_all() - Filter with multiple conditions
- Always ungroup() - After rowwise()
Quick Reference:
# Tidy evaluation
function(data, col) {
data %>% mutate(new = {{ col }} * 2)
}
# String column names
function(data, col_name) {
data %>% mutate(new = .data[[col_name]] * 2)
}
# Rowwise
data %>%
rowwise() %>%
mutate(mean = mean(c_across(x:z))) %>%
ungroup()
# across
data %>%
summarize(across(where(is.numeric), mean))
# if_any/if_all
data %>%
filter(if_any(c(x, y), ~ . > 10))24.8 Exercises
📝 Exercise: Create Flexible Function
Write summarize_by_group(data, group_col, summary_cols) using tidy evaluation that groups by one column and summarizes multiple columns.
24.9 Exercise Answer
Click to see answer
summarize_by_group <- function(data, group_col, summary_cols) {
data %>%
group_by({{ group_col }}) %>%
summarize(
across({{ summary_cols }},
list(mean = mean, sd = sd, n = ~n()),
.names = "{.col}_{.fn}"),
.groups = "drop"
)
}
summarize_by_group(mtcars, cyl, c(mpg, hp, wt))
#> # A tibble: 3 × 10
#> cyl mpg_mean mpg_sd mpg_n hp_mean hp_sd hp_n wt_mean wt_sd wt_n
#> <dbl> <dbl> <dbl> <int> <dbl> <dbl> <int> <dbl> <dbl> <int>
#> 1 4 26.7 4.51 11 82.6 20.9 11 2.29 0.570 11
#> 2 6 19.7 1.45 7 122. 24.3 7 3.12 0.356 7
#> 3 8 15.1 2.56 14 209. 51.0 14 4.00 0.759 14