Chapter 24 Advanced dplyr Patterns

What You’ll Learn:

  • Programming with dplyr
  • Tidy evaluation
  • Dynamic column selection
  • Row-wise operations
  • Complex aggregations

Key Errors Covered: 12+ advanced dplyr errors

Difficulty: ⭐⭐⭐ Advanced

24.1 Introduction

Advanced dplyr enables programmatic data manipulation:

library(dplyr)
library(rlang)

# Function with column name as string
summarize_col <- function(data, col_name) {
  data %>%
    summarize(
      mean = mean(.data[[col_name]], na.rm = TRUE),
      sd = sd(.data[[col_name]], na.rm = TRUE)
    )
}

summarize_col(mtcars, "mpg")
#>       mean       sd
#> 1 20.09062 6.026948

Let’s master advanced patterns!

24.2 Tidy Evaluation Basics

💡 Key Insight: .data and .env Pronouns

# .data pronoun for data frame columns
mtcars %>%
  summarize(avg = mean(.data$mpg))
#>        avg
#> 1 20.09062

# .env pronoun for environment variables
threshold <- 20
mtcars %>%
  filter(.data$mpg > .env$threshold)
#>                 mpg cyl  disp  hp drat    wt  qsec vs am gear carb cyl_factor
#> Mazda RX4      21.0   6 160.0 110 3.90 2.620 16.46  0  1    4    4          6
#> Mazda RX4 Wag  21.0   6 160.0 110 3.90 2.875 17.02  0  1    4    4          6
#> Datsun 710     22.8   4 108.0  93 3.85 2.320 18.61  1  1    4    1          4
#> Hornet 4 Drive 21.4   6 258.0 110 3.08 3.215 19.44  1  0    3    1          6
#> Merc 240D      24.4   4 146.7  62 3.69 3.190 20.00  1  0    4    2          4
#> Merc 230       22.8   4 140.8  95 3.92 3.150 22.90  1  0    4    2          4
#> Fiat 128       32.4   4  78.7  66 4.08 2.200 19.47  1  1    4    1          4
#> Honda Civic    30.4   4  75.7  52 4.93 1.615 18.52  1  1    4    2          4
#>  [ reached 'max' / getOption("max.print") -- omitted 6 rows ]

# In functions
filter_by_value <- function(data, col, value) {
  data %>%
    filter(.data[[col]] > value)
}

filter_by_value(mtcars, "mpg", 20)
#>                 mpg cyl  disp  hp drat    wt  qsec vs am gear carb cyl_factor
#> Mazda RX4      21.0   6 160.0 110 3.90 2.620 16.46  0  1    4    4          6
#> Mazda RX4 Wag  21.0   6 160.0 110 3.90 2.875 17.02  0  1    4    4          6
#> Datsun 710     22.8   4 108.0  93 3.85 2.320 18.61  1  1    4    1          4
#> Hornet 4 Drive 21.4   6 258.0 110 3.08 3.215 19.44  1  0    3    1          6
#> Merc 240D      24.4   4 146.7  62 3.69 3.190 20.00  1  0    4    2          4
#> Merc 230       22.8   4 140.8  95 3.92 3.150 22.90  1  0    4    2          4
#> Fiat 128       32.4   4  78.7  66 4.08 2.200 19.47  1  1    4    1          4
#> Honda Civic    30.4   4  75.7  52 4.93 1.615 18.52  1  1    4    2          4
#>  [ reached 'max' / getOption("max.print") -- omitted 6 rows ]

# Dynamic column creation
create_column <- function(data, new_col, value) {
  data %>%
    mutate("{new_col}" := value)
}

create_column(mtcars, "test", 42) %>%
  select(mpg, test)
#> Error in select(., mpg, test): unused arguments (mpg, test)

24.3 Error #1: Cannot use object of type ‘symbol’

⭐⭐⭐ ADVANCED 🔮 TIDY-EVAL

24.3.1 The Error

my_function <- function(data, col) {
  data %>%
    summarize(mean = mean(col))
}

my_function(mtcars, mpg)
#> Warning: There was 1 warning in `summarize()`.
#> ℹ In argument: `mean = mean(col)`.
#> Caused by warning in `mean.default()`:
#> ! argument is not numeric or logical: returning NA
#>   mean
#> 1   NA

🔴 ERROR

Error in mean(col) : object 'mpg' not found

24.3.2 What It Means

Need to use tidy evaluation to pass column names.

24.3.3 Solutions

SOLUTION 1: Use {{ }} (Embrace)

my_function <- function(data, col) {
  data %>%
    summarize(mean = mean({{ col }}))
}

my_function(mtcars, mpg)
#>       mean
#> 1 20.09062

# Works with multiple operations
my_filter_summarize <- function(data, filter_col, summary_col) {
  data %>%
    filter({{ filter_col }} > 4) %>%
    summarize(
      mean = mean({{ summary_col }}),
      n = n()
    )
}

my_filter_summarize(mtcars, cyl, mpg)
#>       mean  n
#> 1 16.64762 21

SOLUTION 2: Use .data[[]] for Strings

my_function2 <- function(data, col_name) {
  data %>%
    summarize(mean = mean(.data[[col_name]]))
}

my_function2(mtcars, "mpg")
#>       mean
#> 1 20.09062

# Dynamic column names
summarize_multiple <- function(data, cols) {
  data %>%
    summarize(
      across(all_of(cols), mean, .names = "mean_{.col}")
    )
}

summarize_multiple(mtcars, c("mpg", "hp", "wt"))
#>   mean_mpg  mean_hp mean_wt
#> 1 20.09062 146.6875 3.21725

24.4 rowwise() Operations

💡 Key Insight: Row-wise Operations

# Calculate row means
data <- tibble(
  id = 1:3,
  x = c(1, 2, 3),
  y = c(4, 5, 6),
  z = c(7, 8, 9)
)

# Wrong: column mean
data %>%
  mutate(mean = mean(c(x, y, z)))
#> # A tibble: 3 × 5
#>      id     x     y     z  mean
#>   <int> <dbl> <dbl> <dbl> <dbl>
#> 1     1     1     4     7     5
#> 2     2     2     5     8     5
#> 3     3     3     6     9     5

# Right: use rowwise()
data %>%
  rowwise() %>%
  mutate(mean = mean(c(x, y, z))) %>%
  ungroup()
#> # A tibble: 3 × 5
#>      id     x     y     z  mean
#>   <int> <dbl> <dbl> <dbl> <dbl>
#> 1     1     1     4     7     4
#> 2     2     2     5     8     5
#> 3     3     3     6     9     6

# Or use c_across()
data %>%
  rowwise() %>%
  mutate(mean = mean(c_across(x:z))) %>%
  ungroup()
#> # A tibble: 3 × 5
#>      id     x     y     z  mean
#>   <int> <dbl> <dbl> <dbl> <dbl>
#> 1     1     1     4     7     4
#> 2     2     2     5     8     5
#> 3     3     3     6     9     6

# Complex row operations
data %>%
  rowwise() %>%
  mutate(
    total = sum(c_across(x:z)),
    min_val = min(c_across(x:z)),
    max_val = max(c_across(x:z))
  ) %>%
  ungroup()
#> # A tibble: 3 × 7
#>      id     x     y     z total min_val max_val
#>   <int> <dbl> <dbl> <dbl> <dbl>   <dbl>   <dbl>
#> 1     1     1     4     7    12       1       7
#> 2     2     2     5     8    15       2       8
#> 3     3     3     6     9    18       3       9

24.5 across() Advanced Usage

🎯 Best Practice: across() Patterns

# Multiple functions
mtcars %>%
  summarize(
    across(c(mpg, hp, wt),
           list(mean = mean, sd = sd, min = min, max = max),
           .names = "{.col}_{.fn}")
  )
#>   mpg_mean   mpg_sd mpg_min mpg_max  hp_mean    hp_sd hp_min hp_max wt_mean
#> 1 20.09062 6.026948    10.4    33.9 146.6875 68.56287     52    335 3.21725
#>       wt_sd wt_min wt_max
#> 1 0.9784574  1.513  5.424

# With where()
mtcars %>%
  summarize(
    across(where(is.numeric) & !c(vs, am), 
           mean,
           .names = "avg_{.col}")
  )
#>    avg_mpg avg_cyl avg_disp   avg_hp avg_drat  avg_wt avg_qsec avg_gear
#> 1 20.09062  6.1875 230.7219 146.6875 3.596563 3.21725 17.84875   3.6875
#>   avg_carb
#> 1   2.8125

# In mutate with formula
mtcars %>%
  mutate(
    across(c(mpg, hp), 
           ~ . / mean(.),
           .names = "{.col}_scaled")
  ) %>%
  select(mpg, mpg_scaled, hp, hp_scaled)
#> Error in select(., mpg, mpg_scaled, hp, hp_scaled): unused arguments (mpg, mpg_scaled, hp, hp_scaled)

# Conditional transformation
mtcars %>%
  mutate(
    across(where(is.numeric), 
           ~ if_else(. < median(.), "Low", "High"),
           .names = "{.col}_cat")
  ) %>%
  select(mpg, mpg_cat, hp, hp_cat)
#> Error in select(., mpg, mpg_cat, hp, hp_cat): unused arguments (mpg, mpg_cat, hp, hp_cat)

24.6 if_any() and if_all()

💡 Key Insight: Filter with Multiple Conditions

# if_any: TRUE if ANY condition met
mtcars %>%
  filter(if_any(c(mpg, hp), ~ . > 100)) %>%
  select(mpg, hp) %>%
  head()
#> Error in select(., mpg, hp): unused arguments (mpg, hp)

# if_all: TRUE if ALL conditions met
mtcars %>%
  filter(if_all(c(mpg, hp), ~ . > 100)) %>%
  select(mpg, hp)
#> Error in select(., mpg, hp): unused arguments (mpg, hp)

# With where()
mtcars %>%
  filter(if_any(where(is.numeric), ~ . > 200)) %>%
  select(where(function(x) any(x > 200)))
#> Error in select(., where(function(x) any(x > 200))): unused argument (where(function(x) any(x > 200)))

24.7 Summary

Key Takeaways:

  1. Use {{ }} - For passing column names to functions
  2. Use .data[[]] - For string column names
  3. rowwise() - For row-by-row operations
  4. across() - Apply functions to multiple columns
  5. if_any() / if_all() - Filter with multiple conditions
  6. Always ungroup() - After rowwise()

Quick Reference:

# Tidy evaluation
function(data, col) {
  data %>% mutate(new = {{ col }} * 2)
}

# String column names
function(data, col_name) {
  data %>% mutate(new = .data[[col_name]] * 2)
}

# Rowwise
data %>%
  rowwise() %>%
  mutate(mean = mean(c_across(x:z))) %>%
  ungroup()

# across
data %>%
  summarize(across(where(is.numeric), mean))

# if_any/if_all
data %>%
  filter(if_any(c(x, y), ~ . > 10))

24.8 Exercises

📝 Exercise: Create Flexible Function

Write summarize_by_group(data, group_col, summary_cols) using tidy evaluation that groups by one column and summarizes multiple columns.

24.9 Exercise Answer

Click to see answer
summarize_by_group <- function(data, group_col, summary_cols) {
  data %>%
    group_by({{ group_col }}) %>%
    summarize(
      across({{ summary_cols }},
             list(mean = mean, sd = sd, n = ~n()),
             .names = "{.col}_{.fn}"),
      .groups = "drop"
    )
}

summarize_by_group(mtcars, cyl, c(mpg, hp, wt))
#> # A tibble: 3 × 10
#>     cyl mpg_mean mpg_sd mpg_n hp_mean hp_sd  hp_n wt_mean wt_sd  wt_n
#>   <dbl>    <dbl>  <dbl> <int>   <dbl> <dbl> <int>   <dbl> <dbl> <int>
#> 1     4     26.7   4.51    11    82.6  20.9    11    2.29 0.570    11
#> 2     6     19.7   1.45     7   122.   24.3     7    3.12 0.356     7
#> 3     8     15.1   2.56    14   209.   51.0    14    4.00 0.759    14