Chapter 21 Grouping & Summarizing
What You’ll Learn:
- group_by() mechanics
- Grouped operations
- summarize() with groups
- Common grouping errors
- Multiple grouping variables
- Ungrouping data
Key Errors Covered: 18+ grouping errors
Difficulty: ⭐⭐ Intermediate to ⭐⭐⭐ Advanced
21.1 Introduction
Grouping is one of dplyr’s most powerful features:
library(dplyr)
# Without grouping
mtcars %>%
summarize(avg_mpg = mean(mpg))
#> avg_mpg
#> 1 20.09062
# With grouping
mtcars %>%
group_by(cyl) %>%
summarize(avg_mpg = mean(mpg))
#> # A tibble: 3 × 2
#> cyl avg_mpg
#> <dbl> <dbl>
#> 1 4 26.7
#> 2 6 19.7
#> 3 8 15.1But grouping has surprising behaviors. Let’s master it!
21.2 group_by() Basics
💡 Key Insight: group_by() Creates Invisible Groups
# Create grouped data
grouped_data <- mtcars %>%
group_by(cyl)
# Looks the same when printed
head(grouped_data, 3)
#> # A tibble: 3 × 12
#> # Groups: cyl [2]
#> mpg cyl disp hp drat wt qsec vs am gear carb cyl_factor
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <fct>
#> 1 21 6 160 110 3.9 2.62 16.5 0 1 4 4 6
#> 2 21 6 160 110 3.9 2.88 17.0 0 1 4 4 6
#> 3 22.8 4 108 93 3.85 2.32 18.6 1 1 4 1 4
# But it's grouped!
class(grouped_data)
#> [1] "grouped_df" "tbl_df" "tbl" "data.frame"
groups(grouped_data)
#> [[1]]
#> cyl
# Check if grouped
is_grouped_df(grouped_data)
#> [1] TRUE
# See grouping variables
group_vars(grouped_data)
#> [1] "cyl"
# Count groups
n_groups(grouped_data)
#> [1] 3
# Subsequent operations work within groups
grouped_data %>%
summarize(avg_mpg = mean(mpg))
#> # A tibble: 3 × 2
#> cyl avg_mpg
#> <dbl> <dbl>
#> 1 4 26.7
#> 2 6 19.7
#> 3 8 15.1Key insight: Grouping doesn’t change data, just how operations apply!
21.3 Error #1: Forgetting Data is Grouped
⭐⭐ INTERMEDIATE 🧠 LOGIC
21.3.1 The Problem
# Group the data
cars_grouped <- mtcars %>%
group_by(cyl)
# Later in code... expecting ungrouped behavior
cars_grouped %>%
mutate(rank = row_number()) # Ranks within groups!
#> # A tibble: 32 × 13
#> # Groups: cyl [3]
#> mpg cyl disp hp drat wt qsec vs am gear carb cyl_factor
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <fct>
#> 1 21 6 160 110 3.9 2.62 16.5 0 1 4 4 6
#> 2 21 6 160 110 3.9 2.88 17.0 0 1 4 4 6
#> 3 22.8 4 108 93 3.85 2.32 18.6 1 1 4 1 4
#> 4 21.4 6 258 110 3.08 3.22 19.4 1 0 3 1 6
#> 5 18.7 8 360 175 3.15 3.44 17.0 0 0 3 2 8
#> 6 18.1 6 225 105 2.76 3.46 20.2 1 0 3 1 6
#> 7 14.3 8 360 245 3.21 3.57 15.8 0 0 3 4 8
#> 8 24.4 4 147. 62 3.69 3.19 20 1 0 4 2 4
#> 9 22.8 4 141. 95 3.92 3.15 22.9 1 0 4 2 4
#> 10 19.2 6 168. 123 3.92 3.44 18.3 1 0 4 4 6
#> # ℹ 22 more rows
#> # ℹ 1 more variable: rank <int>21.3.3 Common Causes
# Expecting overall statistics
mtcars %>%
group_by(cyl) %>%
mutate(
overall_mean = mean(mpg) # Actually group means!
) %>%
select(cyl, mpg, overall_mean)
#> Error in select(., cyl, mpg, overall_mean): unused arguments (cyl, mpg, overall_mean)
# Expecting all rows
mtcars %>%
group_by(cyl) %>%
slice(1) # Gets first row of EACH group, not overall first
#> # A tibble: 3 × 12
#> # Groups: cyl [3]
#> mpg cyl disp hp drat wt qsec vs am gear carb cyl_factor
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <fct>
#> 1 22.8 4 108 93 3.85 2.32 18.6 1 1 4 1 4
#> 2 21 6 160 110 3.9 2.62 16.5 0 1 4 4 6
#> 3 18.7 8 360 175 3.15 3.44 17.0 0 0 3 2 8
# Filtering
mtcars %>%
group_by(cyl) %>%
filter(mpg == max(mpg)) # Max within each group
#> # A tibble: 3 × 12
#> # Groups: cyl [3]
#> mpg cyl disp hp drat wt qsec vs am gear carb cyl_factor
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <fct>
#> 1 21.4 6 258 110 3.08 3.22 19.4 1 0 3 1 6
#> 2 33.9 4 71.1 65 4.22 1.84 19.9 1 1 4 1 4
#> 3 19.2 8 400 175 3.08 3.84 17.0 0 0 3 2 821.3.4 Solutions
✅ SOLUTION 1: Ungroup When Done
# Ungroup explicitly
result <- mtcars %>%
group_by(cyl) %>%
summarize(avg_mpg = mean(mpg)) %>%
ungroup() # Remove grouping
# Now operations are ungrouped
result %>%
mutate(rank = row_number()) # Ranks across all rows
#> # A tibble: 3 × 3
#> cyl avg_mpg rank
#> <dbl> <dbl> <int>
#> 1 4 26.7 1
#> 2 6 19.7 2
#> 3 8 15.1 3✅ SOLUTION 2: Check Grouping Status
# Always check if data is grouped
check_grouping <- function(data) {
if (is_grouped_df(data)) {
message("Data is grouped by: ", paste(group_vars(data), collapse = ", "))
} else {
message("Data is not grouped")
}
invisible(data)
}
mtcars %>%
group_by(cyl) %>%
check_grouping() %>%
summarize(avg_mpg = mean(mpg)) %>%
check_grouping()
#> Data is grouped by: cyl
#> Data is not grouped✅ SOLUTION 3: Use .groups Argument
# Control grouping in summarize
mtcars %>%
group_by(cyl, gear) %>%
summarize(
avg_mpg = mean(mpg),
.groups = "drop" # Ungroup completely
)
#> # A tibble: 8 × 3
#> cyl gear avg_mpg
#> <dbl> <dbl> <dbl>
#> 1 4 3 21.5
#> 2 4 4 26.9
#> 3 4 5 28.2
#> 4 6 3 19.8
#> 5 6 4 19.8
#> 6 6 5 19.7
#> 7 8 3 15.0
#> 8 8 5 15.4
# Keep some grouping
mtcars %>%
group_by(cyl, gear) %>%
summarize(
avg_mpg = mean(mpg),
.groups = "drop_last" # Keep cyl grouping
) %>%
group_vars()
#> [1] "cyl"
# Keep all groups
mtcars %>%
group_by(cyl, gear) %>%
summarize(
avg_mpg = mean(mpg),
.groups = "keep"
) %>%
group_vars()
#> [1] "cyl" "gear"21.4 summarize() with Groups
💡 Key Insight: summarize() Reduces Groups
# Single grouping variable
mtcars %>%
group_by(cyl) %>%
summarize(
n = n(),
avg_mpg = mean(mpg),
sd_mpg = sd(mpg)
)
#> # A tibble: 3 × 4
#> cyl n avg_mpg sd_mpg
#> <dbl> <int> <dbl> <dbl>
#> 1 4 11 26.7 4.51
#> 2 6 7 19.7 1.45
#> 3 8 14 15.1 2.56
# Multiple grouping variables
mtcars %>%
group_by(cyl, gear) %>%
summarize(
n = n(),
avg_mpg = mean(mpg),
.groups = "drop"
)
#> # A tibble: 8 × 4
#> cyl gear n avg_mpg
#> <dbl> <dbl> <int> <dbl>
#> 1 4 3 1 21.5
#> 2 4 4 8 26.9
#> 3 4 5 2 28.2
#> 4 6 3 2 19.8
#> 5 6 4 4 19.8
#> 6 6 5 1 19.7
#> 7 8 3 12 15.0
#> 8 8 5 2 15.4
# Useful summary functions
mtcars %>%
group_by(cyl) %>%
summarize(
count = n(), # Number of observations
n_distinct = n_distinct(gear), # Unique values
mean = mean(mpg),
median = median(mpg),
sd = sd(mpg),
min = min(mpg),
max = max(mpg),
first = first(mpg), # First value
last = last(mpg), # Last value
nth = nth(mpg, 2) # Nth value
)
#> # A tibble: 3 × 11
#> cyl count n_distinct mean median sd min max first last nth
#> <dbl> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 4 11 3 26.7 26 4.51 21.4 33.9 22.8 21.4 24.4
#> 2 6 7 3 19.7 19.7 1.45 17.8 21.4 21 19.7 21
#> 3 8 14 2 15.1 15.2 2.56 10.4 19.2 18.7 15 14.3
# Multiple values from same column
mtcars %>%
group_by(cyl) %>%
summarize(
mpg_stats = list(summary(mpg)) # List column
)
#> # A tibble: 3 × 2
#> cyl mpg_stats
#> <dbl> <list>
#> 1 4 <smmryDfl [6]>
#> 2 6 <smmryDfl [6]>
#> 3 8 <smmryDfl [6]>21.5 Error #2: must be length 1, not ...
⭐⭐ INTERMEDIATE 🔢 TYPE
21.5.1 The Error
mtcars %>%
group_by(cyl) %>%
summarize(
top_mpg = head(mpg, 3) # Returns multiple values!
)
#> Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
#> dplyr 1.1.0.
#> ℹ Please use `reframe()` instead.
#> ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()`
#> always returns an ungrouped data frame and adjust accordingly.
#> Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
#> generated.
#> `summarise()` has grouped output by 'cyl'. You can override using the `.groups`
#> argument.
#> # A tibble: 9 × 2
#> # Groups: cyl [3]
#> cyl top_mpg
#> <dbl> <dbl>
#> 1 4 22.8
#> 2 4 24.4
#> 3 4 22.8
#> 4 6 21
#> 5 6 21
#> 6 6 21.4
#> 7 8 18.7
#> 8 8 14.3
#> 9 8 16.4🔴 ERROR
Error in summarize():
! `top_mpg` must be size 1, not 3.
21.5.2 What It Means
summarize() expects functions that return a single value per group, not vectors.
21.5.3 Common Causes
# Functions returning multiple values
mtcars %>%
group_by(cyl) %>%
summarize(all_mpg = mpg) # All values, not one
#> Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
#> dplyr 1.1.0.
#> ℹ Please use `reframe()` instead.
#> ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()`
#> always returns an ungrouped data frame and adjust accordingly.
#> Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
#> generated.
#> `summarise()` has grouped output by 'cyl'. You can override using the `.groups`
#> argument.
#> # A tibble: 32 × 2
#> # Groups: cyl [3]
#> cyl all_mpg
#> <dbl> <dbl>
#> 1 4 22.8
#> 2 4 24.4
#> 3 4 22.8
#> 4 4 32.4
#> 5 4 30.4
#> 6 4 33.9
#> 7 4 21.5
#> 8 4 27.3
#> 9 4 26
#> 10 4 30.4
#> # ℹ 22 more rows
mtcars %>%
group_by(cyl) %>%
summarize(quantiles = quantile(mpg)) # Returns 5 values
#> Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
#> dplyr 1.1.0.
#> ℹ Please use `reframe()` instead.
#> ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()`
#> always returns an ungrouped data frame and adjust accordingly.
#> Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
#> generated.
#> `summarise()` has grouped output by 'cyl'. You can override using the `.groups`
#> argument.
#> # A tibble: 15 × 2
#> # Groups: cyl [3]
#> cyl quantiles
#> <dbl> <dbl>
#> 1 4 21.4
#> 2 4 22.8
#> 3 4 26
#> 4 4 30.4
#> 5 4 33.9
#> 6 6 17.8
#> 7 6 18.6
#> 8 6 19.7
#> 9 6 21
#> 10 6 21.4
#> 11 8 10.4
#> 12 8 14.4
#> 13 8 15.2
#> 14 8 16.2
#> 15 8 19.2
mtcars %>%
group_by(cyl) %>%
summarize(range = range(mpg)) # Returns 2 values
#> Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
#> dplyr 1.1.0.
#> ℹ Please use `reframe()` instead.
#> ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()`
#> always returns an ungrouped data frame and adjust accordingly.
#> Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
#> generated.
#> `summarise()` has grouped output by 'cyl'. You can override using the `.groups`
#> argument.
#> # A tibble: 6 × 2
#> # Groups: cyl [3]
#> cyl range
#> <dbl> <dbl>
#> 1 4 21.4
#> 2 4 33.9
#> 3 6 17.8
#> 4 6 21.4
#> 5 8 10.4
#> 6 8 19.221.5.4 Solutions
✅ SOLUTION 1: Use Single-Value Functions
# Functions that return one value
mtcars %>%
group_by(cyl) %>%
summarize(
mean_mpg = mean(mpg),
max_mpg = max(mpg),
min_mpg = min(mpg),
n_obs = n()
)
#> # A tibble: 3 × 5
#> cyl mean_mpg max_mpg min_mpg n_obs
#> <dbl> <dbl> <dbl> <dbl> <int>
#> 1 4 26.7 33.9 21.4 11
#> 2 6 19.7 21.4 17.8 7
#> 3 8 15.1 19.2 10.4 14
# For ranges, get separately
mtcars %>%
group_by(cyl) %>%
summarize(
min_mpg = min(mpg),
max_mpg = max(mpg)
)
#> # A tibble: 3 × 3
#> cyl min_mpg max_mpg
#> <dbl> <dbl> <dbl>
#> 1 4 21.4 33.9
#> 2 6 17.8 21.4
#> 3 8 10.4 19.2✅ SOLUTION 2: Use reframe() for Multiple Rows
# reframe() allows multiple rows per group (dplyr >= 1.1.0)
if (packageVersion("dplyr") >= "1.1.0") {
mtcars %>%
group_by(cyl) %>%
reframe(
top_3_mpg = head(sort(mpg, decreasing = TRUE), 3)
)
}
#> # A tibble: 9 × 2
#> cyl top_3_mpg
#> <dbl> <dbl>
#> 1 4 33.9
#> 2 4 32.4
#> 3 4 30.4
#> 4 6 21.4
#> 5 6 21
#> 6 6 21
#> 7 8 19.2
#> 8 8 18.7
#> 9 8 17.3
# For older dplyr, use summarize with list
mtcars %>%
group_by(cyl) %>%
summarize(
top_3_mpg = list(head(sort(mpg, decreasing = TRUE), 3))
) %>%
tidyr::unnest(top_3_mpg)
#> # A tibble: 9 × 2
#> cyl top_3_mpg
#> <dbl> <dbl>
#> 1 4 33.9
#> 2 4 32.4
#> 3 4 30.4
#> 4 6 21.4
#> 5 6 21
#> 6 6 21
#> 7 8 19.2
#> 8 8 18.7
#> 9 8 17.3✅ SOLUTION 3: Use slice() Instead
# Get top N rows per group with slice
mtcars %>%
group_by(cyl) %>%
slice_max(mpg, n = 3) %>%
select(cyl, mpg)
#> Error in select(., cyl, mpg): unused arguments (cyl, mpg)
# Or with arrange and slice
mtcars %>%
group_by(cyl) %>%
arrange(desc(mpg)) %>%
slice(1:3) %>%
select(cyl, mpg)
#> Error in select(., cyl, mpg): unused arguments (cyl, mpg)21.6 mutate() with Groups
💡 Key Insight: mutate() Keeps All Rows
# mutate() operates within groups but keeps all rows
mtcars %>%
group_by(cyl) %>%
mutate(
group_mean = mean(mpg),
deviation = mpg - group_mean
) %>%
select(cyl, mpg, group_mean, deviation) %>%
arrange(cyl, mpg)
#> Error in select(., cyl, mpg, group_mean, deviation): unused arguments (cyl, mpg, group_mean, deviation)
# Ranking within groups
mtcars %>%
group_by(cyl) %>%
mutate(
mpg_rank = rank(desc(mpg)),
mpg_dense_rank = dense_rank(desc(mpg)),
mpg_row_number = row_number(desc(mpg))
) %>%
select(cyl, mpg, mpg_rank, mpg_dense_rank, mpg_row_number) %>%
arrange(cyl, desc(mpg))
#> Error in select(., cyl, mpg, mpg_rank, mpg_dense_rank, mpg_row_number): unused arguments (cyl, mpg, mpg_rank, mpg_dense_rank, mpg_row_number)
# Cumulative operations within groups
mtcars %>%
group_by(cyl) %>%
arrange(mpg) %>%
mutate(
cumsum_mpg = cumsum(mpg),
cummax_mpg = cummax(mpg)
) %>%
select(cyl, mpg, cumsum_mpg, cummax_mpg)
#> Error in select(., cyl, mpg, cumsum_mpg, cummax_mpg): unused arguments (cyl, mpg, cumsum_mpg, cummax_mpg)
# Lead and lag within groups
mtcars %>%
group_by(cyl) %>%
arrange(mpg) %>%
mutate(
prev_mpg = lag(mpg),
next_mpg = lead(mpg),
diff_from_prev = mpg - lag(mpg)
) %>%
select(cyl, mpg, prev_mpg, next_mpg, diff_from_prev)
#> Error in select(., cyl, mpg, prev_mpg, next_mpg, diff_from_prev): unused arguments (cyl, mpg, prev_mpg, next_mpg, diff_from_prev)21.7 filter() with Groups
💡 Key Insight: filter() with Groups
# Filter based on group statistics
mtcars %>%
group_by(cyl) %>%
filter(mpg > mean(mpg)) %>%
select(cyl, mpg) %>%
arrange(cyl, mpg)
#> Error in select(., cyl, mpg): unused arguments (cyl, mpg)
# Keep only groups meeting criteria
mtcars %>%
group_by(cyl) %>%
filter(n() >= 10) %>% # Only groups with 10+ observations
select(cyl) %>%
distinct()
#> Error in select(., cyl): unused argument (cyl)
# Top N per group
mtcars %>%
group_by(cyl) %>%
filter(rank(desc(mpg)) <= 2) %>%
select(cyl, mpg) %>%
arrange(cyl, desc(mpg))
#> Error in select(., cyl, mpg): unused arguments (cyl, mpg)
# Filter by group condition
mtcars %>%
group_by(cyl) %>%
filter(mean(mpg) > 20) %>% # Only groups with avg mpg > 20
select(cyl, mpg)
#> Error in select(., cyl, mpg): unused arguments (cyl, mpg)21.8 Multiple Grouping Variables
💡 Key Insight: Multiple Groups
# Group by multiple variables
mtcars %>%
group_by(cyl, gear) %>%
summarize(
n = n(),
avg_mpg = mean(mpg),
.groups = "drop"
)
#> # A tibble: 8 × 4
#> cyl gear n avg_mpg
#> <dbl> <dbl> <int> <dbl>
#> 1 4 3 1 21.5
#> 2 4 4 8 26.9
#> 3 4 5 2 28.2
#> 4 6 3 2 19.8
#> 5 6 4 4 19.8
#> 6 6 5 1 19.7
#> 7 8 3 12 15.0
#> 8 8 5 2 15.4
# Order matters for operations
mtcars %>%
group_by(cyl, gear) %>%
mutate(
rank_in_cyl_gear = row_number(desc(mpg))
) %>%
select(cyl, gear, mpg, rank_in_cyl_gear) %>%
arrange(cyl, gear, desc(mpg))
#> Error in select(., cyl, gear, mpg, rank_in_cyl_gear): unused arguments (cyl, gear, mpg, rank_in_cyl_gear)
# Count combinations
mtcars %>%
count(cyl, gear)
#> cyl gear n
#> 1 4 3 1
#> 2 4 4 8
#> 3 4 5 2
#> 4 6 3 2
#> 5 6 4 4
#> 6 6 5 1
#> 7 8 3 12
#> 8 8 5 2
# Or with group_by + summarize
mtcars %>%
group_by(cyl, gear) %>%
summarize(n = n(), .groups = "drop")
#> # A tibble: 8 × 3
#> cyl gear n
#> <dbl> <dbl> <int>
#> 1 4 3 1
#> 2 4 4 8
#> 3 4 5 2
#> 4 6 3 2
#> 5 6 4 4
#> 6 6 5 1
#> 7 8 3 12
#> 8 8 5 221.9 Error #3: Unexpected Grouping After summarize()
⭐⭐ INTERMEDIATE 🧠 LOGIC
21.9.1 The Problem
# Summarize with multiple groups
result <- mtcars %>%
group_by(cyl, gear) %>%
summarize(avg_mpg = mean(mpg)) # Message about grouping
#> `summarise()` has grouped output by 'cyl'. You can override using the `.groups`
#> argument.
# Still grouped!
is_grouped_df(result)
#> [1] TRUE
group_vars(result)
#> [1] "cyl"
# Next operation uses this grouping
result %>%
mutate(rank = row_number()) # Within cyl groups!
#> # A tibble: 8 × 4
#> # Groups: cyl [3]
#> cyl gear avg_mpg rank
#> <dbl> <dbl> <dbl> <int>
#> 1 4 3 21.5 1
#> 2 4 4 26.9 2
#> 3 4 5 28.2 3
#> 4 6 3 19.8 1
#> 5 6 4 19.8 2
#> 6 6 5 19.7 3
#> 7 8 3 15.0 1
#> 8 8 5 15.4 221.9.3 Solutions
✅ SOLUTION 1: Use .groups = “drop”
✅ SOLUTION 2: Explicit ungroup()
✅ SOLUTION 3: Understand .groups Options
# .groups = "drop_last" (default)
mtcars %>%
group_by(cyl, gear) %>%
summarize(avg_mpg = mean(mpg), .groups = "drop_last") %>%
group_vars() # Still has cyl
#> [1] "cyl"
# .groups = "keep"
mtcars %>%
group_by(cyl, gear) %>%
summarize(avg_mpg = mean(mpg), .groups = "keep") %>%
group_vars() # Has both cyl and gear
#> [1] "cyl" "gear"
# .groups = "drop"
mtcars %>%
group_by(cyl, gear) %>%
summarize(avg_mpg = mean(mpg), .groups = "drop") %>%
group_vars() # No groups
#> character(0)
# .groups = "rowwise"
mtcars %>%
group_by(cyl, gear) %>%
summarize(avg_mpg = mean(mpg), .groups = "rowwise") %>%
class()
#> [1] "rowwise_df" "tbl_df" "tbl" "data.frame"21.10 Window Functions
🎯 Best Practice: Window Functions with Groups
# Ranking functions
mtcars %>%
group_by(cyl) %>%
mutate(
rank = rank(desc(mpg)), # Ties get average rank
dense_rank = dense_rank(desc(mpg)), # Ties get same rank, no gaps
row_number = row_number(desc(mpg)), # Ties broken arbitrarily
percent_rank = percent_rank(desc(mpg)), # Percentile
cume_dist = cume_dist(desc(mpg)) # Cumulative distribution
) %>%
select(cyl, mpg, rank, dense_rank, row_number, percent_rank, cume_dist) %>%
arrange(cyl, desc(mpg))
#> Error in select(., cyl, mpg, rank, dense_rank, row_number, percent_rank, : unused arguments (cyl, mpg, rank, dense_rank, row_number, percent_rank, cume_dist)
# Lead and lag
mtcars %>%
group_by(cyl) %>%
arrange(mpg) %>%
mutate(
prev_value = lag(mpg, 1), # Previous value
next_value = lead(mpg, 1), # Next value
prev_2 = lag(mpg, 2), # 2 rows back
change = mpg - lag(mpg, 1) # Change from previous
) %>%
select(cyl, mpg, prev_value, next_value, change)
#> Error in select(., cyl, mpg, prev_value, next_value, change): unused arguments (cyl, mpg, prev_value, next_value, change)
# Cumulative functions
mtcars %>%
group_by(cyl) %>%
arrange(mpg) %>%
mutate(
cumsum = cumsum(mpg),
cummean = cummean(mpg),
cummin = cummin(mpg),
cummax = cummax(mpg)
) %>%
select(cyl, mpg, cumsum, cummean, cummin, cummax)
#> Error in select(., cyl, mpg, cumsum, cummean, cummin, cummax): unused arguments (cyl, mpg, cumsum, cummean, cummin, cummax)
# Offset functions
mtcars %>%
group_by(cyl) %>%
arrange(mpg) %>%
mutate(
first_val = first(mpg),
last_val = last(mpg),
nth_val = nth(mpg, 3)
) %>%
select(cyl, mpg, first_val, last_val, nth_val)
#> Error in select(., cyl, mpg, first_val, last_val, nth_val): unused arguments (cyl, mpg, first_val, last_val, nth_val)21.11 count() and add_count()
💡 Key Insight: Counting Shortcuts
# count() is shortcut for group_by + summarize + ungroup
mtcars %>%
count(cyl)
#> cyl n
#> 1 4 11
#> 2 6 7
#> 3 8 14
# Equivalent to:
mtcars %>%
group_by(cyl) %>%
summarize(n = n()) %>%
ungroup()
#> # A tibble: 3 × 2
#> cyl n
#> <dbl> <int>
#> 1 4 11
#> 2 6 7
#> 3 8 14
# Multiple variables
mtcars %>%
count(cyl, gear)
#> cyl gear n
#> 1 4 3 1
#> 2 4 4 8
#> 3 4 5 2
#> 4 6 3 2
#> 5 6 4 4
#> 6 6 5 1
#> 7 8 3 12
#> 8 8 5 2
# With sorting
mtcars %>%
count(cyl, sort = TRUE)
#> cyl n
#> 1 8 14
#> 2 4 11
#> 3 6 7
# With weights
mtcars %>%
count(cyl, wt = mpg) # Sum of mpg instead of count
#> cyl n
#> 1 4 293.3
#> 2 6 138.2
#> 3 8 211.4
# add_count() keeps all rows
mtcars %>%
add_count(cyl) %>%
select(cyl, n, everything())
#> Error in select(., cyl, n, everything()): unused arguments (cyl, n, everything())
# Name the count column
mtcars %>%
add_count(cyl, name = "n_per_cyl") %>%
select(cyl, n_per_cyl, mpg)
#> Error in select(., cyl, n_per_cyl, mpg): unused arguments (cyl, n_per_cyl, mpg)21.12 across() for Multiple Columns
🎯 Best Practice: across() for Multiple Columns
# Summarize multiple columns
mtcars %>%
group_by(cyl) %>%
summarize(
across(c(mpg, hp, wt), mean),
.groups = "drop"
)
#> # A tibble: 3 × 4
#> cyl mpg hp wt
#> <dbl> <dbl> <dbl> <dbl>
#> 1 4 26.7 82.6 2.29
#> 2 6 19.7 122. 3.12
#> 3 8 15.1 209. 4.00
# With named functions
mtcars %>%
group_by(cyl) %>%
summarize(
across(c(mpg, hp, wt),
list(mean = mean, sd = sd),
.names = "{.col}_{.fn}"),
.groups = "drop"
)
#> # A tibble: 3 × 7
#> cyl mpg_mean mpg_sd hp_mean hp_sd wt_mean wt_sd
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 4 26.7 4.51 82.6 20.9 2.29 0.570
#> 2 6 19.7 1.45 122. 24.3 3.12 0.356
#> 3 8 15.1 2.56 209. 51.0 4.00 0.759
# Using selection helpers
mtcars %>%
group_by(cyl) %>%
summarize(
across(where(is.numeric), mean),
.groups = "drop"
)
#> # A tibble: 3 × 11
#> cyl mpg disp hp drat wt qsec vs am gear carb
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 4 26.7 105. 82.6 4.07 2.29 19.1 0.909 0.727 4.09 1.55
#> 2 6 19.7 183. 122. 3.59 3.12 18.0 0.571 0.429 3.86 3.43
#> 3 8 15.1 353. 209. 3.23 4.00 16.8 0 0.143 3.29 3.5
# In mutate
mtcars %>%
group_by(cyl) %>%
mutate(
across(c(mpg, hp),
~ . - mean(.),
.names = "{.col}_centered")
) %>%
select(cyl, mpg, mpg_centered, hp, hp_centered)
#> Error in select(., cyl, mpg, mpg_centered, hp, hp_centered): unused arguments (cyl, mpg, mpg_centered, hp, hp_centered)
# Multiple functions
mtcars %>%
group_by(cyl) %>%
summarize(
across(c(mpg, hp),
list(min = min, max = max, mean = mean)),
.groups = "drop"
)
#> # A tibble: 3 × 7
#> cyl mpg_min mpg_max mpg_mean hp_min hp_max hp_mean
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 4 21.4 33.9 26.7 52 113 82.6
#> 2 6 17.8 21.4 19.7 105 175 122.
#> 3 8 10.4 19.2 15.1 150 335 209.21.13 Common Pitfalls
⚠️ Common Grouping Pitfalls
# Pitfall 1: Forgetting to ungroup
bad <- mtcars %>%
group_by(cyl) %>%
mutate(mpg_scaled = mpg / mean(mpg))
# Still grouped!
good <- mtcars %>%
group_by(cyl) %>%
mutate(mpg_scaled = mpg / mean(mpg)) %>%
ungroup()
# Pitfall 2: Summarizing already summarized data
mtcars %>%
group_by(cyl) %>%
summarize(avg_mpg = mean(mpg)) %>%
summarize(overall_avg = mean(avg_mpg)) # Wrong! Treats groups equally
#> # A tibble: 1 × 1
#> overall_avg
#> <dbl>
#> 1 20.5
# Correct: weight by group size
mtcars %>%
group_by(cyl) %>%
summarize(
avg_mpg = mean(mpg),
n = n(),
.groups = "drop"
) %>%
summarize(overall_avg = sum(avg_mpg * n) / sum(n))
#> # A tibble: 1 × 1
#> overall_avg
#> <dbl>
#> 1 20.1
# Pitfall 3: Group order affecting row_number
set.seed(123)
df <- tibble(
group = rep(c("A", "B"), each = 3),
value = sample(1:6)
)
# Different results!
df %>% group_by(group) %>% mutate(rn = row_number())
#> # A tibble: 6 × 3
#> # Groups: group [2]
#> group value rn
#> <chr> <int> <int>
#> 1 A 3 1
#> 2 A 6 2
#> 3 A 2 3
#> 4 B 4 1
#> 5 B 5 2
#> 6 B 1 3
df %>% arrange(value) %>% group_by(group) %>% mutate(rn = row_number())
#> # A tibble: 6 × 3
#> # Groups: group [2]
#> group value rn
#> <chr> <int> <int>
#> 1 B 1 1
#> 2 A 2 1
#> 3 A 3 2
#> 4 B 4 2
#> 5 B 5 3
#> 6 A 6 3
# Pitfall 4: Comparing grouped vs ungrouped
mtcars %>%
group_by(cyl) %>%
filter(mpg == max(mpg)) %>% # Max within each cyl
nrow()
#> [1] 3
mtcars %>%
filter(mpg == max(mpg)) %>% # Overall max
nrow()
#> [1] 121.14 Summary
Key Takeaways:
- group_by() is invisible - Data looks same but behaves differently
- Always ungroup - Use
ungroup()or.groups = "drop" - summarize() reduces groups - Drops last group by default
- mutate() keeps all rows - Operations within groups
- Window functions - Work great with groups
- Check grouping status - Use
group_vars()oris_grouped_df() - across() for multiple columns - Apply same operation to many columns
Quick Reference:
| Error | Cause | Fix |
|---|---|---|
| Unexpected grouped behavior | Forgot data is grouped | Check with group_vars(), use ungroup() |
| must be size 1 | summarize() with vector | Use single-value function or reframe() |
| Wrong statistics | Operating on groups | Ungroup or use different operation |
Grouping Operations:
# Basic grouping
data %>% group_by(col)
# Multiple groups
data %>% group_by(col1, col2)
# Ungroup
data %>% ungroup()
# Check grouping
group_vars(data)
is_grouped_df(data)
n_groups(data)
# Summarize with groups
data %>%
group_by(col) %>%
summarize(stat = mean(value), .groups = "drop")
# Mutate with groups (keeps all rows)
data %>%
group_by(col) %>%
mutate(group_mean = mean(value))
# Filter with groups
data %>%
group_by(col) %>%
filter(value > mean(value))
# Count
data %>% count(col)
data %>% add_count(col)
# across for multiple columns
data %>%
group_by(col) %>%
summarize(across(where(is.numeric), mean))Best Practices:
# ✅ Good
data %>% group_by(col) %>%
summarize(stat = mean(x), .groups = "drop") # Explicit ungrouping
data %>% group_by(col) %>% ... %>% ungroup() # Ungroup at end
Always check: group_vars(data) # Verify grouping
# ❌ Avoid
data %>% group_by(col) %>% summarize(...) # No .groups argument
Leave data grouped # Causes confusion later
Assume data is/isn't grouped # Always check21.15 Exercises
📝 Exercise 1: Group Statistics
Using mtcars:
1. Group by cyl
2. Calculate mean, median, sd of mpg per group
3. Add count and min/max mpg
4. Sort by mean mpg descending
5. Ensure result is ungrouped
📝 Exercise 2: Window Functions
Using mtcars:
1. Group by cyl
2. Add rank of mpg within each cyl
3. Add difference from group mean
4. Add percentile within group
5. Keep only top 2 per group
📝 Exercise 3: Group Detector
Write check_grouped_operations(data) that:
1. Checks if data is grouped
2. Shows what grouping variables exist
3. Warns about potential issues
4. Suggests when to ungroup
📝 Exercise 4: Grouped Summary Report
Write grouped_summary(data, group_vars, summary_vars) that:
1. Groups by specified columns
2. Summarizes multiple statistics for each summary variable
3. Includes count and percentage of total
4. Returns ungrouped, sorted result
21.16 Exercise Answers
Click to see answers
Exercise 1:
result <- mtcars %>%
group_by(cyl) %>%
summarize(
count = n(),
mean_mpg = mean(mpg),
median_mpg = median(mpg),
sd_mpg = sd(mpg),
min_mpg = min(mpg),
max_mpg = max(mpg),
.groups = "drop"
) %>%
arrange(desc(mean_mpg))
result
#> # A tibble: 3 × 7
#> cyl count mean_mpg median_mpg sd_mpg min_mpg max_mpg
#> <dbl> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 4 11 26.7 26 4.51 21.4 33.9
#> 2 6 7 19.7 19.7 1.45 17.8 21.4
#> 3 8 14 15.1 15.2 2.56 10.4 19.2
# Verify it's ungrouped
is_grouped_df(result)
#> [1] FALSEExercise 2:
result <- mtcars %>%
group_by(cyl) %>%
mutate(
mpg_rank = rank(desc(mpg)),
deviation_from_mean = mpg - mean(mpg),
percentile = percent_rank(mpg)
) %>%
filter(mpg_rank <= 2) %>%
select(cyl, mpg, mpg_rank, deviation_from_mean, percentile) %>%
arrange(cyl, mpg_rank) %>%
ungroup()
#> Error in select(., cyl, mpg, mpg_rank, deviation_from_mean, percentile): unused arguments (cyl, mpg, mpg_rank, deviation_from_mean, percentile)
result
#> # A tibble: 3 × 7
#> cyl count mean_mpg median_mpg sd_mpg min_mpg max_mpg
#> <dbl> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 4 11 26.7 26 4.51 21.4 33.9
#> 2 6 7 19.7 19.7 1.45 17.8 21.4
#> 3 8 14 15.1 15.2 2.56 10.4 19.2Exercise 3:
check_grouped_operations <- function(data, warn_threshold = 3) {
cat("=== Grouping Analysis ===\n\n")
# Check if grouped
if (!is_grouped_df(data)) {
cat("✓ Data is NOT grouped\n")
return(invisible(data))
}
# Get grouping info
grp_vars <- group_vars(data)
n_grps <- n_groups(data)
cat("⚠ Data IS grouped\n")
cat("Grouping variables:", paste(grp_vars, collapse = ", "), "\n")
cat("Number of groups:", n_grps, "\n\n")
# Group sizes
grp_sizes <- data %>%
summarize(n = n(), .groups = "drop") %>%
pull(n)
cat("Group size summary:\n")
cat(" Min:", min(grp_sizes), "\n")
cat(" Max:", max(grp_sizes), "\n")
cat(" Mean:", round(mean(grp_sizes), 1), "\n\n")
# Warnings
if (length(grp_vars) > warn_threshold) {
warning("Many grouping variables (", length(grp_vars),
") may lead to many small groups")
}
if (any(grp_sizes == 1)) {
warning("Some groups have only 1 observation")
}
# Suggestions
cat("Suggestions:\n")
cat(" - Use ungroup() when done with grouped operations\n")
cat(" - Or use .groups = 'drop' in summarize()\n")
cat(" - Check grouping with group_vars() before operations\n")
invisible(data)
}
# Test
mtcars %>%
group_by(cyl) %>%
check_grouped_operations()
#> === Grouping Analysis ===
#>
#> ⚠ Data IS grouped
#> Grouping variables: cyl
#> Number of groups: 3
#>
#> Group size summary:
#> Min: 7
#> Max: 14
#> Mean: 10.7
#>
#> Suggestions:
#> - Use ungroup() when done with grouped operations
#> - Or use .groups = 'drop' in summarize()
#> - Check grouping with group_vars() before operations
mtcars %>%
group_by(cyl, gear, carb) %>%
check_grouped_operations()
#> === Grouping Analysis ===
#>
#> ⚠ Data IS grouped
#> Grouping variables: cyl, gear, carb
#> Number of groups: 12
#>
#> Group size summary:
#> Min: 1
#> Max: 5
#> Mean: 2.7
#> Warning in check_grouped_operations(.): Some groups have only 1 observation
#> Suggestions:
#> - Use ungroup() when done with grouped operations
#> - Or use .groups = 'drop' in summarize()
#> - Check grouping with group_vars() before operationsExercise 4:
grouped_summary <- function(data, group_vars, summary_vars) {
# Get total count for percentages
total_n <- nrow(data)
# Create summary
result <- data %>%
group_by(across(all_of(group_vars))) %>%
summarize(
count = n(),
pct_of_total = n() / total_n * 100,
across(
all_of(summary_vars),
list(
mean = ~mean(., na.rm = TRUE),
median = ~median(., na.rm = TRUE),
sd = ~sd(., na.rm = TRUE),
min = ~min(., na.rm = TRUE),
max = ~max(., na.rm = TRUE)
),
.names = "{.col}_{.fn}"
),
.groups = "drop"
) %>%
arrange(desc(count))
result
}
# Test
grouped_summary(
mtcars,
group_vars = c("cyl"),
summary_vars = c("mpg", "hp")
)
#> # A tibble: 3 × 13
#> cyl count pct_of_total mpg_mean mpg_median mpg_sd mpg_min mpg_max hp_mean
#> <dbl> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 8 14 43.8 15.1 15.2 2.56 10.4 19.2 209.
#> 2 4 11 34.4 26.7 26 4.51 21.4 33.9 82.6
#> 3 6 7 21.9 19.7 19.7 1.45 17.8 21.4 122.
#> # ℹ 4 more variables: hp_median <dbl>, hp_sd <dbl>, hp_min <dbl>, hp_max <dbl>
grouped_summary(
mtcars,
group_vars = c("cyl", "gear"),
summary_vars = c("mpg", "hp", "wt")
)
#> # A tibble: 8 × 19
#> cyl gear count pct_of_total mpg_mean mpg_median mpg_sd mpg_min mpg_max
#> <dbl> <dbl> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 8 3 12 37.5 15.0 15.2 2.77 10.4 19.2
#> 2 4 4 8 25 26.9 25.8 4.81 21.4 33.9
#> 3 6 4 4 12.5 19.8 20.1 1.55 17.8 21
#> 4 4 5 2 6.25 28.2 28.2 3.11 26 30.4
#> 5 6 3 2 6.25 19.8 19.8 2.33 18.1 21.4
#> 6 8 5 2 6.25 15.4 15.4 0.566 15 15.8
#> 7 4 3 1 3.12 21.5 21.5 NA 21.5 21.5
#> 8 6 5 1 3.12 19.7 19.7 NA 19.7 19.7
#> # ℹ 10 more variables: hp_mean <dbl>, hp_median <dbl>, hp_sd <dbl>,
#> # hp_min <dbl>, hp_max <dbl>, wt_mean <dbl>, wt_median <dbl>, wt_sd <dbl>,
#> # wt_min <dbl>, wt_max <dbl>
# With iris
grouped_summary(
iris,
group_vars = "Species",
summary_vars = c("Sepal.Length", "Sepal.Width", "Petal.Length")
)
#> # A tibble: 3 × 18
#> Species count pct_of_total Sepal.Length_mean Sepal.Length_median
#> <fct> <int> <dbl> <dbl> <dbl>
#> 1 setosa 50 33.3 5.01 5
#> 2 versicolor 50 33.3 5.94 5.9
#> 3 virginica 50 33.3 6.59 6.5
#> # ℹ 13 more variables: Sepal.Length_sd <dbl>, Sepal.Length_min <dbl>,
#> # Sepal.Length_max <dbl>, Sepal.Width_mean <dbl>, Sepal.Width_median <dbl>,
#> # Sepal.Width_sd <dbl>, Sepal.Width_min <dbl>, Sepal.Width_max <dbl>,
#> # Petal.Length_mean <dbl>, Petal.Length_median <dbl>, Petal.Length_sd <dbl>,
#> # Petal.Length_min <dbl>, Petal.Length_max <dbl>