Chapter 13 Factor Operations
What You’ll Learn:
- Recoding and relabeling factors
- Grouping and collapsing levels
- Reordering factors
- Factor arithmetic issues
- Common factor operation errors
Key Errors Covered: 10+ factor operation errors
Difficulty: ⭐⭐ Intermediate
13.1 Introduction
Working with existing factors requires care:
grades <- factor(c("A", "B", "C", "B", "A"))
grades > "B" # Try to compare
#> Warning in Ops.factor(grades, "B"): '>' not meaningful for factors
#> [1] NA NA NA NA NA
🟡 WARNING
Warning in Ops.factor(grades, "B") :
'>' not meaningful for factors
Let’s master factor operations to avoid these errors.
13.2 Recoding Factors
💡 Key Insight: Recoding Strategies
# Original factor
responses <- factor(c("Y", "N", "Y", "N", "M"))
responses
#> [1] Y N Y N M
#> Levels: M N Y
# Method 1: Relabel levels directly
responses_v1 <- responses
levels(responses_v1) <- c("Maybe", "No", "Yes") # Alphabetical order!
responses_v1
#> [1] Yes No Yes No Maybe
#> Levels: Maybe No Yes
# Method 2: Create mapping
library(forcats)
responses_v2 <- fct_recode(responses,
"Yes" = "Y",
"No" = "N",
"Maybe" = "M"
)
responses_v2
#> [1] Yes No Yes No Maybe
#> Levels: Maybe No Yes
# Method 3: Convert to character, recode, factor
responses_v3 <- as.character(responses)
responses_v3[responses_v3 == "Y"] <- "Yes"
responses_v3[responses_v3 == "N"] <- "No"
responses_v3[responses_v3 == "M"] <- "Maybe"
responses_v3 <- factor(responses_v3)
responses_v3
#> [1] Yes No Yes No Maybe
#> Levels: Maybe No Yes
# Method 4: Use named vector mapping
recode_map <- c(Y = "Yes", N = "No", M = "Maybe")
responses_v4 <- factor(recode_map[as.character(responses)])
responses_v4
#> Y N Y N M
#> Yes No Yes No Maybe
#> Levels: Maybe No Yes
13.3 Error #1: level sets of factors are different
⭐⭐ INTERMEDIATE 🔢 TYPE
13.3.1 The Error
sizes1 <- factor(c("S", "M", "L"))
sizes2 <- factor(c("M", "L", "XL"))
sizes1 == sizes2
#> Error in Ops.factor(sizes1, sizes2): level sets of factors are different🔴 ERROR
Error in Ops.factor(sizes1, sizes2) :
level sets of factors are different
13.3.2 What It Means
You’re comparing factors with different level sets. R doesn’t know how to match them.
13.3.3 Common Causes
13.3.3.2 Cause 2: After Subsetting
all_sizes <- factor(c("S", "M", "L", "XL"))
# Different subsets
small_sizes <- all_sizes[1:2]
large_sizes <- all_sizes[3:4]
# Both still have all levels
levels(small_sizes)
#> [1] "L" "M" "S" "XL"
levels(large_sizes)
#> [1] "L" "M" "S" "XL"
# But after droplevels...
small_sizes <- droplevels(small_sizes)
large_sizes <- droplevels(large_sizes)
levels(small_sizes) # S, M
#> [1] "M" "S"
levels(large_sizes) # L, XL
#> [1] "L" "XL"
# Now can't compare
small_sizes[1] == large_sizes[1]
#> [1] FALSE
13.3.4 Solutions
✅ SOLUTION 1: Convert to Character
✅ SOLUTION 2: Unify Levels
sizes1 <- factor(c("S", "M", "L"))
sizes2 <- factor(c("M", "L", "XL"))
# Find all unique levels
all_levels <- union(levels(sizes1), levels(sizes2))
all_levels
#> [1] "L" "M" "S" "XL"
# Recreate with same levels
sizes1 <- factor(sizes1, levels = all_levels)
sizes2 <- factor(sizes2, levels = all_levels)
# Now can compare
sizes1 == sizes2
#> [1] FALSE FALSE FALSE
13.4 Error #2: NAs produced in factor operations
⭐ BEGINNER 🔢 TYPE
13.4.1 The Warning
sizes <- factor(c("S", "M", "L"))
levels(sizes) <- c("Small", "Large") # Only 2 new names for 3 levels!
#> Error in `levels<-.factor`(`*tmp*`, value = c("Small", "Large")): number of levels differs
🔴 ERROR
Error in `levels<-.factor`(`*tmp*`, value = c("Small", "Large")) :
number of levels differs
13.4.2 What It Means
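When relabeling with levels(x) <- ..., you must supply exactly one new name per existing level. If you supply fewer, R stops with "number of levels differs" and leaves the factor unchanged; it does not recycle the names.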
13.4.3 The Problem
# Original
sizes <- factor(c("S", "M", "L", "XL"))
levels(sizes)
#> [1] "L" "M" "S" "XL"
# Try to simplify to 2 categories
levels(sizes) <- c("Small", "Large")
#> Error in `levels<-.factor`(`*tmp*`, value = c("Small", "Large")): number of levels differs
sizes # Unchanged: the assignment above failed, so nothing was relabeled
#> [1] S M L XL
#> Levels: L M S XL
13.5 Collapsing Factor Levels
🎯 Best Practice: Grouping Levels
library(forcats)
# Original detailed categories
age_groups <- factor(c("0-10", "11-20", "21-30", "31-40",
"41-50", "51-60", "61-70", "71+"))
# Method 1: fct_collapse (explicit groups)
age_collapsed <- fct_collapse(age_groups,
Youth = c("0-10", "11-20"),
Adult = c("21-30", "31-40", "41-50"),
Senior = c("51-60", "61-70", "71+")
)
age_collapsed
#> [1] Youth Youth Adult Adult Adult Senior Senior Senior
#> Levels: Youth Adult Senior
# Method 2: fct_other (keep some, lump rest)
top_ages <- fct_other(age_groups,
keep = c("21-30", "31-40", "41-50"),
other_level = "Other"
)
top_ages
#> [1] Other Other 21-30 31-40 41-50 Other Other Other
#> Levels: 21-30 31-40 41-50 Other
# Method 3: fct_lump (keep n most frequent)
responses <- factor(c("A", "A", "A", "B", "B", "C", "D", "E"))
lumped <- fct_lump(responses, n = 2) # Keep top 2
lumped
#> [1] A A A B B Other Other Other
#> Levels: A B Other
# Method 4: fct_lump_min (minimum count)
lumped_min <- fct_lump_min(responses, min = 2) # Keep if appears 2+ times
lumped_min
#> [1] A A A B B Other Other Other
#> Levels: A B Other
13.6 Reordering Factors
💡 Key Insight: Factor Ordering
library(forcats)
# Original (alphabetical by default)
grades <- factor(c("B", "A", "C", "A", "B"))
grades
#> [1] B A C A B
#> Levels: A B C
levels(grades)
#> [1] "A" "B" "C"
# Method 1: Specify order explicitly
grades <- factor(grades, levels = c("C", "B", "A"))
levels(grades)
#> [1] "C" "B" "A"
# Method 2: By frequency
grades <- fct_infreq(grades)
levels(grades) # Most common first
#> [1] "B" "A" "C"
# Method 3: By another variable
df <- data.frame(
name = c("Alice", "Bob", "Charlie"),
score = c(95, 85, 90)
)
df$name <- factor(df$name)
# Order by score
df$name <- fct_reorder(df$name, df$score)
levels(df$name) # Ordered by score
#> [1] "Bob" "Charlie" "Alice"
# Method 4: Reverse order
grades <- fct_rev(grades)
levels(grades)
#> [1] "C" "A" "B"
# Method 5: Manual reordering
grades <- fct_relevel(grades, "A", "B", "C")
levels(grades)
#> [1] "A" "B" "C"
13.7 Error #3: 'ordered' must be a factor
⭐ BEGINNER 🔢 TYPE
13.7.1 The Error
sizes <- c("S", "M", "L") # Character vector
ordered(sizes) # Try to make ordered
#> [1] S M L
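#> Levels: L < M < S
⚠️ NOTE
No error is raised here: ordered() accepts a character vector. The real problem is that, without an explicit levels argument, the levels are sorted alphabetically (L < M < S) instead of the intended S < M < L.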
13.7.3 Solutions
✅ SOLUTION 1: Convert to Factor First
sizes <- c("S", "M", "L")
# Option A: Two steps
sizes_fac <- factor(sizes)
sizes_ord <- ordered(sizes_fac, levels = c("S", "M", "L"))
sizes_ord
#> [1] S M L
#> Levels: S < M < L
# Option B: Direct with levels
sizes_ord <- ordered(sizes, levels = c("S", "M", "L"))
sizes_ord
#> [1] S M L
#> Levels: S < M < L
13.8 Factor Arithmetic Errors
⚠️ Pitfall: You Can’t Do Math on Factors
# Numeric-looking factor
numbers <- factor(c("1", "2", "3", "4", "5"))
# Try arithmetic
numbers + 10
#> Warning in Ops.factor(numbers, 10): '+' not meaningful for factors
#> [1] NA NA NA NA NA
numbers * 2
#> Warning in Ops.factor(numbers, 2): '*' not meaningful for factors
#> [1] NA NA NA NA NA
mean(numbers)
#> Warning in mean.default(numbers): argument is not numeric or logical: returning
#> NA
#> [1] NA
Why: Factors are categorical, not numeric, even if they look like numbers.
Solution: Convert through character first, e.g. as.numeric(as.character(numbers)) + 10.
13.9 Useful forcats Functions
🎯 Best Practice: forcats Toolkit
library(forcats)
# Sample data
responses <- factor(c("Good", "Bad", "Good", "Excellent", "Bad",
"Good", "Fair", "Excellent"))
# 1. Count levels
fct_count(responses)
#> # A tibble: 4 × 2
#> f n
#> <fct> <int>
#> 1 Bad 2
#> 2 Excellent 2
#> 3 Fair 1
#> 4 Good 3
# 2. Reorder by frequency
responses_freq <- fct_infreq(responses)
levels(responses_freq)
#> [1] "Good" "Bad" "Excellent" "Fair"
# 3. Reverse order
responses_rev <- fct_rev(responses_freq)
levels(responses_rev)
#> [1] "Fair" "Excellent" "Bad" "Good"
# 4. Recode specific levels
responses_clean <- fct_recode(responses,
"Very Good" = "Excellent",
"Not Good" = "Bad"
)
levels(responses_clean)
#> [1] "Not Good" "Very Good" "Fair" "Good"
# 5. Lump rare levels
responses_lumped <- fct_lump(responses, n = 2, other_level = "Other")
table(responses_lumped)
#> responses_lumped
#> Bad Excellent Good Other
#> 2 2 3 1
# 6. Add new levels
responses_expanded <- fct_expand(responses, "Outstanding")
levels(responses_expanded)
#> [1] "Bad" "Excellent" "Fair" "Good" "Outstanding"
# 7. Drop unused levels
sub <- responses[responses %in% c("Good", "Bad")]
levels(sub) # Still has all levels
#> [1] "Bad" "Excellent" "Fair" "Good"
sub <- fct_drop(sub)
levels(sub) # Only Good and Bad
#> [1] "Bad" "Good"
# 8. Explicit ordering
responses_ordered <- fct_relevel(responses,
"Bad", "Fair", "Good", "Excellent")
levels(responses_ordered)
#> [1] "Bad" "Fair" "Good" "Excellent"
13.10 Factor in Data Frames
💡 Key Insight: Factors in Data Analysis
library(dplyr)
# Sample data
df <- data.frame(
category = factor(c("A", "B", "A", "C", "B", "A")),
value = c(10, 20, 15, 25, 22, 18)
)
# Factors work well with grouping
df %>%
group_by(category) %>%
summarise(mean_value = mean(value))
#> # A tibble: 3 × 2
#> category mean_value
#> <fct> <dbl>
#> 1 A 14.3
#> 2 B 21
#> 3 C 25
# But watch out for unused levels after filtering
df_filtered <- df %>%
filter(category != "C")
levels(df_filtered$category) # C still there!
#> [1] "A" "B" "C"
# Drop unused
df_filtered <- df_filtered %>%
mutate(category = droplevels(category))
levels(df_filtered$category) # Now just A and B
#> [1] "A" "B"
# Or use forcats
df_filtered <- df %>%
filter(category != "C") %>%
mutate(category = fct_drop(category))
13.11 Converting Between Types
🎯 Best Practice: Type Conversions
# Factor to character
f <- factor(c("a", "b", "c"))
as.character(f)
#> [1] "a" "b" "c"
# Factor to numeric (for numeric-looking factors)
f_num <- factor(c("10", "20", "30"))
as.numeric(as.character(f_num)) # Correct
#> [1] 10 20 30
# NOT: as.numeric(f_num) # Wrong! Gives 1, 2, 3
# Character to factor
ch <- c("x", "y", "z")
factor(ch)
#> [1] x y z
#> Levels: x y z
# Numeric to factor (with labels)
nums <- c(1, 2, 3, 2, 1)
factor(nums, levels = 1:3, labels = c("Low", "Medium", "High"))
#> [1] Low Medium High Medium Low
#> Levels: Low Medium High
# Ordered to unordered
ord <- ordered(c("S", "M", "L"), levels = c("S", "M", "L"))
factor(ord, ordered = FALSE)
#> [1] S M L
#> Levels: S M L
# Unordered to ordered
unord <- factor(c("S", "M", "L"))
ordered(unord, levels = c("S", "M", "L"))
#> [1] S M L
#> Levels: S < M < L
13.12 Summary
Key Takeaways:
- Can’t compare factors with different levels - Unify first or convert to character
- Recoding requires all levels - Use fct_recode() or fct_collapse()
- Can’t do arithmetic on factors - Convert to numeric first
- Unordered factors can’t use < > - Use ordered() for ordinal data
- droplevels() after subsetting - Remove unused levels
- forcats makes factor work easier - Use it!
- Convert through character when going to numeric
Quick Reference:
| Error | Cause | Fix |
|---|---|---|
| level sets are different | Comparing different factors | Unify levels or convert |
| number of levels differs | Wrong number of level names in levels(x) <- ... | Use fct_recode() or provide all names |
| Ordered levels come out alphabetical | Calling ordered() without levels | Pass levels = in the intended order |
| Can’t do arithmetic | Math on factor | Convert to numeric |
| Can’t use < > | Unordered factor comparison | Make ordered or use character |
forcats Essential Functions:
# Reordering
fct_infreq() # By frequency
fct_reorder() # By another variable
fct_relevel() # Manually
fct_rev() # Reverse
# Recoding
fct_recode() # Explicit mapping
fct_collapse() # Group levels
fct_lump() # Combine rare levels
# Utilities
fct_count() # Count levels
fct_drop() # Drop unused
fct_expand() # Add levels
Best Practices:
# ✅ Good
as.numeric(as.character(f)) # Factor to numeric
fct_recode(f, "new" = "old") # Explicit recoding
fct_drop(f) # After subsetting
ordered(f, levels = c(...)) # For ordinal data
# ❌ Avoid
as.numeric(f) # Wrong conversion
levels(f) <- too_few_names # Errors: number of levels differs
f1 == f2 # Without checking levels
factor(x) without levels # Implicit ordering
13.13 Exercises
📝 Exercise 1: Factor Recoding
You have:
grades <- factor(c("A+", "A", "A-", "B+", "B", "B-", "C+", "C"))
- Collapse to simple letter grades (A, B, C)
- Convert to ordered factor
- Create numeric scale (A=4, B=3, C=2)
📝 Exercise 2: Survey Data Cleaning
You have messy survey responses:
responses <- factor(c("yes", "Yes", "YES", "no", "No", "NO", "maybe", "Maybe"))
Standardize all to lowercase and create ordered factor: Disagree < Maybe < Agree
📝 Exercise 3: Factor Comparison
Write safe_compare(f1, f2, op) that:
1. Checks if factors can be compared
2. Unifies levels if needed
3. Performs comparison
4. Returns result with warnings
📝 Exercise 4: Factor Summary
Write factor_summary(f) that reports:
1. Number of levels
2. Most/least common levels
3. Any unused levels
4. Whether it’s ordered
5. Suggested recoding (if many levels)
13.14 Exercise Answers
Click to see answers
Exercise 1:
library(forcats)
grades <- factor(c("A+", "A", "A-", "B+", "B", "B-", "C+", "C"))
# 1. Collapse to letter grades
grades_simple <- fct_collapse(grades,
A = c("A+", "A", "A-"),
B = c("B+", "B", "B-"),
C = c("C+", "C")
)
grades_simple
#> [1] A A A B B B C C
#> Levels: A B C
# 2. Convert to ordered
grades_ordered <- ordered(grades_simple, levels = c("C", "B", "A"))
grades_ordered
#> [1] A A A B B B C C
#> Levels: C < B < A
# 3. Numeric scale
grade_to_numeric <- c(A = 4, B = 3, C = 2)
grades_numeric <- grade_to_numeric[as.character(grades_simple)]
grades_numeric
#> A A A B B B C C
#> 4 4 4 3 3 3 2 2
Exercise 2:
responses <- factor(c("yes", "Yes", "YES", "no", "No", "NO",
"maybe", "Maybe"))
# Standardize to lowercase
responses_clean <- tolower(as.character(responses))
# Map to agreement scale
responses_clean[responses_clean == "yes"] <- "Agree"
responses_clean[responses_clean == "no"] <- "Disagree"
responses_clean[responses_clean == "maybe"] <- "Maybe"
# Create ordered factor
responses_ordered <- ordered(
responses_clean,
levels = c("Disagree", "Maybe", "Agree")
)
responses_ordered
#> [1] Agree Agree Agree Disagree Disagree Disagree Maybe Maybe
#> Levels: Disagree < Maybe < Agree
Exercise 3:
#' Compare two factors element-wise after reconciling their levels
#'
#' Validates that both inputs are factors, warns about a length mismatch,
#' rebuilds both factors on a shared level set when their levels are not
#' identical, and promotes them to ordered factors (with a warning) when a
#' relational operator is requested on inputs that are not both ordered.
#'
#' @param f1,f2 Factors to compare.
#' @param op Comparison operator given as a string: one of "==", "!=", "<",
#'   ">", "<=", ">=". When omitted, "==" is used.
#' @return The result of applying `op` to the (possibly re-levelled) factors.
safe_compare <- function(f1, f2, op = c("==", "!=", "<", ">", "<=", ">=")) {
  op <- match.arg(op)

  # Guard: reject anything that is not a pair of factors
  if (!(is.factor(f1) && is.factor(f2))) {
    stop("Both inputs must be factors")
  }

  # A length mismatch is reported but does not abort the comparison
  n1 <- length(f1)
  n2 <- length(f2)
  if (n1 != n2) {
    warning("Factors have different lengths: ",
            n1, " vs ", n2)
  }

  # Rebuild both factors on the combined level set when the levels differ
  lv1 <- levels(f1)
  lv2 <- levels(f2)
  if (!identical(lv1, lv2)) {
    message("Factors have different levels. Unifying...")
    merged <- union(lv1, lv2)
    f1 <- factor(f1, levels = merged)
    f2 <- factor(f2, levels = merged)
  }

  # Relational operators need ordered factors; promote both otherwise
  relational <- op %in% c("<", ">", "<=", ">=")
  if (relational && !(is.ordered(f1) && is.ordered(f2))) {
    warning("Using ordering operators on unordered factors. ",
            "Converting to ordered.")
    f1 <- ordered(f1, levels = levels(f1))
    f2 <- ordered(f2, levels = levels(f2))
  }

  # Dispatch on the operator string; the matched arm is the return value
  switch(op,
    "==" = f1 == f2,
    "!=" = f1 != f2,
    "<" = f1 < f2,
    ">" = f1 > f2,
    "<=" = f1 <= f2,
    ">=" = f1 >= f2
  )
}
# Test
f1 <- factor(c("A", "B", "C"))
f2 <- factor(c("B", "C", "D"))
safe_compare(f1, f2, "==")
#> Factors have different levels. Unifying...
#> [1] FALSE FALSE FALSE
Exercise 4:
#' Print a diagnostic summary of a factor
#'
#' Reports whether the factor is ordered, its number of levels and
#' observations, the per-level frequencies, the most and least common levels,
#' any unused levels, and (for factors with more than 10 levels) a suggestion
#' to lump rare levels together.
#'
#' @param f A factor. Anything else raises an error.
#' @return Invisibly, a list with elements `n_levels` (number of levels),
#'   `counts` (the `table()` of `f`) and `unused` (character vector of levels
#'   that never occur in `f`).
factor_summary <- function(f) {
  if (!is.factor(f)) {
    stop("Input must be a factor")
  }

  n_levels <- nlevels(f)
  n_obs <- length(f)
  # Keep the argument spelled `f` so the printed table header is unchanged
  level_counts <- table(f)

  # Basic info
  cat("Factor Summary\n")
  cat("==============\n\n")
  cat("Type:", if (is.ordered(f)) "Ordered" else "Unordered", "\n")
  cat("Number of levels:", n_levels, "\n")
  cat("Number of observations:", n_obs, "\n\n")

  if (n_levels == 0L) {
    # With no levels, max()/which.max() on the empty table would print
    # "-Inf" and raise a warning, so skip the frequency report entirely.
    cat("Factor has no levels; nothing to tabulate.\n\n")
  } else {
    # Level counts
    cat("Level frequencies:\n")
    print(sort(level_counts, decreasing = TRUE))
    cat("\n")

    # Most/least common (ties resolve to the first level in level order)
    cat("Most common:", names(which.max(level_counts)),
        "(", max(level_counts), "times)\n")
    cat("Least common:", names(which.min(level_counts)),
        "(", min(level_counts), "times)\n\n")
  }

  # Unused levels: declared in levels(f) but absent from the data
  unused_levels <- setdiff(levels(f), as.character(f))
  if (length(unused_levels) > 0) {
    cat("⚠ Unused levels:", paste(unused_levels, collapse = ", "), "\n")
    cat(" Consider using droplevels()\n\n")
  }

  # Suggestions
  if (n_levels > 10) {
    cat("💡 Suggestion: Factor has", n_levels, "levels.\n")
    cat(" Consider grouping rare levels with fct_lump()\n")
    # Rare levels are those covering less than 5% of the observations
    rare <- level_counts < (0.05 * n_obs)
    if (any(rare)) {
      cat(" Rare levels:", paste(names(level_counts)[rare], collapse = ", "), "\n")
    }
  }

  invisible(list(
    n_levels = n_levels,
    counts = level_counts,
    unused = unused_levels
  ))
}
# Test
grades <- factor(c("A", "B", "A", "C", "B", "A"),
levels = c("A", "B", "C", "D", "F"))
factor_summary(grades)
#> Factor Summary
#> ==============
#>
#> Type: Unordered
#> Number of levels: 5
#> Number of observations: 6
#>
#> Level frequencies:
#> f
#> A B C D F
#> 3 2 1 0 0
#>
#> Most common: A ( 3 times)
#> Least common: D ( 0 times)
#>
#> ⚠ Unused levels: D, F
#> Consider using droplevels()