Chapter 31 Regression & ANOVA
What You’ll Learn:
- Linear regression
- ANOVA (one-way, two-way)
- Model diagnostics
- Common regression errors
- Interpreting results
Key Errors Covered: 20+ regression errors
Difficulty: ⭐⭐⭐ Advanced
31.1 Introduction
Regression and ANOVA are fundamental statistical tools:
# Simple linear regression
model <- lm(mpg ~ hp, data = mtcars)
summary(model)
#>
#> Call:
#> lm(formula = mpg ~ hp, data = mtcars)
#>
#> Residuals:
#> Min 1Q Median 3Q Max
#> -5.7121 -2.1122 -0.8854 1.5819 8.2360
#>
#> Coefficients:
#> Estimate Std. Error t value Pr(>|t|)
#> (Intercept) 30.09886 1.63392 18.421 < 2e-16 ***
#> hp -0.06823 0.01012 -6.742 1.79e-07 ***
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> Residual standard error: 3.863 on 30 degrees of freedom
#> Multiple R-squared: 0.6024, Adjusted R-squared: 0.5892
#> F-statistic: 45.46 on 1 and 30 DF, p-value: 1.788e-07
Let’s master these analyses!
31.2 Linear Regression
💡 Key Insight: lm() Function
# Simple regression (one predictor)
model1 <- lm(mpg ~ hp, data = mtcars)
summary(model1)
#>
#> Call:
#> lm(formula = mpg ~ hp, data = mtcars)
#>
#> Residuals:
#> Min 1Q Median 3Q Max
#> -5.7121 -2.1122 -0.8854 1.5819 8.2360
#>
#> Coefficients:
#> Estimate Std. Error t value Pr(>|t|)
#> (Intercept) 30.09886 1.63392 18.421 < 2e-16 ***
#> hp -0.06823 0.01012 -6.742 1.79e-07 ***
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> Residual standard error: 3.863 on 30 degrees of freedom
#> Multiple R-squared: 0.6024, Adjusted R-squared: 0.5892
#> F-statistic: 45.46 on 1 and 30 DF, p-value: 1.788e-07
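# Interpretation: each additional horsepower is associated with about 0.068 fewer mpg, on average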
# Multiple regression
model2 <- lm(mpg ~ hp + wt + cyl, data = mtcars)
summary(model2)
#>
#> Call:
#> lm(formula = mpg ~ hp + wt + cyl, data = mtcars)
#>
#> Residuals:
#> Min 1Q Median 3Q Max
#> -3.9290 -1.5598 -0.5311 1.1850 5.8986
#>
#> Coefficients:
#> Estimate Std. Error t value Pr(>|t|)
#> (Intercept) 38.75179 1.78686 21.687 < 2e-16 ***
#> hp -0.01804 0.01188 -1.519 0.140015
#> wt -3.16697 0.74058 -4.276 0.000199 ***
#> cyl -0.94162 0.55092 -1.709 0.098480 .
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> Residual standard error: 2.512 on 28 degrees of freedom
#> Multiple R-squared: 0.8431, Adjusted R-squared: 0.8263
#> F-statistic: 50.17 on 3 and 28 DF, p-value: 2.184e-11
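# Note: hp is no longer significant (p = 0.14) once wt and cyl are in the model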
# With interaction
model3 <- lm(mpg ~ hp * wt, data = mtcars)
summary(model3)
#>
#> Call:
#> lm(formula = mpg ~ hp * wt, data = mtcars)
#>
#> Residuals:
#> Min 1Q Median 3Q Max
#> -3.0632 -1.6491 -0.7362 1.4211 4.5513
#>
#> Coefficients:
#> Estimate Std. Error t value Pr(>|t|)
#> (Intercept) 49.80842 3.60516 13.816 5.01e-14 ***
#> hp -0.12010 0.02470 -4.863 4.04e-05 ***
#> wt -8.21662 1.26971 -6.471 5.20e-07 ***
#> hp:wt 0.02785 0.00742 3.753 0.000811 ***
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> Residual standard error: 2.153 on 28 degrees of freedom
#> Multiple R-squared: 0.8848, Adjusted R-squared: 0.8724
#> F-statistic: 71.66 on 3 and 28 DF, p-value: 2.981e-13
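# The positive hp:wt term means the hp effect weakens in heavier cars: the slope for hp is -0.120 + 0.028 * wt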
# Polynomial
model4 <- lm(mpg ~ poly(hp, 2), data = mtcars)
summary(model4)
#>
#> Call:
#> lm(formula = mpg ~ poly(hp, 2), data = mtcars)
#>
#> Residuals:
#> Min 1Q Median 3Q Max
#> -4.5512 -1.6027 -0.6977 1.5509 8.7213
#>
#> Coefficients:
#> Estimate Std. Error t value Pr(>|t|)
#> (Intercept) 20.091 0.544 36.931 < 2e-16 ***
#> poly(hp, 2)1 -26.046 3.077 -8.464 2.51e-09 ***
#> poly(hp, 2)2 13.155 3.077 4.275 0.000189 ***
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> Residual standard error: 3.077 on 29 degrees of freedom
#> Multiple R-squared: 0.7561, Adjusted R-squared: 0.7393
#> F-statistic: 44.95 on 2 and 29 DF, p-value: 1.301e-09
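Note that poly() fits orthogonal polynomials by default, so the two coefficients above are not the raw hp and hp^2 terms. If you want coefficients on the original scale, pass raw = TRUE; the fitted values and R-squared are identical either way:
# Raw (non-orthogonal) polynomial: same fit, directly interpretable coefficients
model4_raw <- lm(mpg ~ poly(hp, 2, raw = TRUE), data = mtcars)
coef(model4_raw)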
# Extract components
coef(model1) # Coefficients
#> (Intercept) hp
#> 30.09886054 -0.06822828
fitted(model1) # Fitted values
#> Mazda RX4 Mazda RX4 Wag Datsun 710 Hornet 4 Drive
#> 22.593750 22.593750 23.753631 22.593750
#> Hornet Sportabout Valiant Duster 360 Merc 240D
#> 18.158912 22.934891 13.382932 25.868707
#> Merc 230 Merc 280 Merc 280C Merc 450SE
#> 23.617174 21.706782 21.706782 17.817770
#> Merc 450SL Merc 450SLC Cadillac Fleetwood Lincoln Continental
#> 17.817770 17.817770 16.112064 15.429781
#> Chrysler Imperial Fiat 128 Honda Civic Toyota Corolla
#> 14.406357 25.595794 26.550990 25.664022
#> Toyota Corona Dodge Challenger AMC Javelin Camaro Z28
#> 23.480718 19.864619 19.864619 13.382932
#> Pontiac Firebird Fiat X1-9 Porsche 914-2 Lotus Europa
#> 18.158912 25.595794 23.890087 22.389065
#> Ford Pantera L Ferrari Dino Maserati Bora Volvo 142E
#> 12.086595 18.158912 7.242387 22.661978
residuals(model1) # Residuals
#> Mazda RX4 Mazda RX4 Wag Datsun 710 Hornet 4 Drive
#> -1.59374995 -1.59374995 -0.95363068 -1.19374995
#> Hornet Sportabout Valiant Duster 360 Merc 240D
#> 0.54108812 -4.83489134 0.91706759 -1.46870730
#> Merc 230 Merc 280 Merc 280C Merc 450SE
#> -0.81717412 -2.50678234 -3.90678234 -1.41777049
#> Merc 450SL Merc 450SLC Cadillac Fleetwood Lincoln Continental
#> -0.51777049 -2.61777049 -5.71206353 -5.02978075
#> Chrysler Imperial Fiat 128 Honda Civic Toyota Corolla
#> 0.29364342 6.80420581 3.84900992 8.23597754
#> Toyota Corona Dodge Challenger AMC Javelin Camaro Z28
#> -1.98071757 -4.36461883 -4.66461883 -0.08293241
#> Pontiac Firebird Fiat X1-9 Porsche 914-2 Lotus Europa
#> 1.04108812 1.70420581 2.10991276 8.01093488
#> Ford Pantera L Ferrari Dino Maserati Bora Volvo 142E
#> 3.71340487 1.54108812 7.75761261 -1.26197823
predict(model1) # Predictions
#> Mazda RX4 Mazda RX4 Wag Datsun 710 Hornet 4 Drive
#> 22.593750 22.593750 23.753631 22.593750
#> Hornet Sportabout Valiant Duster 360 Merc 240D
#> 18.158912 22.934891 13.382932 25.868707
#> Merc 230 Merc 280 Merc 280C Merc 450SE
#> 23.617174 21.706782 21.706782 17.817770
#> Merc 450SL Merc 450SLC Cadillac Fleetwood Lincoln Continental
#> 17.817770 17.817770 16.112064 15.429781
#> Chrysler Imperial Fiat 128 Honda Civic Toyota Corolla
#> 14.406357 25.595794 26.550990 25.664022
#> Toyota Corona Dodge Challenger AMC Javelin Camaro Z28
#> 23.480718 19.864619 19.864619 13.382932
#> Pontiac Firebird Fiat X1-9 Porsche 914-2 Lotus Europa
#> 18.158912 25.595794 23.890087 22.389065
#> Ford Pantera L Ferrari Dino Maserati Bora Volvo 142E
#> 12.086595 18.158912 7.242387 22.661978
confint(model1) # Confidence intervals
#> 2.5 % 97.5 %
#> (Intercept) 26.76194879 33.4357723
#> hp -0.08889465 -0.04756193
31.3 Error #1: NA/NaN/Inf in 'y'
⭐ BEGINNER 📊 DATA
31.3.1 The Error
# Data with NA
data_na <- mtcars
data_na$mpg[1:3] <- NA
lm(mpg ~ hp, data = data_na)
#>
#> Call:
#> lm(formula = mpg ~ hp, data = data_na)
#>
#> Coefficients:
#> (Intercept) hp
#> 30.44325 -0.06956
🔴 ERROR/WARNING
With the default na.action = na.omit, the model still runs: lm() silently drops the rows with missing values. The shortened fitted values and residuals can then cause problems downstream, for example when binding them back to the original data. The error in the section title typically appears when non-finite values such as Inf reach the fitting routine instead of being removed.
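A minimal sketch of the downstream problem, using the data_na object created above:
model_na <- lm(mpg ~ hp, data = data_na)
length(residuals(model_na)) # 29: the 3 NA rows were dropped
nrow(data_na) # 32
# data_na$resid <- residuals(model_na) # would fail: lengths differ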
31.3.2 Solutions
✅ SOLUTION 1: Handle NAs Explicitly
# Remove NAs before modeling
data_clean <- mtcars[complete.cases(mtcars[, c("mpg", "hp")]), ]
model <- lm(mpg ~ hp, data = data_clean)
# Or use na.omit
data_clean2 <- na.omit(mtcars[, c("mpg", "hp", "wt")])
model2 <- lm(mpg ~ hp + wt, data = data_clean2)
# Check for NAs
cat("Complete cases:", sum(complete.cases(mtcars)), "\n")
#> Complete cases: 32
cat("NAs in mpg:", sum(is.na(mtcars$mpg)), "\n")
#> NAs in mpg: 0
✅ SOLUTION 2: Use na.action
# Different NA handling
model_fail <- lm(mpg ~ hp, data = data_na, na.action = na.fail) # Fails if NA
#> Error in na.fail.default(structure(list(mpg = c(NA, NA, NA, 21.4, 18.7, : missing values in object
# model_exclude <- lm(mpg ~ hp, data = data_na, na.action = na.exclude) # Excludes
# model_omit <- lm(mpg ~ hp, data = data_na, na.action = na.omit) # Omits (default)
cat("na.omit: removes NAs, shortens residuals\n")
#> na.omit: removes NAs, shortens residuals
cat("na.exclude: removes NAs, preserves length with NAs\n")
#> na.exclude: removes NAs, preserves length with NAs
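A short sketch of the difference, refitting the commented-out models above on data_na:
model_omit <- lm(mpg ~ hp, data = data_na, na.action = na.omit)
model_exclude <- lm(mpg ~ hp, data = data_na, na.action = na.exclude)
length(residuals(model_omit)) # 29: NA rows dropped entirely
length(residuals(model_exclude)) # 32: padded with NA, lines up with data_na
data_na$resid <- residuals(model_exclude) # works because the lengths match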
31.4 Model Diagnostics
💡 Key Insight: Checking Assumptions
model <- lm(mpg ~ hp + wt, data = mtcars)
# Diagnostic plots
par(mfrow = c(2, 2))
plot(model)
par(mfrow = c(1, 1))
# 1. Residuals vs Fitted - Check linearity
# 2. Q-Q plot - Check normality
# 3. Scale-Location - Check homoscedasticity
# 4. Residuals vs Leverage - Check influential points
# Individual checks
# Normality of residuals
shapiro.test(residuals(model))
#>
#> Shapiro-Wilk normality test
#>
#> data: residuals(model)
#> W = 0.92792, p-value = 0.03427
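# p = 0.034 < 0.05: some evidence that the residuals deviate from normality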
# Homoscedasticity
library(lmtest)
bptest(model)
#>
#> studentized Breusch-Pagan test
#>
#> data: model
#> BP = 0.88072, df = 2, p-value = 0.6438
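# p = 0.64: no evidence of heteroscedasticity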
# Multicollinearity
library(car)
vif(model)
#> hp wt
#> 1.766625 1.766625
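# Both VIFs are well below the common rule-of-thumb cutoff of 5, so multicollinearity is not a concern here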
31.5 ANOVA
💡 Key Insight: Analysis of Variance
# One-way ANOVA
anova_model <- aov(mpg ~ factor(cyl), data = mtcars)
summary(anova_model)
#> Df Sum Sq Mean Sq F value Pr(>F)
#> factor(cyl) 2 824.8 412.4 39.7 4.98e-09 ***
#> Residuals 29 301.3 10.4
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Two-way ANOVA
anova_model2 <- aov(mpg ~ factor(cyl) + factor(gear), data = mtcars)
summary(anova_model2)
#> Df Sum Sq Mean Sq F value Pr(>F)
#> factor(cyl) 2 824.8 412.4 38.00 1.41e-08 ***
#> factor(gear) 2 8.3 4.1 0.38 0.687
#> Residuals 27 293.0 10.9
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# With interaction
anova_model3 <- aov(mpg ~ factor(cyl) * factor(gear), data = mtcars)
summary(anova_model3)
#> Df Sum Sq Mean Sq F value Pr(>F)
#> factor(cyl) 2 824.8 412.4 36.777 4.92e-08 ***
#> factor(gear) 2 8.3 4.1 0.368 0.696
#> factor(cyl):factor(gear) 3 23.9 8.0 0.710 0.555
#> Residuals 24 269.1 11.2
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Post-hoc tests
TukeyHSD(anova_model)
#> Tukey multiple comparisons of means
#> 95% family-wise confidence level
#>
#> Fit: aov(formula = mpg ~ factor(cyl), data = mtcars)
#>
#> $`factor(cyl)`
#> diff lwr upr p adj
#> 6-4 -6.920779 -10.769350 -3.0722086 0.0003424
#> 8-4 -11.563636 -14.770779 -8.3564942 0.0000000
#> 8-6 -4.642857 -8.327583 -0.9581313 0.0112287
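# All three pairwise differences in mean mpg are significant at the 5% level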
31.6 Error #2: contrasts can be applied only to factors
⭐⭐ INTERMEDIATE 🔢 TYPE
31.6.1 The Error
# Using numeric variable as factor
model <- lm(mpg ~ cyl, data = mtcars, contrasts = list(cyl = contr.treatment))
#> Error in `contrasts<-`(`*tmp*`, value = ca): contrasts apply only to factors
🔴 ERROR
Error in `contrasts<-`: contrasts apply only to factors. A closely related message, "contrasts can be applied only to factors with 2 or more levels", appears when the variable is a factor but ends up with fewer than two levels (for example after subsetting).
31.6.2 Solutions
✅ SOLUTION: Convert to Factor
# Convert numeric to factor
mtcars$cyl_factor <- factor(mtcars$cyl)
model <- lm(mpg ~ cyl_factor, data = mtcars)
summary(model)
#>
#> Call:
#> lm(formula = mpg ~ cyl_factor, data = mtcars)
#>
#> Residuals:
#> Min 1Q Median 3Q Max
#> -5.2636 -1.8357 0.0286 1.3893 7.2364
#>
#> Coefficients:
#> Estimate Std. Error t value Pr(>|t|)
#> (Intercept) 26.6636 0.9718 27.437 < 2e-16 ***
#> cyl_factor6 -6.9208 1.5583 -4.441 0.000119 ***
#> cyl_factor8 -11.5636 1.2986 -8.905 8.57e-10 ***
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> Residual standard error: 3.223 on 29 degrees of freedom
#> Multiple R-squared: 0.7325, Adjusted R-squared: 0.714
#> F-statistic: 39.7 on 2 and 29 DF, p-value: 4.979e-09
# For ANOVA, must be factor
anova_model <- aov(mpg ~ factor(cyl), data = mtcars)
summary(anova_model)
#> Df Sum Sq Mean Sq F value Pr(>F)
#> factor(cyl) 2 824.8 412.4 39.7 4.98e-09 ***
#> Residuals 29 301.3 10.4
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
31.7 Predictions
🎯 Best Practice: Making Predictions
model <- lm(mpg ~ hp + wt, data = mtcars)
# Predict on original data
predictions <- predict(model)
head(predictions)
#> Mazda RX4 Mazda RX4 Wag Datsun 710 Hornet 4 Drive
#> 23.57233 22.58348 25.27582 21.26502
#> Hornet Sportabout Valiant
#> 18.32727 20.47382
# Predict on new data
new_data <- data.frame(
hp = c(100, 150, 200),
wt = c(2.5, 3.0, 3.5)
)
predict(model, newdata = new_data)
#> 1 2 3
#> 24.35540 20.82784 17.30027
# With confidence intervals
predict(model, newdata = new_data, interval = "confidence")
#> fit lwr upr
#> 1 24.35540 23.15968 25.55111
#> 2 20.82784 19.83556 21.82012
#> 3 17.30027 16.07235 18.52820
# With prediction intervals
predict(model, newdata = new_data, interval = "prediction")
#> fit lwr upr
#> 1 24.35540 18.91817 29.79263
#> 2 20.82784 15.43169 26.22398
#> 3 17.30027 11.85587 22.74468
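# Prediction intervals are wider than confidence intervals: they cover individual new observations, not just the mean response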
31.8 Error #3: variable lengths differ
⭐ BEGINNER 📊 DATA
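31.8.1 The Error
This error appears when the variables in a formula do not all have the same number of observations, most often because the formula mixes a column from data with a free-standing vector of a different length, or because one variable was subset and the other was not. A minimal sketch (the y_short vector is invented here just to trigger the error):
# y_short has 30 values, but mtcars has 32 rows
y_short <- mtcars$mpg[1:30]
# lm(y_short ~ hp, data = mtcars) # would error: variable lengths differ
31.8.2 Solutions
✅ SOLUTION: Keep All Variables in One Data Frame
# Subset the whole data frame so the response and predictors stay aligned
dat <- mtcars[1:30, ]
model <- lm(mpg ~ hp, data = dat)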
31.9 Model Comparison
🎯 Best Practice: Compare Models
# Nested models
model1 <- lm(mpg ~ hp, data = mtcars)
model2 <- lm(mpg ~ hp + wt, data = mtcars)
model3 <- lm(mpg ~ hp + wt + cyl, data = mtcars)
# ANOVA comparison
anova(model1, model2, model3)
#> Analysis of Variance Table
#>
#> Model 1: mpg ~ hp
#> Model 2: mpg ~ hp + wt
#> Model 3: mpg ~ hp + wt + cyl
#> Res.Df RSS Df Sum of Sq F Pr(>F)
#> 1 30 447.67
#> 2 29 195.05 1 252.627 40.0494 7.597e-07 ***
#> 3 28 176.62 1 18.427 2.9213 0.09848 .
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# AIC/BIC
AIC(model1, model2, model3)
#> df AIC
#> model1 3 181.2386
#> model2 4 156.6523
#> model3 5 155.4766
BIC(model1, model2, model3)
#> df BIC
#> model1 3 185.6358
#> model2 4 162.5153
#> model3 5 162.8053
# R-squared
summary(model1)$r.squared
#> [1] 0.6024373
summary(model2)$r.squared
#> [1] 0.8267855
summary(model3)$r.squared
#> [1] 0.84315
# Adjusted R-squared (accounts for # predictors)
summary(model1)$adj.r.squared
#> [1] 0.5891853
summary(model2)$adj.r.squared
#> [1] 0.8148396
summary(model3)$adj.r.squared
#> [1] 0.8263446
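# anova() (p = 0.098) and BIC slightly favor model2, while AIC and adjusted R-squared slightly favor model3: adding cyl buys little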
31.10 Summary
Key Takeaways:
- Handle NAs explicitly - Check before modeling
- Factors for ANOVA - Convert numeric to factor
- Check diagnostics - Assumptions matter
- Compare models - Use ANOVA, AIC, BIC
- Report properly - Coefficients, p-values, R²
- Predict carefully - Match variable names
Quick Reference:
# Linear regression
model <- lm(y ~ x, data = df)
model <- lm(y ~ x1 + x2, data = df)
model <- lm(y ~ x1 * x2, data = df) # Interaction
# ANOVA
model <- aov(y ~ factor(group), data = df)
TukeyHSD(model)
# Diagnostics
plot(model)
summary(model)
# Predictions
predict(model, newdata = new_df)
predict(model, interval = "confidence")