Chapter 31 Regression & ANOVA

What You’ll Learn:

  • Linear regression
  • ANOVA (one-way, two-way)
  • Model diagnostics
  • Common regression errors
  • Interpreting results

Key Errors Covered: 20+ regression errors

Difficulty: ⭐⭐⭐ Advanced

31.1 Introduction

Regression and ANOVA are fundamental statistical tools:

# Simple linear regression
model <- lm(mpg ~ hp, data = mtcars)
summary(model)
#> 
#> Call:
#> lm(formula = mpg ~ hp, data = mtcars)
#> 
#> Residuals:
#>     Min      1Q  Median      3Q     Max 
#> -5.7121 -2.1122 -0.8854  1.5819  8.2360 
#> 
#> Coefficients:
#>             Estimate Std. Error t value Pr(>|t|)    
#> (Intercept) 30.09886    1.63392  18.421  < 2e-16 ***
#> hp          -0.06823    0.01012  -6.742 1.79e-07 ***
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> Residual standard error: 3.863 on 30 degrees of freedom
#> Multiple R-squared:  0.6024, Adjusted R-squared:  0.5892 
#> F-statistic: 45.46 on 1 and 30 DF,  p-value: 1.788e-07

Let’s master these analyses!

31.2 Linear Regression

💡 Key Insight: lm() Function

# Simple regression (one predictor)
model1 <- lm(mpg ~ hp, data = mtcars)
summary(model1)
#> 
#> Call:
#> lm(formula = mpg ~ hp, data = mtcars)
#> 
#> Residuals:
#>     Min      1Q  Median      3Q     Max 
#> -5.7121 -2.1122 -0.8854  1.5819  8.2360 
#> 
#> Coefficients:
#>             Estimate Std. Error t value Pr(>|t|)    
#> (Intercept) 30.09886    1.63392  18.421  < 2e-16 ***
#> hp          -0.06823    0.01012  -6.742 1.79e-07 ***
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> Residual standard error: 3.863 on 30 degrees of freedom
#> Multiple R-squared:  0.6024, Adjusted R-squared:  0.5892 
#> F-statistic: 45.46 on 1 and 30 DF,  p-value: 1.788e-07

# Multiple regression
model2 <- lm(mpg ~ hp + wt + cyl, data = mtcars)
summary(model2)
#> 
#> Call:
#> lm(formula = mpg ~ hp + wt + cyl, data = mtcars)
#> 
#> Residuals:
#>     Min      1Q  Median      3Q     Max 
#> -3.9290 -1.5598 -0.5311  1.1850  5.8986 
#> 
#> Coefficients:
#>             Estimate Std. Error t value Pr(>|t|)    
#> (Intercept) 38.75179    1.78686  21.687  < 2e-16 ***
#> hp          -0.01804    0.01188  -1.519 0.140015    
#> wt          -3.16697    0.74058  -4.276 0.000199 ***
#> cyl         -0.94162    0.55092  -1.709 0.098480 .  
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> Residual standard error: 2.512 on 28 degrees of freedom
#> Multiple R-squared:  0.8431, Adjusted R-squared:  0.8263 
#> F-statistic: 50.17 on 3 and 28 DF,  p-value: 2.184e-11

# With interaction
model3 <- lm(mpg ~ hp * wt, data = mtcars)
summary(model3)
#> 
#> Call:
#> lm(formula = mpg ~ hp * wt, data = mtcars)
#> 
#> Residuals:
#>     Min      1Q  Median      3Q     Max 
#> -3.0632 -1.6491 -0.7362  1.4211  4.5513 
#> 
#> Coefficients:
#>             Estimate Std. Error t value Pr(>|t|)    
#> (Intercept) 49.80842    3.60516  13.816 5.01e-14 ***
#> hp          -0.12010    0.02470  -4.863 4.04e-05 ***
#> wt          -8.21662    1.26971  -6.471 5.20e-07 ***
#> hp:wt        0.02785    0.00742   3.753 0.000811 ***
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> Residual standard error: 2.153 on 28 degrees of freedom
#> Multiple R-squared:  0.8848, Adjusted R-squared:  0.8724 
#> F-statistic: 71.66 on 3 and 28 DF,  p-value: 2.981e-13

# Polynomial
model4 <- lm(mpg ~ poly(hp, 2), data = mtcars)
summary(model4)
#> 
#> Call:
#> lm(formula = mpg ~ poly(hp, 2), data = mtcars)
#> 
#> Residuals:
#>     Min      1Q  Median      3Q     Max 
#> -4.5512 -1.6027 -0.6977  1.5509  8.7213 
#> 
#> Coefficients:
#>              Estimate Std. Error t value Pr(>|t|)    
#> (Intercept)    20.091      0.544  36.931  < 2e-16 ***
#> poly(hp, 2)1  -26.046      3.077  -8.464 2.51e-09 ***
#> poly(hp, 2)2   13.155      3.077   4.275 0.000189 ***
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> Residual standard error: 3.077 on 29 degrees of freedom
#> Multiple R-squared:  0.7561, Adjusted R-squared:  0.7393 
#> F-statistic: 44.95 on 2 and 29 DF,  p-value: 1.301e-09

# Extract components
coef(model1)          # Coefficients
#> (Intercept)          hp 
#> 30.09886054 -0.06822828
fitted(model1)        # Fitted values
#>           Mazda RX4       Mazda RX4 Wag          Datsun 710      Hornet 4 Drive 
#>           22.593750           22.593750           23.753631           22.593750 
#>   Hornet Sportabout             Valiant          Duster 360           Merc 240D 
#>           18.158912           22.934891           13.382932           25.868707 
#>            Merc 230            Merc 280           Merc 280C          Merc 450SE 
#>           23.617174           21.706782           21.706782           17.817770 
#>          Merc 450SL         Merc 450SLC  Cadillac Fleetwood Lincoln Continental 
#>           17.817770           17.817770           16.112064           15.429781 
#>   Chrysler Imperial            Fiat 128         Honda Civic      Toyota Corolla 
#>           14.406357           25.595794           26.550990           25.664022 
#>       Toyota Corona    Dodge Challenger         AMC Javelin          Camaro Z28 
#>           23.480718           19.864619           19.864619           13.382932 
#>    Pontiac Firebird           Fiat X1-9       Porsche 914-2        Lotus Europa 
#>           18.158912           25.595794           23.890087           22.389065 
#>      Ford Pantera L        Ferrari Dino       Maserati Bora          Volvo 142E 
#>           12.086595           18.158912            7.242387           22.661978
residuals(model1)     # Residuals
#>           Mazda RX4       Mazda RX4 Wag          Datsun 710      Hornet 4 Drive 
#>         -1.59374995         -1.59374995         -0.95363068         -1.19374995 
#>   Hornet Sportabout             Valiant          Duster 360           Merc 240D 
#>          0.54108812         -4.83489134          0.91706759         -1.46870730 
#>            Merc 230            Merc 280           Merc 280C          Merc 450SE 
#>         -0.81717412         -2.50678234         -3.90678234         -1.41777049 
#>          Merc 450SL         Merc 450SLC  Cadillac Fleetwood Lincoln Continental 
#>         -0.51777049         -2.61777049         -5.71206353         -5.02978075 
#>   Chrysler Imperial            Fiat 128         Honda Civic      Toyota Corolla 
#>          0.29364342          6.80420581          3.84900992          8.23597754 
#>       Toyota Corona    Dodge Challenger         AMC Javelin          Camaro Z28 
#>         -1.98071757         -4.36461883         -4.66461883         -0.08293241 
#>    Pontiac Firebird           Fiat X1-9       Porsche 914-2        Lotus Europa 
#>          1.04108812          1.70420581          2.10991276          8.01093488 
#>      Ford Pantera L        Ferrari Dino       Maserati Bora          Volvo 142E 
#>          3.71340487          1.54108812          7.75761261         -1.26197823
predict(model1)       # Predictions
#>           Mazda RX4       Mazda RX4 Wag          Datsun 710      Hornet 4 Drive 
#>           22.593750           22.593750           23.753631           22.593750 
#>   Hornet Sportabout             Valiant          Duster 360           Merc 240D 
#>           18.158912           22.934891           13.382932           25.868707 
#>            Merc 230            Merc 280           Merc 280C          Merc 450SE 
#>           23.617174           21.706782           21.706782           17.817770 
#>          Merc 450SL         Merc 450SLC  Cadillac Fleetwood Lincoln Continental 
#>           17.817770           17.817770           16.112064           15.429781 
#>   Chrysler Imperial            Fiat 128         Honda Civic      Toyota Corolla 
#>           14.406357           25.595794           26.550990           25.664022 
#>       Toyota Corona    Dodge Challenger         AMC Javelin          Camaro Z28 
#>           23.480718           19.864619           19.864619           13.382932 
#>    Pontiac Firebird           Fiat X1-9       Porsche 914-2        Lotus Europa 
#>           18.158912           25.595794           23.890087           22.389065 
#>      Ford Pantera L        Ferrari Dino       Maserati Bora          Volvo 142E 
#>           12.086595           18.158912            7.242387           22.661978
confint(model1)       # Confidence intervals
#>                   2.5 %     97.5 %
#> (Intercept) 26.76194879 33.4357723
#> hp          -0.08889465 -0.0475619

31.3 Error #1: NA/NaN/Inf in 'y'

⭐ BEGINNER 📊 DATA

31.3.1 The Error

# Data with NA
data_na <- mtcars
data_na$mpg[1:3] <- NA

lm(mpg ~ hp, data = data_na)
#> 
#> Call:
#> lm(formula = mpg ~ hp, data = data_na)
#> 
#> Coefficients:
#> (Intercept)           hp  
#>    30.44325     -0.06956

🔴 ERROR/WARNING

Model runs but removes NAs with warning. Can cause issues downstream.

31.3.2 Solutions

SOLUTION 1: Handle NAs Explicitly

# Remove NAs before modeling
data_clean <- mtcars[complete.cases(mtcars[, c("mpg", "hp")]), ]
model <- lm(mpg ~ hp, data = data_clean)

# Or use na.omit
data_clean2 <- na.omit(mtcars[, c("mpg", "hp", "wt")])
model2 <- lm(mpg ~ hp + wt, data = data_clean2)

# Check for NAs
cat("Complete cases:", sum(complete.cases(mtcars)), "\n")
#> Complete cases: 32
cat("NAs in mpg:", sum(is.na(mtcars$mpg)), "\n")
#> NAs in mpg: 0

SOLUTION 2: Use na.action

# Different NA handling
model_fail <- lm(mpg ~ hp, data = data_na, na.action = na.fail)  # Fails if NA
#> Error in na.fail.default(structure(list(mpg = c(NA, NA, NA, 21.4, 18.7, : missing values in object
# model_exclude <- lm(mpg ~ hp, data = data_na, na.action = na.exclude)  # Excludes
# model_omit <- lm(mpg ~ hp, data = data_na, na.action = na.omit)  # Omits (default)

cat("na.omit: removes NAs, shortens residuals\n")
#> na.omit: removes NAs, shortens residuals
cat("na.exclude: removes NAs, preserves length with NAs\n")
#> na.exclude: removes NAs, preserves length with NAs

31.4 Model Diagnostics

💡 Key Insight: Checking Assumptions

model <- lm(mpg ~ hp + wt, data = mtcars)

# Diagnostic plots
par(mfrow = c(2, 2))
plot(model)
par(mfrow = c(1, 1))

# 1. Residuals vs Fitted - Check linearity
# 2. Q-Q plot - Check normality
# 3. Scale-Location - Check homoscedasticity
# 4. Residuals vs Leverage - Check influential points

# Individual checks
# Normality of residuals
shapiro.test(residuals(model))
#> 
#>  Shapiro-Wilk normality test
#> 
#> data:  residuals(model)
#> W = 0.92792, p-value = 0.03427

# Homoscedasticity
library(lmtest)
bptest(model)
#> Warning in formula$x: partial match of 'x' to 'xlevels'
#> 
#>  studentized Breusch-Pagan test
#> 
#> data:  model
#> BP = 0.88072, df = 2, p-value = 0.6438

# Multicollinearity
library(car)
vif(model)
#>       hp       wt 
#> 1.766625 1.766625

31.5 ANOVA

💡 Key Insight: Analysis of Variance

# One-way ANOVA
anova_model <- aov(mpg ~ factor(cyl), data = mtcars)
summary(anova_model)
#>             Df Sum Sq Mean Sq F value   Pr(>F)    
#> factor(cyl)  2  824.8   412.4    39.7 4.98e-09 ***
#> Residuals   29  301.3    10.4                     
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# Two-way ANOVA
anova_model2 <- aov(mpg ~ factor(cyl) + factor(gear), data = mtcars)
summary(anova_model2)
#>              Df Sum Sq Mean Sq F value   Pr(>F)    
#> factor(cyl)   2  824.8   412.4   38.00 1.41e-08 ***
#> factor(gear)  2    8.3     4.1    0.38    0.687    
#> Residuals    27  293.0    10.9                     
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# With interaction
anova_model3 <- aov(mpg ~ factor(cyl) * factor(gear), data = mtcars)
summary(anova_model3)
#>                          Df Sum Sq Mean Sq F value   Pr(>F)    
#> factor(cyl)               2  824.8   412.4  36.777 4.92e-08 ***
#> factor(gear)              2    8.3     4.1   0.368    0.696    
#> factor(cyl):factor(gear)  3   23.9     8.0   0.710    0.555    
#> Residuals                24  269.1    11.2                     
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# Post-hoc tests
TukeyHSD(anova_model)
#>   Tukey multiple comparisons of means
#>     95% family-wise confidence level
#> 
#> Fit: aov(formula = mpg ~ factor(cyl), data = mtcars)
#> 
#> $`factor(cyl)`
#>           diff        lwr        upr     p adj
#> 6-4  -6.920779 -10.769350 -3.0722086 0.0003424
#> 8-4 -11.563636 -14.770779 -8.3564942 0.0000000
#> 8-6  -4.642857  -8.327583 -0.9581313 0.0112287

31.6 Error #2: contrasts can be applied only to factors

⭐⭐ INTERMEDIATE 🔢 TYPE

31.6.1 The Error

# Using numeric variable as factor
model <- lm(mpg ~ cyl, data = mtcars, contrasts = list(cyl = contr.treatment))
#> Error in `contrasts<-`(`*tmp*`, value = ca): contrasts apply only to factors

🔴 ERROR

Error in contrasts<-: contrasts can be applied only to factors with 2 or more levels

31.6.2 Solutions

SOLUTION: Convert to Factor

# Convert numeric to factor
mtcars$cyl_factor <- factor(mtcars$cyl)
model <- lm(mpg ~ cyl_factor, data = mtcars)
summary(model)
#> 
#> Call:
#> lm(formula = mpg ~ cyl_factor, data = mtcars)
#> 
#> Residuals:
#>     Min      1Q  Median      3Q     Max 
#> -5.2636 -1.8357  0.0286  1.3893  7.2364 
#> 
#> Coefficients:
#>             Estimate Std. Error t value Pr(>|t|)    
#> (Intercept)  26.6636     0.9718  27.437  < 2e-16 ***
#> cyl_factor6  -6.9208     1.5583  -4.441 0.000119 ***
#> cyl_factor8 -11.5636     1.2986  -8.905 8.57e-10 ***
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> Residual standard error: 3.223 on 29 degrees of freedom
#> Multiple R-squared:  0.7325, Adjusted R-squared:  0.714 
#> F-statistic:  39.7 on 2 and 29 DF,  p-value: 4.979e-09

# For ANOVA, must be factor
anova_model <- aov(mpg ~ factor(cyl), data = mtcars)
summary(anova_model)
#>             Df Sum Sq Mean Sq F value   Pr(>F)    
#> factor(cyl)  2  824.8   412.4    39.7 4.98e-09 ***
#> Residuals   29  301.3    10.4                     
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

31.7 Predictions

🎯 Best Practice: Making Predictions

model <- lm(mpg ~ hp + wt, data = mtcars)

# Predict on original data
predictions <- predict(model)
head(predictions)
#>         Mazda RX4     Mazda RX4 Wag        Datsun 710    Hornet 4 Drive 
#>          23.57233          22.58348          25.27582          21.26502 
#> Hornet Sportabout           Valiant 
#>          18.32727          20.47382

# Predict on new data
new_data <- data.frame(
  hp = c(100, 150, 200),
  wt = c(2.5, 3.0, 3.5)
)

predict(model, newdata = new_data)
#>        1        2        3 
#> 24.35540 20.82784 17.30027

# With confidence intervals
predict(model, newdata = new_data, interval = "confidence")
#>        fit      lwr      upr
#> 1 24.35540 23.15968 25.55111
#> 2 20.82784 19.83556 21.82012
#> 3 17.30027 16.07235 18.52820

# With prediction intervals
predict(model, newdata = new_data, interval = "prediction")
#>        fit      lwr      upr
#> 1 24.35540 18.91817 29.79263
#> 2 20.82784 15.43169 26.22398
#> 3 17.30027 11.85587 22.74468

31.8 Error #3: variable lengths differ

⭐ BEGINNER 📊 DATA

31.8.1 The Error

# Mismatched lengths
x <- 1:10
y <- 1:5
lm(y ~ x)
#> Error in model.frame.default(formula = y ~ x, drop.unused.levels = TRUE): variable lengths differ (found for 'x')

🔴 ERROR

Error in model.frame.default: variable lengths differ (found for 'x')

31.8.2 Solutions

SOLUTION: Check Data Lengths

# Check before modeling
x <- 1:10
y <- 1:10  # Fixed

if (length(x) != length(y)) {
  stop("Variables must have same length")
}

model <- lm(y ~ x)

31.9 Model Comparison

🎯 Best Practice: Compare Models

# Nested models
model1 <- lm(mpg ~ hp, data = mtcars)
model2 <- lm(mpg ~ hp + wt, data = mtcars)
model3 <- lm(mpg ~ hp + wt + cyl, data = mtcars)

# ANOVA comparison
anova(model1, model2, model3)
#> Analysis of Variance Table
#> 
#> Model 1: mpg ~ hp
#> Model 2: mpg ~ hp + wt
#> Model 3: mpg ~ hp + wt + cyl
#>   Res.Df    RSS Df Sum of Sq       F    Pr(>F)    
#> 1     30 447.67                                   
#> 2     29 195.05  1   252.627 40.0494 7.597e-07 ***
#> 3     28 176.62  1    18.427  2.9213   0.09848 .  
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# AIC/BIC
AIC(model1, model2, model3)
#>        df      AIC
#> model1  3 181.2386
#> model2  4 156.6523
#> model3  5 155.4766
BIC(model1, model2, model3)
#>        df      BIC
#> model1  3 185.6358
#> model2  4 162.5153
#> model3  5 162.8053

# R-squared
summary(model1)$r.squared
#> [1] 0.6024373
summary(model2)$r.squared
#> [1] 0.8267855
summary(model3)$r.squared
#> [1] 0.84315

# Adjusted R-squared (accounts for # predictors)
summary(model1)$adj.r.squared
#> [1] 0.5891853
summary(model2)$adj.r.squared
#> [1] 0.8148396
summary(model3)$adj.r.squared
#> [1] 0.8263446

31.10 Summary

Key Takeaways:

  1. Handle NAs explicitly - Check before modeling
  2. Factors for ANOVA - Convert numeric to factor
  3. Check diagnostics - Assumptions matter
  4. Compare models - Use ANOVA, AIC, BIC
  5. Report properly - Coefficients, p-values, R²
  6. Predict carefully - Match variable names

Quick Reference:

# Linear regression
model <- lm(y ~ x, data = df)
model <- lm(y ~ x1 + x2, data = df)
model <- lm(y ~ x1 * x2, data = df)  # Interaction

# ANOVA
model <- aov(y ~ factor(group), data = df)
TukeyHSD(model)

# Diagnostics
plot(model)
summary(model)

# Predictions
predict(model, newdata = new_df)
predict(model, interval = "confidence")