Bias-Variance Trade-off

Load and Inspect Data

library(palmerpenguins)
penguins<-penguins %>% drop_na()

Let's compare flipper length and body mass

ggplot( penguins, aes( x = flipper_length_mm, y= body_mass_g))+geom_point()

Build Models

Let's find the best polynomial to fit this data

## Linear

my_linear_model<- lm(body_mass_g ~ flipper_length_mm, data = penguins)

summary(my_linear_model)
## 
## Call:
## lm(formula = body_mass_g ~ flipper_length_mm, data = penguins)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1057.33  -259.79   -12.24   242.97  1293.89 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       -5872.09     310.29  -18.93   <2e-16 ***
## flipper_length_mm    50.15       1.54   32.56   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 393.3 on 331 degrees of freedom
## Multiple R-squared:  0.7621, Adjusted R-squared:  0.7614 
## F-statistic:  1060 on 1 and 331 DF,  p-value: < 2.2e-16
## Quadratic

my_quad_model<- lm(body_mass_g ~ poly(flipper_length_mm, 2), data = penguins)

summary(my_quad_model)
## 
## Call:
## lm(formula = body_mass_g ~ poly(flipper_length_mm, 2), data = penguins)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1018.94  -260.91   -23.13   226.87  1197.35 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                  4207.06      20.85 201.775  < 2e-16 ***
## poly(flipper_length_mm, 2)1 12808.11     380.48  33.663  < 2e-16 ***
## poly(flipper_length_mm, 2)2  1854.47     380.48   4.874  1.7e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 380.5 on 330 degrees of freedom
## Multiple R-squared:  0.7781, Adjusted R-squared:  0.7767 
## F-statistic: 578.5 on 2 and 330 DF,  p-value: < 2.2e-16

Build model for a variety of polynomial degree

n<-10 ##max degree

my_models<-list() #empty list

for (i in 1:n){
  my_models[[i]]<-lm( body_mass_g ~ poly(flipper_length_mm, i ), data= penguins)
}

What is adjusted and unadjusted R-squared?

R_sq <- data.frame() #empty data frame

for (i in 1:n){
  R_sq <-
    rbind(R_sq, 
          data.frame(degree = i,
            r.squared = summary( my_models[[i]])$r.squared, 
                     adj.r.squared = summary( my_models[[i]])$adj.r.squared) )
}

Plot R_sq

## R squared is Red
## Adjusted R squared is Black
ggplot(R_sq, aes(x = degree))+geom_line(aes(y = r.squared), color = "red") +
  geom_line(aes(y = adj.r.squared))

Validate Models

## Create glm instead
n<-10 ##max degree

glm_models<-list() #empty list

for (i in 1:n){
  glm_models[[i]]<-glm( body_mass_g ~ poly(flipper_length_mm, i ), data= penguins)
}

Get some errors

library(boot)
my_error<-c()

for (i in 1:n){
  my_error[i]<-cv.glm(penguins, glm_models[[i]])$delta[1]
}

Look at errors

data.frame(my_error, N = 1:n) %>% 
  ggplot( aes( x = N , y = my_error))+geom_line()

Example of Rbind

dd<-data.frame(X = 1:10, Y = 11:20)

dd2<-data.frame(X = 11:20, Y = 21:30)

my_big_data_frame<-rbind(dd, dd2)