library(palmerpenguins)
penguins<-penguins %>% drop_na()
Let's compare flipper length and body mass
ggplot( penguins, aes( x = flipper_length_mm, y= body_mass_g))+geom_point()
Let's find the best polynomial to fit this data
## Linear
my_linear_model<- lm(body_mass_g ~ flipper_length_mm, data = penguins)
summary(my_linear_model)
##
## Call:
## lm(formula = body_mass_g ~ flipper_length_mm, data = penguins)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1057.33 -259.79 -12.24 242.97 1293.89
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5872.09 310.29 -18.93 <2e-16 ***
## flipper_length_mm 50.15 1.54 32.56 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 393.3 on 331 degrees of freedom
## Multiple R-squared: 0.7621, Adjusted R-squared: 0.7614
## F-statistic: 1060 on 1 and 331 DF, p-value: < 2.2e-16
## Quadratic
my_quad_model<- lm(body_mass_g ~ poly(flipper_length_mm, 2), data = penguins)
summary(my_quad_model)
##
## Call:
## lm(formula = body_mass_g ~ poly(flipper_length_mm, 2), data = penguins)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1018.94 -260.91 -23.13 226.87 1197.35
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4207.06 20.85 201.775 < 2e-16 ***
## poly(flipper_length_mm, 2)1 12808.11 380.48 33.663 < 2e-16 ***
## poly(flipper_length_mm, 2)2 1854.47 380.48 4.874 1.7e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 380.5 on 330 degrees of freedom
## Multiple R-squared: 0.7781, Adjusted R-squared: 0.7767
## F-statistic: 578.5 on 2 and 330 DF, p-value: < 2.2e-16
Build model for a variety of polynomial degree
n<-10 ##max degree
my_models<-list() #empty list
for (i in 1:n){
my_models[[i]]<-lm( body_mass_g ~ poly(flipper_length_mm, i ), data= penguins)
}
What is adjusted and unadjusted R-squared?
R_sq <- data.frame() #empty data frame
for (i in 1:n){
R_sq <-
rbind(R_sq,
data.frame(degree = i,
r.squared = summary( my_models[[i]])$r.squared,
adj.r.squared = summary( my_models[[i]])$adj.r.squared) )
}
Plot R_sq
## R squared is Red
## Adjusted R squared is Black
ggplot(R_sq, aes(x = degree))+geom_line(aes(y = r.squared), color = "red") +
geom_line(aes(y = adj.r.squared))
## Create glm instead
n<-10 ##max degree
glm_models<-list() #empty list
for (i in 1:n){
glm_models[[i]]<-glm( body_mass_g ~ poly(flipper_length_mm, i ), data= penguins)
}
Get some errors
library(boot)
my_error<-c()
for (i in 1:n){
my_error[i]<-cv.glm(penguins, glm_models[[i]])$delta[1]
}
Look at errors
data.frame(my_error, N = 1:n) %>%
ggplot( aes( x = N , y = my_error))+geom_line()
dd<-data.frame(X = 1:10, Y = 11:20)
dd2<-data.frame(X = 11:20, Y = 21:30)
my_big_data_frame<-rbind(dd, dd2)