STOR 390
library(tidyverse)
hour <- read_csv('https://raw.githubusercontent.com/idc9/stor390/master/data/bikes_2011.csv')
head(hour)
## # A tibble: 6 × 17
## instant dteday season yr mnth hr holiday weekday workingday
## <int> <date> <int> <int> <int> <int> <int> <int> <int>
## 1 1 2011-01-01 1 0 1 0 0 6 0
## 2 2 2011-01-01 1 0 1 1 0 6 0
## 3 3 2011-01-01 1 0 1 2 0 6 0
## 4 4 2011-01-01 1 0 1 3 0 6 0
## 5 5 2011-01-01 1 0 1 4 0 6 0
## 6 6 2011-01-01 1 0 1 5 0 6 0
## # ... with 8 more variables: weathersit <int>, temp <dbl>, atemp <dbl>,
## # hum <dbl>, windspeed <dbl>, casual <int>, registered <int>, cnt <int>
ggplot(hour) +
geom_histogram(aes(x=cnt))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(hour) +
geom_point(aes(x=temp, y=cnt))
ggplot(hour) +
geom_point(aes(x=hr, y=cnt))
hour <- hour %>%
select(cnt, hr)
hour %>%
group_by(hr) %>%
summarise(mean_cnt=mean(cnt)) %>%
ggplot() +
geom_line(aes(x=hr, y=mean_cnt)) +
geom_point(aes(x=hr, y=mean_cnt))
predict the number of riders based only on the time of day
hr
to cnt
You should always start with linear regression!
ggplot(hour) +
geom_point(aes(x=hr, y=cnt)) +
geom_smooth(aes(x=hr, y=cnt), color='red', method=lm, se=FALSE)
linear_model <- lm(cnt ~ hr, hour)
# put the actual and predicted counts in a data frame
results <- tibble(cnt_actual = hour$cnt,
cnt_pred=linear_model$fitted.values)
results %>%
mutate(resid=cnt_actual - cnt_pred) %>%
mutate(resid_sq = resid^2) %>%
summarise(MSE=mean(resid_sq))
## # A tibble: 1 × 1
## MSE
## <dbl>
## 1 14927.62
a statistical model describes random error or noise instead of the underlying relationship.
(from wikipedia)
A model that has been overfit has poor predictive performance, as it overreacts to minor fluctuations in the training data.
Each observation can either be used for exploration or confirmation.
Each observation can either be used for building a model or evaluating a model, not both.
# there are n observations
n <- dim(hour)[1]
# number of observations that go in the training st
n_tr <- floor(n * .8)
# randomly select n_tr numbers, without replacement, from 1...n
tr_indices <- sample(x=1:n, size=n_tr, replace=FALSE)
# break the data into a non-overlapping train and test set
train <- hour[tr_indices, ]
test <- train[-tr_indices, ]
# manually add hr^2 to the data matrix
train_square <- mutate(train, hr_sq = hr^2)
model_square <- lm(cnt ~ hr + hr_sq, train_square)
# there is a better way to do this using R's modeling language
model_square <- lm(cnt ~ hr + I(hr^2), train)
# largest degree polynomial to try
d_max <- 20
# lets save each model we fit in a list
models <- list()
# also store the traing error in data frame
error <- tibble(degree=1:d_max,
MSE_tr = rep(0, d_max))
# fit all the models
for(d in 1:d_max){
# the poly function does exactly what you think it does
models[[d]] <- lm(cnt ~ poly(hr, d), train)
# compute the MSE for the training data
mse_tr <- mean(models[[d]]$residuals^2)
# save the MSE
error[d, 'MSE_tr'] <- mse_tr
}
error
## # A tibble: 20 × 2
## degree MSE_tr
## <int> <dbl>
## 1 1 15025.065
## 2 2 12481.567
## 3 3 11248.611
## 4 4 11240.641
## 5 5 11022.255
## 6 6 10662.835
## 7 7 9980.930
## 8 8 9830.812
## 9 9 9673.942
## 10 10 9673.299
## 11 11 9552.098
## 12 12 9536.471
## 13 13 9060.490
## 14 14 9058.416
## 15 15 8835.333
## 16 16 8831.749
## 17 17 8831.535
## 18 18 8792.392
## 19 19 8745.028
## 20 20 8710.972
# plot the training error
ggplot(error)+
geom_point(aes(x=degree, y=MSE_tr)) +
geom_line(aes(x=degree, y=MSE_tr))
error %>%
rename(tr=MSE_tr, tst=MSE_tst) %>%
gather(key=type, value=error, tr, tst) %>%
ggplot() +
geom_point(aes(x=degree, y=log10(error), color=type)) +
geom_line(aes(x=degree, y=log10(error), color=type))