Cross validation

STOR 390

Tuning parameters

Predictive classification modeling

Overfitting

Independent and identically distributed

Synthetic data

Random seed

No seed

sample(1:100000, 5)
## [1] 48428 99125 83554 43894 63659
sample(1:100000, 5)
## [1] 47913 71315 85607  3417 99655

Set seed

set.seed(3443)
sample(1:100000, 5)
## [1]  2218 97756 27628 79129 70252
set.seed(3443)
sample(1:100000, 5)
## [1]  2218 97756 27628 79129 70252

Lecture code

Get the code from GitHub: https://github.com/idc9/stor390/tree/master/notes/cross_validation

Train/test data

# package to sample from the multivariate Gaussian distribution
library(mvtnorm)
library(flexclust)
library(class)
library(tidyverse)
library(stringr)

# some helper functions I wrote for this script
# you can find this file in the same folder as the .Rmd document
source('knn_functions.R')
source('synthetic_distributions.R')

Synthetic data

# the mixture means should be the same for both training and test sets
mean_seed <- 238

# draw train and test data
data <- gmm_distribution2d(n_neg=200, n_pos=201, mean_seed=mean_seed, data_seed=1232)
test_data <- gmm_distribution2d(n_neg=1000, n_pos=1000, mean_seed=mean_seed, data_seed=52345)
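
gmm_distribution2d comes from synthetic_distributions.R: mean_seed fixes the mixture means (so train and test share the same distribution) while data_seed controls the draws. A minimal sketch of such a generator, using mvtnorm's rmvnorm; the mixture parameters and column names below are assumptions, not the real helper's:

# sketch of a gmm_distribution2d-style generator -- the mixture
# parameters and column names are assumptions, not the real helper's
gmm_sketch <- function(n_neg, n_pos, mean_seed, data_seed){
    # mean_seed fixes the class means so train/test share a distribution
    set.seed(mean_seed)
    mu_neg <- as.numeric(rmvnorm(1, mean=c(-1, 0)))
    mu_pos <- as.numeric(rmvnorm(1, mean=c(1, 0)))

    # data_seed controls the observations actually drawn
    set.seed(data_seed)
    x_neg <- rmvnorm(n_neg, mean=mu_neg)
    x_pos <- rmvnorm(n_pos, mean=mu_pos)

    # stack both classes into one data frame with labels
    tibble(x1=c(x_neg[, 1], x_pos[, 1]),
           x2=c(x_neg[, 2], x_pos[, 2]),
           y=factor(c(rep(-1, n_neg), rep(1, n_pos))))
}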

Training data

KNN, K = 5
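
This slide's figure shows the K = 5 fit. A sketch of the underlying call to class::knn, assuming the synthetic data frames have feature columns x1, x2 and a label column y:

# classify each test point by its 5 nearest training neighbors
# (column names x1, x2, y are assumptions about the helper's output)
pred_k5 <- knn(train=select(data, x1, x2),
               test=select(test_data, x1, x2),
               cl=data$y,
               k=5)

# test error rate for K = 5
mean(pred_k5 != test_data$y)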

KNN for a range of K

Plots of the KNN fit for K = 1, 3, 5, 9, 17, 33, 65, 399, and 401 (figures omitted).

Takeaways

Tuning curves

Takeaways

Best value of K for test vs. train
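
error_df below holds the train (tr) and test (tst) error for each value of k. A sketch of how it could be constructed, assuming get_knn_error_rates from knn_functions.R returns a list with 'tr' and 'tst' error rates (the exact grid of k values is an assumption):

# a grid of odd values of k (assumption: the real grid differs slightly)
k_values <- seq(from=1, to=401, by=2)
num_k <- length(k_values)

# compute the train/test error for each value of k
error_df <- tibble(k=k_values, tr=rep(NA, num_k), tst=rep(NA, num_k))
for(i in 1:num_k){
    errs <- get_knn_error_rates(data, test_data, k_values[i])
    error_df[i, 'tr'] <- errs[['tr']]
    error_df[i, 'tst'] <- errs[['tst']]
}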

# minimum training error
error_df %>% 
    filter(tr==min(tr))
## # A tibble: 1 × 3
##       k    tr    tst
##   <dbl> <dbl>  <dbl>
## 1     1     0 0.0745
# minimum test error 
error_df %>% 
    filter(tst==min(tst))
## # A tibble: 1 × 3
##       k         tr   tst
##   <dbl>      <dbl> <dbl>
## 1    25 0.04738155 0.054

Well that sucks…

The k that gives the best training error is not the same as the k that gives the best test error.

Validation set

Validation pseudocode

  1. Randomly split the original training data set into a new training set and a validation set (maybe an 80/20 split)

  2. For each value of k we are interested in
    1. fit the model on the smaller training set
    2. compute the test error on the validation set
  3. Select the value of k that performs the best on the validation set (call it k*)

  4. Retrain the model with k=k* using the full training data

Validation set (code)

# split the original data into a train/validation set

# set the seed to sample the validation set
set.seed(345)

# number of observations
n <- dim(data)[1]

# number of observations that go in the training set
n_tr <- floor(n * .6)

# randomly select n_tr numbers, without replacement, from 1...n
tr_indices <- sample(x=1:n, size=n_tr, replace=FALSE)

# break the data into non-overlapping training and validation sets
train <- data[tr_indices, ]
validation <- data[-tr_indices, ]

Compute validation error

Compute error (code)

# only try values of k smaller than the number of training points
# (n_tr = 240 here, so k < 200 is a safe cutoff)
k_values <- k_values[k_values < 200]

# number of k values to check
num_k <- length(k_values)

# initialize data frame to save error rates in
error_df <- error_df %>% 
                add_column(valid=rep(NA, dim(error_df)[1])) %>% 
                filter(k < 200)

# compute the validation error for each value of k
for(i in 1:num_k){
    
    # fix k for this loop iteration
    k <- k_values[i]

    # compute the test error on the validation set
    errs <- get_knn_error_rates(train, validation, k)
    
    # store values in the data frame
    error_df[i, 'valid'] <- errs[['tst']]
}

Validation error

## # A tibble: 53 × 4
##        k         tr    tst      valid
##    <dbl>      <dbl>  <dbl>      <dbl>
## 1      3 0.03740648 0.0700 0.06832298
## 2      7 0.03740648 0.0625 0.04347826
## 3      9 0.03740648 0.0605 0.04347826
## 4      1 0.00000000 0.0745 0.06832298
## 5      5 0.03491272 0.0695 0.04968944
## 6      9 0.03740648 0.0605 0.04347826
## 7     13 0.03990025 0.0580 0.06211180
## 8     17 0.03990025 0.0590 0.06211180
## 9     21 0.04488778 0.0560 0.05590062
## 10    25 0.04738155 0.0540 0.05590062
## # ... with 43 more rows

Compare error curves

Pick k based on the validation error

# k giving the smallest validation error
error_df %>% 
    filter(valid==min(valid))
## # A tibble: 8 × 4
##       k         tr    tst      valid
##   <dbl>      <dbl>  <dbl>      <dbl>
## 1     7 0.03740648 0.0625 0.04347826
## 2     9 0.03740648 0.0605 0.04347826
## 3     9 0.03740648 0.0605 0.04347826
## 4    73 0.06234414 0.0620 0.04347826
## 5    77 0.06234414 0.0615 0.04347826
## 6    81 0.06234414 0.0620 0.04347826
## 7    85 0.06483791 0.0610 0.04347826
## 8    89 0.06483791 0.0605 0.04347826

Compare error curves

Cross-validation

Cross-validation pseudocode

  1. Repeat the following M times
    1. randomly split the original training data into a cv-training set and a cv-test set
    2. for each value of k, fit the model on the cv-training set and compute the error on the cv-test set

  2. We now have a k x M matrix of cv-errors. For each value of the tuning parameter k, compute the average cv-error across the M folds.

  3. Select the value of k with the best cross-validation error (call it k*).

  4. Retrain the model with k=k* using the full training data.

Many variants

Cross-validation code

# number of folds
M <- 5

# initialize a data frame to hold the cv error for each k and each fold
cv_error_df <- as_tibble(matrix(NA_real_, nrow=num_k, ncol=M,
                                dimnames=list(NULL, paste0('fold', 1:M)))) %>% 
                    add_column(k=k_values)

# for each of the M folds
for(m in 1:M){
    
    # number of points that go in the cv-training set
    n_cv_tr <- floor(n * (M-1)/M)
    
    # randomly select n_cv_tr numbers, without replacement, from 1...n
    # (each fold is sampled independently, so this is a randomized variant
    # of M-fold cv rather than a partition into disjoint folds)
    cv_tr_indices <- sample(x=1:n, size=n_cv_tr, replace=FALSE)

    # break the data into non-overlapping cv-train and cv-test sets
    cv_tr <- data[cv_tr_indices, ]
    cv_tst <- data[-cv_tr_indices, ]
    
    # for each value of k we are interested in
    for(i in 1:num_k){
        
        # fix k for this loop iteration
        k <- k_values[i]

        # compute the error on the cv-test set
        errs <- get_knn_error_rates(cv_tr, cv_tst, k)
    
        # store the error in the data frame
        cv_error_df[i, paste0('fold', m)] <- errs[['tst']]
    }
}

Cross-validation error

## # A tibble: 53 × 6
##         fold1      fold2      fold3      fold4      fold5     k
##         <dbl>      <dbl>      <dbl>      <dbl>      <dbl> <dbl>
## 1  0.08641975 0.06172840 0.02469136 0.02469136 0.04938272     3
## 2  0.07407407 0.08641975 0.03703704 0.02469136 0.06172840     7
## 3  0.06172840 0.08641975 0.03703704 0.02469136 0.06172840     9
## 4  0.12345679 0.08641975 0.02469136 0.02469136 0.09876543     1
## 5  0.07407407 0.07407407 0.04938272 0.01234568 0.06172840     5
## 6  0.06172840 0.08641975 0.03703704 0.02469136 0.06172840     9
## 7  0.06172840 0.08641975 0.03703704 0.02469136 0.04938272    13
## 8  0.06172840 0.09876543 0.03703704 0.03703704 0.06172840    17
## 9  0.06172840 0.11111111 0.03703704 0.04938272 0.04938272    21
## 10 0.06172840 0.11111111 0.06172840 0.04938272 0.06172840    25
## # ... with 43 more rows

Cross-validation error

Mean cv error
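
The plotted mean cv error, and the cv column that appears in error_df below, come from averaging each row of cv_error_df across its fold columns. A sketch; it assumes the rows of cv_error_df line up with error_df, which holds since both follow k_values:

# average the cv error across the M folds for each value of k
error_df <- error_df %>% 
    mutate(cv=rowMeans(select(cv_error_df, starts_with('fold'))))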

What is the best value of K?

# minimum training error
error_df %>% 
    filter(tr==min(tr))
## # A tibble: 1 × 5
##       k    tr    tst      valid         cv
##   <dbl> <dbl>  <dbl>      <dbl>      <dbl>
## 1     1     0 0.0745 0.06832298 0.07160494
# minimum validation error 
error_df %>% 
    filter(valid==min(valid))
## # A tibble: 8 × 5
##       k         tr    tst      valid         cv
##   <dbl>      <dbl>  <dbl>      <dbl>      <dbl>
## 1     7 0.03740648 0.0625 0.04347826 0.05679012
## 2     9 0.03740648 0.0605 0.04347826 0.05432099
## 3     9 0.03740648 0.0605 0.04347826 0.05432099
## 4    73 0.06234414 0.0620 0.04347826 0.07654321
## 5    77 0.06234414 0.0615 0.04347826 0.07407407
## 6    81 0.06234414 0.0620 0.04347826 0.07407407
## 7    85 0.06483791 0.0610 0.04347826 0.07901235
## 8    89 0.06483791 0.0605 0.04347826 0.08148148
# minimum cv error 
error_df %>% 
    filter(cv==min(cv))
## # A tibble: 1 × 5
##       k         tr   tst      valid         cv
##   <dbl>      <dbl> <dbl>      <dbl>      <dbl>
## 1     3 0.03740648  0.07 0.06832298 0.04938272
# minimum test error 
error_df %>% 
    filter(tst==min(tst))
## # A tibble: 1 × 5
##       k         tr   tst      valid        cv
##   <dbl>      <dbl> <dbl>      <dbl>     <dbl>
## 1    25 0.04738155 0.054 0.05590062 0.0691358
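
The final step of the pseudocode, refitting with the selected k on the full training data, might look like this (a sketch using the helper from knn_functions.R):

# k selected by cross-validation
k_cv <- error_df %>% 
    filter(cv == min(cv)) %>% 
    pull(k)

# refit on the full training data and evaluate on the held-out test set
get_knn_error_rates(data, test_data, k_cv)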

Human Activity Recognition

Training data
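
The loading code isn't shown on this slide; presumably a readr call along these lines produced the parsing messages below (the file name is a placeholder):

# read the HAR training data (file name is a placeholder)
train <- read_csv('har_train.csv')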

## Parsed with column specification:
## cols(
##   .default = col_double(),
##   activity = col_integer()
## )
## See spec(...) for full column specifications.
train
## # A tibble: 4,252 × 562
##    `tBodyAcc-mean()-X` `tBodyAcc-mean()-Y` `tBodyAcc-mean()-Z`
##                  <dbl>               <dbl>               <dbl>
## 1              2.25890          -0.0223760          -0.6515100
## 2              0.29285          -0.0184040          -0.1784800
## 3              0.17396           0.0084363          -0.0511250
## 4              0.38971          -0.0261920          -0.0120770
## 5              0.35382          -0.0180210          -0.0035395
## 6              0.24777          -0.0035040          -0.1382600
## 7              0.28630          -0.0242190          -0.1572000
## 8              0.21413          -0.0036803          -0.1639500
## 9              0.21328          -0.0064019          -0.1930100
## 10             0.25917          -0.0290970          -0.1730600
## # ... with 4,242 more rows, and 559 more variables:
## #   `tBodyAcc-std()-X` <dbl>, `tBodyAcc-std()-Y` <dbl>,
## #   `tBodyAcc-std()-Z` <dbl>, `tBodyAcc-mad()-X` <dbl>,
## #   `tBodyAcc-mad()-Y` <dbl>, `tBodyAcc-mad()-Z` <dbl>,
## #   `tBodyAcc-max()-X` <dbl>, `tBodyAcc-max()-Y` <dbl>,
## #   `tBodyAcc-max()-Z` <dbl>, `tBodyAcc-min()-X` <dbl>,
## #   `tBodyAcc-min()-Y` <dbl>, `tBodyAcc-min()-Z` <dbl>,
## #   `tBodyAcc-sma()` <dbl>, `tBodyAcc-energy()-X` <dbl>,
## #   `tBodyAcc-energy()-Y` <dbl>, `tBodyAcc-energy()-Z` <dbl>,
## #   `tBodyAcc-iqr()-X` <dbl>, `tBodyAcc-iqr()-Y` <dbl>,
## #   `tBodyAcc-iqr()-Z` <dbl>, `tBodyAcc-entropy()-X` <dbl>,
## #   `tBodyAcc-entropy()-Y` <dbl>, `tBodyAcc-entropy()-Z` <dbl>,
## #   `tBodyAcc-arCoeff()-X,1` <dbl>, `tBodyAcc-arCoeff()-X,2` <dbl>,
## #   `tBodyAcc-arCoeff()-X,3` <dbl>, `tBodyAcc-arCoeff()-X,4` <dbl>,
## #   `tBodyAcc-arCoeff()-Y,1` <dbl>, `tBodyAcc-arCoeff()-Y,2` <dbl>,
## #   `tBodyAcc-arCoeff()-Y,3` <dbl>, `tBodyAcc-arCoeff()-Y,4` <dbl>,
## #   `tBodyAcc-arCoeff()-Z,1` <dbl>, `tBodyAcc-arCoeff()-Z,2` <dbl>,
## #   `tBodyAcc-arCoeff()-Z,3` <dbl>, `tBodyAcc-arCoeff()-Z,4` <dbl>,
## #   `tBodyAcc-correlation()-X,Y` <dbl>,
## #   `tBodyAcc-correlation()-X,Z` <dbl>,
## #   `tBodyAcc-correlation()-Y,Z` <dbl>, `tGravityAcc-mean()-X` <dbl>,
## #   `tGravityAcc-mean()-Y` <dbl>, `tGravityAcc-mean()-Z` <dbl>,
## #   `tGravityAcc-std()-X` <dbl>, `tGravityAcc-std()-Y` <dbl>,
## #   `tGravityAcc-std()-Z` <dbl>, `tGravityAcc-mad()-X` <dbl>,
## #   `tGravityAcc-mad()-Y` <dbl>, `tGravityAcc-mad()-Z` <dbl>,
## #   `tGravityAcc-max()-X` <dbl>, `tGravityAcc-max()-Y` <dbl>,
## #   `tGravityAcc-max()-Z` <dbl>, `tGravityAcc-min()-X` <dbl>,
## #   `tGravityAcc-min()-Y` <dbl>, `tGravityAcc-min()-Z` <dbl>,
## #   `tGravityAcc-sma()` <dbl>, `tGravityAcc-energy()-X` <dbl>,
## #   `tGravityAcc-energy()-Y` <dbl>, `tGravityAcc-energy()-Z` <dbl>,
## #   `tGravityAcc-iqr()-X` <dbl>, `tGravityAcc-iqr()-Y` <dbl>,
## #   `tGravityAcc-iqr()-Z` <dbl>, `tGravityAcc-entropy()-X` <dbl>,
## #   `tGravityAcc-entropy()-Y` <dbl>, `tGravityAcc-entropy()-Z` <dbl>,
## #   `tGravityAcc-arCoeff()-X,1` <dbl>, `tGravityAcc-arCoeff()-X,2` <dbl>,
## #   `tGravityAcc-arCoeff()-X,3` <dbl>, `tGravityAcc-arCoeff()-X,4` <dbl>,
## #   `tGravityAcc-arCoeff()-Y,1` <dbl>, `tGravityAcc-arCoeff()-Y,2` <dbl>,
## #   `tGravityAcc-arCoeff()-Y,3` <dbl>, `tGravityAcc-arCoeff()-Y,4` <dbl>,
## #   `tGravityAcc-arCoeff()-Z,1` <dbl>, `tGravityAcc-arCoeff()-Z,2` <dbl>,
## #   `tGravityAcc-arCoeff()-Z,3` <dbl>, `tGravityAcc-arCoeff()-Z,4` <dbl>,
## #   `tGravityAcc-correlation()-X,Y` <dbl>,
## #   `tGravityAcc-correlation()-X,Z` <dbl>,
## #   `tGravityAcc-correlation()-Y,Z` <dbl>, `tBodyAccJerk-mean()-X` <dbl>,
## #   `tBodyAccJerk-mean()-Y` <dbl>, `tBodyAccJerk-mean()-Z` <dbl>,
## #   `tBodyAccJerk-std()-X` <dbl>, `tBodyAccJerk-std()-Y` <dbl>,
## #   `tBodyAccJerk-std()-Z` <dbl>, `tBodyAccJerk-mad()-X` <dbl>,
## #   `tBodyAccJerk-mad()-Y` <dbl>, `tBodyAccJerk-mad()-Z` <dbl>,
## #   `tBodyAccJerk-max()-X` <dbl>, `tBodyAccJerk-max()-Y` <dbl>,
## #   `tBodyAccJerk-max()-Z` <dbl>, `tBodyAccJerk-min()-X` <dbl>,
## #   `tBodyAccJerk-min()-Y` <dbl>, `tBodyAccJerk-min()-Z` <dbl>,
## #   `tBodyAccJerk-sma()` <dbl>, `tBodyAccJerk-energy()-X` <dbl>,
## #   `tBodyAccJerk-energy()-Y` <dbl>, `tBodyAccJerk-energy()-Z` <dbl>,
## #   `tBodyAccJerk-iqr()-X` <dbl>, `tBodyAccJerk-iqr()-Y` <dbl>,
## #   `tBodyAccJerk-iqr()-Z` <dbl>, `tBodyAccJerk-entropy()-X` <dbl>, ...

KNN training predictions

train_x <- train %>% select(-activity)
train_y <- train$activity # turn into a vector

KNN train predictions

train_predictions <- knn(train=train_x, # training x
                          test=train_x, # test x
                          cl=train_y, # train y
                          k=5)

train_predictions[1:5]
## [1] 2 2 2 2 2
## Levels: 2 3
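
A natural follow-up is to turn these predictions into a training error rate (a sketch):

# training error rate: fraction of points where 5-NN disagrees with the label
# (!= compares the predicted factor to the integer labels by level)
mean(train_predictions != train_y)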