STOR 390
# package to sample from the multivariate gaussian distribution
library(mvtnorm)
# calculate distances between points in a data frame
library(flexclust)
# for knn
library(class)
library(tidyverse)
# some helper functions I wrote for this script
# you can find this file in the same folder as the .Rmd document
source('helper_functions.R')
Given a new test point \(\mathbf{\tilde{x}}\), compute the distance between \(\mathbf{\tilde{x}}\) and each class mean.
# a single test point to classify
x_test <- c(1, 1)

# observed mean of each class: group the training data by the label y,
# then average every remaining column within each group
obs_means <- summarise_all(group_by(data_gauss, y), mean)
obs_means
## # A tibble: 2 × 3
## y x1 x2
## <fctr> <dbl> <dbl>
## 1 -1 -1.151583 0.12338380
## 2 1 1.147330 -0.09823948
# pull out each class mean, dropping the label column
mean_pos <- obs_means %>% filter(y == 1) %>% select(-y)
mean_neg <- obs_means %>% filter(y == -1) %>% select(-y)

# euclidean distance from the test point to each class mean
dist_pos <- sqrt(sum((x_test - mean_pos)^2))
dist_neg <- sqrt(sum((x_test - mean_neg)^2))
dist_pos
## [1] 1.108078
dist_neg
## [1] 2.323309
Make a test grid
# make a 100 x 100 grid of test points covering [-4, 4] x [-4, 4]
# note: use length.out explicitly — `length = 100` only worked via
# partial argument matching, which is fragile and flagged by lintr
test_grid <- expand.grid(x1 = seq(-4, 4, length.out = 100),
                         x2 = seq(-4, 4, length.out = 100)) %>%
  as_tibble()
# inspect the grid: 10,000 rows = 100 x1 values crossed with 100 x2 values
test_grid
## # A tibble: 10,000 × 2
## x1 x2
## <dbl> <dbl>
## 1 -4.000000 -4
## 2 -3.919192 -4
## 3 -3.838384 -4
## 4 -3.757576 -4
## 5 -3.676768 -4
## 6 -3.595960 -4
## 7 -3.515152 -4
## 8 -3.434343 -4
## 9 -3.353535 -4
## 10 -3.272727 -4
## # ... with 9,990 more rows
# compute the distance from each test point to the two class means.
# this is fully vectorized over the grid columns — apply() on a data
# frame coerces it to a matrix and loops in R row by row, which is both
# slower and an easy way to lose column types
dist_pos <- sqrt((test_grid$x1 - mean_pos$x1)^2 + (test_grid$x2 - mean_pos$x2)^2)
dist_neg <- sqrt((test_grid$x1 - mean_neg$x1)^2 + (test_grid$x2 - mean_neg$x2)^2)

# add distance columns to the test grid data frame
test_grid <- test_grid %>%
  add_column(dist_pos = dist_pos,
             dist_neg = dist_neg)

# predict the class whose mean each test point is closest to
test_grid <- test_grid %>%
  mutate(y_pred = ifelse(dist_pos < dist_neg, 1, -1)) %>%
  mutate(y_pred = factor(y_pred))
# inspect the augmented grid: distances to each class mean plus the
# predicted label for every grid point
test_grid
## # A tibble: 10,000 × 5
## x1 x2 dist_pos dist_neg y_pred
## <dbl> <dbl> <dbl> <dbl> <fctr>
## 1 -4.000000 -4 6.459005 5.011564 -1
## 2 -3.919192 -4 6.394793 4.966080 -1
## 3 -3.838384 -4 6.330962 4.921503 -1
## 4 -3.757576 -4 6.267522 4.877857 -1
## 5 -3.676768 -4 6.204487 4.835168 -1
## 6 -3.595960 -4 6.141867 4.793461 -1
## 7 -3.515152 -4 6.079677 4.752762 -1
## 8 -3.434343 -4 6.017929 4.713098 -1
## 9 -3.353535 -4 5.956637 4.674493 -1
## 10 -3.272727 -4 5.895816 4.636976 -1
## # ... with 9,990 more rows
# NOTE(review): the warning below was emitted by a ggplot call whose code
# is not shown in this chunk — presumably a point outside the plot limits
## Warning: Removed 1 rows containing missing values (geom_point).
A hyperplane is given by - normal vector \(\mathbf{w} \in \mathbb{R}^d\) - an intercept \(b \in \mathbb{R}\)
All points \(\mathbf{x}\) in \(\mathbb{R}^d\) that satisfy \(\mathbf{x}^T\mathbf{w} + b = 0\), i.e. \[H = \{\mathbf{x} \in \mathbb{R}^d \mid \mathbf{x}^T\mathbf{w} + b = 0\}\]
The normal vector for NC is given by the difference of the two class means \[\mathbf{w} = \mathbf{m}_{+} - \mathbf{m}_{-}\] The NC intercept is given by \[b = - \frac{1}{2}\left(||\mathbf{m}_{+}||_2^2 - ||\mathbf{m}_{-}||_2^2 \right)\] # Linear classifiers
Given a new test point \(\mathbf{\tilde{x}}\):
Classify \(\tilde{y}\) to the positive class if \(\mathbf{w}^T \mathbf{\tilde{x}} + b >0\)
Training error rate for point clouds: 0.1125.
Training error rate for separable point clouds: 0
Training error rate for skewed point clouds: 0.065
Training error rate for heteroscedastic point clouds: 0.15.
Training error rate for GMM: 0.41.