STOR 390
# Print the raw corpus: one row per line of text, with its line number,
# author and book (see the column listing in the output below).
books
## # A tibble: 38,954 × 4
## text linenumber
## * <chr> <int>
## 1 Harry Potter and the Chamber of Secrets 1
## 2 by 2
## 3 J. K. Rowling 3
## 4 Illustrations By Mary Grandpré 4
## 5 Arthur A. Levine Books 5
## 6 An Imprint Of Scholastic Press. 6
## 7 For Seán P. F. Harris. 7
## 8 Getaway driver and foul-weather friend 8
## 9 Text copyright © 1999 by J. K. Rowling. 9
## 10 Illustrations by Mary GrandPré copyright © 1999 Warner Bros. 10
## # ... with 38,944 more rows, and 2 more variables: author <chr>,
## # book <chr>
# Tokenise the `text` column into single words, so that every row becomes one
# word occurrence tagged with its line number, author and book.
book_words <- unnest_tokens(books, word, text)
book_words
## # A tibble: 1,092,986 × 4
## linenumber author book word
## <int> <chr> <chr> <chr>
## 1 1 rowling chamber_secrets harry
## 2 1 rowling chamber_secrets potter
## 3 1 rowling chamber_secrets and
## 4 1 rowling chamber_secrets the
## 5 1 rowling chamber_secrets chamber
## 6 1 rowling chamber_secrets of
## 7 1 rowling chamber_secrets secrets
## 8 2 rowling chamber_secrets by
## 9 3 rowling chamber_secrets j
## 10 3 rowling chamber_secrets k
## # ... with 1,092,976 more rows
# Collapse the token table to one row per (book, word) pair. `n` holds the
# number of occurrences, and `sort = TRUE` lists the largest counts first.
book_words <- count(book_words, book, word, sort = TRUE) %>%
  ungroup()
book_words
## # A tibble: 70,486 × 3
## book word n
## <chr> <chr> <int>
## 1 order_phoenix the 11744
## 2 deathly_hallows the 10485
## 3 goblet_fire the 9340
## 4 half_blood_prince the 7527
## 5 order_phoenix to 6346
## 6 order_phoenix and 6280
## 7 deathly_hallows and 5563
## 8 order_phoenix of 5368
## 9 prisoner_azkaban the 5126
## 10 goblet_fire and 4973
## # ... with 70,476 more rows
Zipf's law: the frequency with which a word appears is inversely proportional to its rank in the frequency table.
\[\text{idf}(\textbf{word}) = \ln \left( \frac{\text{total number of documents}}{\text{number of documents containing } \textbf{word}} \right)\]
The idf factor down-weights the term frequency of words that appear in many documents: \[\text{tf-idf}(w) = \text{tf}(w) \cdot \text{idf}(w) \]
# Append the tf, idf and tf_idf columns, treating each book as one document
# and `n` as the number of times the word occurs in that book.
book_words <- book_words %>%
  bind_tf_idf(word, book, n)
# Print the rows with the highest tf-idf first. This is a sorted view only:
# the result is not assigned, so the row order of `book_words` is unchanged.
book_words %>%
  arrange(desc(tf_idf))
## # A tibble: 70,486 × 6
## book word n tf idf tf_idf
## <chr> <chr> <int> <dbl> <dbl> <dbl>
## 1 half_blood_prince slughorn 335 0.0019664356 1.2527630 0.0024634777
## 2 order_phoenix umbridge 473 0.0018285211 0.8472979 0.0015493020
## 3 goblet_fire bagman 203 0.0010555270 1.2527630 0.0013223251
## 4 chamber_secrets lockhart 193 0.0022438208 0.5596158 0.0012556775
## 5 prisoner_azkaban lupin 372 0.0034408761 0.3364722 0.0011577593
## 6 goblet_fire winky 145 0.0007539478 1.2527630 0.0009445179
## 7 goblet_fire champions 83 0.0004315701 1.9459101 0.0008397967
## 8 deathly_hallows xenophilius 85 0.0004263815 1.9459101 0.0008297000
## 9 deathly_hallows griphook 120 0.0006019503 1.2527630 0.0007541011
## 10 half_blood_prince mclaggen 65 0.0003815472 1.9459101 0.0007424566
## # ... with 70,476 more rows
## [1] "Harry" "Potter" "and" "the" "Sorcerer’s"
## [6] "Stone" "by" "J" "K" "Rowling"
# Number of distinct word forms, counted case-sensitively.
all_words %>%
  unique() %>%
  length()
## [1] 7268
all_words %>%
  unique() %>%
  length()
## [1] 7268
# Lower-casing first merges forms that differ only in capitalisation.
all_words %>%
  str_to_lower() %>%
  unique() %>%
  length()
## [1] 6390
library(SnowballC)
# Inflected verb forms share one stem and the noun forms share another; note
# that the unrelated word "argus" also ends up with the verb stem.
wordStem(
  c("argue", "argued", "argues", "arguing", "argus", "argument", "arguments")
)
## [1] "argu" "argu" "argu" "argu" "argu" "argument"
## [7] "argument"
# Singular and plural map to the same stem.
wordStem(c("dog", "dogs"))
## [1] "dog" "dog"
# Stems need not be dictionary words, and related forms can still differ.
wordStem(c("crying", "cried", "cries"))
## [1] "cry" "cri" "cri"
# Irregular forms of the same verb are not unified by stemming.
wordStem(c("am", "are", "is"))
## [1] "am" "ar" "i"
# Vocabulary size under progressively more aggressive normalisation:
# raw forms, lower-cased forms, then stemmed and lower-cased forms.
all_words %>%
  unique() %>%
  length()
## [1] 7268
all_words %>%
  str_to_lower() %>%
  unique() %>%
  length()
## [1] 6390
# NOTE(review): stemming runs before lower-casing here; Snowball stemmers are
# generally described as expecting lower-case input, so confirm this order is
# intended (swapping the two steps may change the count below).
all_words %>%
  wordStem() %>%
  str_to_lower() %>%
  unique() %>%
  length()
## [1] 4839
# Positions in the tf-idf ranking to display: the 30th through the 45th.
word_range <- seq.int(from = 30L, to = 45L)