NLP

STOR 390

Corpus

TidyText: book-word rows

Harry Potter

books
## # A tibble: 38,954 × 4
##                                                            text linenumber
## *                                                         <chr>      <int>
## 1                       Harry Potter and the Chamber of Secrets          1
## 2                                                            by          2
## 3                                                 J. K. Rowling          3
## 4                                Illustrations By Mary Grandpré          4
## 5                                        Arthur A. Levine Books          5
## 6                               An Imprint Of Scholastic Press.          6
## 7                                        For Seán P. F. Harris.          7
## 8                        Getaway driver and foul-weather friend          8
## 9                       Text copyright © 1999 by J. K. Rowling.          9
## 10 Illustrations by Mary GrandPré copyright © 1999 Warner Bros.         10
## # ... with 38,944 more rows, and 2 more variables: author <chr>,
## #   book <chr>

Book-word rows

# Tokenize the text column so that each row is one word occurrence,
# stored in a new `word` column that replaces `text`
book_words <- unnest_tokens(books, word, text)
book_words
## # A tibble: 1,092,986 × 4
##    linenumber  author            book    word
##         <int>   <chr>           <chr>   <chr>
## 1           1 rowling chamber_secrets   harry
## 2           1 rowling chamber_secrets  potter
## 3           1 rowling chamber_secrets     and
## 4           1 rowling chamber_secrets     the
## 5           1 rowling chamber_secrets chamber
## 6           1 rowling chamber_secrets      of
## 7           1 rowling chamber_secrets secrets
## 8           2 rowling chamber_secrets      by
## 9           3 rowling chamber_secrets       j
## 10          3 rowling chamber_secrets       k
## # ... with 1,092,976 more rows

Term-frequency

# Term frequency: tally how often each word occurs within each book,
# with the largest counts listed first
book_words <- ungroup(count(book_words, book, word, sort = TRUE))

book_words
## # A tibble: 70,486 × 3
##                 book  word     n
##                <chr> <chr> <int>
## 1      order_phoenix   the 11744
## 2    deathly_hallows   the 10485
## 3        goblet_fire   the  9340
## 4  half_blood_prince   the  7527
## 5      order_phoenix    to  6346
## 6      order_phoenix   and  6280
## 7    deathly_hallows   and  5563
## 8      order_phoenix    of  5368
## 9   prisoner_azkaban   the  5126
## 10       goblet_fire   and  4973
## # ... with 70,476 more rows

Most common words in each book

Term frequency distribution

Heavy tail

Zipf’s law

The frequency with which a word appears is inversely proportional to its rank in the frequency table.

Zipf’s law

Term frequency is driven by commonly occurring words

Inverse document frequency

\[\text{idf}(\textbf{word}) = \ln \left( \frac{\text{total number of documents}}{\text{number of documents containing } \textbf{word}} \right)\]

Term frequency, inverse document frequency

idf downweights the term frequency \[\text{tf-idf}(w) = \text{tf}(w) \cdot \text{idf}(w) \]

compute tf-idf scores

# Append tf, idf and tf_idf columns; `word` is the term, `book` is the
# document, and `n` is the per-book count of the word
book_words <- book_words %>%
    bind_tf_idf(word, book, n)

# Display the terms with the highest tf-idf first (display only:
# book_words itself keeps its existing row order)
book_words %>%
    arrange(desc(tf_idf))
## # A tibble: 70,486 × 6
##                 book        word     n           tf       idf       tf_idf
##                <chr>       <chr> <int>        <dbl>     <dbl>        <dbl>
## 1  half_blood_prince    slughorn   335 0.0019664356 1.2527630 0.0024634777
## 2      order_phoenix    umbridge   473 0.0018285211 0.8472979 0.0015493020
## 3        goblet_fire      bagman   203 0.0010555270 1.2527630 0.0013223251
## 4    chamber_secrets    lockhart   193 0.0022438208 0.5596158 0.0012556775
## 5   prisoner_azkaban       lupin   372 0.0034408761 0.3364722 0.0011577593
## 6        goblet_fire       winky   145 0.0007539478 1.2527630 0.0009445179
## 7        goblet_fire   champions    83 0.0004315701 1.9459101 0.0008397967
## 8    deathly_hallows xenophilius    85 0.0004263815 1.9459101 0.0008297000
## 9    deathly_hallows    griphook   120 0.0006019503 1.2527630 0.0007541011
## 10 half_blood_prince    mclaggen    65 0.0003815472 1.9459101 0.0007424566
## # ... with 70,476 more rows

words with highest tf-idf scores

TF-IDF by book

Text normalization

All words

##  [1] "Harry"      "Potter"     "and"        "the"        "Sorcerer’s"
##  [6] "Stone"      "by"         "J"          "K"          "Rowling"

Unique words

# Number of distinct word forms, case-sensitive
all_words %>% unique() %>% length()
## [1] 7268

Unique words after lower casing

# Distinct word forms as written
all_words %>% unique() %>% length()
## [1] 7268
# Distinct word forms once case differences are removed
all_words %>% str_to_lower() %>% unique() %>% length()
## [1] 6390

Stemming

Porter stemmer

library(SnowballC)

# Inflected forms of a word are reduced to a shared stem; the stem need not
# be a dictionary word
wordStem(
  c("argue", "argued", "argues", "arguing", "argus", "argument", "arguments")
)
## [1] "argu"     "argu"     "argu"     "argu"     "argu"     "argument"
## [7] "argument"
wordStem(c("dog", "dogs"))
## [1] "dog" "dog"
wordStem(c("crying", "cried", "cries"))
## [1] "cry" "cri" "cri"
# Irregular forms are not mapped onto a common stem
wordStem(c("am", "are", "is"))
## [1] "am" "ar" "i"

Unique words after lower casing and stemming

# Distinct word forms as written
all_words %>% unique() %>% length()
## [1] 7268
# Distinct word forms after lower casing
all_words %>% str_to_lower() %>% unique() %>% length()
## [1] 6390
# Distinct word forms after stemming and lower casing
# NOTE(review): wordStem() runs before str_to_lower() here, although the
# heading lists lower casing first; Snowball stemmers are presumably meant
# for lowercase input -- confirm the intended order
all_words %>% wordStem() %>% str_to_lower() %>% unique() %>% length()
## [1] 4839

top tf-idf before stemming

# Ranks (by tf-idf) of the words to display: the 30th through the 45th
word_range <- seq(30L, 45L)

top tf-idf after stemming

Document-term matrix (bag of words)

Document-term matrix (bag of words)

tf-idf matrix

George Orwell texts

Classification task

Sparse matrices

Mean difference classifier

Training error comparison