24.2 Jane Austen
library(janeaustenr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringr)
austen_books()
## # A tibble: 73,422 x 2
## text book
## * <chr> <fct>
## 1 SENSE AND SENSIBILITY Sense & Sensibility
## 2 "" Sense & Sensibility
## 3 by Jane Austen Sense & Sensibility
## 4 "" Sense & Sensibility
## 5 (1811) Sense & Sensibility
## 6 "" Sense & Sensibility
## 7 "" Sense & Sensibility
## 8 "" Sense & Sensibility
## 9 "" Sense & Sensibility
## 10 CHAPTER 1 Sense & Sensibility
## # ... with 73,412 more rows
original_books <- austen_books() %>%
group_by(book) %>%
mutate(linenumber = row_number(),
chapter = cumsum(str_detect(text, regex("^chapter [\\divxlc]",
ignore_case = TRUE)))) %>%
ungroup()
original_books
## # A tibble: 73,422 x 4
## text book linenumber chapter
## <chr> <fct> <int> <int>
## 1 SENSE AND SENSIBILITY Sense & Sensibility 1 0
## 2 "" Sense & Sensibility 2 0
## 3 by Jane Austen Sense & Sensibility 3 0
## 4 "" Sense & Sensibility 4 0
## 5 (1811) Sense & Sensibility 5 0
## 6 "" Sense & Sensibility 6 0
## 7 "" Sense & Sensibility 7 0
## 8 "" Sense & Sensibility 8 0
## 9 "" Sense & Sensibility 9 0
## 10 CHAPTER 1 Sense & Sensibility 10 1
## # ... with 73,412 more rows
library(tidytext)
tidy_books <- original_books %>%
unnest_tokens(word, text)
tidy_books
## # A tibble: 725,055 x 4
## book linenumber chapter word
## <fct> <int> <int> <chr>
## 1 Sense & Sensibility 1 0 sense
## 2 Sense & Sensibility 1 0 and
## 3 Sense & Sensibility 1 0 sensibility
## 4 Sense & Sensibility 3 0 by
## 5 Sense & Sensibility 3 0 jane
## 6 Sense & Sensibility 3 0 austen
## 7 Sense & Sensibility 5 0 1811
## 8 Sense & Sensibility 10 1 chapter
## 9 Sense & Sensibility 10 1 1
## 10 Sense & Sensibility 13 1 the
## # ... with 725,045 more rows