21.2 Websites (rvest)
IMDB Top Rated Movies:
http://www.imdb.com/chart/top?ref_=nv_mv_250_6
CSS selectors: class (.name) and id (#name)
library(rvest)
## Loading required package: xml2
##
## Attaching package: 'rvest'
## The following object is masked from 'package:XML':
##
## xml
# First page of actors
# Parse the movie page once if no earlier code has created `lego_movie`.
# The title id tt1490017 is the one that appears in the character links
# of the node listing shown further down.
if (!exists("lego_movie")) {
  lego_movie <- read_html("http://www.imdb.com/title/tt1490017/")
}

# Actor names: text of every element of class "itemprop" that is nested
# inside another element of class "itemprop".
lego_movie %>%
  html_nodes(".itemprop .itemprop") %>%
  html_text()
## [1] "Will Arnett" "Elizabeth Banks" "Craig Berry"
## [4] "Alison Brie" "David Burrows" "Anthony Daniels"
## [7] "Charlie Day" "Amanda Farinos" "Keith Ferguson"
## [10] "Will Ferrell" "Will Forte" "Dave Franco"
## [13] "Morgan Freeman" "Todd Hansen" "Jonah Hill"
# Take the first <table> node on the page and convert it to a data frame.
html_table(html_nodes(lego_movie, "table")[[1]])
## X1 X2
## 1 Cast overview, first billed only: Cast overview, first billed only:
## 2 Will Arnett
## 3 Elizabeth Banks
## 4 Craig Berry
## 5 Alison Brie
## 6 David Burrows
## 7 Anthony Daniels
## 8 Charlie Day
## 9 Amanda Farinos
## 10 Keith Ferguson
## 11 Will Ferrell
## 12 Will Forte
## 13 Dave Franco
## 14 Morgan Freeman
## 15 Todd Hansen
## 16 Jonah Hill
## X3
## 1 Cast overview, first billed only:
## 2 ...
## 3 ...
## 4 ...
## 5 ...
## 6 ...
## 7 ...
## 8 ...
## 9 ...
## 10 ...
## 11 ...
## 12 ...
## 13 ...
## 14 ...
## 15 ...
## 16 ...
## X4
## 1 Cast overview, first billed only:
## 2 Batman / \n Bruce Wayne \n \n \n (voice)
## 3 Wyldstyle / \n Lucy \n \n \n (voice)
## 4 Blake / \n Additional Voices \n \n \n (voice)
## 5 Unikitty \n \n \n (voice)
## 6 Octan Robot / \n Additional Voices \n \n \n (voice)
## 7 C-3PO \n \n \n (voice)
## 8 Benny \n \n \n (voice)
## 9 Mom \n \n \n (voice)
## 10 Han Solo \n \n \n (voice)
## 11 Lord Business / \n President Business / \n The Man Upstairs \n \n \n (voice)
## 12 Abraham Lincoln \n \n \n (voice) (as Orville Forte)
## 13 Wally \n \n \n (voice)
## 14 Vitruvius \n \n \n (voice)
## 15 Gandalf / \n Additional Voices \n \n \n (voice)
## 16 Green Lantern \n \n \n (voice)
# Collect every node matched by any selector in the comma-separated list:
# three plain class selectors plus two scoped to the element with
# id "titleCast".
html_nodes(
  lego_movie,
  ".primary_photo , .ellipsis, .character, #titleCast .itemprop, #titleCast .loadlate"
)
## {xml_nodeset (87)}
## [1] <td class="primary_photo">\n<a href="/name/nm0004715/?ref_=tt_cl_i1 ...
## [2] <img height="44" width="32" alt="Will Arnett" title="Will Arnett" s ...
## [3] <td class="itemprop" itemprop="actor" itemscope itemtype="http://sc ...
## [4] <span class="itemprop" itemprop="name">Will Arnett</span>
## [5] <td class="ellipsis">\n ...\n </td>
## [6] <td class="character">\n <a href="/title/tt1490017/chara ...
## [7] <td class="primary_photo">\n<a href="/name/nm0006969/?ref_=tt_cl_i2 ...
## [8] <img height="44" width="32" alt="Elizabeth Banks" title="Elizabeth ...
## [9] <td class="itemprop" itemprop="actor" itemscope itemtype="http://sc ...
## [10] <span class="itemprop" itemprop="name">Elizabeth Banks</span>
## [11] <td class="ellipsis">\n ...\n </td>
## [12] <td class="character">\n <a href="/title/tt1490017/chara ...
## [13] <td class="primary_photo">\n<a href="/name/nm1911947/?ref_=tt_cl_i3 ...
## [14] <td class="itemprop" itemprop="actor" itemscope itemtype="http://sc ...
## [15] <span class="itemprop" itemprop="name">Craig Berry</span>
## [16] <td class="ellipsis">\n ...\n </td>
## [17] <td class="character">\n Blake / \n Addition ...
## [18] <td class="primary_photo">\n<a href="/name/nm1555340/?ref_=tt_cl_i4 ...
## [19] <img height="44" width="32" alt="Alison Brie" title="Alison Brie" s ...
## [20] <td class="itemprop" itemprop="actor" itemscope itemtype="http://sc ...
## ...
# more manual way: walk down the document step by step instead of using
# one combined CSS selector -- first <table>, then its <tr> rows, then the
# <span> elements inside those rows, and finally their text.
html_nodes(lego_movie, "table")[[1]] %>%
  html_nodes("tr") %>%
  html_nodes("span") %>%
  html_text()
## [1] "Will Arnett" "Elizabeth Banks" "Craig Berry"
## [4] "Alison Brie" "David Burrows" "Anthony Daniels"
## [7] "Charlie Day" "Amanda Farinos" "Keith Ferguson"
## [10] "Will Ferrell" "Will Forte" "Dave Franco"
## [13] "Morgan Freeman" "Todd Hansen" "Jonah Hill"