rvest/0000755000175000017500000000000014132660177011556 5ustar nileshnileshrvest/demo/0000755000175000017500000000000013775436633012514 5ustar nileshnileshrvest/demo/00Index0000644000175000017500000000021713767413737013647 0ustar nileshnileshunited Scrape mileage details from united.com tripadvisor Scrape review data from tripadvisor zillow Scrape housing info from zillow rvest/demo/tripadvisor.R0000644000175000017500000000156113775423515015203 0ustar nileshnilesh# Inspired by # http://notesofdabbler.github.io/201408_hotelReview/scrapeTripAdvisor.html library(rvest) url <- "http://www.tripadvisor.com/Hotel_Review-g37209-d1762915-Reviews-JW_Marriott_Indianapolis-Indianapolis_Indiana.html" reviews <- url %>% read_html() %>% html_elements("#REVIEWS .innerBubble") id <- reviews %>% html_element(".quote a") %>% html_attr("id") quote <- reviews %>% html_element(".quote span") %>% html_text() rating <- reviews %>% html_element(".rating .rating_s_fill") %>% html_attr("alt") %>% gsub(" of 5 stars", "", .) 
%>% as.integer() date <- reviews %>% html_element(".rating .ratingDate") %>% html_attr("title") %>% strptime("%b %d, %Y") %>% as.POSIXct() review <- reviews %>% html_element(".entry .partial_entry") %>% html_text() data.frame(id, quote, rating, date, review, stringsAsFactors = FALSE) %>% View() rvest/demo/zillow.R0000644000175000017500000000151413775423476014161 0ustar nileshnilesh# Inspired by https://github.com/notesofdabbler library(rvest) library(tidyr) page <- read_html("http://www.zillow.com/homes/for_sale/Greenwood-IN/fsba,fsbo,fore,cmsn_lt/house_type/52333_rid/39.638414,-86.011362,39.550714,-86.179419_rect/12_zm/0_mmm/") houses <- page %>% html_elements(".photo-cards li article") z_id <- houses %>% html_attr("id") address <- houses %>% html_element(".zsg-photo-card-address") %>% html_text() price <- houses %>% html_element(".zsg-photo-card-price") %>% html_text() %>% readr::parse_number() params <- houses %>% html_element(".zsg-photo-card-info") %>% html_text() %>% strsplit("\u00b7") beds <- params %>% purrr::map_chr(1) %>% readr::parse_number() baths <- params %>% purrr::map_chr(2) %>% readr::parse_number() house_area <- params %>% purrr::map_chr(3) %>% readr::parse_number() rvest/demo/united.R0000644000175000017500000000071213775436633014127 0ustar nileshnilesh# Scrape miles from united site library(rvest) united <- session("http://www.united.com/") login <- united %>% html_element("form[name=LoginForm]") %>% html_form() %>% html_form_set( MpNumber = "GY797363", Password = password ) logged_in <- united %>% session_submit(login) logged_in %>% follow_link("View account") %>% html_element("#ctl00_ContentInfo_AccountSummary_spanEliteMilesNew") %>% html_text() %>% readr::parse_number() rvest/MD50000644000175000017500000001002414132660177012063 0ustar nileshnileshb179d69b3999d72a6be1b9de283bf72d *DESCRIPTION e8a97d0782b5ca3faa6669db5e7da997 *LICENSE 1aa8989ebe056fed42d2aa4640dc0f53 *NAMESPACE 8e6738c35b2521a3bab7eb4ed16a29e2 *NEWS.md 
452b62853ab77b3c1f7b4231771ddd29 *R/encoding.R ccd4ec0dacc779b8733af8a642b109d4 *R/form.R b9bbef57a1f61b150f30fc504b6842ab *R/html.R b7c36d3bad90be1bcd154df9457e2759 *R/rename.R e78016d9c14b4ec9d54bbaf33f865891 *R/rvest-package.R 7d95677581a03f930591d42ffb9ed0c4 *R/selectors.R 84f9bbabed1eceb8d6d2aacbc0b5c2d4 *R/session.R 87bbd0f095c4744bcb001eb438fbc545 *R/table.R 3011d9a76ebc5504bdb04977193ddcf5 *R/testthat.R f41504de2b0cb9a3bf5a50feff6d3e0b *R/text.R 60f6ff612a2d9b1c0252e208b7dd7c24 *R/utils.R 53ba96bca139a8108f781e3b3f9c328b *README.md 9861b4b3aebd34cd2f23700ec61dea04 *build/vignette.rds 96facf495896646206f14ea3932cb996 *demo/00Index 5b1a65183607f9c7d536fb03efac59a1 *demo/tripadvisor.R daaaa642bfbf152064f757e795fba645 *demo/united.R 8e08c2f3f3a7d02be1d796f820f257c9 *demo/zillow.R 1d0e6e024c643aa2bdcaa61bd4e3968c *inst/WORDLIST b6e294946e527a9886d29a7d5963f283 *inst/doc/rvest.R 83bec8dba1ec1f1a5146bd199916c90f *inst/doc/rvest.Rmd 90b5c9a3f6fbab56e5fda4e1779b8b75 *inst/doc/rvest.html bc8ae98aec6e23fe5802241ffc409841 *inst/doc/starwars.R 1d030fd990b11a62568bd5edf7710a07 *inst/doc/starwars.Rmd 61e7c66bc2bcfa439a74ad1d06c58ee6 *inst/doc/starwars.html be4cd29b5c2b16bd908a786c53cf0276 *inst/html-ex/bad-encoding.html cb1e46f469cfbbbde29c8b5113e1d789 *man/figures/lifecycle-archived.svg c0d2e5a54f1fa4ff02bf9533079dd1f7 *man/figures/lifecycle-defunct.svg a1b8c987c676c16af790f563f96cbb1f *man/figures/lifecycle-deprecated.svg c3978703d8f40f2679795335715e98f4 *man/figures/lifecycle-experimental.svg 952b59dc07b171b97d5d982924244f61 *man/figures/lifecycle-maturing.svg 27b879bf3677ea76e3991d56ab324081 *man/figures/lifecycle-questioning.svg 53b3f893324260b737b3c46ed2a0e643 *man/figures/lifecycle-stable.svg 1c1fe7a759b86dc6dbcbe7797ab8246c *man/figures/lifecycle-superseded.svg 39579f197389469e5bec6c4bb2e34777 *man/figures/logo.png eb9ac3d3af03097f746a0fe39050243b *man/google_form.Rd a6d869162565326cc577521a9bfdc0af *man/html_attr.Rd a644e18fdcf48efd1b51a71338e30de9 
*man/html_children.Rd 39dd263d3539e8cfe6a53fb8fc8e2761 *man/html_element.Rd 82dd7199f651dd55a2f24219f63768e0 *man/html_encoding_guess.Rd 9a1a5352986ce91c2b91c610a2bbf49a *man/html_form.Rd 480665ed4e0a662cd7168bae2f773ecc *man/html_name.Rd 981e43b2c8ba9d56fe2a17f2b1b95dea *man/html_table.Rd 7824150f123e587749235481d99f6af2 *man/html_text.Rd 1e641362f441edc4f7fa5e3665ec8ae4 *man/minimal_html.Rd 56baeb81da4f1bbdb66fc2914e610472 *man/reexports.Rd e4de192a61e9191cc5acaab22bc3387d *man/rename.Rd ceac1f465444fe123838d404dd41eaf4 *man/repair_encoding.Rd c35e73e81e6ebf1afa0f00d25fa6fca2 *man/rvest-package.Rd b24724a42df8bc9450618572c8e2cb7b *man/session.Rd 0622a97a2aaa3c342f09636052c2d7f5 *tests/spelling.R ef55516173099bedc78483fe7c0dd23c *tests/testthat.R 1eff4c362767dfa35019447f63aaa5bf *tests/testthat/_snaps/encoding.md 61a023edde1371ffef158c580adbe889 *tests/testthat/_snaps/form.md 394d2b13389ec6b38cd017699a4de168 *tests/testthat/_snaps/rename.md ce97b14761bc7b06da7da6867aec6951 *tests/testthat/_snaps/selectors.md bc9fc497b280daab21848d3bc155dd11 *tests/testthat/_snaps/session.md 8e1f3248b4c30548354f2569e44cb76c *tests/testthat/_snaps/table.md 483c192b5de64a50ab5954cb7ed69322 *tests/testthat/_snaps/utils.md 963251571de4ecb243dcaef1ab0c5586 *tests/testthat/test-encoding.R f7f95d4b45ebaf4fc573997f74c7f417 *tests/testthat/test-form.R 60b3257c6910bb1e559c64688fd10914 *tests/testthat/test-html.R 3dd7435aba1764a7bb22544bc080c585 *tests/testthat/test-rename.R 3d9b9f27a90ff7c11c9b9c0de79a2be8 *tests/testthat/test-selectors.R 466191bea5bd3fd07f4bc9e2063ff126 *tests/testthat/test-session.R 75298f4e28a1da5c3e91820efc20e6ea *tests/testthat/test-table.R b4797b2d4cca238d5c828f8778a1d983 *tests/testthat/test-text.R 2f2f8524da52b2515bcfa4c4d472033d *tests/testthat/test-utils.R 07292e7a862259805eebb8606cb61044 *tests/testthat/test.html 83bec8dba1ec1f1a5146bd199916c90f *vignettes/rvest.Rmd 1d030fd990b11a62568bd5edf7710a07 *vignettes/starwars.Rmd 
rvest/NEWS.md0000644000175000017500000001745414132343516012662 0ustar nileshnilesh# rvest 1.0.2 * Fixes for CRAN * `html_table()` converts empty tables to empty tibbles (@epiben, #327). # rvest 1.0.1 * `html_table()` correctly handles tables with cells that contain blank values for `rowspan` and/or `colspan`, so that e.g. `` is parsed as `` (@epiben, #323). * Fix broken example # rvest 1.0.0 ## New features * New `html_text2()` provides a more natural rendering of HTML nodes into text, converting `
` into "\n", and removing non-significant whitespace (#175). By default, it also converts ` ` into regular spaces, which you can suppress with `preserve_nbsp = TRUE` (#284). * `html_table()` has been re-written from scratch to more closely mimic the algorithm that browsers use for parsing tables. This should mean that there are far fewer tables for which it fails to produce some output (#63, #204, #215). The `fill` argument has been deprecated since it is no longer needed. `html_table()` now returns a tibble rather than a data frame to be compatible with the rest of the tidyverse (#199). Its performance has been considerably improved (#237). It also gains a `na.strings` argument to control what values are converted to `NA` (#107), and a `convert` argument to control whether to run the conversion (#311). * New `html_form_submit()` allows you to submit a form directly, without needing to create a session (#300). * rvest is now licensed as MIT (#287). ## API changes Since this is the 1.0.0 release, I included a large number of API changes to make rvest more compatible with current tidyverse conventions. Older functions have been deprecated, so existing code will continue to work (albeit with a few new warnings). * rvest now imports xml2 rather than depending on it. This is cleaner because it avoids attaching all the xml2 functions that you're less likely to use. To reduce the change of breakages, rvest re-exports xml2 functions `read_html()` and `url_absolute()`, but your code may now need an explicit `library(xml2)`. * `html_form()` now returns an object with class `rvest_form` (instead of form). Fields within a form now have class `rvest_field`, instead of a variety of classes that were lacking the `rvest_` prefix. All functions for working with forms have a common `html_form_` prefix: `set_values()` became `html_form_set()`. `submit_form()` was renamed to `session_submit()` because it returns a session. 
* `html_node()` and `html_nodes()` have been superseded in favor of `html_element()` and `html_elements()` since they (almost) always return elements, not nodes (#298). * `html_session()` is now `session()` and returns an object of class `rvest_session` (instead of `session`). All functions that work with session objects now have a common `session_` prefix. * Long deprecated `html()`, `html_tag()`, `xml()` functions have been removed. * `minimal_html()` (which doesn't appear to be used by any other package) has had its arguments flipped to make it more intuitive. * `guess_encoding()` has been renamed to `html_encoding_guess()` to avoid a clash with `stringr::guess_encoding()` (#209). `repair_encoding()` has been deprecated because it doesn't appear to work. * `pluck()` is no longer exported to avoid a clash with `purrr::pluck()`; if you need it use `purrr::map_chr()` and friends instead (#209). * `xml_tag()`, `xml_node()`, and `xml_nodes()` have been formally deprecated in favor of their `html_` equivalents. ## Minor improvements and bug fixes * The "harvesting the web" vignette has been rewritten to focus more on basics rvest, eliminating the screenshots to keep the installed package as svelte as possible. It's also been renamed to `vignette("rvest")` since it's the vignette that you should read first. * The SelectorGadget vignette is now a web-only article, , so we can be more generous with screenshots since they're no longer bundled with every install of the package. Together with the rewrite of the other vignette, this means that rvest is now ~90 Kb instead of ~1.1 Mb. * All uses of IMDB have been eliminated since the site explicitly prohibits scraping (#195). * `session_submit()` errors if `form` doesn't have a `url` (#288). * New `session_forward()` function to complement `session_back()`. It now allows you to pick the submission button by position (#156). The `...` argument is deprecated; please use `config` instead. 
* `html_form_set()` can now accept character vectors allowing you to select multiple checkboxes in a set or select multiple values from a multi-`` as well as``. * `submit_request()` (and hence `submit_form()`) recognizes forms with `` as a valid form submission button. # rvest 0.2.0 ## New features * `html()` and `xml()` pass `...` on to `httr::GET()` so you can more finely control the request (#48). * Add xml support: parse with `xml()`, then work with using `xml_node()`, `xml_attr()`, `xml_attrs()`, `xml_text()` and `xml_tag()` (#24). * `xml_structure()`: new function that displays the structure (i.e. tag and attribute names) of a xml/html object (#10). ## Bug fixes * `follow_link()` now accepts css and xpath selectors. (#38, #41, #42) * `html()` does a better job of dealing with encodings (passing the problem on to `XML::parseHTML()`) instead of trying to do it itself (#25, #50). * `html_attr()` returns default value when input is NULL (#49) * Add missing `html_node()` method for session. * `html_nodes()` now returns an empty list if no elements are found (#31). * `submit_form()` converts relative paths to absolute URLs (#52). It also deals better with 0-length inputs (#29). rvest/DESCRIPTION0000644000175000017500000000217414132660177013270 0ustar nileshnileshPackage: rvest Title: Easily Harvest (Scrape) Web Pages Version: 1.0.2 Authors@R: c(person(given = "Hadley", family = "Wickham", role = c("aut", "cre"), email = "hadley@rstudio.com"), person(given = "RStudio", role = "cph")) Description: Wrappers around the 'xml2' and 'httr' packages to make it easy to download, then manipulate, HTML and XML. 
License: MIT + file LICENSE URL: https://rvest.tidyverse.org/, https://github.com/tidyverse/rvest BugReports: https://github.com/tidyverse/rvest/issues Depends: R (>= 3.2) Imports: httr (>= 0.5), lifecycle (>= 1.0.0), magrittr, rlang (>= 0.4.10), selectr, tibble, xml2 (>= 1.3) Suggests: covr, glue, knitr, readr, rmarkdown, repurrrsive, spelling, stringi (>= 0.3.1), testthat (>= 3.0.2), webfakes VignetteBuilder: knitr Config/testthat/edition: 3 Encoding: UTF-8 Language: en-US RoxygenNote: 7.1.2 NeedsCompilation: no Packaged: 2021-10-15 18:24:40 UTC; hadley Author: Hadley Wickham [aut, cre], RStudio [cph] Maintainer: Hadley Wickham Repository: CRAN Date/Publication: 2021-10-16 23:30:07 UTC rvest/README.md0000644000175000017500000001103514132342252013024 0ustar nileshnilesh # rvest [![CRAN status](https://www.r-pkg.org/badges/version/rvest)](https://cran.r-project.org/package=rvest) [![R-CMD-check](https://github.com/tidyverse/rvest/workflows/R-CMD-check/badge.svg)](https://github.com/tidyverse/rvest/actions) [![Codecov test coverage](https://codecov.io/gh/tidyverse/rvest/branch/master/graph/badge.svg)](https://app.codecov.io/gh/tidyverse/rvest?branch=master) ## Overview rvest helps you scrape (or harvest) data from web pages. It is designed to work with [magrittr](https://github.com/tidyverse/magrittr) to make it easy to express common web scraping tasks, inspired by libraries like [beautiful soup](https://www.crummy.com/software/BeautifulSoup/) and [RoboBrowser](http://robobrowser.readthedocs.io/en/latest/readme.html). If you’re scraping multiple pages, I highly recommend using rvest in concert with [polite](https://dmi3kno.github.io/polite/). The polite package ensures that you’re respecting the [robots.txt](https://en.wikipedia.org/wiki/Robots_exclusion_standard) and not hammering the site with too many requests. 
## Installation ``` r # The easiest way to get rvest is to install the whole tidyverse: install.packages("tidyverse") # Alternatively, install just rvest: install.packages("rvest") ``` ## Usage ``` r library(rvest) # Start by reading a HTML page with read_html(): starwars <- read_html("https://rvest.tidyverse.org/articles/starwars.html") # Then find elements that match a css selector or XPath expression # using html_elements(). In this example, each
corresponds # to a different film films <- starwars %>% html_elements("section") films #> {xml_nodeset (7)} #> [1]

\nThe Phantom Menace\n

\n

\nReleased: 1999 ... #> [2]

\nAttack of the Clones\n

\n

\nReleased: 20 ... #> [3]

\nRevenge of the Sith\n

\n

\nReleased: 200 ... #> [4]

\nA New Hope\n

\n

\nReleased: 1977-05-25\n ... #> [5]

\nThe Empire Strikes Back\n

\n

\nReleased: ... #> [6]

\nReturn of the Jedi\n

\n

\nReleased: 1983 ... #> [7]

\nThe Force Awakens\n

\n

\nReleased: 2015- ... # Then use html_element() to extract one element per film. Here # we the title is given by the text inside

title <- films %>% html_element("h2") %>% html_text2() title #> [1] "The Phantom Menace" "Attack of the Clones" #> [3] "Revenge of the Sith" "A New Hope" #> [5] "The Empire Strikes Back" "Return of the Jedi" #> [7] "The Force Awakens" # Or use html_attr() to get data out of attributes. html_attr() always # returns a string so we convert it to an integer using a readr function episode <- films %>% html_element("h2") %>% html_attr("data-id") %>% readr::parse_integer() episode #> [1] 1 2 3 4 5 6 7 ``` If the page contains tabular data you can convert it directly to a data frame with `html_table()`: ``` r html <- read_html("https://en.wikipedia.org/w/index.php?title=The_Lego_Movie&oldid=998422565") html %>% html_element(".tracklist") %>% html_table() #> # A tibble: 29 × 4 #> No. Title `Performer(s)` Length #> #> 1 1. "\"Everything Is Awesome\"" "Tegan and Sara featuring The Lonel… 2:43 #> 2 2. "\"Prologue\"" "" 2:28 #> 3 3. "\"Emmett's Morning\"" "" 2:00 #> 4 4. "\"Emmett Falls in Love\"" "" 1:11 #> 5 5. "\"Escape\"" "" 3:26 #> 6 6. "\"Into the Old West\"" "" 1:00 #> 7 7. "\"Wyldstyle Explains\"" "" 1:21 #> 8 8. "\"Emmett's Mind\"" "" 2:17 #> 9 9. "\"The Transformation\"" "" 1:46 #> 10 10. "\"Saloons and Wagons\"" "" 3:38 #> # … with 19 more rows ``` ## Code of Conduct Please note that the rvest project is released with a [Contributor Code of Conduct](https://rvest.tidyverse.org/CODE_OF_CONDUCT.html). By contributing to this project, you agree to abide by its terms. rvest/man/0000755000175000017500000000000014101012310012301 5ustar nileshnileshrvest/man/reexports.Rd0000644000175000017500000000102713770130737014654 0ustar nileshnilesh% Generated by roxygen2: do not edit by hand % Please edit documentation in R/rvest-package.R \docType{import} \name{reexports} \alias{reexports} \alias{read_html} \alias{url_absolute} \alias{\%>\%} \title{Objects exported from other packages} \keyword{internal} \description{ These objects are imported from other packages. 
Follow the links below to see their documentation. \describe{ \item{magrittr}{\code{\link[magrittr:pipe]{\%>\%}}} \item{xml2}{\code{\link[xml2:read_xml]{read_html}}, \code{\link[xml2]{url_absolute}}} }} rvest/man/html_encoding_guess.Rd0000644000175000017500000000205113775424150016637 0ustar nileshnilesh% Generated by roxygen2: do not edit by hand % Please edit documentation in R/encoding.R \name{html_encoding_guess} \alias{html_encoding_guess} \alias{guess_encoding} \title{Guess faulty character encoding} \usage{ html_encoding_guess(x) } \arguments{ \item{x}{A character vector.} } \description{ \code{html_encoding_guess()} helps you handle web pages that declare an incorrect encoding. Use \code{html_encoding_guess()} to generate a list of possible encodings, then try each out by using \code{encoding} argument of \code{read_html()}. \code{html_encoding_guess()} replaces the deprecated \code{guess_encoding()}. } \examples{ # A file with bad encoding included in the package path <- system.file("html-ex", "bad-encoding.html", package = "rvest") x <- read_html(path) x \%>\% html_elements("p") \%>\% html_text() html_encoding_guess(x) # Two valid encodings, only one of which is correct read_html(path, encoding = "ISO-8859-1") \%>\% html_elements("p") \%>\% html_text() read_html(path, encoding = "ISO-8859-2") \%>\% html_elements("p") \%>\% html_text() } rvest/man/html_form.Rd0000644000175000017500000000426013776122700014607 0ustar nileshnilesh% Generated by roxygen2: do not edit by hand % Please edit documentation in R/form.R \name{html_form} \alias{html_form} \alias{html_form_set} \alias{html_form_submit} \title{Parse forms and set values} \usage{ html_form(x, base_url = NULL) html_form_set(form, ...) 
html_form_submit(form, submit = NULL) } \arguments{ \item{x}{A document (from \code{\link[=read_html]{read_html()}}), node set (from \code{\link[=html_elements]{html_elements()}}), node (from \code{\link[=html_element]{html_element()}}), or session (from \code{\link[=session]{session()}}).} \item{base_url}{Base url of underlying HTML document. The default, \code{NULL}, uses the url of the HTML document underlying \code{x}.} \item{form}{A form} \item{...}{<\code{\link[rlang:dyn-dots]{dynamic-dots}}> Name-value pairs giving fields to modify. Provide a character vector to set multiple checkboxes in a set or select multiple values from a multi-select.} \item{submit}{Which button should be used to submit the form? \itemize{ \item \code{NULL}, the default, uses the first button. \item A string selects a button by its name. \item A number selects a button using its relative position. }} } \value{ \itemize{ \item \code{html_form()} returns as S3 object with class \code{rvest_form} when applied to a single element. It returns a list of \code{rvest_form} objects when applied to multiple elements or a document. \item \code{html_form_set()} returns an \code{rvest_form} object. \item \code{html_form_submit()} submits the form, returning an httr response which can be parsed with \code{\link[=read_html]{read_html()}}. } } \description{ Use \code{html_form()} to extract a form, set values with \code{html_form_set()}, and submit it with \code{html_form_submit()}. } \examples{ html <- read_html("http://www.google.com") search <- html_form(html)[[1]] search <- search \%>\% html_form_set(q = "My little pony", hl = "fr") # Or if you have a list of values, use !!! 
vals <- list(q = "web scraping", hl = "en") search <- search \%>\% html_form_set(!!!vals) # To submit and get result: \dontrun{ resp <- html_form_submit(search) read_html(resp) } } \seealso{ HTML 4.01 form specification: \url{http://www.w3.org/TR/html401/interact/forms.html} } rvest/man/html_table.Rd0000644000175000017500000000466514007274024014737 0ustar nileshnilesh% Generated by roxygen2: do not edit by hand % Please edit documentation in R/table.R \name{html_table} \alias{html_table} \title{Parse an html table into a data frame} \usage{ html_table( x, header = NA, trim = TRUE, fill = deprecated(), dec = ".", na.strings = "NA", convert = TRUE ) } \arguments{ \item{x}{A document (from \code{\link[=read_html]{read_html()}}), node set (from \code{\link[=html_elements]{html_elements()}}), node (from \code{\link[=html_element]{html_element()}}), or session (from \code{\link[=session]{session()}}).} \item{header}{Use first row as header? If \code{NA}, will use first row if it consists of \verb{} tags. If \code{TRUE}, column names are left exactly as they are in the source document, which may require post-processing to generate a valid data frame.} \item{trim}{Remove leading and trailing whitespace within each cell?} \item{fill}{Deprecated - missing cells in tables are now always automatically filled with \code{NA}.} \item{dec}{The character used as decimal place marker.} \item{na.strings}{Character vector of values that will be converted to \code{NA} if \code{convert} is \code{TRUE}.} \item{convert}{If \code{TRUE}, will run \code{\link[=type.convert]{type.convert()}} to interpret texts as integer, double, or \code{NA}.} } \value{ When applied to a single element, \code{html_table()} returns a single tibble. When applied to multiple elements or a document, \code{html_table()} returns a list of tibbles. } \description{ The algorithm mimics what a browser does, but repeats the values of merged cells in every cell that cover. } \examples{ sample1 <- minimal_html("
Col ACol B
1x
4y
10z
") sample1 \%>\% html_element("table") \%>\% html_table() # Values in merged cells will be duplicated sample2 <- minimal_html("
ABC
123
45
67
") sample2 \%>\% html_element("table") \%>\% html_table() # If a row is missing cells, they'll be filled with NAs sample3 <- minimal_html("
ABC
12
3
4
") sample3 \%>\% html_element("table") \%>\% html_table() } rvest/man/repair_encoding.Rd0000644000175000017500000000123114014035320015726 0ustar nileshnilesh% Generated by roxygen2: do not edit by hand % Please edit documentation in R/encoding.R \name{repair_encoding} \alias{repair_encoding} \title{Repair faulty encoding} \usage{ repair_encoding(x, from = NULL) } \arguments{ \item{from}{The encoding that the string is actually in. If \code{NULL}, \code{guess_encoding} will be used.} } \description{ \ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#deprecated}{\figure{lifecycle-deprecated.svg}{options: alt='[Deprecated]'}}}{\strong{[Deprecated]}} This function has been deprecated because it doesn't work. Instead re-read the HTML file with correct \code{encoding} argument. } \keyword{internal} rvest/man/google_form.Rd0000644000175000017500000000047014101012310015070 0ustar nileshnilesh% Generated by roxygen2: do not edit by hand % Please edit documentation in R/utils.R \name{google_form} \alias{google_form} \title{Make link to google form given id} \usage{ google_form(x) } \arguments{ \item{x}{Unique identifier for form} } \description{ Make link to google form given id } \keyword{internal} rvest/man/html_attr.Rd0000644000175000017500000000242314132341320014601 0ustar nileshnilesh% Generated by roxygen2: do not edit by hand % Please edit documentation in R/html.R \name{html_attr} \alias{html_attr} \alias{html_attrs} \title{Get element attributes} \usage{ html_attr(x, name, default = NA_character_) html_attrs(x) } \arguments{ \item{x}{A document (from \code{\link[=read_html]{read_html()}}), node set (from \code{\link[=html_elements]{html_elements()}}), node (from \code{\link[=html_element]{html_element()}}), or session (from \code{\link[=session]{session()}}).} \item{name}{Name of attribute to retrieve.} \item{default}{A string used as a default value when the attribute does not exist in every element.} } \value{ A character vector (for 
\code{html_attr()}) or list (\code{html_attrs()}) the same length as \code{x}. } \description{ \code{html_attr()} gets a single attribute; \code{html_attrs()} gets all attributes. } \examples{ html <- minimal_html('') html \%>\% html_elements("a") \%>\% html_attrs() html \%>\% html_elements("a") \%>\% html_attr("href") html \%>\% html_elements("li") \%>\% html_attr("class") html \%>\% html_elements("li") \%>\% html_attr("class", default = "inactive") } rvest/man/rvest-package.Rd0000644000175000017500000000141614132341252015344 0ustar nileshnilesh% Generated by roxygen2: do not edit by hand % Please edit documentation in R/rvest-package.R \docType{package} \name{rvest-package} \alias{rvest} \alias{rvest-package} \title{rvest: Easily Harvest (Scrape) Web Pages} \description{ \if{html}{\figure{logo.png}{options: align='right' alt='logo' width='120'}} Wrappers around the 'xml2' and 'httr' packages to make it easy to download, then manipulate, HTML and XML. } \seealso{ Useful links: \itemize{ \item \url{https://rvest.tidyverse.org/} \item \url{https://github.com/tidyverse/rvest} \item Report bugs at \url{https://github.com/tidyverse/rvest/issues} } } \author{ \strong{Maintainer}: Hadley Wickham \email{hadley@rstudio.com} Other contributors: \itemize{ \item RStudio [copyright holder] } } \keyword{internal} rvest/man/minimal_html.Rd0000644000175000017500000000066313775437157015313 0ustar nileshnilesh% Generated by roxygen2: do not edit by hand % Please edit documentation in R/utils.R \name{minimal_html} \alias{minimal_html} \title{Create an HTML document from inline HTML} \usage{ minimal_html(html, title = "") } \arguments{ \item{html}{HTML contents of page.} \item{title}{Page title (required by HTML spec).} } \description{ Create an HTML document from inline HTML } \examples{ minimal_html("

test

") } \keyword{internal} rvest/man/html_element.Rd0000644000175000017500000000607113776110054015276 0ustar nileshnilesh% Generated by roxygen2: do not edit by hand % Please edit documentation in R/selectors.R \name{html_element} \alias{html_element} \alias{html_elements} \title{Select elements from an HTML document} \usage{ html_element(x, css, xpath) html_elements(x, css, xpath) } \arguments{ \item{x}{Either a document, a node set or a single node.} \item{css, xpath}{Elements to select. Supply one of \code{css} or \code{xpath} depending on whether you want to use a CSS selector or XPath 1.0 expression.} } \value{ \code{html_element()} returns a nodeset the same length as the input. \code{html_elements()} flattens the output so there's no direct way to map the output to the input. } \description{ \code{html_element()} and \code{html_elements()} find HTML element using CSS selectors or XPath expressions. CSS selectors are particularly useful in conjunction with \url{https://selectorgadget.com/}, which makes it very easy to discover the selector you need. } \section{CSS selector support}{ CSS selectors are translated to XPath selectors by the \pkg{selectr} package, which is a port of the python \pkg{cssselect} library, \url{https://pythonhosted.org/cssselect/}. It implements the majority of CSS3 selectors, as described in \url{http://www.w3.org/TR/2011/REC-css3-selectors-20110929/}. The exceptions are listed below: \itemize{ \item Pseudo selectors that require interactivity are ignored: \verb{:hover}, \verb{:active}, \verb{:focus}, \verb{:target}, \verb{:visited}. \item The following pseudo classes don't work with the wild card element, *: \verb{*:first-of-type}, \verb{*:last-of-type}, \verb{*:nth-of-type}, \verb{*:nth-last-of-type}, \verb{*:only-of-type} \item It supports \verb{:contains(text)} \item You can use !=, \verb{[foo!=bar]} is the same as \verb{:not([foo=bar])} \item \verb{:not()} accepts a sequence of simple selectors, not just a single simple selector. 
} } \examples{ html <- minimal_html("

This is a heading

This is a paragraph

This is an important paragraph

") html \%>\% html_element("h1") html \%>\% html_elements("p") html \%>\% html_elements(".important") html \%>\% html_elements("#first") # html_element() vs html_elements() -------------------------------------- html <- minimal_html("
  • C-3PO is a droid that weighs 167 kg
  • R2-D2 is a droid that weighs 96 kg
  • Yoda weighs 66 kg
  • R4-P17 is a droid
") li <- html \%>\% html_elements("li") # When applied to a node set, html_elements() returns all matching elements # beneath any of the inputs, flattening results into a new node set. li \%>\% html_elements("i") # When applied to a node set, html_element() always returns a vector the # same length as the input, using a "missing" element where needed. li \%>\% html_element("i") # and html_text() and html_attr() will return NA li \%>\% html_element("i") \%>\% html_text2() li \%>\% html_element("span") \%>\% html_attr("class") } rvest/man/html_text.Rd0000644000175000017500000000504613775651250014641 0ustar nileshnilesh% Generated by roxygen2: do not edit by hand % Please edit documentation in R/text.R \name{html_text} \alias{html_text} \alias{html_text2} \title{Get element text} \usage{ html_text(x, trim = FALSE) html_text2(x, preserve_nbsp = FALSE) } \arguments{ \item{x}{A document, node, or node set.} \item{trim}{If \code{TRUE} will trim leading and trailing spaces.} \item{preserve_nbsp}{Should non-breaking spaces be preserved? By default, \code{html_text2()} converts to ordinary spaces to ease further computation. When \code{preserve_nbsp} is \code{TRUE}, \verb{ } will appear in strings as \code{"\\ua0"}. This often causes confusion because it prints the same way as \code{" "}.} } \value{ A character vector the same length as \code{x} } \description{ There are two ways to retrieve text from a element: \code{html_text()} and \code{html_text2()}. \code{html_text()} is a thin wrapper around \code{\link[xml2:xml_text]{xml2::xml_text()}} which returns just the raw underlying text. \code{html_text2()} simulates how text looks in a browser, using an approach inspired by JavaScript's \href{https://developer.mozilla.org/en-US/docs/Web/API/HTMLElement/innerText}{innerText()}. Roughly speaking, it converts \verb{
} to \code{"\\n"}, adds blank lines around \verb{

} tags, and lightly formats tabular data. \code{html_text2()} is usually what you want, but it is much slower than \code{html_text()} so for simple applications where performance is important you may want to use \code{html_text()} instead. } \examples{ # To understand the difference between html_text() and html_text2() # take the following html: html <- minimal_html( "

This is a paragraph. This another sentence.
This should start on a new line" ) # html_text() returns the raw underlying text, which includes whitespace # that would be ignored by a browser, and ignores the
html \%>\% html_element("p") \%>\% html_text() \%>\% writeLines() # html_text2() simulates what a browser would display. Non-significant # whitespace is collapsed, and
is turned into a line break html \%>\% html_element("p") \%>\% html_text2() \%>\% writeLines() # By default, html_text2() also converts non-breaking spaces to regular # spaces: html <- minimal_html("

x y

") x1 <- html \%>\% html_element("p") \%>\% html_text() x2 <- html \%>\% html_element("p") \%>\% html_text2() # When printed, non-breaking spaces look exactly like regular spaces x1 x2 # But aren't actually the same: x1 == x2 # Which you can confirm by looking at their underlying binary # representaion: charToRaw(x1) charToRaw(x2) } rvest/man/figures/0000755000175000017500000000000013767413737014010 5ustar nileshnileshrvest/man/figures/lifecycle-defunct.svg0000644000175000017500000000170413767413737020120 0ustar nileshnileshlifecyclelifecycledefunctdefunct rvest/man/figures/logo.png0000644000175000017500000007523013767413737015465 0ustar nileshnileshPNG  IHDRX?gAMA a cHRMz&u0`:pQ<bKGD pHYs!7!73XztIME 3yIDATxwuU=9AAL T,9H^'Y^9uڵ%ٖmɖmYV%Q9f$&]3 H"3`~S]]]]nx+pFK7 |7es1B07A\ - "򙂈3 p+G#^: |1"FDE# 4y67ƈȋ!f7 ]Ǹ  p;PċfW: q,%oxxwyKOs/>"/`AV'_G>-`"/tD^A86w v; {""/TD^`A\\:^ }e &s"/.|9͇g߀ƈ f kƓ٧b?ZOd!"B@DӈĵkOׄ4`JI[*Ew6Қi&%|O (ӌ|4w wԀDK5O EZ3Y.3"_9~*KO="BA^e'mӝўJcJ>yAJ T\w.~7FD>u|0}8ߠv3teM=(.<R'{(!ňħG} x+;}jdltv%}U姙">.? r$[ZkU LU }H8|_=btgd{2<%יeo||/IW:CG*BN{   N,ioyH||,)iKNgH6hЧN%ag\;y:-"#|i@ ASe~ 41`sO9P.85&*Hx x"U{2?ł87!|jBS 5"""0,,Sڒ@w1-*MYf$[| |哖نAg2Eg*Mx}ߙ=Rr)-ްW_"Iw*CڶOD :t2lI:IbrĹz[rJ䗝,N/*E8i5;"e23aTbbn2eG˛s"1zR)rv1o>3U?9! R뼡e *3M)E&B8J1V,; #,% ͍OJ:I:I a=7I{ ˌU+o8#Qd~Aоe~-ik//z4d`Ĥ42pAk Ugt'S4*- Lj VJ,sAO32aҕLK,H!@Ϡ'E=>M|\-~AEII,)i%J$#3W9u}(P3nU#qG|y Uʌ,%lx>e~MvDpe~ɧeg-PZcq5kY/%ei8^Z{."@ܔe.)%|H=vlAŹG̋U>4DBr}_3 Ɛ|}][P:$Rnu/sR2oR/_}߆7+nB㛹/!Ĭs0'8\9^2X-S.[<%>#I 0=GZPG),)P3wgWB"f B)!]u?~KJa ~?Bi5ב nRXM^E5>)jj/;i-;B6'/XDxd~u7W SJ#cuW -:UUZc7fJkL)5ESʆ\3of㦐8ufNVކS>)KHT@z 3GqBA>c [8߈gcѸGzl&\ ?~#gX~)iI2ֿ0X)wE'[sϫ4'ic)ƪEA\.El8-lAL-C7 ؆Ԭ]( MLT}kp f7Պ4pB{ qjXPB_<vcW.\sAr̉W1)I$KfY֌Dₚo݊!9 7XNn=9tF քχ.9alX:sH:dHv1ꖲ07|XS0eba+|! cZ46B?YSRJ݀{CSsvsr Ν_.'_|⹺fndz! 
VLJ*O,1)q Aʛ|ataɰ bV!hX=KJjpwW,?RR\qäZz̟ B>Oa #ȶ{1 25N)F=W<*e&]gAOUg$,ig vh$bҠ{2&B H~c:G"H&1Q>qijٟt3dECw}&̒fx| #p0^"]J.H >\ xsEB1*O̐(,=j0n6|8YT c:1vZZs=ή[p/WkG Sc U:['*GLxe^hZ/,qG>L~?!e~9>mK uAgg‹zf5C-$S}WkPaY %(O7vua-(u)ubc0B+Zw{f.0>'ؔoD[Ʋh,sRָx p)^;a^e~MWnciїLO7,! аBnqf.)_5I.W. tc7=W+\źRDK  n,%U!Eaug |Кj&Co}i2ʎRx:UaŘ7޹^Y=lC6*|1,{c54hlFZτl d'o5s,UYsq; Am 3XH4$N6?Ԟ>HZSE}ϻ) `LMJU$ hA5dBUUA[0m^ _¬UY,NW[ĚLoێQ5IäND .] ͤW0y[W/^ eD6DIVs\k G⣞#d~ %'ѭg#{q֥3taj!G@'*f 9B PZQfPIll)7b֪a\z1n[Ԕm$vS8:L4k^zGh1ϧb9řQJx IfIRD54͖u9m:ـ<w +J;F4!, ZËV=Z JT["}+^k #$J~n˗[~h]}ww9~FTlj IMM$Q\xi۔nm%OsXĊ-֯BZ I8<+0S~-g.ti:_N&~K䫜AV ?>I!X* VKVqjfo3!FI5JO2_IZ2 =n@J:M1LPUFRNBMtVX]d0s9 4n>ҡC(ǡ00MT*4 a mRBUZB\dY&^Xz>rJd"Au|8EB -B~~h@:.H^!9iz OY, `\Oe,'N!Y# kG;sdl@ IJc=W>Hvwt]})$6bd?NccrĚ[(iriJUY {MMD:[*b(_)_Ȧ_FexgQ:tKճ>% 7kW\i˅*a']b n$BWj|3jzQ=<eLxRJp-P1$X4F"AˆxRk֬FHImr jny{թ)J+<84eA*HmTGGwvioG>(ٷʕx">Ҳ,+26b77c3)>EL%RV$_3ygzL5 qAt;尋&t *w3HiݰJkd@S[VRwu9`&f"NypDGQ\iYfrVRǫT7}Es t:jL#,01p q)e(R;.BJax/hEފ3=͐et%׳@ݳ:}5Azh! ?zĉuvyN$#˜d/v5S}~$54Sػ$,A 0 2!MsYT8ի+ZQ,MNbeX V.֧0knt^BYhjGsP[;IQ&҂W*S+yq(ui;w32 Z:2ў\YIޓu7 84v~b@.C)q(qZ1l #d3fԪz{UwiZcADDG/l^Wk=3L%Ivtֶ .But !$eKqQb4X$Ҍ︸~)|ۆv*I!\ukj5Ts35AV=o-ZcBՍ `0X! 2'pVI}`0i:@<όN]t>.{4@6#H^{M!dPAETz˅LjHtuRÎǂV-5N>O!.8;U(ecLCuZs9Z iX4+%)Qصi֬aJlqдnv.Ѓ+Mt^v)ձqX\Lu-imqH$l5d0_ꮑ+|U3ЅR@W(ǯTSIK{NNW !:/ڂ_.Ӳm X Tq{dg'Aut0H? 
X"Jm|5Wal X,>ƞJi^|Z)&L "V@34kע5( {\#u jtѪL4VUb_qW1,=& G?nXX%8uMm2N)ߏN\C.e1b6۶Ӽq^@30:J#||Z8=^*MMd/'InZDexiTɮ^liW +!$CAܦ D\Ka^jg){Y亻Yvfɮ\iۍgֻ;H 5shu_dI-I0ԴZ*!Ĉ03|ϣ5`$ox{;4tԒ^z"zh]brvW13֯#t !LXs3S/Dof"IA Fl+W26ʃC$ $k5ʃL<,_ƒwLҴ,j΋/Bcu~qE9ڃ,iMNmgs<\[0.s,*\a Na75a&i|#O]{CZɮ.e69IlY?ϣ\ukq E:/+2i 0:t^z1knfH\Av2>anw/^Bq [v YbM$CwCM;FutX.GE[M5U~p1t=;\c<'*,LITԄ4@1JHPfhZߣZȇxNrG^XhNnR¦@]j@oO0 #`Z߄_vfD^"FdOhpK%ן4J 2+W -L2Mk2prk M#gzʃ!W.+oR/qu61 B[ԋ/UHZqK} ҫVa[gzÎ1s>]o}3m5379Bya!f;@P%XJQAZe˨v޹$z{ZQth(MW+8SӁNܶ"H2q8Sع&Ɵy#˘zq;m\sEHomnq;p EV<ߧg/&&Һ;}X4at(IyhF.4={I-YBedܪŪ< jv\I?DCk\=d&C'L&)Gy>tBD{Chɭ]Qػ`f2x8;0b1)t]~V:CjbMM${+e5aT7wa0ȭZIud年lÏ0#ձ /#6>Nҥvƞz{r}ej7߂:Ac:_zHi>k.c&'aSTFhp6aKbr߳DGL=hbxhw\o ui!h:A,Ɩ2M!>e)W*$ȿ-RoAijϑYOBR%=h)IXNmt 1I%? N$v=:5@ypukI-mSG4*u"-:a=ÞP $qy--&'&049owLAet8&hDMN2ItnZ)ك}òpm%߇[(\x[+6К=$m :GW]E?Ŀ ?(B]V"GqA {`6eKZ3p8ǭT:SJm,gh_ʣ]d.XY嗧.z],|gziZ͛I <4Lzrd&dG( %r%ز Χ61LRZ+&+eM?tW^MOE/^DC߁, ɐ<0R,}HtcVXX S`X} 4oTm63 =+) '݅aY/g(q8s \Gݭ^?dX GZcTiZ"x#${zPs3;v {T*8SLxx{TFF󦫉2MMT( 4IKQ|9▅֐v +tv⭭FG.-К唇-Mא Jҡ>T0\>}73]>|:r:ܹ@L)d{7\KGq4LgjDgU+Q*Av 2*##h CAi|#ߏ͂ADwюNQ*4+MN2c C 4<Ɵ~x{S/ ۋ0Mn3ET K\$N\} x;tħ3TBp=D;#Hf&L*K:v 0b r-6beDh -fm;؁Rg,y[Ibf3v쥽ě[P\&aYd^OU"m矇W.3#^~[.d0)+W2p]''1yVOoz#pGJɝP -{FWZk{Q"ڂ1쳴oD2N_®ؙ,Ȯ^|JѱBvt CdV ڍՈkioxf*IU ?F&հS)cعmS8}׿yhgr yt\|ndzĭK:jVhL'RXAf9bӖt]=TG8t=t_q1\DW*hټ Rt ܾ+eq E=A1c43f:EE[ Yk13iL5{^2:/Ju>x8MgoFa0?o\WE^{v).QuдXuV.(GĊÙ"Avy?1DT2Dy5,<0s3̳_x-7PF [Ï0x$S/ ԄVTFFIttp;q >b?q#2zhSTGXw^ 9GwuDNiQ_ uf-`f:NYz5c">,m/rua%h^]fԦWʴJ`jr= ?( 1’3~5w|]V|9;'%GnFQ f[r2N1Vuk !r2#q%*{#ށ_aSع,0ɭ]IR%ByU**,~J֮w/?NӴ,| +e bj<T11=_5| m&6" B2K_uXַ۶/iq3T# O)tL}ԦDLYо"N-AE&MWQmr's0S)cdW,gm4ɮ.]vzzعl = oL~&}mɭ[39Eq>b1Stv0ӔH\Lp6 ;H(?tliwmtj5Zs7RV~eRKbXX$ e,_^pazjc&'hZ&9]_7qu/"'FTGeY|>{GKʟu =d6KaLgd(~LexΝ\s5W\즍t@4<' T \FهH=XcL4("l)f2InFxI('&MNR%f5%TS_B9*oOX-$/=cZ)%xן!_leh @[QCzxm[.J$Htvw}EyоGs)0w_8d9\zeɧm͛yeѼ~=Ziǩay!|Ls3F"NetXSiZ6# 7_͌<8{}{a*L"+"Ldv?u_/#|{U?yB) 4!iFu)#G>vﻟ}߻ Vc$q0(B0ZI{MNa75KwJXяFV*gR'kDyGD\)~/c{J~gffI.S}^؆W.#- +eG1Z/lgE/T84~*;ڙ~!mmxarV1d#La[8G40:;'v-ND54 
Rd׭ů9(ϣ㒋)݋J9<~zfmA+0#rD>N(`4xǁ=oYgk.^=nMg3)$QR5VFxRr$;:ОKet׃}x6rWqλвq^jxvSoo?C>x쪕XՑQX$ξ}(EZcE`t,NOc~!阜==KEiqi6k֬܋/aåI.T P`ŗh;<JIZ%,X 3`z˔#Ks}q {k(Ku| /`1FH]x>ڶMN줰g/]^ ,{qڶQݝ'&I\Πj5|awbc6V&7IQ=X A|XGX' lOqijzV`i2Yr%=ݳ;v:E"dPNAvJ{kutRzEC$32}?4 h:kz^n쪕á[=m;+.G&(*WHtuRǶ>Z 8+BH̲e ?0V*Pf{i{=+(03i.^z߹]1ljn+--AP"v-qGFB`&>k(u6,?e؜|R𲛚pV_ UQسez'f<[*!B Aydqf1 F|QZ#FFar4pHF;^,N9Gqh+"߇W.SoVka$&1LdhlX "rDgԻoͰ({?wy'F,Fu>U+Cy۶kmA&!{ Skid,<*C?"aa`yW^z8SS$Ïhii!kmeF2)ًW*߽w]Gu`qԎHB6^_~CJqpC4RIR}}&&E$Ӄ30ͷzf]-eh)A+_`(=Bbr~.,^sJ30| %lO2\ n<'%-FkMs( ];=0W)i;\èC~Na{2逸hR )x*=OP@X|ɻ!أQn񉠂J / ۈ%m Ab1m#%"WM`K(SEDyϮU+x>kWp磏1>9InUЇyH?Tbjv֬ Iv*n<&Y\G&v:3=}]t>j.//P{yҘ!jRJ(՗v2ڙ# TBIJ0,܉Qzx!Q{[:k^޽~#OܺGCu/qGhLsY*##${Itv0};34RL%JF"Na+V0ShҞ޵ `rx R"f=B8W)t2\- ZG ygf{S|Q/t5?Y70Ī|01]:6ԋ/[]d0H"RM9bXs3udk+H!įTkA{*1k,2KIQ'5 ݆MB'Du!!hsZ֭]{׌wr))%D8)`'H/[ @Ypy4iW+؆Ԅ,apE \mA ]zeAdtLY _cRмiD.7f~ӦW}S?jA@ԂrE%ßF*(%Aa^mm4o8aNG(E2 n-/lL$ Z-\i~B>Bt ET&5%P|˛IVkthixQњlG}ZXI c Eh!ag!%gEy!FcBar*/HA0Ma݉Z)D KWPmgQCRQ;<(3\H!}6~ilŘi?ZXw:rF ` g{qV*'p*ux7ؕ} / p]V t~]/]zq;FDybB@:ǟibiˤ![!J.%0W%v~`eBᎃhD"e\o=UϥRj-t pRZqgkᅣ<;߈<+!8г3a_M` pU첬` Ѕ6MDxB)*? 
RZ?^!,$\n1{V ~⬏|l{#6f}gDGXB0<4H8}|'Nkj"Q5 {`3 -p 5CX[Vc- p}%Vח#B&F*.QsJ1x8x?8ul:u G$\'"˲bJ0 ;vPT(&5\,B#A;5ڙ5 Hі@ 3Z׳!i- u@h(}ت6>Ҳ0):D8.B0?jo~StvtR} D7yR,r/p%4vmlᏸLFeΎyq{z8 PZ380ҥKY_AE[@@>tXZ`/蝥Xm@jh۞M2!i65c=#ֶq^֖?'\}u Fi)GDy2Bf:w܉֚Z}rCCìYZʊq.Uk T^5'xf{AY WBJjXpFݽ@Yf?uDg~KN ~#")VIM{Jiy36nR059ɖ.o} ַr?Fm8888׎e8XYaH!6:hy^kdT[c}%g|ԪUL Zކ_s0+uӊ&7WZfI&݅T񛟢CXtK,ey|G~ 45GG%")Z+[ ۷kגfFGGjL&d2\e K,g#ᬫb2<H}4s6r'>~Yt)/<apwϟBUμX ,tX t I007㿺>oZM>_@j,룷!aY/nΝwKzٰq#]]L ~y'{^z_ y>zy?UC H47Ѵt 6m͛i8x_|d2ɖ-[}jя3CŽ >3d xQ|%[~O~ZƮ;)kr<hYf5 cuLkXb--HӤZR.Qiۻ^|6/cڵgĺuϷmnHykpH JIG>/گw.FF}a -e}V$qDC"֖Vʕ2m.yJJ)ضM: ۶׾{`2O<5+.0 ):>^/֧9<r{ab|\.G.c:GH%fuVL&C>gxxX̦uH$AP\.fH$^Ԇ5͒Jbh bq !mb8q=pW[vلR c'ȅ>8$Vn@haXM> .@JJ)8Jݻw绎KTuRE<}Z+$c12Rή.\ell~ngӂ -",,D^@Zӗko{;[GYv&''`rri* mS*V*!X+|0$J *ÒK]i{9nf}65Bv0 4liNarx˿>5׿믿j:xNYt)===T*)xm457J&T*$S)ccaY+.H^tףR8|+ B VOU`- D^@ \C{Yi>,mmeY66=K۳SOSOkKz'tr,泥vi50e\B*VBRpPLV1sh ZqTi3 p\bBJ0) R{'͈'&3YT\E Ḙ=:<ӷ>FGi` (CVBwh n@J5dB[G<[2 E"/@&AdjA@N +=?ICam(ƨTJ7=7^!;Q>pL%dcvZmF Qk C}Ocz^  *UK;0m/K{a91I堇K;JTygK{ِBĞ}`RZCv6 GHF HNP'LNp 3?c7sϞh"/GNYquϽfn?f=wf~QEX8nN/ƩFKT'G8=}{Y--n ) #zH\rĖ Vnt٢CD0iOzˌQ }BbQ9AdL 's8.} AM17* D>V8laeᏌ⏏8 _f*W{V a,2ChP"jrU,a rMMd2.QPW[GQ<|#"Ak?AD<?>NBJ5ֲd>>\I_'3y:;vR)El|۬ |T|x6II^yȖfmҨr ǞtԶmo܄^8D"A I-7mhoGcAJotgۋx#O~;mXGJU+gӿ4 807FgOw#"Akd.G?U(r;YouɼX}={<(_R ~cV𣳭ֈXIYBl-XKx ٟ 6S_7W?&V,7~[ތL%_&̞nl"Qy1?[cr-]_\K/QsQ%QcЕ a|:O_AkCgfs>Vi$.xSP}ÄS~ǁA^y8E5;i_1FG=Vo{?:yHKA]6'#WEd3{~SXK~mua Z@IMWSFkF)z;*XiL%i"AjE)qQ22%a=鷼{ݚ ׽U,2VZ#S)ZS||3R!0 .bg\6Jw݃_," l wmt7:vTb1__Bg'W$};oH[;ݳC瞧 $. n0MR\į!AJ7+鏒 r/SF#tMZ>+d>yT}·n n 7ezՍמɿ'Yb H`ZIRF-Կ4?%y}֞Z#`zqf#"1@ji47}{*7DH RnĖ HlgV([fm."yFLSE&>OA/gxx1eIm1lr?QwB W"b{koc?1}kǡ ^6+㮦P,33Ws31K^g(0{)Hmhe?3*GaGug&$yຸN ^Q>b;ߎǃ@Dӂ(up[|&^9v&+=+ .d,FMW5ٽѕRSOv B[V+؆gPRM9\AJJ7J}?gS55t~Ry! 
߻ʣ㏎;©BD?6,ykE)JE?f 2j=Z}ˈwncv y8Tf{'_p^ 4DhO!?UU&IRo}3k2śoKi%N[ B;RR۾&ɫDf2Ķ͎m_zc[g2D$sk3 귨<+,bgӿAϿ3M?t:DqETD;c[dCwY&$j+Z@Tߦu`pttROII'Oy{>+gVwm ȥGD>BPAڮfH^y' ^Ǜ; fnA'^BMNxg BRo sÇfdnW$TT`C8v|Bx5…UQ0o}LM~HZu3}u~ Kan65A{bgG>2!ҮK[PG"*3`Y^~xCXf#2?~"9-l;Xjz'BJO>6Lr쵫kG{zsS;/02S_ǟj%2։ O5tݳ6'.قˑ2@λGjQyYJ){JW]OqXJ|J=oN+ |$-njZS@^M$.͟t۝NF!Gٻ6ˢ?G|vP#={Tfhi>Ff#"aIuak%3i75m'⋯|oV6:,c(sԯPpԦzGC&fdPX1c㏎zMlWt ڈ|*~;߁TViߧxru淩P$9k~E-t@nLb_G#hk{~|WV,)Az͵GI^:OѥZ >|3>lᛉ+ZzUDH B*?`[pݻʃu;eU@ gT| gZbf%kVc^̤~ÍrM{ލn OPRaX}HW]1;8;wQ釯<0珍a6K^u_G t5;1ވ8& bs))3Q^c\)%էf7qBa6n qCaf i# ^s66n0foޠZ;u&+収2}ARE7>]bg%va{S_` _A7W&b)Q+xe:)v+˨Rҭ_-T{_$_ڬ"^TB&*N62 ze$TyO j/R׌eqמ~eܐk=@ʹVx@\700M>}A': hjBMN ]A x?H' iY3$hCJxԞ}M?d_[cwSAa#h$Ʃ>S_w&pUocՅ"AJ-dޭ]5=Mmۋn U(.t̀V>c[E|ur=@)$BbrӜ})5g@0kd]bvu`tt`d`ຨR r l r*( J?V*ށՁljBhSxCxAcZ_?HI* =!ӊrlVwW.w9d&3BЅAN$o "/*Ȱ=Gh8꾄ҝ hޯ@(\DǹSZ1g΄Ip8.Kx4#ȭ^h$2!Wk(Ҋ)9\͸!׼D.M\̈"̂ja{@ {L;?_])'s0>ny"Z1ƹ'灿 2n5|W'ThJh,vGFB;ԊgI˃OwUf/,)]v.iUH:$@(qǹ%&NG쁣w{O1h ]Hڅ&r#ǹ#Z1'E\x7Z:qg~+0ǁ_O t IQYf3 4. UNUrH?Uq7 |h:7`)h ",6Q^kk⤬(iщ;^3l#xGX(֊$ Y!5[q} "sG 2;MH,"kaa#(Ԍ ֭Lh1 auLĝy<Dž#ezw_o L8Dq"\-e`x[? " <ׁٟ&XjtVGXǹB1ƹ7t_o<^< "ǀ_ vutqd#.T8w\t!| q}ĭcNnu Oh!ވs ;O ;X Gy%O'HTbe~u>7y/ĝy#c'P<0"b<>O %qJp $z"Y! eiԊQp".pj[>G-~ xl1)*=-F8U82SNp{%-F#Nߩl1kBj-d~ uD@7w_2SyEGh Iד<>D?~UK%/'-;UXx&nOʹȃ13gYf=FaYMB`47arثWa-]B;Q"jl :bIM9p[$HR C@S.AϭllX+neF]!L;߆k7]O|;fqx;Ak e~~Cw ?*qH̰#` 6n:ՉGSI~.ثWR{]J{Wݽغ5Tzxa}Z1IvoIb7;1ɤG1vqB wD@DJuP, b7;{=էݻ{ޅsw)蹒}3HwŃ3R"@W*d?!bg_pS_2M0{G)w?5)FG2lj3Fuf ⡛쏏7ϓrJw܍Uj.2S'E-[r_yO8-E)I>8oߊgRH淈E5ΎI-oē#={öo#\5֊娱q|p[$[|pJeJQ0+eO+"f 5061Y0_ӡi~ & |ߙ 3,2$OduڞFT}cD܅ s-[<e~%hk("/P̗l"/p̵l5!"/̵lq""FDE? 
@DE#/}ǸJ$Py#"/bl1a|dUן!Q|(_!P@}CD3 GK 2"Yg5G+%tEXtdate:create2019-01-11T14:00:28-06:00p%tEXtdate:modify2017-11-21T00:26:08-06:003X[IENDB`rvest/man/figures/lifecycle-archived.svg0000644000175000017500000000170713767413737020260 0ustar nileshnilesh lifecyclelifecyclearchivedarchived rvest/man/figures/lifecycle-stable.svg0000644000175000017500000000167413767413737017750 0ustar nileshnileshlifecyclelifecyclestablestable rvest/man/figures/lifecycle-questioning.svg0000644000175000017500000000171413767413737021036 0ustar nileshnileshlifecyclelifecyclequestioningquestioning rvest/man/figures/lifecycle-experimental.svg0000644000175000017500000000171613767413737021170 0ustar nileshnileshlifecyclelifecycleexperimentalexperimental rvest/man/figures/lifecycle-maturing.svg0000644000175000017500000000170613767413737020320 0ustar nileshnileshlifecyclelifecyclematuringmaturing rvest/man/figures/lifecycle-superseded.svg0000644000175000017500000000171313767413737020633 0ustar nileshnilesh lifecyclelifecyclesupersededsuperseded rvest/man/figures/lifecycle-deprecated.svg0000644000175000017500000000171213767413737020567 0ustar nileshnileshlifecyclelifecycledeprecateddeprecated rvest/man/html_children.Rd0000644000175000017500000000126213776122153015435 0ustar nileshnilesh% Generated by roxygen2: do not edit by hand % Please edit documentation in R/html.R \name{html_children} \alias{html_children} \title{Get element children} \usage{ html_children(x) } \arguments{ \item{x}{A document (from \code{\link[=read_html]{read_html()}}), node set (from \code{\link[=html_elements]{html_elements()}}), node (from \code{\link[=html_element]{html_element()}}), or session (from \code{\link[=session]{session()}}).} } \description{ Get element children } \examples{ html <- minimal_html("
  • 1
  • 2
  • 3
") ul <- html_elements(html, "ul") html_children(ul) html <- minimal_html("

Hello Hadley!") p <- html_elements(html, "p") html_children(p) } rvest/man/rename.Rd0000644000175000017500000000325614014035320014056 0ustar nileshnilesh% Generated by roxygen2: do not edit by hand % Please edit documentation in R/rename.R \name{rename} \alias{set_values} \alias{submit_form} \alias{xml_tag} \alias{xml_node} \alias{xml_nodes} \alias{html_nodes} \alias{html_node} \alias{back} \alias{forward} \alias{jump_to} \alias{follow_link} \alias{html_session} \title{Functions renamed in rvest 1.0.0} \usage{ set_values(form, ...) submit_form(session, form, submit = NULL, ...) xml_tag(x) xml_node(...) xml_nodes(...) html_nodes(...) html_node(...) back(x) forward(x) jump_to(x, url, ...) follow_link(x, ...) html_session(url, ...) } \description{ \ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#deprecated}{\figure{lifecycle-deprecated.svg}{options: alt='[Deprecated]'}}}{\strong{[Deprecated]}} rvest 1.0.0 renamed a number of functions to ensure that every function has a common prefix, matching tidyverse conventions that emerged since rvest was first created. \itemize{ \item \code{set_values()} -> \code{html_form_set()} \item \code{submit_form()} -> \code{session_submit()} \item \code{xml_tag()} -> \code{html_name()} \item \code{xml_node()} & \code{html_node()} -> \code{html_element()} \item \code{xml_nodes()} & \code{html_nodes()} -> \code{html_elements()} } (\code{html_node()} and \code{html_nodes()} are only superseded because they're so widely used.) 
Additionally all session related functions gained a common prefix: \itemize{ \item \code{html_session()} -> \code{session()} \item \code{forward()} -> \code{session_forward()} \item \code{back()} -> \code{session_back()} \item \code{jump_to()} -> \code{session_jump_to()} \item \code{follow_link()} -> \code{session_follow_link()} } } \keyword{internal} rvest/man/session.Rd0000644000175000017500000000516113775650464014320 0ustar nileshnilesh% Generated by roxygen2: do not edit by hand % Please edit documentation in R/session.R \name{session} \alias{session} \alias{is.session} \alias{session_jump_to} \alias{session_follow_link} \alias{session_back} \alias{session_forward} \alias{session_history} \alias{session_submit} \title{Simulate a session in web browser} \usage{ session(url, ...) is.session(x) session_jump_to(x, url, ...) session_follow_link(x, i, css, xpath, ...) session_back(x) session_forward(x) session_history(x) session_submit(x, form, submit = NULL, ...) } \arguments{ \item{url}{A URL, either relative or absolute, to navigate to.} \item{...}{Any additional httr config to use throughout the session.} \item{x}{A session.} \item{i}{A integer to select the ith link or a string to match the first link containing that text (case sensitive).} \item{css}{Elements to select. Supply one of \code{css} or \code{xpath} depending on whether you want to use a CSS selector or XPath 1.0 expression.} \item{xpath}{Elements to select. Supply one of \code{css} or \code{xpath} depending on whether you want to use a CSS selector or XPath 1.0 expression.} \item{form}{An \link{html_form} to submit} \item{submit}{Which button should be used to submit the form? \itemize{ \item \code{NULL}, the default, uses the first button. \item A string selects a button by its name. \item A number selects a button using its relative position. }} } \description{ This set of functions allows you to simulate a user interacting with a website, using forms and navigating from page to page. 
\itemize{ \item Create a session with \code{session(url)} \item Navigate to a specified url with \code{session_jump_to()}, or follow a link on the page with \code{session_follow_link()}. \item Submit an \link{html_form} with \code{session_submit()}. \item View the history with \code{session_history()} and navigate back and forward with \code{session_back()} and \code{session_forward()}. \item Extract page contents with \code{\link[=html_element]{html_element()}} and \code{\link[=html_elements]{html_elements()}}, or get the complete HTML document with \code{\link[=read_html]{read_html()}}. \item Inspect the HTTP response with \code{\link[httr:cookies]{httr::cookies()}}, \code{\link[httr:headers]{httr::headers()}}, and \code{\link[httr:status_code]{httr::status_code()}}. } } \examples{ s <- session("http://hadley.nz") s \%>\% session_jump_to("hadley-wickham.jpg") \%>\% session_jump_to("/") \%>\% session_history() s \%>\% session_jump_to("hadley-wickham.jpg") \%>\% session_back() \%>\% session_history() \donttest{ s \%>\% session_follow_link(css = "p a") \%>\% html_elements("p") } } rvest/man/html_name.Rd0000644000175000017500000000125613776122153014570 0ustar nileshnilesh% Generated by roxygen2: do not edit by hand % Please edit documentation in R/html.R \name{html_name} \alias{html_name} \title{Get element name} \usage{ html_name(x) } \arguments{ \item{x}{A document (from \code{\link[=read_html]{read_html()}}), node set (from \code{\link[=html_elements]{html_elements()}}), node (from \code{\link[=html_element]{html_element()}}), or session (from \code{\link[=session]{session()}}).} } \value{ A character vector the same length as \code{x} } \description{ Get element name } \examples{ url <- "https://rvest.tidyverse.org/articles/starwars.html" html <- read_html(url) html \%>\% html_element("div") \%>\% html_children() \%>\% html_name() } rvest/vignettes/0000755000175000017500000000000014132343550013557 5ustar 
nileshnileshrvest/vignettes/rvest.Rmd0000644000175000017500000002702214101012300015347 0ustar nileshnilesh--- title: "Web scraping 101" output: rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{Web scraping 101} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- ```{r, echo=FALSE} knitr::opts_chunk$set(comment = "#>", collapse = TRUE) ``` This vignette introduces you to the basics of web scraping with rvest. You'll first learn the basics of HTML and how to use CSS selectors to refer to specific elements, then you'll learn how to use rvest functions to get data out of HTML and into R. ```{r} library(rvest) ``` ## HTML basics HTML stands for "HyperText Markup Language" and looks like this: ``` {.html} Page title

A heading

Some text & some bold text.

``` HTML has a hierarchical structure formed by **elements** which consist of a start tag (e.g. ``), optional **attributes** (`id='first'`), an end tag[^1] (like ``), and **contents** (everything in between the start and end tag). [^1]: A number of tags (including `

` and `

  • )` don't require end tags, but I think it's best to include them because it makes seeing the structure of the HTML a little easier. Since `<` and `>` are used for start and end tags, you can't write them directly. Instead you have to use the HTML **escapes** `&gt;` (greater than) and `&lt;` (less than). And since those escapes use `&`, if you want a literal ampersand you have to escape it as `&amp;`. There is a wide range of possible HTML escapes but you don't need to worry about them too much because rvest automatically handles them for you. ### Elements All up, there are over 100 HTML elements. Some of the most important are: - Every HTML page must be in an `<html>` element, and it must have two children: `<head>`, which contains document metadata like the page title, and `<body>`, which contains the content you see in the browser. - Block tags like `

    ` (heading 1), `

    ` (paragraph), and `

      ` (ordered list) form the overall structure of the page. - Inline tags like `<b>` (bold), `<i>` (italics), and `<a>` (links) format text inside block tags. If you encounter a tag that you've never seen before, you can find out what it does with a little googling. I recommend the [MDN Web Docs](https://developer.mozilla.org/en-US/docs/Web/HTML) which are produced by Mozilla, the company that makes the Firefox web browser. ### Contents Most elements can have content in between their start and end tags. This content can either be text or more elements. For example, the following HTML contains a paragraph of text, with one word in bold. ```{=html}

      Hi! My name is Hadley.

      ``` The **children** of a node refers only to elements, so the `

      ` element above has one child, the `<b>` element. The `<b>` element has no children, but it does have contents (the text "name"). Some elements, like `<img>`, can't have children. These elements depend solely on attributes for their behavior. ### Attributes Tags can have named **attributes** which look like `name1='value1' name2='value2'`. Two of the most important attributes are `id` and `class`, which are used in conjunction with CSS (Cascading Style Sheets) to control the visual appearance of the page. These are often useful when scraping data off a page. ## Reading HTML with rvest You'll usually start the scraping process with `read_html()`. This returns an `xml_document`[^2] object which you'll then manipulate using rvest functions: [^2]: This class comes from the [xml2](https://xml2.r-lib.org) package. xml2 is a low-level package that rvest builds on top of. ```{r} html <- read_html("http://rvest.tidyverse.org/") class(html) ``` For examples and experimentation, rvest also includes a function that lets you create an `xml_document` from literal HTML: ```{r} html <- minimal_html("

      This is a paragraph

      • This is a bulleted list
      ") html ``` Regardless of how you get the HTML, you'll need some way to identify the elements that contain the data you care about. rvest provides two options: CSS selectors and XPath expressions. Here I'll focus on CSS selectors because they're simpler but still sufficiently powerful for most scraping tasks. ## CSS selectors CSS is short for cascading style sheets, and is a tool for defining the visual styling of HTML documents. CSS includes a miniature language for selecting elements on a page called **CSS selectors**. CSS selectors define patterns for locating HTML elements, and are useful for scraping because they provide a concise way of describing which elements you want to extract. CSS selectors can be quite complex, but fortunately you only need the simplest for rvest, because you can also write R code for more complicated situations. The four most important selectors are: - `p`: selects all `

      ` elements. - `.title`: selects all elements with `class` "title". - `p.special`: selects all `

      ` elements with `class` "special". - `#title`: selects the element with the `id` attribute that equals "title". Id attributes must be unique within a document, so this will only ever select a single element. If you want to learn more CSS selectors I recommend starting with the fun [CSS Diner](https://flukeout.github.io/) tutorial and then referring to the [MDN web docs](https://developer.mozilla.org/en-US/docs/Web/CSS/CSS_Selectors). Let's try out the most important selectors with a simple example: ```{r} html <- minimal_html("

      This is a heading

      This is a paragraph

      This is an important paragraph

      ") ``` In rvest you can extract a single element with `html_element()` or all matching elements with `html_elements()`. Both functions take a document[^3] and a css selector: [^3]: Or another element, more on that shortly. ```{r} html %>% html_element("h1") html %>% html_elements("p") html %>% html_elements(".important") html %>% html_elements("#first") ``` Selectors can also be combined in various ways using **combinators**. The most important combinator is " ", the **descendant** combinator. For example, `p a` selects all `
      ` elements that are a child of a `

      ` element. If you don't know exactly what selector you need, I highly recommend using [SelectorGadget](https://rvest.tidyverse.org/articles/selectorgadget.html), which lets you automatically generate the selector you need by supplying positive and negative examples in the browser. ## Extracting data Now that you've got the elements you care about, you'll need to get data out of them. You'll usually get the data from either the text contents or an attribute. But, sometimes (if you're lucky!), the data you need will be in an HTML table. ### Text Use `html_text2()` to extract the plain text contents of an HTML element: ```{r} html <- minimal_html("

      1. apple & pear
      2. banana
      3. pineapple
      ") html %>% html_elements("li") %>% html_text2() ``` Note that the escaped ampersand is automatically converted to `&`; you'll only ever see HTML escapes in the source HTML, not in the data returned by rvest. You might wonder why I used `html_text2()`, since it seems to give the same result as `html_text()`: ```{r} html %>% html_elements("li") %>% html_text() ``` The main difference is how the two functions handle white space. In HTML, white space is largely ignored, and it's the structure of the elements that defines how text is laid out. `html_text2()` does its best to follow the same rules, giving you something similar to what you'd see in the browser. Take this example which contains a bunch of white space that HTML ignores. ```{r} html <- minimal_html("

      This is a paragraph.

      This is another paragraph. It has two sentences.

      ") ``` `html_text2()` gives you what you expect: two paragraphs of text separated by a blank line. ```{r} html %>% html_element("body") %>% html_text2() %>% cat() ``` Whereas `html_text()` returns the garbled raw underlying text: ```{r} html %>% html_element("body") %>% html_text() %>% cat() ``` ### Attributes Attributes are used to record the destination of links (the `href` attribute of `
      ` elements) and the source of images (the `src` attribute of the `<img>` element): ```{r} html <- minimal_html("

      cats

      ") ``` The value of an attribute can be retrieved with `html_attr()`: ```{r} html %>% html_elements("a") %>% html_attr("href") html %>% html_elements("img") %>% html_attr("src") ``` Note that `html_attr()` always returns a string, so you may need to post-process with `as.integer()`/`readr::parse_integer()` or similar. ```{r} html %>% html_elements("img") %>% html_attr("width") html %>% html_elements("img") %>% html_attr("width") %>% as.integer() ``` ### Tables HTML tables are composed four main elements: ``, `` (table row), ` i <- length(values) + 1 length(values) <- height while (length(dw$col) > 0) { vals <- rep(NA_character_, width) for (col in dw$col) { cell <- dw_find(dw, col) vals[col:(col + cell$colspan - 1L)] <- cell$text } values[[i]] <- vals i <- i + 1 dw <- dw_prune(dw) } values <- lapply(values, `[`, seq_len(width)) matrix(unlist(values), ncol = width, byrow = TRUE) } dw_find <- function(dw, col) { match <- col == dw$col list( col = dw$col[match], rowspan = dw$rowspan[match], colspan = dw$colspan[match], text = dw$text[match] ) } dw_init <- function() { list( col = integer(), rowspan = integer(), colspan = integer(), text = character() ) } dw_add <- function(dw, col, rowspan, colspan, text) { dw$col <- c(dw$col, col) dw$text <- c(dw$text, text) dw$rowspan <- c(dw$rowspan, rowspan) dw$colspan <- c(dw$colspan, colspan) dw } dw_prune <- function(dw) { dw$rowspan <- dw$rowspan - 1L keep <- dw$rowspan > 0L dw$col <- dw$col[keep] dw$text <- dw$text[keep] dw$rowspan <- dw$rowspan[keep] dw$colspan <- dw$colspan[keep] dw } rvest/R/selectors.R0000644000175000017500000000741113776110052014103 0ustar nileshnilesh#' Select elements from an HTML document #' #' `html_element()` and `html_elements()` find HTML element using CSS selectors #' or XPath expressions. CSS selectors are particularly useful in conjunction #' with , which makes it very easy to discover the #' selector you need. 
#' #' @section CSS selector support: #' #' CSS selectors are translated to XPath selectors by the \pkg{selectr} #' package, which is a port of the python \pkg{cssselect} library, #' . #' #' It implements the majority of CSS3 selectors, as described in #' . The #' exceptions are listed below: #' #' * Pseudo selectors that require interactivity are ignored: #' `:hover`, `:active`, `:focus`, `:target`, `:visited`. #' * The following pseudo classes don't work with the wild card element, *: #' `*:first-of-type`, `*:last-of-type`, `*:nth-of-type`, #' `*:nth-last-of-type`, `*:only-of-type` #' * It supports `:contains(text)` #' * You can use !=, `[foo!=bar]` is the same as `:not([foo=bar])` #' * `:not()` accepts a sequence of simple selectors, not just a single #' simple selector. #' #' @param x Either a document, a node set or a single node. #' @param css,xpath Elements to select. Supply one of `css` or `xpath` #' depending on whether you want to use a CSS selector or XPath 1.0 #' expression. #' @returns `html_element()` returns a nodeset the same length as the input. #' `html_elements()` flattens the output so there's no direct way to map #' the output to the input. #' @export #' @examples #' html <- minimal_html(" #'

      This is a heading

      #'

      This is a paragraph

      #'

      This is an important paragraph

      #' ") #' #' html %>% html_element("h1") #' html %>% html_elements("p") #' html %>% html_elements(".important") #' html %>% html_elements("#first") #' #' # html_element() vs html_elements() -------------------------------------- #' html <- minimal_html(" #'
        #'
      • C-3PO is a droid that weighs 167 kg
      • #'
      • R2-D2 is a droid that weighs 96 kg
      • #'
      • Yoda weighs 66 kg
      • #'
      • R4-P17 is a droid
      • #'
      #' ") #' li <- html %>% html_elements("li") #' #' # When applied to a node set, html_elements() returns all matching elements #' # beneath any of the inputs, flattening results into a new node set. #' li %>% html_elements("i") #' #' # When applied to a node set, html_element() always returns a vector the #' # same length as the input, using a "missing" element where needed. #' li %>% html_element("i") #' # and html_text() and html_attr() will return NA #' li %>% html_element("i") %>% html_text2() #' li %>% html_element("span") %>% html_attr("class") html_element <- function(x, css, xpath) { UseMethod("html_element") } #' @export #' @rdname html_element html_elements <- function(x, css, xpath) { UseMethod("html_elements") } #' @export html_elements.default <- function(x, css, xpath) { xml2::xml_find_all(x, make_selector(css, xpath)) } #' @export html_element.default <- function(x, css, xpath) { xml2::xml_find_first(x, make_selector(css, xpath)) } make_selector <- function(css, xpath) { if (missing(css) && missing(xpath)) stop("Please supply one of css or xpath", call. = FALSE) if (!missing(css) && !missing(xpath)) stop("Please supply css or xpath, not both", call. = FALSE) if (!missing(css)) { if (!is.character(css) && length(css) == 1) stop("`css` must be a string") selectr::css_to_xpath(css, prefix = ".//") } else { if (!is.character(xpath) && length(xpath) == 1) stop("`xpath` must be a string") xpath } } rvest/R/form.R0000644000175000017500000002127113776122653013055 0ustar nileshnilesh#' Parse forms and set values #' #' Use `html_form()` to extract a form, set values with `html_form_set()`, #' and submit it with `html_form_submit()`. #' #' @export #' @inheritParams html_name #' @param base_url Base url of underlying HTML document. The default, `NULL`, #' uses the url of the HTML document underlying `x`. 
#' @seealso HTML 4.01 form specification:
#'   <https://www.w3.org/TR/html401/interact/forms.html>
#' @return
#' * `html_form()` returns an S3 object with class `rvest_form` when applied
#'   to a single element. It returns a list of `rvest_form` objects when
#'   applied to multiple elements or a document.
#'
#' * `html_form_set()` returns an `rvest_form` object.
#'
#' * `html_form_submit()` submits the form, returning an httr response which
#'   can be parsed with [read_html()].
#' @examples
#' html <- read_html("http://www.google.com")
#' search <- html_form(html)[[1]]
#'
#' search <- search %>% html_form_set(q = "My little pony", hl = "fr")
#'
#' # Or if you have a list of values, use !!!
#' vals <- list(q = "web scraping", hl = "en")
#' search <- search %>% html_form_set(!!!vals)
#'
#' # To submit and get result:
#' \dontrun{
#' resp <- html_form_submit(search)
#' read_html(resp)
#' }
html_form <- function(x, base_url = NULL) UseMethod("html_form")

#' @export
html_form.xml_document <- function(x, base_url = NULL) {
  # A document is handled by parsing every <form> found in it.
  html_form(xml2::xml_find_all(x, ".//form"), base_url = base_url)
}

#' @export
html_form.xml_nodeset <- function(x, base_url = NULL) {
  lapply(x, html_form, base_url = base_url)
}

#' @export
html_form.xml_node <- function(x, base_url = NULL) {
  stopifnot(xml2::xml_name(x) == "form")

  attr <- as.list(xml2::xml_attrs(x))
  # NOTE(review): the "<unnamed>" placeholder was emptied by tag stripping in
  # this copy; restored from upstream, confirm against the package source.
  name <- attr$id %||% attr$name %||% "<unnamed>" # for human readers
  method <- toupper(attr$method %||% "GET")
  enctype <- convert_enctype(attr$enctype)

  nodes <- html_elements(x, "input, select, textarea, button")
  fields <- lapply(nodes, function(x) {
    switch(xml2::xml_name(x),
      textarea = parse_textarea(x),
      input = parse_input(x),
      select = parse_select(x),
      button = parse_button(x)
    )
  })
  names(fields) <- map_chr(fields, function(x) x$name %||% "")

  structure(
    list(
      name = name,
      method = method,
      # Resolve the action against the supplied base url, falling back to the
      # url of the document the node came from.
      action = xml2::url_absolute(attr$action, base_url %||% xml2::xml_url(x)),
      enctype = enctype,
      fields = fields
    ),
    class = "rvest_form"
  )
}

#' @export
print.rvest_form <- function(x, ...) {
  cat("<form> '", x$name, "' (", x$method, " ", x$action, ")\n", sep = "")
  cat(format_list(x$fields, indent = 1), "\n", sep = "")
  # Return the object invisibly, as print.rvest_field() does.
  invisible(x)
}

# set ----------------------------------------------------------------

#' @rdname html_form
#' @param form A form
#' @param ... <[`dynamic-dots`][rlang::dyn-dots]> Name-value pairs giving
#'   fields to modify.
#'
#'   Provide a character vector to set multiple checkboxes in a set or
#'   select multiple values from a multi-select.
#' @export
html_form_set <- function(form, ...) {
  check_form(form)

  new_values <- list2(...)
  check_fields(form, new_values)

  for (field in names(new_values)) {
    type <- form$fields[[field]]$type %||% "non-input"
    if (type == "hidden") {
      warn(paste0("Setting value of hidden field '", field, "'."))
    } else if (type == "submit") {
      abort(paste0("Can't change value of input with type submit: '", field, "'."))
    }

    form$fields[[field]]$value <- new_values[[field]]
  }

  form
}

# submit ------------------------------------------------------------------

#' @rdname html_form
#' @param submit Which button should be used to submit the form?
#'   * `NULL`, the default, uses the first button.
#'   * A string selects a button by its name.
#'   * A number selects a button using its relative position.
#' @export
html_form_submit <- function(form, submit = NULL) {
  check_form(form)

  subm <- submission_build(form, submit)
  submission_submit(subm)
}

# Assemble the pieces of a request (method, enctype, action, values) from a
# form. Methods other than POST/GET fall back to GET with a warning; a form
# with an empty action is an error.
submission_build <- function(form, submit) {
  method <- form$method
  if (!(method %in% c("POST", "GET"))) {
    warn(paste0("Invalid method (", method, "), defaulting to GET"))
    method <- "GET"
  }

  if (length(form$action) == 0) {
    abort("`form` doesn't contain a `action` attribute")
  }

  list(
    method = method,
    enctype = form$enctype,
    action = form$action,
    values = submission_build_values(form, submit)
  )
}

# Perform the request described by submission_build(): values go in the body
# for POST and in the query string otherwise. `...` is passed on to httr.
submission_submit <- function(x, ...) {
  if (x$method == "POST") {
    httr::POST(url = x$action, body = x$values, encode = x$enctype, ...)
  } else {
    httr::GET(url = x$action, query = x$values, ...)
  }
}

# Build the named list of values to send: every non-button field plus the
# chosen submit button, dropping entries without a name. A field holding
# several values yields one entry per value, all under the same name.
submission_build_values <- function(form, submit = NULL) {
  fields <- form$fields
  submit <- submission_find_submit(fields, submit)
  entry_list <- c(Filter(Negate(is_button), fields), list(submit))
  entry_list <- Filter(function(x) !is.null(x$name), entry_list)

  if (length(entry_list) == 0) {
    return(list())
  }

  values <- lapply(entry_list, function(x) as.character(x$value))
  names <- map_chr(entry_list, "[[", "name")

  out <- set_names(unlist(values, use.names = FALSE), rep(names, lengths(values)))
  as.list(out)
}

# Pick the button used for submission. `idx` may be NULL (first button, or an
# empty list when the form has none), a single number (position among the
# buttons) or a single string (button name).
submission_find_submit <- function(fields, idx) {
  buttons <- Filter(is_button, fields)

  if (is.null(idx)) {
    if (length(buttons) == 0) {
      list()
    } else {
      if (length(buttons) > 1) {
        inform(paste0("Submitting with '", buttons[[1]]$name, "'"))
      }
      buttons[[1]]
    }
  } else if (is.numeric(idx) && length(idx) == 1) {
    if (idx < 1 || idx > length(buttons)) {
      abort("Numeric `submit` out of range")
    }
    buttons[[idx]]
  } else if (is.character(idx) && length(idx) == 1) {
    if (!idx %in% names(buttons)) {
      abort(c(
        # "<input>" had been stripped from this message, leaving "No found".
        paste0("No <input> found with name '", idx, "'."),
        i = paste0("Possible values: ", paste0(names(buttons), collapse = ", "))
      ))
    }
    buttons[[idx]]
  } else {
    abort("`submit` must be NULL, a string, or a number.")
  }
}

# TRUE when the field's type (case-insensitive) is submit, image or button.
is_button <- function(x) {
  tolower(x$type) %in% c("submit", "image", "button")
}

# Field parsing -----------------------------------------------------------

# Constructor for a single form field; extra components can be supplied via
# `...` (parse_select() uses this for `options`).
rvest_field <- function(type, name, value, attr, ...) {
  structure(
    list(
      type = type,
      name = name,
      value = value,
      attr = attr,
      ...
    ),
    class = "rvest_field"
  )
}

#' @export
format.rvest_field <- function(x, ...) {
  if (x$type == "password") {
    # Mask passwords with one "*" per character.
    value <- paste0(rep("*", nchar(x$value %||% "")), collapse = "")
  } else {
    value <- paste(x$value, collapse = ", ")
    value <- str_trunc(encodeString(value), 20)
  }

  paste0("<field> (", x$type, ") ", x$name, ": ", value)
}

#' @export
print.rvest_field <- function(x, ...) {
  cat(format(x, ...), "\n", sep = "")
  invisible(x)
}

# <input>: type defaults to "text" when the attribute is absent.
parse_input <- function(x) {
  attr <- as.list(xml2::xml_attrs(x))
  rvest_field(
    type = attr$type %||% "text",
    name = attr$name,
    value = attr$value,
    attr = attr
  )
}

# <select>: the value is the set of selected options; all options are kept
# in `options`.
parse_select <- function(x) {
  attr <- as.list(xml2::xml_attrs(x))
  options <- parse_options(html_elements(x, "option"))

  rvest_field(
    type = "select",
    name = attr$name,
    value = options$value,
    attr = attr,
    options = options$options
  )
}

# Returns list(value = <values of selected options>,
#              options = <all values, named by option text>).
# An option without a `value` attribute uses its text as its value.
parse_options <- function(options) {
  parse_option <- function(option) {
    name <- xml2::xml_text(option)
    list(
      value = xml2::xml_attr(option, "value", default = name),
      name = name,
      selected = xml2::xml_has_attr(option, "selected")
    )
  }

  parsed <- lapply(options, parse_option)
  value <- map_chr(parsed, "[[", "value")
  name <- map_chr(parsed, "[[", "name")
  selected <- map_lgl(parsed, "[[", "selected")

  list(
    value = value[selected],
    options = stats::setNames(value, name)
  )
}

# <textarea>: the value is the element's text.
parse_textarea <- function(x) {
  attr <- as.list(xml2::xml_attrs(x))

  rvest_field(
    type = "textarea",
    name = attr$name,
    value = xml2::xml_text(x),
    attr = attr
  )
}

# <button>: always recorded with type "button".
parse_button <- function(x) {
  attr <- as.list(xml2::xml_attrs(x))

  rvest_field(
    type = "button",
    name = attr$name,
    value = attr$value,
    attr = attr
  )
}

# Helpers -----------------------------------------------------------------

# Map a form's enctype attribute to "form" or "multipart". A missing enctype
# means "form"; an unrecognised one warns and also gives "form".
convert_enctype <- function(x) {
  if (is.null(x)) {
    "form"
  } else if (x == "application/x-www-form-urlencoded") {
    "form"
  } else if (x == "multipart/form-data") {
    "multipart"
  } else {
    warn(paste0("Unknown enctype (", x, "). Defaulting to form encoded."))
    "form"
  }
}

# Format each element of `x` and join them with newlines, prefixing every
# entry with `indent` spaces.
format_list <- function(x, indent = 0) {
  spaces <- paste(rep("  ", indent), collapse = "")

  formatted <- vapply(x, format, character(1))
  paste0(spaces, formatted, collapse = "\n")
}

# Abort if `values` names any field that `form` does not have.
check_fields <- function(form, values) {
  no_match <- setdiff(names(values), names(form$fields))
  if (length(no_match) > 0) {
    # paste0() so the quotes sit tight against the name ('x', not ' x ').
    str <- paste0("'", no_match, "'", collapse = ", ")
    abort(paste0("Can't set value of fields that don't exist: ", str))
  }
}

# --- archive member header (not R code), preserved verbatim ---
# rvest/R/utils.R0000644000175000017500000000262014132341314013227 0ustar nileshnilesh
# vapply() wrapper returning an unnamed character vector.
map_chr <- function(.x, .f, ...) {
  vapply(.x, .f, ..., FUN.VALUE = character(1), USE.NAMES = FALSE)
}

# vapply() wrapper returning an unnamed logical vector.
map_lgl <- function(.x, .f, ...) {
  vapply(.x, .f, ..., FUN.VALUE = logical(1), USE.NAMES = FALSE)
}

# Shorten a string to `width` characters, ending in "..." when truncated.
str_trunc <- function(x, width) {
  if (nchar(x) <= width) {
    x
  } else {
    paste0(substr(x, 1, width - 3), "...")
  }
}

#' Make link to google form given id
#'
#' @param x Unique identifier for form
#' @export
#' @keywords internal
google_form <- function(x) {
  xml2::read_html(httr::GET(paste0("https://docs.google.com/forms/d/", x, "/viewform")))
}

#' Create an HTML document from inline HTML
#'
#' @param html HTML contents of page.
#' @param title Page title (required by HTML spec).
#' @keywords internal
#' @export
#' @examples
#' minimal_html("

      #' <p>test</p>")
minimal_html <- function(html, title = "") {
  # From http://www.brucelawson.co.uk/2010/a-minimal-html5-document/
  # The doctype, charset and title markup below had been stripped from this
  # copy, leaving only "\n" and "" pieces; restored so the page title is
  # actually wrapped in a <title> element.
  xml2::read_html(paste0(
    "<!doctype html>\n",
    "<meta charset=utf-8>\n",
    "<title>", title, "</title>\n",
    html
  ))
}

# cat() the pasted-together arguments, ending each element with a newline.
cat_line <- function(...) {
  cat(paste0(..., "\n", collapse = ""))
}

# Return the value bound to `nm` in `env`; when there is no such binding,
# bind `value` under `nm` and return it. `inherit` is forwarded to both the
# existence check and the lookup so they search the same environments.
env_cache <- function(env, nm, value, inherit = FALSE) {
  if (env_has(env, nm, inherit = inherit)) {
    env_get(env, nm, inherit = inherit)
  } else {
    env_poke(env, nm, value)
    value
  }
}

# Write `x` (as text) to a temporary .html file and open it in the browser.
inspect <- function(x) {
  path <- tempfile(fileext = ".html")
  writeLines(as.character(x), path)
  utils::browseURL(path)
}

# --- archive residue (not R code), preserved verbatim ---
# rvest/LICENSE0000644000175000017500000000005313767413737012574 0ustar nileshnilesh
# YEAR: 2020
# COPYRIGHT HOLDER: rvest authors
# rvest/inst/0000755000175000017500000000000014132343545012530 5ustar nileshnilesh
# rvest/inst/html-ex/0000755000175000017500000000000013767413737014124 5ustar nileshnilesh
# rvest/inst/html-ex/bad-encoding.html0000644000175000017500000000014613767413737017325 0ustar nileshnilesh Bad encoding

      migr cause clbre dj vu.

      rvest/inst/WORDLIST0000644000175000017500000000043313775651410013727 0ustar nileshnilesharounds bookmarklet Codecov combinator combinators colspan colspans config configs CMD css http httr HyperText IMDB innerText ith libxml magrittr MDN nodeset nodesets prepending RoboBrowser rowspan rowspans tibble tidyverse selectorgadget SelectorGadget starwars stringi xpath XPath rvest/inst/doc/0000755000175000017500000000000014132343545013275 5ustar nileshnileshrvest/inst/doc/rvest.Rmd0000644000175000017500000002702214101012300015061 0ustar nileshnilesh--- title: "Web scraping 101" output: rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{Web scraping 101} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- ```{r, echo=FALSE} knitr::opts_chunk$set(comment = "#>", collapse = TRUE) ``` This vignette introduces you to the basics of web scraping with rvest. You'll first learn the basics of HTML and how to use CSS selectors to refer to specific elements, then you'll learn how to use rvest functions to get data out of HTML and into R. ```{r} library(rvest) ``` ## HTML basics HTML stands for "HyperText Markup Language" and looks like this: ``` {.html} Page title

      A heading

      Some text & some bold text.

```

HTML has a hierarchical structure formed by **elements** which consist of a start tag (e.g. `<tag>`), optional **attributes** (`id='first'`), an end tag[^1] (like `</tag>`), and **contents** (everything in between the start and end tag).

[^1]: A number of tags (including `<p>` and `<li>`) don't require end tags, but I think it's best to include them because it makes seeing the structure of the HTML a little easier.

Since `<` and `>` are used for start and end tags, you can't write them directly. Instead you have to use the HTML **escapes** `&gt;` (greater than) and `&lt;` (less than). And since those escapes use `&`, if you want a literal ampersand you have to escape it as `&amp;`. There are a wide range of possible HTML escapes but you don't need to worry about them too much because rvest automatically handles them for you.

### Elements

All up, there are over 100 HTML elements. Some of the most important are:

-   Every HTML page must be in an `<html>` element, and it must have two children: `<head>`, which contains document metadata like the page title, and `<body>`, which contains the content you see in the browser.

-   Block tags like `<h1>` (heading 1), `<p>` (paragraph), and `<ol>` (ordered list) form the overall structure of the page.

-   Inline tags like `<b>` (bold), `<i>` (italics), and `<a>` (links) format text inside block tags.

If you encounter a tag that you've never seen before, you can find out what it does with a little googling. I recommend the [MDN Web Docs](https://developer.mozilla.org/en-US/docs/Web/HTML) which are produced by Mozilla, the company that makes the Firefox web browser.

### Contents

Most elements can have content in between their start and end tags. This content can either be text or more elements. For example, the following HTML contains a paragraph of text, with one word in bold.

```{=html}

<p>
  Hi! My <b>name</b> is Hadley.
</p>
```

The **children** of a node refers only to elements, so the `<p>` element above has one child, the `<b>` element. The `<b>` element has no children, but it does have contents (the text "name").

Some elements, like `<img>` can't have children. These elements depend solely on attributes for their behavior.

### Attributes

Tags can have named **attributes** which look like `name1='value1' name2='value2'`. Two of the most important attributes are `id` and `class`, which are used in conjunction with CSS (Cascading Style Sheets) to control the visual appearance of the page. These are often useful when scraping data off a page.

## Reading HTML with rvest

You'll usually start the scraping process with `read_html()`. This returns an `xml_document`[^2] object which you'll then manipulate using rvest functions:

[^2]: This class comes from the [xml2](https://xml2.r-lib.org) package. xml2 is a low-level package that rvest builds on top of.

```{r}
html <- read_html("http://rvest.tidyverse.org/")
class(html)
```

For examples and experimentation, rvest also includes a function that lets you create an `xml_document` from literal HTML:

```{r}
html <- minimal_html("

  <p>This is a paragraph</p>
  <ul>
    <li>This is a bulleted list</li>
  </ul>
")
html
```

Regardless of how you get the HTML, you'll need some way to identify the elements that contain the data you care about. rvest provides two options: CSS selectors and XPath expressions. Here I'll focus on CSS selectors because they're simpler but still sufficiently powerful for most scraping tasks.

## CSS selectors

CSS is short for cascading style sheets, and is a tool for defining the visual styling of HTML documents. CSS includes a miniature language for selecting elements on a page called **CSS selectors**. CSS selectors define patterns for locating HTML elements, and are useful for scraping because they provide a concise way of describing which elements you want to extract.

CSS selectors can be quite complex, but fortunately you only need the simplest for rvest, because you can also write R code for more complicated situations. The four most important selectors are:

-   `p`: selects all `<p>` elements.

-   `.title`: selects all elements with `class` "title".

-   `p.special`: selects all `<p>` elements with `class` "special".

-   `#title`: selects the element with the `id` attribute that equals "title". Id attributes must be unique within a document, so this will only ever select a single element.

If you want to learn more CSS selectors I recommend starting with the fun [CSS diner](https://flukeout.github.io/) tutorial and then referring to the [MDN web docs](https://developer.mozilla.org/en-US/docs/Web/CSS/CSS_Selectors).

Let's try out the most important selectors with a simple example:

```{r}
html <- minimal_html("

  <h1>This is a heading</h1>
  <p id='first'>This is a paragraph</p>
  <p class='important'>This is an important paragraph</p>
")
```

In rvest you can extract a single element with `html_element()` or all matching elements with `html_elements()`. Both functions take a document[^3] and a css selector:

[^3]: Or another element, more on that shortly.

```{r}
html %>% html_element("h1")
html %>% html_elements("p")
html %>% html_elements(".important")
html %>% html_elements("#first")
```

Selectors can also be combined in various ways using **combinators**. The most important combinator is " ", the **descendant** combinator: for example, `p a` selects all `<a>` elements that are descendants of a `<p>` element.

If you don't know exactly what selector you need, I highly recommend using [SelectorGadget](https://rvest.tidyverse.org/articles/selectorgadget.html), which lets you automatically generate the selector you need by supplying positive and negative examples in the browser.

## Extracting data

Now that you've got the elements you care about, you'll need to get data out of them. You'll usually get the data from either the text contents or an attribute. But, sometimes (if you're lucky!), the data you need will be in an HTML table.

### Text

Use `html_text2()` to extract the plain text contents of an HTML element:

```{r}
html <- minimal_html("

        1. apple & pear
        2. banana
        3. pineapple
        ") html %>% html_elements("li") %>% html_text2() ``` Note that the escaped ampersand is automatically converted to `&`; you'll only ever see HTML escapes in the source HTML, not in the data returned by rvest. You might wonder why I used `html_text2()`, since it seems to give the same result as `html_text()`: ```{r} html %>% html_elements("li") %>% html_text() ``` The main difference is how the two functions handle white space. In HTML, white space is largely ignored, and it's the structure of the elements that defines how text is laid out. `html_text2()` does its best to follow the same rules, giving you something similar to what you'd see in the browser. Take this example which contains a bunch of white space that HTML ignores. ```{r} html <- minimal_html("

        This is a paragraph.

        This is another paragraph. It has two sentences.

        ") ``` `html_text2()` gives you what you expect: two paragraphs of text separated by a blank line. ```{r} html %>% html_element("body") %>% html_text2() %>% cat() ``` Whereas `html_text()` returns the garbled raw underlying text: ```{r} html %>% html_element("body") %>% html_text() %>% cat() ``` ### Attributes Attributes are used to record the destination of links (the `href` attribute of `
        ` elements) and the source of images (the `src` attribute of the `` element): ```{r} html <- minimal_html("

cats

")
```

The value of an attribute can be retrieved with `html_attr()`:

```{r}
html %>% html_elements("a") %>% html_attr("href")
html %>% html_elements("img") %>% html_attr("src")
```

Note that `html_attr()` always returns a string, so you may need to post-process with `as.integer()`/`readr::parse_integer()` or similar.

```{r}
html %>% html_elements("img") %>% html_attr("width")
html %>% html_elements("img") %>% html_attr("width") %>% as.integer()
```

### Tables

HTML tables are composed of four main elements: `<table>`, `<tr>` (table row), `<th>` (table heading), and `<td>` (table data). Here's a simple HTML table with two columns and three rows:

```{r}
html <- minimal_html("
  <table>
    <tr>
      <th>x</th>
      <th>y</th>
    </tr>
    <tr>
      <td>1.5</td>
      <td>2.7</td>
    </tr>
    <tr>
      <td>4.9</td>
      <td>1.3</td>
    </tr>
    <tr>
      <td>7.2</td>
      <td>8.1</td>
    </tr>
  </table>
")
```

Because tables are a common way to store data, rvest includes the handy `html_table()` which converts a table into a data frame:

```{r}
html %>%
  html_element("table") %>%
  html_table()
```

## Element vs elements

When using rvest, your eventual goal is usually to build up a data frame, and you want each row to correspond to some repeated unit on the HTML page. In this case, you should generally start by using `html_elements()` to select the elements that contain each observation then use `html_element()` to extract the variables from each observation. This guarantees that you'll get the same number of values for each variable because `html_element()` always returns the same number of outputs as inputs.

To illustrate this problem take a look at this simple example I constructed using a few entries from `dplyr::starwars`:

```{r}
html <- minimal_html("
      • C-3PO is a droid that weighs 167 kg
      • R2-D2 is a droid that weighs 96 kg
      • Yoda weighs 66 kg
      • R4-P17 is a droid
      ") ``` If you try to extract name, species, and weight directly, you end up with one vector of length four and two vectors of length three, and no way to align them: ```{r} html %>% html_elements("b") %>% html_text2() html %>% html_elements("i") %>% html_text2() html %>% html_elements(".weight") %>% html_text2() ``` Instead, use `html_elements()` to find a element that corresponds to each character, then use `html_element()` to extract each variable for all observations: ```{r} characters <- html %>% html_elements("li") characters %>% html_element("b") %>% html_text2() characters %>% html_element("i") %>% html_text2() characters %>% html_element(".weight") %>% html_text2() ``` `html_element()` automatically fills in `NA` when no elements match, keeping all of the variables aligned and making it easy to create a data frame: ```{r} data.frame( name = characters %>% html_element("b") %>% html_text2(), species = characters %>% html_element("i") %>% html_text2(), weight = characters %>% html_element(".weight") %>% html_text2() ) ``` rvest/vignettes/starwars.Rmd0000644000175000017500000000165113775161316016106 0ustar nileshnilesh--- title: "Star Wars films" output: rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{Star Wars films} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- This vignette contains some data about the Star Wars films for use in rvest examples and vignettes. ```{r, echo = FALSE, results = "asis"} library(rvest) crawl_html <- function(x) { x %>% gsub("\r", "", .) %>% gsub("\n\n", "

      ", .) %>% gsub("\n", " ", .) %>% paste0("

      ", ., "

      ") } film_desc <- function(x) { glue::glue_data(x, "

      {title}

      Released: {release_date}

      Director: {director}

      {crawl_html(opening_crawl)}
      ") } films <- repurrrsive::sw_films films <- films[order(sapply(films, "[[", "episode_id"))] descs <- vapply(films, film_desc, character(1)) writeLines(descs) ``` rvest/build/0000755000175000017500000000000014132343545012652 5ustar nileshnileshrvest/build/vignette.rds0000644000175000017500000000034714132343545015215 0ustar nileshnileshuM0h8;6nBQ6wab3뼓MLDQw୑[-~cBz"Arѳ:bgWĜYyq|uM?6{qEd>񺉑/p4iZ҂p3FL-3.qT3`5+reLJ,^5:|%{@麮MTD K9AO)#rvest/tests/0000755000175000017500000000000013767413737012733 5ustar nileshnileshrvest/tests/testthat/0000755000175000017500000000000014132660177014560 5ustar nileshnileshrvest/tests/testthat/test-rename.R0000644000175000017500000000172413775436633017145 0ustar nileshnileshtest_that("xml functions are deprecated", { x <- minimal_html("

      Hello

      ") expect_snapshot(. <- xml_tag(x)) expect_snapshot(. <- xml_node(x, "p")) expect_snapshot(. <- xml_nodes(x, "p")) }) test_that("html_node(s) is superseded (no warnings)", { x <- minimal_html("

      Hello

      ") expect_equal(html_node(x, "p"), html_element(x, "p")) expect_equal(html_nodes(x, "p"), html_elements(x, "p")) }) test_that("set_values() is deprecated", { html <- minimal_html('
      ') form <- html_form(html)[[1]] expect_snapshot(set_values(form, text = "abc")) }) test_that("prefixless session functions are deprecated", { expect_snapshot({ s <- html_session("http://rvest.tidyverse.org/") . <- follow_link(s, i = 1) s <- jump_to(s, "https://rvest.tidyverse.org/reference/index.html") s <- back(s) s <- forward(s) }) }) # session_submit() is tested in form-submit because it needs a test server rvest/tests/testthat/test-encoding.R0000644000175000017500000000104014132340655017435 0ustar nileshnileshtest_that("can guess encoding", { path <- system.file("html-ex", "bad-encoding.html", package = "rvest") x <- read_html(path) expect_snapshot(html_encoding_guess(x)) # deprecated expect_snapshot(guess_encoding(x)) }) test_that("encoding repair is deprecated", { skip_on_cran() path <- system.file("html-ex", "bad-encoding.html", package = "rvest") x <- read_html(path) text <- html_text(html_element(x, "p")) expect_snapshot(repair_encoding(text), error = TRUE) expect_snapshot(repair_encoding(text, "ISO-8859-1")) }) rvest/tests/testthat/test-selectors.R0000644000175000017500000000416313775423620017671 0ustar nileshnileshtest_that("can select one or more nodes", { html <- minimal_html("

      ") expect_s3_class(html_elements(html, "p"), "xml_nodeset") expect_s3_class(html_element(html, "p"), "xml_node") }) test_that("xpath with // selects from root", { test <- read_html(test_path("test.html")) p <- html_elements(test, xpath = "//p") expect_equal(length(p), 4) p2 <- html_elements(p[[1]], xpath = "//p") expect_equal(length(p2), 4) p3 <- html_elements(p[[3]], xpath = "b") expect_equal(length(p3), 1) b <- html_elements(p, xpath = "b") expect_equal(length(b), 2) }) test_that("css class selects from current value", { test <- read_html(test_path("test.html")) p <- html_elements(test, css = "p") expect_equal(length(p), 4) p3 <- html_elements(p[[3]], css = "b") expect_equal(length(p3), 1) b <- html_elements(p, css = "b") expect_equal(length(b), 2) }) test_that("css selects don't select themselves", { test <- read_html(test_path("test.html")) p <- test %>% html_elements("p") %>% html_elements("p") expect_equal(length(p), 0) p <- test %>% html_elements("p") %>% `[[`(1) %>% html_elements("p") expect_equal(length(p), 0) }) test_that("css selects find all children", { test <- read_html(test_path("test.html")) b <- test %>% html_elements("body") %>% html_elements("b") expect_equal(length(b), 3) }) test_that("empty matches returns empty list", { test <- read_html(test_path("test.html")) none <- test %>% html_elements("none") expect_equal(length(none), 0) expect_equal(none %>% html_element("none") %>% length(), 0) expect_equal(none %>% html_elements("none") %>% length(), 0) }) # make_selector ----------------------------------------------------------- test_that("validates inputs", { expect_snapshot(make_selector(), error = TRUE) expect_snapshot(make_selector("a", "b"), error = TRUE) expect_snapshot(make_selector(css = 1), error = TRUE) expect_snapshot(make_selector(xpath = 1), error = TRUE) }) test_that("converts css to xpath", { expect_equal(make_selector(css = "p"), ".//p") }) test_that("preserves xpath", { expect_equal(make_selector(xpath = ".//p"), ".//p") }) 
rvest/tests/testthat/test-form.R0000644000175000017500000001361413775645536016646 0ustar nileshnileshtest_that("can find from from doc, nodes, and node", { html <- minimal_html('

      ') forms <- html_form(html) expect_type(forms, "list") expect_length(forms, 2) forms <- html_form(html_elements(html, "form")) expect_type(forms, "list") expect_length(forms, 2) form <- html_form(html_element(html, "form")) expect_s3_class(form, "rvest_form") }) test_that("has useful print method", { html <- minimal_html('
      ') expect_snapshot(html_form(html, base_url = "http://google.com")[[1]]) expect_snapshot(html_form(html)[[1]]$fields[[2]]) }) test_that("select options are named character vector", { select <- minimal_html("select parsing", '
      ') form <- select %>% html_element("form") %>% html_form() expect_equal(form$fields[[1]]$options, c(a = "1", b = "2")) }) test_that("select values are inherited from names", { page <- minimal_html("optional values", ' ') opts <- page %>% html_element('select') %>% parse_select() expect_equal(opts$options, c(x = "1", y = "y")) }) test_that("parse_fields gets the button", { select <- minimal_html("button test", '
      ') form <- select %>% html_element("form") %>% html_form() expect_equal(form$fields[[1]]$type, "button") }) test_that("handles different encoding types", { expect_equal(convert_enctype(NULL), "form") expect_equal(convert_enctype("application/x-www-form-urlencoded"), "form") expect_equal(convert_enctype("multipart/form-data"), "multipart") expect_snapshot(convert_enctype("unknown")) }) # set -------------------------------------------------------------- test_that("can set values of inputs", { html <- minimal_html('
      ') form <- html_form(html)[[1]] form <- html_form_set(form, text = "abc") expect_equal(form$fields$text$value, "abc") # warns that setting hidden field expect_snapshot(form <- html_form_set(form, hidden = "abc")) expect_equal(form$fields$hidden$value, "abc") }) test_that("has informative errors", { html <- minimal_html('
      ') form <- html_form(html)[[1]] expect_snapshot(html_form_set(form, text = "x"), error = TRUE) expect_snapshot(html_form_set(form, missing = "x"), error = TRUE) }) # submit ------------------------------------------------------------------ test_that("works as expected in simple case", { html <- minimal_html('
      ') form <- html_form(html, base_url = "http://here.com")[[1]] sub <- submission_build(form, "clickMe") expect_equal(sub$method, "POST") expect_equal(sub$action, "http://here.com/test-path") expect_equal(sub$values, list(x = "1")) }) test_that("useful feedback on invalid forms", { html <- minimal_html("
      ") form <- html_form(html)[[1]] expect_snapshot(submission_build(form, NULL), error = TRUE) html <- minimal_html("
      ") form <- html_form(html)[[1]] expect_snapshot(x <- submission_build(form, NULL)) }) test_that("can handle multiple values", { html <- minimal_html('
      ') form <- html_form(html)[[1]] form <- html_form_set(form, x = c("1", "2", "3"), y = character()) expect_equal( submission_build_values(form), list(x = "1", x = "2", x = "3") ) }) test_that("handles multiple buttons", { html <- minimal_html('
      ') form <- html_form(html)[[1]] # Messages when picking automatically expect_snapshot(vals <- submission_build_values(form, NULL)) expect_equal(vals, list(one = "1")) expect_equal(submission_build_values(form, "two"), list(two = "2")) expect_equal(submission_build_values(form, 2L), list(two = "2")) # Useful failure messages expect_snapshot(submission_build_values(form, 3L), error = TRUE) expect_snapshot(submission_build_values(form, "three"), error = TRUE) expect_snapshot(submission_build_values(form, TRUE), error = TRUE) }) test_that("handles no buttons", { html <- minimal_html('
      ') form <- html_form(html)[[1]] expect_equal( submission_build_values(form), list(x = "1") ) }) test_that("can submit using three primary techniques", { app <- webfakes::local_app_process(app_request()) html <- minimal_html('
      ') form <- html_form(html, base_url = app$url())[[1]] expect_snapshot({ show_response(html_form_submit(form)) form$method <- "POST" show_response(html_form_submit(form)) form$enctype <- "multipart" show_response(html_form_submit(form)) }) }) rvest/tests/testthat/test-session.R0000644000175000017500000000563414017431436017347 0ustar nileshnileshtest_that("basic session process works as expected", { expect_snapshot({ s <- session("http://hadley.nz/") s expect_true(is.session(s)) s <- session_follow_link(s, css = "p a") session_history(s) }) }) test_that("session caches xml parsing and sets base url", { s <- session("https://rvest.tidyverse.org/") expect_equal(s$cache$html, NULL) html <- read_html(s) expect_true(rlang::is_reference(s$cache$html, html)) expect_equal(xml2::xml_url(html), "https://rvest.tidyverse.org/") }) test_that("errors if try to access HTML from non-HTML page", { expect_snapshot(error = TRUE, { s <- session("https://rvest.tidyverse.org/logo.png") read_html(s) }) }) test_that("session responds to httr and rvest methods", { # skip_on_cran() s <- session("http://rstudio.com/") expect_silent(html_form(s)) expect_silent(html_table(s)) expect_silent(html_element(s, "body")) expect_silent(html_element(s, "body")) expect_silent(status_code(s)) expect_silent(headers(s)) expect_silent(cookies(s)) }) test_that("informative errors for bad inputs", { expect_snapshot_error(check_form(1)) expect_snapshot_error(check_session(1)) }) # navigation -------------------------------------------------------------- test_that("can navigate back and forward", { s <- session("http://hadley.nz/") expect_equal(s$back, character()) expect_equal(s$forward, character()) expect_snapshot_error(session_back(s)) expect_snapshot_error(session_forward(s)) s <- session_jump_to(s, "hadley-wickham.jpg") expect_equal(s$back, "http://hadley.nz/") expect_equal(s$forward, character()) expect_equal(session_forward(session_back(s))$url, s$url) s <- session_back(s) expect_equal(s$back, 
character()) expect_equal(s$forward, "http://hadley.nz/hadley-wickham.jpg") s <- session_forward(s) expect_equal(s$back, "http://hadley.nz/") expect_equal(s$forward, character()) }) test_that("can find link by position, content, css, or xpath", { html <- minimal_html(" a b ") expect_equal(find_href(html, i = 1), "a") expect_equal(find_href(html, i = "b"), "b") expect_equal(find_href(html, css = "a.b"), "b") # Failure modes expect_snapshot(find_href(html, i = 1, css = "a"), error = TRUE) expect_snapshot(find_href(html, i = TRUE), error = TRUE) expect_snapshot(find_href(html, i = "c"), error = TRUE) expect_snapshot(find_href(html, css = "p a"), error = TRUE) }) test_that("can submit a form", { app <- webfakes::local_app_process(app_request()) html <- minimal_html('
      ') form <- html_form(html, base_url = app$url())[[1]] s <- session("http://hadley.nz/") s <- session_submit(s, form) expect_s3_class(s, "rvest_session") resp <- httr::content(s$response) expect_equal(resp$query, "x=1&y=2") }) rvest/tests/testthat/test-html.R0000644000175000017500000000056713775424427016644 0ustar nileshnileshtest_that("forwards to xml2 functions", { html <- minimal_html("

      Hello children

      ") p <- html_elements(html, "p") expect_equal(html_name(p), "p") expect_equal(html_attr(p, "id"), "x") expect_equal(html_attr(p, "id2"), NA_character_) expect_equal(html_attrs(p), list(c(id = "x"))) expect_equal(html_children(p), html_elements(html, "i")) }) rvest/tests/testthat/test-table.R0000644000175000017500000001026114101012613016725 0ustar nileshnileshtest_that("can parse simple table", { html <- minimal_html('
      xyz
      1EveJackson
      2JohnDoe
      ') table <- html_table(html)[[1]] expect_snapshot_output(table) }) test_that("strips whitespace", { html <- minimal_html('
      x
      x
      x
      x
      ') table <- html_table(html)[[1]] expect_equal(table$x, c("x", "x", "x")) }) test_that("can parse with colspan", { html <- minimal_html('
      xyz
      1
      12
      12
      ') table <- html_table(html)[[1]] expect_snapshot_output(table) }) test_that("can parse with rowspan", { html <- minimal_html('
      xyz
      123
      23
      3
      ') table <- html_table(html)[[1]] expect_snapshot_output(table) }) test_that("can handle wobbling rowspan", { html <- minimal_html('
      xyz
      1a1b1c
      2b
      3a3c
      ') table <- html_table(html)[[1]] expect_snapshot_output(table) }) test_that("can handle trailing rowspans", { html <- minimal_html('
      xyz
      1 2 3
      ') table <- html_table(html)[[1]] expect_snapshot_output(table) }) test_that("can handle blank colspans", { html <- minimal_html('
      xy
      1 2
      3
      ') table <- html_table(html)[[1]] expect_snapshot_output(table) }) test_that("can handle blank rowspans", { html <- minimal_html('
      xy
      1 2
      3
      ') table <- html_table(html)[[1]] expect_snapshot_output(table) }) test_that("can handle empty row", { html <- minimal_html('
      x
      2
      ') table <- html_table(html)[[1]] expect_snapshot_output(table) }) test_that("defaults to minimal name repair", { html <- minimal_html('
      xx
      ') table <- html_table(html)[[1]] expect_named(table, c("x", "x", "")) }) test_that("adds names if needed", { html <- minimal_html('
      12
      ') table <- html_table(html)[[1]] expect_named(table, c("X1", "X2")) }) test_that("passes arguments to type.convert", { html <- minimal_html("
      xy
      NA1,2
      ") table <- html_table(html, na.strings = "")[[1]] expect_equal(table$x, "NA") table <- html_table(html, dec = ",")[[1]] expect_equal(table$y, 1.2) }) test_that("no conversion", { html <- minimal_html('
      xy
      001100.0
      ') table <- html_table(html, convert = FALSE)[[1]] expect_snapshot_output(table) }) test_that("fill = FALSE is deprecated", { html <- minimal_html('
      x
      1
      ') expect_snapshot({ . <- html_table(html, fill = FALSE) . <- html_table(html, fill = TRUE) }) }) test_that("can handle empty tables", { html <- minimal_html('
      ') table <- html_table(html)[[1]] expect_snapshot_output(table) }) rvest/tests/testthat/test.html0000644000175000017500000000013412362565206016423 0ustar nileshnilesh

      rvest/tests/testthat/test-text.R0000644000175000017500000000644613775423635016666 0ustar nileshnileshtest_that("html_text returns raw html", { html <- minimal_html("

      x\ny
      z

      ") p <- html_elements(html, "p") expect_equal(html_text(p), "x\nyz") }) # html_text2 -------------------------------------------------------------- test_that("handles block containing only inline elements", { html <- minimal_html("

      a b c

      ") expect_equal(html_text2(html), "a b c") # internal newlines are trimmed html <- minimal_html("

      a\n\nb\nc

      ") expect_equal(html_text2(html), "a b c") }) test_that("handles multiple paragraphs with line breaks", { html <- minimal_html("

      a

      b
      c ") expect_equal(html_text2(html), "a\n\nb\nc") expect_equal(html_text2(html_elements(html, "p")), c("a", "b\nc")) }) test_that("handles table", { html <- minimal_html("
      ab
      12
      23
      ") expect_equal(html_text2(html), "a\tb\n1\t2\n2\t3") }) test_that("handles mixed block as well as can be expected", { html <- minimal_html("

      a

      b
      ") expect_equal(html_text2(html_element(html, "div")), "a\n\nb\n") }) test_that("returns NA for xml_missing", { expect_equal(html_text2(xml2::xml_missing()), NA_character_) }) test_that("breaks as expected", { expect_identical(tag_margin("p"), 2L) expect_identical(tag_margin("li"), 1L) expect_identical(tag_margin("b"), 0L) }) # inline ------------------------------------------------------------------ test_that("handle single line of text", { html <- minimal_html("

      a b c

      ") expect_equal(html_text_inline(html_element(html, "p")), "a b c") # collapses space across nodes html <- minimal_html("

      a b c

      ") expect_equal(html_text_inline(html_element(html, "p")), "a b c") }) test_that("converts br to \n", { html <- minimal_html("


      x

      ") expect_equal(html_text_inline(html_element(html, "p")), "\nx") html <- minimal_html("

      x

      ") expect_equal(html_text_inline(html_element(html, "p")), "x\n") html <- minimal_html("



      ") expect_equal(html_text_inline(html_element(html, "p")), "\n\n") }) test_that("empty block returns empty string", { html <- minimal_html("

      ") expect_equal(html_text_inline(html_element(html, "p")), "") }) test_that("collapse whitespace handles single line", { expect_equal(collapse_whitespace("\n\tx\t\n"), "x") expect_equal(collapse_whitespace("x y"), "x y") }) test_that("optionally preserve nbsp", { expect_equal(collapse_whitespace("x \u00a0 y"), "x y") expect_equal(collapse_whitespace("x\u00a0y", TRUE), "x\u00a0y") }) # PaddedText -------------------------------------------------------------- test_that("margins only added within text", { text <- PaddedText$new() text$add_margin(1) text$add_text("x") text$add_margin(1) expect_equal(text$output(), "x") }) test_that("margins are collapsed", { text <- PaddedText$new() text$add_text("x") text$add_margin(1) expect_equal(text$lines, 1) text$add_margin(2) expect_equal(text$lines, 2) text$add_text("y") expect_equal(text$output(), "x\n\ny") }) test_that("empty text is ignored", { text <- PaddedText$new() text$add_text("") text$add_margin(1) text$add_text("x") expect_equal(text$output(), "x") }) rvest/tests/testthat/test-utils.R0000644000175000017500000000040013775334175017022 0ustar nileshnileshtest_that("can truncate strings", { expect_equal(str_trunc("abcdef", 10), "abcdef") expect_equal(str_trunc("abcdef", 4), "a...") }) test_that("minimal html doesn't change unexpectedly", { expect_snapshot(cat(as.character(minimal_html("

      Hi")))) }) rvest/tests/testthat/_snaps/0000755000175000017500000000000014132340572016035 5ustar nileshnileshrvest/tests/testthat/_snaps/table.md0000644000175000017500000000317714132340572017456 0ustar nileshnilesh# can parse simple table # A tibble: 2 x 3 x y z 1 1 Eve Jackson 2 2 John Doe # can parse with colspan # A tibble: 3 x 3 x y z 1 1 1 1 2 1 1 2 3 1 2 2 # can parse with rowspan # A tibble: 3 x 3 x y z 1 1 2 3 2 1 2 3 3 1 2 3 # can handle wobbling rowspan # A tibble: 3 x 3 x y z 1 1a 1b 1c 2 1a 2b 1c 3 3a 2b 3c # can handle trailing rowspans # A tibble: 4 x 3 x y z 1 1 2 3 2 NA 2 3 3 NA 2 NA 4 NA 2 NA # can handle blank colspans # A tibble: 2 x 2 x y 1 1 2 2 3 3 # can handle blank rowspans # A tibble: 2 x 2 x y 1 1 2 2 3 3 # can handle empty row # A tibble: 1 x 1 x 1 2 # no conversion # A tibble: 1 x 2 x y 1 001 100.0 # fill = FALSE is deprecated Code . <- html_table(html, fill = FALSE) Warning The `fill` argument of `html_table()` is deprecated as of rvest 1.0.0. An improved algorithm fills by default so it is no longer needed. Code . 
<- html_table(html, fill = TRUE) # can handle empty tables # A tibble: 0 x 0 rvest/tests/testthat/_snaps/selectors.md0000644000175000017500000000065214132340566020370 0ustar nileshnilesh# validates inputs Code make_selector() Error Please supply one of css or xpath --- Code make_selector("a", "b") Error Please supply css or xpath, not both --- Code make_selector(css = 1) Error `css` must be a string --- Code make_selector(xpath = 1) Error `xpath` must be a string rvest/tests/testthat/_snaps/session.md0000644000175000017500000000260114132340572020041 0ustar nileshnilesh# basic session process works as expected Code s <- session("http://hadley.nz/") s Output http://hadley.nz/ Status: 200 Type: text/html Size: 9090 Code expect_true(is.session(s)) s <- session_follow_link(s, css = "p a") Message Navigating to http://rstudio.com Code session_history(s) Output http://hadley.nz/ - https://www.rstudio.com/ # errors if try to access HTML from non-HTML page Code s <- session("https://rvest.tidyverse.org/logo.png") read_html(s) Error Page doesn't appear to be html. 
# informative errors for bad inputs `form` must be a single form produced by html_form() --- `x` must be produced by session() # can navigate back and forward Can't go back any further --- Can't go forward any further # can find link by position, content, css, or xpath Code find_href(html, i = 1, css = "a") Error Must supply exactly one of `i`, `css`, or `xpath` --- Code find_href(html, i = TRUE) Error `i` must a string or integer --- Code find_href(html, i = "c") Error No links have text 'c' --- Code find_href(html, css = "p a") Error No links matched `css`/`xpath` rvest/tests/testthat/_snaps/encoding.md0000644000175000017500000000320214132340564020143 0ustar nileshnilesh# can guess encoding Code html_encoding_guess(x) Output encoding language confidence 1 ISO-8859-1 fr 0.31 2 ISO-8859-2 ro 0.22 3 UTF-16BE 0.10 4 UTF-16LE 0.10 5 GB18030 zh 0.10 6 Big5 zh 0.10 7 ISO-8859-9 tr 0.06 8 IBM424_rtl he 0.01 9 IBM424_ltr he 0.01 --- Code guess_encoding(x) Warning `guess_encoding()` was deprecated in rvest 1.0.0. Please use `html_encoding_guess()` instead. Output encoding language confidence 1 ISO-8859-1 fr 0.31 2 ISO-8859-2 ro 0.22 3 UTF-16BE 0.10 4 UTF-16LE 0.10 5 GB18030 zh 0.10 6 Big5 zh 0.10 7 ISO-8859-9 tr 0.06 8 IBM424_rtl he 0.01 9 IBM424_ltr he 0.01 # encoding repair is deprecated Code repair_encoding(text) Warning `html_encoding_repair()` was deprecated in rvest 1.0.0. Instead, re-load using the `encoding` argument of `read_html()` Error No guess has more than 50% confidence --- Code repair_encoding(text, "ISO-8859-1") Warning `html_encoding_repair()` was deprecated in rvest 1.0.0. Instead, re-load using the `encoding` argument of `read_html()` Output [1] "Émigré cause célèbre déjà vu." rvest/tests/testthat/_snaps/rename.md0000644000175000017500000000353714132340566017641 0ustar nileshnilesh# xml functions are deprecated Code . <- xml_tag(x) Warning `xml_tag()` was deprecated in rvest 1.0.0. Please use `html_name()` instead. --- Code . 
<- xml_node(x, "p") Warning `xml_node()` was deprecated in rvest 1.0.0. Please use `html_element()` instead. --- Code . <- xml_nodes(x, "p") Warning `xml_nodes()` was deprecated in rvest 1.0.0. Please use `html_elements()` instead. # set_values() is deprecated Code set_values(form, text = "abc") Warning `set_values()` was deprecated in rvest 1.0.0. Please use `html_form_set()` instead. Output

      '' (GET ) (text) text: abc # prefixless session functions are deprecated Code s <- html_session("http://rvest.tidyverse.org/") Warning `html_session()` was deprecated in rvest 1.0.0. Please use `session()` instead. Code . <- follow_link(s, i = 1) Warning `follow_link()` was deprecated in rvest 1.0.0. Please use `session_follow_link()` instead. Message Navigating to index.html Code s <- jump_to(s, "https://rvest.tidyverse.org/reference/index.html") Warning `jump_to()` was deprecated in rvest 1.0.0. Please use `session_jump_to()` instead. Code s <- back(s) Warning `back()` was deprecated in rvest 1.0.0. Please use `session_back()` instead. Code s <- forward(s) Warning `forward()` was deprecated in rvest 1.0.0. Please use `session_forward()` instead. rvest/tests/testthat/_snaps/utils.md0000644000175000017500000000052614132340573017523 0ustar nileshnilesh# minimal html doesn't change unexpectedly Code cat(as.character(minimal_html("

      Hi"))) Output

      Hi

      rvest/tests/testthat/_snaps/form.md0000644000175000017500000000521314132340566017326 0ustar nileshnilesh# has useful print method Code html_form(html, base_url = "http://google.com")[[1]] Output 'test' (POST http://google.com/test-path) (select) select: (text) name: Hadley (password) name: ****** (button) clickMe: (textarea) address: ABCDEF --- Code html_form(html)[[1]]$fields[[2]] Output (text) name: Hadley # handles different encoding types Code convert_enctype("unknown") Warning Unknown enctype (unknown). Defaulting to form encoded. Output [1] "form" # can set values of inputs Code form <- html_form_set(form, hidden = "abc") Warning Setting value of hidden field 'hidden'. # has informative errors Code html_form_set(form, text = "x") Error Can't change value of input with type submit: 'text'. --- Code html_form_set(form, missing = "x") Error Can't set value of fields that don't exist: ' missing ' # useful feedback on invalid forms Code submission_build(form, NULL) Error `form` doesn't contain a `action` attribute --- Code x <- submission_build(form, NULL) Warning Invalid method (FOO), defaulting to GET # handles multiple buttons Code vals <- submission_build_values(form, NULL) Message Submitting with 'one' --- Code submission_build_values(form, 3L) Error Numeric `submit` out of range --- Code submission_build_values(form, "three") Error No found with name 'three'. i Possible values: one, two --- Code submission_build_values(form, TRUE) Error `submit` must be NULL, a string, or a number. 
# can submit using three primary techniques Code show_response(html_form_submit(form)) Output GET Query string: x=1&x=2&y=3 Code form$method <- "POST" show_response(html_form_submit(form)) Output POST application/x-www-form-urlencoded Query string: x=1&x=2&y=3 Code form$enctype <- "multipart" show_response(html_form_submit(form)) Output POST multipart/form-data; boundary=--- Query string: --- Content-Disposition: form-data; name="x" 1 --- Content-Disposition: form-data; name="x" 2 --- Content-Disposition: form-data; name="y" 3 ----- rvest/tests/testthat.R0000644000175000017500000000006612364025001014667 0ustar nileshnileshlibrary(testthat) library(rvest) test_check("rvest") rvest/tests/spelling.R0000644000175000017500000000024113767413737014670 0ustar nileshnileshif(requireNamespace('spelling', quietly = TRUE)) spelling::spell_check_test(vignettes = TRUE, error = FALSE, skip_on_cran = TRUE) rvest/R/0000755000175000017500000000000014101012613011735 5ustar nileshnileshrvest/R/html.R0000644000175000017500000000371214132341276013045 0ustar nileshnilesh#' Get element name #' #' @param x A document (from [read_html()]), node set (from [html_elements()]), #' node (from [html_element()]), or session (from [session()]). #' @return A character vector the same length as `x` #' @export #' @examples #' url <- "https://rvest.tidyverse.org/articles/starwars.html" #' html <- read_html(url) #' #' html %>% #' html_element("div") %>% #' html_children() %>% #' html_name() #' @export #' @importFrom xml2 xml_name html_name <- function(x) { xml_name(x) } #' Get element attributes #' #' `html_attr()` gets a single attribute; `html_attrs()` gets all attributes. #' #' @inheritParams html_name #' @param name Name of attribute to retrieve. #' @param default A string used as a default value when the attribute does #' not exist in every element. #' @return A character vector (for `html_attr()`) or list (`html_attrs()`) #' the same length as `x`. #' @examples #' html <- minimal_html('
        #'
      • a
      • #'
      • b
      • #'
      • b
      • #'
      ') #' #' html %>% html_elements("a") %>% html_attrs() #' #' html %>% html_elements("a") %>% html_attr("href") #' html %>% html_elements("li") %>% html_attr("class") #' html %>% html_elements("li") %>% html_attr("class", default = "inactive") #' @export #' @importFrom xml2 xml_attr html_attr <- function(x, name, default = NA_character_) { xml_attr(x, name, default = default) } #' @rdname html_attr #' @export #' @importFrom xml2 xml_attrs html_attrs <- function(x) { xml_attrs(x) } #' Get element children #' #' @inheritParams html_name #' @examples #' html <- minimal_html("
      • 1
      • 2
      • 3
      ") #' ul <- html_elements(html, "ul") #' html_children(ul) #' #' html <- minimal_html("

      Hello Hadley!") #' p <- html_elements(html, "p") #' html_children(p) #' @importFrom xml2 xml_children #' @export html_children <- function(x) { xml_children(x) } rvest/R/encoding.R0000644000175000017500000000422213775423740013675 0ustar nileshnilesh#' Guess faulty character encoding #' #' `html_encoding_guess()` helps you handle web pages that declare an incorrect #' encoding. Use `html_encoding_guess()` to generate a list of possible #' encodings, then try each out by using `encoding` argument of `read_html()`. #' `html_encoding_guess()` replaces the deprecated `guess_encoding()`. #' #' @param x A character vector. #' @export #' @examples #' # A file with bad encoding included in the package #' path <- system.file("html-ex", "bad-encoding.html", package = "rvest") #' x <- read_html(path) #' x %>% html_elements("p") %>% html_text() #' #' html_encoding_guess(x) #' # Two valid encodings, only one of which is correct #' read_html(path, encoding = "ISO-8859-1") %>% html_elements("p") %>% html_text() #' read_html(path, encoding = "ISO-8859-2") %>% html_elements("p") %>% html_text() html_encoding_guess <- function(x) { check_installed("stringi") guess <- stringi::stri_enc_detect(paste(x, collapse = "")) df <- as.data.frame(guess[[1]], stringsAsFactors = FALSE) names(df) <- tolower(names(df)) df } #' @export #' @rdname html_encoding_guess #' @usage NULL guess_encoding <- function(x) { lifecycle::deprecate_warn("1.0.0", "guess_encoding()", "html_encoding_guess()") html_encoding_guess(x) } #' Repair faulty encoding #' #' `r lifecycle::badge("deprecated")` #' This function has been deprecated because it doesn't work. Instead #' re-read the HTML file with correct `encoding` argument. #' #' @export #' @keywords internal #' @param from The encoding that the string is actually in. If `NULL`, #' `guess_encoding` will be used. 
repair_encoding <- function(x, from = NULL) { lifecycle::deprecate_warn("1.0.0", "html_encoding_repair()", details = "Instead, re-load using the `encoding` argument of `read_html()`" ) check_installed("stringi") if (is.null(from)) { best_guess <- html_encoding_guess(x)[1, , drop = FALSE] from <- best_guess$encoding conf <- best_guess$confidence * 100 if (conf < 50) { stop("No guess has more than 50% confidence", call. = FALSE) } inform(paste0("Best guess: ", from, " (", conf, "% confident)")) } stringi::stri_conv(x, from) } rvest/R/rename.R0000644000175000017500000000525114007274024013345 0ustar nileshnilesh#' Functions renamed in rvest 1.0.0 #' #' @description #' `r lifecycle::badge('deprecated')` #' #' rvest 1.0.0 renamed a number of functions to ensure that every function #' has a common prefix, matching tidyverse conventions that emerged since #' rvest was first created. #' #' * `set_values()` -> `html_form_set()` #' * `submit_form()` -> `session_submit()` #' * `xml_tag()` -> `html_name()` #' * `xml_node()` & `html_node()` -> `html_element()` #' * `xml_nodes()` & `html_nodes()` -> `html_elements()` #' #' (`html_node()` and `html_nodes()` are only superseded because they're #' so widely used.) #' #' Additionally all session related functions gained a common prefix: #' #' * `html_session()` -> `session()` #' * `forward()` -> `session_forward()` #' * `back()` -> `session_back()` #' * `jump_to()` -> `session_jump_to()` #' * `follow_link()` -> `session_follow_link()` #' #' @keywords internal #' @name rename #' @aliases NULL NULL #' @rdname rename #' @export set_values <- function(form, ...) { lifecycle::deprecate_warn("1.0.0", "set_values()", "html_form_set()") html_form_set(form = form, ...) } #' @rdname rename #' @export submit_form <- function(session, form, submit = NULL, ...) { lifecycle::deprecate_warn("1.0.0", "submit_form()", "session_submit()") session_submit(x = session, form = form, submit = submit, ...) 
} #' @export #' @rdname rename xml_tag <- function(x) { lifecycle::deprecate_warn("1.0.0", "xml_tag()", "html_name()") html_name(x) } #' @export #' @rdname rename xml_node <- function(...) { lifecycle::deprecate_warn("1.0.0", "xml_node()", "html_element()") html_node(...) } #' @export #' @rdname rename xml_nodes <- function(...) { lifecycle::deprecate_warn("1.0.0", "xml_nodes()", "html_elements()") html_nodes(...) } #' @export #' @rdname rename html_nodes <- function(...) { html_elements(...) } #' @export #' @rdname rename html_node <- function(...) { html_element(...) } #' @export #' @rdname rename back <- function(x) { lifecycle::deprecate_warn("1.0.0", "back()", "session_back()") session_back(x) } #' @export #' @rdname rename forward <- function(x) { lifecycle::deprecate_warn("1.0.0", "forward()", "session_forward()") session_forward(x) } #' @export #' @rdname rename jump_to <- function(x, url, ...) { lifecycle::deprecate_warn("1.0.0", "jump_to()", "session_jump_to()") session_jump_to(x, url, ...) } #' @export #' @rdname rename follow_link <- function(x, ...) { lifecycle::deprecate_warn("1.0.0", "follow_link()", "session_follow_link()") session_follow_link(x, ...) } #' @export #' @rdname rename html_session <- function(url, ...) { lifecycle::deprecate_warn("1.0.0", "html_session()", "session()") session(url, ...) } rvest/R/text.R0000644000175000017500000001474613775651234013110 0ustar nileshnilesh#' Get element text #' #' @description #' There are two ways to retrieve text from a element: `html_text()` and #' `html_text2()`. `html_text()` is a thin wrapper around [xml2::xml_text()] #' which returns just the raw underlying text. `html_text2()` simulates how #' text looks in a browser, using an approach inspired by JavaScript's #' [innerText()](https://developer.mozilla.org/en-US/docs/Web/API/HTMLElement/innerText). #' Roughly speaking, it converts `
      ` to `"\n"`, adds blank lines #' around `

      ` tags, and lightly formats tabular data. #' #' `html_text2()` is usually what you want, but it is much slower than #' `html_text()` so for simple applications where performance is important #' you may want to use `html_text()` instead. #' #' @inheritParams xml2::xml_text #' @importFrom xml2 xml_text #' @return A character vector the same length as `x` #' @examples #' # To understand the difference between html_text() and html_text2() #' # take the following html: #' #' html <- minimal_html( #' "

      This is a paragraph. #' This another sentence.
      This should start on a new line" #' ) #' #' # html_text() returns the raw underlying text, which includes whitespace #' # that would be ignored by a browser, and ignores the
      #' html %>% html_element("p") %>% html_text() %>% writeLines() #' #' # html_text2() simulates what a browser would display. Non-significant #' # whitespace is collapsed, and
      is turned into a line break #' html %>% html_element("p") %>% html_text2() %>% writeLines() #' #' # By default, html_text2() also converts non-breaking spaces to regular #' # spaces: #' html <- minimal_html("

      x y

      ") #' x1 <- html %>% html_element("p") %>% html_text() #' x2 <- html %>% html_element("p") %>% html_text2() #' #' # When printed, non-breaking spaces look exactly like regular spaces #' x1 #' x2 #' # But aren't actually the same: #' x1 == x2 #' # Which you can confirm by looking at their underlying binary #' # representaion: #' charToRaw(x1) #' charToRaw(x2) #' @export html_text <- function(x, trim = FALSE) { xml_text(x, trim = trim) } #' @export #' @rdname html_text #' @param preserve_nbsp Should non-breaking spaces be preserved? By default, #' `html_text2()` converts to ordinary spaces to ease further computation. #' When `preserve_nbsp` is `TRUE`, ` ` will appear in strings as #' `"\ua0"`. This often causes confusion because it prints the same way as #' `" "`. html_text2 <- function(x, preserve_nbsp = FALSE) { UseMethod("html_text2") } #' @export html_text2.xml_document <- function(x, preserve_nbsp = FALSE) { body <- xml2::xml_find_first(x, ".//body") html_text2(body, preserve_nbsp = preserve_nbsp) } #' @export html_text2.xml_nodeset <- function(x, preserve_nbsp = FALSE) { vapply( x, html_text2, preserve_nbsp = preserve_nbsp, FUN.VALUE = character(1) ) } #' @export html_text2.xml_node <- function(x, preserve_nbsp = FALSE) { text <- PaddedText$new() html_text_block(x, text, preserve_nbsp = preserve_nbsp) text$output() } #' @export html_text2.xml_missing <- function(x, preserve_nbsp = FALSE) { NA_character_ } # Algorithm roughly inspired by # https://html.spec.whatwg.org/multipage/dom.html#the-innertext-idl-attribute # but following deatils in # https://developer.mozilla.org/en-US/docs/Web/API/Document_Object_Model/Whitespace#How_does_CSS_process_whitespace html_text_block <- function(x, text, preserve_nbsp = FALSE) { if (xml2::xml_type(x) == "text") { text$add_text(collapse_whitespace(xml2::xml_text(x), preserve_nbsp)) } else if (is_inline(x)) { text$add_text(html_text_inline(x, preserve_nbsp)) } else { children <- xml2::xml_contents(x) n <- 
length(children) for (i in seq_along(children)) { child <- children[[i]] name <- xml2::xml_name(child) margin <- tag_margin(name) text$add_margin(margin) html_text_block(child, text, preserve_nbsp = preserve_nbsp) switch(name, tr = if (i != n) text$add_text("\n"), th = , td = if (i != n) text$add_text("\t"), br = text$add_text("\n") ) text$add_margin(margin) } } } is_inline <- function(x) { children <- xml2::xml_children(x) !any(xml2::xml_name(children) %in% c(block_tag, table_tag)) } block_tag <- c( # https://developer.mozilla.org/en-US/docs/Web/HTML/Block-level_elements "address", "article", "aside", "blockquote", "details", "dialog", "dd", "div", "dl", "dt", "fieldset", "figcaption", "figure", "footer", "form", "h1", "h2", "h3", "h4", "h5", "h6", "header", "hgroup", "hr", "li", "main", "nav", "ol", "p", "pre", "section", "table", "ul", "caption" ) table_tag <- c("tr", "td", "th") tag_margin <- function(name) { # + caption if (name == "p") { 2L } else if (name %in% block_tag) { 1L } else { 0L } } html_text_inline <- function(x, preserve_nbsp = FALSE) { children <- xml2::xml_contents(x) n <- length(children) if (n == 0) { return("") } text <- xml2::xml_text(children) is_br <- xml2::xml_name(children) == "br" line_num <- cumsum(c(TRUE, is_br[-n])) lines <- split(text, line_num) lines <- vapply(lines, paste0, collapse = "", FUN.VALUE = character(1)) if (xml2::xml_name(x) != "pre") { lines <- collapse_whitespace(lines, preserve_nbsp) } has_br <- unname(tapply(is_br, line_num, any)) paste0(lines, ifelse(has_br, "\n", ""), collapse = "") } # https://drafts.csswg.org/css-text/#white-space-phase-1 collapse_whitespace <- function(x, preserve_nbsp = FALSE) { # Remove leading and trailing whitespace x <- gsub("(^[ \t\n]+)|([ \t\n]+$)", "", x, perl = TRUE) # Convert any whitespace sequence to a space match <- if (preserve_nbsp) "[\t\n ]+" else "[\t\n \u00a0]+" x <- gsub(match, " ", x, perl = TRUE) x } # Text with line break padding in between blocks, collapsing breaks # 
similarly to css margin collapsing rules PaddedText <- R6::R6Class("PaddedText", list( text = character(), lines = 0, i = 1L, add_margin = function(n) { # Don't add breaks before encountering text if (self$i == 1) { return() } self$lines <- max(self$lines, n) }, convert_breaks = function() { if (self$lines == 0) { return() } self$text[[self$i]] <- strrep("\n", self$lines) self$i <- self$i + 1 self$lines <- 0 }, add_text = function(x) { # Ignore empty strings if (identical(x, "")) { return() } self$convert_breaks() self$text[[self$i]] <- x self$i <- self$i + 1L }, output = function() { paste(self$text, collapse = "") } )) rvest/R/testthat.R0000644000175000017500000000122313775645364013755 0ustar nileshnileshapp_request <- function() { req_json <- function(req, res) { out <- list( method = req$method, query = req$query_string, type = req$headers$`Content-Type` %||% NA_character_, body = rawToChar(req$.body %||% raw()) ) res$send_json(out, auto_unbox = TRUE) } app <- webfakes::new_app() app$post("/", req_json) app$get("/", req_json) app } show_response <- function(x) { strip_divider <- function(x) { gsub("-{3,}[a-f0-9]+", "---", x) } x <- httr::content(x) cat_line(toupper(x$method), " ", strip_divider(x$type)) cat_line("Query string: ", x$query) cat_line(strip_divider(x$body)) } rvest/R/session.R0000644000175000017500000001634214017431436013567 0ustar nileshnilesh#' Simulate a session in web browser #' #' @description #' This set of functions allows you to simulate a user interacting with a #' website, using forms and navigating from page to page. #' #' * Create a session with `session(url)` #' * Navigate to a specified url with `session_jump_to()`, or follow a link on the #' page with `session_follow_link()`. #' * Submit an [html_form] with `session_submit()`. #' * View the history with `session_history()` and navigate back and forward #' with `session_back()` and `session_forward()`. 
#' * Extract page contents with [html_element()] and [html_elements()], or get the
#'   complete HTML document with [read_html()].
#' * Inspect the HTTP response with [httr::cookies()], [httr::headers()],
#'   and [httr::status_code()].
#'
#' @param url For `session()` location to start, for `session_jump_to()`
#'   location to go to next.
#' @param ... Any additional httr config to use throughout the session.
#' @param x An object to test to see if it's a session.
#' @export
#' @examples
#' s <- session("http://hadley.nz")
#' s %>%
#'   session_jump_to("hadley-wickham.jpg") %>%
#'   session_jump_to("/") %>%
#'   session_history()
#'
#' s %>%
#'   session_jump_to("hadley-wickham.jpg") %>%
#'   session_back() %>%
#'   session_history()
#'
#' \donttest{
#' s %>%
#'   session_follow_link(css = "p a") %>%
#'   html_elements("p")
#' }
session <- function(url, ...) {
  session <- structure(
    list(
      handle   = httr::handle(url),
      config   = c(..., httr::config(autoreferer = 1L)),
      response = NULL,
      url      = NULL,
      back     = character(),
      forward  = character(),
      cache    = new_environment()
    ),
    class = "rvest_session"
  )
  # Perform the initial request so the returned session already has a response.
  session_get(session, url)
}

#' @export
#' @rdname session
is.session <- function(x) inherits(x, "rvest_session")

#' @export
print.rvest_session <- function(x, ...) {
  # NOTE(review): the label strings below are reproduced exactly as they appear
  # in this copy of the file; a leading tag and indentation may have been lost
  # in extraction -- confirm against the upstream source before relying on them.
  cat(" ", x$url, "\n", sep = "")
  cat(" Status: ", httr::status_code(x), "\n", sep = "")
  cat(" Type: ", httr::headers(x)$`Content-Type`, "\n", sep = "")
  cat(" Size: ", length(x$response$content), "\n", sep = "")
  invisible(x)
}

# GET `url` with the session's config and handle, and store the response in
# the session. Does not touch the back/forward history.
session_get <- function(x, url, ...) {
  resp <- httr::GET(url, x$config, ..., handle = x$handle)
  session_set_response(x, resp)
}

# Record `response` as the session's current page: warns on an HTTP error
# status, updates the current URL from the response, and resets the cache so
# previously parsed HTML is not reused.
session_set_response <- function(x, response) {
  httr::warn_for_status(response)

  x$response <- response
  x$url <- response$url
  x$cache <- new_environment()
  x
}

#' @param x A session.
#' @param url A URL, either relative or absolute, to navigate to.
#' @export
#' @rdname session
session_jump_to <- function(x, url, ...) {
  check_session(x)

  # Relative URLs are resolved against the page the session is currently on.
  url <- xml2::url_absolute(url, x$url)
  last_url <- x$url

  x <- session_get(x, url, ...)
  # Navigating somewhere new pushes the old page onto the back stack and
  # discards any forward history.
  x$back <- c(last_url, x$back)
  x$forward <- character()
  x
}

#' @param i An integer to select the ith link or a string to match the
#'   first link containing that text (case sensitive).
#' @inheritParams html_element
#' @export
#' @rdname session
session_follow_link <- function(x, i, css, xpath, ...) {
  check_session(x)

  url <- find_href(x, i = i, css = css, xpath = xpath)
  inform(paste0("Navigating to ", url))
  session_jump_to(x, url, ...)
}

# Locate a single link in `x` and return the value of its `href` attribute.
#
# Exactly one of `i`, `css` or `xpath` must be supplied:
# * numeric `i` selects the i-th `a` element; it must lie between 1 and the
#   number of `a` elements found, otherwise an error is raised;
# * character `i` selects the first `a` element whose text contains `i`
#   (fixed, case-sensitive match);
# * `css`/`xpath` select the first element matched by the selector.
#
# Errors if the arguments are inconsistent or nothing matches.
find_href <- function(x, i, css, xpath) {
  if (sum(!missing(i), !missing(css), !missing(xpath)) != 1) {
    abort("Must supply exactly one of `i`, `css`, or `xpath`")
  }

  if (!missing(i)) {
    stopifnot(length(i) == 1)

    a <- html_elements(x, "a")
    if (is.numeric(i)) {
      # Guard the index explicitly: `a[[i]]` would otherwise fail with an
      # opaque "subscript out of bounds" (or, for a negative index, could
      # silently select a different link).
      if (is.na(i) || i < 1 || i > length(a)) {
        abort(paste0(
          "Numeric `i` out of range: page has ", length(a), " link(s)"
        ))
      }
      out <- a[[i]]
    } else if (is.character(i)) {
      text <- html_text(a)
      match <- grepl(i, text, fixed = TRUE)
      if (!any(match)) {
        stop("No links have text '", i, "'", call. = FALSE)
      }

      out <- a[[which(match)[[1]]]]
    } else {
      # NOTE(review): message reads "must a" (missing "be"); left unchanged
      # because the package's snapshot tests pin this exact text.
      abort("`i` must a string or integer")
    }
  } else {
    a <- html_elements(x, css = css, xpath = xpath)
    if (length(a) == 0) {
      abort("No links matched `css`/`xpath`")
    }
    out <- a[[1]]
  }

  html_attr(out, "href")
}

#' @export
#' @rdname session
session_back <- function(x) {
  check_session(x)

  if (length(x$back) == 0) {
    abort("Can't go back any further")
  }

  # Pop the most recent entry off the back stack ...
  url <- x$back[[1]]
  x$back <- x$back[-1]
  old_url <- x$url

  # ... re-request it, and remember where we came from for session_forward().
  x <- session_get(x, url)
  x$forward <- c(old_url, x$forward)
  x
}

#' @export
#' @rdname session
session_forward <- function(x) {
  check_session(x)

  if (length(x$forward) == 0) {
    abort("Can't go forward any further")
  }

  url <- x$forward[[1]]
  old_url <- x$url

  x <- session_get(x, url)
  # Mirror image of session_back(): pop from forward, push onto back.
  x$forward <- x$forward[-1]
  x$back <- c(old_url, x$back)
  x
}

#' @export
#' @rdname session
session_history <- function(x) {
  check_session(x)

  # Oldest page first; the current page is the one prefixed with "- ".
  urls <- c(rev(x$back), x$url, x$forward)
  prefix <- rep(c(" ", "- ", " "), c(length(x$back), 1, length(x$forward)))
  cat_line(prefix, urls)
}

# form
# --------------------------------------------------------------------

#' @param form An [html_form] to submit
#' @inheritParams html_form_submit
#' @rdname session
#' @export
session_submit <- function(x, form, submit = NULL, ...) {
  check_session(x)
  check_form(form)

  # Build the request from the form, send it through the session's handle,
  # and make the result the session's current page.
  submission <- submission_build(form, submit)
  response <- submission_submit(submission, x$config, ..., handle = x$handle)
  session_set_response(x, response)
}

# xml2 methods ------------------------------------------------------------

#' @importFrom xml2 read_html
#' @export
read_html.rvest_session <- function(x, ...) {
  if (!is_html(x$response)) {
    abort("Page doesn't appear to be html.")
  }

  # The parsed document is stored in the session cache under "html".
  env_cache(
    x$cache,
    "html",
    read_html(x$response, ..., base_url = x$url)
  )
}

# Does the response's Content-Type header declare an HTML or XHTML media
# type? Returns FALSE when the header is absent.
is_html <- function(x) {
  content_type <- httr::headers(x)$`Content-Type`
  if (is.null(content_type)) {
    return(FALSE)
  }

  media <- httr::parse_media(content_type)
  media$complete %in% c("text/html", "application/xhtml+xml")
}

# rvest methods -----------------------------------------------------------------

# Each method below parses the session's current page with read_html() and
# forwards its arguments unchanged to the corresponding generic.

#' @export
html_form.rvest_session <- function(x, base_url = NULL) {
  page <- read_html(x)
  html_form(page, base_url = base_url)
}

#' @export
html_table.rvest_session <- function(x,
                                     header = NA,
                                     trim = TRUE,
                                     fill = deprecated(),
                                     dec = ".",
                                     na.strings = "NA",
                                     convert = TRUE) {
  page <- read_html(x)
  html_table(
    page,
    header = header,
    trim = trim,
    fill = fill,
    dec = dec,
    na.strings = na.strings,
    convert = convert
  )
}

#' @export
html_element.rvest_session <- function(x, css, xpath) {
  page <- read_html(x)
  html_element(page, css, xpath)
}

#' @export
html_elements.rvest_session <- function(x, css, xpath) {
  page <- read_html(x)
  html_elements(page, css, xpath)
}

# httr methods -----------------------------------------------------------------

# These methods delegate to the stored response object.

#' @importFrom httr status_code
#' @export
status_code.rvest_session <- function(x) status_code(x$response)

#' @importFrom httr headers
#' @export
headers.rvest_session <- function(x) headers(x$response)

#' @importFrom httr cookies
#' @export
cookies.rvest_session <- function(x) cookies(x$response)

# helpers 
----------------------------------------------------------------- check_form <- function(x) { if (!inherits(x, "rvest_form")) { abort("`form` must be a single form produced by html_form()") } } check_session <- function(x) { if (!inherits(x, "rvest_session")) { abort("`x` must be produced by session()") } } rvest/R/rvest-package.R0000644000175000017500000000066113770130734014637 0ustar nileshnilesh#' @keywords internal #' @import rlang #' @importFrom lifecycle deprecated "_PACKAGE" #' @importFrom xml2 read_html #' @export xml2::read_html #' @importFrom xml2 url_absolute #' @export xml2::url_absolute #' @export #' @importFrom magrittr %>% magrittr::`%>%` # The following block is used by usethis to automatically manage # roxygen namespace tags. Modify with care! ## usethis namespace: start ## usethis namespace: end NULL rvest/R/table.R0000644000175000017500000001701314101012613013151 0ustar nileshnilesh#' Parse an html table into a data frame #' #' The algorithm mimics what a browser does, but repeats the values of merged #' cells in every cell that cover. #' #' @inheritParams html_name #' @param header Use first row as header? If `NA`, will use first row #' if it consists of `
      ` tags. #' #' If `TRUE`, column names are left exactly as they are in the source #' document, which may require post-processing to generate a valid data #' frame. #' @param trim Remove leading and trailing whitespace within each cell? #' @param fill Deprecated - missing cells in tables are now always #' automatically filled with `NA`. #' @param dec The character used as decimal place marker. #' @param na.strings Character vector of values that will be converted to `NA` #' if `convert` is `TRUE`. #' @param convert If `TRUE`, will run [`type.convert()`] to interpret texts as #' integer, double, or `NA`. #' @return #' When applied to a single element, `html_table()` returns a single tibble. #' When applied to multiple elements or a document, `html_table()` returns #' a list of tibbles. #' @export #' @examples #' sample1 <- minimal_html(" #' #' #' #' #'
      Col ACol B
      1x
      4y
      10z
      ") #' sample1 %>% #' html_element("table") %>% #' html_table() #' #' # Values in merged cells will be duplicated #' sample2 <- minimal_html(" #' #' #' #' #'
      ABC
      123
      45
      67
      ") #' sample2 %>% #' html_element("table") %>% #' html_table() #' #' # If a row is missing cells, they'll be filled with NAs #' sample3 <- minimal_html(" #' #' #' #' #'
      ABC
      12
      3
      4
      ") #' sample3 %>% #' html_element("table") %>% #' html_table() html_table <- function(x, header = NA, trim = TRUE, fill = deprecated(), dec = ".", na.strings = "NA", convert = TRUE ) { UseMethod("html_table") } #' @export html_table.xml_document <- function(x, header = NA, trim = TRUE, fill = deprecated(), dec = ".", na.strings = "NA", convert = TRUE) { tables <- xml2::xml_find_all(x, ".//table") html_table( tables, header = header, trim = trim, fill = fill, dec = dec, na.strings = na.strings, convert = convert ) } #' @export html_table.xml_nodeset <- function(x, header = NA, trim = TRUE, fill = deprecated(), dec = ".", na.strings = "NA", convert = TRUE) { lapply( x, html_table, header = header, trim = trim, fill = fill, dec = dec, na.strings = na.strings, convert = convert ) } #' @export html_table.xml_node <- function(x, header = NA, trim = TRUE, fill = deprecated(), dec = ".", na.strings = "NA", convert = TRUE) { if (lifecycle::is_present(fill) && !isTRUE(fill)) { lifecycle::deprecate_warn( when = "1.0.0", what = "html_table(fill = )", details = "An improved algorithm fills by default so it is no longer needed." 
) } ns <- xml2::xml_ns(x) rows <- xml2::xml_find_all(x, ".//tr", ns = ns) cells <- lapply(rows, xml2::xml_find_all, ".//td|.//th", ns = ns) if (length(cells) == 0) { return(tibble::tibble()) } out <- table_fill(cells, trim = trim) if (is.na(header)) { header <- all(html_name(cells[[1]]) == "th") } if (header) { col_names <- out[1, , drop = FALSE] out <- out[-1, , drop = FALSE] } else { col_names <- paste0("X", seq_len(ncol(out))) } colnames(out) <- col_names df <- tibble::as_tibble(out, .name_repair = "minimal") if (isTRUE(convert)) { df[] <- lapply(df, function(x) { utils::type.convert(x, as.is = TRUE, dec = dec, na.strings = na.strings) }) } df } # Table fillng algorithm -------------------------------------------------- # Base on https://html.spec.whatwg.org/multipage/tables.html#forming-a-table table_fill <- function(cells, trim = TRUE) { width <- 0 height <- length(cells) # initial estimate values <- vector("list", height) # list of downward spanning cells dw <- dw_init() # https://html.spec.whatwg.org/multipage/tables.html#algorithm-for-processing-rows for (i in seq_along(cells)) { row <- cells[[i]] if (length(row) == 0) { next } rowspan <- as.integer(html_attr(row, "rowspan", default = NA_character_)) rowspan[is.na(rowspan)] <- 1 colspan <- as.integer(html_attr(row, "colspan", default = NA_character_)) colspan[is.na(colspan)] <- 1 text <- html_text(row) if (isTRUE(trim)) { text <- gsub("^[[:space:]\u00a0]+|[[:space:]\u00a0]+$", "", text) } vals <- rep(NA_character_, width) col <- 1 j <- 1 while(j <= length(row)) { if (col %in% dw$col) { cell <- dw_find(dw, col) cell_text <- cell$text cell_colspan <- cell$colspan } else { cell_text <- text[[j]] cell_colspan <- colspan[[j]] if (rowspan[[j]] > 1) { dw <- dw_add(dw, col, rowspan[[j]], colspan[[j]], text[[j]]) } j <- j + 1 } vals[col:(col + cell_colspan - 1L)] <- cell_text col <- col + cell_colspan } # Add any downward cells after last
      for(j in seq2(col - 1L, width)) { if (j %in% dw$col) { cell <- dw_find(dw, j) vals[j:(j + cell$colspan - 1L)] <- cell$text } } dw <- dw_prune(dw) values[[i]] <- vals height <- max(height, i + max(rowspan) - 1L) width <- max(width, col - 1L) } # Add any downward cells after
      `, `` (table row), `
      ` (table heading), and `` (table data). Here's a simple HTML table with two columns and three rows: ```{r} html <- minimal_html("
      x y
      1.5 2.7
      4.9 1.3
      7.2 8.1
      ") ``` Because tables are a common way to store data, rvest includes the handy `html_table()` which converts a table into a data frame: ```{r} html %>% html_node("table") %>% html_table() ``` ## Element vs elements When using rvest, your eventual goal is usually to build up a data frame, and you want each row to correspond some repeated unit on the HTML page. In this case, you should generally start by using `html_elements()` to select the elements that contain each observation then use `html_element()` to extract the variables from each observation. This guarantees that you'll get the same number of values for each variable because `html_element()` always returns the same number of outputs as inputs. To illustrate this problem take a look at this simple example I constructed using a few entries from `dplyr::starwars`: ```{r} html <- minimal_html("
      • C-3PO is a droid that weighs 167 kg
      • R2-D2 is a droid that weighs 96 kg
      • Yoda weighs 66 kg
      • R4-P17 is a droid
      ") ``` If you try to extract name, species, and weight directly, you end up with one vector of length four and two vectors of length three, and no way to align them: ```{r} html %>% html_elements("b") %>% html_text2() html %>% html_elements("i") %>% html_text2() html %>% html_elements(".weight") %>% html_text2() ``` Instead, use `html_elements()` to find a element that corresponds to each character, then use `html_element()` to extract each variable for all observations: ```{r} characters <- html %>% html_elements("li") characters %>% html_element("b") %>% html_text2() characters %>% html_element("i") %>% html_text2() characters %>% html_element(".weight") %>% html_text2() ``` `html_element()` automatically fills in `NA` when no elements match, keeping all of the variables aligned and making it easy to create a data frame: ```{r} data.frame( name = characters %>% html_element("b") %>% html_text2(), species = characters %>% html_element("i") %>% html_text2(), weight = characters %>% html_element(".weight") %>% html_text2() ) ``` rvest/inst/doc/starwars.R0000644000175000017500000000126314132343545015270 0ustar nileshnilesh## ---- echo = FALSE, results = "asis"------------------------------------------ library(rvest) crawl_html <- function(x) { x %>% gsub("\r", "", .) %>% gsub("\n\n", "

      ", .) %>% gsub("\n", " ", .) %>% paste0("

      ", ., "

      ") } film_desc <- function(x) { glue::glue_data(x, "

      {title}

      Released: {release_date}

      Director: {director}

      {crawl_html(opening_crawl)}
      ") } films <- repurrrsive::sw_films films <- films[order(sapply(films, "[[", "episode_id"))] descs <- vapply(films, film_desc, character(1)) writeLines(descs) rvest/inst/doc/rvest.html0000644000175000017500000014160014132343545015330 0ustar nileshnilesh Web scraping 101

      Web scraping 101

      This vignette introduces you to the basics of web scraping with rvest. You’ll first learn the basics of HTML and how to use CSS selectors to refer to specific elements, then you’ll learn how to use rvest functions to get data out of HTML and into R.

      library(rvest)

      HTML basics

      HTML stands for “HyperText Markup Language” and looks like this:

      <html>
      <head>
        <title>Page title</title>
      </head>
      <body>
        <h1 id='first'>A heading</h1>
        <p>Some text &amp; <b>some bold text.</b></p>
        <img src='myimg.png' width='100' height='100'>
      </body>

      HTML has a hierarchical structure formed by elements which consist of a start tag (e.g. <tag>), optional attributes (id='first'), an end tag[1] (like </tag>), and contents (everything in between the start and end tag).

      Since < and > are used for start and end tags, you can’t write them directly. Instead you have to use the HTML escapes &gt; (greater than) and &lt; (less than). And since those escapes use &, if you want a literal ampersand you have to escape it as &amp;. There are a wide range of possible HTML escapes but you don’t need to worry about them too much because rvest automatically handles them for you.

      Elements

      All up, there are over 100 HTML elements. Some of the most important are:

      • Every HTML page must be in an <html> element, and it must have two children: <head>, which contains document metadata like the page title, and <body>, which contains the content you see in the browser.

      • Block tags like <h1> (heading 1), <p> (paragraph), and <ol> (ordered list) form the overall structure of the page.

      • Inline tags like <b> (bold), <i> (italics), and <a> (links) format text inside block tags.

      If you encounter a tag that you’ve never seen before, you can find out what it does with a little googling. I recommend the MDN Web Docs which are produced by Mozilla, the company that makes the Firefox web browser.

      Contents

      Most elements can have content in between their start and end tags. This content can either be text or more elements. For example, the following HTML contains a paragraph of text, with one word in bold.

      <p>Hi! My <b>name</b> is Hadley.</p>

      The children of a node refers only to elements, so the <p> element above has one child, the <b> element. The <b> element has no children, but it does have contents (the text “name”).

      Some elements, like <img> can’t have children. These elements depend solely on attributes for their behavior.

      Attributes

      Tags can have named attributes which look like name1='value1' name2='value2'. Two of the most important attributes are id and class, which are used in conjunction with CSS (Cascading Style Sheets) to control the visual appearance of the page. These are often useful when scraping data off a page.

      Reading HTML with rvest

      You’ll usually start the scraping process with read_html(). This returns an xml_document[2] object, which you’ll then manipulate using rvest functions:

      html <- read_html("http://rvest.tidyverse.org/")
      class(html)
      #> [1] "xml_document" "xml_node"

      For examples and experimentation, rvest also includes a function that lets you create an xml_document from literal HTML:

      html <- minimal_html("
        <p>This is a paragraph<p>
        <ul>
          <li>This is a bulleted list</li>
        </ul>
      ")
      html
      #> {html_document}
      #> <html>
      #> [1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8 ...
      #> [2] <body>\n<p>This is a paragraph</p>\n<p>\n  </p>\n<ul>\n<li>This is a bull ...

      Regardless of how you get the HTML, you’ll need some way to identify the elements that contain the data you care about. rvest provides two options: CSS selectors and XPath expressions. Here I’ll focus on CSS selectors because they’re simpler but still sufficiently powerful for most scraping tasks.

      CSS selectors

      CSS is short for cascading style sheets, and is a tool for defining the visual styling of HTML documents. CSS includes a miniature language for selecting elements on a page called CSS selectors. CSS selectors define patterns for locating HTML elements, and are useful for scraping because they provide a concise way of describing which elements you want to extract.

      CSS selectors can be quite complex, but fortunately you only need the simplest for rvest, because you can also write R code for more complicated situations. The four most important selectors are:

      • p: selects all <p> elements.

      • .title: selects all elements with class “title”.

      • p.special: selects all <p> elements with class “special”.

      • #title: selects the element with the id attribute that equals “title”. Id attributes must be unique within a document, so this will only ever select a single element.

      If you want to learn more CSS selectors I recommend starting with the fun CSS Diner tutorial and then referring to the MDN web docs.

      Let’s try out the most important selectors with a simple example:

      html <- minimal_html("
        <h1>This is a heading</h1>
        <p id='first'>This is a paragraph</p>
        <p class='important'>This is an important paragraph</p>
      ")

      In rvest you can extract a single element with html_element() or all matching elements with html_elements(). Both functions take a document[3] and a CSS selector:

      html %>% html_element("h1")
      #> {html_node}
      #> <h1>
      html %>% html_elements("p")
      #> {xml_nodeset (2)}
      #> [1] <p id="first">This is a paragraph</p>
      #> [2] <p class="important">This is an important paragraph</p>
      html %>% html_elements(".important")
      #> {xml_nodeset (1)}
      #> [1] <p class="important">This is an important paragraph</p>
      html %>% html_elements("#first")
      #> {xml_nodeset (1)}
      #> [1] <p id="first">This is a paragraph</p>

      Selectors can also be combined in various ways using combinators. The most important combinator is “ ” (a single space), the descendant combinator: for example, p a selects all <a> elements that are descendants of a <p> element.

      If you don’t know exactly what selector you need, I highly recommend using SelectorGadget, which lets you automatically generate the selector you need by supplying positive and negative examples in the browser.

      Extracting data

      Now that you’ve got the elements you care about, you’ll need to get data out of them. You’ll usually get the data from either the text contents or an attribute. But, sometimes (if you’re lucky!), the data you need will be in an HTML table.

      Text

      Use html_text2() to extract the plain text contents of an HTML element:

      html <- minimal_html("
        <ol>
          <li>apple &amp; pear</li>
          <li>banana</li>
          <li>pineapple</li>
        </ol>
      ")
      html %>% 
        html_elements("li") %>% 
        html_text2()
      #> [1] "apple & pear" "banana"       "pineapple"

      Note that the escaped ampersand is automatically converted to &; you’ll only ever see HTML escapes in the source HTML, not in the data returned by rvest.

      You might wonder why I used html_text2(), since it seems to give the same result as html_text():

      html %>% 
        html_elements("li") %>% 
        html_text()
      #> [1] "apple & pear" "banana"       "pineapple"

      The main difference is how the two functions handle white space. In HTML, white space is largely ignored, and it’s the structure of the elements that defines how text is laid out. html_text2() does its best to follow the same rules, giving you something similar to what you’d see in the browser. Take this example which contains a bunch of white space that HTML ignores.

      html <- minimal_html("<body>
        <p>
        This is
        a
        paragraph.</p><p>This is another paragraph.
        
        It has two sentences.</p>
      ")

      html_text2() gives you what you expect: two paragraphs of text separated by a blank line.

      html %>% 
        html_element("body") %>% 
        html_text2() %>% 
        cat()
      #> This is a paragraph.
      #> 
      #> This is another paragraph. It has two sentences.

      Whereas html_text() returns the garbled raw underlying text:

      html %>% 
        html_element("body") %>% 
        html_text() %>% 
        cat()
      #> 
      #>   
      #>   This is
      #>   a
      #>   paragraph.This is another paragraph.
      #>   
      #>   It has two sentences.

      Attributes

      Attributes are used to record the destination of links (the href attribute of <a> elements) and the source of images (the src attribute of the <img> element):

      html <- minimal_html("
        <p><a href='https://en.wikipedia.org/wiki/Cat'>cats</a></p>
        <img src='https://cataas.com/cat' width='100' height='200'>
      ")

      The value of an attribute can be retrieved with html_attr():

      html %>% 
        html_elements("a") %>% 
        html_attr("href")
      #> [1] "https://en.wikipedia.org/wiki/Cat"
      
      html %>% 
        html_elements("img") %>% 
        html_attr("src")
      #> [1] "https://cataas.com/cat"

      Note that html_attr() always returns a string, so you may need to post-process with as.integer()/readr::parse_integer() or similar.

      html %>% 
        html_elements("img") %>% 
        html_attr("width")
      #> [1] "100"
      
      html %>% 
        html_elements("img") %>% 
        html_attr("width") %>% 
        as.integer()
      #> [1] 100

      Tables

      HTML tables are composed of four main elements: <table>, <tr> (table row), <th> (table heading), and <td> (table data). Here’s a simple HTML table with two columns and three rows:

      html <- minimal_html("
        <table>
          <tr>
            <th>x</th>
            <th>y</th>
          </tr>
          <tr>
            <td>1.5</td>
            <td>2.7</td>
          </tr>
          <tr>
            <td>4.9</td>
            <td>1.3</td>
          </tr>
          <tr>
            <td>7.2</td>
            <td>8.1</td>
          </tr>
        </table>
        ")

      Because tables are a common way to store data, rvest includes the handy html_table() which converts a table into a data frame:

      html %>% 
        html_node("table") %>% 
        html_table()
      #> # A tibble: 3 × 2
      #>       x     y
      #>   <dbl> <dbl>
      #> 1   1.5   2.7
      #> 2   4.9   1.3
      #> 3   7.2   8.1

      Element vs elements

      When using rvest, your eventual goal is usually to build up a data frame, and you want each row to correspond to some repeated unit on the HTML page. In this case, you should generally start by using html_elements() to select the elements that contain each observation then use html_element() to extract the variables from each observation. This guarantees that you’ll get the same number of values for each variable because html_element() always returns the same number of outputs as inputs.

      To illustrate this problem take a look at this simple example I constructed using a few entries from dplyr::starwars:

      html <- minimal_html("
        <ul>
          <li><b>C-3PO</b> is a <i>droid</i> that weighs <span class='weight'>167 kg</span></li>
          <li><b>R2-D2</b> is a <i>droid</i> that weighs <span class='weight'>96 kg</span></li>
          <li><b>Yoda</b> weighs <span class='weight'>66 kg</span></li>
          <li><b>R4-P17</b> is a <i>droid</i></li>
        </ul>
        ")

      If you try to extract name, species, and weight directly, you end up with one vector of length four and two vectors of length three, and no way to align them:

      html %>% html_elements("b") %>% html_text2()
      #> [1] "C-3PO"  "R2-D2"  "Yoda"   "R4-P17"
      html %>% html_elements("i") %>% html_text2()
      #> [1] "droid" "droid" "droid"
      html %>% html_elements(".weight") %>% html_text2()
      #> [1] "167 kg" "96 kg"  "66 kg"

      Instead, use html_elements() to find an element that corresponds to each character, then use html_element() to extract each variable for all observations:

      characters <- html %>% html_elements("li")
      
      characters %>% html_element("b") %>% html_text2()
      #> [1] "C-3PO"  "R2-D2"  "Yoda"   "R4-P17"
      characters %>% html_element("i") %>% html_text2()
      #> [1] "droid" "droid" NA      "droid"
      characters %>% html_element(".weight") %>% html_text2()
      #> [1] "167 kg" "96 kg"  "66 kg"  NA

      html_element() automatically fills in NA when no elements match, keeping all of the variables aligned and making it easy to create a data frame:

      data.frame(
        name = characters %>% html_element("b") %>% html_text2(),
        species = characters %>% html_element("i") %>% html_text2(),
        weight = characters %>% html_element(".weight") %>% html_text2()
      )
      #>     name species weight
      #> 1  C-3PO   droid 167 kg
      #> 2  R2-D2   droid  96 kg
      #> 3   Yoda    <NA>  66 kg
      #> 4 R4-P17   droid   <NA>

      1. A number of tags (including <p> and <li>) don’t require end tags, but I think it’s best to include them because it makes seeing the structure of the HTML a little easier.↩︎

      2. This class comes from the xml2 package. xml2 is a low-level package that rvest builds on top of.↩︎

      3. Or another element, more on that shortly.↩︎

      rvest/inst/doc/rvest.R0000644000175000017500000001022714132343545014565 0ustar nileshnilesh## ---- echo=FALSE-------------------------------------------------------------- knitr::opts_chunk$set(comment = "#>", collapse = TRUE) ## ----------------------------------------------------------------------------- library(rvest) ## ----------------------------------------------------------------------------- html <- read_html("http://rvest.tidyverse.org/") class(html) ## ----------------------------------------------------------------------------- html <- minimal_html("

      This is a paragraph

      • This is a bulleted list
      ") html ## ----------------------------------------------------------------------------- html <- minimal_html("

      This is a heading

      This is a paragraph

      This is an important paragraph

      ") ## ----------------------------------------------------------------------------- html %>% html_element("h1") html %>% html_elements("p") html %>% html_elements(".important") html %>% html_elements("#first") ## ----------------------------------------------------------------------------- html <- minimal_html("
      1. apple & pear
      2. banana
      3. pineapple
      ") html %>% html_elements("li") %>% html_text2() ## ----------------------------------------------------------------------------- html %>% html_elements("li") %>% html_text() ## ----------------------------------------------------------------------------- html <- minimal_html("

      This is a paragraph.

      This is another paragraph. It has two sentences.

      ") ## ----------------------------------------------------------------------------- html %>% html_element("body") %>% html_text2() %>% cat() ## ----------------------------------------------------------------------------- html %>% html_element("body") %>% html_text() %>% cat() ## ----------------------------------------------------------------------------- html <- minimal_html("

      cats

      ") ## ----------------------------------------------------------------------------- html %>% html_elements("a") %>% html_attr("href") html %>% html_elements("img") %>% html_attr("src") ## ----------------------------------------------------------------------------- html %>% html_elements("img") %>% html_attr("width") html %>% html_elements("img") %>% html_attr("width") %>% as.integer() ## ----------------------------------------------------------------------------- html <- minimal_html("
      x y
      1.5 2.7
      4.9 1.3
      7.2 8.1
      ") ## ----------------------------------------------------------------------------- html %>% html_node("table") %>% html_table() ## ----------------------------------------------------------------------------- html <- minimal_html("
      • C-3PO is a droid that weighs 167 kg
      • R2-D2 is a droid that weighs 96 kg
      • Yoda weighs 66 kg
      • R4-P17 is a droid
      ") ## ----------------------------------------------------------------------------- html %>% html_elements("b") %>% html_text2() html %>% html_elements("i") %>% html_text2() html %>% html_elements(".weight") %>% html_text2() ## ----------------------------------------------------------------------------- characters <- html %>% html_elements("li") characters %>% html_element("b") %>% html_text2() characters %>% html_element("i") %>% html_text2() characters %>% html_element(".weight") %>% html_text2() ## ----------------------------------------------------------------------------- data.frame( name = characters %>% html_element("b") %>% html_text2(), species = characters %>% html_element("i") %>% html_text2(), weight = characters %>% html_element(".weight") %>% html_text2() ) rvest/inst/doc/starwars.html0000644000175000017500000002722514132343545016041 0ustar nileshnilesh Star Wars films

      Star Wars films

      This vignette contains some data about the Star Wars films for use in rvest examples and vignettes.

      The Phantom Menace

      Released: 1999-05-19

      Director: George Lucas

      Turmoil has engulfed the Galactic Republic. The taxation of trade routes to outlying star systems is in dispute.

      Hoping to resolve the matter with a blockade of deadly battleships, the greedy Trade Federation has stopped all shipping to the small planet of Naboo.

      While the Congress of the Republic endlessly debates this alarming chain of events, the Supreme Chancellor has secretly dispatched two Jedi Knights, the guardians of peace and justice in the galaxy, to settle the conflict….

      Attack of the Clones

      Released: 2002-05-16

      Director: George Lucas

      There is unrest in the Galactic Senate. Several thousand solar systems have declared their intentions to leave the Republic.

      This separatist movement, under the leadership of the mysterious Count Dooku, has made it difficult for the limited number of Jedi Knights to maintain peace and order in the galaxy.

      Senator Amidala, the former Queen of Naboo, is returning to the Galactic Senate to vote on the critical issue of creating an ARMY OF THE REPUBLIC to assist the overwhelmed Jedi….

      Revenge of the Sith

      Released: 2005-05-19

      Director: George Lucas

      War! The Republic is crumbling under attacks by the ruthless Sith Lord, Count Dooku. There are heroes on both sides. Evil is everywhere.

      In a stunning move, the fiendish droid leader, General Grievous, has swept into the Republic capital and kidnapped Chancellor Palpatine, leader of the Galactic Senate.

      As the Separatist Droid Army attempts to flee the besieged capital with their valuable hostage, two Jedi Knights lead a desperate mission to rescue the captive Chancellor….

      A New Hope

      Released: 1977-05-25

      Director: George Lucas

      It is a period of civil war. Rebel spaceships, striking from a hidden base, have won their first victory against the evil Galactic Empire.

      During the battle, Rebel spies managed to steal secret plans to the Empire’s ultimate weapon, the DEATH STAR, an armored space station with enough power to destroy an entire planet.

      Pursued by the Empire’s sinister agents, Princess Leia races home aboard her starship, custodian of the stolen plans that can save her people and restore freedom to the galaxy….

      The Empire Strikes Back

      Released: 1980-05-17

      Director: Irvin Kershner

      It is a dark time for the Rebellion. Although the Death Star has been destroyed, Imperial troops have driven the Rebel forces from their hidden base and pursued them across the galaxy.

      Evading the dreaded Imperial Starfleet, a group of freedom fighters led by Luke Skywalker has established a new secret base on the remote ice world of Hoth.

      The evil lord Darth Vader, obsessed with finding young Skywalker, has dispatched thousands of remote probes into the far reaches of space….

      Return of the Jedi

      Released: 1983-05-25

      Director: Richard Marquand

      Luke Skywalker has returned to his home planet of Tatooine in an attempt to rescue his friend Han Solo from the clutches of the vile gangster Jabba the Hutt.

      Little does Luke know that the GALACTIC EMPIRE has secretly begun construction on a new armored space station even more powerful than the first dreaded Death Star.

      When completed, this ultimate weapon will spell certain doom for the small band of rebels struggling to restore freedom to the galaxy…

      The Force Awakens

      Released: 2015-12-11

      Director: J. J. Abrams

      Luke Skywalker has vanished. In his absence, the sinister FIRST ORDER has risen from the ashes of the Empire and will not rest until Skywalker, the last Jedi, has been destroyed. With the support of the REPUBLIC, General Leia Organa leads a brave RESISTANCE. She is desperate to find her brother Luke and gain his help in restoring peace and justice to the galaxy. Leia has sent her most daring pilot on a secret mission to Jakku, where an old ally has discovered a clue to Luke’s whereabouts….

      rvest/inst/doc/starwars.Rmd0000644000175000017500000000165113775161316015620 0ustar nileshnilesh--- title: "Star Wars films" output: rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{Star Wars films} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- This vignette contains some data about the Star Wars films for use in rvest examples and vignettes.
```{r, echo = FALSE, results = "asis"}
library(rvest)

# NOTE(review): the whitespace-only string literals below ("\n\n      ") and
# the bare template lines in film_desc() look like HTML markup that was
# stripped when this file was extracted -- confirm against the upstream
# vignette source before relying on the rendered output.

# Normalise an opening-crawl string for output.
#
# x: character vector holding the crawl text.
# Returns x with carriage returns removed, each blank-line paragraph break
# replaced by the separator string, every remaining single newline replaced
# by a space, and the result wrapped in a leading and trailing separator.
crawl_html <- function(x) {
  x %>%
    gsub("\r", "", .) %>%
    # Paragraph breaks must be rewritten before the single newlines are
    # flattened to spaces, otherwise they could no longer be told apart.
    gsub("\n\n", "\n\n      ", .) %>%
    gsub("\n", " ", .) %>%
    paste0("\n\n      ", ., "\n\n      ")
}

# Build the description of one film.
#
# x: a list-like film record providing `title`, `release_date`, `director`
#    and `opening_crawl` (the fields interpolated by the template below).
# Returns the glue string produced by filling the template from x.
film_desc <- function(x) {
  glue::glue_data(x, "

      {title}

      Released: {release_date}

      Director: {director}

      {crawl_html(opening_crawl)}
      ")
}

films <- repurrrsive::sw_films

# Sort the films by episode number. vapply() (rather than sapply()) pins the
# extracted ids to a length-one numeric per film, so a malformed record fails
# loudly instead of silently changing the type handed to order().
episode_ids <- vapply(
  films,
  function(film) film[["episode_id"]],
  numeric(1)
)
films <- films[order(episode_ids)]

descs <- vapply(films, film_desc, character(1))
writeLines(descs)
```
rvest/NAMESPACE0000644000175000017500000000366114132342306012772 0ustar nileshnilesh# Generated by roxygen2: do not edit by hand S3method(cookies,rvest_session) S3method(format,rvest_field) S3method(headers,rvest_session) S3method(html_element,default) S3method(html_element,rvest_session) S3method(html_elements,default) S3method(html_elements,rvest_session) S3method(html_form,rvest_session) S3method(html_form,xml_document) S3method(html_form,xml_node) S3method(html_form,xml_nodeset) S3method(html_table,rvest_session) S3method(html_table,xml_document) S3method(html_table,xml_node) S3method(html_table,xml_nodeset) S3method(html_text2,xml_document) S3method(html_text2,xml_missing) S3method(html_text2,xml_node) S3method(html_text2,xml_nodeset) S3method(print,rvest_field) S3method(print,rvest_form) S3method(print,rvest_session) S3method(read_html,rvest_session) S3method(status_code,rvest_session) export("%>%") export(back) export(follow_link) export(forward) export(google_form) export(guess_encoding) export(html_attr) export(html_attrs) export(html_children) export(html_element) export(html_elements) export(html_encoding_guess) export(html_form) export(html_form_set) export(html_form_submit) export(html_name) export(html_node) export(html_nodes) export(html_session) export(html_table) export(html_text) export(html_text2) export(is.session) export(jump_to) export(minimal_html) export(read_html) export(repair_encoding) export(session) export(session_back) export(session_follow_link) export(session_forward) export(session_history) export(session_jump_to) export(session_submit) export(set_values) export(submit_form) export(url_absolute) export(xml_node) export(xml_nodes) export(xml_tag) import(rlang) importFrom(httr,cookies) importFrom(httr,headers) importFrom(httr,status_code)
importFrom(lifecycle,deprecated) importFrom(magrittr,"%>%") importFrom(xml2,read_html) importFrom(xml2,url_absolute) importFrom(xml2,xml_attr) importFrom(xml2,xml_attrs) importFrom(xml2,xml_children) importFrom(xml2,xml_name) importFrom(xml2,xml_text)