rvest/0000755000176200001440000000000014562475432011431 5ustar liggesusersrvest/NAMESPACE0000644000176200001440000000411614557166073012654 0ustar liggesusers# Generated by roxygen2: do not edit by hand S3method(cookies,rvest_session) S3method(format,rvest_field) S3method(headers,rvest_session) S3method(html_element,LiveHTML) S3method(html_element,default) S3method(html_element,rvest_session) S3method(html_elements,LiveHTML) S3method(html_elements,default) S3method(html_elements,rvest_session) S3method(html_form,rvest_session) S3method(html_form,xml_document) S3method(html_form,xml_node) S3method(html_form,xml_nodeset) S3method(html_table,LiveHTML) S3method(html_table,rvest_session) S3method(html_table,xml_document) S3method(html_table,xml_node) S3method(html_table,xml_nodeset) S3method(html_text2,xml_document) S3method(html_text2,xml_missing) S3method(html_text2,xml_node) S3method(html_text2,xml_nodeset) S3method(print,rvest_field) S3method(print,rvest_form) S3method(print,rvest_session) S3method(read_html,rvest_session) S3method(status_code,rvest_session) export("%>%") export(LiveHTML) export(back) export(follow_link) export(forward) export(google_form) export(guess_encoding) export(html_attr) export(html_attrs) export(html_children) export(html_element) export(html_elements) export(html_encoding_guess) export(html_form) export(html_form_set) export(html_form_submit) export(html_name) export(html_node) export(html_nodes) export(html_session) export(html_table) export(html_text) export(html_text2) export(is.session) export(jump_to) export(minimal_html) export(read_html) export(read_html_live) export(repair_encoding) export(session) export(session_back) export(session_follow_link) export(session_forward) export(session_history) export(session_jump_to) export(session_submit) export(set_values) export(submit_form) export(url_absolute) export(xml_node) export(xml_nodes) export(xml_tag) import(rlang) importFrom(glue,glue) importFrom(httr,cookies) 
importFrom(httr,headers) importFrom(httr,status_code) importFrom(lifecycle,deprecated) importFrom(magrittr,"%>%") importFrom(xml2,read_html) importFrom(xml2,url_absolute) importFrom(xml2,xml_attr) importFrom(xml2,xml_attrs) importFrom(xml2,xml_children) importFrom(xml2,xml_name) importFrom(xml2,xml_text) rvest/demo/0000755000176200001440000000000013775436633012362 5ustar liggesusersrvest/demo/united.R0000644000176200001440000000071213775436633013775 0ustar liggesusers# Scrape miles from united site library(rvest) united <- session("http://www.united.com/") login <- united %>% html_element("form[name=LoginForm]") %>% html_form() %>% html_form_set( MpNumber = "GY797363", Password = password ) logged_in <- united %>% session_submit(login) logged_in %>% follow_link("View account") %>% html_element("#ctl00_ContentInfo_AccountSummary_spanEliteMilesNew") %>% html_text() %>% readr::parse_number() rvest/demo/zillow.R0000644000176200001440000000151413775423476014027 0ustar liggesusers# Inspired by https://github.com/notesofdabbler library(rvest) library(tidyr) page <- read_html("http://www.zillow.com/homes/for_sale/Greenwood-IN/fsba,fsbo,fore,cmsn_lt/house_type/52333_rid/39.638414,-86.011362,39.550714,-86.179419_rect/12_zm/0_mmm/") houses <- page %>% html_elements(".photo-cards li article") z_id <- houses %>% html_attr("id") address <- houses %>% html_element(".zsg-photo-card-address") %>% html_text() price <- houses %>% html_element(".zsg-photo-card-price") %>% html_text() %>% readr::parse_number() params <- houses %>% html_element(".zsg-photo-card-info") %>% html_text() %>% strsplit("\u00b7") beds <- params %>% purrr::map_chr(1) %>% readr::parse_number() baths <- params %>% purrr::map_chr(2) %>% readr::parse_number() house_area <- params %>% purrr::map_chr(3) %>% readr::parse_number() rvest/demo/00Index0000644000176200001440000000021713767413737013515 0ustar liggesusersunited Scrape mileage details from united.com tripadvisor Scrape review data from tripadvisor zillow 
Scrape housing info from zillow rvest/demo/tripadvisor.R0000644000176200001440000000156113775423515015051 0ustar liggesusers# Inspired by # http://notesofdabbler.github.io/201408_hotelReview/scrapeTripAdvisor.html library(rvest) url <- "http://www.tripadvisor.com/Hotel_Review-g37209-d1762915-Reviews-JW_Marriott_Indianapolis-Indianapolis_Indiana.html" reviews <- url %>% read_html() %>% html_elements("#REVIEWS .innerBubble") id <- reviews %>% html_element(".quote a") %>% html_attr("id") quote <- reviews %>% html_element(".quote span") %>% html_text() rating <- reviews %>% html_element(".rating .rating_s_fill") %>% html_attr("alt") %>% gsub(" of 5 stars", "", .) %>% as.integer() date <- reviews %>% html_element(".rating .ratingDate") %>% html_attr("title") %>% strptime("%b %d, %Y") %>% as.POSIXct() review <- reviews %>% html_element(".entry .partial_entry") %>% html_text() data.frame(id, quote, rating, date, review, stringsAsFactors = FALSE) %>% View() rvest/LICENSE0000644000176200001440000000005314554031036012421 0ustar liggesusersYEAR: 2023 COPYRIGHT HOLDER: rvest authors rvest/README.md0000644000176200001440000001063414554031036012701 0ustar liggesusers # rvest rvest website [![CRAN status](https://www.r-pkg.org/badges/version/rvest)](https://cran.r-project.org/package=rvest) [![R-CMD-check](https://github.com/tidyverse/rvest/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/tidyverse/rvest/actions/workflows/R-CMD-check.yaml) [![Codecov test coverage](https://codecov.io/gh/tidyverse/rvest/branch/main/graph/badge.svg)](https://app.codecov.io/gh/tidyverse/rvest?branch=main) ## Overview rvest helps you scrape (or harvest) data from web pages. It is designed to work with [magrittr](https://github.com/tidyverse/magrittr) to make it easy to express common web scraping tasks, inspired by libraries like [beautiful soup](https://www.crummy.com/software/BeautifulSoup/) and [RoboBrowser](http://robobrowser.readthedocs.io/en/latest/readme.html). 
If you’re scraping multiple pages, I highly recommend using rvest in concert with [polite](https://dmi3kno.github.io/polite/). The polite package ensures that you’re respecting the [robots.txt](https://en.wikipedia.org/wiki/Robots_exclusion_standard) and not hammering the site with too many requests. ## Installation ``` r # The easiest way to get rvest is to install the whole tidyverse: install.packages("tidyverse") # Alternatively, install just rvest: install.packages("rvest") ``` ## Usage ``` r library(rvest) # Start by reading a HTML page with read_html(): starwars <- read_html("https://rvest.tidyverse.org/articles/starwars.html") # Then find elements that match a css selector or XPath expression # using html_elements(). In this example, each
<section> corresponds # to a different film films <- starwars %>% html_elements("section") films #> {xml_nodeset (7)} #> [1]

\nThe Phantom Menace\n

\n

\nReleased: 1999 ... #> [2]

\nAttack of the Clones\n

\n

\nReleased: 20 ... #> [3]

\nRevenge of the Sith\n

\n

\nReleased: 200 ... #> [4]

\nA New Hope\n

\n

\nReleased: 1977-05-25\n ... #> [5]

\nThe Empire Strikes Back\n

\n

\nReleased: ... #> [6]

\nReturn of the Jedi\n

\n

\nReleased: 1983 ... #> [7]

\nThe Force Awakens\n

\n

\nReleased: 2015- ... # Then use html_element() to extract one element per film. Here # the title is given by the text inside <h2>

title <- films %>% html_element("h2") %>% html_text2() title #> [1] "The Phantom Menace" "Attack of the Clones" #> [3] "Revenge of the Sith" "A New Hope" #> [5] "The Empire Strikes Back" "Return of the Jedi" #> [7] "The Force Awakens" # Or use html_attr() to get data out of attributes. html_attr() always # returns a string so we convert it to an integer using a readr function episode <- films %>% html_element("h2") %>% html_attr("data-id") %>% readr::parse_integer() episode #> [1] 1 2 3 4 5 6 7 ``` If the page contains tabular data you can convert it directly to a data frame with `html_table()`: ``` r html <- read_html("https://en.wikipedia.org/w/index.php?title=The_Lego_Movie&oldid=998422565") html %>% html_element(".tracklist") %>% html_table() #> # A tibble: 29 × 4 #> No. Title `Performer(s)` Length #> #> 1 1. "\"Everything Is Awesome\"" "Tegan and Sara featuring The Lonel… 2:43 #> 2 2. "\"Prologue\"" "" 2:28 #> 3 3. "\"Emmett's Morning\"" "" 2:00 #> 4 4. "\"Emmett Falls in Love\"" "" 1:11 #> 5 5. "\"Escape\"" "" 3:26 #> 6 6. "\"Into the Old West\"" "" 1:00 #> 7 7. "\"Wyldstyle Explains\"" "" 1:21 #> 8 8. "\"Emmett's Mind\"" "" 2:17 #> 9 9. "\"The Transformation\"" "" 1:46 #> 10 10. "\"Saloons and Wagons\"" "" 3:38 #> # ℹ 19 more rows ``` rvest/man/0000755000176200001440000000000014560241772012200 5ustar liggesusersrvest/man/html_text.Rd0000644000176200001440000000504613775651250014507 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/text.R \name{html_text} \alias{html_text} \alias{html_text2} \title{Get element text} \usage{ html_text(x, trim = FALSE) html_text2(x, preserve_nbsp = FALSE) } \arguments{ \item{x}{A document, node, or node set.} \item{trim}{If \code{TRUE} will trim leading and trailing spaces.} \item{preserve_nbsp}{Should non-breaking spaces be preserved? By default, \code{html_text2()} converts to ordinary spaces to ease further computation. 
When \code{preserve_nbsp} is \code{TRUE}, \verb{&nbsp;} will appear in strings as \code{"\\ua0"}. This often causes confusion because it prints the same way as \code{" "}.} } \value{ A character vector the same length as \code{x} } \description{ There are two ways to retrieve text from an element: \code{html_text()} and \code{html_text2()}. \code{html_text()} is a thin wrapper around \code{\link[xml2:xml_text]{xml2::xml_text()}} which returns just the raw underlying text. \code{html_text2()} simulates how text looks in a browser, using an approach inspired by JavaScript's \href{https://developer.mozilla.org/en-US/docs/Web/API/HTMLElement/innerText}{innerText()}. Roughly speaking, it converts \verb{<br />
} to \code{"\\n"}, adds blank lines around \verb{<p>

} tags, and lightly formats tabular data. \code{html_text2()} is usually what you want, but it is much slower than \code{html_text()} so for simple applications where performance is important you may want to use \code{html_text()} instead. } \examples{ # To understand the difference between html_text() and html_text2() # take the following html: html <- minimal_html( "

This is a paragraph. This another sentence.
This should start on a new line" ) # html_text() returns the raw underlying text, which includes whitespace # that would be ignored by a browser, and ignores the
html \%>\% html_element("p") \%>\% html_text() \%>\% writeLines() # html_text2() simulates what a browser would display. Non-significant # whitespace is collapsed, and
is turned into a line break html \%>\% html_element("p") \%>\% html_text2() \%>\% writeLines() # By default, html_text2() also converts non-breaking spaces to regular # spaces: html <- minimal_html("

x y

") x1 <- html \%>\% html_element("p") \%>\% html_text() x2 <- html \%>\% html_element("p") \%>\% html_text2() # When printed, non-breaking spaces look exactly like regular spaces x1 x2 # But aren't actually the same: x1 == x2 # Which you can confirm by looking at their underlying binary # representaion: charToRaw(x1) charToRaw(x2) } rvest/man/google_form.Rd0000644000176200001440000000047014101012310014736 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/utils.R \name{google_form} \alias{google_form} \title{Make link to google form given id} \usage{ google_form(x) } \arguments{ \item{x}{Unique identifier for form} } \description{ Make link to google form given id } \keyword{internal} rvest/man/html_form.Rd0000644000176200001440000000426114277722126014463 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/form.R \name{html_form} \alias{html_form} \alias{html_form_set} \alias{html_form_submit} \title{Parse forms and set values} \usage{ html_form(x, base_url = NULL) html_form_set(form, ...) html_form_submit(form, submit = NULL) } \arguments{ \item{x}{A document (from \code{\link[=read_html]{read_html()}}), node set (from \code{\link[=html_elements]{html_elements()}}), node (from \code{\link[=html_element]{html_element()}}), or session (from \code{\link[=session]{session()}}).} \item{base_url}{Base url of underlying HTML document. The default, \code{NULL}, uses the url of the HTML document underlying \code{x}.} \item{form}{A form} \item{...}{<\code{\link[rlang:dyn-dots]{dynamic-dots}}> Name-value pairs giving fields to modify. Provide a character vector to set multiple checkboxes in a set or select multiple values from a multi-select.} \item{submit}{Which button should be used to submit the form? \itemize{ \item \code{NULL}, the default, uses the first button. \item A string selects a button by its name. \item A number selects a button using its relative position. 
}} } \value{ \itemize{ \item \code{html_form()} returns as S3 object with class \code{rvest_form} when applied to a single element. It returns a list of \code{rvest_form} objects when applied to multiple elements or a document. \item \code{html_form_set()} returns an \code{rvest_form} object. \item \code{html_form_submit()} submits the form, returning an httr response which can be parsed with \code{\link[=read_html]{read_html()}}. } } \description{ Use \code{html_form()} to extract a form, set values with \code{html_form_set()}, and submit it with \code{html_form_submit()}. } \examples{ html <- read_html("http://www.google.com") search <- html_form(html)[[1]] search <- search \%>\% html_form_set(q = "My little pony", hl = "fr") # Or if you have a list of values, use !!! vals <- list(q = "web scraping", hl = "en") search <- search \%>\% html_form_set(!!!vals) # To submit and get result: \dontrun{ resp <- html_form_submit(search) read_html(resp) } } \seealso{ HTML 4.01 form specification: \url{https://www.w3.org/TR/html401/interact/forms.html} } rvest/man/rename.Rd0000644000176200001440000000325614014035320013724 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/rename.R \name{rename} \alias{set_values} \alias{submit_form} \alias{xml_tag} \alias{xml_node} \alias{xml_nodes} \alias{html_nodes} \alias{html_node} \alias{back} \alias{forward} \alias{jump_to} \alias{follow_link} \alias{html_session} \title{Functions renamed in rvest 1.0.0} \usage{ set_values(form, ...) submit_form(session, form, submit = NULL, ...) xml_tag(x) xml_node(...) xml_nodes(...) html_nodes(...) html_node(...) back(x) forward(x) jump_to(x, url, ...) follow_link(x, ...) html_session(url, ...) 
} \description{ \ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#deprecated}{\figure{lifecycle-deprecated.svg}{options: alt='[Deprecated]'}}}{\strong{[Deprecated]}} rvest 1.0.0 renamed a number of functions to ensure that every function has a common prefix, matching tidyverse conventions that emerged since rvest was first created. \itemize{ \item \code{set_values()} -> \code{html_form_set()} \item \code{submit_form()} -> \code{session_submit()} \item \code{xml_tag()} -> \code{html_name()} \item \code{xml_node()} & \code{html_node()} -> \code{html_element()} \item \code{xml_nodes()} & \code{html_nodes()} -> \code{html_elements()} } (\code{html_node()} and \code{html_nodes()} are only superseded because they're so widely used.) Additionally all session related functions gained a common prefix: \itemize{ \item \code{html_session()} -> \code{session()} \item \code{forward()} -> \code{session_forward()} \item \code{back()} -> \code{session_back()} \item \code{jump_to()} -> \code{session_jump_to()} \item \code{follow_link()} -> \code{session_follow_link()} } } \keyword{internal} rvest/man/read_html_live.Rd0000644000176200001440000000405714562443630015452 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/live.R \name{read_html_live} \alias{read_html_live} \title{Live web scraping (with chromote)} \usage{ read_html_live(url) } \arguments{ \item{url}{Website url to read from.} } \value{ \code{read_html_live()} returns an R6 \link{LiveHTML} object. You can interact with this object using the usual rvest functions, or call its methods, like \verb{$click()}, \verb{$scroll_to()}, and \verb{$type()} to interact with the live page like a human would. 
} \description{ \ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} \code{\link[=read_html]{read_html()}} operates on the HTML source code downloaded from the server. This works for most websites but can fail if the site uses javascript to generate the HTML. \code{read_html_live()} provides an alternative interface that runs a live web browser (Chrome) in the background. This allows you to access elements of the HTML page that are generated dynamically by javascript and to interact with the live page by clicking on buttons or typing in forms. Behind the scenes, this function uses the \href{https://rstudio.github.io/chromote/}{chromote} package, which requires that you have a copy of \href{https://www.google.com/chrome/}{Google Chrome} installed on your machine. } \examples{ \dontrun{ # When we retrieve the raw HTML for this site, it doesn't contain the # data we're interested in: static <- read_html("https://www.forbes.com/top-colleges/") static \%>\% html_elements(".TopColleges2023_tableRow__BYOSU") # Instead, we need to run the site in a real web browser, causing it to # download a JSON file and then dynamically generate the html: sess <- read_html_live("https://www.forbes.com/top-colleges/") sess$view() rows <- sess \%>\% html_elements(".TopColleges2023_tableRow__BYOSU") rows \%>\% html_element(".TopColleges2023_organizationName__J1lEV") \%>\% html_text() rows \%>\% html_element(".grant-aid") \%>\% html_text() } } rvest/man/minimal_html.Rd0000644000176200001440000000066313775437157015161 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/utils.R \name{minimal_html} \alias{minimal_html} \title{Create an HTML document from inline HTML} \usage{ minimal_html(html, title = "") } \arguments{ \item{html}{HTML contents of page.} \item{title}{Page title (required by HTML spec).} } \description{ Create an 
HTML document from inline HTML } \examples{ minimal_html("

test

") } \keyword{internal} rvest/man/html_name.Rd0000644000176200001440000000125613776122153014436 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/html.R \name{html_name} \alias{html_name} \title{Get element name} \usage{ html_name(x) } \arguments{ \item{x}{A document (from \code{\link[=read_html]{read_html()}}), node set (from \code{\link[=html_elements]{html_elements()}}), node (from \code{\link[=html_element]{html_element()}}), or session (from \code{\link[=session]{session()}}).} } \value{ A character vector the same length as \code{x} } \description{ Get element name } \examples{ url <- "https://rvest.tidyverse.org/articles/starwars.html" html <- read_html(url) html \%>\% html_element("div") \%>\% html_children() \%>\% html_name() } rvest/man/html_attr.Rd0000644000176200001440000000242314132341320014447 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/html.R \name{html_attr} \alias{html_attr} \alias{html_attrs} \title{Get element attributes} \usage{ html_attr(x, name, default = NA_character_) html_attrs(x) } \arguments{ \item{x}{A document (from \code{\link[=read_html]{read_html()}}), node set (from \code{\link[=html_elements]{html_elements()}}), node (from \code{\link[=html_element]{html_element()}}), or session (from \code{\link[=session]{session()}}).} \item{name}{Name of attribute to retrieve.} \item{default}{A string used as a default value when the attribute does not exist in every element.} } \value{ A character vector (for \code{html_attr()}) or list (\code{html_attrs()}) the same length as \code{x}. } \description{ \code{html_attr()} gets a single attribute; \code{html_attrs()} gets all attributes. 
} \examples{ html <- minimal_html('') html \%>\% html_elements("a") \%>\% html_attrs() html \%>\% html_elements("a") \%>\% html_attr("href") html \%>\% html_elements("li") \%>\% html_attr("class") html \%>\% html_elements("li") \%>\% html_attr("class", default = "inactive") } rvest/man/html_table.Rd0000644000176200001440000000466514007274024014605 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/table.R \name{html_table} \alias{html_table} \title{Parse an html table into a data frame} \usage{ html_table( x, header = NA, trim = TRUE, fill = deprecated(), dec = ".", na.strings = "NA", convert = TRUE ) } \arguments{ \item{x}{A document (from \code{\link[=read_html]{read_html()}}), node set (from \code{\link[=html_elements]{html_elements()}}), node (from \code{\link[=html_element]{html_element()}}), or session (from \code{\link[=session]{session()}}).} \item{header}{Use first row as header? If \code{NA}, will use first row if it consists of \verb{} tags. If \code{TRUE}, column names are left exactly as they are in the source document, which may require post-processing to generate a valid data frame.} \item{trim}{Remove leading and trailing whitespace within each cell?} \item{fill}{Deprecated - missing cells in tables are now always automatically filled with \code{NA}.} \item{dec}{The character used as decimal place marker.} \item{na.strings}{Character vector of values that will be converted to \code{NA} if \code{convert} is \code{TRUE}.} \item{convert}{If \code{TRUE}, will run \code{\link[=type.convert]{type.convert()}} to interpret texts as integer, double, or \code{NA}.} } \value{ When applied to a single element, \code{html_table()} returns a single tibble. When applied to multiple elements or a document, \code{html_table()} returns a list of tibbles. } \description{ The algorithm mimics what a browser does, but repeats the values of merged cells in every cell that cover. } \examples{ sample1 <- minimal_html("
Col ACol B
1x
4y
10z
") sample1 \%>\% html_element("table") \%>\% html_table() # Values in merged cells will be duplicated sample2 <- minimal_html("
ABC
123
45
67
") sample2 \%>\% html_element("table") \%>\% html_table() # If a row is missing cells, they'll be filled with NAs sample3 <- minimal_html("
ABC
12
3
4
") sample3 \%>\% html_element("table") \%>\% html_table() } rvest/man/html_element.Rd0000644000176200001440000000607214277722126015153 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/selectors.R \name{html_element} \alias{html_element} \alias{html_elements} \title{Select elements from an HTML document} \usage{ html_element(x, css, xpath) html_elements(x, css, xpath) } \arguments{ \item{x}{Either a document, a node set or a single node.} \item{css, xpath}{Elements to select. Supply one of \code{css} or \code{xpath} depending on whether you want to use a CSS selector or XPath 1.0 expression.} } \value{ \code{html_element()} returns a nodeset the same length as the input. \code{html_elements()} flattens the output so there's no direct way to map the output to the input. } \description{ \code{html_element()} and \code{html_elements()} find HTML element using CSS selectors or XPath expressions. CSS selectors are particularly useful in conjunction with \url{https://selectorgadget.com/}, which makes it very easy to discover the selector you need. } \section{CSS selector support}{ CSS selectors are translated to XPath selectors by the \pkg{selectr} package, which is a port of the python \pkg{cssselect} library, \url{https://pythonhosted.org/cssselect/}. It implements the majority of CSS3 selectors, as described in \url{https://www.w3.org/TR/2011/REC-css3-selectors-20110929/}. The exceptions are listed below: \itemize{ \item Pseudo selectors that require interactivity are ignored: \verb{:hover}, \verb{:active}, \verb{:focus}, \verb{:target}, \verb{:visited}. 
\item The following pseudo classes don't work with the wild card element, *: \verb{*:first-of-type}, \verb{*:last-of-type}, \verb{*:nth-of-type}, \verb{*:nth-last-of-type}, \verb{*:only-of-type} \item It supports \verb{:contains(text)} \item You can use !=, \verb{[foo!=bar]} is the same as \verb{:not([foo=bar])} \item \verb{:not()} accepts a sequence of simple selectors, not just a single simple selector. } } \examples{ html <- minimal_html("

This is a heading

This is a paragraph

This is an important paragraph

") html \%>\% html_element("h1") html \%>\% html_elements("p") html \%>\% html_elements(".important") html \%>\% html_elements("#first") # html_element() vs html_elements() -------------------------------------- html <- minimal_html("
  • C-3PO is a droid that weighs 167 kg
  • R2-D2 is a droid that weighs 96 kg
  • Yoda weighs 66 kg
  • R4-P17 is a droid
") li <- html \%>\% html_elements("li") # When applied to a node set, html_elements() returns all matching elements # beneath any of the inputs, flattening results into a new node set. li \%>\% html_elements("i") # When applied to a node set, html_element() always returns a vector the # same length as the input, using a "missing" element where needed. li \%>\% html_element("i") # and html_text() and html_attr() will return NA li \%>\% html_element("i") \%>\% html_text2() li \%>\% html_element("span") \%>\% html_attr("class") } rvest/man/repair_encoding.Rd0000644000176200001440000000123114014035320015574 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/encoding.R \name{repair_encoding} \alias{repair_encoding} \title{Repair faulty encoding} \usage{ repair_encoding(x, from = NULL) } \arguments{ \item{from}{The encoding that the string is actually in. If \code{NULL}, \code{guess_encoding} will be used.} } \description{ \ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#deprecated}{\figure{lifecycle-deprecated.svg}{options: alt='[Deprecated]'}}}{\strong{[Deprecated]}} This function has been deprecated because it doesn't work. Instead re-read the HTML file with correct \code{encoding} argument. } \keyword{internal} rvest/man/rvest-package.Rd0000644000176200001440000000144614554031036015221 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/rvest-package.R \docType{package} \name{rvest-package} \alias{rvest} \alias{rvest-package} \title{rvest: Easily Harvest (Scrape) Web Pages} \description{ \if{html}{\figure{logo.png}{options: style='float: right' alt='logo' width='120'}} Wrappers around the 'xml2' and 'httr' packages to make it easy to download, then manipulate, HTML and XML. 
} \seealso{ Useful links: \itemize{ \item \url{https://rvest.tidyverse.org/} \item \url{https://github.com/tidyverse/rvest} \item Report bugs at \url{https://github.com/tidyverse/rvest/issues} } } \author{ \strong{Maintainer}: Hadley Wickham \email{hadley@posit.co} Other contributors: \itemize{ \item Posit Software, PBC [copyright holder, funder] } } \keyword{internal} rvest/man/figures/0000755000176200001440000000000014554031036013635 5ustar liggesusersrvest/man/figures/lifecycle-defunct.svg0000644000176200001440000000242414554031036017745 0ustar liggesusers lifecycle: defunct lifecycle defunct rvest/man/figures/lifecycle-maturing.svg0000644000176200001440000000243014554031036020140 0ustar liggesusers lifecycle: maturing lifecycle maturing rvest/man/figures/logo.png0000644000176200001440000007611214554031036015312 0ustar liggesusersPNG  IHDRޫh cHRMz&u0`:pQ<bKGDtIME .߰h{IDATxwyNg6GE$H"DIlYmْϾ};ߝ} ,%1Q$H"9lΓCQݳK6a35==}qKW?gN\Sc . 1w`m_^ X':/a n _%GP$"@ȗ2DQCVවP.~ |P:/U | Ew |EQĮCf˫_~CD4P'%6~:os;+xpN䥍:0jko~ 0-S}@$^ʨx MN H 5 N䥈:fĹˀ_/)։tP'A yOg;rB u/ |QC\ ^0FS4 y\Dd?^'EF 3-ǁDe/q$a 4DL 'IJӞ7KL_NE v?| ;+$CF 㺌s d*?u"/.^DԐ7 UEu⚺N[`6CѶ9u"NyF y(o7 F@\CiL&j|,9ǯh d3ss⧀R-.'EI=\s!;"/&LD6D7wJp}lqQ'1]aJ91ƭ]$Am4Q >fȖAl u3;/2!à#=2Jx: r 岔33%[։<; < ̐݁s,e~8WhFL$\0 ,<ܗaT||?uQ'`ս%.ۤBaIR0O`IJI\?!]*G|E $%/u"E̯3%Ø.F2V;X@.C~~2{- ։|n,YQCS2/ t8W9Dz01Qq%>ދ2P'񹰴gED q d~of>d~]$q+bYŸ=J\)ˬ0%[|luQ' p7d~f>d~VDpDŹYT1,SJ&KEs2S9T-*s ԉ<ܙw n;-JVv`GLXhS|.~gq?e%v0FY_+G_ܴ'hDЃ\ ]=)/eV*2%[<|Eͫm~ 0iư._9ȶ2\3WYad+WL!{ jY&*4Z"Q: Rs|B{N ȧP]A(e?frGήPKOU^D 6Dv̪Oih%_qQ{*p`>Oy~2@XxYͽ2ۙ9+[r2m31V*Ηl!dH|Yxս)_j6MY!bqRД̏-!Lc-%J\ʼef-e$[ vb܏+f;Ϥ3%U2(vvgo$%7/gnfJx8W:1Y_Zyʥ-ǔlсKė,au71%kxv90MbqCRieStdD>Oƞ leKg}UfcVe~A{$Fk$ֱtQv/|!ۉ> >,Hi/l- yRIiSތ纨D%1ߧ(fAsxvci3liᥔdlbJydF%"e.:g+Q;"512?ä39^f}5?;>Qg<וrZFddkq?43ƨYU%1u k"jo!ĹsVx@1On~28\l"/ܮ!o)U=-!۶RMl͏V+ y՟+,1#=4jT3)=\Uwצz# xO&@L!х ѯW f >\!Pu>{14bPHy~2-NxQx qMd~3WhDm[<KX.[z5TI%i\gcT`pSjҮ0P,q˨yg3mLVu:ZCR%nJ>Y KHӪqUZ< Fug !O) ,(B ce:8$Lj5?wKΖQWH-%JkyzZ#rKEw0Ŕ_^b`Hq~2-gKY3d~A9Z]ShY 
fOr*X.Ǐ_=dU6}B>X |k7N{굦oZv="Ժ*wiK,P$hO*@R;jTsutPbGy8a4U}%ϝ[ffQt3d~EgILpD^:Tmq&>4'ZDt!tC7 8|,nToGJCiqlš鿷DI:yWA4k2,+լ:a^ OFIL&k#NE@aY7VoaC *&A,dm?/QsD4J蠨b|/Am|qLY]}fGxB# 3ڊho\"؎CO烙NzLk</IxX`@ټSp`dtnbk0Cފ(yN&H`g3xކ*r]*i(pC Cd*#!h*>#ax|_*bW)h{08@S#%C:꺎ՉS*줒NS5~-UrhmfA lE,&Leb϶1T!^u1krO8W*c"e5AhhN7cuښW v ly4_h,Nu(wbؙ,F$ 'k+.Mv2džeбV]$xCMz\hr3!WW tTQO^sѺ:ȥy>Md)c&N@zrЩHI98<2|5/PII yʮKw.ea4$7 } 0LΆZN`N1>>X&8wЉ~y#].5~!AwG_B);kڲR:$׮!޳4A54l aK,"zNaD(KZ) ikiV @*A%<>AidØZHgB 75RdHCU`Ҵ};L=VIK-/ϜLiX7򵿪e?4 s ߸X%tq^1sBs^+=A%B.Hf9Rb e&X]]1uTOF,F!Eale>ruLzu4c(V|_-wNEz;ډ\[.Ul؎C-7abhIa Og(#LjDȼ;E߲ [J (qk opȺuiܲt]eǥq5Dք  zf'/I`U(:AP_wg×! {Aq@ac #O>t]X R!f ͷނfYW82Jg%pu1R)]HO2o?;wPC)JN)vu;uҦN9E3MrM׉*o<1AitH{ap:HR8DWU*4_s5nDqxGW^BFkوגW\.K%#ž46/ Ahu8a)Gl2r'N_+<1Acx &(8A1-DZ< ;wٲتdJ:/}]$y3>G+(O)) a^&f5aB;h+WjlpKjIit[6#^dBvΙ 5?^nyYA3և,qLni@JE 4uۭpD3Mb]*Ҍ 3'v+W"]=aA˗Q觜a&he(tx .vj*0< FΏդV ʔ_P[~ƂmĺKr*X8nODA3 sf"AbJ Sgr+W 4p[ša:Fmsll?S$׭%j27&?[e5 q\3#sq&kגEmƈH=FԲnʕ bʣcuPsCއ,p8bpnRn|n>q 'd_~B& ى5#DtMCիyYo٩K 8R׶ya).t)wz dP5*Laa L<چs2$B@筷PGBWjkE:N>O S)Ȟ:Eerm[߷;8p?o};nGGO 2$Jϛ&ʿXMKIs<>A#m3Y&80Hqd4 ɃX :(„ttPGhڲm[< MyjgC8.~WQE_2Ç1QXPc#7@ercT&Ә8-;wPRzcۦqb˺ɟ9C “<+ȟWkIYMN-y{Ivw#.+ 6Ed"pQ;zccD;0"|ƭ[VwN.,hhoklՃlh}eCDIpRUNAK_yvy˖1o#G 56ZxN:[GL- 3 q#><]kux<=-\f.xl4 죪)vꗱmDY0R8Dd2CI>LǍ7h ;Ked2)aVQb՝'r%F,"@47QD3 NniD~P, $V`cЭ~dHqhuTYҶk)1IbS%f5QJLjd8uȉEy_2_K0åKX\/bx = 3[[(nE (^eTi 4m݂ زe~נ:%! 
@,"팽Ń ӽUi[PCš!N~;d9L==@b]]Z _:ߦ Mݹ'00ͥ%]lwU*qD Fl4[yYηqMH!w nhT5ědŗ01"mm ]'v*L?@2T&&I^EylLwM&"nmۤ֬Q b1J4lXOcȳϣB`rhMxV*J6Ͻ@۞nDilՄ:;i&8$׬slhEl2"--8P@F 77Ӽm+$VL(qNg <1I-W@ 0wt&FM4o{udK3S:v ŁAd]T+/C[˒Pc­ n7#&_xQc/%zB!c) jji6iXXm-38P=+媠T-_F1&pa b'MkAnM6ni!Ae2M @6E%})g-T ?:Pٷ͛&`rTR5ʶ=:n$V {$A\ f,kW(`&NBRa|>z>~q[$юv*,V&jj-oBR1QGBo {aBG!4S1D;V.; |> ٔU"W,Ç= P1td'GxSX (f"Ch$Fw(7jj4:J]9wl'7<˱)OLmo%׏@joڝ1c1&'Eܱun4Ӥ[q9ZI F <<*?Úu( |?`ŁCQ 4N*P?;R"$zHB_ѣd=H $νsQV3аi#Gqq``AՔưR)\y\r*_(l |L[~zH'u]ڼ_n+?~lKG\zqkLYaqVXf$7mdWȝ>M]4InDhz;q- )L#֊g۸ 7X!CitTQ$4} yz\Fh1aG 6.H284NH$uk&4F,W]uNC`!T$CHogUOϫ^Ưگٴgʴ^KUP.t IPct;#38HdՄ[Z(1v_G%GLjm@>!二}mZ* #>GgO'{wQ)-_a%HS}Bmo"stOB%EL=RV|q>% z#8^w_u;BRiYX óm6c{ҲRסB" sg'nıh+ih sڶPe'O2ȣhun-xmsꤝH`9v_z5?;98//IAP ߯WI­-﹎J,Qkɞ:R"ʭ-hP FW3>ʓ==DtL$иf5iqJ%x[~I(lPSN.GilJ<#b9vkN}'Obhz..yv^P'@-Bg0_T*|͊od!vjpG;+qK%_zULo| ôhܼԺ<//>ZNLԴFyt|_6o؃Br&u4#GU#T ?4?(Ltc9z ʹpE:nɓ2Y G#4/;uo`[L$C/^P'Q*}0mo+š!MjF~L2#KE&d}!v_KL!5+hs³mi*4[GImMc՝g{ߧynikegAM7r|tܤ]ɞ:czxt8>N {xGe! MJ]Gs] 6q?2&H˵<8v:LjlaYx0f6W'E@D@btz4wKD C#|_Zw 0D3YmUc=r#uJb/cD`1zT"li}z$L| 4Ng0[Z(io'~Ap+cO>MٓDt 29I|rFFFh~F#H)1*6n,ROqs@~EfK5]; !x#|?wuW,J&pK%Z "md K ^cy"Ң;-;vӃ<xŗGO?}L_?0lXw7ɞ@ÔiL(z$B+y!P< p躵6"mm$lF>#G|rẖ4_OU۰]M$?JIm@ix {#arg)qS)k8;^v`+G/} qLqSA3tz0ypݵhyĻl F$B]Hb\R R=u U] } 788Dj:|ѣtr32HgpSӐ2#}8v.5lɣGѻ?AqxUd'޳3'w$TlB{HY>tʏ$T(@8V.6^< ˚_{:AsӞ]y>0FFt8r-WYa!!p2LhwSLI]MIt$\(VtsV'40!}:WK\!Bp>ѤD/bQWWY0 %q:, Z PɫUXlY32:ʡar[ EIx0Nb˺;NnxR8\M>I|3$ׯC@Zw`ɧ(4j.TO+)w$F+qj9LOWm[7/qNwD;w`߾DO[k*D4Եj#i=f[C`!X;@4RGk:L3#Յq{I+V@8"h`79-\Q;hi/Dd2@89RӐvV{HǷA& r50}˱}ӍCh"B dhmi8mfɜ=1]GFu$MHsv>32ck\W1H=JҶg76&ﹽHۦz +@P[D"RJ'u$OP\˪L` ALX9VIڹIh?3g]rOSx9M(\A ITU$C0 ŋǐ%5{ۯo;O= e_/2E Ltp,S}kl&“kHTDU? ){i;W]N#l4"ϲX`۶n!ku,^@XBpa92bIZ[[XêU- f ) , 4tA\eMSV4a"GY}ó)K? r5jX,AF<瑦F>FSSyf.fp jObi"&:b+k*p -U! 
zB8PX U%MO,4\'\?cS|9r0T~/:t'$InJccGAuv﹞Cql 2bj}I M=#O@iЎ=˩n edF$Ў D*㪊.]'xn2>g-[Ceϳ{dfC  |x7]D"A.#А-?!A:x޵*rIXMCh~ؓx@; VO@R-Jee{d3<_.;'l۶mg|G~ng/ ^@MjةCz0xU0 ۶عs'B//92~nj`@\FHORJ]Ws_%`~aGy:T;M)D(DR[Bһ7nF?oHcߋ:h[/QfBPй;8uoz_5Ր+%-%/f@ErRGyïg>yDU+P0@8RӫBz9'WMX3\1IZڦur|/UϿȍuC3_hwV$ :4:tc׮]4778~Bh$q\aeOw0oy׻moed*JOt6G՞JR"ކFGQT1 r#o_ry6oLcS4V*W~w$|Uޙxd#Bѓ4toN(b۔KeI54L&u==޳l6=}wuUFxs8P*2 S娜>-%E#n C B&a l߱߼J?o~e4Mե Sˠt^5OߡӧOsi ` R)<Zi:u{fijjbU,_Ʀ&tàX,bW*\!3<4/LOOmƶh,J84r?/}/?ǟz? 8+%*Fn'k(=rqXr%N"aW*ib+{zHRxGzragxxqb455111@ww7]]h~?xuCJyA2|s}[?|ChWBNKm)g.>sG$!aBs)Xlh,FOOOu'ri* S,TiPň~YqaSO?×ؖ/paP~1?~ Mυt,/Bnbe _/ O=G?lܰv,bppt:8tttǫ{366F"@5* B4B!LDJIX"˒N HE,gtdo~k>8?94ʺe0<᫠BAvӎΣi7~w~ü˲A0TJ $lCױ |q4P('= +{Vi\8qo[pͬ?Ԛ^mWA/!xB CVjYS]! d(^*M?Wod ˅44Vo;?vcin>LChljap, rK Y`@Ž3M{Tl,jn >e;=+059߯C]NxAHscSr~ΫƜ\1g{ϳ|ϸ.uqqT1:XtHvMbQUGd>OBu,*}ɟ MЛ(??>KuױB DozL4gz!G^5/[|dOLmk_M|ᬯ}Lau+YLMs\ezͮ |H)UYLMJCŰ֭\Ճ?@!1KU sU|aY4՛4c.[ю"=or/_PrBGq.dx,Nn&0|#J1cbYPUf6JA炔RCCGh[o~h$BӘ>EwѢp3&$8'<7+j}duR&J:k"w~Ʋ.hTIƝLS>xHp'&_"7;FoZ-Pۗ vpYS(>a$;YMYI~ W™}췾οi|`wlhk@Dp 3w>4%BqGF0Zozڸ7M5! wq/cS㿙O^Eoi@Chi~B@(ކF(mE}ȳ8p#~1! 
#ԌLdbo_^k|k~N-'zg*Nm#˥8/{=xk$wJ {z];pg(_uQbXNoFBTGJH-Vin^޹ U=OqŒrái!?;6 : u"k];MZO4z^:^S,P#4/T4oWtr=𮝘˺k!{ a =?cb6l&{OZC]o'b6_$`J)hߚF^/_ { hFk+[|֦0^F!r=Pm݌Yd)}Qg\z+@ϝ:kIr>{@+T3$W_v5-DXofS)>j QoghCd2-#n8I'(=<ͿTl޹G>_~zut+O/~xsD\G}~'A_'oPA_73b𓿋;Zkft |DzSS)bo}BӰ{/N[,s񷼩E|"ٔ?7˺ 9pG` y;"ԌL8t_zG;M8BBx{~T@jzUd5mzl)q'&n5.~!ғzn6ga-YQe0"\?}o ]u&y4P9qrBDMxbtvT,{ϐ~pYehu'ǍNR!, ciO_S$4zC}sЗA<8;>P,(=@{adzghCL;lmLh {íS@yK*yֺ5DjJtFYr<#p)}}&?LU/ћHmtݧije][EY>|;Z(FYZ햪EGQSiMP$XӴ׍ݽR BUqȟ "zkxڰퟖEc~Q'bB(>Y 1w4bwVL1xiT!aDoI6oRZ>*QzRӎmY^j﹏NlzCù~v:u/2ԲL=O҆wnhk#z U%$wAWቧ|"F&sIVe- xU] أj_4 z&Z"d{~~9K_K$CjkĪ;v沖,U{U؀X'N!pz(r@꫅K?O7E#J 75"LNyS ]‹TBx5JˇfnX?B2>F[ޢ_9#ǰOC)0I!%mڸ7c?Ay'p'ٳ6nH{?j|Д>r-֪ST^9N]OاN_QmuKH*RbAup'& ERJ?;:z^j'i8[\ȵ;9V7E%b^DShFB7sʑ_Z33B2/g\mDUݝLE"}l\)[JJ=i$G˨p}Ig\Pus=bx{*c3)}{{} NT|jus '~ϟ$!~Lߟ\DCsK}WJƤa2X0k];<03cHzfA0_ȋH H:h@]D^Ic Dciٟt%m9<)w#Y>+;S~kV+RIIXs׼d!^:GK1t;'|]Tp, ;Z[Saje)gϲߵ <{ j>&pNܿD]h4Չ\9Hɸ8e`,8XmW "}~OA^AH^qW3 )< Qɩ_.j Dgs@e6.4uk\E,֔?:se^SGz"G|=h9TSYX*YĹ幻 | …7}[|UTE_`) Bi6E yax1_~ hбf;Fq0Y bpp 0ǫ!܄rߊZ Sv[:) >T` 0\!rx/I`lƪ-elEyxeqL7p`[QPPkzYfs87̃o,𿁓&/,ܟA䫀>M\kU25b9By,Q/q,! 
܎ D6k2Cuk\yboXz2Gx `a`Q{ SQūf3V]XDž2?U82WUW] E3L.t8YSYP;Γo)ߑb7Eg-~xs-6!h`:y R282BOE'n>kF- zY="8o$ %[u([ e+3e~q^d~Qy Kj.E(bۅU[H) `Jw 8`I=L#:LbĕU=,d~%=g߁"AZ/˼1S7ǹP~s-ZBвe3e~#cd2?ߧQ2"ԉ;\q5d lx5e3>Y862(ߗe~:DaJycMYvZl"K)r0G(S2:q: 3d-.t]ϔ I.+DXxqo.YF%&[, 8lP]f3^lC4Γl2?<9Ź'P2s u/ fyS klSh$,sd~_Ec\2B$[\@}Llw1P'"bl]("fŰC2ͯ.[d 8lP:az.^˔LpNE 3)bㅎu.b v9fuL]6\"cldօU+[ 2}%+Cu/EI`lƪs2_ŗՉ4P'B2FK)[%Ϡv(^˜Oy.P'%"70G.Q'%mq.7 |aw[,2%bvP'%7m1]e~41.7 |`נj?tP=p xb>0Cx+(⟡v.3Y3atEXtSoftwareAdobe ImageReadyqe<IENDB`rvest/man/figures/lifecycle-archived.svg0000644000176200001440000000243014554031036020077 0ustar liggesusers lifecycle: archived lifecycle archived rvest/man/figures/lifecycle-soft-deprecated.svg0000644000176200001440000000246614554031036021374 0ustar liggesusers lifecycle: soft-deprecated lifecycle soft-deprecated rvest/man/figures/lifecycle-questioning.svg0000644000176200001440000000244414554031036020664 0ustar liggesusers lifecycle: questioning lifecycle questioning rvest/man/figures/lifecycle-superseded.svg0000644000176200001440000000244014554031036020456 0ustar liggesusers lifecycle: superseded lifecycle superseded rvest/man/figures/lifecycle-stable.svg0000644000176200001440000000247214554031036017572 0ustar liggesusers lifecycle: stable lifecycle stable rvest/man/figures/lifecycle-experimental.svg0000644000176200001440000000245014554031036021011 0ustar liggesusers lifecycle: experimental lifecycle experimental rvest/man/figures/lifecycle-deprecated.svg0000644000176200001440000000244014554031036020413 0ustar liggesusers lifecycle: deprecated lifecycle deprecated rvest/man/reexports.Rd0000644000176200001440000000073514557172067014535 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/rvest-package.R \docType{import} \name{reexports} \alias{reexports} \alias{url_absolute} \alias{\%>\%} \title{Objects exported from other packages} \keyword{internal} \description{ These objects are imported from other packages. 
Follow the links below to see their documentation. \describe{ \item{magrittr}{\code{\link[magrittr:pipe]{\%>\%}}} \item{xml2}{\code{\link[xml2]{url_absolute}}} }} rvest/man/LiveHTML.Rd0000644000176200001440000002045414562443630014057 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/live.R \name{LiveHTML} \alias{LiveHTML} \title{Interact with a live web page} \description{ \ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} You construct an LiveHTML object with \code{\link[=read_html_live]{read_html_live()}} and then interact, like you're a human, using the methods described below. When debugging a scraping script it is particularly useful to use \verb{$view()}, which will open a live preview of the site, and you can actually see each of the operations performed on the real site. rvest provides relatively simple methods for scrolling, typing, and clicking. For richer interaction, you probably want to use a package that exposes a more powerful user interface, like \href{https://ashbythorpe.github.io/selenider/}{selendir}. } \examples{ \dontrun{ # To retrieve data for this paginated site, we need to repeatedly push # the "Load More" button sess <- read_html_live("https://www.bodybuilding.com/exercises/finder") sess$view() sess \%>\% html_elements(".ExResult-row") \%>\% length() sess$click(".ExLoadMore-btn") sess \%>\% html_elements(".ExResult-row") \%>\% length() sess$click(".ExLoadMore-btn") sess \%>\% html_elements(".ExResult-row") \%>\% length() } } \section{Public fields}{ \if{html}{\out{
}} \describe{ \item{\code{session}}{Underlying chromote session object. For expert use only.} } \if{html}{\out{
}} } \section{Methods}{ \subsection{Public methods}{ \itemize{ \item \href{#method-LiveHTML-new}{\code{LiveHTML$new()}} \item \href{#method-LiveHTML-print}{\code{LiveHTML$print()}} \item \href{#method-LiveHTML-view}{\code{LiveHTML$view()}} \item \href{#method-LiveHTML-html_elements}{\code{LiveHTML$html_elements()}} \item \href{#method-LiveHTML-click}{\code{LiveHTML$click()}} \item \href{#method-LiveHTML-get_scroll_position}{\code{LiveHTML$get_scroll_position()}} \item \href{#method-LiveHTML-scroll_into_view}{\code{LiveHTML$scroll_into_view()}} \item \href{#method-LiveHTML-scroll_to}{\code{LiveHTML$scroll_to()}} \item \href{#method-LiveHTML-scroll_by}{\code{LiveHTML$scroll_by()}} \item \href{#method-LiveHTML-type}{\code{LiveHTML$type()}} \item \href{#method-LiveHTML-press}{\code{LiveHTML$press()}} \item \href{#method-LiveHTML-clone}{\code{LiveHTML$clone()}} } } \if{html}{\out{
}} \if{html}{\out{}} \if{latex}{\out{\hypertarget{method-LiveHTML-new}{}}} \subsection{Method \code{new()}}{ initialize the object \subsection{Usage}{ \if{html}{\out{
}}\preformatted{LiveHTML$new(url)}\if{html}{\out{
}} } \subsection{Arguments}{ \if{html}{\out{
}} \describe{ \item{\code{url}}{URL to page.} } \if{html}{\out{
}} } } \if{html}{\out{
}} \if{html}{\out{}} \if{latex}{\out{\hypertarget{method-LiveHTML-print}{}}} \subsection{Method \code{print()}}{ Called when \code{print()}ed \subsection{Usage}{ \if{html}{\out{
}}\preformatted{LiveHTML$print(...)}\if{html}{\out{
}} } \subsection{Arguments}{ \if{html}{\out{
}} \describe{ \item{\code{...}}{Ignored} } \if{html}{\out{
}} } } \if{html}{\out{
}} \if{html}{\out{}} \if{latex}{\out{\hypertarget{method-LiveHTML-view}{}}} \subsection{Method \code{view()}}{ Display a live view of the site \subsection{Usage}{ \if{html}{\out{
}}\preformatted{LiveHTML$view()}\if{html}{\out{
}} } } \if{html}{\out{
}} \if{html}{\out{}} \if{latex}{\out{\hypertarget{method-LiveHTML-html_elements}{}}} \subsection{Method \code{html_elements()}}{ Extract HTML elements from the current page. \subsection{Usage}{ \if{html}{\out{
}}\preformatted{LiveHTML$html_elements(css, xpath)}\if{html}{\out{
}} } \subsection{Arguments}{ \if{html}{\out{
}} \describe{ \item{\code{css, xpath}}{CSS selector or xpath expression.} } \if{html}{\out{
}} } } \if{html}{\out{
}} \if{html}{\out{}} \if{latex}{\out{\hypertarget{method-LiveHTML-click}{}}} \subsection{Method \code{click()}}{ Simulate a click on an HTML element. \subsection{Usage}{ \if{html}{\out{
}}\preformatted{LiveHTML$click(css, n_clicks = 1)}\if{html}{\out{
}} } \subsection{Arguments}{ \if{html}{\out{
}} \describe{ \item{\code{css}}{CSS selector or xpath expression.} \item{\code{n_clicks}}{Number of clicks} } \if{html}{\out{
}} } } \if{html}{\out{
}} \if{html}{\out{}} \if{latex}{\out{\hypertarget{method-LiveHTML-get_scroll_position}{}}} \subsection{Method \code{get_scroll_position()}}{ Get the current scroll position. \subsection{Usage}{ \if{html}{\out{
}}\preformatted{LiveHTML$get_scroll_position()}\if{html}{\out{
}} } } \if{html}{\out{
}} \if{html}{\out{}} \if{latex}{\out{\hypertarget{method-LiveHTML-scroll_into_view}{}}} \subsection{Method \code{scroll_into_view()}}{ Scroll selected element into view. \subsection{Usage}{ \if{html}{\out{
}}\preformatted{LiveHTML$scroll_into_view(css)}\if{html}{\out{
}} } \subsection{Arguments}{ \if{html}{\out{
}} \describe{ \item{\code{css}}{CSS selector or xpath expression.} } \if{html}{\out{
}} } } \if{html}{\out{
}} \if{html}{\out{}} \if{latex}{\out{\hypertarget{method-LiveHTML-scroll_to}{}}} \subsection{Method \code{scroll_to()}}{ Scroll to specified location \subsection{Usage}{ \if{html}{\out{
}}\preformatted{LiveHTML$scroll_to(top = 0, left = 0)}\if{html}{\out{
}} } \subsection{Arguments}{ \if{html}{\out{
}} \describe{ \item{\code{top, left}}{Number of pixels from top/left respectively.} } \if{html}{\out{
}} } } \if{html}{\out{
}} \if{html}{\out{}} \if{latex}{\out{\hypertarget{method-LiveHTML-scroll_by}{}}} \subsection{Method \code{scroll_by()}}{ Scroll by the specified amount \subsection{Usage}{ \if{html}{\out{
}}\preformatted{LiveHTML$scroll_by(top = 0, left = 0)}\if{html}{\out{
}} } \subsection{Arguments}{ \if{html}{\out{
}} \describe{ \item{\code{top, left}}{Number of pixels to scroll up/down and left/right respectively.} } \if{html}{\out{
}} } } \if{html}{\out{
}} \if{html}{\out{}} \if{latex}{\out{\hypertarget{method-LiveHTML-type}{}}} \subsection{Method \code{type()}}{ Type text in the selected element \subsection{Usage}{ \if{html}{\out{
}}\preformatted{LiveHTML$type(css, text)}\if{html}{\out{
}} } \subsection{Arguments}{ \if{html}{\out{
}} \describe{ \item{\code{css}}{CSS selector or xpath expression.} \item{\code{text}}{A single string containing the text to type.} } \if{html}{\out{
}} } } \if{html}{\out{
}} \if{html}{\out{}} \if{latex}{\out{\hypertarget{method-LiveHTML-press}{}}} \subsection{Method \code{press()}}{ Simulate pressing a single key (including special keys). \subsection{Usage}{ \if{html}{\out{
}}\preformatted{LiveHTML$press(css, key_code, modifiers = character())}\if{html}{\out{
}} } \subsection{Arguments}{ \if{html}{\out{
}} \describe{ \item{\code{css}}{CSS selector or xpath expression. Set to \code{NULL}} \item{\code{key_code}}{Name of key. You can see a complete list of known keys at \url{https://pptr.dev/api/puppeteer.keyinput/}.} \item{\code{modifiers}}{A character vector of modifiers. Must be one or more of \code{"Shift"}, \code{"Control"}, \code{"Alt"}, or \code{"Meta"}.} } \if{html}{\out{
}} } } \if{html}{\out{
}} \if{html}{\out{}} \if{latex}{\out{\hypertarget{method-LiveHTML-clone}{}}} \subsection{Method \code{clone()}}{ The objects of this class are cloneable with this method. \subsection{Usage}{ \if{html}{\out{
}}\preformatted{LiveHTML$clone(deep = FALSE)}\if{html}{\out{
}} } \subsection{Arguments}{ \if{html}{\out{
}} \describe{ \item{\code{deep}}{Whether to make a deep clone.} } \if{html}{\out{
}} } } } rvest/man/read_html.Rd0000644000176200001440000000654214557172067014443 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/rvest-package.R \name{read_html} \alias{read_html} \title{Static web scraping (with xml2)} \usage{ read_html(x, encoding = "", ..., options = c("RECOVER", "NOERROR", "NOBLANKS")) } \arguments{ \item{x}{Usually a string representing a URL. See \code{\link[xml2:read_xml]{xml2::read_html()}} for other options.} \item{encoding}{Specify a default encoding for the document. Unless otherwise specified XML documents are assumed to be in UTF-8 or UTF-16. If the document is not UTF-8/16, and lacks an explicit encoding directive, this allows you to supply a default.} \item{...}{Additional arguments passed on to methods.} \item{options}{Set parsing options for the libxml2 parser. Zero or more of \describe{ \item{RECOVER}{recover on errors} \item{NOENT}{substitute entities} \item{DTDLOAD}{load the external subset} \item{DTDATTR}{default DTD attributes} \item{DTDVALID}{validate with the DTD} \item{NOERROR}{suppress error reports} \item{NOWARNING}{suppress warning reports} \item{PEDANTIC}{pedantic error reporting} \item{NOBLANKS}{remove blank nodes} \item{SAX1}{use the SAX1 interface internally} \item{XINCLUDE}{Implement XInclude substitition} \item{NONET}{Forbid network access} \item{NODICT}{Do not reuse the context dictionary} \item{NSCLEAN}{remove redundant namespaces declarations} \item{NOCDATA}{merge CDATA as text nodes} \item{NOXINCNODE}{do not generate XINCLUDE START/END nodes} \item{COMPACT}{compact small text nodes; no modification of the tree allowed afterwards (will possibly crash if you try to modify the tree)} \item{OLD10}{parse using XML-1.0 before update 5} \item{NOBASEFIX}{do not fixup XINCLUDE xml:base uris} \item{HUGE}{relax any hardcoded limit from the parser} \item{OLDSAX}{parse using SAX2 interface before 2.7.0} \item{IGNORE_ENC}{ignore internal document encoding hint} 
\item{BIG_LINES}{Store big lines numbers in text PSVI field} }} } \description{ \code{\link[=read_html]{read_html()}} works by performing a HTTP request then parsing the HTML received using the xml2 package. This is "static" scraping because it operates only on the raw HTML file. While this works for most sites, in some cases you will need to use \code{\link[=read_html_live]{read_html_live()}} if the parts of the page you want to scrape are dynamically generated with javascript. Generally, we recommend using \code{read_html()} if it works, as it will be faster and more robust, as it has fewer external dependencies (i.e. it doesn't rely on the Chrome web browser installed on your computer.) } \examples{ # Start by reading a HTML page with read_html(): starwars <- read_html("https://rvest.tidyverse.org/articles/starwars.html") # Then find elements that match a css selector or XPath expression # using html_elements(). In this example, each
corresponds # to a different film films <- starwars \%>\% html_elements("section") films # Then use html_element() to extract one element per film. Here # the title is given by the text inside

title <- films \%>\% html_element("h2") \%>\% html_text2() title # Or use html_attr() to get data out of attributes. html_attr() always # returns a string so we convert it to an integer using a readr function episode <- films \%>\% html_element("h2") \%>\% html_attr("data-id") \%>\% readr::parse_integer() episode } rvest/man/session.Rd0000644000176200001440000000473714277721661014172 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/session.R \name{session} \alias{session} \alias{is.session} \alias{session_jump_to} \alias{session_follow_link} \alias{session_back} \alias{session_forward} \alias{session_history} \alias{session_submit} \title{Simulate a session in web browser} \usage{ session(url, ...) is.session(x) session_jump_to(x, url, ...) session_follow_link(x, i, css, xpath, ...) session_back(x) session_forward(x) session_history(x) session_submit(x, form, submit = NULL, ...) } \arguments{ \item{url}{A URL, either relative or absolute, to navigate to.} \item{...}{Any additional httr config to use throughout the session.} \item{x}{A session.} \item{i}{A integer to select the ith link or a string to match the first link containing that text (case sensitive).} \item{css, xpath}{Elements to select. Supply one of \code{css} or \code{xpath} depending on whether you want to use a CSS selector or XPath 1.0 expression.} \item{form}{An \link{html_form} to submit} \item{submit}{Which button should be used to submit the form? \itemize{ \item \code{NULL}, the default, uses the first button. \item A string selects a button by its name. \item A number selects a button using its relative position. }} } \description{ This set of functions allows you to simulate a user interacting with a website, using forms and navigating from page to page. \itemize{ \item Create a session with \code{session(url)} \item Navigate to a specified url with \code{session_jump_to()}, or follow a link on the page with \code{session_follow_link()}. 
\item Submit an \link{html_form} with \code{session_submit()}. \item View the history with \code{session_history()} and navigate back and forward with \code{session_back()} and \code{session_forward()}. \item Extract page contents with \code{\link[=html_element]{html_element()}} and \code{\link[=html_elements]{html_elements()}}, or get the complete HTML document with \code{\link[=read_html]{read_html()}}. \item Inspect the HTTP response with \code{\link[httr:cookies]{httr::cookies()}}, \code{\link[httr:headers]{httr::headers()}}, and \code{\link[httr:status_code]{httr::status_code()}}. } } \examples{ s <- session("http://hadley.nz") s \%>\% session_jump_to("hadley-wickham.jpg") \%>\% session_jump_to("/") \%>\% session_history() s \%>\% session_jump_to("hadley-wickham.jpg") \%>\% session_back() \%>\% session_history() \donttest{ s \%>\% session_follow_link(css = "p a") \%>\% html_elements("p") } } rvest/man/html_encoding_guess.Rd0000644000176200001440000000205113775424150016505 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/encoding.R \name{html_encoding_guess} \alias{html_encoding_guess} \alias{guess_encoding} \title{Guess faulty character encoding} \usage{ html_encoding_guess(x) } \arguments{ \item{x}{A character vector.} } \description{ \code{html_encoding_guess()} helps you handle web pages that declare an incorrect encoding. Use \code{html_encoding_guess()} to generate a list of possible encodings, then try each out by using \code{encoding} argument of \code{read_html()}. \code{html_encoding_guess()} replaces the deprecated \code{guess_encoding()}. 
} \examples{ # A file with bad encoding included in the package path <- system.file("html-ex", "bad-encoding.html", package = "rvest") x <- read_html(path) x \%>\% html_elements("p") \%>\% html_text() html_encoding_guess(x) # Two valid encodings, only one of which is correct read_html(path, encoding = "ISO-8859-1") \%>\% html_elements("p") \%>\% html_text() read_html(path, encoding = "ISO-8859-2") \%>\% html_elements("p") \%>\% html_text() } rvest/man/html_children.Rd0000644000176200001440000000126213776122153015303 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/html.R \name{html_children} \alias{html_children} \title{Get element children} \usage{ html_children(x) } \arguments{ \item{x}{A document (from \code{\link[=read_html]{read_html()}}), node set (from \code{\link[=html_elements]{html_elements()}}), node (from \code{\link[=html_element]{html_element()}}), or session (from \code{\link[=session]{session()}}).} } \description{ Get element children } \examples{ html <- minimal_html("
  • 1
  • 2
  • 3
") ul <- html_elements(html, "ul") html_children(ul) html <- minimal_html("

Hello Hadley!") p <- html_elements(html, "p") html_children(p) } rvest/DESCRIPTION0000644000176200001440000000226014562475432013137 0ustar liggesusersPackage: rvest Title: Easily Harvest (Scrape) Web Pages Version: 1.0.4 Authors@R: c( person("Hadley", "Wickham", , "hadley@posit.co", role = c("aut", "cre")), person("Posit Software, PBC", role = c("cph", "fnd")) ) Description: Wrappers around the 'xml2' and 'httr' packages to make it easy to download, then manipulate, HTML and XML. License: MIT + file LICENSE URL: https://rvest.tidyverse.org/, https://github.com/tidyverse/rvest BugReports: https://github.com/tidyverse/rvest/issues Depends: R (>= 3.6) Imports: cli, glue, httr (>= 0.5), lifecycle (>= 1.0.3), magrittr, rlang (>= 1.1.0), selectr, tibble, xml2 (>= 1.3) Suggests: chromote, covr, knitr, R6, readr, repurrrsive, rmarkdown, spelling, stringi (>= 0.3.1), testthat (>= 3.0.2), webfakes VignetteBuilder: knitr Config/Needs/website: tidyverse/tidytemplate Config/testthat/edition: 3 Config/testthat/parallel: true Encoding: UTF-8 Language: en-US RoxygenNote: 7.3.1 NeedsCompilation: no Packaged: 2024-02-12 17:13:26 UTC; hadleywickham Author: Hadley Wickham [aut, cre], Posit Software, PBC [cph, fnd] Maintainer: Hadley Wickham Repository: CRAN Date/Publication: 2024-02-12 20:10:02 UTC rvest/build/0000755000176200001440000000000014562450666012532 5ustar liggesusersrvest/build/vignette.rds0000644000176200001440000000035014562450666015067 0ustar liggesusersuA0E+ H8;7nBQLHyrqEg2e0a4C(6ȅ2ޱ\7"q|-s,{D)LؼE2?xQT-p&iluI^8rvVDgqVJ\(9ې{SsC.XPڶ}&(:^% ~Pg{rvest/tests/0000755000176200001440000000000013767413737012601 5ustar liggesusersrvest/tests/spelling.R0000644000176200001440000000024113767413737014536 0ustar liggesusersif(requireNamespace('spelling', quietly = TRUE)) spelling::spell_check_test(vignettes = TRUE, error = FALSE, skip_on_cran = TRUE) rvest/tests/testthat/0000755000176200001440000000000014562475432014433 5ustar 
liggesusersrvest/tests/testthat/test-table.R0000644000176200001440000001105314554031036016607 0ustar liggesuserstest_that("can parse simple table", { html <- minimal_html('
xyz
1EveJackson
2JohnDoe
') table <- html_table(html)[[1]] expect_snapshot_output(table) }) test_that("strips whitespace", { html <- minimal_html('
x
x
x
x
') table <- html_table(html)[[1]] expect_equal(table$x, c("x", "x", "x")) }) test_that("can parse with colspan", { html <- minimal_html('
xyz
1
12
12
') table <- html_table(html)[[1]] expect_snapshot_output(table) }) test_that("can parse with rowspan", { html <- minimal_html('
xyz
123
23
3
') table <- html_table(html)[[1]] expect_snapshot_output(table) }) test_that("can handle wobbling rowspan", { html <- minimal_html('
xyz
1a1b1c
2b
3a3c
') table <- html_table(html)[[1]] expect_snapshot_output(table) }) test_that("can handle trailing rowspans", { html <- minimal_html('
xyz
1 2 3
') table <- html_table(html)[[1]] expect_snapshot_output(table) }) test_that("can handle blank colspans", { html <- minimal_html('
xy
1 2
3
') table <- html_table(html)[[1]] expect_snapshot_output(table) }) test_that("can handle blank rowspans", { html <- minimal_html('
xy
1 2
3
') table <- html_table(html)[[1]] expect_snapshot_output(table) }) test_that("can handle empty row", { html <- minimal_html('
x
2
') table <- html_table(html)[[1]] expect_snapshot_output(table) }) test_that("defaults to minimal name repair", { html <- minimal_html('
xx
') table <- html_table(html)[[1]] expect_named(table, c("x", "x", "")) }) test_that("adds names if needed", { html <- minimal_html('
12
') table <- html_table(html)[[1]] expect_named(table, c("X1", "X2")) }) test_that("passes arguments to type.convert", { html <- minimal_html("
xy
NA1,2
") table <- html_table(html, na.strings = "")[[1]] expect_equal(table$x, "NA") table <- html_table(html, dec = ",")[[1]] expect_equal(table$y, 1.2) }) test_that("no conversion", { html <- minimal_html('
xy
001100.0
') table <- html_table(html, convert = FALSE)[[1]] expect_snapshot_output(table) }) test_that("fill = FALSE is deprecated", { html <- minimal_html('
x
1
') expect_snapshot({ . <- html_table(html, fill = FALSE) . <- html_table(html, fill = TRUE) }) }) test_that("can handle empty tables", { html <- minimal_html('
') table <- html_table(html)[[1]] expect_snapshot_output(table) }) test_that("can handle tables consisting of a single empty row", { html <- minimal_html('
') table <- html_table(html)[[1]] expect_snapshot_output(table) }) test_that("can handle tables consisting of only empty rows", { html <- minimal_html('
') table <- html_table(html)[[1]] expect_snapshot_output(table) }) rvest/tests/testthat/test.html0000644000176200001440000000013412362565206016271 0ustar liggesusers

rvest/tests/testthat/test-session.R0000644000176200001440000000557414554031036017216 0ustar liggesuserstest_that("basic session process works as expected", { expect_snapshot({ s <- session("http://hadley.nz/") s expect_true(is.session(s)) s <- session_follow_link(s, css = "p a") session_history(s) }) }) test_that("session caches xml parsing and sets base url", { s <- session("https://rvest.tidyverse.org/") expect_equal(s$cache$html, NULL) html <- read_html(s) expect_true(rlang::is_reference(s$cache$html, html)) expect_equal(xml2::xml_url(html), "https://rvest.tidyverse.org/") }) test_that("errors if try to access HTML from non-HTML page", { expect_snapshot(error = TRUE, { s <- session("https://rvest.tidyverse.org/logo.png") read_html(s) }) }) test_that("session responds to httr and rvest methods", { # skip_on_cran() s <- session("http://hadley.nz/") expect_silent(html_form(s)) expect_silent(html_table(s)) expect_silent(html_element(s, "body")) expect_silent(html_element(s, "body")) expect_silent(status_code(s)) expect_silent(headers(s)) expect_silent(cookies(s)) }) test_that("informative errors for bad inputs", { expect_snapshot_error(check_form(1)) expect_snapshot_error(check_session(1)) }) # navigation -------------------------------------------------------------- test_that("can navigate back and forward", { s <- session("https://hadley.nz/") expect_equal(s$back, character()) expect_equal(s$forward, character()) expect_snapshot_error(session_back(s)) expect_snapshot_error(session_forward(s)) s <- session_jump_to(s, "https://r4ds.hadley.nz/") expect_equal(s$back, "https://hadley.nz/") expect_equal(s$forward, character()) expect_equal(session_forward(session_back(s))$url, s$url) s <- session_back(s) expect_equal(s$back, character()) expect_equal(s$forward, "https://r4ds.hadley.nz/") s <- session_forward(s) expect_equal(s$back, "https://hadley.nz/") expect_equal(s$forward, character()) }) test_that("can find link by position, content, css, or xpath", { html <- 
minimal_html(" a b ") expect_equal(find_href(html, i = 1), "a") expect_equal(find_href(html, i = "b"), "b") expect_equal(find_href(html, css = "a.b"), "b") # Failure modes expect_snapshot(find_href(html, i = 1, css = "a"), error = TRUE) expect_snapshot(find_href(html, i = TRUE), error = TRUE) expect_snapshot(find_href(html, i = "c"), error = TRUE) expect_snapshot(find_href(html, css = "p a"), error = TRUE) }) test_that("can submit a form", { app <- local_test_app() html <- minimal_html('
') form <- html_form(html, base_url = app$url())[[1]] s <- session("http://hadley.nz/") s <- session_submit(s, form) expect_s3_class(s, "rvest_session") resp <- httr::content(s$response) expect_equal(resp$query, "x=1&y=2") }) rvest/tests/testthat/html/0000755000176200001440000000000014557166073015401 5ustar liggesusersrvest/tests/testthat/html/click.html0000644000176200001440000000076614557166073017365 0ustar liggesusers Button Event Example

rvest/tests/testthat/html/table.html0000644000176200001440000000103414557166073017354 0ustar liggesusers Simple Table
Header 1 Header 2 Header 3
Row 1, Cell 1 Row 1, Cell 2 Row 1, Cell 3
Row 2, Cell 1 Row 2, Cell 2 Row 2, Cell 3
rvest/tests/testthat/html/scroll.html0000644000176200001440000000054214557166073017566 0ustar liggesusers Scroll Position Display

Bottom

rvest/tests/testthat/html/press.html0000644000176200001440000000057014557166073017425 0ustar liggesusers Keypress Description

rvest/tests/testthat/html/type.html0000644000176200001440000000054014557166073017247 0ustar liggesusers Text Replication

rvest/tests/testthat/html/bullets.html0000644000176200001440000000027014557166073017740 0ustar liggesusers Simple Bulleted List
  • Item 1
  • Item 2
  • Item 3
  • Item 4
rvest/tests/testthat/test-form.R0000644000176200001440000001421014554042640016464 0ustar liggesuserstest_that("can find from from doc, nodes, and node", { html <- minimal_html('
') forms <- html_form(html) expect_type(forms, "list") expect_length(forms, 2) forms <- html_form(html_elements(html, "form")) expect_type(forms, "list") expect_length(forms, 2) form <- html_form(html_element(html, "form")) expect_s3_class(form, "rvest_form") }) test_that("has useful print method", { html <- minimal_html('
') expect_snapshot(html_form(html, base_url = "http://google.com")[[1]]) expect_snapshot(html_form(html)[[1]]$fields[[2]]) }) test_that("select options are named character vector", { select <- minimal_html("select parsing", '
') form <- select %>% html_element("form") %>% html_form() expect_equal(form$fields[[1]]$options, c(a = "1", b = "2")) }) test_that("select values are inherited from names", { page <- minimal_html("optional values", ' ') opts <- page %>% html_element('select') %>% parse_select() expect_equal(opts$options, c(x = "1", y = "y")) }) test_that("parse_fields gets the button", { select <- minimal_html("button test", '
') form <- select %>% html_element("form") %>% html_form() expect_equal(form$fields[[1]]$type, "button") }) test_that("handles different encoding types", { expect_equal(convert_enctype(NULL), "form") expect_equal(convert_enctype("application/x-www-form-urlencoded"), "form") expect_equal(convert_enctype("multipart/form-data"), "multipart") expect_snapshot(convert_enctype("unknown")) }) test_that("validates its inputs", { select <- minimal_html("button test", '
') expect_snapshot(error = TRUE, { html_form(html_element(select, "button")) html_form(select, base_url = 1) }) }) # set -------------------------------------------------------------- test_that("can set values of inputs", { html <- minimal_html('
') form <- html_form(html)[[1]] form <- html_form_set(form, text = "abc") expect_equal(form$fields$text$value, "abc") # warns that setting hidden field expect_snapshot(form <- html_form_set(form, hidden = "abc")) expect_equal(form$fields$hidden$value, "abc") }) test_that("has informative errors", { html <- minimal_html('
') form <- html_form(html)[[1]] expect_snapshot(html_form_set(form, text = "x"), error = TRUE) expect_snapshot(html_form_set(form, missing = "x"), error = TRUE) }) # submit ------------------------------------------------------------------ test_that("works as expected in simple case", { html <- minimal_html('
') form <- html_form(html, base_url = "http://here.com")[[1]] sub <- submission_build(form, "clickMe") expect_equal(sub$method, "POST") expect_equal(sub$action, "http://here.com/test-path") expect_equal(sub$values, list(x = "1")) }) test_that("useful feedback on invalid forms", { html <- minimal_html("
") form <- html_form(html)[[1]] expect_snapshot(submission_build(form, NULL), error = TRUE) html <- minimal_html("
") form <- html_form(html)[[1]] expect_snapshot(x <- submission_build(form, NULL)) }) test_that("can handle multiple values", { html <- minimal_html('
') form <- html_form(html)[[1]] form <- html_form_set(form, x = c("1", "2", "3"), y = character()) expect_equal( submission_build_values(form), list(x = "1", x = "2", x = "3") ) }) test_that("handles multiple buttons", { html <- minimal_html('
') form <- html_form(html)[[1]] # Messages when picking automatically expect_snapshot(vals <- submission_build_values(form, NULL)) expect_equal(vals, list(one = "1")) expect_equal(submission_build_values(form, "two"), list(two = "2")) expect_equal(submission_build_values(form, 2L), list(two = "2")) # Useful failure messages expect_snapshot(submission_build_values(form, 3L), error = TRUE) expect_snapshot(submission_build_values(form, "three"), error = TRUE) expect_snapshot(submission_build_values(form, TRUE), error = TRUE) }) test_that("handles no buttons", { html <- minimal_html('
') form <- html_form(html)[[1]] expect_equal( submission_build_values(form), list(x = "1") ) }) test_that("can submit using three primary techniques", { app <- local_test_app() html <- minimal_html('
') form <- html_form(html, base_url = app$url())[[1]] expect_snapshot({ show_response(html_form_submit(form)) form$method <- "POST" show_response(html_form_submit(form)) form$enctype <- "multipart" show_response(html_form_submit(form)) }) }) rvest/tests/testthat/test-live.R0000644000176200001440000000702314557166073016476 0ustar liggesuserstest_that("has print method", { skip_if_no_chromote() bullets <- read_html_live(html_test_path("bullets")) expect_snapshot(bullets) }) test_that("can find multiple elements", { skip_if_no_chromote() bullets <- read_html_live(html_test_path("bullets")) # can extract from page ul <- bullets %>% html_elements("ul") expect_length(ul, 1) # or with xpath ul <- bullets %>% html_elements(xpath = ".//ul") expect_length(ul, 1) # can extract from other elements li <- ul %>% html_elements("li") expect_length(li, 4) }) test_that("can extract tables", { skip_if_no_chromote() page <- read_html_live(html_test_path("table")) table <- page %>% html_table() %>% .[[1]] expect_equal(dim(table), c(2, 3)) }) test_that("can find single element", { skip_if_no_chromote() dynamic <- read_html_live("https://rvest.tidyverse.org/articles/starwars.html") static <- read_html("https://rvest.tidyverse.org/articles/starwars.html") expect_equal(html_element(dynamic, "p"), html_element(static, "p")) expect_equal(html_element(dynamic, "xyz"), html_element(static, "xyz")) }) test_that("can click a button", { skip_if_no_chromote() sess <- read_html_live(html_test_path("click")) sess$click("button") expect_equal(html_text(html_element(sess, "p")), "clicked") sess$click("button", 2) expect_equal(html_text(html_element(sess, "p")), "double clicked") }) test_that("can scroll in various ways", { skip_if_no_chromote() sess <- read_html_live(html_test_path("scroll")) expect_equal(sess$get_scroll_position(), list(x = 0, y = 0)) sess$scroll_to(500) Sys.sleep(0.2) expect_equal(sess$get_scroll_position(), list(x = 0, y = 500)) sess$scroll_by(-250) Sys.sleep(0.2) 
expect_equal(sess$get_scroll_position(), list(x = 0, y = 250)) sess$scroll_into_view("#bottom") Sys.sleep(0.2) expect_equal(sess$get_scroll_position(), list(x = 0, y = 685)) }) test_that("can type text", { skip_if_no_chromote() sess <- read_html_live(html_test_path("type")) sess$type("#inputText", "hello") expect_equal(html_text(html_element(sess, "#replicatedText")), "hello") }) test_that("can press special keys",{ skip_if_no_chromote() sess <- read_html_live(html_test_path("press")) sess$press("#inputBox", "ArrowRight") expect_equal(html_text(html_element(sess, "#keyInfo")), "ArrowRight/ArrowRight") sess$press("#inputBox", "BracketRight") expect_equal(html_text(html_element(sess, "#keyInfo")), "]/BracketRight") }) # as_key_desc ------------------------------------------------------------- test_that("gracefully errors on bad inputs", { expect_snapshot(error = TRUE, { as_key_desc("xyz") as_key_desc("X", "Malt") }) }) test_that("automatically adjusts for shift key", { # str(Filter(\(x) has_name(x, "shiftKey"), keydefs)) expect_equal(as_key_desc("KeyA")$key, "a") expect_equal(as_key_desc("KeyA", "Shift")$key, "A") # str(Filter(\(x) has_name(x, "shiftKeyCode"), keydefs)) expect_equal(as_key_desc("Numpad0")$windowsVirtualKeyCode, 45) expect_equal(as_key_desc("Numpad0", "Shift")$windowsVirtualKeyCode, 96) }) test_that("don't send text if modifier pushed", { expect_equal(as_key_desc("KeyA")$text, "a") expect_equal(as_key_desc("KeyA", "Shift")$text, "a") expect_equal(as_key_desc("KeyA", "Alt")$text, "") expect_equal(as_key_desc("KeyA", "Meta")$text, "") expect_equal(as_key_desc("KeyA", "Control")$text, "") }) test_that("modifiers are bitflag", { expect_equal(as_key_desc("KeyA", "Shift")$modifiers, 8) expect_equal(as_key_desc("KeyA", c("Alt", "Control"))$modifiers, 3) }) rvest/tests/testthat/test-selectors.R0000644000176200001440000000416314554034172017533 0ustar liggesuserstest_that("can select one or more nodes", { html <- minimal_html("

") expect_s3_class(html_elements(html, "p"), "xml_nodeset") expect_s3_class(html_element(html, "p"), "xml_node") }) test_that("xpath with // selects from root", { test <- read_html(test_path("test.html")) p <- html_elements(test, xpath = "//p") expect_equal(length(p), 4) p2 <- html_elements(p[[1]], xpath = "//p") expect_equal(length(p2), 4) p3 <- html_elements(p[[3]], xpath = "b") expect_equal(length(p3), 1) b <- html_elements(p, xpath = "b") expect_equal(length(b), 2) }) test_that("css class selects from current value", { test <- read_html(test_path("test.html")) p <- html_elements(test, css = "p") expect_equal(length(p), 4) p3 <- html_elements(p[[3]], css = "b") expect_equal(length(p3), 1) b <- html_elements(p, css = "b") expect_equal(length(b), 2) }) test_that("css selects don't select themselves", { test <- read_html(test_path("test.html")) p <- test %>% html_elements("p") %>% html_elements("p") expect_equal(length(p), 0) p <- test %>% html_elements("p") %>% `[[`(1) %>% html_elements("p") expect_equal(length(p), 0) }) test_that("css selects find all children", { test <- read_html(test_path("test.html")) b <- test %>% html_elements("body") %>% html_elements("b") expect_equal(length(b), 3) }) test_that("empty matches returns empty list", { test <- read_html(test_path("test.html")) none <- test %>% html_elements("none") expect_equal(length(none), 0) expect_equal(none %>% html_element("none") %>% length(), 0) expect_equal(none %>% html_elements("none") %>% length(), 0) }) # make_selector ----------------------------------------------------------- test_that("validates inputs", { expect_snapshot(make_selector(), error = TRUE) expect_snapshot(make_selector("a", "b"), error = TRUE) expect_snapshot(make_selector(css = 1), error = TRUE) expect_snapshot(make_selector(xpath = 1), error = TRUE) }) test_that("converts css to xpath", { expect_equal(make_selector(css = "p"), ".//p") }) test_that("preserves xpath", { expect_equal(make_selector(xpath = ".//p"), ".//p") }) 
rvest/tests/testthat/test-encoding.R0000644000176200001440000000133114553750033017310 0ustar liggesuserstest_that("can guess encoding", { skip("currently broken") skip_on_os("linux") # some hidden dependency on system library path <- system.file("html-ex", "bad-encoding.html", package = "rvest") x <- read_html(path) expect_snapshot(html_encoding_guess(x)) # deprecated expect_snapshot(guess_encoding(x)) }) test_that("encoding repair is deprecated", { skip("currently broken") skip_on_cran() skip_on_os("linux") # some hidden dependency on system library path <- system.file("html-ex", "bad-encoding.html", package = "rvest") x <- read_html(path) text <- html_text(html_element(x, "p")) expect_snapshot(repair_encoding(text), error = TRUE) expect_snapshot(repair_encoding(text, "ISO-8859-1")) }) rvest/tests/testthat/helper.R0000644000176200001440000000260014557166073016035 0ustar liggesuserslocal_test_app <- function(envir = parent.frame()) { skip_if_not_installed("webfakes") webfakes::local_app_process(app_request(), .local_envir = envir) } app_request <- function() { req_json <- function(req, res) { out <- list( method = req$method, query = req$query_string, type = req$headers$`Content-Type` %||% NA_character_, body = rawToChar(req$.body %||% raw()) ) res$send_json(out, auto_unbox = TRUE) } app <- webfakes::new_app() app$post("/", req_json) app$get("/", req_json) app } show_response <- function(x) { strip_divider <- function(x) { gsub("-{3,}[A-Za-z0-9-]+", "---{divider}", x) } x <- httr::content(x) cat_line(toupper(x$method), " ", strip_divider(x$type)) cat_line("Query string: ", x$query) cat_line(strip_divider(x$body)) } # chromote ---------------------------------------------------------------- skip_if_no_chromote <- function() { skip_on_cran() skip_if(lacks_chromote(), "chromote not available") } lacks_chromote <- function() { # We try twice because in particular Windows on GHA seems to need it, # but it doesn't otherwise hurt. 
More details at # https://github.com/rstudio/shinytest2/issues/209 env_cache(the, "lacks_chromote", !has_chromote() && !has_chromote()) } html_test_path <- function(name) { paste0("file://", normalizePath(test_path(paste0("html/", name, ".html")))) } rvest/tests/testthat/test-text.R0000644000176200001440000000644613775423635016534 0ustar liggesuserstest_that("html_text returns raw html", { html <- minimal_html("

x\ny
z

") p <- html_elements(html, "p") expect_equal(html_text(p), "x\nyz") }) # html_text2 -------------------------------------------------------------- test_that("handles block containing only inline elements", { html <- minimal_html("

a b c

") expect_equal(html_text2(html), "a b c") # internal newlines are trimmed html <- minimal_html("

a\n\nb\nc

") expect_equal(html_text2(html), "a b c") }) test_that("handles multiple paragraphs with line breaks", { html <- minimal_html("

a

b
c ") expect_equal(html_text2(html), "a\n\nb\nc") expect_equal(html_text2(html_elements(html, "p")), c("a", "b\nc")) }) test_that("handles table", { html <- minimal_html("
ab
12
23
") expect_equal(html_text2(html), "a\tb\n1\t2\n2\t3") }) test_that("handles mixed block as well as can be expected", { html <- minimal_html("

a

b
") expect_equal(html_text2(html_element(html, "div")), "a\n\nb\n") }) test_that("returns NA for xml_missing", { expect_equal(html_text2(xml2::xml_missing()), NA_character_) }) test_that("breaks as expected", { expect_identical(tag_margin("p"), 2L) expect_identical(tag_margin("li"), 1L) expect_identical(tag_margin("b"), 0L) }) # inline ------------------------------------------------------------------ test_that("handle single line of text", { html <- minimal_html("

a b c

") expect_equal(html_text_inline(html_element(html, "p")), "a b c") # collapses space across nodes html <- minimal_html("

a b c

") expect_equal(html_text_inline(html_element(html, "p")), "a b c") }) test_that("converts br to \n", { html <- minimal_html("


x

") expect_equal(html_text_inline(html_element(html, "p")), "\nx") html <- minimal_html("

x

") expect_equal(html_text_inline(html_element(html, "p")), "x\n") html <- minimal_html("



") expect_equal(html_text_inline(html_element(html, "p")), "\n\n") }) test_that("empty block returns empty string", { html <- minimal_html("

") expect_equal(html_text_inline(html_element(html, "p")), "") }) test_that("collapse whitespace handles single line", { expect_equal(collapse_whitespace("\n\tx\t\n"), "x") expect_equal(collapse_whitespace("x y"), "x y") }) test_that("optionally preserve nbsp", { expect_equal(collapse_whitespace("x \u00a0 y"), "x y") expect_equal(collapse_whitespace("x\u00a0y", TRUE), "x\u00a0y") }) # PaddedText -------------------------------------------------------------- test_that("margins only added within text", { text <- PaddedText$new() text$add_margin(1) text$add_text("x") text$add_margin(1) expect_equal(text$output(), "x") }) test_that("margins are collapsed", { text <- PaddedText$new() text$add_text("x") text$add_margin(1) expect_equal(text$lines, 1) text$add_margin(2) expect_equal(text$lines, 2) text$add_text("y") expect_equal(text$output(), "x\n\ny") }) test_that("empty text is ignored", { text <- PaddedText$new() text$add_text("") text$add_margin(1) text$add_text("x") expect_equal(text$output(), "x") }) rvest/tests/testthat/_snaps/0000755000176200001440000000000014560224013015677 5ustar liggesusersrvest/tests/testthat/_snaps/form.md0000644000176200001440000000605714560213754017206 0ustar liggesusers# has useful print method Code html_form(html, base_url = "http://google.com")[[1]] Output
'test' (POST http://google.com/test-path) (select) select: (text) name: Hadley (password) name: ****** (button) clickMe: (textarea) address: ABCDEF --- Code html_form(html)[[1]]$fields[[2]] Output (text) name: Hadley # handles different encoding types Code convert_enctype("unknown") Condition Warning: Unknown enctype (unknown). Defaulting to form encoded. Output [1] "form" # validates its inputs Code html_form(html_element(select, "button")) Condition Error in `html_form()`: ! `x` must be a element. Code html_form(select, base_url = 1) Condition Error in `FUN()`: ! `base_url` must be a single string or `NULL`, not the number 1. # can set values of inputs Code form <- html_form_set(form, hidden = "abc") Condition Warning: Setting value of hidden field "hidden". # has informative errors Code html_form_set(form, text = "x") Condition Error in `html_form_set()`: ! Can't change value of input with type submit: "text". --- Code html_form_set(form, missing = "x") Condition Error in `html_form_set()`: ! Can't set value of fields that don't exist: "missing". # useful feedback on invalid forms Code submission_build(form, NULL) Condition Error: ! `form` doesn't contain a `action` attribute. --- Code x <- submission_build(form, NULL) Condition Warning: Invalid method (FOO), defaulting to GET. # handles multiple buttons Code vals <- submission_build_values(form, NULL) Message Submitting with button "one". --- Code submission_build_values(form, 3L) Condition Error: ! Numeric `submit` out of range. --- Code submission_build_values(form, "three") Condition Error: ! No found with name "three". i Possible values: "one" and "two". --- Code submission_build_values(form, TRUE) Condition Error: ! `submit` must be NULL, a string, or a number. 
# can submit using three primary techniques Code show_response(html_form_submit(form)) Output GET Query string: x=1&x=2&y=3 Code form$method <- "POST" show_response(html_form_submit(form)) Output POST application/x-www-form-urlencoded Query string: x=1&x=2&y=3 Code form$enctype <- "multipart" show_response(html_form_submit(form)) Output POST multipart/form-data; boundary=---{divider} Query string: ---{divider} Content-Disposition: form-data; name="x" 1 ---{divider} Content-Disposition: form-data; name="x" 2 ---{divider} Content-Disposition: form-data; name="y" 3 ---{divider} rvest/tests/testthat/_snaps/table.md0000644000176200001440000000341214560213753017321 0ustar liggesusers# can parse simple table # A tibble: 2 x 3 x y z 1 1 Eve Jackson 2 2 John Doe # can parse with colspan # A tibble: 3 x 3 x y z 1 1 1 1 2 1 1 2 3 1 2 2 # can parse with rowspan # A tibble: 3 x 3 x y z 1 1 2 3 2 1 2 3 3 1 2 3 # can handle wobbling rowspan # A tibble: 3 x 3 x y z 1 1a 1b 1c 2 1a 2b 1c 3 3a 2b 3c # can handle trailing rowspans # A tibble: 4 x 3 x y z 1 1 2 3 2 NA 2 3 3 NA 2 NA 4 NA 2 NA # can handle blank colspans # A tibble: 2 x 2 x y 1 1 2 2 3 3 # can handle blank rowspans # A tibble: 2 x 2 x y 1 1 2 2 3 3 # can handle empty row # A tibble: 1 x 1 x 1 2 # no conversion # A tibble: 1 x 2 x y 1 001 100.0 # fill = FALSE is deprecated Code . <- html_table(html, fill = FALSE) Condition Warning: The `fill` argument of `html_table()` is deprecated as of rvest 1.0.0. i An improved algorithm fills by default so it is no longer needed. Code . 
<- html_table(html, fill = TRUE) # can handle empty tables # A tibble: 0 x 0 # can handle tables consisting of a single empty row # A tibble: 0 x 0 # can handle tables consisting of only empty rows # A tibble: 0 x 0 rvest/tests/testthat/_snaps/session.md0000644000176200001440000000276114560213755017725 0ustar liggesusers# basic session process works as expected Code s <- session("http://hadley.nz/") s Output https://hadley.nz/ Status: 200 Type: text/html; charset=utf-8 Size: 821273 Code expect_true(is.session(s)) s <- session_follow_link(s, css = "p a") Message Navigating to . Code session_history(s) Output https://hadley.nz/ - https://posit.co/ # errors if try to access HTML from non-HTML page Code s <- session("https://rvest.tidyverse.org/logo.png") read_html(s) Condition Error in `read_html()`: ! Page doesn't appear to be html. # informative errors for bad inputs `form` must be a single form produced by `html_form()`. --- `x` must be produced by `session()`. # can navigate back and forward Can't go back any further. --- Can't go forward any further. # can find link by position, content, css, or xpath Code find_href(html, i = 1, css = "a") Condition Error: ! Exactly one of `i`, `css`, or `xpath` must be supplied. x `i` and `css` were supplied together. --- Code find_href(html, i = TRUE) Condition Error: ! `i` must be a string or integer. --- Code find_href(html, i = "c") Condition Error: ! No links have text "c". --- Code find_href(html, css = "p a") Condition Error: ! No links matched `css`/`xpath` rvest/tests/testthat/_snaps/selectors.md0000644000176200001440000000100714560754163020240 0ustar liggesusers# validates inputs Code make_selector() Condition Error: ! One of `css` or `xpath` must be supplied. --- Code make_selector("a", "b") Condition Error: ! Exactly one of `css` or `xpath` must be supplied. --- Code make_selector(css = 1) Condition Error: ! `css` must be a single string, not the number 1. --- Code make_selector(xpath = 1) Condition Error: ! 
`xpath` must be a single string, not the number 1. rvest/tests/testthat/_snaps/rename.md0000644000176200001440000000335414560213753017506 0ustar liggesusers# xml functions are deprecated Code . <- xml_tag(x) Condition Warning: `xml_tag()` was deprecated in rvest 1.0.0. i Please use `html_name()` instead. --- Code . <- xml_node(x, "p") Condition Warning: `xml_node()` was deprecated in rvest 1.0.0. i Please use `html_element()` instead. --- Code . <- xml_nodes(x, "p") Condition Warning: `xml_nodes()` was deprecated in rvest 1.0.0. i Please use `html_elements()` instead. # set_values() is deprecated Code set_values(form, text = "abc") Condition Warning: `set_values()` was deprecated in rvest 1.0.0. i Please use `html_form_set()` instead. Output '' (GET ) (text) text: abc # prefixless session functions are deprecated Code s <- html_session("http://rvest.tidyverse.org/") Condition Warning: `html_session()` was deprecated in rvest 1.0.0. i Please use `session()` instead. Code . <- follow_link(s, i = 1) Condition Warning: `follow_link()` was deprecated in rvest 1.0.0. i Please use `session_follow_link()` instead. Message Navigating to <#container>. Code s <- jump_to(s, "https://rvest.tidyverse.org/reference/index.html") Condition Warning: `jump_to()` was deprecated in rvest 1.0.0. i Please use `session_jump_to()` instead. Code s <- back(s) Condition Warning: `back()` was deprecated in rvest 1.0.0. i Please use `session_back()` instead. Code s <- forward(s) Condition Warning: `forward()` was deprecated in rvest 1.0.0. i Please use `session_forward()` instead. 
rvest/tests/testthat/_snaps/encoding.md0000644000176200001440000000312514560213752020020 0ustar liggesusers# can guess encoding Code html_encoding_guess(x) Output encoding language confidence 1 ISO-8859-1 fr 0.31 2 ISO-8859-2 ro 0.22 3 UTF-16BE 0.10 4 UTF-16LE 0.10 5 GB18030 zh 0.10 6 Big5 zh 0.10 7 ISO-8859-9 tr 0.06 8 IBM424_rtl he 0.01 9 IBM424_ltr he 0.01 --- Code guess_encoding(x) Condition Warning: `guess_encoding()` was deprecated in rvest 1.0.0. i Please use `html_encoding_guess()` instead. Output encoding language confidence 1 ISO-8859-1 fr 0.31 2 ISO-8859-2 ro 0.22 3 UTF-16BE 0.10 4 UTF-16LE 0.10 5 GB18030 zh 0.10 6 Big5 zh 0.10 7 ISO-8859-9 tr 0.06 8 IBM424_rtl he 0.01 9 IBM424_ltr he 0.01 # encoding repair is deprecated Code repair_encoding(text) Condition Warning: `html_encoding_repair()` was deprecated in rvest 1.0.0. i Instead, re-load using the `encoding` argument of `read_html()` Error: ! No guess has more than 50% confidence --- Code repair_encoding(text, "ISO-8859-1") Condition Warning: `html_encoding_repair()` was deprecated in rvest 1.0.0. i Instead, re-load using the `encoding` argument of `read_html()` Output [1] "Émigré cause célèbre déjà vu." rvest/tests/testthat/_snaps/html.md0000644000176200001440000000046614560213752017203 0ustar liggesusers# validates inputs Code html_attr(html, 1) Condition Error in `html_attr()`: ! `name` must be a single string, not the number 1. Code html_attr(html, "id", 1) Condition Error in `html_attr()`: ! `default` must be a single string or `NA`, not the number 1. rvest/tests/testthat/_snaps/utils.md0000644000176200001440000000052614560754171017401 0ustar liggesusers# minimal html doesn't change unexpectedly Code cat(as.character(minimal_html("

Hi"))) Output

Hi

rvest/tests/testthat/_snaps/live.md0000644000176200001440000000104214560224027017162 0ustar liggesusers# has print method Code bullets Output {xml_nodeset (2)} [1] Simple Bulleted List [2]
    \n
  • Item 1
  • \n
  • Item 2
  • \n
  • Item 3
  • \n
  • Item 4 ... # gracefully errors on bad inputs Code as_key_desc("xyz") Condition Error in `as_key_desc()`: ! No key definition for "xyz". Code as_key_desc("X", "Malt") Condition Error: ! `modifiers` must be one of "Alt", "Control", "Meta", or "Shift", not "Malt". i Did you mean "Alt"? rvest/tests/testthat/test-html.R0000644000176200001440000000106614554042640016472 0ustar liggesuserstest_that("forwards to xml2 functions", { html <- minimal_html("

    Hello children

    ") p <- html_elements(html, "p") expect_equal(html_name(p), "p") expect_equal(html_attr(p, "id"), "x") expect_equal(html_attr(p, "id2"), NA_character_) expect_equal(html_attrs(p), list(c(id = "x"))) expect_equal(html_children(p), html_elements(html, "i")) }) test_that("validates inputs", { html <- minimal_html("

    Hello children

    ") expect_snapshot(error = TRUE, { html_attr(html, 1) html_attr(html, "id", 1) }) }) rvest/tests/testthat/test-utils.R0000644000176200001440000000040013775334175016670 0ustar liggesuserstest_that("can truncate strings", { expect_equal(str_trunc("abcdef", 10), "abcdef") expect_equal(str_trunc("abcdef", 4), "a...") }) test_that("minimal html doesn't change unexpectedly", { expect_snapshot(cat(as.character(minimal_html("

    Hi")))) }) rvest/tests/testthat/test-rename.R0000644000176200001440000000172413775436633017013 0ustar liggesuserstest_that("xml functions are deprecated", { x <- minimal_html("

    Hello

    ") expect_snapshot(. <- xml_tag(x)) expect_snapshot(. <- xml_node(x, "p")) expect_snapshot(. <- xml_nodes(x, "p")) }) test_that("html_node(s) is superseded (no warnings)", { x <- minimal_html("

    Hello

    ") expect_equal(html_node(x, "p"), html_element(x, "p")) expect_equal(html_nodes(x, "p"), html_elements(x, "p")) }) test_that("set_values() is deprecated", { html <- minimal_html('
  • ') form <- html_form(html)[[1]] expect_snapshot(set_values(form, text = "abc")) }) test_that("prefixless session functions are deprecated", { expect_snapshot({ s <- html_session("http://rvest.tidyverse.org/") . <- follow_link(s, i = 1) s <- jump_to(s, "https://rvest.tidyverse.org/reference/index.html") s <- back(s) s <- forward(s) }) }) # session_submit() is tested in form-submit because it needs a test server rvest/tests/testthat.R0000644000176200001440000000006612364025001014535 0ustar liggesuserslibrary(testthat) library(rvest) test_check("rvest") rvest/vignettes/0000755000176200001440000000000014562450666013443 5ustar liggesusersrvest/vignettes/rvest.Rmd0000644000176200001440000002701214554031036015237 0ustar liggesusers--- title: "Web scraping 101" output: rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{Web scraping 101} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- ```{r, echo=FALSE} knitr::opts_chunk$set(comment = "#>", collapse = TRUE) ``` This vignette introduces you to the basics of web scraping with rvest. You'll first learn the basics of HTML and how to use CSS selectors to refer to specific elements, then you'll learn how to use rvest functions to get data out of HTML and into R. ```{r} library(rvest) ``` ## HTML basics HTML stands for "HyperText Markup Language" and looks like this: ``` {.html} Page title

    A heading

    Some text & some bold text.

    ``` HTML has a hierarchical structure formed by **elements** which consist of a start tag (e.g. `<tag>`), optional **attributes** (`id='first'`), an end tag[^1] (like `</tag>`), and **contents** (everything in between the start and end tag). [^1]: A number of tags (including `<p>` and `<li>`) don't require end tags, but I think it's best to include them because it makes seeing the structure of the HTML a little easier. Since `<` and `>` are used for start and end tags, you can't write them directly. Instead you have to use the HTML **escapes** `&gt;` (greater than) and `&lt;` (less than). And since those escapes use `&`, if you want a literal ampersand you have to escape it as `&amp;`. There is a wide range of possible HTML escapes but you don't need to worry about them too much because rvest automatically handles them for you. ### Elements All up, there are over 100 HTML elements. Some of the most important are: - Every HTML page must be in an `<html>` element, and it must have two children: `<head>`, which contains document metadata like the page title, and `<body>`, which contains the content you see in the browser. - Block tags like `<h1>` (heading 1), `<p>` (paragraph), and `<ol>` (ordered list) form the overall structure of the page. - Inline tags like `<b>` (bold), `<i>` (italics), and `<a>` (links) format text inside block tags. If you encounter a tag that you've never seen before, you can find out what it does with a little googling. I recommend the [MDN Web Docs](https://developer.mozilla.org/en-US/docs/Web/HTML) which are produced by Mozilla, the company that makes the Firefox web browser. ### Contents Most elements can have content in between their start and end tags. This content can either be text or more elements. For example, the following HTML contains a paragraph of text, with one word in bold. ```{=html}

      Hi! My name is Hadley.

      ``` The **children** of a node refer only to elements, so the `<p>` element above has one child, the `<b>` element. The `<b>` element has no children, but it does have contents (the text "name"). Some elements, like `<img>`, can't have children. These elements depend solely on attributes for their behavior. ### Attributes Tags can have named **attributes** which look like `name1='value1' name2='value2'`. Two of the most important attributes are `id` and `class`, which are used in conjunction with CSS (Cascading Style Sheets) to control the visual appearance of the page. These are often useful when scraping data off a page. ## Reading HTML with rvest You'll usually start the scraping process with `read_html()`. This returns an `xml_document`[^2] object which you'll then manipulate using rvest functions: [^2]: This class comes from the [xml2](https://xml2.r-lib.org) package. xml2 is a low-level package that rvest builds on top of. ```{r} html <- read_html("http://rvest.tidyverse.org/") class(html) ``` For examples and experimentation, rvest also includes a function that lets you create an `xml_document` from literal HTML: ```{r} html <- minimal_html("

      This is a paragraph

      • This is a bulleted list
      ") html ``` Regardless of how you get the HTML, you'll need some way to identify the elements that contain the data you care about. rvest provides two options: CSS selectors and XPath expressions. Here I'll focus on CSS selectors because they're simpler but still sufficiently powerful for most scraping tasks. ## CSS selectors CSS is short for cascading style sheets, and is a tool for defining the visual styling of HTML documents. CSS includes a miniature language for selecting elements on a page called **CSS selectors**. CSS selectors define patterns for locating HTML elements, and are useful for scraping because they provide a concise way of describing which elements you want to extract. CSS selectors can be quite complex, but fortunately you only need the simplest for rvest, because you can also write R code for more complicated situations. The four most important selectors are: - `p`: selects all `<p>` elements. - `.title`: selects all elements with `class` "title". - `p.special`: selects all `<p>` elements with `class` "special". - `#title`: selects the element with the `id` attribute that equals "title". Id attributes must be unique within a document, so this will only ever select a single element. If you want to learn more about CSS selectors I recommend starting with the fun [CSS diner](https://flukeout.github.io/) tutorial and then referring to the [MDN web docs](https://developer.mozilla.org/en-US/docs/Web/CSS/CSS_Selectors). Let's try out the most important selectors with a simple example: ```{r} html <- minimal_html("

      This is a heading

      This is a paragraph

      This is an important paragraph

      ") ``` In rvest you can extract a single element with `html_element()` or all matching elements with `html_elements()`. Both functions take a document[^3] and a CSS selector: [^3]: Or another element, more on that shortly. ```{r} html %>% html_element("h1") html %>% html_elements("p") html %>% html_elements(".important") html %>% html_elements("#first") ``` Selectors can also be combined in various ways using **combinators**. The most important combinator is " ", the **descendant** combinator. For example, `p a` selects all `<a>` elements that are descendants of a `<p>` element. If you don't know exactly what selector you need, I highly recommend using [SelectorGadget](https://rvest.tidyverse.org/articles/selectorgadget.html), which lets you automatically generate the selector you need by supplying positive and negative examples in the browser. ## Extracting data Now that you've got the elements you care about, you'll need to get data out of them. You'll usually get the data from either the text contents or an attribute. But, sometimes (if you're lucky!), the data you need will be in an HTML table. ### Text Use `html_text2()` to extract the plain text contents of an HTML element: ```{r} html <- minimal_html("

      1. apple & pear
      2. banana
      3. pineapple
      ") html %>% html_elements("li") %>% html_text2() ``` Note that the escaped ampersand is automatically converted to `&`; you'll only ever see HTML escapes in the source HTML, not in the data returned by rvest. You might wonder why I used `html_text2()`, since it seems to give the same result as `html_text()`: ```{r} html %>% html_elements("li") %>% html_text() ``` The main difference is how the two functions handle white space. In HTML, white space is largely ignored, and it's the structure of the elements that defines how text is laid out. `html_text2()` does its best to follow the same rules, giving you something similar to what you'd see in the browser. Take this example which contains a bunch of white space that HTML ignores. ```{r} html <- minimal_html("

      This is a paragraph.

      This is another paragraph. It has two sentences.

      ") ``` `html_text2()` gives you what you expect: two paragraphs of text separated by a blank line. ```{r} html %>% html_element("body") %>% html_text2() %>% cat() ``` Whereas `html_text()` returns the garbled raw underlying text: ```{r} html %>% html_element("body") %>% html_text() %>% cat() ``` ### Attributes Attributes are used to record the destination of links (the `href` attribute of `<a>` elements) and the source of images (the `src` attribute of the `<img>` element): ```{r} html <- minimal_html("

      cats

      ") ``` The value of an attribute can be retrieved with `html_attr()`: ```{r} html %>% html_elements("a") %>% html_attr("href") html %>% html_elements("img") %>% html_attr("src") ``` Note that `html_attr()` always returns a string, so you may need to post-process with `as.integer()`/`readr::parse_integer()` or similar. ```{r} html %>% html_elements("img") %>% html_attr("width") html %>% html_elements("img") %>% html_attr("width") %>% as.integer() ``` ### Tables HTML tables are composed four main elements: ``, `` (table row), ` i <- length(values) + 1 length(values) <- height while (length(dw$col) > 0) { vals <- rep(NA_character_, width) for (col in dw$col) { cell <- dw_find(dw, col) vals[col:(col + cell$colspan - 1L)] <- cell$text } values[[i]] <- vals i <- i + 1 dw <- dw_prune(dw) } values <- lapply(values, `[`, seq_len(width)) matrix(unlist(values), ncol = width, byrow = TRUE) } dw_find <- function(dw, col) { match <- col == dw$col list( col = dw$col[match], rowspan = dw$rowspan[match], colspan = dw$colspan[match], text = dw$text[match] ) } dw_init <- function() { list( col = integer(), rowspan = integer(), colspan = integer(), text = character() ) } dw_add <- function(dw, col, rowspan, colspan, text) { dw$col <- c(dw$col, col) dw$text <- c(dw$text, text) dw$rowspan <- c(dw$rowspan, rowspan) dw$colspan <- c(dw$colspan, colspan) dw } dw_prune <- function(dw) { dw$rowspan <- dw$rowspan - 1L keep <- dw$rowspan > 0L dw$col <- dw$col[keep] dw$text <- dw$text[keep] dw$rowspan <- dw$rowspan[keep] dw$colspan <- dw$colspan[keep] dw } rvest/R/selectors.R0000644000176200001440000000705614554042640013760 0ustar liggesusers#' Select elements from an HTML document #' #' `html_element()` and `html_elements()` find HTML element using CSS selectors #' or XPath expressions. CSS selectors are particularly useful in conjunction #' with , which makes it very easy to discover the #' selector you need. 
#' #' @section CSS selector support: #' #' CSS selectors are translated to XPath selectors by the \pkg{selectr} #' package, which is a port of the python \pkg{cssselect} library, #' . #' #' It implements the majority of CSS3 selectors, as described in #' . The #' exceptions are listed below: #' #' * Pseudo selectors that require interactivity are ignored: #' `:hover`, `:active`, `:focus`, `:target`, `:visited`. #' * The following pseudo classes don't work with the wild card element, *: #' `*:first-of-type`, `*:last-of-type`, `*:nth-of-type`, #' `*:nth-last-of-type`, `*:only-of-type` #' * It supports `:contains(text)` #' * You can use !=, `[foo!=bar]` is the same as `:not([foo=bar])` #' * `:not()` accepts a sequence of simple selectors, not just a single #' simple selector. #' #' @param x Either a document, a node set or a single node. #' @param css,xpath Elements to select. Supply one of `css` or `xpath` #' depending on whether you want to use a CSS selector or XPath 1.0 #' expression. #' @returns `html_element()` returns a nodeset the same length as the input. #' `html_elements()` flattens the output so there's no direct way to map #' the output to the input. #' @export #' @examples #' html <- minimal_html(" #'

      <h1>This is a heading</h1>
#'   <p id='first'>This is a paragraph</p>
#'   <p class='important'>This is an important paragraph</p>

      #' ") #' #' html %>% html_element("h1") #' html %>% html_elements("p") #' html %>% html_elements(".important") #' html %>% html_elements("#first") #' #' # html_element() vs html_elements() -------------------------------------- #' html <- minimal_html(" #'
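        <ul>
#'     <li>C-3PO is a <i>droid</i> that weighs <span class='weight'>167 kg</span></li>
#'     <li>R2-D2 is a <i>droid</i> that weighs <span class='weight'>96 kg</span></li>
#'     <li>Yoda weighs <span class='weight'>66 kg</span></li>
#'     <li>R4-P17 is a <i>droid</i></li>
#'   </ul>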
      #' ")
#' li <- html %>% html_elements("li")
#'
#' # When applied to a node set, html_elements() returns all matching elements
#' # beneath any of the inputs, flattening results into a new node set.
#' li %>% html_elements("i")
#'
#' # When applied to a node set, html_element() always returns a vector the
#' # same length as the input, using a "missing" element where needed.
#' li %>% html_element("i")
#' # and html_text() and html_attr() will return NA
#' li %>% html_element("i") %>% html_text2()
#' li %>% html_element("span") %>% html_attr("class")
html_element <- function(x, css, xpath) {
  # S3 generic: dispatches on the class of `x`.
  UseMethod("html_element")
}

#' @export
#' @rdname html_element
html_elements <- function(x, css, xpath) {
  # S3 generic: dispatches on the class of `x`.
  UseMethod("html_elements")
}

#' @export
html_elements.default <- function(x, css, xpath) {
  # The selector is built lazily, inside the xml2 call, exactly once.
  xml2::xml_find_all(x, xpath = make_selector(css, xpath))
}

#' @export
html_element.default <- function(x, css, xpath) {
  # The selector is built lazily, inside the xml2 call, exactly once.
  xml2::xml_find_first(x, xpath = make_selector(css, xpath))
}

# Turn the user's `css` / `xpath` pair into a single XPath string.
#
# Exactly one of the two must be supplied and it must be a single string.
# An `xpath` is returned unchanged; a `css` selector is translated with
# selectr, using the ".//" prefix. Errors are reported against `error_call`.
make_selector <- function(css, xpath, error_call = caller_env()) {
  check_exclusive(css, xpath, .call = error_call)

  if (missing(css)) {
    check_string(xpath, call = error_call)
    return(xpath)
  }

  check_string(css, call = error_call)
  selectr::css_to_xpath(css, prefix = ".//")
}
rvest/R/rename.R0000644000176200001440000000525114007274024013213 0ustar liggesusers
#' Functions renamed in rvest 1.0.0
#'
#' @description
#' `r lifecycle::badge('deprecated')`
#'
#' rvest 1.0.0 renamed a number of functions to ensure that every function
#' has a common prefix, matching tidyverse conventions that emerged since
#' rvest was first created.
#'
#' * `set_values()` -> `html_form_set()`
#' * `submit_form()` -> `session_submit()`
#' * `xml_tag()` -> `html_name()`
#' * `xml_node()` & `html_node()` -> `html_element()`
#' * `xml_nodes()` & `html_nodes()` -> `html_elements()`
#'
#' (`html_node()` and `html_nodes()` are only superseded because they're
#' so widely used.)
#'
#' Additionally all session related functions gained a common prefix:
#'
#' * `html_session()` -> `session()`
#' * `forward()` -> `session_forward()`
#' * `back()` -> `session_back()`
#' * `jump_to()` -> `session_jump_to()`
#' * `follow_link()` -> `session_follow_link()`
#'
#' @keywords internal
#' @name rename
#' @aliases NULL
NULL

# Every wrapper below has the same shape: warn that the old name is
# deprecated since 1.0.0, then forward to the replacement. html_node() and
# html_nodes() are the exception: they forward without warning.

#' @rdname rename
#' @export
set_values <- function(form, ...) {
  lifecycle::deprecate_warn(
    when = "1.0.0",
    what = "set_values()",
    with = "html_form_set()"
  )
  html_form_set(form = form, ...)
}

#' @rdname rename
#' @export
submit_form <- function(session, form, submit = NULL, ...) {
  lifecycle::deprecate_warn(
    when = "1.0.0",
    what = "submit_form()",
    with = "session_submit()"
  )
  session_submit(x = session, form = form, submit = submit, ...)
}

#' @export
#' @rdname rename
xml_tag <- function(x) {
  lifecycle::deprecate_warn(
    when = "1.0.0",
    what = "xml_tag()",
    with = "html_name()"
  )
  html_name(x)
}

#' @export
#' @rdname rename
xml_node <- function(...) {
  lifecycle::deprecate_warn(
    when = "1.0.0",
    what = "xml_node()",
    with = "html_element()"
  )
  html_node(...)
}

#' @export
#' @rdname rename
xml_nodes <- function(...) {
  lifecycle::deprecate_warn(
    when = "1.0.0",
    what = "xml_nodes()",
    with = "html_elements()"
  )
  html_nodes(...)
}

#' @export
#' @rdname rename
html_nodes <- function(...) {
  html_elements(...)
}

#' @export
#' @rdname rename
html_node <- function(...) {
  html_element(...)
}

#' @export
#' @rdname rename
back <- function(x) {
  lifecycle::deprecate_warn(
    when = "1.0.0",
    what = "back()",
    with = "session_back()"
  )
  session_back(x)
}

#' @export
#' @rdname rename
forward <- function(x) {
  lifecycle::deprecate_warn(
    when = "1.0.0",
    what = "forward()",
    with = "session_forward()"
  )
  session_forward(x)
}

#' @export
#' @rdname rename
jump_to <- function(x, url, ...) {
  lifecycle::deprecate_warn(
    when = "1.0.0",
    what = "jump_to()",
    with = "session_jump_to()"
  )
  session_jump_to(x, url, ...)
}

#' @export
#' @rdname rename
follow_link <- function(x, ...) {
  lifecycle::deprecate_warn(
    when = "1.0.0",
    what = "follow_link()",
    with = "session_follow_link()"
  )
  session_follow_link(x, ...)
}

#' @export
#' @rdname rename
html_session <- function(url, ...) {
  lifecycle::deprecate_warn(
    when = "1.0.0",
    what = "html_session()",
    with = "session()"
  )
  session(url, ...)
}
rvest/R/rvest-package.R0000644000176200001440000000411714557172067014516 0ustar liggesusers
#' @keywords internal
#' @import rlang
#' @importFrom lifecycle deprecated
"_PACKAGE"

#' Static web scraping (with xml2)
#'
#' @description
#' [read_html()] works by performing a HTTP request then parsing the HTML
#' received using the xml2 package. This is "static" scraping because it
#' operates only on the raw HTML file. While this works for most sites,
#' in some cases you will need to use [read_html_live()] if the parts of
#' the page you want to scrape are dynamically generated with javascript.
#'
#' Generally, we recommend using `read_html()` if it works, as it will be
#' faster and more robust, as it has fewer external dependencies (i.e. it
#' doesn't rely on the Chrome web browser installed on your computer.)
#'
#' @inheritParams xml2::read_html
#' @param x Usually a string representing a URL. See [xml2::read_html()] for
#'   other options.
#' @rdname read_html
#' @importFrom xml2 read_html
#' @export
#' @examples
#' # Start by reading a HTML page with read_html():
#' starwars <- read_html("https://rvest.tidyverse.org/articles/starwars.html")
#'
#' # Then find elements that match a css selector or XPath expression
#' # using html_elements(). In this example, each
      <section> corresponds
#' # to a different film
#' films <- starwars %>% html_elements("section")
#' films
#'
#' # Then use html_element() to extract one element per film. Here
#' # the title is given by the text inside <h2>
      #' title <- films %>% #' html_element("h2") %>% #' html_text2() #' title #' #' # Or use html_attr() to get data out of attributes. html_attr() always #' # returns a string so we convert it to an integer using a readr function #' episode <- films %>% #' html_element("h2") %>% #' html_attr("data-id") %>% #' readr::parse_integer() #' episode xml2::read_html #' @importFrom xml2 url_absolute #' @export xml2::url_absolute #' @export #' @importFrom magrittr %>% magrittr::`%>%` # The following block is used by usethis to automatically manage # roxygen namespace tags. Modify with care! ## usethis namespace: start #' @importFrom glue glue ## usethis namespace: end NULL the <- new_environment() rvest/R/import-standalone-types-check.R0000644000176200001440000002761614554042640017636 0ustar liggesusers# Standalone file: do not edit by hand # Source: # ---------------------------------------------------------------------- # # --- # repo: r-lib/rlang # file: standalone-types-check.R # last-updated: 2023-03-13 # license: https://unlicense.org # dependencies: standalone-obj-type.R # imports: rlang (>= 1.1.0) # --- # # ## Changelog # # 2023-03-13: # - Improved error messages of number checkers (@teunbrand) # - Added `allow_infinite` argument to `check_number_whole()` (@mgirlich). # - Added `check_data_frame()` (@mgirlich). # # 2023-03-07: # - Added dependency on rlang (>= 1.1.0). # # 2023-02-15: # - Added `check_logical()`. # # - `check_bool()`, `check_number_whole()`, and # `check_number_decimal()` are now implemented in C. # # - For efficiency, `check_number_whole()` and # `check_number_decimal()` now take a `NULL` default for `min` and # `max`. This makes it possible to bypass unnecessary type-checking # and comparisons in the default case of no bounds checks. # # 2022-10-07: # - `check_number_whole()` and `_decimal()` no longer treat # non-numeric types such as factors or dates as numbers. Numeric # types are detected with `is.numeric()`. 
# # 2022-10-04: # - Added `check_name()` that forbids the empty string. # `check_string()` allows the empty string by default. # # 2022-09-28: # - Removed `what` arguments. # - Added `allow_na` and `allow_null` arguments. # - Added `allow_decimal` and `allow_infinite` arguments. # - Improved errors with absent arguments. # # # 2022-09-16: # - Unprefixed usage of rlang functions with `rlang::` to # avoid onLoad issues when called from rlang (#1482). # # 2022-08-11: # - Added changelog. # # nocov start # Scalars ----------------------------------------------------------------- .standalone_types_check_dot_call <- .Call check_bool <- function(x, ..., allow_na = FALSE, allow_null = FALSE, arg = caller_arg(x), call = caller_env()) { if (!missing(x) && .standalone_types_check_dot_call(ffi_standalone_is_bool_1.0.7, x, allow_na, allow_null)) { return(invisible(NULL)) } stop_input_type( x, c("`TRUE`", "`FALSE`"), ..., allow_na = allow_na, allow_null = allow_null, arg = arg, call = call ) } check_string <- function(x, ..., allow_empty = TRUE, allow_na = FALSE, allow_null = FALSE, arg = caller_arg(x), call = caller_env()) { if (!missing(x)) { is_string <- .rlang_check_is_string( x, allow_empty = allow_empty, allow_na = allow_na, allow_null = allow_null ) if (is_string) { return(invisible(NULL)) } } stop_input_type( x, "a single string", ..., allow_na = allow_na, allow_null = allow_null, arg = arg, call = call ) } .rlang_check_is_string <- function(x, allow_empty, allow_na, allow_null) { if (is_string(x)) { if (allow_empty || !is_string(x, "")) { return(TRUE) } } if (allow_null && is_null(x)) { return(TRUE) } if (allow_na && (identical(x, NA) || identical(x, na_chr))) { return(TRUE) } FALSE } check_name <- function(x, ..., allow_null = FALSE, arg = caller_arg(x), call = caller_env()) { if (!missing(x)) { is_string <- .rlang_check_is_string( x, allow_empty = FALSE, allow_na = FALSE, allow_null = allow_null ) if (is_string) { return(invisible(NULL)) } } stop_input_type( x, "a 
valid name", ..., allow_na = FALSE, allow_null = allow_null, arg = arg, call = call ) } IS_NUMBER_true <- 0 IS_NUMBER_false <- 1 IS_NUMBER_oob <- 2 check_number_decimal <- function(x, ..., min = NULL, max = NULL, allow_infinite = TRUE, allow_na = FALSE, allow_null = FALSE, arg = caller_arg(x), call = caller_env()) { if (missing(x)) { exit_code <- IS_NUMBER_false } else if (0 == (exit_code <- .standalone_types_check_dot_call( ffi_standalone_check_number_1.0.7, x, allow_decimal = TRUE, min, max, allow_infinite, allow_na, allow_null ))) { return(invisible(NULL)) } .stop_not_number( x, ..., exit_code = exit_code, allow_decimal = TRUE, min = min, max = max, allow_na = allow_na, allow_null = allow_null, arg = arg, call = call ) } check_number_whole <- function(x, ..., min = NULL, max = NULL, allow_infinite = FALSE, allow_na = FALSE, allow_null = FALSE, arg = caller_arg(x), call = caller_env()) { if (missing(x)) { exit_code <- IS_NUMBER_false } else if (0 == (exit_code <- .standalone_types_check_dot_call( ffi_standalone_check_number_1.0.7, x, allow_decimal = FALSE, min, max, allow_infinite, allow_na, allow_null ))) { return(invisible(NULL)) } .stop_not_number( x, ..., exit_code = exit_code, allow_decimal = FALSE, min = min, max = max, allow_na = allow_na, allow_null = allow_null, arg = arg, call = call ) } .stop_not_number <- function(x, ..., exit_code, allow_decimal, min, max, allow_na, allow_null, arg, call) { if (allow_decimal) { what <- "a number" } else { what <- "a whole number" } if (exit_code == IS_NUMBER_oob) { min <- min %||% -Inf max <- max %||% Inf if (min > -Inf && max < Inf) { what <- sprintf("%s between %s and %s", what, min, max) } else if (x < min) { what <- sprintf("%s larger than or equal to %s", what, min) } else if (x > max) { what <- sprintf("%s smaller than or equal to %s", what, max) } else { abort("Unexpected state in OOB check", .internal = TRUE) } } stop_input_type( x, what, ..., allow_na = allow_na, allow_null = allow_null, arg = arg, call = 
call ) } check_symbol <- function(x, ..., allow_null = FALSE, arg = caller_arg(x), call = caller_env()) { if (!missing(x)) { if (is_symbol(x)) { return(invisible(NULL)) } if (allow_null && is_null(x)) { return(invisible(NULL)) } } stop_input_type( x, "a symbol", ..., allow_na = FALSE, allow_null = allow_null, arg = arg, call = call ) } check_arg <- function(x, ..., allow_null = FALSE, arg = caller_arg(x), call = caller_env()) { if (!missing(x)) { if (is_symbol(x)) { return(invisible(NULL)) } if (allow_null && is_null(x)) { return(invisible(NULL)) } } stop_input_type( x, "an argument name", ..., allow_na = FALSE, allow_null = allow_null, arg = arg, call = call ) } check_call <- function(x, ..., allow_null = FALSE, arg = caller_arg(x), call = caller_env()) { if (!missing(x)) { if (is_call(x)) { return(invisible(NULL)) } if (allow_null && is_null(x)) { return(invisible(NULL)) } } stop_input_type( x, "a defused call", ..., allow_na = FALSE, allow_null = allow_null, arg = arg, call = call ) } check_environment <- function(x, ..., allow_null = FALSE, arg = caller_arg(x), call = caller_env()) { if (!missing(x)) { if (is_environment(x)) { return(invisible(NULL)) } if (allow_null && is_null(x)) { return(invisible(NULL)) } } stop_input_type( x, "an environment", ..., allow_na = FALSE, allow_null = allow_null, arg = arg, call = call ) } check_function <- function(x, ..., allow_null = FALSE, arg = caller_arg(x), call = caller_env()) { if (!missing(x)) { if (is_function(x)) { return(invisible(NULL)) } if (allow_null && is_null(x)) { return(invisible(NULL)) } } stop_input_type( x, "a function", ..., allow_na = FALSE, allow_null = allow_null, arg = arg, call = call ) } check_closure <- function(x, ..., allow_null = FALSE, arg = caller_arg(x), call = caller_env()) { if (!missing(x)) { if (is_closure(x)) { return(invisible(NULL)) } if (allow_null && is_null(x)) { return(invisible(NULL)) } } stop_input_type( x, "an R function", ..., allow_na = FALSE, allow_null = allow_null, arg = 
arg, call = call ) } check_formula <- function(x, ..., allow_null = FALSE, arg = caller_arg(x), call = caller_env()) { if (!missing(x)) { if (is_formula(x)) { return(invisible(NULL)) } if (allow_null && is_null(x)) { return(invisible(NULL)) } } stop_input_type( x, "a formula", ..., allow_na = FALSE, allow_null = allow_null, arg = arg, call = call ) } # Vectors ----------------------------------------------------------------- check_character <- function(x, ..., allow_null = FALSE, arg = caller_arg(x), call = caller_env()) { if (!missing(x)) { if (is_character(x)) { return(invisible(NULL)) } if (allow_null && is_null(x)) { return(invisible(NULL)) } } stop_input_type( x, "a character vector", ..., allow_na = FALSE, allow_null = allow_null, arg = arg, call = call ) } check_logical <- function(x, ..., allow_null = FALSE, arg = caller_arg(x), call = caller_env()) { if (!missing(x)) { if (is_logical(x)) { return(invisible(NULL)) } if (allow_null && is_null(x)) { return(invisible(NULL)) } } stop_input_type( x, "a logical vector", ..., allow_na = FALSE, allow_null = allow_null, arg = arg, call = call ) } check_data_frame <- function(x, ..., allow_null = FALSE, arg = caller_arg(x), call = caller_env()) { if (!missing(x)) { if (is.data.frame(x)) { return(invisible(NULL)) } if (allow_null && is_null(x)) { return(invisible(NULL)) } } stop_input_type( x, "a data frame", ..., allow_null = allow_null, arg = arg, call = call ) } # nocov end rvest/R/text.R0000644000176200001440000001502614554042640012735 0ustar liggesusers#' Get element text #' #' @description #' There are two ways to retrieve text from a element: `html_text()` and #' `html_text2()`. `html_text()` is a thin wrapper around [xml2::xml_text()] #' which returns just the raw underlying text. `html_text2()` simulates how #' text looks in a browser, using an approach inspired by JavaScript's #' [innerText()](https://developer.mozilla.org/en-US/docs/Web/API/HTMLElement/innerText). #' Roughly speaking, it converts `
      <br>` to `"\n"`, adds blank lines
#' around `<p>` tags, and lightly formats tabular data.
#'
#' `html_text2()` is usually what you want, but it is much slower than
#' `html_text()` so for simple applications where performance is important
#' you may want to use `html_text()` instead.
#'
#' @inheritParams xml2::xml_text
#' @importFrom xml2 xml_text
#' @return A character vector the same length as `x`
#' @examples
#' # To understand the difference between html_text() and html_text2()
#' # take the following html:
#'
#' html <- minimal_html(
#'   "

      <p>This is a paragraph.
#'     This another sentence.<br>
      This should start on a new line" #' ) #' #' # html_text() returns the raw underlying text, which includes whitespace #' # that would be ignored by a browser, and ignores the
      <br>
#' html %>% html_element("p") %>% html_text() %>% writeLines()
#'
#' # html_text2() simulates what a browser would display. Non-significant
#' # whitespace is collapsed, and <br> is turned into a line break
#' html %>% html_element("p") %>% html_text2() %>% writeLines()
#'
#' # By default, html_text2() also converts non-breaking spaces to regular
#' # spaces:
#' html <- minimal_html("

      <p>x&nbsp;y</p>

      #' ")
#' x1 <- html %>% html_element("p") %>% html_text()
#' x2 <- html %>% html_element("p") %>% html_text2()
#'
#' # When printed, non-breaking spaces look exactly like regular spaces
#' x1
#' x2
#' # But aren't actually the same:
#' x1 == x2
#' # Which you can confirm by looking at their underlying binary
#' # representation:
#' charToRaw(x1)
#' charToRaw(x2)
#' @export
html_text <- function(x, trim = FALSE) {
  check_bool(trim)
  xml_text(x, trim = trim)
}

#' @export
#' @rdname html_text
#' @param preserve_nbsp Should non-breaking spaces be preserved? By default,
#'   `html_text2()` converts `&nbsp;` to ordinary spaces to ease further
#'   computation. When `preserve_nbsp` is `TRUE`, `&nbsp;` will appear in
#'   strings as `"\ua0"`. This often causes confusion because it prints the
#'   same way as `" "`.
html_text2 <- function(x, preserve_nbsp = FALSE) {
  check_bool(preserve_nbsp)
  UseMethod("html_text2")
}

#' @export
html_text2.xml_document <- function(x, preserve_nbsp = FALSE) {
  # Only the first <body> found in the document is rendered.
  html_text2(
    xml2::xml_find_first(x, ".//body"),
    preserve_nbsp = preserve_nbsp
  )
}

#' @export
html_text2.xml_nodeset <- function(x, preserve_nbsp = FALSE) {
  # One string per node.
  vapply(
    x,
    function(node) html_text2(node, preserve_nbsp = preserve_nbsp),
    character(1)
  )
}

#' @export
html_text2.xml_node <- function(x, preserve_nbsp = FALSE) {
  # Accumulate the rendered pieces in a PaddedText buffer, then join them.
  buffer <- PaddedText$new()
  html_text_block(x, buffer, preserve_nbsp = preserve_nbsp)
  buffer$output()
}

#' @export
html_text2.xml_missing <- function(x, preserve_nbsp = FALSE) {
  NA_character_
}

# Algorithm roughly inspired by
# https://html.spec.whatwg.org/multipage/dom.html#the-innertext-idl-attribute
# but following details in
# https://developer.mozilla.org/en-US/docs/Web/API/Document_Object_Model/Whitespace#How_does_CSS_process_whitespace
#
# Walks `x` and appends its rendered text to `text` (a PaddedText buffer):
# text nodes and inline elements are added directly, anything else is
# processed child by child with margins around each child.
html_text_block <- function(x, text, preserve_nbsp = FALSE) {
  if (xml2::xml_type(x) == "text") {
    text$add_text(collapse_whitespace(xml2::xml_text(x), preserve_nbsp))
  } else if (is_inline(x)) {
    text$add_text(html_text_inline(x, preserve_nbsp))
  } else {
    nodes <- xml2::xml_contents(x)
    last <- length(nodes)

    for (i in seq_along(nodes)) {
      node <- nodes[[i]]
      tag <- xml2::xml_name(node)
      gap <- tag_margin(tag)

      text$add_margin(gap)
      html_text_block(node, text, preserve_nbsp = preserve_nbsp)

      # Rows are separated by newlines and cells by tabs, but no separator
      # is added after the final child.
      switch(tag,
        tr = if (i != last) text$add_text("\n"),
        th = ,
        td = if (i != last) text$add_text("\t"),
        br = text$add_text("\n")
      )
      text$add_margin(gap)
    }
  }
}

# An element is inline when none of its direct element children is a block
# or table tag.
is_inline <- function(x) {
  child_tags <- xml2::xml_name(xml2::xml_children(x))
  !any(child_tags %in% c(block_tag, table_tag))
}

block_tag <- c(
  # https://developer.mozilla.org/en-US/docs/Web/HTML/Block-level_elements
  "address",
  "article",
  "aside",
  "blockquote",
  "details",
  "dialog",
  "dd",
  "div",
  "dl",
  "dt",
  "fieldset",
  "figcaption",
  "figure",
  "footer",
  "form",
  "h1", "h2", "h3", "h4", "h5", "h6",
  "header",
  "hgroup",
  "hr",
  "li",
  "main",
  "nav",
  "ol",
  "p",
  "pre",
  "section",
  "table",
  "ul",
  "caption"
)
table_tag <- c("tr", "td", "th")

# Number of line breaks to place around a tag: 2 for <p>, 1 for any other
# block tag, 0 otherwise.
tag_margin <- function(name) {
  # + caption
  if (name == "p") {
    return(2L)
  }
  if (name %in% block_tag) 1L else 0L
}

# Render an inline element as a single string, ending each <br>-terminated
# run of children with "\n".
html_text_inline <- function(x, preserve_nbsp = FALSE) {
  nodes <- xml2::xml_contents(x)
  count <- length(nodes)

  if (count == 0) {
    return("")
  }

  is_break <- xml2::xml_name(nodes) == "br"
  # The line id increases on the child that follows each <br>.
  line_id <- cumsum(c(TRUE, is_break[-count]))

  pieces <- split(xml2::xml_text(nodes), line_id)
  lines <- vapply(pieces, paste0, collapse = "", FUN.VALUE = character(1))
  # Whitespace inside <pre> is left untouched.
  if (xml2::xml_name(x) != "pre") {
    lines <- collapse_whitespace(lines, preserve_nbsp)
  }

  ends_with_break <- unname(tapply(is_break, line_id, any))
  paste0(lines, ifelse(ends_with_break, "\n", ""), collapse = "")
}

# https://drafts.csswg.org/css-text/#white-space-phase-1
collapse_whitespace <- function(x, preserve_nbsp = FALSE) {
  # Remove leading and trailing whitespace
  trimmed <- gsub("(^[ \t\n]+)|([ \t\n]+$)", "", x, perl = TRUE)

  # Convert any whitespace sequence to a space
  pattern <- if (preserve_nbsp) "[\t\n ]+" else "[\t\n \u00a0]+"
  gsub(pattern, " ", trimmed, perl = TRUE)
}

# Text with line break padding in between
# blocks, collapsing breaks
# similarly to css margin collapsing rules
PaddedText <- R6::R6Class("PaddedText", list(
  # Pieces of output, in order; `i` is the next free slot and `lines` the
  # number of pending line breaks not yet written.
  text = character(),
  lines = 0,
  i = 1L,

  add_margin = function(n) {
    # Don't add breaks before encountering text
    if (self$i == 1) {
      return()
    }

    # Adjacent margins collapse to the largest one rather than adding up.
    self$lines <- max(self$lines, n)
  },

  convert_breaks = function() {
    if (self$lines == 0) {
      return()
    }

    self$text[[self$i]] <- strrep("\n", self$lines)
    # Keep the counter an integer, as add_text() does.
    self$i <- self$i + 1L
    self$lines <- 0
  },

  add_text = function(x) {
    # Ignore empty strings
    if (identical(x, "")) {
      return()
    }

    # Flush any pending margin before the new text.
    self$convert_breaks()
    self$text[[self$i]] <- x
    self$i <- self$i + 1L
  },

  output = function() {
    paste(self$text, collapse = "")
  }
))
rvest/R/session.R0000644000176200001440000001672314554042640013441 0ustar liggesusers
#' Simulate a session in web browser
#'
#' @description
#' This set of functions allows you to simulate a user interacting with a
#' website, using forms and navigating from page to page.
#'
#' * Create a session with `session(url)`
#' * Navigate to a specified url with `session_jump_to()`, or follow a link on the
#'   page with `session_follow_link()`.
#' * Submit an [html_form] with `session_submit()`.
#' * View the history with `session_history()` and navigate back and forward
#'   with `session_back()` and `session_forward()`.
#' * Extract page contents with [html_element()] and [html_elements()], or get the
#'   complete HTML document with [read_html()].
#' * Inspect the HTTP response with [httr::cookies()], [httr::headers()],
#'   and [httr::status_code()].
#'
#' @param url For `session()` location to start, for `session_jump_to()`
#'   location to go to next.
#' @param ... Any additional httr config to use throughout the session.
#' @param x An object to test to see if it's a session.
#' @export
#' @examples
#' s <- session("http://hadley.nz")
#' s %>%
#'   session_jump_to("hadley-wickham.jpg") %>%
#'   session_jump_to("/") %>%
#'   session_history()
#'
#' s %>%
#'   session_jump_to("hadley-wickham.jpg") %>%
#'   session_back() %>%
#'   session_history()
#'
#' \donttest{
#' s %>%
#'   session_follow_link(css = "p a") %>%
#'   html_elements("p")
#' }
session <- function(url, ...) {
  check_string(url)

  session <- structure(
    list(
      handle   = httr::handle(url),
      config   = c(..., httr::config(autoreferer = 1L)),
      response = NULL,
      url      = NULL,
      back     = character(),
      forward  = character(),
      cache    = new_environment()
    ),
    class = "rvest_session"
  )
  # The first request fills in `response` and `url`.
  session_get(session, url)
}

#' @export
#' @rdname session
is.session <- function(x) inherits(x, "rvest_session")

#' @export
print.rvest_session <- function(x, ...) {
  # Label the first line so the output identifies the object being printed.
  cat("<session> ", x$url, "\n", sep = "")
  cat(" Status: ", httr::status_code(x), "\n", sep = "")
  cat(" Type: ", httr::headers(x)$`Content-Type`, "\n", sep = "")
  cat(" Size: ", length(x$response$content), "\n", sep = "")
  invisible(x)
}

# Perform a GET with the session's config and handle, then store the result
# in the session. Does not touch the back/forward history.
session_get <- function(x, url, ...) {
  resp <- httr::GET(url, x$config, ..., handle = x$handle)
  session_set_response(x, resp)
}

# Record `response` as the session's current page. HTTP error statuses only
# produce a warning; the cache is replaced by a fresh environment so a
# previously parsed document is not reused.
session_set_response <- function(x, response) {
  httr::warn_for_status(response)

  x$response <- response
  x$url <- response$url
  x$cache <- new_environment()
  x
}

#' @param x A session.
#' @param url A URL, either relative or absolute, to navigate to.
#' @export
#' @rdname session
session_jump_to <- function(x, url, ...) {
  check_session(x)
  check_string(url)

  # Relative URLs are resolved against the current page.
  url <- xml2::url_absolute(url, x$url)
  last_url <- x$url

  x <- session_get(x, url, ...)
  x$back <- c(last_url, x$back)
  # Navigating somewhere new discards the forward history.
  x$forward <- character()
  x
}

#' @param i An integer to select the ith link or a string to match the
#'   first link containing that text (case sensitive).
#' @inheritParams html_element
#' @export
#' @rdname session
session_follow_link <- function(x, i, css, xpath, ...) {
  check_session(x)

  url <- find_href(x, i = i, css = css, xpath = xpath)
  cli::cli_inform("Navigating to {.url {url}}.")
  session_jump_to(x, url, ...)
}

# Return the `href` attribute of one link on the current page.
#
# Exactly one of `i`, `css` and `xpath` must be supplied. A numeric `i`
# selects the ith <a>; a string `i` selects the first <a> whose text contains
# it (fixed, case-sensitive match); `css`/`xpath` select the first match.
# Aborts when nothing matches or `i` has the wrong type.
find_href <- function(x, i, css, xpath, error_call = caller_env()) {
  check_exclusive(i, css, xpath, .call = error_call)

  if (!missing(i)) {
    a <- html_elements(x, "a")

    if (is.numeric(i) && length(i) == 1) {
      out <- a[[i]]
    } else if (is.character(i) && length(i) == 1) {
      text <- html_text(a)
      match <- grepl(i, text, fixed = TRUE)

      if (!any(match)) {
        cli::cli_abort("No links have text {.str {i}}.", call = error_call)
      }

      out <- a[[which(match)[[1]]]]
    } else {
      cli::cli_abort("{.arg i} must be a string or integer.", call = error_call)
    }
  } else {
    a <- html_elements(x, css = css, xpath = xpath)
    if (length(a) == 0) {
      cli::cli_abort("No links matched `css`/`xpath`", call = error_call)
    }
    out <- a[[1]]
  }

  html_attr(out, "href")
}

#' @export
#' @rdname session
session_back <- function(x) {
  check_session(x)

  if (length(x$back) == 0) {
    cli::cli_abort("Can't go back any further.")
  }

  # Pop the most recent entry off `back`; the page we leave goes onto
  # `forward`.
  url <- x$back[[1]]
  x$back <- x$back[-1]
  old_url <- x$url

  x <- session_get(x, url)
  x$forward <- c(old_url, x$forward)
  x
}

#' @export
#' @rdname session
session_forward <- function(x) {
  check_session(x)

  if (length(x$forward) == 0) {
    cli::cli_abort("Can't go forward any further.")
  }

  # Mirror image of session_back().
  url <- x$forward[[1]]
  old_url <- x$url

  x <- session_get(x, url)
  x$forward <- x$forward[-1]
  x$back <- c(old_url, x$back)
  x
}

#' @export
#' @rdname session
session_history <- function(x) {
  check_session(x)

  urls <- c(rev(x$back), x$url, x$forward)

  # The current page is marked with "- "; the other rows get a prefix of
  # the same width so the URLs line up.
  prefix <- rep(c("  ", "- ", "  "), c(length(x$back), 1, length(x$forward)))
  cat_line(prefix, urls)
}

# form --------------------------------------------------------------------

#' @param form An [html_form] to submit
#' @inheritParams html_form_submit
#' @rdname session
#' @export
session_submit <- function(x, form, submit = NULL, ...) {
  check_session(x)
  check_form(form)

  subm <- submission_build(form, submit)
  resp <- submission_submit(subm, x$config, ..., handle = x$handle)
  session_set_response(x, resp)
}

# xml2 methods ------------------------------------------------------------

#' @importFrom xml2 read_html
#' @export
read_html.rvest_session <- function(x, ...) {
  if (!is_html(x$response)) {
    cli::cli_abort("Page doesn't appear to be html.")
  }

  # Stored under "html" in the session cache.
  env_cache(x$cache, "html", read_html(x$response, ..., base_url = x$url))
}

# TRUE when the response's Content-Type header is an HTML or XHTML media
# type; FALSE when the header is absent.
is_html <- function(x) {
  type <- httr::headers(x)$`Content-Type`
  if (is.null(type)) return(FALSE)

  parsed <- httr::parse_media(type)
  parsed$complete %in% c("text/html", "application/xhtml+xml")
}

# rvest methods -----------------------------------------------------------------

#' @export
html_form.rvest_session <- function(x, base_url = NULL) {
  html_form(read_html(x), base_url = base_url)
}

#' @export
html_table.rvest_session <- function(x,
                                     header = NA,
                                     trim = TRUE,
                                     fill = deprecated(),
                                     dec = ".",
                                     na.strings = "NA",
                                     convert = TRUE) {
  html_table(
    read_html(x),
    header = header,
    trim = trim,
    fill = fill,
    dec = dec,
    na.strings = na.strings,
    convert = convert
  )
}

#' @export
html_element.rvest_session <- function(x, css, xpath) {
  html_element(read_html(x), css, xpath)
}

#' @export
html_elements.rvest_session <- function(x, css, xpath) {
  html_elements(read_html(x), css, xpath)
}

# httr methods -----------------------------------------------------------------

#' @importFrom httr status_code
#' @export
status_code.rvest_session <- function(x) {
  status_code(x$response)
}

#' @importFrom httr headers
#' @export
headers.rvest_session <- function(x) {
  headers(x$response)
}

#' @importFrom httr cookies
#' @export
cookies.rvest_session <- function(x) {
  cookies(x$response)
}

# helpers -----------------------------------------------------------------

# Abort, reporting against `call`, unless `x` is an `rvest_form`.
check_form <- function(x, call = caller_env()) {
  if (!inherits(x, "rvest_form")) {
    cli::cli_abort(
      "{.arg form} must be a single form produced by {.fn html_form}.",
      call = call
    )
  }
}

# Abort, reporting against `call`, unless `x` is an `rvest_session`.
check_session <- function(x, call = caller_env()) {
  if (!inherits(x, "rvest_session")) {
    cli::cli_abort("{.arg x} must be produced by {.fn session}.", call = call)
  }
}
rvest/R/encoding.R0000644000176200001440000000421514554042640013535 0ustar liggesusers
#' Guess faulty character encoding
#'
#' `html_encoding_guess()` helps you handle web pages that declare an incorrect
#' encoding. Use `html_encoding_guess()` to generate a list of possible
#' encodings, then try each out by using `encoding` argument of `read_html()`.
#' `html_encoding_guess()` replaces the deprecated `guess_encoding()`.
#'
#' @param x A character vector.
#' @export
#' @examples
#' # A file with bad encoding included in the package
#' path <- system.file("html-ex", "bad-encoding.html", package = "rvest")
#' x <- read_html(path)
#' x %>% html_elements("p") %>% html_text()
#'
#' html_encoding_guess(x)
#' # Two valid encodings, only one of which is correct
#' read_html(path, encoding = "ISO-8859-1") %>% html_elements("p") %>% html_text()
#' read_html(path, encoding = "ISO-8859-2") %>% html_elements("p") %>% html_text()
html_encoding_guess <- function(x) {
  check_installed("stringi")

  # All elements of `x` are pasted together and detected as one string.
  guess <- stringi::stri_enc_detect(paste(x, collapse = ""))

  df <- as.data.frame(guess[[1]], stringsAsFactors = FALSE)
  names(df) <- tolower(names(df))
  df
}

#' @export
#' @rdname html_encoding_guess
#' @usage NULL
guess_encoding <- function(x) {
  lifecycle::deprecate_warn("1.0.0", "guess_encoding()", "html_encoding_guess()")
  html_encoding_guess(x)
}

#' Repair faulty encoding
#'
#' `r lifecycle::badge("deprecated")`
#' This function has been deprecated because it doesn't work. Instead
#' re-read the HTML file with correct `encoding` argument.
#'
#' @export
#' @keywords internal
#' @param from The encoding that the string is actually in. If `NULL`,
#'   `guess_encoding` will be used.
repair_encoding <- function(x, from = NULL) {
  # Name the function the user actually called; "html_encoding_repair()"
  # does not exist.
  lifecycle::deprecate_warn("1.0.0", "repair_encoding()",
    details = "Instead, re-load using the `encoding` argument of `read_html()`"
  )

  check_installed("stringi")

  if (is.null(from)) {
    # Take the first row of html_encoding_guess() as the best guess.
    best_guess <- html_encoding_guess(x)[1, , drop = FALSE]
    from <- best_guess$encoding
    conf <- best_guess$confidence * 100

    # `isTRUE()` also rejects a missing or empty confidence, which would
    # otherwise make `if` fail with an unrelated error.
    if (!isTRUE(conf >= 50)) {
      cli::cli_abort("No guess has more than 50% confidence")
    }

    inform(paste0("Best guess: ", from, " (", conf, "% confident)"))
  }

  # Convert from the assumed source encoding.
  stringi::stri_conv(x, from)
}
rvest/R/form.R0000644000176200001440000002214514554042640012714 0ustar liggesusers
#' Parse forms and set values
#'
#' Use `html_form()` to extract a form, set values with `html_form_set()`,
#' and submit it with `html_form_submit()`.
#'
#' @export
#' @inheritParams html_name
#' @param base_url Base url of underlying HTML document. The default, `NULL`,
#'   uses the url of the HTML document underlying `x`.
#' @seealso HTML 4.01 form specification:
#'
#' @return
#' * `html_form()` returns an S3 object with class `rvest_form` when applied
#'   to a single element. It returns a list of `rvest_form` objects when
#'   applied to multiple elements or a document.
#'
#' * `html_form_set()` returns an `rvest_form` object.
#'
#' * `html_form_submit()` submits the form, returning an httr response which
#'   can be parsed with [read_html()].
#' @examples
#' html <- read_html("http://www.google.com")
#' search <- html_form(html)[[1]]
#'
#' search <- search %>% html_form_set(q = "My little pony", hl = "fr")
#'
#' # Or if you have a list of values, use !!!
#' vals <- list(q = "web scraping", hl = "en")
#' search <- search %>% html_form_set(!!!vals)
#'
#' # To submit and get result:
#' \dontrun{
#' resp <- html_form_submit(search)
#' read_html(resp)
#' }
html_form <- function(x, base_url = NULL) UseMethod("html_form")

#' @export
html_form.xml_document <- function(x, base_url = NULL) {
  # Collect every form node in the document and defer to the nodeset method.
  html_form(xml2::xml_find_all(x, ".//form"), base_url = base_url)
}

#' @export
html_form.xml_nodeset <- function(x, base_url = NULL) {
  # One parsed form per node; the result is a plain list.
  lapply(x, html_form, base_url = base_url)
}

#' @export
html_form.xml_node <- function(x, base_url = NULL) {
  if (xml2::xml_name(x) != "form") {
    cli::cli_abort("{.arg x} must be a <form> element.")
  }
  check_string(base_url, allow_null = TRUE)

  attr <- as.list(xml2::xml_attrs(x))
  name <- attr$id %||% attr$name %||% "" # for human readers
  method <- toupper(attr$method %||% "GET")
  enctype <- convert_enctype(attr$enctype)

  # Parse each supported control with the parser matching its tag name.
  nodes <- html_elements(x, "input, select, textarea, button")
  fields <- lapply(nodes, function(x) {
    switch(xml2::xml_name(x),
      textarea = parse_textarea(x),
      input = parse_input(x),
      select = parse_select(x),
      button = parse_button(x)
    )
  })
  # Controls without a `name` attribute get an empty name.
  names(fields) <- map_chr(fields, function(x) x$name %||% "")

  structure(
    list(
      name = name,
      method = method,
      # Resolve the action against the supplied base url, falling back to the
      # url recorded on the node itself.
      action = xml2::url_absolute(attr$action, base_url %||% xml2::xml_url(x)),
      enctype = enctype,
      fields = fields
    ),
    class = "rvest_form"
  )
}

#' @export
print.rvest_form <- function(x, ...) {
  cat("<form> '", x$name, "' (", x$method, " ", x$action, ")\n", sep = "")
  cat(format_list(x$fields, indent = 1), "\n", sep = "")
  # Return the input invisibly, as `print.rvest_field()` does.
  invisible(x)
}

# set ----------------------------------------------------------------

#' @rdname html_form
#' @param form A form
#' @param ... <[`dynamic-dots`][rlang::dyn-dots]> Name-value pairs giving
#'   fields to modify.
#'
#'   Provide a character vector to set multiple checkboxes in a set or
#'   select multiple values from a multi-select.
#' @export
html_form_set <- function(form, ...) {
  check_form(form)

  new_values <- list2(...)
  # Errors if any supplied name is not an existing field.
  check_fields(form, new_values)

  for (field in names(new_values)) {
    type <- form$fields[[field]]$type %||% "non-input"
    if (type == "hidden") {
      # Allowed, but worth flagging.
      cli::cli_warn("Setting value of hidden field {.str {field}}.")
    } else if (type == "submit") {
      cli::cli_abort("Can't change value of input with type submit: {.str {field}}.")
    }

    form$fields[[field]]$value <- new_values[[field]]
  }

  form
}

# submit ------------------------------------------------------------------

#' @rdname html_form
#' @param submit Which button should be used to submit the form?
#'   * `NULL`, the default, uses the first button.
#'   * A string selects a button by its name.
#'   * A number selects a button using its relative position.
#' @export
html_form_submit <- function(form, submit = NULL) {
  check_form(form)

  subm <- submission_build(form, submit)
  submission_submit(subm)
}

# Turn a form into the pieces of an HTTP request: method, encoding, target
# url and the name-value pairs to send.
submission_build <- function(form, submit, error_call = caller_env()) {
  method <- form$method
  if (!(method %in% c("POST", "GET"))) {
    cli::cli_warn("Invalid method ({method}), defaulting to GET.", call = error_call)
    method <- "GET"
  }

  if (length(form$action) == 0) {
    cli::cli_abort("`form` doesn't contain a `action` attribute.", call = error_call)
  }

  list(
    method = method,
    enctype = form$enctype,
    action = form$action,
    values = submission_build_values(form, submit, error_call = error_call)
  )
}

# Perform the request described by `submission_build()`; `...` is forwarded
# to the httr verb.
submission_submit <- function(x, ...) {
  if (x$method == "POST") {
    httr::POST(url = x$action, body = x$values, encode = x$enctype, ...)
  } else {
    httr::GET(url = x$action, query = x$values, ...)
  }
}

# Build the named list of values to send: every non-button field plus the
# single button chosen by `submit`.
submission_build_values <- function(form, submit = NULL, error_call = caller_env()) {
  fields <- form$fields
  submit <- submission_find_submit(fields, submit, error_call = error_call)
  entry_list <- c(Filter(Negate(is_button), fields), list(submit))
  # Entries without a name are dropped.
  entry_list <- Filter(function(x) !is.null(x$name), entry_list)

  if (length(entry_list) == 0) {
    return(list())
  }

  values <- lapply(entry_list, function(x) as.character(x$value))
  names <- map_chr(entry_list, "[[", "name")

  # A field holding several values contributes one entry per value, each
  # under the same name; a field with no value contributes nothing.
  out <- set_names(unlist(values, use.names = FALSE), rep(names, lengths(values)))
  as.list(out)
}

# Select the submitting button: `NULL` picks the first one (or an empty list
# when the form has no buttons), a number picks by position, a string by name.
submission_find_submit <- function(fields, idx, error_call = caller_env()) {
  buttons <- Filter(is_button, fields)

  if (is.null(idx)) {
    if (length(buttons) == 0) {
      list()
    } else {
      if (length(buttons) > 1) {
        cli::cli_inform("Submitting with button {.str {buttons[[1]]$name}}.")
      }
      buttons[[1]]
    }
  } else if (is.numeric(idx) && length(idx) == 1) {
    # `is.na()` first: an NA index would otherwise make `||` fail with a
    # base R error instead of the informative one below.
    if (is.na(idx) || idx < 1 || idx > length(buttons)) {
      cli::cli_abort("Numeric {.arg submit} out of range.", call = error_call)
    }
    buttons[[idx]]
  } else if (is.character(idx) && length(idx) == 1) {
    if (!idx %in% names(buttons)) {
      cli::cli_abort(
        c(
          "No <button> found with name {.str {idx}}.",
          i = "Possible values: {.str {names(buttons)}}."
        ),
        call = error_call
      )
    }
    buttons[[idx]]
  } else {
    cli::cli_abort(
      "{.arg submit} must be NULL, a string, or a number.",
      call = error_call
    )
  }
}

# Is this field one of the button types? The comparison is case-insensitive.
is_button <- function(x) {
  tolower(x$type) %in% c("submit", "image", "button")
}

# Field parsing -----------------------------------------------------------

# Constructor for a single form field; extra components (e.g. `options` for
# a select) are passed through `...`.
rvest_field <- function(type, name, value, attr, ...) {
  structure(
    list(
      type = type,
      name = name,
      value = value,
      attr = attr,
      ...
    ),
    class = "rvest_field"
  )
}

#' @export
format.rvest_field <- function(x, ...) {
  if (x$type == "password") {
    # Mask the password: one "*" per character.
    value <- paste0(rep("*", nchar(x$value %||% "")), collapse = "")
  } else {
    value <- paste(x$value, collapse = ", ")
    value <- str_trunc(encodeString(value), 20)
  }

  paste0("<field> (", x$type, ") ", x$name, ": ", value)
}

#' @export
print.rvest_field <- function(x, ...) {
  cat(format(x, ...), "\n", sep = "")
  invisible(x)
}

# Inputs without an explicit `type` attribute are treated as "text".
parse_input <- function(x) {
  attr <- as.list(xml2::xml_attrs(x))
  rvest_field(
    type = attr$type %||% "text",
    name = attr$name,
    value = attr$value,
    attr = attr
  )
}

# The value of a select is the set of currently selected option values; all
# available options are kept alongside it.
parse_select <- function(x) {
  attr <- as.list(xml2::xml_attrs(x))
  options <- parse_options(html_elements(x, "option"))

  rvest_field(
    type = "select",
    name = attr$name,
    value = options$value,
    attr = attr,
    options = options$options
  )
}

# Returns the selected values and a vector of all option values named by
# their display text.
parse_options <- function(options) {
  parse_option <- function(option) {
    name <- xml2::xml_text(option)
    list(
      # An option without a `value` attribute uses its text as its value.
      value = xml2::xml_attr(option, "value", default = name),
      name = name,
      selected = xml2::xml_has_attr(option, "selected")
    )
  }

  parsed <- lapply(options, parse_option)
  value <- map_chr(parsed, "[[", "value")
  name <- map_chr(parsed, "[[", "name")
  selected <- map_lgl(parsed, "[[", "selected")

  list(
    value = value[selected],
    options = stats::setNames(value, name)
  )
}

# The value of a textarea is its text content.
parse_textarea <- function(x) {
  attr <- as.list(xml2::xml_attrs(x))

  rvest_field(
    type = "textarea",
    name = attr$name,
    value = xml2::xml_text(x),
    attr = attr
  )
}

# Every button element is recorded with type "button"; its own attributes
# remain available in `attr`.
parse_button <- function(x) {
  attr <- as.list(xml2::xml_attrs(x))

  rvest_field(
    type = "button",
    name = attr$name,
    value = attr$value,
    attr = attr
  )
}

# Helpers -----------------------------------------------------------------

# Map the form's `enctype` attribute to the `encode` value passed to
# `httr::POST()`; a missing or unknown enctype becomes "form".
convert_enctype <- function(x) {
  if (is.null(x)) {
    "form"
  } else if (x == "application/x-www-form-urlencoded") {
    "form"
  } else if (x == "multipart/form-data") {
    "multipart"
  } else {
    warn(paste0("Unknown enctype (", x, "). Defaulting to form encoded."))
    "form"
  }
}

# Format each element of `x` on its own line, indented by `indent` spaces.
format_list <- function(x, indent = 0) {
  spaces <- paste(rep("  ", indent), collapse = "")

  formatted <- vapply(x, format, character(1))
  paste0(spaces, formatted, collapse = "\n")
}

# Abort if `values` names any field that `form` does not have.
check_fields <- function(form, values, error_call = caller_env()) {
  no_match <- setdiff(names(values), names(form$fields))
  if (length(no_match) > 0) {
    cli::cli_abort(
      "Can't set value of fields that don't exist: {.str {no_match}}.",
      call = error_call
    )
  }
}
rvest/R/html.R0000644000176200001440000000401114554042640012705 0ustar liggesusers
#' Get element name
#'
#' @param x A document (from [read_html()]), node set (from [html_elements()]),
#'   node (from [html_element()]), or session (from [session()]).
#' @return A character vector the same length as `x`
#' @examples
#' url <- "https://rvest.tidyverse.org/articles/starwars.html"
#' html <- read_html(url)
#'
#' html %>%
#'   html_element("div") %>%
#'   html_children() %>%
#'   html_name()
#' @export
#' @importFrom xml2 xml_name
html_name <- function(x) {
  xml_name(x)
}

#' Get element attributes
#'
#' `html_attr()` gets a single attribute; `html_attrs()` gets all attributes.
#'
#' @inheritParams html_name
#' @param name Name of attribute to retrieve.
#' @param default A string used as a default value when the attribute does
#'   not exist in every element.
#' @return A character vector (for `html_attr()`) or list (`html_attrs()`)
#'   the same length as `x`.
#' @examples
#' html <- minimal_html('
        #'
      • a
      • #'
      • b
      • #'
      • b
      • #'
      ') #' #' html %>% html_elements("a") %>% html_attrs() #' #' html %>% html_elements("a") %>% html_attr("href") #' html %>% html_elements("li") %>% html_attr("class") #' html %>% html_elements("li") %>% html_attr("class", default = "inactive") #' @export #' @importFrom xml2 xml_attr html_attr <- function(x, name, default = NA_character_) { check_string(name) check_string(default, allow_na = TRUE) xml_attr(x, name, default = default) } #' @rdname html_attr #' @export #' @importFrom xml2 xml_attrs html_attrs <- function(x) { xml_attrs(x) } #' Get element children #' #' @inheritParams html_name #' @examples #' html <- minimal_html("
      • 1
      • 2
      • 3
      ") #' ul <- html_elements(html, "ul") #' html_children(ul) #' #' html <- minimal_html("

      Hello Hadley!") #' p <- html_elements(html, "p") #' html_children(p) #' @importFrom xml2 xml_children #' @export html_children <- function(x) { xml_children(x) } rvest/R/import-standalone-obj-type.R0000644000176200001440000002072714554042640017144 0ustar liggesusers# Standalone file: do not edit by hand # Source: # ---------------------------------------------------------------------- # # --- # repo: r-lib/rlang # file: standalone-obj-type.R # last-updated: 2023-05-01 # license: https://unlicense.org # imports: rlang (>= 1.1.0) # --- # # ## Changelog # # 2023-05-01: # - `obj_type_friendly()` now only displays the first class of S3 objects. # # 2023-03-30: # - `stop_input_type()` now handles `I()` input literally in `arg`. # # 2022-10-04: # - `obj_type_friendly(value = TRUE)` now shows numeric scalars # literally. # - `stop_friendly_type()` now takes `show_value`, passed to # `obj_type_friendly()` as the `value` argument. # # 2022-10-03: # - Added `allow_na` and `allow_null` arguments. # - `NULL` is now backticked. # - Better friendly type for infinities and `NaN`. # # 2022-09-16: # - Unprefixed usage of rlang functions with `rlang::` to # avoid onLoad issues when called from rlang (#1482). # # 2022-08-11: # - Prefixed usage of rlang functions with `rlang::`. # # 2022-06-22: # - `friendly_type_of()` is now `obj_type_friendly()`. # - Added `obj_type_oo()`. # # 2021-12-20: # - Added support for scalar values and empty vectors. # - Added `stop_input_type()` # # 2021-06-30: # - Added support for missing arguments. # # 2021-04-19: # - Added support for matrices and arrays (#141). # - Added documentation. # - Added changelog. # # nocov start #' Return English-friendly type #' @param x Any R object. #' @param value Whether to describe the value of `x`. Special values #' like `NA` or `""` are always described. #' @param length Whether to mention the length of vectors and lists. #' @return A string describing the type. Starts with an indefinite #' article, e.g. 
"an integer vector". #' @noRd obj_type_friendly <- function(x, value = TRUE) { if (is_missing(x)) { return("absent") } if (is.object(x)) { if (inherits(x, "quosure")) { type <- "quosure" } else { type <- class(x)[[1L]] } return(sprintf("a <%s> object", type)) } if (!is_vector(x)) { return(.rlang_as_friendly_type(typeof(x))) } n_dim <- length(dim(x)) if (!n_dim) { if (!is_list(x) && length(x) == 1) { if (is_na(x)) { return(switch( typeof(x), logical = "`NA`", integer = "an integer `NA`", double = if (is.nan(x)) { "`NaN`" } else { "a numeric `NA`" }, complex = "a complex `NA`", character = "a character `NA`", .rlang_stop_unexpected_typeof(x) )) } show_infinites <- function(x) { if (x > 0) { "`Inf`" } else { "`-Inf`" } } str_encode <- function(x, width = 30, ...) { if (nchar(x) > width) { x <- substr(x, 1, width - 3) x <- paste0(x, "...") } encodeString(x, ...) } if (value) { if (is.numeric(x) && is.infinite(x)) { return(show_infinites(x)) } if (is.numeric(x) || is.complex(x)) { number <- as.character(round(x, 2)) what <- if (is.complex(x)) "the complex number" else "the number" return(paste(what, number)) } return(switch( typeof(x), logical = if (x) "`TRUE`" else "`FALSE`", character = { what <- if (nzchar(x)) "the string" else "the empty string" paste(what, str_encode(x, quote = "\"")) }, raw = paste("the raw value", as.character(x)), .rlang_stop_unexpected_typeof(x) )) } return(switch( typeof(x), logical = "a logical value", integer = "an integer", double = if (is.infinite(x)) show_infinites(x) else "a number", complex = "a complex number", character = if (nzchar(x)) "a string" else "\"\"", raw = "a raw value", .rlang_stop_unexpected_typeof(x) )) } if (length(x) == 0) { return(switch( typeof(x), logical = "an empty logical vector", integer = "an empty integer vector", double = "an empty numeric vector", complex = "an empty complex vector", character = "an empty character vector", raw = "an empty raw vector", list = "an empty list", .rlang_stop_unexpected_typeof(x) 
)) } } vec_type_friendly(x) } vec_type_friendly <- function(x, length = FALSE) { if (!is_vector(x)) { abort("`x` must be a vector.") } type <- typeof(x) n_dim <- length(dim(x)) add_length <- function(type) { if (length && !n_dim) { paste0(type, sprintf(" of length %s", length(x))) } else { type } } if (type == "list") { if (n_dim < 2) { return(add_length("a list")) } else if (is.data.frame(x)) { return("a data frame") } else if (n_dim == 2) { return("a list matrix") } else { return("a list array") } } type <- switch( type, logical = "a logical %s", integer = "an integer %s", numeric = , double = "a double %s", complex = "a complex %s", character = "a character %s", raw = "a raw %s", type = paste0("a ", type, " %s") ) if (n_dim < 2) { kind <- "vector" } else if (n_dim == 2) { kind <- "matrix" } else { kind <- "array" } out <- sprintf(type, kind) if (n_dim >= 2) { out } else { add_length(out) } } .rlang_as_friendly_type <- function(type) { switch( type, list = "a list", NULL = "`NULL`", environment = "an environment", externalptr = "a pointer", weakref = "a weak reference", S4 = "an S4 object", name = , symbol = "a symbol", language = "a call", pairlist = "a pairlist node", expression = "an expression vector", char = "an internal string", promise = "an internal promise", ... = "an internal dots object", any = "an internal `any` object", bytecode = "an internal bytecode object", primitive = , builtin = , special = "a primitive function", closure = "a function", type ) } .rlang_stop_unexpected_typeof <- function(x, call = caller_env()) { abort( sprintf("Unexpected type <%s>.", typeof(x)), call = call ) } #' Return OO type #' @param x Any R object. #' @return One of `"bare"` (for non-OO objects), `"S3"`, `"S4"`, #' `"R6"`, or `"R7"`. 
#' @noRd obj_type_oo <- function(x) { if (!is.object(x)) { return("bare") } class <- inherits(x, c("R6", "R7_object"), which = TRUE) if (class[[1]]) { "R6" } else if (class[[2]]) { "R7" } else if (isS4(x)) { "S4" } else { "S3" } } #' @param x The object type which does not conform to `what`. Its #' `obj_type_friendly()` is taken and mentioned in the error message. #' @param what The friendly expected type as a string. Can be a #' character vector of expected types, in which case the error #' message mentions all of them in an "or" enumeration. #' @param show_value Passed to `value` argument of `obj_type_friendly()`. #' @param ... Arguments passed to [abort()]. #' @inheritParams args_error_context #' @noRd stop_input_type <- function(x, what, ..., allow_na = FALSE, allow_null = FALSE, show_value = TRUE, arg = caller_arg(x), call = caller_env()) { # From standalone-cli.R cli <- env_get_list( nms = c("format_arg", "format_code"), last = topenv(), default = function(x) sprintf("`%s`", x), inherit = TRUE ) if (allow_na) { what <- c(what, cli$format_code("NA")) } if (allow_null) { what <- c(what, cli$format_code("NULL")) } if (length(what)) { what <- oxford_comma(what) } if (inherits(arg, "AsIs")) { format_arg <- identity } else { format_arg <- cli$format_arg } message <- sprintf( "%s must be %s, not %s.", format_arg(arg), what, obj_type_friendly(x, value = show_value) ) abort(message, ..., call = call, arg = arg) } oxford_comma <- function(chr, sep = ", ", final = "or") { n <- length(chr) if (n < 2) { return(chr) } head <- chr[seq_len(n - 1)] last <- chr[n] head <- paste(head, collapse = sep) # Write a or b. But a, b, or c. 
if (n > 2) { paste0(head, sep, final, " ", last) } else { paste0(head, " ", final, " ", last) } } # nocov end rvest/R/compat-purrr.R0000644000176200001440000001154514210000474014372 0ustar liggesusers# nocov start - compat-purrr.R # Latest version: https://github.com/r-lib/rlang/blob/master/R/compat-purrr.R # This file provides a minimal shim to provide a purrr-like API on top of # base R functions. They are not drop-in replacements but allow a similar style # of programming. # # Changelog: # 2020-04-14: # * Removed `pluck*()` functions # * Removed `*_cpl()` functions # * Used `as_function()` to allow use of `~` # * Used `.` prefix for helpers # # 2021-05-21: # * Fixed "object `x` not found" error in `imap()` (@mgirlich) # # 2021-12-15: # * `transpose()` now supports empty lists. map <- function(.x, .f, ...) { .f <- as_function(.f, env = global_env()) lapply(.x, .f, ...) } walk <- function(.x, .f, ...) { map(.x, .f, ...) invisible(.x) } map_lgl <- function(.x, .f, ...) { .rlang_purrr_map_mold(.x, .f, logical(1), ...) } map_int <- function(.x, .f, ...) { .rlang_purrr_map_mold(.x, .f, integer(1), ...) } map_dbl <- function(.x, .f, ...) { .rlang_purrr_map_mold(.x, .f, double(1), ...) } map_chr <- function(.x, .f, ...) { .rlang_purrr_map_mold(.x, .f, character(1), ...) } .rlang_purrr_map_mold <- function(.x, .f, .mold, ...) { .f <- as_function(.f, env = global_env()) out <- vapply(.x, .f, .mold, ..., USE.NAMES = FALSE) names(out) <- names(.x) out } map2 <- function(.x, .y, .f, ...) { .f <- as_function(.f, env = global_env()) out <- mapply(.f, .x, .y, MoreArgs = list(...), SIMPLIFY = FALSE) if (length(out) == length(.x)) { set_names(out, names(.x)) } else { set_names(out, NULL) } } map2_lgl <- function(.x, .y, .f, ...) { as.vector(map2(.x, .y, .f, ...), "logical") } map2_int <- function(.x, .y, .f, ...) { as.vector(map2(.x, .y, .f, ...), "integer") } map2_dbl <- function(.x, .y, .f, ...) 
{ as.vector(map2(.x, .y, .f, ...), "double") } map2_chr <- function(.x, .y, .f, ...) { as.vector(map2(.x, .y, .f, ...), "character") } imap <- function(.x, .f, ...) { map2(.x, names(.x) %||% seq_along(.x), .f, ...) } pmap <- function(.l, .f, ...) { .f <- as.function(.f) args <- .rlang_purrr_args_recycle(.l) do.call("mapply", c( FUN = list(quote(.f)), args, MoreArgs = quote(list(...)), SIMPLIFY = FALSE, USE.NAMES = FALSE )) } .rlang_purrr_args_recycle <- function(args) { lengths <- map_int(args, length) n <- max(lengths) stopifnot(all(lengths == 1L | lengths == n)) to_recycle <- lengths == 1L args[to_recycle] <- map(args[to_recycle], function(x) rep.int(x, n)) args } keep <- function(.x, .f, ...) { .x[.rlang_purrr_probe(.x, .f, ...)] } discard <- function(.x, .p, ...) { sel <- .rlang_purrr_probe(.x, .p, ...) .x[is.na(sel) | !sel] } map_if <- function(.x, .p, .f, ...) { matches <- .rlang_purrr_probe(.x, .p) .x[matches] <- map(.x[matches], .f, ...) .x } .rlang_purrr_probe <- function(.x, .p, ...) { if (is_logical(.p)) { stopifnot(length(.p) == length(.x)) .p } else { .p <- as_function(.p, env = global_env()) map_lgl(.x, .p, ...) } } compact <- function(.x) { Filter(length, .x) } transpose <- function(.l) { if (!length(.l)) { return(.l) } inner_names <- names(.l[[1]]) if (is.null(inner_names)) { fields <- seq_along(.l[[1]]) } else { fields <- set_names(inner_names) } map(fields, function(i) { map(.l, .subset2, i) }) } every <- function(.x, .p, ...) { .p <- as_function(.p, env = global_env()) for (i in seq_along(.x)) { if (!rlang::is_true(.p(.x[[i]], ...))) return(FALSE) } TRUE } some <- function(.x, .p, ...) { .p <- as_function(.p, env = global_env()) for (i in seq_along(.x)) { if (rlang::is_true(.p(.x[[i]], ...))) return(TRUE) } FALSE } negate <- function(.p) { .p <- as_function(.p, env = global_env()) function(...) !.p(...) } reduce <- function(.x, .f, ..., .init) { f <- function(x, y) .f(x, y, ...) 
Reduce(f, .x, init = .init) } reduce_right <- function(.x, .f, ..., .init) { f <- function(x, y) .f(y, x, ...) Reduce(f, .x, init = .init, right = TRUE) } accumulate <- function(.x, .f, ..., .init) { f <- function(x, y) .f(x, y, ...) Reduce(f, .x, init = .init, accumulate = TRUE) } accumulate_right <- function(.x, .f, ..., .init) { f <- function(x, y) .f(y, x, ...) Reduce(f, .x, init = .init, right = TRUE, accumulate = TRUE) } detect <- function(.x, .f, ..., .right = FALSE, .p = is_true) { .p <- as_function(.p, env = global_env()) .f <- as_function(.f, env = global_env()) for (i in .rlang_purrr_index(.x, .right)) { if (.p(.f(.x[[i]], ...))) { return(.x[[i]]) } } NULL } detect_index <- function(.x, .f, ..., .right = FALSE, .p = is_true) { .p <- as_function(.p, env = global_env()) .f <- as_function(.f, env = global_env()) for (i in .rlang_purrr_index(.x, .right)) { if (.p(.f(.x[[i]], ...))) { return(i) } } 0L } .rlang_purrr_index <- function(x, right = FALSE) { idx <- seq_along(x) if (right) { idx <- rev(idx) } idx } # nocov end rvest/R/live.R0000644000176200001440000003307314562443627012723 0ustar liggesusers#' Live web scraping (with chromote) #' #' @description #' `r lifecycle::badge("experimental")` #' #' [read_html()] operates on the HTML source code downloaded from the server. #' This works for most websites but can fail if the site uses javascript to #' generate the HTML. `read_html_live()` provides an alternative interface #' that runs a live web browser (Chrome) in the background. This allows you to #' access elements of the HTML page that are generated dynamically by javascript #' and to interact with the live page by clicking on buttons or typing in #' forms. #' #' Behind the scenes, this function uses the #' [chromote](https://rstudio.github.io/chromote/) package, which requires that #' you have a copy of [Google Chrome](https://www.google.com/chrome/) installed #' on your machine. #' #' @return `read_html_live()` returns an R6 [LiveHTML] object. 
#'   You can interact with this object using the usual rvest functions, or
#'   call its methods, like `$click()`, `$scroll_to()`, and `$type()` to
#'   interact with the live page like a human would.
#' @param url Website url to read from.
#' @export
#' @examples
#' \dontrun{
#' # When we retrieve the raw HTML for this site, it doesn't contain the
#' # data we're interested in:
#' static <- read_html("https://www.forbes.com/top-colleges/")
#' static %>% html_elements(".TopColleges2023_tableRow__BYOSU")
#'
#' # Instead, we need to run the site in a real web browser, causing it to
#' # download a JSON file and then dynamically generate the html:
#'
#' sess <- read_html_live("https://www.forbes.com/top-colleges/")
#' sess$view()
#' rows <- sess %>% html_elements(".TopColleges2023_tableRow__BYOSU")
#' rows %>% html_element(".TopColleges2023_organizationName__J1lEV") %>% html_text()
#' rows %>% html_element(".grant-aid") %>% html_text()
#' }
read_html_live <- function(url) {
  check_installed(c("chromote", "R6"))
  # Reject anything but a single string here, so the error points at this
  # call rather than surfacing from inside the browser session.
  check_string(url)

  LiveHTML$new(url)
}

#' Interact with a live web page
#'
#' @description
#' `r lifecycle::badge("experimental")`
#'
#' You construct a LiveHTML object with [read_html_live()] and then interact,
#' like you're a human, using the methods described below. When debugging a
#' scraping script it is particularly useful to use `$view()`, which will open
#' a live preview of the site, and you can actually see each of the operations
#' performed on the real site.
#'
#' rvest provides relatively simple methods for scrolling, typing, and
#' clicking. For richer interaction, you probably want to use a package
#' that exposes a more powerful user interface, like
#' [selenider](https://ashbythorpe.github.io/selenider/).
#' #' @export #' @examples #' \dontrun{ #' # To retrieve data for this paginated site, we need to repeatedly push #' # the "Load More" button #' sess <- read_html_live("https://www.bodybuilding.com/exercises/finder") #' sess$view() #' #' sess %>% html_elements(".ExResult-row") %>% length() #' sess$click(".ExLoadMore-btn") #' sess %>% html_elements(".ExResult-row") %>% length() #' sess$click(".ExLoadMore-btn") #' sess %>% html_elements(".ExResult-row") %>% length() #' } LiveHTML <- R6::R6Class( "LiveHTML", public = list( #' @field session Underlying chromote session object. For expert use only. session = NULL, #' @description initialize the object #' @param url URL to page. initialize = function(url) { check_installed("chromote") self$session <- chromote::ChromoteSession$new() self$session$Network$setUserAgentOverride("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36") # https://github.com/rstudio/chromote/issues/102 p <- self$session$Page$loadEventFired(wait_ = FALSE) self$session$Page$navigate(url, wait_ = FALSE) self$session$wait_for(p) private$root_id <- self$session$DOM$getDocument(0)$root$nodeId }, #' @description Called when `print()`ed #' @param ... Ignored print = function(...) { print(self$html_elements("html > *")) invisible(self) }, #' @description #' Display a live view of the site view = function() { private$check_active() self$session$view() invisible(self) }, #' @description #' Extract HTML elements from the current page. #' @param css,xpath CSS selector or xpath expression. html_elements = function(css, xpath) { private$check_active() nodes <- private$find_nodes(css, xpath) elements <- map_chr(nodes, function(node_id) { json <- private$call_node_method(node_id, ".outerHTML") json$result$value }) html <- paste0("", paste0(elements, collapse = "\n"), "") xml2::xml_children(xml2::xml_children(xml2::read_html(html))) }, #' @description Simulate a click on an HTML element. 
#' @param css CSS selector or xpath expression. #' @param n_clicks Number of clicks click = function(css, n_clicks = 1) { private$check_active() check_number_whole(n_clicks, min = 1) # Implementation based on puppeteer as described in # https://medium.com/@aslushnikov/automating-clicks-in-chromium-a50e7f01d3fb # With code from https://github.com/puppeteer/puppeteer/blob/b53de4e0942e93c/packages/puppeteer-core/src/cdp/Input.ts#L431-L459 node <- private$wait_for_selector(css) self$session$DOM$scrollIntoViewIfNeeded(node) # Quad = location of four corners (x1, y1, x2, y2, x3, y3, x4, y4) # Relative to viewport quads <- self$session$DOM$getBoxModel(node) content_quad <- as.numeric(quads$model$content) center_x <- mean(content_quad[c(1, 3, 5, 7)]) center_y <- mean(content_quad[c(2, 4, 6, 8)]) # https://chromedevtools.github.io/devtools-protocol/1-3/Input/#method-dispatchMouseEvent self$session$Input$dispatchMouseEvent( type = "mouseMoved", x = center_x, y = center_y, ) for (i in seq_len(n_clicks)) { self$session$Input$dispatchMouseEvent( type = "mousePressed", x = center_x, y = center_y, button = "left", clickCount = i, ) self$session$Input$dispatchMouseEvent( type = "mouseReleased", x = center_x, y = center_y, clickCount = i, button = "left" ) } invisible(self) }, #' @description Get the current scroll position. get_scroll_position = function() { private$check_active() out <- self$session$Runtime$evaluate( '({ x: window.scrollX, y: window.scrollY })', returnByValue = TRUE ) out$result$value }, #' @description Scroll selected element into view. #' @param css CSS selector or xpath expression. scroll_into_view = function(css) { private$check_active() node <- private$wait_for_selector(css) self$session$DOM$scrollIntoViewIfNeeded(node) invisible(self) }, #' @description Scroll to specified location #' @param top,left Number of pixels from top/left respectively. 
scroll_to = function(top = 0, left = 0) { private$check_active() check_number_whole(top) check_number_whole(left) # https://developer.mozilla.org/en-US/docs/Web/API/Element/scrollTo private$call_node_method( private$root_id, paste0(".documentElement.scrollTo(", left, ", ", top, ")") ) invisible(self) }, #' @description Scroll by the specified amount #' @param top,left Number of pixels to scroll up/down and left/right #' respectively. scroll_by = function(top = 0, left = 0) { private$check_active() check_number_whole(top) check_number_whole(left) # https://chromedevtools.github.io/devtools-protocol/1-3/Input/#method-dispatchMouseEvent self$session$Input$dispatchMouseEvent( type = "mouseWheel", x = 0, y = 0, deltaX = left, deltaY = top ) invisible(self) }, #' @description Type text in the selected element #' @param css CSS selector or xpath expression. #' @param text A single string containing the text to type. type = function(css, text) { private$check_active() check_string(text) node <- private$wait_for_selector(css) self$session$DOM$focus(node) self$session$Input$insertText(text) invisible(self) }, #' @description Simulate pressing a single key (including special keys). #' @param css CSS selector or xpath expression. Set to `NULL` #' @param key_code Name of key. You can see a complete list of known #' keys at . #' @param modifiers A character vector of modifiers. Must be one or more #' of `"Shift`, `"Control"`, `"Alt"`, or `"Meta"`. 
press = function(css, key_code, modifiers = character()) { private$check_active() desc <- as_key_desc(key_code, modifiers) node <- private$wait_for_selector(css) self$session$DOM$focus(node) exec(self$session$Input$dispatchKeyEvent, type = "keyDown", !!!desc) exec(self$session$Input$dispatchKeyEvent, type = "keyUp", !!!desc) invisible(self) } ), private = list( root_id = NULL, check_active = function() { if (new_chromote && !self$session$is_active()) { suppressMessages({ self$session <- self$session$respawn() private$root_id <- self$session$DOM$getDocument(0)$root$nodeId }) } }, wait_for_selector = function(css, timeout = 5) { done <- now() + timeout while(now() < done) { nodes <- private$find_nodes(css) if (length(nodes) > 0) { return(nodes) } Sys.sleep(0.1) } cli::cli_abort("Failed to find selector {.str {css}} in {timeout} seconds.") }, find_nodes = function(css, xpath) { check_exclusive(css, xpath) if (!missing(css)) { unlist(self$session$DOM$querySelectorAll(private$root_id, css)$nodeIds) } else { search <- glue::glue(" (function() {{ const xpathResult = document.evaluate('{xpath}', document, null, XPathResult.ORDERED_NODE_SNAPSHOT_TYPE, null); const nodes = []; for (let i = 0; i < xpathResult.snapshotLength; i++) {{ nodes.push(xpathResult.snapshotItem(i)); }} return(nodes); }})(); ") object_id <- self$session$Runtime$evaluate(search)$result$objectId props <- self$session$Runtime$getProperties(object_id, ownProperties = TRUE) ids <- map_chr(props$result, function(prop) prop$value$objectId %||% NA_character_) # Drop non-nodes ids <- ids[!is.na(ids)] unlist(map(ids, self$session$DOM$requestNode), use.names = FALSE) } }, # Inspired by https://github.com/rstudio/shinytest2/blob/v1/R/chromote-methods.R call_node_method = function(node_id, method, ...) 
{ js_fun <- paste0("function() { return this", method, "}") obj_id <- private$object_id(node_id) # https://chromedevtools.github.io/devtools-protocol/tot/Runtime/#method-callFunctionOn self$session$Runtime$callFunctionOn(js_fun, objectId = obj_id, ...) }, object_id = function(node_id) { # https://chromedevtools.github.io/devtools-protocol/tot/DOM/#method-resolveNode self$session$DOM$resolveNode(node_id)$object$objectId } ) ) now <- function() proc.time()[[3]] #' @export html_table.LiveHTML <- function(x, header = NA, trim = TRUE, fill = deprecated(), dec = ".", na.strings = "NA", convert = TRUE) { tables <- html_elements(x, "table") html_table( tables, header = header, trim = trim, fill = fill, dec = dec, na.strings = na.strings, convert = convert ) } #' @export html_elements.LiveHTML <- function(x, css, xpath) { x$html_elements(css, xpath) } #' @export html_element.LiveHTML <- function(x, css, xpath) { out <- html_elements(x, css, xpath) if (length(out) == 0) { xml2::xml_missing() } else { out[[1]] } } # helpers ----------------------------------------------------------------- has_chromote <- function() { tryCatch( { default <- chromote::default_chromote_object() local_bindings(default_timeout = 5, .env = default) startup <- default$new_session(wait_ = FALSE) default$wait_for(startup) TRUE }, error = function(cnd) { FALSE } ) } as_key_desc <- function(key, modifiers = character(), error_call = caller_env()) { check_string(key, call = error_call) modifiers <- arg_match( modifiers, values = c("Alt", "Control", "Meta", "Shift"), multiple = TRUE, error_call = error_call ) if (!has_name(keydefs, key)) { cli::cli_abort("No key definition for {.str {key}}.") } def <- keydefs[[key]] desc <- list() desc$key <- def$key %||% "" if ("Shift" %in% modifiers && has_name(def, "shiftKey")) { desc$key <- def$shiftKey } desc$windowsVirtualKeyCode <- def$keyCode %||% 0 if ("Shift" %in% modifiers && has_name(def, "shiftKeyCode")) { desc$windowsVirtualKeyCode <- def$shiftKeyCode } 
desc$code <- def$code %||% "" desc$location <- def$location %||% 0 desc$text <- if (nchar(desc$key) == 1) def$key else def$text # no elements have shiftText field # if any modifiers besides shift are pressed, no text should be sent if (any(modifiers != "Shift")) { desc$text <- '' } desc$modifiers <- sum(c(Alt = 1, Control = 2, Meta = 4, Shift = 8)[modifiers]) desc } rvest/NEWS.md0000644000176200001440000002031414562450633012523 0ustar liggesusers# rvest 1.0.4 * New `read_html_live()` reads HTML into a real, live, HTML browser, meaning that you can scrape HTML generated by javascript. It returns a `LiveHTML` object which you can also use to simulate user interactions with the page, like clicking, typing, and scrolling (#245). * `html_table()` discards rows without cells (@epiben, #360). # rvest 1.0.3 * Re-document to fix HTML issues in `.Rd`. # rvest 1.0.2 * Fixes for CRAN * `html_table()` converts empty tables to empty tibbles (@epiben, #327). # rvest 1.0.1 * `html_table()` correctly handles tables with cells that contain blank values for `rowspan` and/or `colspan`, so that e.g. `

      ` (table heading), and `` (table data). Here's a simple HTML table with two columns and three rows: ```{r} html <- minimal_html("
      x y
      1.5 2.7
      4.9 1.3
      7.2 8.1
      ") ``` Because tables are a common way to store data, rvest includes the handy `html_table()` which converts a table into a data frame: ```{r} html %>% html_node("table") %>% html_table() ``` ## Element vs elements When using rvest, your eventual goal is usually to build up a data frame, and you want each row to correspond to some repeated unit on the HTML page. In this case, you should generally start by using `html_elements()` to select the elements that contain each observation then use `html_element()` to extract the variables from each observation. This guarantees that you'll get the same number of values for each variable because `html_element()` always returns the same number of outputs as inputs. To illustrate this problem take a look at this simple example I constructed using a few entries from `dplyr::starwars`: ```{r} html <- minimal_html("
      • C-3PO is a droid that weighs 167 kg
      • R2-D2 is a droid that weighs 96 kg
      • Yoda weighs 66 kg
      • R4-P17 is a droid
      ") ``` If you try to extract name, species, and weight directly, you end up with one vector of length four and two vectors of length three, and no way to align them: ```{r} html %>% html_elements("b") %>% html_text2() html %>% html_elements("i") %>% html_text2() html %>% html_elements(".weight") %>% html_text2() ``` Instead, use `html_elements()` to find a element that corresponds to each character, then use `html_element()` to extract each variable for all observations: ```{r} characters <- html %>% html_elements("li") characters %>% html_element("b") %>% html_text2() characters %>% html_element("i") %>% html_text2() characters %>% html_element(".weight") %>% html_text2() ``` `html_element()` automatically fills in `NA` when no elements match, keeping all of the variables aligned and making it easy to create a data frame: ```{r} data.frame( name = characters %>% html_element("b") %>% html_text2(), species = characters %>% html_element("i") %>% html_text2(), weight = characters %>% html_element(".weight") %>% html_text2() ) ``` rvest/vignettes/starwars.Rmd0000644000176200001440000000165113775161316015754 0ustar liggesusers--- title: "Star Wars films" output: rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{Star Wars films} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- This vignette contains some data about the Star Wars films for use in rvest examples and vignettes. ```{r, echo = FALSE, results = "asis"} library(rvest) crawl_html <- function(x) { x %>% gsub("\r", "", .) %>% gsub("\n\n", "

      ", .) %>% gsub("\n", " ", .) %>% paste0("

      ", ., "

      ") } film_desc <- function(x) { glue::glue_data(x, "

      {title}

      Released: {release_date}

      Director: {director}

      {crawl_html(opening_crawl)}
      ") } films <- repurrrsive::sw_films films <- films[order(sapply(films, "[[", "episode_id"))] descs <- vapply(films, film_desc, character(1)) writeLines(descs) ``` rvest/R/0000755000176200001440000000000014560241772011626 5ustar liggesusersrvest/R/sysdata.rda0000644000176200001440000000533114557166073013776 0ustar liggesusersBZh91AY&SYe9\9`  |z* "u|eL7>zN=#-񾫝g/[`r~ Z9ov1b+ ="$ $`sg{ѵ$阢[Pyh߫o;۹Zˮ4i[xOG5]9ZoLp7mA-Uen vYѶ{NK @$]뭫FJX-) DTWXT m P)GBH!TC,AFʪ̝';4N71b9F#N44444442222222222K%dQ$6ߦYOgUdq6"1)JR4!B!B ZVtE Ap&Lu ԰QxT1aaYeh t;Ҷy*FRiȓy3XZ K'Ot]:OҜ{`3K4fjfjfjfjf'O΂z)'L0L0L0L0L0L0LTJB z AH)) R AH)  B R !n'q'q'q'q3K7SN4&КBhM 3tΙ:qD "A%1 %љ`R4Q(J%DQ(J%DQ(J%DZ31RI AD tbdR'=1&˳*|d(Mge33}g8 ͖rɬl%,uv>;NW#MIe'%aZeF6M|pfVQF*$"sسld̩)k3YKt㳌GJLU7R~DA UXXX]D8)D (H"+ E rI$T EEUEEUXAD2@ ș|CJI< !T)JPhiG "hyCO8HњyȂ\3:,i2@ ΋2I%3$ U$6.4?N`=ܹP?V(I4lgID)ŐnJvW‹SC"йfgPrJ S&c2л9$c$a uyQeulYh\زХ."b!$*օ{ZԢ%7MUUcbqqw%Pp:423 6Juy!Ur&'3c2жŖS+S R JUy(M8fQj&}{ rB ]h^1ߊKb1- QKB*օ(t, ( `^SkZx1 N`I‡.test

      ") minimal_html <- function(html, title = "") { # From http://www.brucelawson.co.uk/2010/a-minimal-html5-document/ xml2::read_html(paste0( "\n", "\n", "", title, "\n", html )) } cat_line <- function(...) { cat(paste0(..., "\n", collapse = "")) } env_cache <- function(env, nm, value, inherit = FALSE) { if (env_has(env, nm, inherit = inherit)) { env_get(env, nm, inherit = TRUE) } else { env_poke(env, nm, value) value } } inspect <- function(x) { path <- tempfile(fileext = ".html") writeLines(as.character(x), path) utils::browseURL(path) } rvest/R/zzz.R0000644000176200001440000000043414560241772012607 0ustar liggesusersnew_chromote <- NULL .onLoad <- function(...) { if (is_installed("chromote")) { new_chromote <<- utils::packageVersion("chromote") >= "0.1.2.9000" } else { # If chromote is not installed yet, assume it's not new to be safe. new_chromote <- FALSE } invisible() } rvest/R/table.R0000644000176200001440000001732414554042640013043 0ustar liggesusers#' Parse an html table into a data frame #' #' The algorithm mimics what a browser does, but repeats the values of merged #' cells in every cell that cover. #' #' @inheritParams html_name #' @param header Use first row as header? If `NA`, will use first row #' if it consists of `
      ` tags. #' #' If `TRUE`, column names are left exactly as they are in the source #' document, which may require post-processing to generate a valid data #' frame. #' @param trim Remove leading and trailing whitespace within each cell? #' @param fill Deprecated - missing cells in tables are now always #' automatically filled with `NA`. #' @param dec The character used as decimal place marker. #' @param na.strings Character vector of values that will be converted to `NA` #' if `convert` is `TRUE`. #' @param convert If `TRUE`, will run [`type.convert()`] to interpret texts as #' integer, double, or `NA`. #' @return #' When applied to a single element, `html_table()` returns a single tibble. #' When applied to multiple elements or a document, `html_table()` returns #' a list of tibbles. #' @export #' @examples #' sample1 <- minimal_html(" #' #' #' #' #'
      Col ACol B
      1x
      4y
      10z
      ") #' sample1 %>% #' html_element("table") %>% #' html_table() #' #' # Values in merged cells will be duplicated #' sample2 <- minimal_html(" #' #' #' #' #'
      ABC
      123
      45
      67
      ") #' sample2 %>% #' html_element("table") %>% #' html_table() #' #' # If a row is missing cells, they'll be filled with NAs #' sample3 <- minimal_html(" #' #' #' #' #'
      ABC
      12
      3
      4
      ") #' sample3 %>% #' html_element("table") %>% #' html_table() html_table <- function(x, header = NA, trim = TRUE, fill = deprecated(), dec = ".", na.strings = "NA", convert = TRUE ) { check_bool(header, allow_na = TRUE) check_bool(trim) check_string(dec) check_character(na.strings) check_bool(convert) UseMethod("html_table") } #' @export html_table.xml_document <- function(x, header = NA, trim = TRUE, fill = deprecated(), dec = ".", na.strings = "NA", convert = TRUE) { tables <- xml2::xml_find_all(x, ".//table") html_table( tables, header = header, trim = trim, fill = fill, dec = dec, na.strings = na.strings, convert = convert ) } #' @export html_table.xml_nodeset <- function(x, header = NA, trim = TRUE, fill = deprecated(), dec = ".", na.strings = "NA", convert = TRUE) { lapply( x, html_table, header = header, trim = trim, fill = fill, dec = dec, na.strings = na.strings, convert = convert ) } #' @export html_table.xml_node <- function(x, header = NA, trim = TRUE, fill = deprecated(), dec = ".", na.strings = "NA", convert = TRUE) { if (lifecycle::is_present(fill) && !isTRUE(fill)) { lifecycle::deprecate_warn( when = "1.0.0", what = "html_table(fill = )", details = "An improved algorithm fills by default so it is no longer needed.", user_env = caller_env(2) # S3 generic ) } ns <- xml2::xml_ns(x) rows <- xml2::xml_find_all(x, ".//tr", ns = ns) cells <- lapply(rows, xml2::xml_find_all, ".//td|.//th", ns = ns) cells <- compact(cells) if (length(cells) == 0) { return(tibble::tibble()) } out <- table_fill(cells, trim = trim) if (is.na(header)) { header <- all(html_name(cells[[1]]) == "th") } if (header) { col_names <- out[1, , drop = FALSE] out <- out[-1, , drop = FALSE] } else { col_names <- paste0("X", seq_len(ncol(out))) } colnames(out) <- col_names df <- tibble::as_tibble(out, .name_repair = "minimal") if (isTRUE(convert)) { df[] <- lapply(df, function(x) { utils::type.convert(x, as.is = TRUE, dec = dec, na.strings = na.strings) }) } df } # Table fillng 
algorithm -------------------------------------------------- # Base on https://html.spec.whatwg.org/multipage/tables.html#forming-a-table table_fill <- function(cells, trim = TRUE) { width <- 0 height <- length(cells) # initial estimate values <- vector("list", height) # list of downward spanning cells dw <- dw_init() # https://html.spec.whatwg.org/multipage/tables.html#algorithm-for-processing-rows for (i in seq_along(cells)) { row <- cells[[i]] if (length(row) == 0) { next } rowspan <- as.integer(html_attr(row, "rowspan", default = NA_character_)) rowspan[is.na(rowspan)] <- 1 colspan <- as.integer(html_attr(row, "colspan", default = NA_character_)) colspan[is.na(colspan)] <- 1 text <- html_text(row) if (isTRUE(trim)) { text <- gsub("^[[:space:]\u00a0]+|[[:space:]\u00a0]+$", "", text) } vals <- rep(NA_character_, width) col <- 1 j <- 1 while(j <= length(row)) { if (col %in% dw$col) { cell <- dw_find(dw, col) cell_text <- cell$text cell_colspan <- cell$colspan } else { cell_text <- text[[j]] cell_colspan <- colspan[[j]] if (rowspan[[j]] > 1) { dw <- dw_add(dw, col, rowspan[[j]], colspan[[j]], text[[j]]) } j <- j + 1 } vals[col:(col + cell_colspan - 1L)] <- cell_text col <- col + cell_colspan } # Add any downward cells after last
      for(j in seq2(col - 1L, width)) { if (j %in% dw$col) { cell <- dw_find(dw, j) vals[j:(j + cell$colspan - 1L)] <- cell$text } } dw <- dw_prune(dw) values[[i]] <- vals height <- max(height, i + max(rowspan) - 1L) width <- max(width, col - 1L) } # Add any downward cells after
      ` is parsed as `` (@epiben, #323). * Fix broken example # rvest 1.0.0 ## New features * New `html_text2()` provides a more natural rendering of HTML nodes into text, converting `
      ` into "\n", and removing non-significant whitespace (#175). By default, it also converts `&nbsp;` into regular spaces, which you can suppress with `preserve_nbsp = TRUE` (#284). * `html_table()` has been re-written from scratch to more closely mimic the algorithm that browsers use for parsing tables. This should mean that there are far fewer tables for which it fails to produce some output (#63, #204, #215). The `fill` argument has been deprecated since it is no longer needed. `html_table()` now returns a tibble rather than a data frame to be compatible with the rest of the tidyverse (#199). Its performance has been considerably improved (#237). It also gains a `na.strings` argument to control what values are converted to `NA` (#107), and a `convert` argument to control whether to run the conversion (#311). * New `html_form_submit()` allows you to submit a form directly, without needing to create a session (#300). * rvest is now licensed as MIT (#287). ## API changes Since this is the 1.0.0 release, I included a large number of API changes to make rvest more compatible with current tidyverse conventions. Older functions have been deprecated, so existing code will continue to work (albeit with a few new warnings). * rvest now imports xml2 rather than depending on it. This is cleaner because it avoids attaching all the xml2 functions that you're less likely to use. To reduce the chance of breakages, rvest re-exports xml2 functions `read_html()` and `url_absolute()`, but your code may now need an explicit `library(xml2)`. * `html_form()` now returns an object with class `rvest_form` (instead of `form`). Fields within a form now have class `rvest_field`, instead of a variety of classes that were lacking the `rvest_` prefix. All functions for working with forms have a common `html_form_` prefix: `set_values()` became `html_form_set()`. `submit_form()` was renamed to `session_submit()` because it returns a session. 
* `html_node()` and `html_nodes()` have been superseded in favor of `html_element()` and `html_elements()` since they (almost) always return elements, not nodes (#298). * `html_session()` is now `session()` and returns an object of class `rvest_session` (instead of `session`). All functions that work with session objects now have a common `session_` prefix. * Long deprecated `html()`, `html_tag()`, `xml()` functions have been removed. * `minimal_html()` (which doesn't appear to be used by any other package) has had its arguments flipped to make it more intuitive. * `guess_encoding()` has been renamed to `html_encoding_guess()` to avoid a clash with `stringr::guess_encoding()` (#209). `repair_encoding()` has been deprecated because it doesn't appear to work. * `pluck()` is no longer exported to avoid a clash with `purrr::pluck()`; if you need it use `purrr::map_chr()` and friends instead (#209). * `xml_tag()`, `xml_node()`, and `xml_nodes()` have been formally deprecated in favor of their `html_` equivalents. ## Minor improvements and bug fixes * The "harvesting the web" vignette has been rewritten to focus more on the basics of rvest, eliminating the screenshots to keep the installed package as svelte as possible. It's also been renamed to `vignette("rvest")` since it's the vignette that you should read first. * The SelectorGadget vignette is now a web-only article, <https://rvest.tidyverse.org/articles/selectorgadget.html>, so we can be more generous with screenshots since they're no longer bundled with every install of the package. Together with the rewrite of the other vignette, this means that rvest is now ~90 Kb instead of ~1.1 Mb. * All uses of IMDB have been eliminated since the site explicitly prohibits scraping (#195). * `session_submit()` errors if `form` doesn't have a `url` (#288). * New `session_forward()` function to complement `session_back()`. It now allows you to pick the submission button by position (#156). The `...` argument is deprecated; please use `config` instead. 
* `html_form_set()` can now accept character vectors allowing you to select multiple checkboxes in a set or select multiple values from a multi-`` as well as``. * `submit_request()` (and hence `submit_form()`) recognizes forms with `` as a valid form submission button. # rvest 0.2.0 ## New features * `html()` and `xml()` pass `...` on to `httr::GET()` so you can more finely control the request (#48). * Add xml support: parse with `xml()`, then work with using `xml_node()`, `xml_attr()`, `xml_attrs()`, `xml_text()` and `xml_tag()` (#24). * `xml_structure()`: new function that displays the structure (i.e. tag and attribute names) of a xml/html object (#10). ## Bug fixes * `follow_link()` now accepts css and xpath selectors. (#38, #41, #42) * `html()` does a better job of dealing with encodings (passing the problem on to `XML::parseHTML()`) instead of trying to do it itself (#25, #50). * `html_attr()` returns default value when input is NULL (#49) * Add missing `html_node()` method for session. * `html_nodes()` now returns an empty list if no elements are found (#31). * `submit_form()` converts relative paths to absolute URLs (#52). It also deals better with 0-length inputs (#29). 
rvest/MD50000644000176200001440000001221214562475432011737 0ustar liggesusersf3ed3cc853547ffcf52d1557c6ec3cf2 *DESCRIPTION bb593e5aacd04971fb190b3752cd0468 *LICENSE 336451b767a09bc464ce82c2966cda84 *NAMESPACE 3ead7810783b9ca973a068aba54c2e2d *NEWS.md 0fa60b49bc17479ffacbdf7e1d738a31 *R/compat-purrr.R d1941e2039af8d58ffc5b3cc6d384013 *R/encoding.R 099b3875e6de5ff8511597fde2f21601 *R/form.R 7d247cd373dd6be830a03f39082236ea *R/html.R c80a9eb1427c585807cecf618b6f3870 *R/import-standalone-obj-type.R c40f882046a958444c6058a9e2cb9a3b *R/import-standalone-types-check.R 7d29eccc99cb03afacfb2e15e904a4dc *R/live.R b7c36d3bad90be1bcd154df9457e2759 *R/rename.R 8158a6cb7e91a6fa46b3cbfb752dc2d2 *R/rvest-package.R f22795da6a08898017daf1266da7856f *R/selectors.R 38a2b9c01392d7b6364b3538c44029ac *R/session.R dcf6919362f42bc62ba1cb58bfeb120c *R/sysdata.rda 42150575ece2eafb52da12cee47d73ae *R/table.R 4febcd1db8acea2e6d13ce2a627ee5dd *R/text.R 8ae4c47191f3ab74347da49a7577ce82 *R/utils.R e38c2df576eb5eecba65f58ba098583c *R/zzz.R 62a2d2d893e4d8dd4698465454c2540f *README.md 8be82833c19f1bb13175f56b629852e4 *build/vignette.rds 96facf495896646206f14ea3932cb996 *demo/00Index 5b1a65183607f9c7d536fb03efac59a1 *demo/tripadvisor.R daaaa642bfbf152064f757e795fba645 *demo/united.R 8e08c2f3f3a7d02be1d796f820f257c9 *demo/zillow.R 81f447efe4938f06ad471f5f2dd11d70 *inst/WORDLIST f79a0e0a4b8a863982abfcedc7ac4c0b *inst/doc/rvest.R 1e743cade9a65846116e17b6a61e90ac *inst/doc/rvest.Rmd 4d3f9c63a9450e127903a366dcc12104 *inst/doc/rvest.html bcb3ed7978a66dca1e081a0bc3cd9989 *inst/doc/starwars.R 1d030fd990b11a62568bd5edf7710a07 *inst/doc/starwars.Rmd da1eeee1419265afb584f8ac36ce1ec1 *inst/doc/starwars.html be4cd29b5c2b16bd908a786c53cf0276 *inst/html-ex/bad-encoding.html 25e3de050bced6213eabe66a650f54d1 *man/LiveHTML.Rd a1cbaf3f328e8d74e747faacf640c7fc *man/figures/lifecycle-archived.svg 6f521fb1819410630e279d1abf88685a *man/figures/lifecycle-defunct.svg 391f696f961e28914508628a7af31b74 
*man/figures/lifecycle-deprecated.svg 691b1eb2aec9e1bec96b79d11ba5e631 *man/figures/lifecycle-experimental.svg 405e252e54a79b33522e9699e4e9051c *man/figures/lifecycle-maturing.svg f41ed996be135fb35afe00641621da61 *man/figures/lifecycle-questioning.svg 306bef67d1c636f209024cf2403846fd *man/figures/lifecycle-soft-deprecated.svg ed42e3fbd7cc30bc6ca8fa9b658e24a8 *man/figures/lifecycle-stable.svg bf2f1ad432ecccee3400afe533404113 *man/figures/lifecycle-superseded.svg de342ed76f2b5a8df31316771007d90f *man/figures/logo.png eb9ac3d3af03097f746a0fe39050243b *man/google_form.Rd a6d869162565326cc577521a9bfdc0af *man/html_attr.Rd a644e18fdcf48efd1b51a71338e30de9 *man/html_children.Rd ba02f09806b339834651dae515698c7e *man/html_element.Rd 82dd7199f651dd55a2f24219f63768e0 *man/html_encoding_guess.Rd 09c22680287b3a8d17bedfc377a3b7d9 *man/html_form.Rd 480665ed4e0a662cd7168bae2f773ecc *man/html_name.Rd 981e43b2c8ba9d56fe2a17f2b1b95dea *man/html_table.Rd 7824150f123e587749235481d99f6af2 *man/html_text.Rd 1e641362f441edc4f7fa5e3665ec8ae4 *man/minimal_html.Rd c7e427d75d9661183b719f6e6d9cd78c *man/read_html.Rd 6180ba7919a1a1695fe61bc9f674f918 *man/read_html_live.Rd 6cd36b58988bea71192ceab705159daf *man/reexports.Rd e4de192a61e9191cc5acaab22bc3387d *man/rename.Rd ceac1f465444fe123838d404dd41eaf4 *man/repair_encoding.Rd 0601e7bc912ae827c4e86834dba7ab6f *man/rvest-package.Rd d28331be23b34e2bf47d1460c4eba3d3 *man/session.Rd 0622a97a2aaa3c342f09636052c2d7f5 *tests/spelling.R ef55516173099bedc78483fe7c0dd23c *tests/testthat.R 946511a52463a8ab8ce3dbea3c72561a *tests/testthat/_snaps/encoding.md 55089be7c20a04f9bebfbcc60bba46a3 *tests/testthat/_snaps/form.md de8028eedf94fe2660fc02f9e45f19d3 *tests/testthat/_snaps/html.md aab9db63234751a0c90674c2dda13023 *tests/testthat/_snaps/live.md 6453e9b69228a5a99296a0009a439a03 *tests/testthat/_snaps/rename.md d646b6f5e2c485a7dfed7ab4ee3e01e6 *tests/testthat/_snaps/selectors.md ff290a73435b74b0392666ae206c1346 *tests/testthat/_snaps/session.md 
8ba17870d954c1e54970b68fab04caf2 *tests/testthat/_snaps/table.md 483c192b5de64a50ab5954cb7ed69322 *tests/testthat/_snaps/utils.md b4a02e14bb7283d55b1d0304b830231f *tests/testthat/helper.R f1d1f56a89b38b6143b1b533f595ed7b *tests/testthat/html/bullets.html fc11fa7ebb860a05c9a1af020b6434b6 *tests/testthat/html/click.html e1e572df149c8e3db11ba0091b1a0da6 *tests/testthat/html/press.html 4948ad3284c9d1593d0f455c1751a673 *tests/testthat/html/scroll.html 5c30dbc52e008063b4395580a917c2f3 *tests/testthat/html/table.html b7fe63f3dd2410f6f436a02ad62805d6 *tests/testthat/html/type.html cbe5b318c8cbbd5b398c88a70db8e099 *tests/testthat/test-encoding.R 12681a15cd8cd3d13d4889c21867dcc6 *tests/testthat/test-form.R 1808de36d606f5b97333095087c1e98e *tests/testthat/test-html.R 08f69fd1d8b37c6812636e022e8f60b3 *tests/testthat/test-live.R 3dd7435aba1764a7bb22544bc080c585 *tests/testthat/test-rename.R 3d9b9f27a90ff7c11c9b9c0de79a2be8 *tests/testthat/test-selectors.R 11cb3e35e23ce25a7b684422d3cdb432 *tests/testthat/test-session.R e49d069a988ff67b7c26ad4f01cd44b8 *tests/testthat/test-table.R b4797b2d4cca238d5c828f8778a1d983 *tests/testthat/test-text.R 2f2f8524da52b2515bcfa4c4d472033d *tests/testthat/test-utils.R 07292e7a862259805eebb8606cb61044 *tests/testthat/test.html 1e743cade9a65846116e17b6a61e90ac *vignettes/rvest.Rmd 1d030fd990b11a62568bd5edf7710a07 *vignettes/starwars.Rmd rvest/inst/0000755000176200001440000000000014562450666012410 5ustar liggesusersrvest/inst/doc/0000755000176200001440000000000014562450666013155 5ustar liggesusersrvest/inst/doc/rvest.Rmd0000644000176200001440000002701214554031036014751 0ustar liggesusers--- title: "Web scraping 101" output: rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{Web scraping 101} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- ```{r, echo=FALSE} knitr::opts_chunk$set(comment = "#>", collapse = TRUE) ``` This vignette introduces you to the basics of web scraping with rvest. 
You'll first learn the basics of HTML and how to use CSS selectors to refer to specific elements, then you'll learn how to use rvest functions to get data out of HTML and into R. ```{r} library(rvest) ``` ## HTML basics HTML stands for "HyperText Markup Language" and looks like this: ``` {.html} Page title

      A heading

      Some text & some bold text.

      ``` HTML has a hierarchical structure formed by **elements** which consist of a start tag (e.g. `<tag>`), optional **attributes** (`id='first'`), an end tag[^1] (like `</tag>`), and **contents** (everything in between the start and end tag). [^1]: A number of tags (including `

      ` and `

    1. )` don't require end tags, but I think it's best to include them because it makes seeing the structure of the HTML a little easier. Since `<` and `>` are used for start and end tags, you can't write them directly. Instead you have to use the HTML **escapes** `&gt;` (greater than) and `&lt;` (less than). And since those escapes use `&`, if you want a literal ampersand you have to escape it as `&amp;`. There is a wide range of possible HTML escapes but you don't need to worry about them too much because rvest automatically handles them for you. ### Elements All up, there are over 100 HTML elements. Some of the most important are: - Every HTML page must be in an `<html>` element, and it must have two children: `<head>`, which contains document metadata like the page title, and `<body>`, which contains the content you see in the browser. - Block tags like `

      ` (heading 1), `

      ` (paragraph), and `

        ` (ordered list) form the overall structure of the page. - Inline tags like `<b>` (bold), `<i>` (italics), and `<a>` (links) format text inside block tags. If you encounter a tag that you've never seen before, you can find out what it does with a little googling. I recommend the [MDN Web Docs](https://developer.mozilla.org/en-US/docs/Web/HTML) which are produced by Mozilla, the company that makes the Firefox web browser. ### Contents Most elements can have content in between their start and end tags. This content can either be text or more elements. For example, the following HTML contains a paragraph of text, with one word in bold. ```{=html}

        Hi! My name is Hadley.

        ``` The **children** of a node refers only to elements, so the `

        ` element above has one child, the `<b>` element. The `<b>` element has no children, but it does have contents (the text "name"). Some elements, like `<img>` can't have children. These elements depend solely on attributes for their behavior. ### Attributes Tags can have named **attributes** which look like `name1='value1' name2='value2'`. Two of the most important attributes are `id` and `class`, which are used in conjunction with CSS (Cascading Style Sheets) to control the visual appearance of the page. These are often useful when scraping data off a page. ## Reading HTML with rvest You'll usually start the scraping process with `read_html()`. This returns an `xml_document`[^2] object which you'll then manipulate using rvest functions: [^2]: This class comes from the [xml2](https://xml2.r-lib.org) package. xml2 is a low-level package that rvest builds on top of. ```{r} html <- read_html("http://rvest.tidyverse.org/") class(html) ``` For examples and experimentation, rvest also includes a function that lets you create an `xml_document` from literal HTML: ```{r} html <- minimal_html("

        This is a paragraph

        • This is a bulleted list
        ") html ``` Regardless of how you get the HTML, you'll need some way to identify the elements that contain the data you care about. rvest provides two options: CSS selectors and XPath expressions. Here I'll focus on CSS selectors because they're simpler but still sufficiently powerful for most scraping tasks. ## CSS selectors CSS is short for cascading style sheets, and is a tool for defining the visual styling of HTML documents. CSS includes a miniature language for selecting elements on a page called **CSS selectors**. CSS selectors define patterns for locating HTML elements, and are useful for scraping because they provide a concise way of describing which elements you want to extract. CSS selectors can be quite complex, but fortunately you only need the simplest for rvest, because you can also write R code for more complicated situations. The four most important selectors are: - `p`: selects all `

        ` elements. - `.title`: selects all elements with `class` "title". - `p.special`: selects all `

        ` elements with `class` "special". - `#title`: selects the element with the `id` attribute that equals "title". Id attributes must be unique within a document, so this will only ever select a single element. If you want to learn more CSS selectors I recommend starting with the fun [CSS Diner](https://flukeout.github.io/) tutorial and then referring to the [MDN web docs](https://developer.mozilla.org/en-US/docs/Web/CSS/CSS_Selectors). Let's try out the most important selectors with a simple example: ```{r} html <- minimal_html("

        This is a heading

        This is a paragraph

        This is an important paragraph

        ") ``` In rvest you can extract a single element with `html_element()` or all matching elements with `html_elements()`. Both functions take a document[^3] and a css selector: [^3]: Or another element, more on that shortly. ```{r} html %>% html_element("h1") html %>% html_elements("p") html %>% html_elements(".important") html %>% html_elements("#first") ``` Selectors can also be combined in various ways using **combinators**. For example,The most important combinator is " ", the **descendant** combination, because `p a` selects all `
        ` elements that are a descendant of a `

        ` element. If you don't know exactly what selector you need, I highly recommend using [SelectorGadget](https://rvest.tidyverse.org/articles/selectorgadget.html), which lets you automatically generate the selector you need by supplying positive and negative examples in the browser. ## Extracting data Now that you've got the elements you care about, you'll need to get data out of them. You'll usually get the data from either the text contents or an attribute. But, sometimes (if you're lucky!), the data you need will be in an HTML table. ### Text Use `html_text2()` to extract the plain text contents of an HTML element: ```{r} html <- minimal_html("

        1. apple & pear
        2. banana
        3. pineapple
        ") html %>% html_elements("li") %>% html_text2() ``` Note that the escaped ampersand is automatically converted to `&`; you'll only ever see HTML escapes in the source HTML, not in the data returned by rvest. You might wonder why I used `html_text2()`, since it seems to give the same result as `html_text()`: ```{r} html %>% html_elements("li") %>% html_text() ``` The main difference is how the two functions handle white space. In HTML, white space is largely ignored, and it's the structure of the elements that defines how text is laid out. `html_text2()` does its best to follow the same rules, giving you something similar to what you'd see in the browser. Take this example which contains a bunch of white space that HTML ignores. ```{r} html <- minimal_html("

        This is a paragraph.

        This is another paragraph. It has two sentences.

        ") ``` `html_text2()` gives you what you expect: two paragraphs of text separated by a blank line. ```{r} html %>% html_element("body") %>% html_text2() %>% cat() ``` Whereas `html_text()` returns the garbled raw underlying text: ```{r} html %>% html_element("body") %>% html_text() %>% cat() ``` ### Attributes Attributes are used to record the destination of links (the `href` attribute of `
        ` elements) and the source of images (the `src` attribute of the `<img>` element): ```{r} html <- minimal_html("

        cats

        ") ``` The value of an attribute can be retrieved with `html_attr()`: ```{r} html %>% html_elements("a") %>% html_attr("href") html %>% html_elements("img") %>% html_attr("src") ``` Note that `html_attr()` always returns a string, so you may need to post-process with `as.integer()`/`readr::parse_integer()` or similar. ```{r} html %>% html_elements("img") %>% html_attr("width") html %>% html_elements("img") %>% html_attr("width") %>% as.integer() ``` ### Tables HTML tables are composed four main elements: ``, `` (table row), `
        ` (table heading), and `` (table data). Here's a simple HTML table with two columns and three rows: ```{r} html <- minimal_html("
        x y
        1.5 2.7
        4.9 1.3
        7.2 8.1
        ") ``` Because tables are a common way to store data, rvest includes the handy `html_table()` which converts a table into a data frame: ```{r} html %>% html_node("table") %>% html_table() ``` ## Element vs elements When using rvest, your eventual goal is usually to build up a data frame, and you want each row to correspond to some repeated unit on the HTML page. In this case, you should generally start by using `html_elements()` to select the elements that contain each observation then use `html_element()` to extract the variables from each observation. This guarantees that you'll get the same number of values for each variable because `html_element()` always returns the same number of outputs as inputs. To illustrate this problem take a look at this simple example I constructed using a few entries from `dplyr::starwars`: ```{r} html <- minimal_html("
        • C-3PO is a droid that weighs 167 kg
        • R2-D2 is a droid that weighs 96 kg
        • Yoda weighs 66 kg
        • R4-P17 is a droid
        ") ``` If you try to extract name, species, and weight directly, you end up with one vector of length four and two vectors of length three, and no way to align them: ```{r} html %>% html_elements("b") %>% html_text2() html %>% html_elements("i") %>% html_text2() html %>% html_elements(".weight") %>% html_text2() ``` Instead, use `html_elements()` to find a element that corresponds to each character, then use `html_element()` to extract each variable for all observations: ```{r} characters <- html %>% html_elements("li") characters %>% html_element("b") %>% html_text2() characters %>% html_element("i") %>% html_text2() characters %>% html_element(".weight") %>% html_text2() ``` `html_element()` automatically fills in `NA` when no elements match, keeping all of the variables aligned and making it easy to create a data frame: ```{r} data.frame( name = characters %>% html_element("b") %>% html_text2(), species = characters %>% html_element("i") %>% html_text2(), weight = characters %>% html_element(".weight") %>% html_text2() ) ``` rvest/inst/doc/rvest.html0000644000176200001440000012750014562450666015213 0ustar liggesusers Web scraping 101

        Web scraping 101

        This vignette introduces you to the basics of web scraping with rvest. You’ll first learn the basics of HTML and how to use CSS selectors to refer to specific elements, then you’ll learn how to use rvest functions to get data out of HTML and into R.

        library(rvest)

        HTML basics

        HTML stands for “HyperText Markup Language” and looks like this:

        <html>
        <head>
          <title>Page title</title>
        </head>
        <body>
          <h1 id='first'>A heading</h1>
          <p>Some text &amp; <b>some bold text.</b></p>
          <img src='myimg.png' width='100' height='100'>
        </body>

        HTML has a hierarchical structure formed by elements which consist of a start tag (e.g. <tag>), optional attributes (id='first'), an end tag1 (like </tag>), and contents (everything in between the start and end tag).

        Since < and > are used for start and end tags, you can’t write them directly. Instead you have to use the HTML escapes &gt; (greater than) and &lt; (less than). And since those escapes use &, if you want a literal ampersand you have to escape it as &amp;. There are a wide range of possible HTML escapes but you don’t need to worry about them too much because rvest automatically handles them for you.

        Elements

        All up, there are over 100 HTML elements. Some of the most important are:

        • Every HTML page must be in an <html> element, and it must have two children: <head>, which contains document metadata like the page title, and <body>, which contains the content you see in the browser.

        • Block tags like <h1> (heading 1), <p> (paragraph), and <ol> (ordered list) form the overall structure of the page.

        • Inline tags like <b> (bold), <i> (italics), and <a> (links) format text inside block tags.

        If you encounter a tag that you’ve never seen before, you can find out what it does with a little googling. I recommend the MDN Web Docs which are produced by Mozilla, the company that makes the Firefox web browser.

        Contents

        Most elements can have content in between their start and end tags. This content can either be text or more elements. For example, the following HTML contains a paragraph of text, with one word in bold.

        <p>Hi! My <b>name</b> is Hadley.</p>

        The children of a node refer only to elements, so the <p> element above has one child, the <b> element. The <b> element has no children, but it does have contents (the text “name”).

        Some elements, like <img> can’t have children. These elements depend solely on attributes for their behavior.

        Attributes

        Tags can have named attributes which look like name1='value1' name2='value2'. Two of the most important attributes are id and class, which are used in conjunction with CSS (Cascading Style Sheets) to control the visual appearance of the page. These are often useful when scraping data off a page.

        Reading HTML with rvest

        You’ll usually start the scraping process with read_html(). This returns an xml_document2 object which you’ll then manipulate using rvest functions:

        html <- read_html("http://rvest.tidyverse.org/")
        class(html)
        #> [1] "xml_document" "xml_node"

        For examples and experimentation, rvest also includes a function that lets you create an xml_document from literal HTML:

        html <- minimal_html("
          <p>This is a paragraph<p>
          <ul>
            <li>This is a bulleted list</li>
          </ul>
        ")
        html
        #> {html_document}
        #> <html>
        #> [1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8 ...
        #> [2] <body>\n<p>This is a paragraph</p>\n<p>\n  </p>\n<ul>\n<li>This is a bull ...

        Regardless of how you get the HTML, you’ll need some way to identify the elements that contain the data you care about. rvest provides two options: CSS selectors and XPath expressions. Here I’ll focus on CSS selectors because they’re simpler but still sufficiently powerful for most scraping tasks.

        CSS selectors

        CSS is short for cascading style sheets, and is a tool for defining the visual styling of HTML documents. CSS includes a miniature language for selecting elements on a page called CSS selectors. CSS selectors define patterns for locating HTML elements, and are useful for scraping because they provide a concise way of describing which elements you want to extract.

        CSS selectors can be quite complex, but fortunately you only need the simplest for rvest, because you can also write R code for more complicated situations. The four most important selectors are:

        • p: selects all <p> elements.

        • .title: selects all elements with class “title”.

        • p.special: selects all <p> elements with class “special”.

        • #title: selects the element with the id attribute that equals “title”. Id attributes must be unique within a document, so this will only ever select a single element.

        If you want to learn more CSS selectors I recommend starting with the fun CSS Diner tutorial and then referring to the MDN web docs.

        Let’s try out the most important selectors with a simple example:

        html <- minimal_html("
          <h1>This is a heading</h1>
          <p id='first'>This is a paragraph</p>
          <p class='important'>This is an important paragraph</p>
        ")

        In rvest you can extract a single element with html_element() or all matching elements with html_elements(). Both functions take a document3 and a CSS selector:

        html %>% html_element("h1")
        #> {html_node}
        #> <h1>
        html %>% html_elements("p")
        #> {xml_nodeset (2)}
        #> [1] <p id="first">This is a paragraph</p>
        #> [2] <p class="important">This is an important paragraph</p>
        html %>% html_elements(".important")
        #> {xml_nodeset (1)}
        #> [1] <p class="important">This is an important paragraph</p>
        html %>% html_elements("#first")
        #> {xml_nodeset (1)}
        #> [1] <p id="first">This is a paragraph</p>

        Selectors can also be combined in various ways using combinators. The most important combinator is “ ” (a space), the descendant combinator: for example, p a selects all <a> elements that are descendants of a <p> element.

        If you don’t know exactly what selector you need, I highly recommend using SelectorGadget, which lets you automatically generate the selector you need by supplying positive and negative examples in the browser.

        Extracting data

        Now that you’ve got the elements you care about, you’ll need to get data out of them. You’ll usually get the data from either the text contents or an attribute. But, sometimes (if you’re lucky!), the data you need will be in an HTML table.

        Text

        Use html_text2() to extract the plain text contents of an HTML element:

        html <- minimal_html("
          <ol>
            <li>apple &amp; pear</li>
            <li>banana</li>
            <li>pineapple</li>
          </ol>
        ")
        html %>% 
          html_elements("li") %>% 
          html_text2()
        #> [1] "apple & pear" "banana"       "pineapple"

        Note that the escaped ampersand is automatically converted to &; you’ll only ever see HTML escapes in the source HTML, not in the data returned by rvest.

        You might wonder why I used html_text2(), since it seems to give the same result as html_text():

        html %>% 
          html_elements("li") %>% 
          html_text()
        #> [1] "apple & pear" "banana"       "pineapple"

        The main difference is how the two functions handle white space. In HTML, white space is largely ignored, and it’s the structure of the elements that defines how text is laid out. html_text2() does its best to follow the same rules, giving you something similar to what you’d see in the browser. Take this example which contains a bunch of white space that HTML ignores.

        html <- minimal_html("<body>
          <p>
          This is
          a
          paragraph.</p><p>This is another paragraph.
          
          It has two sentences.</p>
        ")

        html_text2() gives you what you expect: two paragraphs of text separated by a blank line.

        html %>% 
          html_element("body") %>% 
          html_text2() %>% 
          cat()
        #> This is a paragraph.
        #> 
        #> This is another paragraph. It has two sentences.

        Whereas html_text() returns the garbled raw underlying text:

        html %>% 
          html_element("body") %>% 
          html_text() %>% 
          cat()
        #> 
        #>   
        #>   This is
        #>   a
        #>   paragraph.This is another paragraph.
        #>   
        #>   It has two sentences.

        Attributes

        Attributes are used to record the destination of links (the href attribute of <a> elements) and the source of images (the src attribute of the <img> element):

        html <- minimal_html("
          <p><a href='https://en.wikipedia.org/wiki/Cat'>cats</a></p>
          <img src='https://cataas.com/cat' width='100' height='200'>
        ")

        The value of an attribute can be retrieved with html_attr():

        html %>% 
          html_elements("a") %>% 
          html_attr("href")
        #> [1] "https://en.wikipedia.org/wiki/Cat"
        
        html %>% 
          html_elements("img") %>% 
          html_attr("src")
        #> [1] "https://cataas.com/cat"

        Note that html_attr() always returns a string, so you may need to post-process with as.integer()/readr::parse_integer() or similar.

        html %>% 
          html_elements("img") %>% 
          html_attr("width")
        #> [1] "100"
        
        html %>% 
          html_elements("img") %>% 
          html_attr("width") %>% 
          as.integer()
        #> [1] 100

        Tables

        HTML tables are composed of four main elements: <table>, <tr> (table row), <th> (table heading), and <td> (table data). Here’s a simple HTML table with two columns and three rows:

        html <- minimal_html("
          <table>
            <tr>
              <th>x</th>
              <th>y</th>
            </tr>
            <tr>
              <td>1.5</td>
              <td>2.7</td>
            </tr>
            <tr>
              <td>4.9</td>
              <td>1.3</td>
            </tr>
            <tr>
              <td>7.2</td>
              <td>8.1</td>
            </tr>
          </table>
          ")

        Because tables are a common way to store data, rvest includes the handy html_table() which converts a table into a data frame:

        html %>% 
          html_node("table") %>% 
          html_table()
        #> # A tibble: 3 × 2
        #>       x     y
        #>   <dbl> <dbl>
        #> 1   1.5   2.7
        #> 2   4.9   1.3
        #> 3   7.2   8.1

        Element vs elements

        When using rvest, your eventual goal is usually to build up a data frame, and you want each row to correspond to some repeated unit on the HTML page. In this case, you should generally start by using html_elements() to select the elements that contain each observation then use html_element() to extract the variables from each observation. This guarantees that you’ll get the same number of values for each variable because html_element() always returns the same number of outputs as inputs.

        To illustrate this problem take a look at this simple example I constructed using a few entries from dplyr::starwars:

        html <- minimal_html("
          <ul>
            <li><b>C-3PO</b> is a <i>droid</i> that weighs <span class='weight'>167 kg</span></li>
            <li><b>R2-D2</b> is a <i>droid</i> that weighs <span class='weight'>96 kg</span></li>
            <li><b>Yoda</b> weighs <span class='weight'>66 kg</span></li>
            <li><b>R4-P17</b> is a <i>droid</i></li>
          </ul>
          ")

        If you try to extract name, species, and weight directly, you end up with one vector of length four and two vectors of length three, and no way to align them:

        html %>% html_elements("b") %>% html_text2()
        #> [1] "C-3PO"  "R2-D2"  "Yoda"   "R4-P17"
        html %>% html_elements("i") %>% html_text2()
        #> [1] "droid" "droid" "droid"
        html %>% html_elements(".weight") %>% html_text2()
        #> [1] "167 kg" "96 kg"  "66 kg"

        Instead, use html_elements() to find an element that corresponds to each character, then use html_element() to extract each variable for all observations:

        characters <- html %>% html_elements("li")
        
        characters %>% html_element("b") %>% html_text2()
        #> [1] "C-3PO"  "R2-D2"  "Yoda"   "R4-P17"
        characters %>% html_element("i") %>% html_text2()
        #> [1] "droid" "droid" NA      "droid"
        characters %>% html_element(".weight") %>% html_text2()
        #> [1] "167 kg" "96 kg"  "66 kg"  NA

        html_element() automatically fills in NA when no elements match, keeping all of the variables aligned and making it easy to create a data frame:

        data.frame(
          name = characters %>% html_element("b") %>% html_text2(),
          species = characters %>% html_element("i") %>% html_text2(),
          weight = characters %>% html_element(".weight") %>% html_text2()
        )
        #>     name species weight
        #> 1  C-3PO   droid 167 kg
        #> 2  R2-D2   droid  96 kg
        #> 3   Yoda    <NA>  66 kg
        #> 4 R4-P17   droid   <NA>

        1. A number of tags (including <p> and <li>) don’t require end tags, but I think it’s best to include them because it makes seeing the structure of the HTML a little easier.↩︎

        2. This class comes from the xml2 package. xml2 is a low-level package that rvest builds on top of.↩︎

        3. Or another element, more on that shortly.↩︎

        rvest/inst/doc/starwars.html0000644000176200001440000002266314562450666015722 0ustar liggesusers Star Wars films

        Star Wars films

        This vignette contains some data about the Star Wars films for use in rvest examples and vignettes.

        The Phantom Menace

        Released: 1999-05-19

        Director: George Lucas

        Turmoil has engulfed the Galactic Republic. The taxation of trade routes to outlying star systems is in dispute.

        Hoping to resolve the matter with a blockade of deadly battleships, the greedy Trade Federation has stopped all shipping to the small planet of Naboo.

        While the Congress of the Republic endlessly debates this alarming chain of events, the Supreme Chancellor has secretly dispatched two Jedi Knights, the guardians of peace and justice in the galaxy, to settle the conflict….

        Attack of the Clones

        Released: 2002-05-16

        Director: George Lucas

        There is unrest in the Galactic Senate. Several thousand solar systems have declared their intentions to leave the Republic.

        This separatist movement, under the leadership of the mysterious Count Dooku, has made it difficult for the limited number of Jedi Knights to maintain peace and order in the galaxy.

        Senator Amidala, the former Queen of Naboo, is returning to the Galactic Senate to vote on the critical issue of creating an ARMY OF THE REPUBLIC to assist the overwhelmed Jedi….

        Revenge of the Sith

        Released: 2005-05-19

        Director: George Lucas

        War! The Republic is crumbling under attacks by the ruthless Sith Lord, Count Dooku. There are heroes on both sides. Evil is everywhere.

        In a stunning move, the fiendish droid leader, General Grievous, has swept into the Republic capital and kidnapped Chancellor Palpatine, leader of the Galactic Senate.

        As the Separatist Droid Army attempts to flee the besieged capital with their valuable hostage, two Jedi Knights lead a desperate mission to rescue the captive Chancellor….

        A New Hope

        Released: 1977-05-25

        Director: George Lucas

        It is a period of civil war. Rebel spaceships, striking from a hidden base, have won their first victory against the evil Galactic Empire.

        During the battle, Rebel spies managed to steal secret plans to the Empire’s ultimate weapon, the DEATH STAR, an armored space station with enough power to destroy an entire planet.

        Pursued by the Empire’s sinister agents, Princess Leia races home aboard her starship, custodian of the stolen plans that can save her people and restore freedom to the galaxy….

        The Empire Strikes Back

        Released: 1980-05-17

        Director: Irvin Kershner

        It is a dark time for the Rebellion. Although the Death Star has been destroyed, Imperial troops have driven the Rebel forces from their hidden base and pursued them across the galaxy.

        Evading the dreaded Imperial Starfleet, a group of freedom fighters led by Luke Skywalker has established a new secret base on the remote ice world of Hoth.

        The evil lord Darth Vader, obsessed with finding young Skywalker, has dispatched thousands of remote probes into the far reaches of space….

        Return of the Jedi

        Released: 1983-05-25

        Director: Richard Marquand

        Luke Skywalker has returned to his home planet of Tatooine in an attempt to rescue his friend Han Solo from the clutches of the vile gangster Jabba the Hutt.

        Little does Luke know that the GALACTIC EMPIRE has secretly begun construction on a new armored space station even more powerful than the first dreaded Death Star.

        When completed, this ultimate weapon will spell certain doom for the small band of rebels struggling to restore freedom to the galaxy…

        The Force Awakens

        Released: 2015-12-11

        Director: J. J. Abrams

        Luke Skywalker has vanished. In his absence, the sinister FIRST ORDER has risen from the ashes of the Empire and will not rest until Skywalker, the last Jedi, has been destroyed. With the support of the REPUBLIC, General Leia Organa leads a brave RESISTANCE. She is desperate to find her brother Luke and gain his help in restoring peace and justice to the galaxy. Leia has sent her most daring pilot on a secret mission to Jakku, where an old ally has discovered a clue to Luke’s whereabouts….

        rvest/inst/doc/starwars.R0000644000176200001440000000126314562450666015150 0ustar liggesusers## ----echo = FALSE, results = "asis"------------------------------------------- library(rvest) crawl_html <- function(x) { x %>% gsub("\r", "", .) %>% gsub("\n\n", "

        ", .) %>% gsub("\n", " ", .) %>% paste0("

        ", ., "

        ") } film_desc <- function(x) { glue::glue_data(x, "

        {title}

        Released: {release_date}

        Director: {director}

        {crawl_html(opening_crawl)}
        ") } films <- repurrrsive::sw_films films <- films[order(sapply(films, "[[", "episode_id"))] descs <- vapply(films, film_desc, character(1)) writeLines(descs) rvest/inst/doc/starwars.Rmd0000644000176200001440000000165113775161316015466 0ustar liggesusers--- title: "Star Wars films" output: rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{Star Wars films} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- This vignette contains some data about the Star Wars films for use in rvest examples and vignettes. ```{r, echo = FALSE, results = "asis"} library(rvest) crawl_html <- function(x) { x %>% gsub("\r", "", .) %>% gsub("\n\n", "

        ", .) %>% gsub("\n", " ", .) %>% paste0("

        ", ., "

        ") } film_desc <- function(x) { glue::glue_data(x, "

        {title}

        Released: {release_date}

        Director: {director}

        {crawl_html(opening_crawl)}
        ") } films <- repurrrsive::sw_films films <- films[order(sapply(films, "[[", "episode_id"))] descs <- vapply(films, film_desc, character(1)) writeLines(descs) ``` rvest/inst/doc/rvest.R0000644000176200001440000001022714562450665014444 0ustar liggesusers## ----echo=FALSE--------------------------------------------------------------- knitr::opts_chunk$set(comment = "#>", collapse = TRUE) ## ----------------------------------------------------------------------------- library(rvest) ## ----------------------------------------------------------------------------- html <- read_html("http://rvest.tidyverse.org/") class(html) ## ----------------------------------------------------------------------------- html <- minimal_html("

        This is a paragraph

        • This is a bulleted list
        ") html ## ----------------------------------------------------------------------------- html <- minimal_html("

        This is a heading

        This is a paragraph

        This is an important paragraph

        ") ## ----------------------------------------------------------------------------- html %>% html_element("h1") html %>% html_elements("p") html %>% html_elements(".important") html %>% html_elements("#first") ## ----------------------------------------------------------------------------- html <- minimal_html("
        1. apple & pear
        2. banana
        3. pineapple
        ") html %>% html_elements("li") %>% html_text2() ## ----------------------------------------------------------------------------- html %>% html_elements("li") %>% html_text() ## ----------------------------------------------------------------------------- html <- minimal_html("

        This is a paragraph.

        This is another paragraph. It has two sentences.

        ") ## ----------------------------------------------------------------------------- html %>% html_element("body") %>% html_text2() %>% cat() ## ----------------------------------------------------------------------------- html %>% html_element("body") %>% html_text() %>% cat() ## ----------------------------------------------------------------------------- html <- minimal_html("

        cats

        ") ## ----------------------------------------------------------------------------- html %>% html_elements("a") %>% html_attr("href") html %>% html_elements("img") %>% html_attr("src") ## ----------------------------------------------------------------------------- html %>% html_elements("img") %>% html_attr("width") html %>% html_elements("img") %>% html_attr("width") %>% as.integer() ## ----------------------------------------------------------------------------- html <- minimal_html("
        x y
        1.5 2.7
        4.9 1.3
        7.2 8.1
        ") ## ----------------------------------------------------------------------------- html %>% html_node("table") %>% html_table() ## ----------------------------------------------------------------------------- html <- minimal_html("
        • C-3PO is a droid that weighs 167 kg
        • R2-D2 is a droid that weighs 96 kg
        • Yoda weighs 66 kg
        • R4-P17 is a droid
        ") ## ----------------------------------------------------------------------------- html %>% html_elements("b") %>% html_text2() html %>% html_elements("i") %>% html_text2() html %>% html_elements(".weight") %>% html_text2() ## ----------------------------------------------------------------------------- characters <- html %>% html_elements("li") characters %>% html_element("b") %>% html_text2() characters %>% html_element("i") %>% html_text2() characters %>% html_element(".weight") %>% html_text2() ## ----------------------------------------------------------------------------- data.frame( name = characters %>% html_element("b") %>% html_text2(), species = characters %>% html_element("i") %>% html_text2(), weight = characters %>% html_element(".weight") %>% html_text2() ) rvest/inst/html-ex/0000755000176200001440000000000013767413737013772 5ustar liggesusersrvest/inst/html-ex/bad-encoding.html0000644000176200001440000000014613767413737017173 0ustar liggesusers Bad encoding

        migr cause clbre dj vu.

        rvest/inst/WORDLIST0000644000176200001440000000054014562444072013573 0ustar liggesusersCDATA CMD Codecov DTD HyperText IMDB LiveHTML MDN PBC PSVI RoboBrowser SelectorGadget XINCLUDE XInclude XPath arounds bookmarklet chromote cloneable colspan colspans combinator combinators config css fixup funder hardcoded httr innerText ith javascript libxml magrittr nodeset rowspan rowspans selendir substitition tibble tibbles tidyverse uris xpath