rvest/ 0000755 0001762 0000144 00000000000 14562475432 011431 5 ustar ligges users rvest/NAMESPACE 0000644 0001762 0000144 00000004116 14557166073 012654 0 ustar ligges users # Generated by roxygen2: do not edit by hand
S3method(cookies,rvest_session)
S3method(format,rvest_field)
S3method(headers,rvest_session)
S3method(html_element,LiveHTML)
S3method(html_element,default)
S3method(html_element,rvest_session)
S3method(html_elements,LiveHTML)
S3method(html_elements,default)
S3method(html_elements,rvest_session)
S3method(html_form,rvest_session)
S3method(html_form,xml_document)
S3method(html_form,xml_node)
S3method(html_form,xml_nodeset)
S3method(html_table,LiveHTML)
S3method(html_table,rvest_session)
S3method(html_table,xml_document)
S3method(html_table,xml_node)
S3method(html_table,xml_nodeset)
S3method(html_text2,xml_document)
S3method(html_text2,xml_missing)
S3method(html_text2,xml_node)
S3method(html_text2,xml_nodeset)
S3method(print,rvest_field)
S3method(print,rvest_form)
S3method(print,rvest_session)
S3method(read_html,rvest_session)
S3method(status_code,rvest_session)
export("%>%")
export(LiveHTML)
export(back)
export(follow_link)
export(forward)
export(google_form)
export(guess_encoding)
export(html_attr)
export(html_attrs)
export(html_children)
export(html_element)
export(html_elements)
export(html_encoding_guess)
export(html_form)
export(html_form_set)
export(html_form_submit)
export(html_name)
export(html_node)
export(html_nodes)
export(html_session)
export(html_table)
export(html_text)
export(html_text2)
export(is.session)
export(jump_to)
export(minimal_html)
export(read_html)
export(read_html_live)
export(repair_encoding)
export(session)
export(session_back)
export(session_follow_link)
export(session_forward)
export(session_history)
export(session_jump_to)
export(session_submit)
export(set_values)
export(submit_form)
export(url_absolute)
export(xml_node)
export(xml_nodes)
export(xml_tag)
import(rlang)
importFrom(glue,glue)
importFrom(httr,cookies)
importFrom(httr,headers)
importFrom(httr,status_code)
importFrom(lifecycle,deprecated)
importFrom(magrittr,"%>%")
importFrom(xml2,read_html)
importFrom(xml2,url_absolute)
importFrom(xml2,xml_attr)
importFrom(xml2,xml_attrs)
importFrom(xml2,xml_children)
importFrom(xml2,xml_name)
importFrom(xml2,xml_text)
rvest/demo/ 0000755 0001762 0000144 00000000000 13775436633 012362 5 ustar ligges users rvest/demo/united.R 0000644 0001762 0000144 00000000712 13775436633 013775 0 ustar ligges users # Scrape miles from united site
library(rvest)

# NOTE: `password` is not defined anywhere in this script. Assign it (for
# example from an environment variable or an interactive prompt) before
# running the demo, otherwise html_form_set() below fails on the missing
# object.

# Start a session on the United home page.
united <- session("http://www.united.com/")

# Find the login form by its name attribute, parse it, and fill in the
# account number and password fields.
login <- united %>%
  html_element("form[name=LoginForm]") %>%
  html_form() %>%
  html_form_set(
    MpNumber = "GY797363",
    Password = password
  )

# Submit the filled-in form through the session.
logged_in <- united %>% session_submit(login)

# Follow the "View account" link and parse the text of the miles element
# as a number. session_follow_link() is the current name of follow_link(),
# which was deprecated when rvest 1.0.0 renamed the session functions; the
# link text is still passed as the first argument after the session.
logged_in %>%
  session_follow_link("View account") %>%
  html_element("#ctl00_ContentInfo_AccountSummary_spanEliteMilesNew") %>%
  html_text() %>%
  readr::parse_number()
rvest/demo/zillow.R 0000644 0001762 0000144 00000001514 13775423476 014027 0 ustar ligges users # Inspired by https://github.com/notesofdabbler
library(rvest)
library(tidyr)

# Download and parse the search-results page.
page <- read_html("http://www.zillow.com/homes/for_sale/Greenwood-IN/fsba,fsbo,fore,cmsn_lt/house_type/52333_rid/39.638414,-86.011362,39.550714,-86.179419_rect/12_zm/0_mmm/")

# One node per listing card; everything below is extracted per card.
houses <- html_elements(page, ".photo-cards li article")

# The id attribute of each card.
z_id <- html_attr(houses, "id")

# Address text of each card.
address <- html_element(houses, ".zsg-photo-card-address") %>%
  html_text()

# Price text of each card, parsed to a number.
price <- html_element(houses, ".zsg-photo-card-price") %>%
  html_text() %>%
  readr::parse_number()

# The info text is split on the middle dot ("\u00b7") into its pieces.
params <- html_element(houses, ".zsg-photo-card-info") %>%
  html_text() %>%
  strsplit("\u00b7")

# Take the first, second and third piece of every card and parse each as a
# number.
beds <- readr::parse_number(purrr::map_chr(params, 1))
baths <- readr::parse_number(purrr::map_chr(params, 2))
house_area <- readr::parse_number(purrr::map_chr(params, 3))
rvest/demo/00Index 0000644 0001762 0000144 00000000217 13767413737 013515 0 ustar ligges users united Scrape mileage details from united.com
tripadvisor Scrape review data from tripadvisor
zillow Scrape housing info from zillow
rvest/demo/tripadvisor.R 0000644 0001762 0000144 00000001561 13775423515 015051 0 ustar ligges users # Inspired by
# http://notesofdabbler.github.io/201408_hotelReview/scrapeTripAdvisor.html
library(rvest)

url <- "http://www.tripadvisor.com/Hotel_Review-g37209-d1762915-Reviews-JW_Marriott_Indianapolis-Indianapolis_Indiana.html"

# Parse the page and select the review nodes; each field below is then
# extracted once per review.
reviews <- html_elements(read_html(url), "#REVIEWS .innerBubble")

# id attribute of the link inside each review's quote.
id <- html_attr(html_element(reviews, ".quote a"), "id")

# Text of the quote itself.
quote <- html_text(html_element(reviews, ".quote span"))

# The rating is read from the alt attribute: remove the " of 5 stars" text
# and convert what remains to an integer.
rating <- html_element(reviews, ".rating .rating_s_fill") %>%
  html_attr("alt") %>%
  gsub(" of 5 stars", "", .) %>%
  as.integer()

# The review date is read from the title attribute and parsed with the
# "%b %d, %Y" format before conversion to POSIXct.
date <- html_element(reviews, ".rating .ratingDate") %>%
  html_attr("title") %>%
  strptime("%b %d, %Y") %>%
  as.POSIXct()

# Text of the review entry.
review <- html_text(html_element(reviews, ".entry .partial_entry"))

# Combine the columns into a data frame and open it in the data viewer.
data.frame(id, quote, rating, date, review, stringsAsFactors = FALSE) %>% View()
rvest/LICENSE 0000644 0001762 0000144 00000000053 14554031036 012421 0 ustar ligges users YEAR: 2023
COPYRIGHT HOLDER: rvest authors
rvest/README.md 0000644 0001762 0000144 00000010634 14554031036 012701 0 ustar ligges users
# rvest
[CRAN status](https://cran.r-project.org/package=rvest)
[R-CMD-check](https://github.com/tidyverse/rvest/actions/workflows/R-CMD-check.yaml)
[Codecov test coverage](https://app.codecov.io/gh/tidyverse/rvest?branch=main)
## Overview
rvest helps you scrape (or harvest) data from web pages. It is designed
to work with [magrittr](https://github.com/tidyverse/magrittr) to make
it easy to express common web scraping tasks, inspired by libraries like
[beautiful soup](https://www.crummy.com/software/BeautifulSoup/) and
[RoboBrowser](http://robobrowser.readthedocs.io/en/latest/readme.html).
If you’re scraping multiple pages, I highly recommend using rvest in
concert with [polite](https://dmi3kno.github.io/polite/). The polite
package ensures that you’re respecting the
[robots.txt](https://en.wikipedia.org/wiki/Robots_exclusion_standard)
and not hammering the site with too many requests.
## Installation
``` r
# The easiest way to get rvest is to install the whole tidyverse:
install.packages("tidyverse")
# Alternatively, install just rvest:
install.packages("rvest")
```
## Usage
``` r
library(rvest)
# Start by reading a HTML page with read_html():
starwars <- read_html("https://rvest.tidyverse.org/articles/starwars.html")
# Then find elements that match a css selector or XPath expression
# using html_elements(). In this example, each <section> corresponds
# to a different film
films <- starwars %>% html_elements("section")
films
#> {xml_nodeset (7)}
#> [1]
\nThe Phantom Menace\n
\n
\nReleased: 1999 ...
#> [2]
\nAttack of the Clones\n
\n
\nReleased: 20 ...
#> [3]
\nRevenge of the Sith\n
\n
\nReleased: 200 ...
#> [4]
\nA New Hope\n
\n
\nReleased: 1977-05-25\n ...
#> [5]
\nThe Empire Strikes Back\n
\n
\nReleased: ...
#> [6]
\nReturn of the Jedi\n
\n
\nReleased: 1983 ...
#> [7]
\nThe Force Awakens\n
\n
\nReleased: 2015- ...
# Then use html_element() to extract one element per film. Here
# the title is given by the text inside the <h2> element
title <- films %>%
html_element("h2") %>%
html_text2()
title
#> [1] "The Phantom Menace" "Attack of the Clones"
#> [3] "Revenge of the Sith" "A New Hope"
#> [5] "The Empire Strikes Back" "Return of the Jedi"
#> [7] "The Force Awakens"
# Or use html_attr() to get data out of attributes. html_attr() always
# returns a string so we convert it to an integer using a readr function
episode <- films %>%
html_element("h2") %>%
html_attr("data-id") %>%
readr::parse_integer()
episode
#> [1] 1 2 3 4 5 6 7
```
If the page contains tabular data you can convert it directly to a data
frame with `html_table()`:
``` r
html <- read_html("https://en.wikipedia.org/w/index.php?title=The_Lego_Movie&oldid=998422565")
html %>%
html_element(".tracklist") %>%
html_table()
#> # A tibble: 29 × 4
#> No. Title `Performer(s)` Length
#>
#> 1 1. "\"Everything Is Awesome\"" "Tegan and Sara featuring The Lonel… 2:43
#> 2 2. "\"Prologue\"" "" 2:28
#> 3 3. "\"Emmett's Morning\"" "" 2:00
#> 4 4. "\"Emmett Falls in Love\"" "" 1:11
#> 5 5. "\"Escape\"" "" 3:26
#> 6 6. "\"Into the Old West\"" "" 1:00
#> 7 7. "\"Wyldstyle Explains\"" "" 1:21
#> 8 8. "\"Emmett's Mind\"" "" 2:17
#> 9 9. "\"The Transformation\"" "" 1:46
#> 10 10. "\"Saloons and Wagons\"" "" 3:38
#> # ℹ 19 more rows
```
rvest/man/ 0000755 0001762 0000144 00000000000 14560241772 012200 5 ustar ligges users rvest/man/html_text.Rd 0000644 0001762 0000144 00000005046 13775651250 014507 0 ustar ligges users % Generated by roxygen2: do not edit by hand
% Please edit documentation in R/text.R
\name{html_text}
\alias{html_text}
\alias{html_text2}
\title{Get element text}
\usage{
html_text(x, trim = FALSE)
html_text2(x, preserve_nbsp = FALSE)
}
\arguments{
\item{x}{A document, node, or node set.}
\item{trim}{If \code{TRUE} will trim leading and trailing spaces.}
\item{preserve_nbsp}{Should non-breaking spaces be preserved? By default,
\code{html_text2()} converts to ordinary spaces to ease further computation.
When \code{preserve_nbsp} is \code{TRUE}, \verb{&nbsp;} will appear in strings as
\code{"\\ua0"}. This often causes confusion because it prints the same way as
\code{" "}.}
}
\value{
A character vector the same length as \code{x}
}
\description{
There are two ways to retrieve text from an element: \code{html_text()} and
\code{html_text2()}. \code{html_text()} is a thin wrapper around \code{\link[xml2:xml_text]{xml2::xml_text()}}
which returns just the raw underlying text. \code{html_text2()} simulates how
text looks in a browser, using an approach inspired by JavaScript's
\href{https://developer.mozilla.org/en-US/docs/Web/API/HTMLElement/innerText}{innerText()}.
Roughly speaking, it converts \verb{<br />} to \code{"\\n"}, adds blank lines
around \verb{<p>} tags, and lightly formats tabular data.
\code{html_text2()} is usually what you want, but it is much slower than
\code{html_text()} so for simple applications where performance is important
you may want to use \code{html_text()} instead.
}
\examples{
# To understand the difference between html_text() and html_text2()
# take the following html:
html <- minimal_html(
"
This is a paragraph.
This another sentence. This should start on a new line"
)
# html_text() returns the raw underlying text, which includes whitespace
# that would be ignored by a browser, and ignores the <br>
html \%>\% html_element("p") \%>\% html_text() \%>\% writeLines()
# html_text2() simulates what a browser would display. Non-significant
# whitespace is collapsed, and <br> is turned into a line break
html \%>\% html_element("p") \%>\% html_text2() \%>\% writeLines()
# By default, html_text2() also converts non-breaking spaces to regular
# spaces:
html <- minimal_html("
x y
")
x1 <- html \%>\% html_element("p") \%>\% html_text()
x2 <- html \%>\% html_element("p") \%>\% html_text2()
# When printed, non-breaking spaces look exactly like regular spaces
x1
x2
# But aren't actually the same:
x1 == x2
# Which you can confirm by looking at their underlying binary
# representation:
charToRaw(x1)
charToRaw(x2)
}
rvest/man/google_form.Rd 0000644 0001762 0000144 00000000470 14101012310 014736 0 ustar ligges users % Generated by roxygen2: do not edit by hand
% Please edit documentation in R/utils.R
\name{google_form}
\alias{google_form}
\title{Make link to google form given id}
\usage{
google_form(x)
}
\arguments{
\item{x}{Unique identifier for form}
}
\description{
Make link to google form given id
}
\keyword{internal}
rvest/man/html_form.Rd 0000644 0001762 0000144 00000004261 14277722126 014463 0 ustar ligges users % Generated by roxygen2: do not edit by hand
% Please edit documentation in R/form.R
\name{html_form}
\alias{html_form}
\alias{html_form_set}
\alias{html_form_submit}
\title{Parse forms and set values}
\usage{
html_form(x, base_url = NULL)
html_form_set(form, ...)
html_form_submit(form, submit = NULL)
}
\arguments{
\item{x}{A document (from \code{\link[=read_html]{read_html()}}), node set (from \code{\link[=html_elements]{html_elements()}}),
node (from \code{\link[=html_element]{html_element()}}), or session (from \code{\link[=session]{session()}}).}
\item{base_url}{Base url of underlying HTML document. The default, \code{NULL},
uses the url of the HTML document underlying \code{x}.}
\item{form}{A form}
\item{...}{<\code{\link[rlang:dyn-dots]{dynamic-dots}}> Name-value pairs giving
fields to modify.
Provide a character vector to set multiple checkboxes in a set or
select multiple values from a multi-select.}
\item{submit}{Which button should be used to submit the form?
\itemize{
\item \code{NULL}, the default, uses the first button.
\item A string selects a button by its name.
\item A number selects a button using its relative position.
}}
}
\value{
\itemize{
\item \code{html_form()} returns an S3 object with class \code{rvest_form} when applied
to a single element. It returns a list of \code{rvest_form} objects when
applied to multiple elements or a document.
\item \code{html_form_set()} returns an \code{rvest_form} object.
\item \code{html_form_submit()} submits the form, returning an httr response which
can be parsed with \code{\link[=read_html]{read_html()}}.
}
}
\description{
Use \code{html_form()} to extract a form, set values with \code{html_form_set()},
and submit it with \code{html_form_submit()}.
}
\examples{
html <- read_html("http://www.google.com")
search <- html_form(html)[[1]]
search <- search \%>\% html_form_set(q = "My little pony", hl = "fr")
# Or if you have a list of values, use !!!
vals <- list(q = "web scraping", hl = "en")
search <- search \%>\% html_form_set(!!!vals)
# To submit and get result:
\dontrun{
resp <- html_form_submit(search)
read_html(resp)
}
}
\seealso{
HTML 4.01 form specification:
\url{https://www.w3.org/TR/html401/interact/forms.html}
}
rvest/man/rename.Rd 0000644 0001762 0000144 00000003256 14014035320 013724 0 ustar ligges users % Generated by roxygen2: do not edit by hand
% Please edit documentation in R/rename.R
\name{rename}
\alias{set_values}
\alias{submit_form}
\alias{xml_tag}
\alias{xml_node}
\alias{xml_nodes}
\alias{html_nodes}
\alias{html_node}
\alias{back}
\alias{forward}
\alias{jump_to}
\alias{follow_link}
\alias{html_session}
\title{Functions renamed in rvest 1.0.0}
\usage{
set_values(form, ...)
submit_form(session, form, submit = NULL, ...)
xml_tag(x)
xml_node(...)
xml_nodes(...)
html_nodes(...)
html_node(...)
back(x)
forward(x)
jump_to(x, url, ...)
follow_link(x, ...)
html_session(url, ...)
}
\description{
\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#deprecated}{\figure{lifecycle-deprecated.svg}{options: alt='[Deprecated]'}}}{\strong{[Deprecated]}}
rvest 1.0.0 renamed a number of functions to ensure that every function
has a common prefix, matching tidyverse conventions that emerged since
rvest was first created.
\itemize{
\item \code{set_values()} -> \code{html_form_set()}
\item \code{submit_form()} -> \code{session_submit()}
\item \code{xml_tag()} -> \code{html_name()}
\item \code{xml_node()} & \code{html_node()} -> \code{html_element()}
\item \code{xml_nodes()} & \code{html_nodes()} -> \code{html_elements()}
}
(\code{html_node()} and \code{html_nodes()} are only superseded because they're
so widely used.)
Additionally all session related functions gained a common prefix:
\itemize{
\item \code{html_session()} -> \code{session()}
\item \code{forward()} -> \code{session_forward()}
\item \code{back()} -> \code{session_back()}
\item \code{jump_to()} -> \code{session_jump_to()}
\item \code{follow_link()} -> \code{session_follow_link()}
}
}
\keyword{internal}
rvest/man/read_html_live.Rd 0000644 0001762 0000144 00000004057 14562443630 015452 0 ustar ligges users % Generated by roxygen2: do not edit by hand
% Please edit documentation in R/live.R
\name{read_html_live}
\alias{read_html_live}
\title{Live web scraping (with chromote)}
\usage{
read_html_live(url)
}
\arguments{
\item{url}{Website url to read from.}
}
\value{
\code{read_html_live()} returns an R6 \link{LiveHTML} object. You can interact
with this object using the usual rvest functions, or call its methods,
like \verb{$click()}, \verb{$scroll_to()}, and \verb{$type()} to interact with the live
page like a human would.
}
\description{
\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}}
\code{\link[=read_html]{read_html()}} operates on the HTML source code downloaded from the server.
This works for most websites but can fail if the site uses javascript to
generate the HTML. \code{read_html_live()} provides an alternative interface
that runs a live web browser (Chrome) in the background. This allows you to
access elements of the HTML page that are generated dynamically by javascript
and to interact with the live page by clicking on buttons or typing in
forms.
Behind the scenes, this function uses the
\href{https://rstudio.github.io/chromote/}{chromote} package, which requires that
you have a copy of \href{https://www.google.com/chrome/}{Google Chrome} installed
on your machine.
}
\examples{
\dontrun{
# When we retrieve the raw HTML for this site, it doesn't contain the
# data we're interested in:
static <- read_html("https://www.forbes.com/top-colleges/")
static \%>\% html_elements(".TopColleges2023_tableRow__BYOSU")
# Instead, we need to run the site in a real web browser, causing it to
# download a JSON file and then dynamically generate the html:
sess <- read_html_live("https://www.forbes.com/top-colleges/")
sess$view()
rows <- sess \%>\% html_elements(".TopColleges2023_tableRow__BYOSU")
rows \%>\% html_element(".TopColleges2023_organizationName__J1lEV") \%>\% html_text()
rows \%>\% html_element(".grant-aid") \%>\% html_text()
}
}
rvest/man/minimal_html.Rd 0000644 0001762 0000144 00000000663 13775437157 015161 0 ustar ligges users % Generated by roxygen2: do not edit by hand
% Please edit documentation in R/utils.R
\name{minimal_html}
\alias{minimal_html}
\title{Create an HTML document from inline HTML}
\usage{
minimal_html(html, title = "")
}
\arguments{
\item{html}{HTML contents of page.}
\item{title}{Page title (required by HTML spec).}
}
\description{
Create an HTML document from inline HTML
}
\examples{
minimal_html("
test
")
}
\keyword{internal}
rvest/man/html_name.Rd 0000644 0001762 0000144 00000001256 13776122153 014436 0 ustar ligges users % Generated by roxygen2: do not edit by hand
% Please edit documentation in R/html.R
\name{html_name}
\alias{html_name}
\title{Get element name}
\usage{
html_name(x)
}
\arguments{
\item{x}{A document (from \code{\link[=read_html]{read_html()}}), node set (from \code{\link[=html_elements]{html_elements()}}),
node (from \code{\link[=html_element]{html_element()}}), or session (from \code{\link[=session]{session()}}).}
}
\value{
A character vector the same length as \code{x}
}
\description{
Get element name
}
\examples{
url <- "https://rvest.tidyverse.org/articles/starwars.html"
html <- read_html(url)
html \%>\%
html_element("div") \%>\%
html_children() \%>\%
html_name()
}
rvest/man/html_attr.Rd 0000644 0001762 0000144 00000002423 14132341320 014447 0 ustar ligges users % Generated by roxygen2: do not edit by hand
% Please edit documentation in R/html.R
\name{html_attr}
\alias{html_attr}
\alias{html_attrs}
\title{Get element attributes}
\usage{
html_attr(x, name, default = NA_character_)
html_attrs(x)
}
\arguments{
\item{x}{A document (from \code{\link[=read_html]{read_html()}}), node set (from \code{\link[=html_elements]{html_elements()}}),
node (from \code{\link[=html_element]{html_element()}}), or session (from \code{\link[=session]{session()}}).}
\item{name}{Name of attribute to retrieve.}
\item{default}{A string used as a default value when the attribute does
not exist in every element.}
}
\value{
A character vector (for \code{html_attr()}) or list (\code{html_attrs()})
the same length as \code{x}.
}
\description{
\code{html_attr()} gets a single attribute; \code{html_attrs()} gets all attributes.
}
\examples{
html <- minimal_html('
')
html \%>\% html_elements("a") \%>\% html_attrs()
html \%>\% html_elements("a") \%>\% html_attr("href")
html \%>\% html_elements("li") \%>\% html_attr("class")
html \%>\% html_elements("li") \%>\% html_attr("class", default = "inactive")
}
rvest/man/html_table.Rd 0000644 0001762 0000144 00000004665 14007274024 014605 0 ustar ligges users % Generated by roxygen2: do not edit by hand
% Please edit documentation in R/table.R
\name{html_table}
\alias{html_table}
\title{Parse an html table into a data frame}
\usage{
html_table(
x,
header = NA,
trim = TRUE,
fill = deprecated(),
dec = ".",
na.strings = "NA",
convert = TRUE
)
}
\arguments{
\item{x}{A document (from \code{\link[=read_html]{read_html()}}), node set (from \code{\link[=html_elements]{html_elements()}}),
node (from \code{\link[=html_element]{html_element()}}), or session (from \code{\link[=session]{session()}}).}
\item{header}{Use first row as header? If \code{NA}, will use first row
if it consists of \verb{<th>} tags.
If \code{TRUE}, column names are left exactly as they are in the source
document, which may require post-processing to generate a valid data
frame.}
\item{trim}{Remove leading and trailing whitespace within each cell?}
\item{fill}{Deprecated - missing cells in tables are now always
automatically filled with \code{NA}.}
\item{dec}{The character used as decimal place marker.}
\item{na.strings}{Character vector of values that will be converted to \code{NA}
if \code{convert} is \code{TRUE}.}
\item{convert}{If \code{TRUE}, will run \code{\link[=type.convert]{type.convert()}} to interpret texts as
integer, double, or \code{NA}.}
}
\value{
When applied to a single element, \code{html_table()} returns a single tibble.
When applied to multiple elements or a document, \code{html_table()} returns
a list of tibbles.
}
\description{
The algorithm mimics what a browser does, but repeats the values of merged
cells in every cell that they cover.
}
\examples{
sample1 <- minimal_html("
Col A
Col B
1
x
4
y
10
z
")
sample1 \%>\%
html_element("table") \%>\%
html_table()
# Values in merged cells will be duplicated
sample2 <- minimal_html("
A
B
C
1
2
3
4
5
6
7
")
sample2 \%>\%
html_element("table") \%>\%
html_table()
# If a row is missing cells, they'll be filled with NAs
sample3 <- minimal_html("
A
B
C
1
2
3
4
")
sample3 \%>\%
html_element("table") \%>\%
html_table()
}
rvest/man/html_element.Rd 0000644 0001762 0000144 00000006072 14277722126 015153 0 ustar ligges users % Generated by roxygen2: do not edit by hand
% Please edit documentation in R/selectors.R
\name{html_element}
\alias{html_element}
\alias{html_elements}
\title{Select elements from an HTML document}
\usage{
html_element(x, css, xpath)
html_elements(x, css, xpath)
}
\arguments{
\item{x}{Either a document, a node set or a single node.}
\item{css, xpath}{Elements to select. Supply one of \code{css} or \code{xpath}
depending on whether you want to use a CSS selector or XPath 1.0
expression.}
}
\value{
\code{html_element()} returns a nodeset the same length as the input.
\code{html_elements()} flattens the output so there's no direct way to map
the output to the input.
}
\description{
\code{html_element()} and \code{html_elements()} find HTML elements using CSS selectors
or XPath expressions. CSS selectors are particularly useful in conjunction
with \url{https://selectorgadget.com/}, which makes it very easy to discover the
selector you need.
}
\section{CSS selector support}{
CSS selectors are translated to XPath selectors by the \pkg{selectr}
package, which is a port of the python \pkg{cssselect} library,
\url{https://pythonhosted.org/cssselect/}.
It implements the majority of CSS3 selectors, as described in
\url{https://www.w3.org/TR/2011/REC-css3-selectors-20110929/}. The
exceptions are listed below:
\itemize{
\item Pseudo selectors that require interactivity are ignored:
\verb{:hover}, \verb{:active}, \verb{:focus}, \verb{:target}, \verb{:visited}.
\item The following pseudo classes don't work with the wild card element, *:
\verb{*:first-of-type}, \verb{*:last-of-type}, \verb{*:nth-of-type},
\verb{*:nth-last-of-type}, \verb{*:only-of-type}
\item It supports \verb{:contains(text)}
\item You can use !=, \verb{[foo!=bar]} is the same as \verb{:not([foo=bar])}
\item \verb{:not()} accepts a sequence of simple selectors, not just a single
simple selector.
}
}
\examples{
html <- minimal_html("
This is a heading
This is a paragraph
This is an important paragraph
")
html \%>\% html_element("h1")
html \%>\% html_elements("p")
html \%>\% html_elements(".important")
html \%>\% html_elements("#first")
# html_element() vs html_elements() --------------------------------------
html <- minimal_html("
C-3PO is a droid that weighs 167 kg
R2-D2 is a droid that weighs 96 kg
Yoda weighs 66 kg
R4-P17 is a droid
")
li <- html \%>\% html_elements("li")
# When applied to a node set, html_elements() returns all matching elements
# beneath any of the inputs, flattening results into a new node set.
li \%>\% html_elements("i")
# When applied to a node set, html_element() always returns a vector the
# same length as the input, using a "missing" element where needed.
li \%>\% html_element("i")
# and html_text() and html_attr() will return NA
li \%>\% html_element("i") \%>\% html_text2()
li \%>\% html_element("span") \%>\% html_attr("class")
}
rvest/man/repair_encoding.Rd 0000644 0001762 0000144 00000001231 14014035320 015574 0 ustar ligges users % Generated by roxygen2: do not edit by hand
% Please edit documentation in R/encoding.R
\name{repair_encoding}
\alias{repair_encoding}
\title{Repair faulty encoding}
\usage{
repair_encoding(x, from = NULL)
}
\arguments{
\item{from}{The encoding that the string is actually in. If \code{NULL},
\code{guess_encoding} will be used.}
}
\description{
\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#deprecated}{\figure{lifecycle-deprecated.svg}{options: alt='[Deprecated]'}}}{\strong{[Deprecated]}}
This function has been deprecated because it doesn't work. Instead
re-read the HTML file with correct \code{encoding} argument.
}
\keyword{internal}
rvest/man/rvest-package.Rd 0000644 0001762 0000144 00000001446 14554031036 015221 0 ustar ligges users % Generated by roxygen2: do not edit by hand
% Please edit documentation in R/rvest-package.R
\docType{package}
\name{rvest-package}
\alias{rvest}
\alias{rvest-package}
\title{rvest: Easily Harvest (Scrape) Web Pages}
\description{
\if{html}{\figure{logo.png}{options: style='float: right' alt='logo' width='120'}}
Wrappers around the 'xml2' and 'httr' packages to make it easy to download, then manipulate, HTML and XML.
}
\seealso{
Useful links:
\itemize{
\item \url{https://rvest.tidyverse.org/}
\item \url{https://github.com/tidyverse/rvest}
\item Report bugs at \url{https://github.com/tidyverse/rvest/issues}
}
}
\author{
\strong{Maintainer}: Hadley Wickham \email{hadley@posit.co}
Other contributors:
\itemize{
\item Posit Software, PBC [copyright holder, funder]
}
}
\keyword{internal}
rvest/man/figures/ 0000755 0001762 0000144 00000000000 14554031036 013635 5 ustar ligges users rvest/man/figures/lifecycle-defunct.svg 0000644 0001762 0000144 00000002424 14554031036 017745 0 ustar ligges users
rvest/man/figures/lifecycle-maturing.svg 0000644 0001762 0000144 00000002430 14554031036 020140 0 ustar ligges users
rvest/man/figures/logo.png 0000644 0001762 0000144 00000076112 14554031036 015312 0 ustar ligges users PNG
IHDR ޫh cHRM z&