rvest/ 0000755 0001750 0001750 00000000000 14132660177 011556 5 ustar nilesh nilesh rvest/demo/ 0000755 0001750 0001750 00000000000 13775436633 012514 5 ustar nilesh nilesh rvest/demo/00Index 0000644 0001750 0001750 00000000217 13767413737 013647 0 ustar nilesh nilesh united Scrape mileage details from united.com
tripadvisor Scrape review data from tripadvisor
zillow Scrape housing info from zillow
rvest/demo/tripadvisor.R 0000644 0001750 0001750 00000001561 13775423515 015203 0 ustar nilesh nilesh # Inspired by
# http://notesofdabbler.github.io/201408_hotelReview/scrapeTripAdvisor.html
library(rvest)
# Address of the hotel review page to scrape.
url <- "http://www.tripadvisor.com/Hotel_Review-g37209-d1762915-Reviews-JW_Marriott_Indianapolis-Indianapolis_Indiana.html"

# One node per review. Every field below is looked up relative to these nodes
# with html_element(), so each result has one entry per review.
reviews <- html_elements(read_html(url), "#REVIEWS .innerBubble")

# Review identifier: the id attribute of the link inside the quote.
id <- html_attr(html_element(reviews, ".quote a"), "id")

# Headline text of the review.
quote <- html_text(html_element(reviews, ".quote span"))

# Star rating: drop the " of 5 stars" suffix from the alt text, then convert
# what is left to an integer.
rating <- as.integer(
  gsub(
    " of 5 stars",
    "",
    html_attr(html_element(reviews, ".rating .rating_s_fill"), "alt")
  )
)

# Review date, parsed from the title attribute using the "%b %d, %Y" format.
date <- as.POSIXct(
  strptime(
    html_attr(html_element(reviews, ".rating .ratingDate"), "title"),
    "%b %d, %Y"
  )
)

# Body text of the review.
review <- html_text(html_element(reviews, ".entry .partial_entry"))

# Assemble the columns and open the result in the data viewer.
data.frame(id, quote, rating, date, review, stringsAsFactors = FALSE) %>%
  View()
rvest/demo/zillow.R 0000644 0001750 0001750 00000001514 13775423476 014161 0 ustar nilesh nilesh # Inspired by https://github.com/notesofdabbler
library(rvest)
library(tidyr)
# Download the search results page.
page <- read_html("http://www.zillow.com/homes/for_sale/Greenwood-IN/fsba,fsbo,fore,cmsn_lt/house_type/52333_rid/39.638414,-86.011362,39.550714,-86.179419_rect/12_zm/0_mmm/")

# One node per listing card; every field below is extracted relative to these.
houses <- html_elements(page, ".photo-cards li article")

# Listing identifier, taken from the id attribute of each card.
z_id <- html_attr(houses, "id")

# Address text of each card.
address <- html_text(html_element(houses, ".zsg-photo-card-address"))

# Price, with the number parsed out of the displayed text.
price <- readr::parse_number(
  html_text(html_element(houses, ".zsg-photo-card-price"))
)

# The info line is split on the middle dot (U+00B7), giving one character
# vector of pieces per card.
params <- strsplit(
  html_text(html_element(houses, ".zsg-photo-card-info")),
  "\u00b7"
)

# Pieces 1, 2 and 3 of each info line, each parsed to a number.
beds <- readr::parse_number(purrr::map_chr(params, 1))
baths <- readr::parse_number(purrr::map_chr(params, 2))
house_area <- readr::parse_number(purrr::map_chr(params, 3))
rvest/demo/united.R 0000644 0001750 0001750 00000000712 13775436633 014127 0 ustar nilesh nilesh # Scrape miles from united site
library(rvest)
# Log in to united.com, follow the "View account" link and print the miles
# figure shown on the account page.

# `password` is never assigned in this script, so it has to be supplied by
# whoever runs it. Check up front so that a missing value produces a clear
# error before any request is sent.
if (!exists("password")) {
  stop(
    "Define `password` (your united.com password) before running this demo.",
    call. = FALSE
  )
}

# Open a browsing session on the home page.
united <- session("http://www.united.com/")

# Extract the login form and fill in the account number and password.
login <- united %>%
  html_element("form[name=LoginForm]") %>%
  html_form() %>%
  html_form_set(
    MpNumber = "GY797363",
    Password = password
  )

# Submit the form within the session; the result is a new session.
logged_in <- united %>% session_submit(login)

# session_follow_link() is the rvest 1.0.0 name for the deprecated
# follow_link(). The element text is parsed to a number.
logged_in %>%
  session_follow_link("View account") %>%
  html_element("#ctl00_ContentInfo_AccountSummary_spanEliteMilesNew") %>%
  html_text() %>%
  readr::parse_number()
rvest/MD5 0000644 0001750 0001750 00000010024 14132660177 012063 0 ustar nilesh nilesh b179d69b3999d72a6be1b9de283bf72d *DESCRIPTION
e8a97d0782b5ca3faa6669db5e7da997 *LICENSE
1aa8989ebe056fed42d2aa4640dc0f53 *NAMESPACE
8e6738c35b2521a3bab7eb4ed16a29e2 *NEWS.md
452b62853ab77b3c1f7b4231771ddd29 *R/encoding.R
ccd4ec0dacc779b8733af8a642b109d4 *R/form.R
b9bbef57a1f61b150f30fc504b6842ab *R/html.R
b7c36d3bad90be1bcd154df9457e2759 *R/rename.R
e78016d9c14b4ec9d54bbaf33f865891 *R/rvest-package.R
7d95677581a03f930591d42ffb9ed0c4 *R/selectors.R
84f9bbabed1eceb8d6d2aacbc0b5c2d4 *R/session.R
87bbd0f095c4744bcb001eb438fbc545 *R/table.R
3011d9a76ebc5504bdb04977193ddcf5 *R/testthat.R
f41504de2b0cb9a3bf5a50feff6d3e0b *R/text.R
60f6ff612a2d9b1c0252e208b7dd7c24 *R/utils.R
53ba96bca139a8108f781e3b3f9c328b *README.md
9861b4b3aebd34cd2f23700ec61dea04 *build/vignette.rds
96facf495896646206f14ea3932cb996 *demo/00Index
5b1a65183607f9c7d536fb03efac59a1 *demo/tripadvisor.R
daaaa642bfbf152064f757e795fba645 *demo/united.R
8e08c2f3f3a7d02be1d796f820f257c9 *demo/zillow.R
1d0e6e024c643aa2bdcaa61bd4e3968c *inst/WORDLIST
b6e294946e527a9886d29a7d5963f283 *inst/doc/rvest.R
83bec8dba1ec1f1a5146bd199916c90f *inst/doc/rvest.Rmd
90b5c9a3f6fbab56e5fda4e1779b8b75 *inst/doc/rvest.html
bc8ae98aec6e23fe5802241ffc409841 *inst/doc/starwars.R
1d030fd990b11a62568bd5edf7710a07 *inst/doc/starwars.Rmd
61e7c66bc2bcfa439a74ad1d06c58ee6 *inst/doc/starwars.html
be4cd29b5c2b16bd908a786c53cf0276 *inst/html-ex/bad-encoding.html
cb1e46f469cfbbbde29c8b5113e1d789 *man/figures/lifecycle-archived.svg
c0d2e5a54f1fa4ff02bf9533079dd1f7 *man/figures/lifecycle-defunct.svg
a1b8c987c676c16af790f563f96cbb1f *man/figures/lifecycle-deprecated.svg
c3978703d8f40f2679795335715e98f4 *man/figures/lifecycle-experimental.svg
952b59dc07b171b97d5d982924244f61 *man/figures/lifecycle-maturing.svg
27b879bf3677ea76e3991d56ab324081 *man/figures/lifecycle-questioning.svg
53b3f893324260b737b3c46ed2a0e643 *man/figures/lifecycle-stable.svg
1c1fe7a759b86dc6dbcbe7797ab8246c *man/figures/lifecycle-superseded.svg
39579f197389469e5bec6c4bb2e34777 *man/figures/logo.png
eb9ac3d3af03097f746a0fe39050243b *man/google_form.Rd
a6d869162565326cc577521a9bfdc0af *man/html_attr.Rd
a644e18fdcf48efd1b51a71338e30de9 *man/html_children.Rd
39dd263d3539e8cfe6a53fb8fc8e2761 *man/html_element.Rd
82dd7199f651dd55a2f24219f63768e0 *man/html_encoding_guess.Rd
9a1a5352986ce91c2b91c610a2bbf49a *man/html_form.Rd
480665ed4e0a662cd7168bae2f773ecc *man/html_name.Rd
981e43b2c8ba9d56fe2a17f2b1b95dea *man/html_table.Rd
7824150f123e587749235481d99f6af2 *man/html_text.Rd
1e641362f441edc4f7fa5e3665ec8ae4 *man/minimal_html.Rd
56baeb81da4f1bbdb66fc2914e610472 *man/reexports.Rd
e4de192a61e9191cc5acaab22bc3387d *man/rename.Rd
ceac1f465444fe123838d404dd41eaf4 *man/repair_encoding.Rd
c35e73e81e6ebf1afa0f00d25fa6fca2 *man/rvest-package.Rd
b24724a42df8bc9450618572c8e2cb7b *man/session.Rd
0622a97a2aaa3c342f09636052c2d7f5 *tests/spelling.R
ef55516173099bedc78483fe7c0dd23c *tests/testthat.R
1eff4c362767dfa35019447f63aaa5bf *tests/testthat/_snaps/encoding.md
61a023edde1371ffef158c580adbe889 *tests/testthat/_snaps/form.md
394d2b13389ec6b38cd017699a4de168 *tests/testthat/_snaps/rename.md
ce97b14761bc7b06da7da6867aec6951 *tests/testthat/_snaps/selectors.md
bc9fc497b280daab21848d3bc155dd11 *tests/testthat/_snaps/session.md
8e1f3248b4c30548354f2569e44cb76c *tests/testthat/_snaps/table.md
483c192b5de64a50ab5954cb7ed69322 *tests/testthat/_snaps/utils.md
963251571de4ecb243dcaef1ab0c5586 *tests/testthat/test-encoding.R
f7f95d4b45ebaf4fc573997f74c7f417 *tests/testthat/test-form.R
60b3257c6910bb1e559c64688fd10914 *tests/testthat/test-html.R
3dd7435aba1764a7bb22544bc080c585 *tests/testthat/test-rename.R
3d9b9f27a90ff7c11c9b9c0de79a2be8 *tests/testthat/test-selectors.R
466191bea5bd3fd07f4bc9e2063ff126 *tests/testthat/test-session.R
75298f4e28a1da5c3e91820efc20e6ea *tests/testthat/test-table.R
b4797b2d4cca238d5c828f8778a1d983 *tests/testthat/test-text.R
2f2f8524da52b2515bcfa4c4d472033d *tests/testthat/test-utils.R
07292e7a862259805eebb8606cb61044 *tests/testthat/test.html
83bec8dba1ec1f1a5146bd199916c90f *vignettes/rvest.Rmd
1d030fd990b11a62568bd5edf7710a07 *vignettes/starwars.Rmd
rvest/NEWS.md 0000644 0001750 0001750 00000017454 14132343516 012662 0 ustar nilesh nilesh # rvest 1.0.2
* Fixes for CRAN
* `html_table()` converts empty tables to empty tibbles (@epiben, #327).
# rvest 1.0.1
* `html_table()` correctly handles tables with cells that contain blank values
for `rowspan` and/or `colspan`, so that e.g. `<td rowspan="">` is parsed as
`<td rowspan=1>` (@epiben, #323).
* Fix broken example
# rvest 1.0.0
## New features
* New `html_text2()` provides a more natural rendering of HTML nodes into text,
converting `<br />` into "\n", and removing non-significant whitespace (#175).
By default, it also converts `&nbsp;` into regular spaces, which you can
suppress with `preserve_nbsp = TRUE` (#284).
* `html_table()` has been re-written from scratch to more closely mimic the
algorithm that browsers use for parsing tables. This should mean that there
are far fewer tables for which it fails to produce some output (#63, #204,
#215). The `fill` argument has been deprecated since it is no longer needed.
`html_table()` now returns a tibble rather than a data frame to be compatible
with the rest of the tidyverse (#199). Its performance has been considerably
improved (#237). It also gains a `na.strings` argument to control what values
are converted to `NA` (#107), and a `convert` argument to control whether to
run the conversion (#311).
* New `html_form_submit()` allows you to submit a form directly, without
needing to create a session (#300).
* rvest is now licensed as MIT (#287).
## API changes
Since this is the 1.0.0 release, I included a large number of API changes to make rvest more compatible with current tidyverse conventions. Older functions have been deprecated, so existing code will continue to work (albeit with a few new warnings).
* rvest now imports xml2 rather than depending on it. This is cleaner because
it avoids attaching all the xml2 functions that you're less likely to use.
To reduce the chance of breakages, rvest re-exports xml2 functions
`read_html()` and `url_absolute()`, but your code may now need an explicit
`library(xml2)`.
* `html_form()` now returns an object with class `rvest_form` (instead of form).
Fields within a form now have class `rvest_field`, instead of a
variety of classes that were lacking the `rvest_` prefix. All functions for
working with forms have a common `html_form_` prefix: `set_values()` became
`html_form_set()`. `submit_form()` was renamed to `session_submit()` because
it returns a session.
* `html_node()` and `html_nodes()` have been superseded in favor of
`html_element()` and `html_elements()` since they (almost) always return
elements, not nodes (#298).
* `html_session()` is now `session()` and returns an object of class
`rvest_session` (instead of `session`). All functions that work with session
objects now have a common `session_` prefix.
* Long deprecated `html()`, `html_tag()`, `xml()` functions have been removed.
* `minimal_html()` (which doesn't appear to be used by any other package)
has had its arguments flipped to make it more intuitive.
* `guess_encoding()` has been renamed to `html_encoding_guess()` to avoid
a clash with `stringr::guess_encoding()` (#209). `repair_encoding()` has
been deprecated because it doesn't appear to work.
* `pluck()` is no longer exported to avoid a clash with `purrr::pluck()`;
if you need it use `purrr::map_chr()` and friends instead (#209).
* `xml_tag()`, `xml_node()`, and `xml_nodes()` have been formally deprecated
in favor of their `html_` equivalents.
## Minor improvements and bug fixes
* The "harvesting the web" vignette has been rewritten to focus more on the
basics of rvest, eliminating the screenshots to keep the installed package as svelte as
possible. It's also been renamed to `vignette("rvest")` since it's the
vignette that you should read first.
* The SelectorGadget vignette is now a web-only article,
<https://rvest.tidyverse.org/articles/selectorgadget.html>,
so we can be more generous with screenshots since they're no longer bundled
with every install of the package. Together with the rewrite of the other
vignette, this means that rvest is now ~90 Kb instead of ~1.1 Mb.
* All uses of IMDB have been eliminated since the site explicitly prohibits
scraping (#195).
* `session_submit()` errors if `form` doesn't have a `url` (#288).
* New `session_forward()` function to complement `session_back()`.
It now allows you to pick the submission button by position (#156).
The `...` argument is deprecated; please use `config` instead.
* `html_form_set()` can now accept character vectors allowing you to select
multiple checkboxes in a set or select multiple values from a multi-`<select>`
(#127, with help from @juba). It also uses dynamic dots so that you can use
`!!!` if you have a list of values (#189).
# rvest 0.3.6
* Remove failing example
# rvest 0.3.5
* Use web archive to fix broken example.
# rvest 0.3.4
* Remove unneeded `read_xml.response()` method (#242).
# rvest 0.3.3
* Fix `R CMD check` failure
* `submit_request()` now checks for empty form-field-types to select the
correct submit fields (@rentrop, #159)
# rvest 0.3.2
* Fixes to `follow_link()` and `back()` to correctly manage session history.
* If you're using xml2 1.0.0, `html_node()` will now return a "missing node".
* Parse rowspans and colspans effectively by filling using repetition from
left to right (for colspan) and top to bottom (rowspan) (#111)
* Updated a few examples and demos where the website structure has
changed.
* Made compatible with both xml2 0.1.2 and 1.0.0.
# rvest 0.3.1
* Fix invalid link for SSA example.
* Parse `<option>` elements that don't have a value attribute (#85).
* Remove all remaining uses of `html()` in favor of `read_html()`
(@jimhester, #113).
# rvest 0.3.0
* rvest has been rewritten to take advantage of the new xml2 package. xml2
provides a fresh binding to libxml2, avoiding many of the work-arounds
previously needed for the XML package. Now rvest depends on the xml2
package, so all the xml functions are available, and rvest adds a thin
wrapper for html.
* A number of functions have changed names. The old versions still work,
but are deprecated and will be removed in rvest 0.4.0.
* `html_tag()` -> `html_name()`
* `html()` -> `read_html()`
* `html_node()` now throws an error if there are no matches, and a warning
if there's more than one match. I think this should make it more likely to
fail clearly when the structure of the page changes.
* `xml_structure()` has been moved to xml2. New `html_structure()` (also in
xml2) highlights id and class attributes (#78).
* `submit_form()` now works with forms that use GET (#66).
* `submit_request()` (and hence `submit_form()`) is now case-insensitive,
and so will find ` ` as well as` `.
* `submit_request()` (and hence `submit_form()`) recognizes forms with
` ` as a valid form submission button.
# rvest 0.2.0
## New features
* `html()` and `xml()` pass `...` on to `httr::GET()` so you can more
finely control the request (#48).
* Add xml support: parse with `xml()`, then work with it using `xml_node()`,
`xml_attr()`, `xml_attrs()`, `xml_text()` and `xml_tag()` (#24).
* `xml_structure()`: new function that displays the structure (i.e. tag
and attribute names) of a xml/html object (#10).
## Bug fixes
* `follow_link()` now accepts css and xpath selectors. (#38, #41, #42)
* `html()` does a better job of dealing with encodings (passing the
problem on to `XML::parseHTML()`) instead of trying to do it itself
(#25, #50).
* `html_attr()` returns default value when input is NULL (#49)
* Add missing `html_node()` method for session.
* `html_nodes()` now returns an empty list if no elements are found (#31).
* `submit_form()` converts relative paths to absolute URLs (#52).
It also deals better with 0-length inputs (#29).
rvest/DESCRIPTION 0000644 0001750 0001750 00000002174 14132660177 013270 0 ustar nilesh nilesh Package: rvest
Title: Easily Harvest (Scrape) Web Pages
Version: 1.0.2
Authors@R:
c(person(given = "Hadley",
family = "Wickham",
role = c("aut", "cre"),
email = "hadley@rstudio.com"),
person(given = "RStudio",
role = "cph"))
Description: Wrappers around the 'xml2' and 'httr' packages to
make it easy to download, then manipulate, HTML and XML.
License: MIT + file LICENSE
URL: https://rvest.tidyverse.org/, https://github.com/tidyverse/rvest
BugReports: https://github.com/tidyverse/rvest/issues
Depends: R (>= 3.2)
Imports: httr (>= 0.5), lifecycle (>= 1.0.0), magrittr, rlang (>=
0.4.10), selectr, tibble, xml2 (>= 1.3)
Suggests: covr, glue, knitr, readr, rmarkdown, repurrrsive, spelling,
stringi (>= 0.3.1), testthat (>= 3.0.2), webfakes
VignetteBuilder: knitr
Config/testthat/edition: 3
Encoding: UTF-8
Language: en-US
RoxygenNote: 7.1.2
NeedsCompilation: no
Packaged: 2021-10-15 18:24:40 UTC; hadley
Author: Hadley Wickham [aut, cre],
RStudio [cph]
Maintainer: Hadley Wickham <hadley@rstudio.com>
Repository: CRAN
Date/Publication: 2021-10-16 23:30:07 UTC
rvest/README.md 0000644 0001750 0001750 00000011035 14132342252 013024 0 ustar nilesh nilesh
# rvest
[](https://cran.r-project.org/package=rvest)
[](https://github.com/tidyverse/rvest/actions)
[](https://app.codecov.io/gh/tidyverse/rvest?branch=master)
## Overview
rvest helps you scrape (or harvest) data from web pages. It is designed
to work with [magrittr](https://github.com/tidyverse/magrittr) to make
it easy to express common web scraping tasks, inspired by libraries like
[beautiful soup](https://www.crummy.com/software/BeautifulSoup/) and
[RoboBrowser](http://robobrowser.readthedocs.io/en/latest/readme.html).
If you’re scraping multiple pages, I highly recommend using rvest in
concert with [polite](https://dmi3kno.github.io/polite/). The polite
package ensures that you’re respecting the
[robots.txt](https://en.wikipedia.org/wiki/Robots_exclusion_standard)
and not hammering the site with too many requests.
## Installation
``` r
# The easiest way to get rvest is to install the whole tidyverse:
install.packages("tidyverse")
# Alternatively, install just rvest:
install.packages("rvest")
```
## Usage
``` r
library(rvest)
# Start by reading a HTML page with read_html():
starwars <- read_html("https://rvest.tidyverse.org/articles/starwars.html")
# Then find elements that match a css selector or XPath expression
# using html_elements(). In this example, each <section> corresponds
# to a different film
films <- starwars %>% html_elements("section")
films
#> {xml_nodeset (7)}
#> [1] \nThe Phantom Menace\n \n\nReleased: 1999 ...
#> [2] \nAttack of the Clones\n \n\nReleased: 20 ...
#> [3] \nRevenge of the Sith\n \n\nReleased: 200 ...
#> [4] \nA New Hope\n \n\nReleased: 1977-05-25\n ...
#> [5] \nThe Empire Strikes Back\n \n\nReleased: ...
#> [6] \nReturn of the Jedi\n \n\nReleased: 1983 ...
#> [7] \nThe Force Awakens\n \n\nReleased: 2015- ...
# Then use html_element() to extract one element per film. Here
# the title is given by the text inside the <h2>
title <- films %>%
html_element("h2") %>%
html_text2()
title
#> [1] "The Phantom Menace" "Attack of the Clones"
#> [3] "Revenge of the Sith" "A New Hope"
#> [5] "The Empire Strikes Back" "Return of the Jedi"
#> [7] "The Force Awakens"
# Or use html_attr() to get data out of attributes. html_attr() always
# returns a string so we convert it to an integer using a readr function
episode <- films %>%
html_element("h2") %>%
html_attr("data-id") %>%
readr::parse_integer()
episode
#> [1] 1 2 3 4 5 6 7
```
If the page contains tabular data you can convert it directly to a data
frame with `html_table()`:
``` r
html <- read_html("https://en.wikipedia.org/w/index.php?title=The_Lego_Movie&oldid=998422565")
html %>%
html_element(".tracklist") %>%
html_table()
#> # A tibble: 29 × 4
#> No. Title `Performer(s)` Length
#>
#> 1 1. "\"Everything Is Awesome\"" "Tegan and Sara featuring The Lonel… 2:43
#> 2 2. "\"Prologue\"" "" 2:28
#> 3 3. "\"Emmett's Morning\"" "" 2:00
#> 4 4. "\"Emmett Falls in Love\"" "" 1:11
#> 5 5. "\"Escape\"" "" 3:26
#> 6 6. "\"Into the Old West\"" "" 1:00
#> 7 7. "\"Wyldstyle Explains\"" "" 1:21
#> 8 8. "\"Emmett's Mind\"" "" 2:17
#> 9 9. "\"The Transformation\"" "" 1:46
#> 10 10. "\"Saloons and Wagons\"" "" 3:38
#> # … with 19 more rows
```
## Code of Conduct
Please note that the rvest project is released with a [Contributor Code
of Conduct](https://rvest.tidyverse.org/CODE_OF_CONDUCT.html). By
contributing to this project, you agree to abide by its terms.
rvest/man/ 0000755 0001750 0001750 00000000000 14101012310 012301 5 ustar nilesh nilesh rvest/man/reexports.Rd 0000644 0001750 0001750 00000001027 13770130737 014654 0 ustar nilesh nilesh % Generated by roxygen2: do not edit by hand
% Please edit documentation in R/rvest-package.R
\docType{import}
\name{reexports}
\alias{reexports}
\alias{read_html}
\alias{url_absolute}
\alias{\%>\%}
\title{Objects exported from other packages}
\keyword{internal}
\description{
These objects are imported from other packages. Follow the links
below to see their documentation.
\describe{
\item{magrittr}{\code{\link[magrittr:pipe]{\%>\%}}}
\item{xml2}{\code{\link[xml2:read_xml]{read_html}}, \code{\link[xml2]{url_absolute}}}
}}
rvest/man/html_encoding_guess.Rd 0000644 0001750 0001750 00000002051 13775424150 016637 0 ustar nilesh nilesh % Generated by roxygen2: do not edit by hand
% Please edit documentation in R/encoding.R
\name{html_encoding_guess}
\alias{html_encoding_guess}
\alias{guess_encoding}
\title{Guess faulty character encoding}
\usage{
html_encoding_guess(x)
}
\arguments{
\item{x}{A character vector.}
}
\description{
\code{html_encoding_guess()} helps you handle web pages that declare an incorrect
encoding. Use \code{html_encoding_guess()} to generate a list of possible
encodings, then try each out by using \code{encoding} argument of \code{read_html()}.
\code{html_encoding_guess()} replaces the deprecated \code{guess_encoding()}.
}
\examples{
# A file with bad encoding included in the package
path <- system.file("html-ex", "bad-encoding.html", package = "rvest")
x <- read_html(path)
x \%>\% html_elements("p") \%>\% html_text()
html_encoding_guess(x)
# Two valid encodings, only one of which is correct
read_html(path, encoding = "ISO-8859-1") \%>\% html_elements("p") \%>\% html_text()
read_html(path, encoding = "ISO-8859-2") \%>\% html_elements("p") \%>\% html_text()
}
rvest/man/html_form.Rd 0000644 0001750 0001750 00000004260 13776122700 014607 0 ustar nilesh nilesh % Generated by roxygen2: do not edit by hand
% Please edit documentation in R/form.R
\name{html_form}
\alias{html_form}
\alias{html_form_set}
\alias{html_form_submit}
\title{Parse forms and set values}
\usage{
html_form(x, base_url = NULL)
html_form_set(form, ...)
html_form_submit(form, submit = NULL)
}
\arguments{
\item{x}{A document (from \code{\link[=read_html]{read_html()}}), node set (from \code{\link[=html_elements]{html_elements()}}),
node (from \code{\link[=html_element]{html_element()}}), or session (from \code{\link[=session]{session()}}).}
\item{base_url}{Base url of underlying HTML document. The default, \code{NULL},
uses the url of the HTML document underlying \code{x}.}
\item{form}{A form}
\item{...}{<\code{\link[rlang:dyn-dots]{dynamic-dots}}> Name-value pairs giving
fields to modify.
Provide a character vector to set multiple checkboxes in a set or
select multiple values from a multi-select.}
\item{submit}{Which button should be used to submit the form?
\itemize{
\item \code{NULL}, the default, uses the first button.
\item A string selects a button by its name.
\item A number selects a button using its relative position.
}}
}
\value{
\itemize{
\item \code{html_form()} returns an S3 object with class \code{rvest_form} when applied
to a single element. It returns a list of \code{rvest_form} objects when
applied to multiple elements or a document.
\item \code{html_form_set()} returns an \code{rvest_form} object.
\item \code{html_form_submit()} submits the form, returning an httr response which
can be parsed with \code{\link[=read_html]{read_html()}}.
}
}
\description{
Use \code{html_form()} to extract a form, set values with \code{html_form_set()},
and submit it with \code{html_form_submit()}.
}
\examples{
html <- read_html("http://www.google.com")
search <- html_form(html)[[1]]
search <- search \%>\% html_form_set(q = "My little pony", hl = "fr")
# Or if you have a list of values, use !!!
vals <- list(q = "web scraping", hl = "en")
search <- search \%>\% html_form_set(!!!vals)
# To submit and get result:
\dontrun{
resp <- html_form_submit(search)
read_html(resp)
}
}
\seealso{
HTML 4.01 form specification:
\url{http://www.w3.org/TR/html401/interact/forms.html}
}
rvest/man/html_table.Rd 0000644 0001750 0001750 00000004665 14007274024 014737 0 ustar nilesh nilesh % Generated by roxygen2: do not edit by hand
% Please edit documentation in R/table.R
\name{html_table}
\alias{html_table}
\title{Parse an html table into a data frame}
\usage{
html_table(
x,
header = NA,
trim = TRUE,
fill = deprecated(),
dec = ".",
na.strings = "NA",
convert = TRUE
)
}
\arguments{
\item{x}{A document (from \code{\link[=read_html]{read_html()}}), node set (from \code{\link[=html_elements]{html_elements()}}),
node (from \code{\link[=html_element]{html_element()}}), or session (from \code{\link[=session]{session()}}).}
\item{header}{Use first row as header? If \code{NA}, will use first row
if it consists of \verb{<th>} tags.
If \code{TRUE}, column names are left exactly as they are in the source
document, which may require post-processing to generate a valid data
frame.}
\item{trim}{Remove leading and trailing whitespace within each cell?}
\item{fill}{Deprecated - missing cells in tables are now always
automatically filled with \code{NA}.}
\item{dec}{The character used as decimal place marker.}
\item{na.strings}{Character vector of values that will be converted to \code{NA}
if \code{convert} is \code{TRUE}.}
\item{convert}{If \code{TRUE}, will run \code{\link[=type.convert]{type.convert()}} to interpret texts as
integer, double, or \code{NA}.}
}
\value{
When applied to a single element, \code{html_table()} returns a single tibble.
When applied to multiple elements or a document, \code{html_table()} returns
a list of tibbles.
}
\description{
The algorithm mimics what a browser does, but repeats the values of merged
cells in every cell that they cover.
}
\examples{
sample1 <- minimal_html("")
sample1 \%>\%
html_element("table") \%>\%
html_table()
# Values in merged cells will be duplicated
sample2 <- minimal_html("")
sample2 \%>\%
html_element("table") \%>\%
html_table()
# If a row is missing cells, they'll be filled with NAs
sample3 <- minimal_html("")
sample3 \%>\%
html_element("table") \%>\%
html_table()
}
rvest/man/repair_encoding.Rd 0000644 0001750 0001750 00000001231 14014035320 015726 0 ustar nilesh nilesh % Generated by roxygen2: do not edit by hand
% Please edit documentation in R/encoding.R
\name{repair_encoding}
\alias{repair_encoding}
\title{Repair faulty encoding}
\usage{
repair_encoding(x, from = NULL)
}
\arguments{
\item{from}{The encoding that the string is actually in. If \code{NULL},
\code{guess_encoding} will be used.}
}
\description{
\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#deprecated}{\figure{lifecycle-deprecated.svg}{options: alt='[Deprecated]'}}}{\strong{[Deprecated]}}
This function has been deprecated because it doesn't work. Instead
re-read the HTML file with correct \code{encoding} argument.
}
\keyword{internal}
rvest/man/google_form.Rd 0000644 0001750 0001750 00000000470 14101012310 015070 0 ustar nilesh nilesh % Generated by roxygen2: do not edit by hand
% Please edit documentation in R/utils.R
\name{google_form}
\alias{google_form}
\title{Make link to google form given id}
\usage{
google_form(x)
}
\arguments{
\item{x}{Unique identifier for form}
}
\description{
Make link to google form given id
}
\keyword{internal}
rvest/man/html_attr.Rd 0000644 0001750 0001750 00000002423 14132341320 014601 0 ustar nilesh nilesh % Generated by roxygen2: do not edit by hand
% Please edit documentation in R/html.R
\name{html_attr}
\alias{html_attr}
\alias{html_attrs}
\title{Get element attributes}
\usage{
html_attr(x, name, default = NA_character_)
html_attrs(x)
}
\arguments{
\item{x}{A document (from \code{\link[=read_html]{read_html()}}), node set (from \code{\link[=html_elements]{html_elements()}}),
node (from \code{\link[=html_element]{html_element()}}), or session (from \code{\link[=session]{session()}}).}
\item{name}{Name of attribute to retrieve.}
\item{default}{A string used as a default value when the attribute does
not exist in every element.}
}
\value{
A character vector (for \code{html_attr()}) or list (\code{html_attrs()})
the same length as \code{x}.
}
\description{
\code{html_attr()} gets a single attribute; \code{html_attrs()} gets all attributes.
}
\examples{
html <- minimal_html('')
html \%>\% html_elements("a") \%>\% html_attrs()
html \%>\% html_elements("a") \%>\% html_attr("href")
html \%>\% html_elements("li") \%>\% html_attr("class")
html \%>\% html_elements("li") \%>\% html_attr("class", default = "inactive")
}
rvest/man/rvest-package.Rd 0000644 0001750 0001750 00000001416 14132341252 015344 0 ustar nilesh nilesh % Generated by roxygen2: do not edit by hand
% Please edit documentation in R/rvest-package.R
\docType{package}
\name{rvest-package}
\alias{rvest}
\alias{rvest-package}
\title{rvest: Easily Harvest (Scrape) Web Pages}
\description{
\if{html}{\figure{logo.png}{options: align='right' alt='logo' width='120'}}
Wrappers around the 'xml2' and 'httr' packages to make it easy to download, then manipulate, HTML and XML.
}
\seealso{
Useful links:
\itemize{
\item \url{https://rvest.tidyverse.org/}
\item \url{https://github.com/tidyverse/rvest}
\item Report bugs at \url{https://github.com/tidyverse/rvest/issues}
}
}
\author{
\strong{Maintainer}: Hadley Wickham \email{hadley@rstudio.com}
Other contributors:
\itemize{
\item RStudio [copyright holder]
}
}
\keyword{internal}
rvest/man/minimal_html.Rd 0000644 0001750 0001750 00000000663 13775437157 015313 0 ustar nilesh nilesh % Generated by roxygen2: do not edit by hand
% Please edit documentation in R/utils.R
\name{minimal_html}
\alias{minimal_html}
\title{Create an HTML document from inline HTML}
\usage{
minimal_html(html, title = "")
}
\arguments{
\item{html}{HTML contents of page.}
\item{title}{Page title (required by HTML spec).}
}
\description{
Create an HTML document from inline HTML
}
\examples{
minimal_html("test
")
}
\keyword{internal}
rvest/man/html_element.Rd 0000644 0001750 0001750 00000006071 13776110054 015276 0 ustar nilesh nilesh % Generated by roxygen2: do not edit by hand
% Please edit documentation in R/selectors.R
\name{html_element}
\alias{html_element}
\alias{html_elements}
\title{Select elements from an HTML document}
\usage{
html_element(x, css, xpath)
html_elements(x, css, xpath)
}
\arguments{
\item{x}{Either a document, a node set or a single node.}
\item{css, xpath}{Elements to select. Supply one of \code{css} or \code{xpath}
depending on whether you want to use a CSS selector or XPath 1.0
expression.}
}
\value{
\code{html_element()} returns a nodeset the same length as the input.
\code{html_elements()} flattens the output so there's no direct way to map
the output to the input.
}
\description{
\code{html_element()} and \code{html_elements()} find HTML elements using CSS selectors
or XPath expressions. CSS selectors are particularly useful in conjunction
with \url{https://selectorgadget.com/}, which makes it very easy to discover the
selector you need.
}
\section{CSS selector support}{
CSS selectors are translated to XPath selectors by the \pkg{selectr}
package, which is a port of the python \pkg{cssselect} library,
\url{https://pythonhosted.org/cssselect/}.
It implements the majority of CSS3 selectors, as described in
\url{http://www.w3.org/TR/2011/REC-css3-selectors-20110929/}. The
exceptions are listed below:
\itemize{
\item Pseudo selectors that require interactivity are ignored:
\verb{:hover}, \verb{:active}, \verb{:focus}, \verb{:target}, \verb{:visited}.
\item The following pseudo classes don't work with the wild card element, *:
\verb{*:first-of-type}, \verb{*:last-of-type}, \verb{*:nth-of-type},
\verb{*:nth-last-of-type}, \verb{*:only-of-type}
\item It supports \verb{:contains(text)}
\item You can use !=, \verb{[foo!=bar]} is the same as \verb{:not([foo=bar])}
\item \verb{:not()} accepts a sequence of simple selectors, not just a single
simple selector.
}
}
\examples{
html <- minimal_html("
This is a heading
This is a paragraph
This is an important paragraph
")
html \%>\% html_element("h1")
html \%>\% html_elements("p")
html \%>\% html_elements(".important")
html \%>\% html_elements("#first")
# html_element() vs html_elements() --------------------------------------
html <- minimal_html("
C-3PO is a droid that weighs 167 kg
R2-D2 is a droid that weighs 96 kg
Yoda weighs 66 kg
R4-P17 is a droid
")
li <- html \%>\% html_elements("li")
# When applied to a node set, html_elements() returns all matching elements
# beneath any of the inputs, flattening results into a new node set.
li \%>\% html_elements("i")
# When applied to a node set, html_element() always returns a vector the
# same length as the input, using a "missing" element where needed.
li \%>\% html_element("i")
# and html_text() and html_attr() will return NA
li \%>\% html_element("i") \%>\% html_text2()
li \%>\% html_element("span") \%>\% html_attr("class")
}
rvest/man/html_text.Rd 0000644 0001750 0001750 00000005046 13775651250 014641 0 ustar nilesh nilesh % Generated by roxygen2: do not edit by hand
% Please edit documentation in R/text.R
\name{html_text}
\alias{html_text}
\alias{html_text2}
\title{Get element text}
\usage{
html_text(x, trim = FALSE)
html_text2(x, preserve_nbsp = FALSE)
}
\arguments{
\item{x}{A document, node, or node set.}
\item{trim}{If \code{TRUE} will trim leading and trailing spaces.}
\item{preserve_nbsp}{Should non-breaking spaces be preserved? By default,
\code{html_text2()} converts to ordinary spaces to ease further computation.
When \code{preserve_nbsp} is \code{TRUE}, \verb{&nbsp;} will appear in strings as
\code{"\\ua0"}. This often causes confusion because it prints the same way as
\code{" "}.}
}
\value{
A character vector the same length as \code{x}
}
\description{
There are two ways to retrieve text from an element: \code{html_text()} and
\code{html_text2()}. \code{html_text()} is a thin wrapper around \code{\link[xml2:xml_text]{xml2::xml_text()}}
which returns just the raw underlying text. \code{html_text2()} simulates how
text looks in a browser, using an approach inspired by JavaScript's
\href{https://developer.mozilla.org/en-US/docs/Web/API/HTMLElement/innerText}{innerText()}.
Roughly speaking, it converts \verb{<br />} to \code{"\\n"}, adds blank lines
around \verb{<p>} tags, and lightly formats tabular data.
\code{html_text2()} is usually what you want, but it is much slower than
\code{html_text()} so for simple applications where performance is important
you may want to use \code{html_text()} instead.
}
\examples{
# To understand the difference between html_text() and html_text2()
# take the following html:
html <- minimal_html(
"
This is a paragraph.
This another sentence. This should start on a new line"
)
# html_text() returns the raw underlying text, which includes whitespace
# that would be ignored by a browser, and ignores the <br>
html \%>\% html_element("p") \%>\% html_text() \%>\% writeLines()
# html_text2() simulates what a browser would display. Non-significant
# whitespace is collapsed, and <br> is turned into a line break
html \%>\% html_element("p") \%>\% html_text2() \%>\% writeLines()
# By default, html_text2() also converts non-breaking spaces to regular
# spaces:
html <- minimal_html("
x y
")
x1 <- html \%>\% html_element("p") \%>\% html_text()
x2 <- html \%>\% html_element("p") \%>\% html_text2()
# When printed, non-breaking spaces look exactly like regular spaces
x1
x2
# But aren't actually the same:
x1 == x2
# Which you can confirm by looking at their underlying binary
# representation:
charToRaw(x1)
charToRaw(x2)
}
rvest/man/figures/ 0000755 0001750 0001750 00000000000 13767413737 014010 5 ustar nilesh nilesh rvest/man/figures/lifecycle-defunct.svg 0000644 0001750 0001750 00000001704 13767413737 020120 0 ustar nilesh nilesh lifecycle lifecycle defunct defunct