urlchecker/0000755000176200001440000000000014151424662012406 5ustar liggesusersurlchecker/NAMESPACE0000644000176200001440000000016114151267543013626 0ustar liggesusers# Generated by roxygen2: do not edit by hand S3method(print,urlchecker_db) export(url_check) export(url_update) urlchecker/README.md0000644000176200001440000000176714151267367013707 0ustar liggesusers# urlchecker [![R build status](https://github.com/r-lib/urlchecker/workflows/R-CMD-check/badge.svg)](https://github.com/r-lib/urlchecker/actions) [![Codecov test coverage](https://codecov.io/gh/jimhester/urlchecker/branch/main/graph/badge.svg)](https://app.codecov.io/gh/jimhester/urlchecker?branch=main) [![R-CMD-check](https://github.com/r-lib/urlchecker/workflows/R-CMD-check/badge.svg)](https://github.com/r-lib/urlchecker/actions) The goal of urlchecker is to run the URL checks from R 4.1 in older versions of R and automatically update URLs as needed. It also uses concurrent requests, so is generally much faster than the URL checks from the tools package. ``` r library(urlchecker) # `url_check()` will check all URLs in a package, as is done by CRAN when # submitting a package. url_check("path/to/pkg") # `url_update()` will check all URLs in a package, then update any 301 # redirects automatically to their new location. url_update("path/to/pkg") ``` urlchecker/man/0000755000176200001440000000000013735627440013167 5ustar liggesusersurlchecker/man/url_update.Rd0000644000176200001440000000103414017460033015603 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/url_update.R \name{url_update} \alias{url_update} \title{Update URLs in a package} \usage{ url_update(path = ".", results = url_check(path)) } \arguments{ \item{path}{Path to the package} \item{results}{results from \link{url_check}.} } \value{ The results from \code{url_check(path)}, invisibly. } \description{ First uses \link{url_check} to check and then updates any URLs which are permanent (301) redirects. } \examples{ \dontrun{ url_update("my_pkg") } } urlchecker/man/url_check.Rd0000644000176200001440000000165414017460033015406 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/url_check.R \name{url_check} \alias{url_check} \title{Check urls in a package} \usage{ url_check( path = ".", db = NULL, parallel = TRUE, pool = curl::new_pool(), progress = TRUE ) } \arguments{ \item{path}{Path to the package} \item{db}{A url database} \item{parallel}{If \code{TRUE}, check the URLs in parallel} \item{pool}{A multi handle created by \code{\link[curl:multi]{curl::new_pool()}}. If \code{NULL} use a global pool.} \item{progress}{Whether to show the progress bar for parallel checks} } \value{ A \code{url_checker_db} object (invisibly). This is a \code{check_url_db} object with an added class with a custom print method. } \description{ Runs the \code{url_db_from_package_source} function in the tools package along with a function to check URLs in un-rendered Rmarkdown vignettes. 
} \examples{ \dontrun{ url_check("my_pkg") } } urlchecker/DESCRIPTION0000644000176200001440000000231314151424662014113 0ustar liggesusersPackage: urlchecker Title: Run CRAN URL Checks from Older R Versions Version: 1.0.1 Authors@R: c( person("R Core team", role = "aut", comment = "The code in urltools.R adapted from the tools package"), person("Jim", "Hester", role = "aut", comment = c(ORCID = "0000-0002-2739-7082")), person("Gábor", "Csárdi", , "csardi.gabor@gmail.com", role = c("aut", "cre")), person("RStudio", role = c("cph", "fnd")) ) Description: Provide the URL checking tools available in R 4.1+ as a package for earlier versions of R. Also uses concurrent requests so can be much faster than the serial versions. License: GPL-3 URL: https://github.com/r-lib/urlchecker BugReports: https://github.com/r-lib/urlchecker/issues Depends: R (>= 3.3) Imports: cli, curl, tools, xml2 Suggests: covr Encoding: UTF-8 RoxygenNote: 7.1.2 NeedsCompilation: no Packaged: 2021-11-30 00:26:56 UTC; jhester Author: R Core team [aut] (The code in urltools.R adapted from the tools package), Jim Hester [aut] (), Gábor Csárdi [aut, cre], RStudio [cph, fnd] Maintainer: Gábor Csárdi Repository: CRAN Date/Publication: 2021-11-30 13:40:02 UTC urlchecker/R/0000755000176200001440000000000014144744657012622 5ustar liggesusersurlchecker/R/utils.R0000644000176200001440000000115114017457444014075 0ustar liggesusersvlapply <- function(x, f, ...) vapply(x, f, logical(1)) # makes sure that pandoc is available # puts RStudio's pandoc on the PATH if it is the only one available with_pandoc_available <- function(code) { pandoc_location <- Sys.which("pandoc") if (!nzchar(pandoc_location)) { pandoc_path <- Sys.getenv("RSTUDIO_PANDOC") if (!nzchar(pandoc_path)) { stop("pandoc is not installed and on the PATH") } else { sys_path <- Sys.getenv("PATH") on.exit(Sys.setenv("PATH" = sys_path)) Sys.setenv("PATH" = paste(pandoc_path, sys_path, sep = .Platform$path.sep)) } } force(code) } urlchecker/R/zzz.R0000644000176200001440000000047614143561457013603 0ustar liggesuserstools <- new.env(parent = asNamespace("tools")) .onLoad <- function(...) { source(file = system.file(file.path("tools", "urltools.R"), package = "urlchecker"), local = tools) if (getRversion() < "4.0.0") { source(file = system.file(file.path("tools", "utils.R"), package = "urlchecker"), local = tools) } } urlchecker/R/url_update.R0000644000176200001440000000224514017460027015075 0ustar liggesusers#' Update URLs in a package #' #' First uses [url_check] to check and then updates any URLs which are permanent (301) #' redirects. #' #' @param path Path to the package #' @param results results from [url_check]. #' @return The results from `url_check(path)`, invisibly. 
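#' @details
#' Only URLs for which the check found a new location (a permanent, 301,
#' redirect) can be updated. For each of these, every file listed in the
#' `From` column of the results is rewritten in place, replacing the old URL
#' with the new one as a fixed string. If `README.md` is among the affected
#' files and a `README.Rmd` exists, the replacement is also applied to
#' `README.Rmd`. Any remaining problems that could not be fixed automatically
#' are printed afterwards so they can be dealt with by hand.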
#' @examples #' \dontrun{ #' url_update("my_pkg") #' } #' @export url_update <- function(path = ".", results = url_check(path)) { can_update <- vlapply(results[["New"]], nzchar) to_update <- results[can_update, ] for (row in seq_len(NROW(to_update))) { old <- to_update[["URL"]][[row]] new <- to_update[["New"]][[row]] root <- to_update[["root"]][[row]] if (nzchar(new)) { from <- to_update[["From"]][[row]] if (("README.md" %in% from) && file.exists("README.Rmd")) { from <- c(from, "README.Rmd") } for (file in from) { file_path <- file.path(root, file) data <- readLines(file_path) data <- gsub(old, new, data, fixed = TRUE) writeLines(data, file_path) cli::cli_alert_success("{.strong Updated:} {.url {old}} to {.url {new}} in {.file {file}}") } } } print(results[!can_update, ]) invisible(results) } urlchecker/R/parallel.R0000644000176200001440000000366513736643303014543 0ustar liggesuserscurl_fetch_headers <- function(urls, pool = curl::new_pool(), progress = FALSE) { hs <- vector("list", length(urls)) bar <- progress_bar(if (progress) length(urls), msg = "fetching ") for (i in seq_along(hs)) { u <- urls[[i]] h <- curl::new_handle(url = u) curl::handle_setopt(h, nobody = TRUE, cookiesession = 1L, followlocation = 1L, http_version = 2L, ssl_enable_alpn = 0L) if (grepl("^https?://github[.]com", u) && nzchar(a <- Sys.getenv("GITHUB_PAT", ""))) { curl::handle_setheaders(h, "Authorization" = paste("token", a)) } handle_result <- local({ i <- i function(x) { hs[[i]] <<- x bar$update() } }) handle_error <- local({ i <- i function(x) { hs[[i]] <<- structure(list(message = x), class = c("curl_error", "error", "condition")) bar$update() } }) curl::multi_add(h, done = handle_result, fail = handle_error, pool = pool) } curl::multi_run(pool = pool) out <- vector("list", length(hs)) for (i in seq_along(out)) { if (inherits(hs[[i]], "error")) { out[[i]] <- hs[[i]] } else { out[[i]] <- strsplit(rawToChar(hs[[i]]$headers), "(?<=\r\n)", perl = TRUE)[[1]] attr(out[[i]], "status") <- hs[[i]]$status_code } } out } utils::globalVariables(c("done", "fmt")) progress_bar <- function(length, msg = "") { bar <- new.env(parent = baseenv()) if (is.null(length)) { length <- 0L } bar$length <- length bar$done <- -1L digits <- trunc(log10(length)) + 1L bar$fmt <- paste0("\r", msg, "[ %", digits, "i / %", digits, "i ]") bar$update <- function() { assign("done", inherits = TRUE, done + 1L) if (length <= 0L) { return() } if (done >= length) { cat("\r", strrep(" ", nchar(fmt)), "\r", sep = "") } else { cat(sprintf(fmt, done, length), sep = "") } } environment(bar$update) <- bar bar$update() bar } urlchecker/R/url_check.R0000644000176200001440000000523714144744657014713 0ustar liggesusers#' Check urls in a package #' #' Runs the `url_db_from_package_source` function in the tools package along #' with a function to check URLs in un-rendered Rmarkdown vignettes. #' #' @param path Path to the package #' @param db A url database #' @param parallel If `TRUE`, check the URLs in parallel #' @param pool A multi handle created by [curl::new_pool()]. If `NULL` use a global pool. #' @param progress Whether to show the progress bar for parallel checks #' @return A `url_checker_db` object (invisibly). This is a `check_url_db` object #' with an added class with a custom print method. 
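#' @details
#' When `db` is `NULL`, the URL database is built from the package sources
#' with the bundled copy of the tools package's `url_db_from_package_sources()`,
#' plus the URLs found in un-rendered R Markdown vignettes; this step requires
#' pandoc to be available on the `PATH` (or via `RSTUDIO_PANDOC`). The database
#' is then checked with the bundled `check_url_db()`, using concurrent requests
#' through curl when `parallel = TRUE`. The results record the package root and
#' gain a class whose print method points at the file, line and column of each
#' problematic URL.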
#' @examples #' \dontrun{ #' url_check("my_pkg") #' } #' @export url_check <- function(path = ".", db = NULL, parallel = TRUE, pool = curl::new_pool(), progress = TRUE) { if (is.null(db)) { db <- with_pandoc_available( rbind( tools$url_db_from_package_sources(normalizePath(path)), url_db_from_package_rmd_vignettes(normalizePath(path)) ) ) } res <- tools$check_url_db(db, parallel = parallel, pool = pool, verbose = progress) if (NROW(res) > 0) { res$root <- normalizePath(path) } class(res) <- c("urlchecker_db", class(res)) res } #' @export print.urlchecker_db <- function(x, ...) { if (NROW(x) == 0) { cli::cli_alert_success("All URLs are correct!") return(invisible(x)) } for (row in seq_len(NROW(x))) { cran <- x[["CRAN"]][[row]] if (nzchar(cran)) { status <- "Error" message <- "CRAN URL not in canonical form" url <- cran new <- "" } else { status <- x[["Status"]][[row]] message <- x[["Message"]][[row]] url <- x[["URL"]][[row]] new <- x[["New"]][[row]] } root <- x[["root"]][[row]] from <- x[["From"]][[row]] for (file in from) { file_path <- file.path(root, file) data <- readLines(file_path) match <- regexpr(url, data, fixed = TRUE) lines <- which(match != -1) starts <- match[match != -1] ends <- starts + attr(match, "match.length")[match != -1] for (i in seq_along(lines)) { pointer <- paste0(strrep(" ", starts[[i]] - 1), "^", strrep("~", ends[[i]] - starts[[i]] - 1)) if (nzchar(new)) { fix_it <- paste0(strrep(" ", starts[[i]] - 1), new) cli::cli_alert_warning(" {.strong Warning:} {file}:{lines[[i]]}:{starts[[i]]} {.emph Moved} {data[lines[[i]]]} {pointer} {fix_it} ") } else { cli::cli_alert_danger(" {.strong Error:} {file}:{lines[[i]]}:{starts[[i]]} {.emph {status}: {message}} {data[lines[[i]]]} {pointer} ") } } } } invisible(x) } urlchecker/R/rmd.R0000644000176200001440000000215514144744657013532 0ustar liggesusersurl_db_from_package_rmd_vignettes <- function(dir) { urls <- path <- character() rfiles <- Filter(file.exists, tools::pkgVignettes(dir = dir)$docs) for (rfile in rfiles) { if(!is.na(rfile) && nzchar(Sys.which("pandoc"))) { rpath <- asNamespace("tools")$.file_path_relative_to_dir(rfile, dir) tfile <- tempfile(fileext = ".html") on.exit(unlink(tfile), add = TRUE) out <- .pandoc_md_for_CRAN2(rfile, tfile) if(!out$status) { rurls <- tools$.get_urls_from_HTML_file(tfile) urls <- c(urls, rurls) path <- c(path, rep.int(rpath, length(rurls))) } } } tools$url_db(urls, path) } # adapted from https://github.com/wch/r-source/blob/58d223cf3eaa50ff8cfc2caf591d67350e549e4a/src/library/tools/R/utils.R#L1847-L1857 # Adding the autolink_bare_uris extension .pandoc_md_for_CRAN2 <- function(ifile, ofile) { asNamespace("tools")$.system_with_capture("pandoc", paste(shQuote(normalizePath(ifile)), "-s", "--mathjax", "--email-obfuscation=references", "-f", "markdown+autolink_bare_uris", "-o", shQuote(ofile))) } urlchecker/NEWS.md0000644000176200001440000000021314151267641013502 0ustar liggesusers# urlchecker 1.0.1 * Gábor Csárdi is now the maintainer. # urlchecker 1.0.0 * Added a `NEWS.md` file to track changes to the package. 
urlchecker/MD50000644000176200001440000000131314151424662012714 0ustar liggesusers22455a6ca57b0f762aa9850b8afe9cc6 *DESCRIPTION 674e1807da1556689d8669edb8ed048d *NAMESPACE 2694843c19add9356557c52b34be0667 *NEWS.md 830541df56b727bf1a328fec15ec9d92 *R/parallel.R 900037e98d62c5a26794cd650a4606f3 *R/rmd.R 7a656e035744999b27b70b5ff782469b *R/url_check.R d3b7e968317614e4e2404ac421de7143 *R/url_update.R dcdc3611dfdc6323581978ff9335a45f *R/utils.R 14da4508e0f551c90a5edca9d614980e *R/zzz.R 8882114f20694c6397c825d05a3f8954 *README.md a7b006485c1522f8fa67aaafa016bd19 *inst/tools/README.md 3d06a9fef1989b83d68a8758104a2ce1 *inst/tools/urltools.R 9279739de685384038b1472ff8918253 *inst/tools/utils.R 9277da0788c10cd29cca247aa17d6284 *man/url_check.Rd d228b5ea0cd423ebcd7841209f6d993f *man/url_update.Rd urlchecker/inst/0000755000176200001440000000000014017450734013363 5ustar liggesusersurlchecker/inst/tools/0000755000176200001440000000000014143561362014523 5ustar liggesusersurlchecker/inst/tools/utils.R0000644000176200001440000000023514143561362016006 0ustar liggesusers### ** lines2str lines2str <- function(txt, sep = "") trimws(gsub("\n", sep, paste(txt, collapse = sep), fixed = TRUE, useBytes = TRUE)) urlchecker/inst/tools/urltools.R0000644000176200001440000007127214144745360016545 0ustar liggesusers# File src/library/tools/R/urltools.R # Part of the R package, https://www.R-project.org # # Copyright (C) 2015-2021 The R Core Team # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # A copy of the GNU General Public License is available at # https://www.R-project.org/Licenses/ get_IANA_URI_scheme_db <- function() { ## See ## . baseurl <- "https://www.iana.org/assignments/uri-schemes/" db <- utils::read.csv(url(paste0(baseurl, "uri-schemes-1.csv")), stringsAsFactors = FALSE, encoding = "UTF-8") names(db) <- chartr(".", "_", names(db)) db } parse_URI_reference <- function(x) { ## See RFC_3986 . re <- "^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\\?([^#]*))?(#(.*))?" if(length(x)) { y <- do.call(rbind, regmatches(x, regexec(re, x))) y <- y[, c(3, 5, 6, 8, 10), drop = FALSE] } else { y <- matrix(character(), 0L, 5L) } colnames(y) <- c("scheme", "authority", "path", "query", "fragment") y } .get_urls_from_Rd <- function(x, href = TRUE, ifdef = FALSE) { urls <- character() recurse <- function(e) { tag <- attr(e, "Rd_tag") ## Rd2HTML and Rd2latex remove whitespace and \n from URLs. if(identical(tag, "\\url")) { urls <<- c(urls, lines2str(.Rd_deparse(e, tag = FALSE))) } else if(href && identical(tag, "\\href")) { ## One could also record the \href text argument in the ## names, but then one would need to process named and ## unnamed extracted URLs separately. urls <<- c(urls, lines2str(.Rd_deparse(e[[1L]], tag = FALSE))) } else if(ifdef && length(tag) && (tag %in% c("\\if", "\\ifelse"))) { ## cf. 
testRdConditional() condition <- e[[1L]] if(all(RdTags(condition) == "TEXT")) { if(any(c("TRUE", "html") %in% trimws(strsplit(paste(condition, collapse = ""), ",")[[1L]]))) recurse(e[[2L]]) else if(tag == "\\ifelse") recurse(e[[3L]]) } } else if(is.list(e)) lapply(e, recurse) } lapply(x, recurse) unique(trimws(urls)) } .get_urls_from_HTML_file <- function(f) { doc <- xml2::read_html(f) if(!inherits(doc, "xml_node")) return(character()) nodes <- xml2::xml_find_all(doc, "//a") hrefs <- xml2::xml_attr(nodes, "href") unique(hrefs[!is.na(hrefs) & !startsWith(hrefs, "#")]) } .get_urls_from_PDF_file <- function(f) { ## Seems there is no straightforward way to extract hyperrefs from a ## PDF, hence first convert to HTML. ## Note that pdftohtml always outputs in cwd ... owd <- getwd() dir.create(d <- tempfile()) on.exit({ unlink(d, recursive = TRUE); setwd(owd) }) file.copy(normalizePath(f), d) setwd(d) g <- tempfile(tmpdir = d, fileext = ".xml") system2("pdftohtml", c("-s -q -i -c -xml", shQuote(basename(f)), shQuote(basename(g)))) ## Oh dear: seems that pdftohtml can fail without a non-zero exit ## status. if(file.exists(g)) .get_urls_from_HTML_file(g) else character() } url_db <- function(urls, parents) { ## Some people get leading LFs in URLs, so trim before checking. db <- data.frame(URL = trimws(as.character(urls)), Parent = as.character(parents), stringsAsFactors = FALSE) class(db) <- c("url_db", "data.frame") db } url_db_from_HTML_files <- function(dir, recursive = FALSE, files = NULL, verbose = FALSE) { urls <- parents <- character() if(is.null(files)) files <- list.files(dir, pattern = "[.]html$", full.names = TRUE, recursive = recursive) urls <- lapply(files, function(f) { if(verbose) message(sprintf("processing %s", .file_path_relative_to_dir(f, dir))) .get_urls_from_HTML_file(f) }) names(urls) <- files urls <- Filter(length, urls) if(length(urls)) { parents <- rep.int(.file_path_relative_to_dir(names(urls), dir), lengths(urls)) urls <- unlist(urls, use.names = FALSE) } url_db(urls, parents) } url_db_from_PDF_files <- function(dir, recursive = FALSE, files = NULL, verbose = FALSE) { urls <- parents <- character() if(is.null(files)) files <- list.files(dir, pattern = "[.]pdf$", full.names = TRUE, recursive = recursive) ## FIXME: this is simpler to do with full.names = FALSE and without ## tools:::.file_path_relative_to_dir(). 
urls <- lapply(files, function(f) { if(verbose) message(sprintf("processing %s", .file_path_relative_to_dir(f, dir))) .get_urls_from_PDF_file(f) }) names(urls) <- files urls <- Filter(length, urls) if(length(urls)) { parents <- rep.int(.file_path_relative_to_dir(names(urls), dir), lengths(urls)) urls <- unlist(urls, use.names = FALSE) } url_db(urls, parents) } url_db_from_package_Rd_db <- function(db) { urls <- Filter(length, lapply(db, .get_urls_from_Rd)) url_db(unlist(urls, use.names = FALSE), rep.int(file.path("man", names(urls)), lengths(urls))) } url_db_from_package_metadata <- function(meta) { urls <- character() fields <- c("URL", "BugReports") for(v in meta[fields]) { if(is.na(v)) next pattern <- "<(URL: *)?((https?|ftp)://[^[:space:],]*)[[:space:]]*>" m <- gregexpr(pattern, v) urls <- c(urls, .gregexec_at_pos(pattern, v, m, 3L)) regmatches(v, m) <- "" pattern <- "(^|[^>\"])((https?|ftp)://[^[:space:],]*)" m <- gregexpr(pattern, v) urls <- c(urls, .gregexec_at_pos(pattern, v, m, 3L)) } if(!is.na(v <- meta["Description"])) { pattern <- "<(URL: *)?((https?|ftp)://[^[:space:]]+)[[:space:]]*>" m <- gregexpr(pattern, v) urls <- c(urls, .gregexec_at_pos(pattern, v, m, 3L)) regmatches(v, m) <- "" pattern <- "([^>\"])((https?|ftp)://[[:alnum:]/.:@+\\_~%#?=&;,-]+[[:alnum:]/])" m <- gregexpr(pattern, v) urls <- c(urls, .gregexec_at_pos(pattern, v, m, 3L)) } url_db(urls, rep.int("DESCRIPTION", length(urls))) } url_db_from_package_citation <- function(dir, meta, installed = FALSE) { urls <- character() path <- if(installed) "CITATION" else file.path("inst", "CITATION") cfile <- file.path(dir, path) if(file.exists(cfile)) { cinfo <- .read_citation_quietly(cfile, meta) if(!inherits(cinfo, "error")) urls <- trimws(unique(unlist(cinfo$url, use.names = FALSE))) } url_db(urls, rep.int(path, length(urls))) } url_db_from_package_news <- function(dir, installed = FALSE) { path <- if(installed) "NEWS.Rd" else file.path("inst", "NEWS.Rd") nfile <- file.path(dir, path) urls <- if(file.exists(nfile)) { macros <- initialRdMacros() .get_urls_from_Rd(prepare_Rd(parse_Rd(nfile, macros = macros), stages = "install")) } else character() url_db(urls, rep.int(path, length(urls))) } url_db_from_package_HTML_files <- function(dir, installed = FALSE) { path <- if(installed) "doc" else file.path("inst", "doc") files <- Sys.glob(file.path(dir, path, "*.html")) if(installed && file.exists(rfile <- file.path(dir, "README.html"))) files <- c(files, rfile) url_db_from_HTML_files(dir, files = files) } url_db_from_package_README_md <- function(dir, installed = FALSE) { urls <- path <- character() rfile <- Filter(file.exists, c(if(!installed) file.path(dir, "inst", "README.md"), file.path(dir, "README.md")))[1L] if(!is.na(rfile) && nzchar(Sys.which("pandoc"))) { path <- .file_path_relative_to_dir(rfile, dir) tfile <- tempfile("README", fileext = ".html") on.exit(unlink(tfile)) out <- .pandoc_md_for_CRAN(rfile, tfile) if(!out$status) { urls <- .get_urls_from_HTML_file(tfile) } } url_db(urls, rep.int(path, length(urls))) } url_db_from_package_NEWS_md <- function(dir, installed = FALSE) { urls <- path <- character() nfile <- Filter(file.exists, c(if(!installed) file.path(dir, "inst", "NEWS.md"), file.path(dir, "NEWS.md")))[1L] if(!is.na(nfile) && nzchar(Sys.which("pandoc"))) { path <- .file_path_relative_to_dir(nfile, dir) tfile <- tempfile("NEWS", fileext = ".html") on.exit(unlink(tfile)) out <- .pandoc_md_for_CRAN(nfile, tfile) if(!out$status) { urls <- .get_urls_from_HTML_file(tfile) } } url_db(urls, rep.int(path, 
length(urls))) } url_db_from_package_sources <- function(dir, add = FALSE) { meta <- .read_description(file.path(dir, "DESCRIPTION")) db <- rbind(url_db_from_package_metadata(meta), url_db_from_package_Rd_db(Rd_db(dir = dir)), url_db_from_package_citation(dir, meta), url_db_from_package_news(dir)) if(requireNamespace("xml2", quietly = TRUE)) { db <- rbind(db, url_db_from_package_HTML_files(dir), url_db_from_package_README_md(dir), url_db_from_package_NEWS_md(dir) ) } if(add) db$Parent <- file.path(basename(dir), db$Parent) db } url_db_from_installed_packages <- function(packages, lib.loc = NULL, verbose = FALSE) { if(!length(packages)) return() one <- function(p) { if(verbose) message(sprintf("processing %s", p)) dir <- system.file(package = p, lib.loc = lib.loc) if(dir == "") return() meta <- .read_description(file.path(dir, "DESCRIPTION")) rddb <- Rd_db(p, lib.loc = dirname(dir)) db <- rbind(url_db_from_package_metadata(meta), url_db_from_package_Rd_db(rddb), url_db_from_package_citation(dir, meta, installed = TRUE), url_db_from_package_news(dir, installed = TRUE)) if(requireNamespace("xml2", quietly = TRUE)) { db <- rbind(db, url_db_from_package_HTML_files(dir, installed = TRUE), url_db_from_package_README_md(dir, installed = TRUE), url_db_from_package_NEWS_md(dir, installed = TRUE) ) } db$Parent <- file.path(p, db$Parent) db } do.call(rbind, c(lapply(packages, one), list(make.row.names = FALSE))) } get_IANA_HTTP_status_code_db <- function() { ## See ## baseurl <- "https://www.iana.org/assignments/http-status-codes/" db <- utils::read.csv(url(paste0(baseurl, "http-status-codes-1.csv")), stringsAsFactors = FALSE) ## Drop "Unassigned". db[db$Description != "Unassigned", ] } ## See ## and , ## Section 4.2.2 "Numeric Order List of Reply Codes", ## and , ## Section 5 "New FTP Replies". ## Only need those >= 400. table_of_FTP_server_return_codes <- c("421" = "Service not available, closing control connection.", "425" = "Can't open data connection.", "426" = "Connection closed; transfer aborted.", "430" = "Invalid username or password", "431" = "Need some unavailable resource to process security.", "434" = "Requested host unavailable.", "450" = "Requested file action not taken.", "451" = "Requested action aborted: local error in processing.", "452" = "Requested action not taken. Insufficient storage space in system.", "500" = "Syntax error, command unrecognized.", "501" = "Syntax error in parameters or arguments.", "502" = "Command not implemented.", "503" = "Bad sequence of commands.", "504" = "Command not implemented for that parameter.", "530" = "Not logged in.", "532" = "Need account for storing files.", "533" = "Command protection level denied for policy reasons.", "534" = "Request denied for policy reasons.", "535" = "Failed security check (hash, sequence, etc).", "536" = "Requested PROT level not supported by mechanism.", "537" = "Command protection level not supported by security mechanism.", "550" = "Requested action not taken. File unavailable", "551" = "Requested action aborted: page type unknown.", "552" = "Requested file action aborted. Exceeded storage allocation (for current directory or dataset).", "553" = "Requested action not taken. File name not allowed.", "631" = "Integrity protected reply.", "632" = "Confidentiality and integrity protected reply.", "633" = "Confidentiality protected reply." 
) check_url_db <- function(db, remote = TRUE, verbose = FALSE, parallel = FALSE, pool = NULL) { use_curl <- !parallel && config_val_to_logical(Sys.getenv("_R_CHECK_URLS_USE_CURL_", "TRUE")) && requireNamespace("curl", quietly = TRUE) if(parallel && is.null(pool)) pool <- curl::new_pool() .gather <- function(u = character(), p = list(), s = rep.int("", length(u)), m = rep.int("", length(u)), new = rep.int("", length(u)), cran = rep.int("", length(u)), spaces = rep.int("", length(u)), R = rep.int("", length(u))) { y <- data.frame(URL = u, From = I(p), Status = s, Message = m, New = new, CRAN = cran, Spaces = spaces, R = R, row.names = NULL, stringsAsFactors = FALSE) y$From <- p class(y) <- c("check_url_db", "data.frame") y } .fetch_headers <- if(parallel) function(urls) .fetch_headers_via_curl(urls, verbose, pool) else function(urls) .fetch_headers_via_base(urls, verbose) .check_ftp <- function(u, h) { if(inherits(h, "error")) { s <- "-1" msg <- sub("[[:space:]]*$", "", conditionMessage(h)) } else { s <- as.character(attr(h, "status")) msg <- table_of_FTP_server_return_codes[s] } c(s, msg, "", "") } .check_http <- if(remote) function(u, h) c(.check_http_A(u, h), .check_http_B(u)) else function(u, h) c(rep.int("", 3L), .check_http_B(u)) .check_http_A <- function(u, h) { newLoc <- "" if(inherits(h, "error")) { s <- "-1" msg <- sub("[[:space:]]*$", "", conditionMessage(h)) if(grepl(paste(c("server certificate verification failed", "failed to get server cert", "libcurl error code (51|60)"), collapse = "|"), msg)) { h2 <- tryCatch(curlGetHeaders(u, verify = FALSE), error = identity) s2 <- as.character(attr(h2, "status")) msg <- paste0(msg, "\n\t(Status without verification: ", table_of_HTTP_status_codes[s2], ")") } } else { s <- as.character(attr(h, "status")) msg <- table_of_HTTP_status_codes[s] } ## Look for redirected URLs ## According to ## the first ## line of a response is the status-line, with "a possibly empty ## textual phrase describing the status code", so only look for ## a 301 status code in the first line. if(grepl(" 301 ", h[1L], useBytes = TRUE) && !startsWith(u, "https://doi.org/") && !startsWith(u, "http://dx.doi.org/")) { ## Get the new location from the last consecutive 301 ## obtained. h <- split(h, c(0L, cumsum(h == "\r\n")[-length(h)])) i <- vapply(h, function(e) grepl(" 301 ", e[1L], useBytes = TRUE), NA) h <- h[[which(!i)[1L] - 1L]] pos <- grep("^[Ll]ocation: ", h, useBytes = TRUE) if(length(pos)) { loc <- sub("^[Ll]ocation: ([^\r]*)\r\n", "\\1", h[pos[1L]]) ## Ouch. According to RFC 7231, the location is a URI ## reference, and may be relative in which case it needs ## resolving against the effect request URI. ## . ## Not quite straightforward, hence do not report such ## 301s. ## (Alternatively, could try reporting the 301 but no ## new location.) if(nzchar(parse_URI_reference(loc)[1L, "scheme"])) newLoc <- loc ## (Note also that fragments would need extra care.) 
} } ## if((s != "200") && use_curl) { g <- .curl_GET_status(u) if(g == "200") { s <- g msg <- "OK" } } ## A mis-configured site if (s == "503" && any(grepl("www.sciencedirect.com", c(u, newLoc)))) s <- "405" c(s, msg, newLoc) } .check_http_B <- function(u) { ul <- tolower(u) cran <- ((grepl("^https?://cran.r-project.org/web/packages", ul) && !grepl("^https?://cran.r-project.org/web/packages/[.[:alnum:]_]+(html|pdf|rds)$", ul)) || (grepl("^https?://cran.r-project.org/web/views/[[:alnum:]]+[.]html$", ul)) || startsWith(ul, "http://cran.r-project.org") || any(startsWith(ul, mirrors))) R <- grepl("^http://(www|bugs|journal).r-project.org", ul) spaces <- grepl(" ", u) c(if(cran) u else "", if(spaces) u else "", if(R) u else "") } bad <- .gather() if(!NROW(db)) return(bad) ## Could also use utils::getCRANmirrors(local.only = TRUE). mirrors <- c(utils::read.csv(file.path(R.home("doc"), "CRAN_mirrors.csv"), as.is = TRUE, encoding = "UTF-8")$URL, "http://cran.rstudio.com/", "https://cran.rstudio.com/") mirrors <- tolower(sub("/$", "", mirrors)) if(inherits(db, "check_url_db")) { ## Allow re-checking check results. parents <- db$From urls <- db$URL } else { parents <- split(db$Parent, db$URL) urls <- names(parents) } parts <- parse_URI_reference(urls) ## Empty URLs. ind <- apply(parts == "", 1L, all) if(any(ind)) { len <- sum(ind) bad <- rbind(bad, .gather(urls[ind], parents[ind], m = rep.int("Empty URL", len))) } ## Invalid URI schemes. schemes <- parts[, 1L] ind <- is.na(match(schemes, c("", IANA_URI_scheme_db$URI_Scheme, ## Also allow 'javascript' scheme, see ## ## (but apparently never registered with IANA). "javascript"))) if(any(ind)) { len <- sum(ind) msg <- rep.int("Invalid URI scheme", len) doi <- schemes[ind] == "doi" if(any(doi)) msg[doi] <- paste(msg[doi], "(use \\doi for DOIs in Rd markup)") bad <- rbind(bad, .gather(urls[ind], parents[ind], m = msg)) } ## ftp. pos <- which(schemes == "ftp") if(length(pos) && remote) { urlspos <- urls[pos] headers <- .fetch_headers(urlspos) results <- do.call(rbind, Map(.check_ftp, urlspos, headers)) status <- as.numeric(results[, 1L]) ind <- (status < 0L) | (status >= 400L) if(any(ind)) { pos <- pos[ind] s <- as.character(status[ind]) s[s == "-1"] <- "Error" m <- results[ind, 2L] m[is.na(m)] <- "" bad <- rbind(bad, .gather(urls[pos], parents[pos], s, m)) } } ## http/https. pos <- which(schemes == "http" | schemes == "https") if(length(pos)) { urlspos <- urls[pos] headers <- .fetch_headers(urlspos) results <- do.call(rbind, Map(.check_http, urlspos, headers)) status <- as.numeric(results[, 1L]) ## 405 is HTTP not allowing HEAD requests ## maybe also skip 500, 503, 504 as likely to be temporary issues ind <- is.na(match(status, c(200L, 405L, NA))) | nzchar(results[, 3L]) | nzchar(results[, 4L]) | nzchar(results[, 5L]) | nzchar(results[, 6L]) if(any(ind)) { pos <- pos[ind] s <- as.character(status[ind]) s[is.na(s)] <- "" s[s == "-1"] <- "Error" m <- results[ind, 2L] m[is.na(m)] <- "" bad <- rbind(bad, .gather(urls[pos], parents[pos], s, m, results[ind, 3L], results[ind, 4L], results[ind, 5L], results[ind, 6L])) } } bad } format.check_url_db <- function(x, ...) 
{ if(!NROW(x)) return(character()) u <- x$URL new <- x$New ind <- nzchar(new) if(any(ind)) { u[ind] <- sprintf("%s (moved to %s)", u[ind], new[ind]) if(config_val_to_logical(Sys.getenv("_R_CHECK_URLS_SHOW_301_STATUS_", "FALSE"))) { x$Message[ind] <- "Moved Permanently" x$Status[ind] <- "301" } } paste0(sprintf("URL: %s", u), sprintf("\nFrom: %s", vapply(x$From, paste, "", collapse = "\n ")), ifelse((s <- x$Status) == "", "", sprintf("\nStatus: %s", s)), ifelse((m <- x$Message) == "", "", sprintf("\nMessage: %s", gsub("\n", "\n ", m, fixed=TRUE))), ifelse((m <- x$Spaces) == "", "", "\nURL contains spaces"), ifelse((m <- x$CRAN) == "", "", "\nCRAN URL not in canonical form"), ifelse((m <- x$R) == "", "", "\nR-project URL not in canonical form") ) } print.check_url_db <- function(x, ...) { if(NROW(x)) writeLines(paste(format(x), collapse = "\n\n")) invisible(x) } as.matrix.check_url_db <- function(x, ...) { n <- lengths(x[["From"]]) y <- do.call(cbind, c(list(URL = rep.int(x[["URL"]], n), Parent = unlist(x[["From"]])), lapply(x[-c(1L, 2L)], rep.int, n))) rownames(y) <- NULL y } .fetch_headers_via_base <- function(urls, verbose = FALSE, ids = urls) Map(function(u, verbose, i) { if(verbose) message(sprintf("processing %s", i)) tryCatch(curlGetHeaders(u), error = identity) }, urls, verbose, ids) .fetch_headers_via_curl <- function(urls, verbose = FALSE, pool = NULL) { .progress_bar <- function(length, msg = "") { bar <- new.env(parent = baseenv()) if(is.null(length)) { length <- 0L } ## ## make codetools happy done <- fmt <- NULL ## bar$length <- length bar$done <- -1L digits <- trunc(log10(length)) + 1L bar$fmt <- paste0("\r", msg, "[ %", digits, "i / %", digits, "i ]") bar$update <- function() { assign("done", inherits = TRUE, done + 1L) if (length <= 0L) { return() } if (done >= length) { cat("\r", strrep(" ", nchar(fmt)), "\r", sep = "") } else { cat(sprintf(fmt, done, length), sep = "") } } environment(bar$update) <- bar bar$update() bar } if(is.null(pool)) pool <- curl::new_pool() hs <- vector("list", length(urls)) bar <- .progress_bar(if (verbose) length(urls), msg = "fetching ") for(i in seq_along(hs)) { u <- urls[[i]] h <- curl::new_handle(url = u) curl::handle_setopt(h, nobody = TRUE, cookiesession = 1L, followlocation = 1L, http_version = 2L, ssl_enable_alpn = 0L) timeout <- as.integer(getOption("timeout")) if(!is.na(timeout) && (timeout > 0L)) curl::handle_setopt(h, connecttimeout = timeout, timeout = timeout) if(grepl("^https?://github[.]com", u) && nzchar(a <- Sys.getenv("GITHUB_PAT", ""))) { curl::handle_setheaders(h, "Authorization" = paste("token", a)) } handle_result <- local({ i <- i function(x) { hs[[i]] <<- x bar$update() } }) handle_error <- local({ i <- i function(x) { hs[[i]] <<- structure(list(message = x), class = c("curl_error", "error", "condition")) bar$update() } }) curl::multi_add(h, done = handle_result, fail = handle_error, pool = pool) } curl::multi_run(pool = pool) out <- vector("list", length(hs)) for(i in seq_along(out)) { if(inherits(hs[[i]], "error")) { out[[i]] <- hs[[i]] } else { out[[i]] <- strsplit(rawToChar(hs[[i]]$headers), "(?<=\r\n)", perl = TRUE)[[1L]] attr(out[[i]], "status") <- hs[[i]]$status_code } } out } .curl_GET_status <- function(u, verbose = FALSE) { if(verbose) message(sprintf("processing %s", u)) ## Configure curl handle for better luck with JSTOR URLs/DOIs. 
## Alternatively, special-case requests to ## https?://doi.org/10.2307 ## https?://www.jstor.org h <- curl::new_handle() curl::handle_setopt(h, cookiesession = 1, followlocation = 1, http_version = 2L, ssl_enable_alpn = 0) timeout <- as.integer(getOption("timeout")) if(!is.na(timeout) && (timeout > 0L)) curl::handle_setopt(h, connecttimeout = timeout, timeout = timeout) if(startsWith(u, "https://github.com") && nzchar(a <- Sys.getenv("GITHUB_PAT", ""))) curl::handle_setheaders(h, "Authorization" = paste("token", a)) g <- tryCatch(curl::curl_fetch_memory(u, handle = h), error = identity) if(inherits(g, "error")) -1L else g$status_code } urlchecker/inst/tools/README.md0000644000176200001440000000042014017450734015776 0ustar liggesusersThe file `urltools.R` in this directory is copied from the SVN with the following ``` svn cat -r 80050 https://svn.r-project.org/R/trunk/src/library/tools/R/urltools.R > urltools.R ``` When you want to update it run the above command with the latest SVN revision number