rematch/0000755000176200001440000000000014473671472011714 5ustar liggesusersrematch/NAMESPACE0000644000176200001440000000012414473607241013121 0ustar liggesusers# Generated by roxygen2: do not edit by hand export(re_match) export(re_match_all) rematch/LICENSE0000644000176200001440000000010214473607656012715 0ustar liggesusersYEAR: 2023 COPYRIGHT HOLDER: Mango Solutions; Posit Software, PBC rematch/README.md0000644000176200001440000000710614473617257013200 0ustar liggesusers # rematch > Match Regular Expressions with a Nicer ‘API’ [![R-CMD-check](https://github.com/gaborcsardi/rematch/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/gaborcsardi/rematch/actions/workflows/R-CMD-check.yaml) [![](https://www.r-pkg.org/badges/version/rematch)](https://www.r-pkg.org/pkg/rematch) [![CRAN RStudio mirror downloads](https://cranlogs.r-pkg.org/badges/rematch)](https://www.r-pkg.org/pkg/rematch) [![Coverage Status](https://img.shields.io/codecov/c/github/gaborcsardi/rematch/main.svg)](https://app.codecov.io/github/gaborcsardi/rematch?branch=main) A small wrapper on ‘regexpr’ to extract the matches and captured groups from the match of a regular expression to a character vector. ## Installation ``` r source("https://install-github.me/gaborcsardi/rematch") ``` ## Usage ``` r library(rematch) ``` ``` r dates <- c("2016-04-20", "1977-08-08", "not a date", "2016", "76-03-02", "2012-06-30", "2015-01-21 19:58") isodate <- "([0-9]{4})-([0-1][0-9])-([0-3][0-9])" re_match(text = dates, pattern = isodate) ``` #> .match #> [1,] "2016-04-20" "2016" "04" "20" #> [2,] "1977-08-08" "1977" "08" "08" #> [3,] NA NA NA NA #> [4,] NA NA NA NA #> [5,] NA NA NA NA #> [6,] "2012-06-30" "2012" "06" "30" #> [7,] "2015-01-21" "2015" "01" "21" ``` r isodaten <- "(?[0-9]{4})-(?[0-1][0-9])-(?[0-3][0-9])" re_match(text = dates, pattern = isodaten) ``` #> .match year month day #> [1,] "2016-04-20" "2016" "04" "20" #> [2,] "1977-08-08" "1977" "08" "08" #> [3,] NA NA NA NA #> [4,] NA NA NA NA #> [5,] NA NA NA NA #> [6,] "2012-06-30" "2012" "06" "30" #> [7,] "2015-01-21" "2015" "01" "21" ``` r github_repos <- c("metacran/crandb", "jeroenooms/curl@v0.9.3", "jimhester/covr#47", "hadley/dplyr@*release", "r-lib/remotes@550a3c7d3f9e1493a2ba", "/$&@R64&3") owner_rx <- "(?:(?[^/]+)/)?" repo_rx <- "(?[^/@#]+)" subdir_rx <- "(?:/(?[^@#]*[^@#/]))?" ref_rx <- "(?:@(?[^*].*))" pull_rx <- "(?:#(?[0-9]+))" release_rx <- "(?:@(?[*]release))" ref_or_pull_or_release_rx <- sprintf("(?:%s|%s|%s)?", ref_rx, pull_rx, release_rx) github_rx <- sprintf("^(?:%s%s%s%s|(?.*))$", owner_rx, repo_rx, subdir_rx, ref_or_pull_or_release_rx) out <- re_match(text = github_repos, pattern = github_rx) out ``` #> .match owner repo subdir #> [1,] "metacran/crandb" "metacran" "crandb" "" #> [2,] "jeroenooms/curl@v0.9.3" "jeroenooms" "curl" "" #> [3,] "jimhester/covr#47" "jimhester" "covr" "" #> [4,] "hadley/dplyr@*release" "hadley" "dplyr" "" #> [5,] "r-lib/remotes@550a3c7d3f9e1493a2ba" "r-lib" "remotes" "" #> [6,] "/$&@R64&3" "" "" "" #> ref pull release catchall #> [1,] "" "" "" "" #> [2,] "v0.9.3" "" "" "" #> [3,] "" "47" "" "" #> [4,] "" "" "*release" "" #> [5,] "550a3c7d3f9e1493a2ba" "" "" "" #> [6,] "" "" "" "/$&@R64&3" ## License MIT © Mango Solutions; Posit Software, PBC rematch/man/0000755000176200001440000000000014473607241012460 5ustar liggesusersrematch/man/re_match.Rd0000644000176200001440000000241314473607241014531 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/package.R \name{re_match} \alias{re_match} \title{Match a regular expression to a character vector} \usage{ re_match(pattern, text, ...) } \arguments{ \item{pattern}{Regular expression, defaults to be a PCRE expression. See \code{\link[base]{regex}} for more about regular expressions.} \item{text}{Character vector.} \item{...}{Additional arguments to pass to \code{\link[base]{regexpr}}.} } \value{ A character matrix of the matched (sub)strings. The first column is always the full match. This column is named \code{.match}. The result of the columns are capture groups, with appropriate column names, if the groups are named. } \description{ This function is a small wrapper on the \code{\link[base]{regexpr}} base R function, to provide an API that is easier to use. } \details{ Currently only the first occurence of the pattern is used. } \examples{ dates <- c("2016-04-20", "1977-08-08", "not a date", "2016", "76-03-02", "2012-06-30", "2015-01-21 19:58") isodate <- "([0-9]{4})-([0-1][0-9])-([0-3][0-9])" re_match(text = dates, pattern = isodate) # The same with named groups isodaten <- "(?[0-9]{4})-(?[0-1][0-9])-(?[0-3][0-9])" re_match(text = dates, pattern = isodaten) } rematch/man/rematch.Rd0000644000176200001440000000061114473607241014370 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/package.R \docType{package} \name{rematch} \alias{rematch} \alias{rematch-package} \title{Match Regular Expressions with a Nicer 'API'} \description{ A small wrapper on 'regexpr' to extract the matches and captured groups from the match of a regular expression to a character vector. See \code{\link{re_match}}. } rematch/man/re_match_all.Rd0000644000176200001440000000203214473607241015356 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/package.R \name{re_match_all} \alias{re_match_all} \title{Extract all matches of a regular expression} \usage{ re_match_all(pattern, text, ...) } \arguments{ \item{pattern}{Regular expression, defaults to be a PCRE expression. See \code{\link[base]{regex}} for more about regular expressions.} \item{text}{Character vector.} \item{...}{Additional arguments to pass to \code{\link[base]{regexpr}}.} } \value{ A list of character matrices. Each list element contains the matches of one string in the input character vector. Each matrix has a \code{.match} column that contains the matching part of the string. Additional columns are added for capture groups. For named capture groups, the columns are named. } \description{ This function is a thin wrapper on the \code{\link[base]{gregexpr}} base R function, to provide an API that is easier to use. It is similar to \code{\link{re_match}}, but extracts all matches, including potentially named capture groups. } rematch/DESCRIPTION0000644000176200001440000000117014473671472013421 0ustar liggesusersPackage: rematch Title: Match Regular Expressions with a Nicer 'API' Version: 2.0.0 Author: Gabor Csardi Maintainer: Gabor Csardi Description: A small wrapper on 'regexpr' to extract the matches and captured groups from the match of a regular expression to a character vector. License: MIT + file LICENSE URL: https://github.com/gaborcsardi/rematch BugReports: https://github.com/gaborcsardi/rematch/issues RoxygenNote: 5.0.1.9000 Suggests: covr, testthat Encoding: UTF-8 NeedsCompilation: no Packaged: 2023-08-30 12:10:51 UTC; gaborcsardi Repository: CRAN Date/Publication: 2023-08-30 16:50:02 UTC rematch/tests/0000755000176200001440000000000014473607241013047 5ustar liggesusersrematch/tests/testthat/0000755000176200001440000000000014473671472014716 5ustar liggesusersrematch/tests/testthat/test-all.R0000644000176200001440000000247014473607241016562 0ustar liggesusers context("re_match_all") test_that("corner cases", { res <- re_match_all("", c("foo", "bar")) expect_equal( res, list( cbind(.match = c("", "", "")), cbind(.match = c("", "", "")) ) ) res <- re_match_all("", c("", "bar")) expect_equal( res, list( cbind(.match = ""), cbind(.match = c("", "", "")) ) ) res <- re_match_all("", character()) expect_equal(res, list()) res <- re_match_all("foo", character()) expect_equal(res, list()) res <- re_match_all("foo", "not") expect_equal(res, list(cbind(.match = character()))) }) test_that("capture groups", { pattern <- "([0-9]+)" res <- re_match_all(pattern, c("123xxxx456", "", "xxx", "1", "123")) expect_equal( res, list( cbind(.match = c("123", "456"), c("123", "456")), cbind(.match = character(), character()), cbind(.match = character(), character()), cbind(.match = "1", "1"), cbind(.match = "123", "123") ) ) }) test_that("scalar text with capure groups", { res <- re_match_all("\\b(\\w+)\\b", "foo bar") expect_equal(res, list(cbind(.match = c("foo", "bar"), c("foo", "bar")))) res <- re_match_all("\\b(?\\w+)\\b", "foo bar") expect_equal( res, list( cbind(.match = c("foo", "bar"), word = c("foo", "bar")) ) ) }) rematch/tests/testthat/test.R0000644000176200001440000000404014473607241016007 0ustar liggesusers context("rematch") test_that("corner cases", { res <- re_match("", c("foo", "bar")) expect_equal(res, cbind(.match = c("", ""))) res <- re_match("", c("foo", "", "bar")) expect_equal(res, cbind(.match = c("", "", ""))) res <- re_match("", character()) expect_equal(res, cbind(.match = character())) res <- re_match("foo", character()) expect_equal(res, cbind(.match = character())) res <- re_match("foo (g1) (g2)", character()) expect_equal(res, cbind(.match = character(), character(), character())) res <- re_match("foo (g1) (?g2)", character()) expect_equal( res, cbind(.match = character(), character(), name = character()) ) res <- re_match("foo", "not") expect_equal(res, cbind(.match = NA_character_)) }) test_that("not so corner cases", { dates <- c("2016-04-20", "1977-08-08", "not a date", "2016", "76-03-02", "2012-06-30", "2015-01-21 19:58") isodate <- "([0-9]{4})-([0-1][0-9])-([0-3][0-9])" expect_equal( re_match(text = dates, pattern = isodate), cbind( .match = c(dates[1:2], NA, NA, NA, "2012-06-30", "2015-01-21"), c("2016", "1977", NA, NA, NA, "2012", "2015"), c("04", "08", NA, NA, NA, "06", "01"), c("20", "08", NA, NA, NA, "30", "21") ) ) isodaten <- "(?[0-9]{4})-(?[0-1][0-9])-(?[0-3][0-9])" expect_equal( re_match(text = dates, pattern = isodaten), cbind( .match = c(dates[1:2], NA, NA, NA, "2012-06-30", "2015-01-21"), year = c("2016", "1977", NA, NA, NA, "2012", "2015"), month = c("04", "08", NA, NA, NA, "06", "01"), day = c("20", "08", NA, NA, NA, "30", "21") ) ) }) test_that("UTF8", { res <- re_match("Gábor", c("Gábor Csárdi")) expect_equal(res, cbind(.match = "Gábor")) }) test_that("text is scalar & capture groups", { res <- re_match("(\\w+) (\\w+)", "foo bar") expect_equal(res, cbind(.match = "foo bar", "foo", "bar")) res <- re_match("(?\\w+) (?\\w+)", "foo bar") expect_equal(res, cbind(.match = "foo bar", g1 = "foo", g2 = "bar")) }) rematch/tests/testthat.R0000644000176200001440000000010714473607241015030 0ustar liggesusers if (require(testthat)) { library(rematch) test_check("rematch") } rematch/R/0000755000176200001440000000000014473607241012106 5ustar liggesusersrematch/R/package.R0000644000176200001440000001020214473607241013617 0ustar liggesusers #' Match Regular Expressions with a Nicer 'API' #' #' A small wrapper on 'regexpr' to extract the matches and captured #' groups from the match of a regular expression to a character vector. #' See \code{\link{re_match}}. #' #' @docType package #' @name rematch NULL #' Match a regular expression to a character vector #' #' This function is a small wrapper on the \code{\link[base]{regexpr}} #' base R function, to provide an API that is easier to use. #' #' Currently only the first occurence of the pattern is used. #' #' @param pattern Regular expression, defaults to be a PCRE #' expression. See \code{\link[base]{regex}} for more about #' regular expressions. #' @param text Character vector. #' @param ... Additional arguments to pass to #' \code{\link[base]{regexpr}}. #' @return A character matrix of the matched (sub)strings. #' The first column is always the full match. This column is #' named \code{.match}. The result of the columns are capture groups, #' with appropriate column names, if the groups are named. #' #' @export #' @examples #' dates <- c("2016-04-20", "1977-08-08", "not a date", "2016", #' "76-03-02", "2012-06-30", "2015-01-21 19:58") #' isodate <- "([0-9]{4})-([0-1][0-9])-([0-3][0-9])" #' re_match(text = dates, pattern = isodate) #' #' # The same with named groups #' isodaten <- "(?[0-9]{4})-(?[0-1][0-9])-(?[0-3][0-9])" #' re_match(text = dates, pattern = isodaten) re_match <- function(pattern, text, ...) { stopifnot(is.character(pattern), length(pattern) == 1, !is.na(pattern)) text <- as.character(text) match <- regexpr(pattern, text, perl = TRUE, ...) ## Full matches res <- cbind(as.character( ifelse( match == -1, NA_character_, substr(text, match, match + attr(match, "match.length") - 1) ) )) if (!is.null(attr(match, "capture.start"))) { res <- cbind( res, rbind(vapply( seq_len(NCOL(attr(match, "capture.start"))), function(i) { start <- attr(match, "capture.start")[,i] len <- attr(match, "capture.length")[,i] end <- start + len - 1 res <- substr(text, start, end) res[ start == -1 ] <- NA_character_ res }, character(length(match)) )) ) } colnames(res) <- c(".match", attr(match, "capture.names")) res } #' Extract all matches of a regular expression #' #' This function is a thin wrapper on the \code{\link[base]{gregexpr}} #' base R function, to provide an API that is easier to use. It is #' similar to \code{\link{re_match}}, but extracts all matches, including #' potentially named capture groups. #' #' @param ... Additional arguments to pass to #' \code{\link[base]{regexpr}}. #' @inheritParams re_match #' @return A list of character matrices. Each list element contains the #' matches of one string in the input character vector. Each matrix #' has a \code{.match} column that contains the matching part of the #' string. Additional columns are added for capture groups. For named #' capture groups, the columns are named. #' #' @export re_match_all <- function(pattern, text, ...) { stopifnot(is.character(pattern), length(pattern) == 1, !is.na(pattern)) text <- as.character(text) match <- gregexpr(pattern, text, perl = TRUE, ...) mapply(re_match_all1, match, text, SIMPLIFY = FALSE) } re_match_all1 <- function(match, text) { match_len <- attr(match, "match.length") capt_start <- attr(match, "capture.start") capt_len <- attr(match, "capture.length") capt_names <- attr(match, "capture.names") match <- as.vector(match) if (identical(match, -1L)) { return(matrix( character(), nrow = 0, ncol = length(capt_names) + 1, dimnames = list(character(), c(".match", capt_names)) )) } res <- cbind(as.character(substring(text, match, match + match_len - 1))) if (!is.null(capt_start)) { res <- cbind( res, rbind(vapply( seq_len(NCOL(capt_start)), function(i) { substring(text, capt_start[,i], capt_start[,i] + capt_len[,i] - 1) }, character(length(match)) )) ) } colnames(res) <- c(".match", capt_names) res } rematch/NEWS.md0000644000176200001440000000052314473630642013004 0ustar liggesusers # rematch 2.0.0 * New `re_match_all` function to extract all matches. * Removed the `perl` arguments, we always use PERL compatible regular expressions now. # rematch 1.0.1 * Make `R CMD check` work when `testthat` is not available. * Fixed a bug with group capture when `text` is a scalar. # rematch 1.0.0 First public release. rematch/MD50000644000176200001440000000111014473671472012215 0ustar liggesusers8894337e18744d1be5602e07afc266fd *DESCRIPTION 21e3a7621ad0adb1225f87cf96639fc4 *LICENSE f60282b37bd342ec25ba42e90600c9db *NAMESPACE 2a86a2c81435e5abbfa0586a19e5fbed *NEWS.md ce0acdd79ba7de00c9e6ae64a8cedc66 *R/package.R 340c38767097a08dc6ac56b5b373a440 *README.md 8c241f4abafd70a930f89539ddd9547e *man/re_match.Rd 78a096b9e09e027fd69c43c02f8b7563 *man/re_match_all.Rd a8d0ce22292bf9718bcce898db9c5480 *man/rematch.Rd c99ec8380983645be33bdffcc35830d5 *tests/testthat.R 9f66f5428f9e65ad15050aec8c50db76 *tests/testthat/test-all.R 0a9a58b76071b82c6132742ebf4afe6f *tests/testthat/test.R