stringr/ 0000755 0001751 0000144 00000000000 12520375150 011760 5 ustar hornik users stringr/inst/ 0000755 0001751 0000144 00000000000 12520151252 012730 5 ustar hornik users stringr/inst/doc/ 0000755 0001751 0000144 00000000000 12520151252 013475 5 ustar hornik users stringr/inst/doc/stringr.R 0000644 0001751 0000144 00000004236 12520151252 015315 0 ustar hornik users ## ---- echo=FALSE---------------------------------------------------------
library("stringr")
knitr::opts_chunk$set(comment = "#>", collapse = TRUE)
## ------------------------------------------------------------------------
# Sample inputs: one string without a phone number, two holding a single
# number each, and one holding two numbers.
strings <- c(
"apple",
"219 733 8965",
"329-293-8753",
"Work: 579-499-7527; Home: 543.355.3679"
)
# Three capture groups of 3, 3 and 4 digits (the first digit must be 2-9),
# each pair of groups separated by one space, hyphen or dot.
phone <- "([2-9][0-9]{2})[- .]([0-9]{3})[- .]([0-9]{4})"
## ------------------------------------------------------------------------
# Which strings contain phone numbers?
str_detect(strings, phone)
str_subset(strings, phone)
## ------------------------------------------------------------------------
# Where in the string is the phone number located?
(loc <- str_locate(strings, phone))
str_locate_all(strings, phone)
## ------------------------------------------------------------------------
# What are the phone numbers?
str_extract(strings, phone)
str_extract_all(strings, phone)
str_extract_all(strings, phone, simplify = TRUE)
## ------------------------------------------------------------------------
# Pull out the three components of the match
str_match(strings, phone)
str_match_all(strings, phone)
## ------------------------------------------------------------------------
str_replace(strings, phone, "XXX-XXX-XXXX")
str_replace_all(strings, phone, "XXX-XXX-XXXX")
## ------------------------------------------------------------------------
# Convert colour specifications accepted by col2rgb() (e.g. colour names)
# into "#RRGGBB" hex strings, one per input element.
col2hex <- function(col) {
  # Matrix with rows "red", "green" and "blue" and one column per colour.
  channels <- col2rgb(col)
  # The matrix is deliberately not named `rgb`, so it is not confused with the
  # rgb() function called here. `maxColorValue` is written out in full instead
  # of relying on partial matching of `max`; 255 matches the 0-255 channel
  # scale that col2rgb() produces.
  rgb(
    channels["red", ], channels["green", ], channels["blue", ],
    maxColorValue = 255
  )
}
# Goal: replace colour names in a string with their hex equivalent
strings <- c("Roses are red, violets are blue", "My favourite colour is green")
# A single alternation of every name returned by colors(), each wrapped in \b
# word boundaries so that only whole words match.
colours <- str_c("\\b", colors(), "\\b", collapse="|")
# This gets us the colours, but we have no way of replacing them
str_extract_all(strings, colours)
# Instead, let's work with locations
locs <- str_locate_all(strings, colours)
# Map() pairs each string with its own matrix of match locations.
Map(function(string, loc) {
hex <- col2hex(str_sub(string, loc))
# The replacement is applied once per row of `loc`, so a string with several
# matches comes back as several strings, each with a single match replaced.
str_sub(string, loc) <- hex
string
}, strings, locs)
## ------------------------------------------------------------------------
# Named character vector: each value is a hex string and each name is the
# pattern it replaces (a colour name wrapped in \b word boundaries).
matches <- col2hex(colors())
names(matches) <- str_c("\\b", colors(), "\\b")
# Given a named vector, str_replace_all() applies each pattern = replacement
# pair in turn.
str_replace_all(strings, matches)
stringr/inst/doc/stringr.html 0000644 0001751 0000144 00000063026 12520151252 016062 0 ustar hornik users
Strings are not glamorous, high-profile components of R, but they do play a big role in many data cleaning and preparation tasks. R provides a solid set of string operations, but because they have grown organically over time, they can be inconsistent and a little hard to learn. Additionally, they lag behind the string operations in other programming languages, so that some things that are easy to do in languages like Ruby or Python are rather hard to do in R. The stringr package aims to remedy these problems by providing a clean, modern interface to common string operations.
These are described in more detail in the following sections.
Basic string operations
There are three string functions that are closely related to their base R equivalents, but with a few enhancements:
str_c()
is equivalent to paste()
, but it uses the empty string (“”) as the default separator and silently removes NULL
inputs.
str_length()
is equivalent to nchar()
, but it preserves NA’s (rather than giving them length 2) and converts factors to characters (not integers).
str_sub()
is equivalent to substr()
but it returns a zero length vector if any of its inputs are zero length, and otherwise expands each argument to match the longest. It also accepts negative positions, which are calculated from the left of the last character. The end position defaults to -1
, which corresponds to the last character.
str_sub<-
is equivalent to substr<-
, but like str_sub
it understands negative indices, and replacement strings do not need to be the same length as the string they are replacing.
Three functions add new functionality:
str_dup()
to duplicate the characters within a string.
str_trim()
to remove leading and trailing whitespace.
str_pad()
to pad a string with extra whitespace on the left, right, or both sides.
Pattern matching
stringr provides pattern matching functions to detect, locate, extract, match, replace, and split strings. I’ll illustrate how they work with some strings and a regular expression designed to match (US) phone numbers:
strings <- c(
"apple",
"219 733 8965",
"329-293-8753",
"Work: 579-499-7527; Home: 543.355.3679"
)
phone <- "([2-9][0-9]{2})[- .]([0-9]{3})[- .]([0-9]{4})"
str_detect()
detects the presence or absence of a pattern and returns a logical vector (similar to grepl()
). str_subset()
returns the elements of a character vector that match a regular expression (similar to grep()
with value = TRUE
).
# Which strings contain phone numbers?
str_detect(strings, phone)
#> [1] FALSE TRUE TRUE TRUE
str_subset(strings, phone)
#> [1] "219 733 8965"
#> [2] "329-293-8753"
#> [3] "Work: 579-499-7527; Home: 543.355.3679"
str_locate()
locates the first position of a pattern and returns a numeric matrix with columns start and end. str_locate_all()
locates all matches, returning a list of numeric matrices. Similar to regexpr()
and gregexpr()
.
# Where in the string is the phone number located?
(loc <- str_locate(strings, phone))
#> start end
#> [1,] NA NA
#> [2,] 1 12
#> [3,] 1 12
#> [4,] 7 18
str_locate_all(strings, phone)
#> [[1]]
#> start end
#>
#> [[2]]
#> start end
#> [1,] 1 12
#>
#> [[3]]
#> start end
#> [1,] 1 12
#>
#> [[4]]
#> start end
#> [1,] 7 18
#> [2,] 27 38
str_extract()
extracts text corresponding to the first match, returning a character vector. str_extract_all()
extracts all matches and returns a list of character vectors.
# What are the phone numbers?
str_extract(strings, phone)
#> [1] NA "219 733 8965" "329-293-8753" "579-499-7527"
str_extract_all(strings, phone)
#> [[1]]
#> character(0)
#>
#> [[2]]
#> [1] "219 733 8965"
#>
#> [[3]]
#> [1] "329-293-8753"
#>
#> [[4]]
#> [1] "579-499-7527" "543.355.3679"
str_extract_all(strings, phone, simplify = TRUE)
#> [,1] [,2]
#> [1,] "" ""
#> [2,] "219 733 8965" ""
#> [3,] "329-293-8753" ""
#> [4,] "579-499-7527" "543.355.3679"
str_match()
extracts capture groups formed by ()
from the first match. It returns a character matrix with one column for the complete match and one column for each group. str_match_all()
extracts capture groups from all matches and returns a list of character matrices. Similar to regmatches()
.
# Pull out the three components of the match
str_match(strings, phone)
#> [,1] [,2] [,3] [,4]
#> [1,] NA NA NA NA
#> [2,] "219 733 8965" "219" "733" "8965"
#> [3,] "329-293-8753" "329" "293" "8753"
#> [4,] "579-499-7527" "579" "499" "7527"
str_match_all(strings, phone)
#> [[1]]
#> [,1] [,2] [,3] [,4]
#>
#> [[2]]
#> [,1] [,2] [,3] [,4]
#> [1,] "219 733 8965" "219" "733" "8965"
#>
#> [[3]]
#> [,1] [,2] [,3] [,4]
#> [1,] "329-293-8753" "329" "293" "8753"
#>
#> [[4]]
#> [,1] [,2] [,3] [,4]
#> [1,] "579-499-7527" "579" "499" "7527"
#> [2,] "543.355.3679" "543" "355" "3679"
str_replace()
replaces the first matched pattern and returns a character vector. str_replace_all()
replaces all matches. Similar to sub()
and gsub()
.
str_replace(strings, phone, "XXX-XXX-XXXX")
#> [1] "apple"
#> [2] "XXX-XXX-XXXX"
#> [3] "XXX-XXX-XXXX"
#> [4] "Work: XXX-XXX-XXXX; Home: 543.355.3679"
str_replace_all(strings, phone, "XXX-XXX-XXXX")
#> [1] "apple"
#> [2] "XXX-XXX-XXXX"
#> [3] "XXX-XXX-XXXX"
#> [4] "Work: XXX-XXX-XXXX; Home: XXX-XXX-XXXX"
str_split_fixed()
splits the string into a fixed number of pieces based on a pattern and returns a character matrix. str_split()
splits a string into a variable number of pieces and returns a list of character vectors.
Arguments
Each pattern matching function has the same first two arguments, a character vector of string
s to process and a single pattern
(regular expression) to match. The replace functions have an additional argument specifying the replacement string, and the split functions have an argument to specify the number of pieces.
Unlike base string functions, stringr offers control over matching not through arguments, but through modifier functions, regex()
, coll()
and fixed()
. This is a deliberate choice made to simplify these functions. For example, while grepl
has six arguments, str_detect()
only has two.
Regular expressions
To be able to use these functions effectively, you’ll need a good knowledge of regular expressions, which this vignette is not going to teach you. Some useful tools to get you started:
When writing regular expressions, I strongly recommend generating a list of positive (pattern should match) and negative (pattern shouldn’t match) test cases to ensure that you are matching the correct components.
Functions that return lists
Many of the functions return a list of vectors or matrices. To work with each element of the list there are two strategies: iterate through a common set of indices, or use Map()
to iterate through the vectors simultaneously. The second strategy is illustrated below:
col2hex <- function(col) {
rgb <- col2rgb(col)
rgb(rgb["red", ], rgb["green", ], rgb["blue", ], max = 255)
}
# Goal replace colour names in a string with their hex equivalent
strings <- c("Roses are red, violets are blue", "My favourite colour is green")
colours <- str_c("\\b", colors(), "\\b", collapse="|")
# This gets us the colours, but we have no way of replacing them
str_extract_all(strings, colours)
#> [[1]]
#> [1] "red" "blue"
#>
#> [[2]]
#> [1] "green"
# Instead, let's work with locations
locs <- str_locate_all(strings, colours)
Map(function(string, loc) {
hex <- col2hex(str_sub(string, loc))
str_sub(string, loc) <- hex
string
}, strings, locs)
#> $`Roses are red, violets are blue`
#> [1] "Roses are #FF0000, violets are blue"
#> [2] "Roses are red, violets are #0000FF"
#>
#> $`My favourite colour is green`
#> [1] "My favourite colour is #00FF00"
Another approach is to use the second form of str_replace_all()
: if you give it a named vector, it applies each pattern = replacement
in turn:
matches <- col2hex(colors())
names(matches) <- str_c("\\b", colors(), "\\b")
str_replace_all(strings, matches)
#> [1] "Roses are #FF0000, violets are #0000FF"
#> [2] "My favourite colour is #00FF00"
Conclusion
stringr provides an opinionated interface to strings in R. It makes string processing simpler by removing uncommon options, and by vigorously enforcing consistency across functions. I have also added new functions that I have found useful from Ruby, and over time, I hope users will suggest useful functions from other programming languages. I will continue to build on the included test suite to ensure that the package behaves as expected and remains bug free.
stringr/inst/doc/stringr.Rmd 0000644 0001751 0000144 00000022041 12520151252 015630 0 ustar hornik users ---
title: "Introduction to stringr"
date: "`r Sys.Date()`"
output: rmarkdown::html_vignette
vignette: >
%\VignetteIndexEntry{Introduction to stringr}
%\VignetteEngine{knitr::rmarkdown}
\usepackage[utf8]{inputenc}
---
```{r, echo=FALSE}
library("stringr")
knitr::opts_chunk$set(comment = "#>", collapse = TRUE)
```
Strings are not glamorous, high-profile components of R, but they do play a big role in many data cleaning and preparation tasks. R provides a solid set of string operations, but because they have grown organically over time, they can be inconsistent and a little hard to learn. Additionally, they lag behind the string operations in other programming languages, so that some things that are easy to do in languages like Ruby or Python are rather hard to do in R. The __stringr__ package aims to remedy these problems by providing a clean, modern interface to common string operations.
More concretely, stringr:
- Simplifies string operations by eliminating options that you don't need
95% of the time (the other 5% of the time you can use functions from base R or
[stringi](https://github.com/Rexamine/stringi/)).
- Uses consistent function names and arguments.
- Produces outputs that can easily be used as inputs. This includes ensuring
that missing inputs result in missing outputs, and zero length inputs result
in zero length outputs. It also processes factors and character vectors in
the same way.
- Completes R's string handling functions with useful functions from other
programming languages.
To meet these goals, stringr provides two basic families of functions:
- basic string operations, and
- pattern matching functions which use regular expressions to detect, locate,
match, replace, extract, and split strings.
As of version 1.0, stringr is a thin wrapper around [stringi](https://github.com/Rexamine/stringi/), which implements all the functions in stringr with efficient C code based on the [ICU library](http://site.icu-project.org). Compared to stringi, stringr is considerably simpler: it provides fewer options and fewer functions. This is great when you're getting started learning string functions, and if you do need more of stringi's power, you should find the interface similar.
These are described in more detail in the following sections.
## Basic string operations
There are three string functions that are closely related to their base R equivalents, but with a few enhancements:
- `str_c()` is equivalent to `paste()`, but it uses the empty string ("") as
the default separator and silently removes `NULL` inputs.
- `str_length()` is equivalent to `nchar()`, but it preserves NA's (rather than
giving them length 2) and converts factors to characters (not integers).
- `str_sub()` is equivalent to `substr()` but it returns a zero length vector
if any of its inputs are zero length, and otherwise expands each argument to
match the longest. It also accepts negative positions, which are calculated
from the left of the last character. The end position defaults to `-1`,
which corresponds to the last character.
- `str_sub<-` is equivalent to `substr<-`, but like `str_sub()` it understands
negative indices, and replacement strings do not need to be the same length
as the string they are replacing.
Three functions add new functionality:
- `str_dup()` to duplicate the characters within a string.
- `str_trim()` to remove leading and trailing whitespace.
- `str_pad()` to pad a string with extra whitespace on the left, right, or both sides.
## Pattern matching
stringr provides pattern matching functions to **detect**, **locate**, **extract**, **match**, **replace**, and **split** strings. I'll illustrate how they work with some strings and a regular expression designed to match (US) phone numbers:
```{r}
# Sample inputs: one string without a phone number, two holding a single
# number each, and one holding two numbers.
strings <- c(
"apple",
"219 733 8965",
"329-293-8753",
"Work: 579-499-7527; Home: 543.355.3679"
)
# Three capture groups of 3, 3 and 4 digits (the first digit must be 2-9),
# each pair of groups separated by one space, hyphen or dot.
phone <- "([2-9][0-9]{2})[- .]([0-9]{3})[- .]([0-9]{4})"
```
- `str_detect()` detects the presence or absence of a pattern and returns a
logical vector (similar to `grepl()`). `str_subset()` returns the elements
of a character vector that match a regular expression (similar to `grep()`
with `value = TRUE`).
```{r}
# Which strings contain phone numbers?
str_detect(strings, phone)
str_subset(strings, phone)
```
- `str_locate()` locates the first position of a pattern and returns a numeric
matrix with columns start and end. `str_locate_all()` locates all matches,
returning a list of numeric matrices. Similar to `regexpr()` and `gregexpr()`.
```{r}
# Where in the string is the phone number located?
(loc <- str_locate(strings, phone))
str_locate_all(strings, phone)
```
- `str_extract()` extracts text corresponding to the first match, returning a
character vector. `str_extract_all()` extracts all matches and returns a
list of character vectors.
```{r}
# What are the phone numbers?
str_extract(strings, phone)
str_extract_all(strings, phone)
str_extract_all(strings, phone, simplify = TRUE)
```
- `str_match()` extracts capture groups formed by `()` from the first match.
It returns a character matrix with one column for the complete match and
one column for each group. `str_match_all()` extracts capture groups from
all matches and returns a list of character matrices. Similar to
`regmatches()`.
```{r}
# Pull out the three components of the match
str_match(strings, phone)
str_match_all(strings, phone)
```
- `str_replace()` replaces the first matched pattern and returns a character
vector. `str_replace_all()` replaces all matches. Similar to `sub()` and
`gsub()`.
```{r}
str_replace(strings, phone, "XXX-XXX-XXXX")
str_replace_all(strings, phone, "XXX-XXX-XXXX")
```
- `str_split_fixed()` splits the string into a fixed number of pieces based
on a pattern and returns a character matrix. `str_split()` splits a string
into a variable number of pieces and returns a list of character vectors.
### Arguments
Each pattern matching function has the same first two arguments, a character vector of `string`s to process and a single `pattern` (regular expression) to match. The replace functions have an additional argument specifying the replacement string, and the split functions have an argument to specify the number of pieces.
Unlike base string functions, stringr offers control over matching not through arguments, but through modifier functions, `regex()`, `coll()` and `fixed()`. This is a deliberate choice made to simplify these functions. For example, while `grepl()` has six arguments, `str_detect()` only has two.
### Regular expressions
To be able to use these functions effectively, you'll need a good knowledge of regular expressions, which this vignette is not going to teach you. Some useful tools to get you started:
- A good [reference sheet](http://www.regular-expressions.info/reference.html).
- A tool that allows you to [interactively test](http://gskinner.com/RegExr/)
what a regular expression will match.
- A tool to [build a regular expression](http://www.txt2re.com) from an
input string.
When writing regular expressions, I strongly recommend generating a list of positive (pattern should match) and negative (pattern shouldn't match) test cases to ensure that you are matching the correct components.
### Functions that return lists
Many of the functions return a list of vectors or matrices. To work with each element of the list there are two strategies: iterate through a common set of indices, or use `Map()` to iterate through the vectors simultaneously. The second strategy is illustrated below:
```{r}
# Convert colour specifications accepted by col2rgb() (e.g. colour names)
# into "#RRGGBB" hex strings, one per input element.
col2hex <- function(col) {
  # Matrix with rows "red", "green" and "blue" and one column per colour.
  channels <- col2rgb(col)
  # The matrix is deliberately not named `rgb`, so it is not confused with the
  # rgb() function called here. `maxColorValue` is written out in full instead
  # of relying on partial matching of `max`; 255 matches the 0-255 channel
  # scale that col2rgb() produces.
  rgb(
    channels["red", ], channels["green", ], channels["blue", ],
    maxColorValue = 255
  )
}
# Goal: replace colour names in a string with their hex equivalent
strings <- c("Roses are red, violets are blue", "My favourite colour is green")
# A single alternation of every name returned by colors(), each wrapped in \b
# word boundaries so that only whole words match.
colours <- str_c("\\b", colors(), "\\b", collapse="|")
# This gets us the colours, but we have no way of replacing them
str_extract_all(strings, colours)
# Instead, let's work with locations
locs <- str_locate_all(strings, colours)
# Map() pairs each string with its own matrix of match locations.
Map(function(string, loc) {
hex <- col2hex(str_sub(string, loc))
# The replacement is applied once per row of `loc`, so a string with several
# matches comes back as several strings, each with a single match replaced.
str_sub(string, loc) <- hex
string
}, strings, locs)
```
Another approach is to use the second form of `str_replace_all()`: if you give it a named vector, it applies each `pattern = replacement` in turn:
```{r}
# Named character vector: each value is a hex string and each name is the
# pattern it replaces (a colour name wrapped in \b word boundaries).
matches <- col2hex(colors())
names(matches) <- str_c("\\b", colors(), "\\b")
# Given a named vector, str_replace_all() applies each pattern = replacement
# pair in turn.
str_replace_all(strings, matches)
```
## Conclusion
stringr provides an opinionated interface to strings in R. It makes string processing simpler by removing uncommon options, and by vigorously enforcing consistency across functions. I have also added new functions that I have found useful from Ruby, and over time, I hope users will suggest useful functions from other programming languages. I will continue to build on the included test suite to ensure that the package behaves as expected and remains bug free.
stringr/tests/ 0000755 0001751 0000144 00000000000 12435640121 013120 5 ustar hornik users stringr/tests/testthat.R 0000644 0001751 0000144 00000000072 12435640121 015102 0 ustar hornik users library(testthat)
library(stringr)
test_check("stringr")
stringr/tests/testthat/ 0000755 0001751 0000144 00000000000 12520375150 014762 5 ustar hornik users stringr/tests/testthat/test-trim.r 0000644 0001751 0000144 00000001147 12435640121 017076 0 ustar hornik users context("Trimming strings")
test_that("trimming removes spaces", {
  # Trailing, leading and two-sided spaces must all be stripped.
  expect_equal(str_trim("abc "), "abc")
  expect_equal(str_trim(" abc"), "abc")
  expect_equal(str_trim(" abc "), "abc")
})
test_that("trimming removes tabs", {
  # Tabs count as whitespace on either side of the text.
  expect_equal(str_trim("abc\t"), "abc")
  expect_equal(str_trim("\tabc"), "abc")
  expect_equal(str_trim("\tabc\t"), "abc")
})
test_that("side argument restricts trimming", {
  # Only the requested side is stripped; the other side keeps its space.
  left_only <- str_trim(" abc ", "left")
  expect_equal(left_only, "abc ")

  right_only <- str_trim(" abc ", "right")
  expect_equal(right_only, " abc")
})
stringr/tests/testthat/test-sub.r 0000644 0001751 0000144 00000003400 12435640121 016706 0 ustar hornik users context("Extracting substrings")
# Shared fixture: the lower-case letters a-z joined into one 26-character string.
alphabet <- str_c(letters, collapse = "")
test_that("correct substring extracted", {
  # First three and last three letters of the alphabet string.
  expect_equal(str_sub(alphabet, 1, 3), "abc")
  expect_equal(str_sub(alphabet, 24, 26), "xyz")
})
test_that("arguments expanded to longest", {
  all_letters <- str_c(letters, collapse = "")

  # One string with vector positions: one substring per start/end pair.
  first_and_last <- str_sub(all_letters, c(1, 24), c(3, 26))
  expect_equal(first_and_last, c("abc", "xyz"))

  # Several strings with scalar positions: the positions apply to each string.
  middle_chars <- str_sub(c("abc", "xyz"), 2, 2)
  expect_equal(middle_chars, c("b", "y"))
})
test_that("specifying only end subsets from start", {
  # With no start supplied, extraction begins at the first character.
  first_three <- str_sub(alphabet, end = 3)
  expect_equal(first_three, "abc")
})
test_that("specifying only start subsets to end", {
  # With no end supplied, extraction runs through the last character.
  last_three <- str_sub(alphabet, 24)
  expect_equal(last_three, "xyz")
})
test_that("specifying -1 as end selects entire string", {
  # An end of -1 means the last character and can be mixed with positive ends.
  mixed_ends <- str_sub("ABCDEF", c(4, 5), c(5, -1))
  expect_equal(mixed_ends, c("DE", "EF"))

  to_last_char <- str_sub("ABCDEF", c(4, 5), c(-1, -1))
  expect_equal(to_last_char, c("DEF", "EF"))
})
test_that("negative values select from end", {
  # -4 is the fourth character from the end ("C"); -3 is the third ("D").
  expect_equal(str_sub("ABCDEF", 1, -4), "ABC")
  expect_equal(str_sub("ABCDEF", -3), "DEF")
})
test_that("missing arguments give missing results", {
  missing_chr <- NA_character_

  # A missing string stays missing; the literal text "NA" is ordinary text.
  expect_equal(str_sub(NA), missing_chr)
  expect_equal(str_sub(NA, 1, 3), missing_chr)
  expect_equal(str_sub(c(NA, "NA"), 1, 3), c(NA, "NA"))

  # Missing positions give missing results as well.
  expect_equal(str_sub("test", NA, NA), missing_chr)
  expect_equal(str_sub(c(NA, "test"), NA, NA), c(missing_chr, missing_chr))
})
test_that("replacement works", {
  # Each replacement acts on the result of the previous one, so order matters.
  target <- "BBCDEF"

  str_sub(target, 1, 1) <- "A"
  expect_equal(target, "ABCDEF")

  str_sub(target, -1, -1) <- "K"
  expect_equal(target, "ABCDEK")

  # The replacement may be longer than the span it replaces...
  str_sub(target, -2, -1) <- "EFGH"
  expect_equal(target, "ABCDEFGH")

  # ...or empty, which deletes the span.
  str_sub(target, 2, -2) <- ""
  expect_equal(target, "AH")
})
stringr/tests/testthat/test-length.r 0000644 0001751 0000144 00000001163 12435640121 017402 0 ustar hornik users context("String length")
test_that("str_length is number of characters", {
  # One, two and three character strings.
  expect_equal(str_length("a"), 1)
  expect_equal(str_length("ab"), 2)
  expect_equal(str_length("abc"), 3)
})
test_that("str_length of missing string is missing", {
  expect_equal(str_length(NA), NA_integer_)
  expect_equal(str_length(c(NA, 1)), c(NA, 1))
  # The two-character text "NA" is not a missing value.
  expect_equal(str_length("NA"), 2)
})
test_that("str_length of factor is length of level", {
  # Factors are measured by their level text, not their integer codes.
  expect_equal(str_length(factor("a")), 1)
  expect_equal(str_length(factor("ab")), 2)
  expect_equal(str_length(factor("abc")), 3)
})
stringr/tests/testthat/test-extract.r 0000644 0001751 0000144 00000000711 12435640121 017571 0 ustar hornik users context("Extract patterns")
test_that("single pattern extracted correctly", {
  input <- c("one two three", "a b c")

  every_word <- str_extract_all(input, "[a-z]+")
  expect_equal(every_word, list(c("one", "two", "three"), c("a", "b", "c")))

  # Only runs of three or more letters; the second string has none.
  long_words <- str_extract_all(input, "[a-z]{3,}")
  expect_equal(long_words, list(c("one", "two", "three"), character()))
})
test_that("no match yields empty vector", {
  # The list element for a string with no match is a zero-length vector.
  extracted <- str_extract_all("a", "b")
  expect_equal(extracted[[1]], character())
})
stringr/tests/testthat/test-detect.r 0000644 0001751 0000144 00000001401 12435640121 017364 0 ustar hornik users context("Detecting patterns")
test_that("special cases are correct", {
  # Missing input gives a missing result; empty input gives an empty result.
  expect_equal(str_detect(NA, "x"), NA)
  expect_equal(str_detect(character(), "x"), logical())
})
test_that("vectorised patterns work", {
  # Expected values use TRUE/FALSE: T and F are ordinary variables that can be
  # reassigned, which would silently change what this test expects.

  # One string checked against several patterns.
  expect_equal(str_detect("ab", c("a", "b", "c")), c(TRUE, TRUE, FALSE))
  # Strings and patterns paired element by element.
  expect_equal(str_detect(c("ca", "ab"), c("a", "c")), c(TRUE, FALSE))
})
test_that("modifiers work", {
  # Matching is case sensitive unless regex() is given TRUE as second argument.
  expect_equal(str_detect("ab", "AB"), FALSE)
  expect_equal(str_detect("ab", regex("AB", TRUE)), TRUE)

  # As a regular expression "[c]" is a character class; under fixed() the
  # brackets are matched literally.
  expect_equal(str_detect("abc", "ab[c]"), TRUE)
  expect_equal(str_detect("abc", fixed("ab[c]")), FALSE)
  expect_equal(str_detect("ab[c]", fixed("ab[c]")), TRUE)

  # With the inline (?x) flag the spaces in the pattern are not matched.
  expect_equal(str_detect("abc", "(?x)a b c"), TRUE)
})
stringr/tests/testthat/test-split.r 0000644 0001751 0000144 00000004322 12440403712 017253 0 ustar hornik users context("Splitting strings")
test_that("special cases are correct", {
  # A missing string splits to a single missing piece.
  from_missing <- str_split(NA, "")
  expect_equal(from_missing[[1]], NA_character_)

  # No input strings means no list elements.
  expect_equal(str_split(character(), ""), list())
})
test_that("str_split functions as expected", {
  pieces <- str_split(c("bab", "cac", "dadad"), "a")

  # One list element per input string.
  expect_that(pieces, is_a("list"))
  expect_equal(length(pieces), 3)

  piece_counts <- vapply(pieces, length, integer(1))
  expect_equal(piece_counts, c(2, 2, 3))

  expect_equal(
    pieces,
    list(c("b", "b"), c("c", "c"), c("d", "d", "d"))
  )
})
test_that("vectors give correct results dealt with correctly", {
  pieces <- str_split_fixed(c("bab", "cac", "dadad", "eae"), "a", 3)

  # One row per input string, one column per requested piece.
  expect_that(pieces, is_a("matrix"))
  expect_equal(nrow(pieces), 4)
  expect_equal(ncol(pieces), 3)

  # Rows with fewer than three pieces are filled out with empty strings.
  expect_equal(pieces[1, ], c("b", "b", ""))
  expect_equal(pieces[3, ], c("d", "d", "d"))
  expect_equal(pieces[, 1], c("b", "c", "d", "e"))
})
test_that("n sets maximum number of splits in str_split", {
  header <- "Subject: Roger: his drinking problems"

  # Without n, or with n at or above three, all three pieces are returned.
  expect_equal(length(str_split(header, ": ")[[1]]), 3)
  expect_equal(length(str_split(header, ": ", 4)[[1]]), 3)
  expect_equal(length(str_split(header, ": ", 3)[[1]]), 3)
  # A smaller n caps the number of pieces.
  expect_equal(length(str_split(header, ": ", 2)[[1]]), 2)
  expect_equal(length(str_split(header, ": ", 1)[[1]]), 1)

  expect_equal(
    str_split(header, ": ", 3)[[1]],
    c("Subject", "Roger", "his drinking problems")
  )
  # The last piece keeps the unsplit remainder of the string.
  expect_equal(
    str_split(header, ": ", 2)[[1]],
    c("Subject", "Roger: his drinking problems")
  )
})
test_that("n sets exact number of splits in str_split_fixed", {
  header <- "Subject: Roger: his drinking problems"

  # The column count always equals n, even when n exceeds the piece count.
  expect_equal(ncol(str_split_fixed(header, ": ", 4)), 4)
  expect_equal(ncol(str_split_fixed(header, ": ", 3)), 3)
  expect_equal(ncol(str_split_fixed(header, ": ", 2)), 2)
  expect_equal(ncol(str_split_fixed(header, ": ", 1)), 1)

  expect_equal(
    str_split_fixed(header, ": ", 3)[1, ],
    c("Subject", "Roger", "his drinking problems")
  )
  # The last column keeps the unsplit remainder of the string.
  expect_equal(
    str_split_fixed(header, ": ", 2)[1, ],
    c("Subject", "Roger: his drinking problems")
  )
})
stringr/tests/testthat/test-match.r 0000644 0001751 0000144 00000003535 12513530736 017231 0 ustar hornik users context("Matching groups")
# Fixture shared by the tests below: ten rows of ten random digits (1-9),
# formatted as "(xxx) xxx xxxx". The fixed seed keeps the values reproducible.
set.seed(1410)
# `replace = TRUE` is written out in full: `rep = T` relied on partial argument
# matching and on `T`, which is an ordinary, reassignable variable.
num <- matrix(sample(9, 10 * 10, replace = TRUE), ncol = 10)
# The same digits as one undelimited ten-character string per row.
num_flat <- apply(num, 1, str_c, collapse = "")
phones <- str_c(
  "(", num[, 1], num[, 2], num[, 3], ") ",
  num[, 4], num[, 5], num[, 6], " ",
  num[, 7], num[, 8], num[, 9], num[, 10]
)
test_that("special case are correct", {
  # A missing string is expected to give a 1 x 1 missing matrix.
  from_missing <- str_match(NA, "(a)")
  expect_equal(from_missing, matrix(NA_character_))

  # Empty input is expected to give a 0 x 1 character matrix.
  from_empty <- str_match(character(), "(a)")
  expect_equal(from_empty, matrix(character(), 0, 1))
})
test_that("no matching cases returns 1 column matrix", {
  # The pattern has no capture groups, so only the full-match column remains.
  whole_match <- str_match(c("a", "b"), ".")

  expect_equal(nrow(whole_match), 2)
  expect_equal(ncol(whole_match), 1)
  expect_equal(whole_match[, 1], c("a", "b"))
})
test_that("single match works when all match", {
  phone_pattern <- "\\(([0-9]{3})\\) ([0-9]{3}) ([0-9]{4})"
  matched <- str_match(phones, phone_pattern)

  # One row per phone; full match plus three captured groups.
  expect_equal(nrow(matched), length(phones))
  expect_equal(ncol(matched), 4)
  expect_equal(matched[, 1], phones)

  # Joining the three captured groups must reproduce the raw digits.
  digits <- apply(matched[, -1], 1, str_c, collapse = "")
  expect_equal(digits, num_flat)
})
test_that("match returns NA when some inputs don't match", {
  phone_pattern <- "\\(([0-9]{3})\\) ([0-9]{3}) ([0-9]{4})"
  # Two extra inputs are appended: a non-matching string and a missing value.
  matched <- str_match(c(phones, "blah", NA), phone_pattern)

  expect_equal(nrow(matched), length(phones) + 2)
  expect_equal(ncol(matched), 4)

  # Rows 11 and 12 are the two extra inputs; both must be entirely missing.
  all_missing <- rep(NA_character_, 4)
  expect_equal(matched[11, ], all_missing)
  expect_equal(matched[12, ], all_missing)
})
test_that("match returns NA when optional group doesn't match", {
  # Column 3 holds the optional "(b)?" group, which "a" does not contain.
  optional_group <- str_match(c("ab", "a"), "(a)(b)?")[, 3]
  expect_equal(optional_group, c("b", NA))
})
test_that("multiple match works", {
  phone_pattern <- "\\(([0-9]{3})\\) ([0-9]{3}) ([0-9]{4})"

  # Matching all phones inside one joined string must give the same matrix as
  # matching each phone separately.
  joined <- str_c(phones, collapse = " ")
  from_joined <- str_match_all(joined, phone_pattern)
  from_each <- str_match(phones, phone_pattern)

  expect_equal(from_joined[[1]], from_each)
})
stringr/tests/testthat/test-dup.r 0000644 0001751 0000144 00000000710 12435640121 016706 0 ustar hornik users context("Duplicating strings")
test_that("basic duplication works", {
  # Scalar string and scalar count.
  expect_equal(str_dup("a", 3), "aaa")
  expect_equal(str_dup("abc", 2), "abcabc")
  # Vector of strings with a shared count, then with one count per string.
  expect_equal(str_dup(c("a", "b"), 2), c("aa", "bb"))
  expect_equal(str_dup(c("a", "b"), c(2, 3)), c("aa", "bbb"))
})
test_that("0 duplicates equals empty string", {
  # Zero copies give "" for every input string, not a zero-length vector.
  expect_equal(str_dup("a", 0), "")
  expect_equal(str_dup(c("a", "b"), 0), c("", ""))
})
stringr/tests/testthat/test-pad.r 0000644 0001751 0000144 00000001125 12435640121 016663 0 ustar hornik users context("Test padding")
test_that("long strings are unchanged", {
  # Ten random lower-case strings of 40-100 characters, all longer than the
  # target width of 30 used below.
  n_chars <- sample(40:100, 10)
  strings <- vapply(
    n_chars,
    function(n) str_c(letters[sample(26, n, replace = TRUE)], collapse = ""),
    character(1)
  )
  padded <- str_pad(strings, width = 30)

  # Compare against the unpadded input. The previous assertion compared
  # `str_length(padded)` with itself and therefore could never fail.
  expect_equal(str_length(padded), str_length(strings))
  expect_equal(padded, strings)
})
test_that("directions work for simple case", {
  pad <- function(direction) str_pad("had", direction, width = 10)
  # Expected padding is built explicitly so the number of spaces is visible
  # and cannot be lost to whitespace mangling: "had" needs 7 pad characters
  # to reach a width of 10.
  spaces <- function(n) paste(rep(" ", n), collapse = "")

  expect_equal(pad("right"), paste0("had", spaces(7)))
  expect_equal(pad("left"), paste0(spaces(7), "had"))
  # NOTE(review): 7 cannot be split evenly; this assumes the extra pad
  # character goes on the right (3 left, 4 right) -- confirm against str_pad().
  expect_equal(pad("both"), paste0(spaces(3), "had", spaces(4)))
})
stringr/tests/testthat/test-locate.r 0000644 0001751 0000144 00000002225 12435640121 017370 0 ustar hornik users context("Locations")
test_that("basic location matching works", {
  # Attributes are ignored when comparing a row of the location matrix with a
  # bare vector. FALSE is written out because `F` is an ordinary, reassignable
  # variable.
  expect_equal(str_locate("abc", "a")[1, ], c(1, 1), check.attributes = FALSE)
  expect_equal(str_locate("abc", "b")[1, ], c(2, 2), check.attributes = FALSE)
  expect_equal(str_locate("abc", "c")[1, ], c(3, 3), check.attributes = FALSE)
  # A multi-character match reports its first and last position.
  expect_equal(str_locate("abc", ".+")[1, ], c(1, 3), check.attributes = FALSE)
})
test_that("locations are integers", {
  inputs <- c("a b c", "d e f")
  expect_true(is.integer(str_locate(inputs, "[a-z]")))

  first_locs <- str_locate_all(inputs, "[a-z]")[[1]]
  expect_true(is.integer(first_locs))
  # Inverting the matches must keep integer storage as well.
  expect_true(is.integer(invert_match(first_locs)))
})
test_that("both string and patterns are vectorised", {
  inputs <- c("abc", "def")

  # One pattern applied to every string; "def" has no "a".
  single_pattern <- str_locate(inputs, "a")
  expect_equal(single_pattern[, "start"], c(1, NA))

  # One pattern per string.
  paired <- str_locate(inputs, c("a", "d"))
  expect_equal(paired[, "start"], c(1, 1))
  expect_equal(paired[, "end"], c(1, 1))

  # One string against several patterns: one list element per pattern.
  per_pattern <- str_locate_all(c("abab"), c("a", "b"))
  expect_equal(per_pattern[[1]][, "start"], c(1, 3))
  expect_equal(per_pattern[[2]][, "start"], c(2, 4))
})
stringr/tests/testthat/test-count.r 0000644 0001751 0000144 00000000543 12435640121 017252 0 ustar hornik users context("Counting matches")
test_that("counts are as expected", {
  fruits <- c("apple", "banana", "pear", "pineapple")

  # One pattern counted in every string.
  count_a <- str_count(fruits, "a")
  expect_equal(count_a, c(1, 3, 1, 1))
  count_p <- str_count(fruits, "p")
  expect_equal(count_p, c(2, 0, 1, 3))
  count_e <- str_count(fruits, "e")
  expect_equal(count_e, c(1, 0, 1, 2))

  # One pattern per string, paired element by element.
  count_paired <- str_count(fruits, c("a", "b", "p", "n"))
  expect_equal(count_paired, c(1, 1, 1, 1))
})
stringr/tests/testthat/test-join.r 0000644 0001751 0000144 00000000640 12435640121 017057 0 ustar hornik users context("Joining strings")
test_that("basic case works", {
  test <- c("a", "b", "c")

  # With a single vector there is nothing to join, whatever the separator.
  expect_equal(str_c(test), test)
  expect_equal(str_c(test, sep = " "), test)

  # Collapsing merges all elements into one string.
  expect_equal(str_c(test, collapse = ""), "abc")
})
test_that("NULLs are dropped", {
  test <- letters[1:3]

  # A NULL argument contributes nothing to the result.
  only_null <- str_c(test, NULL)
  expect_equal(only_null, test)

  # A NULL between real arguments does not add an extra separator.
  null_in_middle <- str_c(test, NULL, "a", sep = " ")
  expect_equal(null_in_middle, c("a a", "b a", "c a"))
})
stringr/NAMESPACE 0000644 0001751 0000144 00000001432 12442343223 013176 0 ustar hornik users # Generated by roxygen2 (4.1.0): do not edit by hand
export("%>%")
export("str_sub<-")
export(boundary)
export(coll)
export(fixed)
export(ignore.case)
export(invert_match)
export(perl)
export(regex)
export(str_c)
export(str_conv)
export(str_count)
export(str_detect)
export(str_dup)
export(str_extract)
export(str_extract_all)
export(str_join)
export(str_length)
export(str_locate)
export(str_locate_all)
export(str_match)
export(str_match_all)
export(str_order)
export(str_pad)
export(str_replace)
export(str_replace_all)
export(str_replace_na)
export(str_sort)
export(str_split)
export(str_split_fixed)
export(str_sub)
export(str_subset)
export(str_to_lower)
export(str_to_title)
export(str_to_upper)
export(str_trim)
export(str_wrap)
export(word)
import(stringi)
importFrom(magrittr,"%>%")
stringr/R/ 0000755 0001751 0000144 00000000000 12513233473 012164 5 ustar hornik users stringr/R/utils.R 0000644 0001751 0000144 00000000213 12435640121 013436 0 ustar hornik users #' Pipe operator
#'
#' @name %>%
#' @rdname pipe
#' @keywords internal
#' @export
#' @importFrom magrittr %>%
#' @usage lhs \%>\% rhs
NULL
stringr/R/wrap.r 0000644 0001751 0000144 00000002426 12452577753 013342 0 ustar hornik users #' Wrap strings into nicely formatted paragraphs.
#'
#' This is a wrapper around \code{\link[stringi]{stri_wrap}} which implements
#' the Knuth-Plass paragraph wrapping algorithm.
#'
#' @param string character vector of strings to reformat.
#' @param width positive integer giving target line width in characters. A
#' width less than or equal to 1 will put each word on its own line.
#' @param indent non-negative integer giving indentation of first line in
#' each paragraph
#' @param exdent non-negative integer giving indentation of following lines in
#' each paragraph
#' @return A character vector of re-wrapped strings.
#' @export
#' @examples
#' thanks_path <- file.path(R.home("doc"), "THANKS")
#' thanks <- str_c(readLines(thanks_path), collapse = "\n")
#' thanks <- word(thanks, 1, 3, fixed("\n\n"))
#' cat(str_wrap(thanks), "\n")
#' cat(str_wrap(thanks, width = 40), "\n")
#' cat(str_wrap(thanks, width = 60, indent = 2), "\n")
#' cat(str_wrap(thanks, width = 60, exdent = 2), "\n")
#' cat(str_wrap(thanks, width = 0, exdent = 2), "\n")
str_wrap <- function(string, width = 80, indent = 0, exdent = 0) {
  # Non-positive widths are clamped to 1.
  if (width <= 0) {
    width <- 1
  }

  # simplify = FALSE keeps one element of wrapped lines per input string.
  wrapped <- stri_wrap(
    string,
    width = width,
    indent = indent,
    exdent = exdent,
    simplify = FALSE
  )

  # Re-join the lines of each input string with newlines.
  vapply(
    wrapped,
    function(lines) str_c(lines, collapse = "\n"),
    character(1)
  )
}
stringr/R/subset.R 0000644 0001751 0000144 00000002067 12440371067 013622 0 ustar hornik users #' Keep strings matching a pattern.
#'
#' This is a convenient wrapper around \code{x[str_detect(x, pattern)]}.
#' Vectorised over \code{string} and \code{pattern}
#'
#' @inheritParams str_detect
#' @return A character vector.
#' @seealso \code{\link{grep}} with argument \code{value = TRUE},
#' \code{\link[stringi]{stri_subset}} for the underlying implementation.
#' @export
#' @examples
#' fruit <- c("apple", "banana", "pear", "pineapple")
#' str_subset(fruit, "a")
#' str_subset(fruit, "^a")
#' str_subset(fruit, "a$")
#' str_subset(fruit, "b")
#' str_subset(fruit, "[aeiou]")
#'
#' # Missings are silently dropped
#' str_subset(c("a", NA, "b"), ".")
str_subset <- function(string, pattern) {
  # Options attached by fixed()/coll()/regex(); NULL for a plain string.
  opts <- attr(pattern, "options")

  switch(type(pattern),
    # Empty patterns and boundary() patterns are not supported here.
    empty = ,
    bound = stop("Not implemented", call. = FALSE),
    # Forward opts_fixed so that fixed(x, ignore_case = TRUE) is honoured,
    # matching what str_detect() does for fixed patterns.
    fixed = stri_subset_fixed(string, pattern, omit_na = TRUE,
      opts_fixed = opts),
    coll = stri_subset_coll(string, pattern, omit_na = TRUE,
      opts_collator = opts),
    regex = stri_subset_regex(string, pattern, omit_na = TRUE,
      opts_regex = opts)
  )
}
stringr/R/case.R 0000644 0001751 0000144 00000001350 12436074132 013220 0 ustar hornik users #' Convert case of a string.
#'
#' @param string String to modify
#' @param locale Locale to use for translations.
#' @examples
#' dog <- "The quick brown dog"
#' str_to_upper(dog)
#' str_to_lower(dog)
#' str_to_title(dog)
#'
#' # Locale matters!
#' str_to_upper("i", "en") # English
#' str_to_upper("i", "tr") # Turkish
#' @name case
NULL
#' @export
#' @rdname case
str_to_upper <- function(string, locale = "") {
  # Upper-casing is delegated to stringi, using the rules of `locale`.
  upper <- stri_trans_toupper(str = string, locale = locale)
  upper
}
#' @export
#' @rdname case
str_to_lower <- function(string, locale = "") {
  # Lower-casing is delegated to stringi, using the rules of `locale`.
  lower <- stri_trans_tolower(str = string, locale = locale)
  lower
}
#' @export
#' @rdname case
str_to_title <- function(string, locale = "") {
  # For title case the locale is supplied through the break-iterator options.
  brkiter_opts <- stri_opts_brkiter(locale = locale)
  stri_trans_totitle(string, opts_brkiter = brkiter_opts)
}
stringr/R/split.r 0000644 0001751 0000144 00000004741 12442343250 013504 0 ustar hornik users #' Split up a string into pieces.
#'
#' Vectorised over \code{string} and \code{pattern}.
#'
#' @inheritParams str_detect
#' @param n number of pieces to return. Default (Inf) uses all
#' possible split positions.
#'
#' For \code{str_split_fixed}, if n is greater than the number of pieces,
#' the result will be padded with empty strings.
#' @return For \code{str_split_fixed}, a character matrix with \code{n} columns.
#' For \code{str_split}, a list of character vectors.
#' @seealso \code{\link{stri_split}} for the underlying implementation.
#' @export
#' @examples
#' fruits <- c(
#' "apples and oranges and pears and bananas",
#' "pineapples and mangos and guavas"
#' )
#'
#' str_split(fruits, " and ")
#'
#' # Specify n to restrict the number of possible matches
#' str_split(fruits, " and ", n = 3)
#' str_split(fruits, " and ", n = 2)
#' # If n greater than number of pieces, no padding occurs
#' str_split(fruits, " and ", n = 5)
#'
#' # Use fixed to return a character matrix
#' str_split_fixed(fruits, " and ", 3)
#' str_split_fixed(fruits, " and ", 4)
str_split <- function(string, pattern, n = Inf) {
  # The default Inf ("use all split positions") is handed to stringi as -1L.
  if (identical(n, Inf)) {
    n <- -1L
  }

  # Options attached by a modifier function; NULL for a plain string.
  opts <- attr(pattern, "options")

  # simplify = FALSE: the result is a list with one character vector per
  # input string.
  switch(type(pattern),
    regex = stri_split_regex(string, pattern, n = n, simplify = FALSE,
      opts_regex = opts),
    fixed = stri_split_fixed(string, pattern, n = n, simplify = FALSE,
      opts_fixed = opts),
    coll = stri_split_coll(string, pattern, n = n, simplify = FALSE,
      opts_collator = opts),
    bound = stri_split_boundaries(string, n = n, simplify = FALSE,
      opts_brkiter = opts),
    # An empty pattern splits at character boundaries.
    empty = stri_split_boundaries(string, n = n, simplify = FALSE,
      opts_brkiter = stri_opts_brkiter(type = "character"))
  )
}
#' @export
#' @rdname str_split
str_split_fixed <- function(string, pattern, n) {
  # Options attached by a modifier function; NULL for a plain string.
  opts <- attr(pattern, "options")

  # simplify = TRUE: the pieces come back as a character matrix.
  pieces <- switch(type(pattern),
    regex = stri_split_regex(string, pattern, n = n, simplify = TRUE,
      opts_regex = opts),
    fixed = stri_split_fixed(string, pattern, n = n, simplify = TRUE,
      opts_fixed = opts),
    coll = stri_split_coll(string, pattern, n = n, simplify = TRUE,
      opts_collator = opts),
    bound = stri_split_boundaries(string, n = n, simplify = TRUE,
      opts_brkiter = opts),
    # An empty pattern splits at character boundaries.
    empty = stri_split_boundaries(string, n = n, simplify = TRUE,
      opts_brkiter = stri_opts_brkiter(type = "character"))
  )

  # Missing cells are replaced by empty strings.
  pieces[is.na(pieces)] <- ""
  pieces
}
stringr/R/stringr.R 0000644 0001751 0000144 00000000126 12435640121 013771 0 ustar hornik users #' Fast and friendly string manipulation.
#'
#' @name stringr
#' @import stringi
NULL
stringr/R/conv.R 0000644 0001751 0000144 00000001035 12435654364 013264 0 ustar hornik users #' Specify the encoding of a string.
#'
#' This is a convenient way to override the current encoding of a string.
#'
#' @param string String to re-encode.
#' @param encoding Name of encoding. See \code{\link[stringi]{stri_enc_list}}
#' for a complete list.
#' @export
#' @examples
#' # Example from encoding?stringi::stringi
#' x <- rawToChar(as.raw(177))
#' x
#' str_conv(x, "ISO-8859-2") # Polish "a with ogonek"
#' str_conv(x, "ISO-8859-1") # Plus-minus
str_conv <- function(string, encoding) {
  # Treat the input as being in `encoding` and convert it to UTF-8.
  stri_conv(string, from = encoding, to = "UTF-8")
}
stringr/R/length.r 0000644 0001751 0000144 00000002041 12436074637 013636 0 ustar hornik users #' The length of a string.
#'
#' Technically this returns the number of "code points", in a string. One
#' code point usually corresponds to one character, but not always. For example,
#' a u with an umlaut might be represented as a single character or as the
#' combination of a u and an umlaut.
#'
#' @inheritParams str_detect
#' @return A numeric vector giving number of characters (code points) in each
#' element of the character vector. Missing strings have missing length.
#' @seealso \code{\link[stringi]{stri_length}} which this function wraps.
#' @export
#' @examples
#' str_length(letters)
#' str_length(NA)
#' str_length(factor("abc"))
#' str_length(c("i", "like", "programming", NA))
#'
#' # Two ways of representing a u with an umlaut
#' u1 <- "\u00fc"
#' u2 <- stringi::stri_trans_nfd(u1)
#' # They print the same:
#' u1
#' u2
#' # But have a different length
#' str_length(u1)
#' str_length(u2)
#' # Even though they have the same number of characters
#' str_count(u1)
#' str_count(u2)
str_length <- function(string) {
  # Thin wrapper: the counting is done by stringi.
  n_code_points <- stri_length(str = string)
  n_code_points
}
stringr/R/modifiers.r 0000644 0001751 0000144 00000010611 12442343363 014330 0 ustar hornik users #' Control matching behaviour with modifier functions.
#'
#' \describe{
#' \item{fixed}{Compare literal bytes in the string. This is very fast, but
#' not usually what you want for non-ASCII character sets.}
#' \item{coll}{Compare strings respecting standard collation rules.}
#' \item{regex}{The default. Uses ICU regular expressions.}
#' \item{boundary}{Match boundaries between things.}
#' }
#'
#' @param pattern Pattern to modify behaviour.
#' @param ignore_case Should case differences be ignored in the match?
#' @name modifiers
#' @examples
#' pattern <- "a.b"
#' strings <- c("abb", "a.b")
#' str_detect(strings, pattern)
#' str_detect(strings, fixed(pattern))
#' str_detect(strings, coll(pattern))
#'
#' # coll() is useful for locale-aware case-insensitive matching
#' i <- c("I", "\u0130", "i")
#' i
#' str_detect(i, fixed("i", TRUE))
#' str_detect(i, coll("i", TRUE))
#' str_detect(i, coll("i", TRUE, locale = "tr"))
#'
#' # Word boundaries
#' words <- c("These are some words.")
#' str_count(words, boundary("word"))
#' str_split(words, " ")[[1]]
#' str_split(words, boundary("word"))[[1]]
#'
#' # Regular expression variations
#' str_extract_all("The Cat in the Hat", "[a-z]+")
#' str_extract_all("The Cat in the Hat", regex("[a-z]+", TRUE))
#'
#' str_extract_all("a\nb\nc", "^.")
#' str_extract_all("a\nb\nc", regex("^.", multiline = TRUE))
#'
#' str_extract_all("a\nb\nc", "a.")
#' str_extract_all("a\nb\nc", regex("a.", dotall = TRUE))
NULL
#' @export
#' @rdname modifiers
fixed <- function(pattern, ignore_case = FALSE) {
  # The matching options travel with the pattern as an attribute; the class
  # vector is what type() dispatches on.
  opts <- stri_opts_fixed(case_insensitive = ignore_case)
  cls <- c("fixed", "pattern", "character")
  structure(pattern, options = opts, class = cls)
}
#' @export
#' @rdname modifiers
#' @param locale Locale to use for comparisons. See
#'   \code{\link[stringi]{stri_locale_list}()} for all possible options.
#' @param ... Other less frequently used arguments passed on to
#'   \code{\link[stringi]{stri_opts_collator}},
#'   \code{\link[stringi]{stri_opts_regex}}, or
#'   \code{\link[stringi]{stri_opts_brkiter}}
coll <- function(pattern, ignore_case = FALSE, locale = NULL, ...) {
  # Collation strength 2 is used for case-insensitive matching, 3 otherwise.
  if (ignore_case) {
    strength <- 2L
  } else {
    strength <- 3L
  }

  opts <- stri_opts_collator(strength = strength, locale = locale, ...)
  cls <- c("coll", "pattern", "character")
  structure(pattern, options = opts, class = cls)
}
#' @export
#' @rdname modifiers
#' @param multiline If \code{TRUE}, \code{$} and \code{^} match
#'   the beginning and end of each line. If \code{FALSE}, the
#'   default, only match the start and end of the input.
#' @param comments If \code{TRUE}, white space and comments beginning with
#'   \code{#} are ignored. Escape literal spaces with \code{\\ }.
#' @param dotall If \code{TRUE}, \code{.} will also match line terminators.
regex <- function(pattern, ignore_case = FALSE, multiline = FALSE,
                  comments = FALSE, dotall = FALSE, ...) {
  # Translate the friendly argument names into stringi regex options.
  opts <- stri_opts_regex(
    case_insensitive = ignore_case,
    multiline = multiline,
    comments = comments,
    dotall = dotall,
    ...
  )
  cls <- c("regex", "pattern", "character")
  structure(pattern, options = opts, class = cls)
}
#' @param type Boundary type to detect.
#' @param skip_word_none Ignore "words" that don't contain any characters
#'   or numbers - i.e. punctuation.
#' @export
#' @rdname modifiers
boundary <- function(type = c("character", "line_break", "sentence", "word"),
                     skip_word_none = TRUE, ...) {
  # Validate the requested type against the allowed choices.
  type <- match.arg(type)
  opts <- stri_opts_brkiter(type = type, skip_word_none = skip_word_none, ...)

  # A boundary has no pattern text, so the object is an empty character
  # vector that only carries the options and the class.
  cls <- c("boundary", "pattern", "character")
  structure(character(), options = opts, class = cls)
}
# Internal S3 generic that classifies a pattern as one of "bound", "regex",
# "coll", "fixed" or "empty", so the matching functions can switch() on it.
type <- function(x) UseMethod("type")
type.boundary <- function(x) "bound"
# regex() tags its result with class "regex", so this is the method that
# dispatches for it.
type.regex <- function(x) "regex"
# Kept so that any object still carrying the older "regexp" class resolves.
type.regexp <- function(x) "regex"
type.coll <- function(x) "coll"
type.fixed <- function(x) "fixed"
# Plain strings are regular expressions, except for the bare empty string.
type.character <- function(x) if (identical(x, "")) "empty" else "regex"
#' Deprecated modifier functions.
#'
#' Please use \code{\link{regex}} and \code{\link{coll}} instead.
#'
#' @name modifier-deprecated
#' @keywords internal
NULL
#' @export
#' @rdname modifier-deprecated
ignore.case <- function(string) {
  # Deprecated: point users at the modifier functions that actually exist
  # (the regular-expression modifier is regex(), not regexp()).
  message("Please use (fixed|coll|regex)(x, ignore_case = TRUE) instead of ignore.case(x)")
  fixed(string, ignore_case = TRUE)
}
#' @export
#' @rdname modifier-deprecated
perl <- function(pattern) {
  # Deprecated: the replacement is regex(), which this function delegates to.
  message("perl is deprecated. Please use regex instead")
  regex(pattern)
}
stringr/R/sort.R 0000644 0001751 0000144 00000002331 12435651034 013275 0 ustar hornik users #' Order or sort a character vector.
#'
#' @param x A character vector to sort.
#' @param decreasing A boolean. If \code{FALSE}, the default, sorts from
#' lowest to highest; if \code{TRUE} sorts from highest to lowest.
#' @param na_last Where should \code{NA} go? \code{TRUE} at the end,
#' \code{FALSE} at the beginning, \code{NA} dropped.
#' @param locale In which locale should the sorting occur? Defaults to
#' the current locale.
#' @param ... Other options used to control sorting order. Passed on to
#' \code{\link[stringi]{stri_opts_collator}}.
#' @seealso \code{\link[stringi]{stri_order}} for the underlying implementation.
#' @export
#' @examples
#' str_order(letters, locale = "en")
#' str_sort(letters, locale = "en")
#'
#' str_order(letters, locale = "haw")
#' str_sort(letters, locale = "haw")
str_order <- function(x, decreasing = FALSE, na_last = TRUE, locale = "", ...) {
  # The locale and any extra options in `...` configure the collator.
  collator <- stri_opts_collator(locale = locale, ...)
  stri_order(
    x,
    decreasing = decreasing,
    na_last = na_last,
    opts_collator = collator
  )
}
#' @export
#' @rdname str_order
str_sort <- function(x, decreasing = FALSE, na_last = TRUE, locale = "", ...) {
  # The locale and any extra options in `...` configure the collator.
  collator <- stri_opts_collator(locale = locale, ...)
  stri_sort(
    x,
    decreasing = decreasing,
    na_last = na_last,
    opts_collator = collator
  )
}
stringr/R/word.r 0000644 0001751 0000144 00000003424 12513233473 013325 0 ustar hornik users #' Extract words from a sentence.
#'
#' @param string input character vector.
#' @param start integer vector giving position of first word to extract.
#' Defaults to first word. If negative, counts backwards from last
#' word.
#' @param end integer vector giving position of last word to extract.
#' Defaults to \code{start}. If negative, counts backwards from last
#' word.
#' @param sep separator between words. Defaults to single space.
#' @return character vector of words from \code{start} to \code{end}
#' (inclusive). Will be length of longest input argument.
#' @export
#' @examples
#' sentences <- c("Jane saw a cat", "Jane sat down")
#' word(sentences, 1)
#' word(sentences, 2)
#' word(sentences, -1)
#' word(sentences, 2, -1)
#'
#' # Also vectorised over start and end
#' word(sentences[1], 1:3, -1)
#' word(sentences[1], 1, 1:4)
#'
#' # Can define words by other separators
#' str <- 'abc.def..123.4568.999'
#' word(str, 1, sep = fixed('..'))
#' word(str, 2, sep = fixed('..'))
word <- function(string, start = 1L, end = start, sep = fixed(" ")) {
  # Recycle all three inputs to the length of the longest one. `end` is
  # evaluated here, before `start` is modified, so its default refers to the
  # caller-supplied `start`.
  n <- max(length(string), length(start), length(end))
  string <- rep(string, length.out = n)
  start <- rep(start, length.out = n)
  end <- rep(end, length.out = n)

  # Words are the stretches between separator matches.
  sep_loc <- str_locate_all(string, sep)
  word_loc <- lapply(sep_loc, invert_match)
  n_words <- vapply(word_loc, nrow, integer(1))

  # Negative positions count back from the last word; NAs are left alone.
  from_end <- function(pos) {
    is_neg <- !is.na(pos) & pos < 0L
    pos[is_neg] <- pos[is_neg] + n_words[is_neg] + 1L
    pos
  }
  start <- from_end(start)
  end <- from_end(end)

  # Character position where the first requested word starts and where the
  # last requested word ends, for each string.
  first_char <- mapply(function(loc, i) loc[i, "start"], word_loc, start)
  last_char <- mapply(function(loc, i) loc[i, "end"], word_loc, end)

  str_sub(string, first_char, last_char)
}
stringr/R/locate.r 0000644 0001751 0000144 00000005627 12442343250 013624 0 ustar hornik users #' Locate the position of patterns in a string.
#'
#' Vectorised over \code{string} and \code{pattern}. If the match is of length
#' 0, (e.g. from a special match like \code{$}) end will be one character less
#' than start.
#'
#' @inheritParams str_detect
#' @return For \code{str_locate}, an integer matrix. First column gives start
#' position of match, and second column gives end position. For
#' \code{str_locate_all} a list of integer matrices.
#' @seealso
#' \code{\link{str_extract}} for a convenient way of extracting matches,
#' \code{\link[stringi]{stri_locate}} for the underlying implementation.
#' @export
#' @examples
#' fruit <- c("apple", "banana", "pear", "pineapple")
#' str_locate(fruit, "$")
#' str_locate(fruit, "a")
#' str_locate(fruit, "e")
#' str_locate(fruit, c("a", "b", "p", "p"))
#'
#' str_locate_all(fruit, "a")
#' str_locate_all(fruit, "e")
#' str_locate_all(fruit, c("a", "b", "p", "p"))
#'
#' # Find location of every character
#' str_locate_all(fruit, "")
str_locate <- function(string, pattern) {
  # Options attached by a modifier function; NULL for a plain string.
  opts <- attr(pattern, "options")

  # Locate only the first match in each string.
  switch(type(pattern),
    regex = stri_locate_first_regex(string, pattern, opts_regex = opts),
    fixed = stri_locate_first_fixed(string, pattern, opts_fixed = opts),
    coll = stri_locate_first_coll(string, pattern, opts_collator = opts),
    bound = stri_locate_first_boundaries(string, opts_brkiter = opts),
    # An empty pattern is treated as a character boundary.
    empty = stri_locate_first_boundaries(string,
      opts_brkiter = stri_opts_brkiter(type = "character"))
  )
}
#' @rdname str_locate
#' @export
str_locate_all <- function(string, pattern) {
  # Options attached by a modifier function; NULL for a plain string.
  opts <- attr(pattern, "options")

  # Locate every match in each string.
  switch(type(pattern),
    regex = stri_locate_all_regex(string, pattern, omit_no_match = TRUE,
      opts_regex = opts),
    fixed = stri_locate_all_fixed(string, pattern, omit_no_match = TRUE,
      opts_fixed = opts),
    coll = stri_locate_all_coll(string, pattern, omit_no_match = TRUE,
      opts_collator = opts),
    bound = stri_locate_all_boundaries(string, omit_no_match = TRUE,
      opts_brkiter = opts),
    # An empty pattern is treated as a character boundary.
    empty = stri_locate_all_boundaries(string, omit_no_match = TRUE,
      opts_brkiter = stri_opts_brkiter(type = "character"))
  )
}
#' Switch location of matches to location of non-matches.
#'
#' Invert a matrix of match locations to match the opposite of what was
#' previously matched.
#'
#' @param loc matrix of match locations, as from \code{\link{str_locate_all}}
#' @return numeric matrix giving locations of non-matches
#' @export
#' @examples
#' numbers <- "1 and 2 and 4 and 456"
#' num_loc <- str_locate_all(numbers, "[0-9]+")[[1]]
#' str_sub(numbers, num_loc[, "start"], num_loc[, "end"])
#'
#' text_loc <- invert_match(num_loc)
#' str_sub(numbers, text_loc[, "start"], text_loc[, "end"])
invert_match <- function(loc) {
  # Each gap begins one position after a match ends; the first gap begins
  # at position 0.
  gap_start <- c(0L, loc[, "end"] + 1L)
  # Each gap stops one position before the next match starts; the final gap
  # gets an end of -1.
  gap_end <- c(loc[, "start"] - 1L, -1L)

  cbind(start = gap_start, end = gap_end)
}
stringr/R/sub.r 0000644 0001751 0000144 00000004261 12435722031 013137 0 ustar hornik users #' Extract and replace substrings from a character vector.
#'
#' \code{str_sub} will recycle all arguments to be the same length as the
#' longest argument. If any arguments are of length 0, the output will be
#' a zero length character vector.
#'
#' Substrings are inclusive - they include the characters at both start and
#' end positions. \code{str_sub(string, 1, -1)} will return the complete
#' substring, from the first character to the last.
#'
#' @param string input character vector.
#' @param start,end Two integer vectors. \code{start} gives the position
#' of the first character (defaults to first), \code{end} gives the position
#' of the last (defaults to last character). Alternatively, pass a two-column
#' matrix to \code{start}.
#'
#' Negative values count backwards from the last character.
#' @param value replacement string
#' @return A character vector of substring from \code{start} to \code{end}
#' (inclusive). Will be length of longest input argument.
#' @seealso The underlying implementation in \code{\link[stringi]{stri_sub}}
#' @export
#' @examples
#' hw <- "Hadley Wickham"
#'
#' str_sub(hw, 1, 6)
#' str_sub(hw, end = 6)
#' str_sub(hw, 8, 14)
#' str_sub(hw, 8)
#' str_sub(hw, c(1, 8), c(6, 14))
#'
#' # Negative indices
#' str_sub(hw, -1)
#' str_sub(hw, -7)
#' str_sub(hw, end = -7)
#'
#' # Alternatively, you can pass in a two column matrix, as in the
#' # output from str_locate_all
#' pos <- str_locate_all(hw, "[aeio]")[[1]]
#' str_sub(hw, pos)
#' str_sub(hw, pos[, 1], pos[, 2])
#'
#' # Vectorisation
#' str_sub(hw, seq_len(str_length(hw)))
#' str_sub(hw, end = seq_len(str_length(hw)))
#'
#' # Replacement form
#' x <- "BBCDEF"
#' str_sub(x, 1, 1) <- "A"; x
#' str_sub(x, -1, -1) <- "K"; x
#' str_sub(x, -2, -2) <- "GHIJ"; x
#' str_sub(x, 2, -2) <- ""; x
str_sub <- function(string, start = 1L, end = -1L) {
  # A matrix `start` is passed on as `from` by itself; `end` is not used.
  if (is.matrix(start)) {
    return(stri_sub(string, from = start))
  }

  stri_sub(string, from = start, to = end)
}
#' @export
#' @rdname str_sub
"str_sub<-" <- function(string, start = 1L, end = -1L, value) {
  if (!is.matrix(start)) {
    stri_sub(string, from = start, to = end) <- value
    return(string)
  }

  # A matrix `start` is passed on as `from` by itself; `end` is not used.
  stri_sub(string, from = start) <- value
  string
}
stringr/R/dup.r 0000644 0001751 0000144 00000000714 12435643521 013143 0 ustar hornik users #' Duplicate and concatenate strings within a character vector.
#'
#' Vectorised over \code{string} and \code{times}.
#'
#' @param string Input character vector.
#' @param times Number of times to duplicate each string.
#' @return A character vector.
#' @export
#' @examples
#' fruit <- c("apple", "pear", "banana")
#' str_dup(fruit, 2)
#' str_dup(fruit, 1:3)
#' str_c("ba", str_dup("na", 0:5))
str_dup <- function(string, times) {
  # Thin wrapper: the repetition is done by stringi.
  stri_dup(str = string, times = times)
}
stringr/R/pad-trim.r 0000644 0001751 0000144 00000003233 12440404606 014062 0 ustar hornik users #' Pad a string.
#'
#' Vectorised over \code{string}, \code{width} and \code{pad}.
#'
#' @param string A character vector.
#' @param width Minimum width of padded strings.
#' @param side Side on which padding character is added (left, right or both).
#' @param pad Single padding character (default is a space).
#' @return A character vector.
#' @seealso \code{\link{str_trim}} to remove whitespace
#' @export
#' @examples
#' rbind(
#' str_pad("hadley", 30, "left"),
#' str_pad("hadley", 30, "right"),
#' str_pad("hadley", 30, "both")
#' )
#'
#' # All arguments are vectorised except side
#' str_pad(c("a", "abc", "abcdef"), 10)
#' str_pad("a", c(5, 10, 20))
#' str_pad("a", 10, pad = c("-", "_", " "))
#'
#' # Longer strings are returned unchanged
#' str_pad("hadley", 3)
str_pad <- function(string, width, side = c("left", "right", "both"), pad = " ") {
  # Validate `side`, then pick the stringi padder for that side.
  side <- match.arg(side)
  pad_fun <- switch(side,
    both = stri_pad_both,
    right = stri_pad_right,
    left = stri_pad_left
  )

  pad_fun(string, width, pad = pad)
}
#' Trim whitespace from start and end of string.
#'
#' @param string A character vector.
#' @param side Side on which to remove whitespace (left, right or both).
#' @return A character vector.
#' @export
#' @seealso \code{\link{str_pad}} to add whitespace
#' @examples
#' str_trim(" String with trailing and leading white space\t")
#' str_trim("\n\nString with trailing and leading white space\n\n")
str_trim <- function(string, side = c("both", "left", "right")) {
  # Validate `side`, then pick the stringi trimmer for that side.
  side <- match.arg(side)
  trim_fun <- switch(side,
    both = stri_trim_both,
    right = stri_trim_right,
    left = stri_trim_left
  )

  trim_fun(string)
}
stringr/R/count.r 0000644 0001751 0000144 00000002225 12442343250 013474 0 ustar hornik users #' Count the number of matches in a string.
#'
#' Vectorised over \code{string} and \code{pattern}.
#'
#' @inheritParams str_detect
#' @return An integer vector.
#' @seealso
#' \code{\link[stringi]{stri_count}} which this function wraps.
#'
#' \code{\link{str_locate}}/\code{\link{str_locate_all}} to locate position
#' of matches
#'
#' @export
#' @examples
#' fruit <- c("apple", "banana", "pear", "pineapple")
#' str_count(fruit, "a")
#' str_count(fruit, "p")
#' str_count(fruit, "e")
#' str_count(fruit, c("a", "b", "p", "p"))
#'
#' str_count(c("a.", "...", ".a.a"), ".")
#' str_count(c("a.", "...", ".a.a"), fixed("."))
str_count <- function(string, pattern = "") {
  # Options attached by a modifier function; NULL for a plain string.
  opts <- attr(pattern, "options")

  switch(type(pattern),
    regex = stri_count_regex(string, pattern, opts_regex = opts),
    fixed = stri_count_fixed(string, pattern, opts_fixed = opts),
    coll = stri_count_coll(string, pattern, opts_collator = opts),
    bound = stri_count_boundaries(string, opts_brkiter = opts),
    # The default empty pattern counts character boundaries.
    empty = stri_count_boundaries(string,
      opts_brkiter = stri_opts_brkiter(type = "character"))
  )
}
stringr/R/detect.r 0000644 0001751 0000144 00000003203 12442343250 013611 0 ustar hornik users #' Detect the presence or absence of a pattern in a string.
#'
#' Vectorised over \code{string} and \code{pattern}.
#'
#' @param string Input vector. Either a character vector, or something
#' coercible to one.
#' @param pattern Pattern to look for.
#'
#' The default interpretation is a regular expression, as described
#' in \link[stringi]{stringi-search-regex}. Control options with
#' \code{\link{regex}()}.
#'
#' Match a fixed string (i.e. by comparing only bytes), using
#' \code{\link{fixed}(x)}. This is fast, but approximate. Generally,
#' for matching human text, you'll want \code{\link{coll}(x)} which
#' respects character matching rules for the specified locale.
#'
#' Match character, word, line and sentence boundaries with
#' \code{\link{boundary}()}. An empty pattern, "", is equivalent to
#' \code{boundary("character")}.
#' @return A logical vector.
#' @seealso \code{\link[stringi]{stri_detect}} which this function wraps
#' @export
#' @examples
#' fruit <- c("apple", "banana", "pear", "pineapple")
#' str_detect(fruit, "a")
#' str_detect(fruit, "^a")
#' str_detect(fruit, "a$")
#' str_detect(fruit, "b")
#' str_detect(fruit, "[aeiou]")
#'
#' # Also vectorised over pattern
#' str_detect("aecfg", letters)
str_detect <- function(string, pattern) {
  kind <- type(pattern)

  # Empty patterns and boundary() patterns are not supported here.
  if (kind %in% c("empty", "bound")) {
    stop("Not implemented", call. = FALSE)
  }

  # Options attached by a modifier function; NULL for a plain string.
  opts <- attr(pattern, "options")

  switch(kind,
    regex = stri_detect_regex(string, pattern, opts_regex = opts),
    coll = stri_detect_coll(string, pattern, opts_collator = opts),
    fixed = stri_detect_fixed(string, pattern, opts_fixed = opts)
  )
}
stringr/R/match.r 0000644 0001751 0000144 00000003443 12513530665 013452 0 ustar hornik users #' Extract matched groups from a string.
#'
#' Vectorised over \code{string} and \code{pattern}.
#'
#' @inheritParams str_detect
#' @param pattern Pattern to look for, as defined by an ICU regular
#' expression. See \link[stringi]{stringi-search-regex} for more details.
#' @return For \code{str_match}, a character matrix. First column is the
#' complete match, followed by one column for each capture group.
#' For \code{str_match_all}, a list of character matrices.
#'
#' @seealso \code{\link{str_extract}} to extract the complete match,
#' \code{\link[stringi]{stri_match}} for the underlying
#' implementation.
#' @export
#' @examples
#' strings <- c(" 219 733 8965", "329-293-8753 ", "banana", "595 794 7569",
#' "387 287 6718", "apple", "233.398.9187 ", "482 952 3315",
#' "239 923 8115 and 842 566 4692", "Work: 579-499-7527", "$1000",
#' "Home: 543.355.3679")
#' phone <- "([2-9][0-9]{2})[- .]([0-9]{3})[- .]([0-9]{4})"
#'
#' str_extract(strings, phone)
#' str_match(strings, phone)
#'
#' # Extract/match all
#' str_extract_all(strings, phone)
#' str_match_all(strings, phone)
#'
#' x <- c("