stringr/0000755000176200001440000000000013427716423011753 5ustar liggesusersstringr/inst/0000755000176200001440000000000013427574670012736 5ustar liggesusersstringr/inst/htmlwidgets/0000755000176200001440000000000013427574674015275 5ustar liggesusersstringr/inst/htmlwidgets/str_view.js0000644000176200001440000000036712613714660017466 0ustar liggesusersHTMLWidgets.widget({ name: 'str_view', type: 'output', initialize: function(el, width, height) { }, renderValue: function(el, x, instance) { el.innerHTML = x.html; }, resize: function(el, width, height, instance) { } }); stringr/inst/htmlwidgets/lib/0000755000176200001440000000000012613713452016024 5ustar liggesusersstringr/inst/htmlwidgets/lib/str_view.css0000644000176200001440000000027012613715274020403 0ustar liggesusers.str_view ul, .str_view li { list-style: none; padding: 0; margin: 0.5em 0; font-family: monospace; } .str_view .match { border: 1px solid #ccc; background-color: #eee; } stringr/inst/htmlwidgets/str_view.yaml0000644000176200001440000000014712613714471020010 0ustar liggesusersdependencies: - name: str_view version: 0.1.0 src: htmlwidgets/lib/ stylesheet: str_view.css stringr/inst/doc/0000755000176200001440000000000013427574670013503 5ustar liggesusersstringr/inst/doc/stringr.R0000644000176200001440000001007413427574670015320 0ustar liggesusers## ---- include = FALSE---------------------------------------------------- library(stringr) knitr::opts_chunk$set( comment = "#>", collapse = TRUE ) ## ------------------------------------------------------------------------ str_length("abc") ## ------------------------------------------------------------------------ x <- c("abcdef", "ghifjk") # The 3rd letter str_sub(x, 3, 3) # The 2nd to 2nd-to-last character str_sub(x, 2, -2) ## ------------------------------------------------------------------------ str_sub(x, 3, 3) <- "X" x ## ------------------------------------------------------------------------ str_dup(x, c(2, 3)) ## 
------------------------------------------------------------------------ x <- c("abc", "defghi") str_pad(x, 10) # default pads on left str_pad(x, 10, "both") ## ------------------------------------------------------------------------ str_pad(x, 4) ## ------------------------------------------------------------------------ x <- c("Short", "This is a long string") x %>% str_trunc(10) %>% str_pad(10, "right") ## ------------------------------------------------------------------------ x <- c(" a ", "b ", " c") str_trim(x) str_trim(x, "left") ## ------------------------------------------------------------------------ jabberwocky <- str_c( "`Twas brillig, and the slithy toves ", "did gyre and gimble in the wabe: ", "All mimsy were the borogoves, ", "and the mome raths outgrabe. " ) cat(str_wrap(jabberwocky, width = 40)) ## ------------------------------------------------------------------------ x <- "I like horses." str_to_upper(x) str_to_title(x) str_to_lower(x) # Turkish has two sorts of i: with and without the dot str_to_lower(x, "tr") ## ------------------------------------------------------------------------ x <- c("y", "i", "k") str_order(x) str_sort(x) # In Lithuanian, y comes between i and k str_sort(x, locale = "lt") ## ------------------------------------------------------------------------ strings <- c( "apple", "219 733 8965", "329-293-8753", "Work: 579-499-7527; Home: 543.355.3679" ) phone <- "([2-9][0-9]{2})[- .]([0-9]{3})[- .]([0-9]{4})" ## ------------------------------------------------------------------------ # Which strings contain phone numbers? str_detect(strings, phone) str_subset(strings, phone) ## ------------------------------------------------------------------------ # How many phone numbers in each string? str_count(strings, phone) ## ------------------------------------------------------------------------ # Where in the string is the phone number located? 
(loc <- str_locate(strings, phone)) str_locate_all(strings, phone) ## ------------------------------------------------------------------------ # What are the phone numbers? str_extract(strings, phone) str_extract_all(strings, phone) str_extract_all(strings, phone, simplify = TRUE) ## ------------------------------------------------------------------------ # Pull out the three components of the match str_match(strings, phone) str_match_all(strings, phone) ## ------------------------------------------------------------------------ str_replace(strings, phone, "XXX-XXX-XXXX") str_replace_all(strings, phone, "XXX-XXX-XXXX") ## ------------------------------------------------------------------------ str_split("a-b-c", "-") str_split_fixed("a-b-c", "-", n = 2) ## ------------------------------------------------------------------------ a1 <- "\u00e1" a2 <- "a\u0301" c(a1, a2) a1 == a2 ## ------------------------------------------------------------------------ str_detect(a1, fixed(a2)) str_detect(a1, coll(a2)) ## ------------------------------------------------------------------------ i <- c("I", "İ", "i", "ı") i str_subset(i, coll("i", ignore_case = TRUE)) str_subset(i, coll("i", ignore_case = TRUE, locale = "tr")) ## ------------------------------------------------------------------------ x <- "This is a sentence." str_split(x, boundary("word")) str_count(x, boundary("word")) str_extract_all(x, boundary("word")) ## ------------------------------------------------------------------------ str_split(x, "") str_count(x, "") stringr/inst/doc/stringr.html0000644000176200001440000012577713427574670016104 0ustar liggesusers Introduction to stringr

Introduction to stringr

There are four main families of functions in stringr:

  1. Character manipulation: these functions allow you to manipulate individual characters within the strings in character vectors.

  2. Whitespace tools to add, remove, and manipulate whitespace.

  3. Locale sensitive operations whose operations will vary from locale to locale.

  4. Pattern matching functions. These recognise four engines of pattern description. The most common is regular expressions, but there are three other tools.

Getting and setting individual characters

You can get the length of the string with str_length():

str_length("abc")
#> [1] 3

This is now equivalent to the base R function nchar(). Previously it was needed to work around issues with nchar() such as the fact that it returned 2 for nchar(NA). This has been fixed as of R 3.3.0, so it is no longer so important.

You can access individual character using str_sub(). It takes three arguments: a character vector, a start position and an end position. Either position can either be a positive integer, which counts from the left, or a negative integer which counts from the right. The positions are inclusive, and if longer than the string, will be silently truncated.

x <- c("abcdef", "ghifjk")

# The 3rd letter
str_sub(x, 3, 3)
#> [1] "c" "i"

# The 2nd to 2nd-to-last character
str_sub(x, 2, -2)
#> [1] "bcde" "hifj"

You can also use str_sub() to modify strings:

str_sub(x, 3, 3) <- "X"
x
#> [1] "abXdef" "ghXfjk"

To duplicate individual strings, you can use str_dup():

str_dup(x, c(2, 3))
#> [1] "abXdefabXdef"       "ghXfjkghXfjkghXfjk"

Whitespace

Three functions add, remove, or modify whitespace:

  1. str_pad() pads a string to a fixed length by adding extra whitespace on the left, right, or both sides.

    (You can pad with other characters by using the pad argument.)

    str_pad() will never make a string shorter:

    So if you want to ensure that all strings are the same length (often useful for print methods), combine str_pad() and str_trunc():

  2. The opposite of str_pad() is str_trim(), which removes leading and trailing whitespace:

  3. You can use str_wrap() to modify existing whitespace in order to wrap a paragraph of text, such that the length of each line is as similar as possible.

Locale sensitive

A handful of stringr functions are locale-sensitive: they will perform differently in different regions of the world. These functions are case transformation functions:

x <- "I like horses."
str_to_upper(x)
#> [1] "I LIKE HORSES."
str_to_title(x)
#> [1] "I Like Horses."

str_to_lower(x)
#> [1] "i like horses."
# Turkish has two sorts of i: with and without the dot
str_to_lower(x, "tr")
#> [1] "ı like horses."

String ordering and sorting:

x <- c("y", "i", "k")
str_order(x)
#> [1] 2 3 1

str_sort(x)
#> [1] "i" "k" "y"
# In Lithuanian, y comes between i and k
str_sort(x, locale = "lt")
#> [1] "i" "y" "k"

The locale always defaults to English to ensure that the default behaviour is identical across systems. Locales always include a two letter ISO-639-1 language code (like “en” for English or “zh” for Chinese), and optionally a ISO-3166 country code (like “en_UK” vs “en_US”). You can see a complete list of available locales by running stringi::stri_locale_list().

Pattern matching

The vast majority of stringr functions work with patterns. These are parameterised by the task they perform and the types of patterns they match.

Tasks

Each pattern matching function has the same first two arguments, a character vector of strings to process and a single pattern to match. stringr provides pattern matching functions to detect, locate, extract, match, replace, and split strings. I’ll illustrate how they work with some strings and a regular expression designed to match (US) phone numbers:

Engines

There are four main engines that stringr can use to describe patterns:

Fixed matches

fixed(x) only matches the exact sequence of bytes specified by x. This is a very limited “pattern”, but the restriction can make matching much faster. Beware using fixed() with non-English data. It is problematic because there are often multiple ways of representing the same character. For example, there are two ways to define “á”: either as a single character or as an “a” plus an accent:

They render identically, but because they’re defined differently, fixed() doesn’t find a match. Instead, you can use coll(), explained below, to respect human character comparison rules:

Boundary

boundary() matches boundaries between characters, lines, sentences or words. It’s most useful with str_split(), but can be used with all pattern matching functions:

By convention, "" is treated as boundary("character"):

stringr/inst/doc/regular-expressions.R0000644000176200001440000001167213427574667017664 0ustar liggesusers## ----setup, include = FALSE---------------------------------------------- knitr::opts_chunk$set( collapse = TRUE, comment = "#>" ) library(stringr) ## ---- eval = FALSE------------------------------------------------------- # # The regular call: # str_extract(fruit, "nana") # # Is shorthand for # str_extract(fruit, regex("nana")) ## ------------------------------------------------------------------------ x <- c("apple", "banana", "pear") str_extract(x, "an") ## ------------------------------------------------------------------------ bananas <- c("banana", "Banana", "BANANA") str_detect(bananas, "banana") str_detect(bananas, regex("banana", ignore_case = TRUE)) ## ------------------------------------------------------------------------ str_extract(x, ".a.") ## ------------------------------------------------------------------------ str_detect("\nX\n", ".X.") str_detect("\nX\n", regex(".X.", dotall = TRUE)) ## ------------------------------------------------------------------------ # To create the regular expression, we need \\ dot <- "\\." # But the expression itself only contains one: writeLines(dot) # And this tells R to look for an explicit . 
str_extract(c("abc", "a.c", "bef"), "a\\.c") ## ------------------------------------------------------------------------ x <- "a\\b" writeLines(x) str_extract(x, "\\\\") ## ------------------------------------------------------------------------ x <- c("a.b.c.d", "aeb") starts_with <- "a.b" str_detect(x, paste0("^", starts_with)) str_detect(x, paste0("^\\Q", starts_with, "\\E")) ## ------------------------------------------------------------------------ x <- "a\u0301" str_extract(x, ".") str_extract(x, "\\X") ## ------------------------------------------------------------------------ str_extract_all("1 + 2 = 3", "\\d+")[[1]] ## ------------------------------------------------------------------------ # Some Laotian numbers str_detect("១២៣", "\\d") ## ------------------------------------------------------------------------ (text <- "Some \t badly\n\t\tspaced \f text") str_replace_all(text, "\\s+", " ") ## ------------------------------------------------------------------------ (text <- c('"Double quotes"', "«Guillemet»", "“Fancy quotes”")) str_replace_all(text, "\\p{quotation mark}", "'") ## ------------------------------------------------------------------------ str_extract_all("Don't eat that!", "\\w+")[[1]] str_split("Don't eat that!", "\\W")[[1]] ## ------------------------------------------------------------------------ str_replace_all("The quick brown fox", "\\b", "_") str_replace_all("The quick brown fox", "\\B", "_") ## ------------------------------------------------------------------------ str_detect(c("abc", "def", "ghi"), "abc|def") ## ------------------------------------------------------------------------ str_extract(c("grey", "gray"), "gre|ay") str_extract(c("grey", "gray"), "gr(e|a)y") ## ------------------------------------------------------------------------ pattern <- "(..)\\1" fruit %>% str_subset(pattern) fruit %>% str_subset(pattern) %>% str_match(pattern) ## ------------------------------------------------------------------------ 
str_match(c("grey", "gray"), "gr(e|a)y") str_match(c("grey", "gray"), "gr(?:e|a)y") ## ------------------------------------------------------------------------ x <- c("apple", "banana", "pear") str_extract(x, "^a") str_extract(x, "a$") ## ------------------------------------------------------------------------ x <- "Line 1\nLine 2\nLine 3\n" str_extract_all(x, "^Line..")[[1]] str_extract_all(x, regex("^Line..", multiline = TRUE))[[1]] str_extract_all(x, regex("\\ALine..", multiline = TRUE))[[1]] ## ------------------------------------------------------------------------ x <- "1888 is the longest year in Roman numerals: MDCCCLXXXVIII" str_extract(x, "CC?") str_extract(x, "CC+") str_extract(x, 'C[LX]+') ## ------------------------------------------------------------------------ str_extract(x, "C{2}") str_extract(x, "C{2,}") str_extract(x, "C{2,3}") ## ------------------------------------------------------------------------ str_extract(x, c("C{2,3}", "C{2,3}?")) str_extract(x, c("C[LX]+", "C[LX]+?")) ## ------------------------------------------------------------------------ str_detect("ABC", "(?>A|.B)C") str_detect("ABC", "(?:A|.B)C") ## ------------------------------------------------------------------------ x <- c("1 piece", "2 pieces", "3") str_extract(x, "\\d+(?= pieces?)") y <- c("100", "$400") str_extract(y, "(?<=\\$)\\d+") ## ------------------------------------------------------------------------ str_detect("xyz", "x(?#this is a comment)") ## ------------------------------------------------------------------------ phone <- regex(" \\(? # optional opening parens (\\d{3}) # area code [)- ]? # optional closing parens, dash, or space (\\d{3}) # another three numbers [ -]? 
# optional space or dash (\\d{3}) # three more numbers ", comments = TRUE) str_match("514-791-8141", phone) stringr/inst/doc/stringr.Rmd0000644000176200001440000002247513341254174015636 0ustar liggesusers--- title: "Introduction to stringr" output: rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{Introduction to stringr} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- ```{r, include = FALSE} library(stringr) knitr::opts_chunk$set( comment = "#>", collapse = TRUE ) ``` There are four main families of functions in stringr: 1. Character manipulation: these functions allow you to manipulate individual characters within the strings in character vectors. 1. Whitespace tools to add, remove, and manipulate whitespace. 1. Locale sensitive operations whose operations will vary from locale to locale. 1. Pattern matching functions. These recognise four engines of pattern description. The most common is regular expressions, but there are three other tools. ## Getting and setting individual characters You can get the length of the string with `str_length()`: ```{r} str_length("abc") ``` This is now equivalent to the base R function `nchar()`. Previously it was needed to work around issues with `nchar()` such as the fact that it returned 2 for `nchar(NA)`. This has been fixed as of R 3.3.0, so it is no longer so important. You can access individual character using `str_sub()`. It takes three arguments: a character vector, a `start` position and an `end` position. Either position can either be a positive integer, which counts from the left, or a negative integer which counts from the right. The positions are inclusive, and if longer than the string, will be silently truncated. 
```{r} x <- c("abcdef", "ghifjk") # The 3rd letter str_sub(x, 3, 3) # The 2nd to 2nd-to-last character str_sub(x, 2, -2) ``` You can also use `str_sub()` to modify strings: ```{r} str_sub(x, 3, 3) <- "X" x ``` To duplicate individual strings, you can use `str_dup()`: ```{r} str_dup(x, c(2, 3)) ``` ## Whitespace Three functions add, remove, or modify whitespace: 1. `str_pad()` pads a string to a fixed length by adding extra whitespace on the left, right, or both sides. ```{r} x <- c("abc", "defghi") str_pad(x, 10) # default pads on left str_pad(x, 10, "both") ``` (You can pad with other characters by using the `pad` argument.) `str_pad()` will never make a string shorter: ```{r} str_pad(x, 4) ``` So if you want to ensure that all strings are the same length (often useful for print methods), combine `str_pad()` and `str_trunc()`: ```{r} x <- c("Short", "This is a long string") x %>% str_trunc(10) %>% str_pad(10, "right") ``` 1. The opposite of `str_pad()` is `str_trim()`, which removes leading and trailing whitespace: ```{r} x <- c(" a ", "b ", " c") str_trim(x) str_trim(x, "left") ``` 1. You can use `str_wrap()` to modify existing whitespace in order to wrap a paragraph of text, such that the length of each line is as similar as possible. ```{r} jabberwocky <- str_c( "`Twas brillig, and the slithy toves ", "did gyre and gimble in the wabe: ", "All mimsy were the borogoves, ", "and the mome raths outgrabe. " ) cat(str_wrap(jabberwocky, width = 40)) ``` ## Locale sensitive A handful of stringr functions are locale-sensitive: they will perform differently in different regions of the world. These functions are case transformation functions: ```{r} x <- "I like horses." 
str_to_upper(x) str_to_title(x) str_to_lower(x) # Turkish has two sorts of i: with and without the dot str_to_lower(x, "tr") ``` String ordering and sorting: ```{r} x <- c("y", "i", "k") str_order(x) str_sort(x) # In Lithuanian, y comes between i and k str_sort(x, locale = "lt") ``` The locale always defaults to English to ensure that the default behaviour is identical across systems. Locales always include a two letter ISO-639-1 language code (like "en" for English or "zh" for Chinese), and optionally a ISO-3166 country code (like "en_UK" vs "en_US"). You can see a complete list of available locales by running `stringi::stri_locale_list()`. ## Pattern matching The vast majority of stringr functions work with patterns. These are parameterised by the task they perform and the types of patterns they match. ### Tasks Each pattern matching function has the same first two arguments, a character vector of `string`s to process and a single `pattern` to match. stringr provides pattern matching functions to **detect**, **locate**, **extract**, **match**, **replace**, and **split** strings. I'll illustrate how they work with some strings and a regular expression designed to match (US) phone numbers: ```{r} strings <- c( "apple", "219 733 8965", "329-293-8753", "Work: 579-499-7527; Home: 543.355.3679" ) phone <- "([2-9][0-9]{2})[- .]([0-9]{3})[- .]([0-9]{4})" ``` - `str_detect()` detects the presence or absence of a pattern and returns a logical vector (similar to `grepl()`). `str_subset()` returns the elements of a character vector that match a regular expression (similar to `grep()` with `value = TRUE`)`. ```{r} # Which strings contain phone numbers? str_detect(strings, phone) str_subset(strings, phone) ``` - `str_count()` counts the number of matches: ```{r} # How many phone numbers in each string? str_count(strings, phone) ``` - `str_locate()` locates the **first** position of a pattern and returns a numeric matrix with columns start and end. 
`str_locate_all()` locates all matches, returning a list of numeric matrices. Similar to `regexpr()` and `gregexpr()`. ```{r} # Where in the string is the phone number located? (loc <- str_locate(strings, phone)) str_locate_all(strings, phone) ``` - `str_extract()` extracts text corresponding to the **first** match, returning a character vector. `str_extract_all()` extracts all matches and returns a list of character vectors. ```{r} # What are the phone numbers? str_extract(strings, phone) str_extract_all(strings, phone) str_extract_all(strings, phone, simplify = TRUE) ``` - `str_match()` extracts capture groups formed by `()` from the **first** match. It returns a character matrix with one column for the complete match and one column for each group. `str_match_all()` extracts capture groups from all matches and returns a list of character matrices. Similar to `regmatches()`. ```{r} # Pull out the three components of the match str_match(strings, phone) str_match_all(strings, phone) ``` - `str_replace()` replaces the **first** matched pattern and returns a character vector. `str_replace_all()` replaces all matches. Similar to `sub()` and `gsub()`. ```{r} str_replace(strings, phone, "XXX-XXX-XXXX") str_replace_all(strings, phone, "XXX-XXX-XXXX") ``` - `str_split_fixed()` splits a string into a **fixed** number of pieces based on a pattern and returns a character matrix. `str_split()` splits a string into a **variable** number of pieces and returns a list of character vectors. ```{r} str_split("a-b-c", "-") str_split_fixed("a-b-c", "-", n = 2) ``` ### Engines There are four main engines that stringr can use to describe patterns: * Regular expressions, the default, as shown above, and described in `vignette("regular-expressions")`. * Fixed bytewise matching, with `fixed()`. * Locale-sensitive character matching, with `coll()` * Text boundary analysis with `boundary()`. #### Fixed matches `fixed(x)` only matches the exact sequence of bytes specified by `x`. 
This is a very limited "pattern", but the restriction can make matching much faster. Beware using `fixed()` with non-English data. It is problematic because there are often multiple ways of representing the same character. For example, there are two ways to define "á": either as a single character or as an "a" plus an accent: ```{r} a1 <- "\u00e1" a2 <- "a\u0301" c(a1, a2) a1 == a2 ``` They render identically, but because they're defined differently, `fixed()` doesn't find a match. Instead, you can use `coll()`, explained below, to respect human character comparison rules: ```{r} str_detect(a1, fixed(a2)) str_detect(a1, coll(a2)) ``` #### Collation search `coll(x)` looks for a match to `x` using human-language **coll**ation rules, and is particularly important if you want to do case insensitive matching. Collation rules differ around the world, so you'll also need to supply a `locale` parameter. ```{r} i <- c("I", "İ", "i", "ı") i str_subset(i, coll("i", ignore_case = TRUE)) str_subset(i, coll("i", ignore_case = TRUE, locale = "tr")) ``` The downside of `coll()` is speed. Because the rules for recognising which characters are the same are complicated, `coll()` is relatively slow compared to `regex()` and `fixed()`. Note that when both `fixed()` and `regex()` have `ignore_case` arguments, they perform a much simpler comparison than `coll()`. #### Boundary `boundary()` matches boundaries between characters, lines, sentences or words. It's most useful with `str_split()`, but can be used with all pattern matching functions: ```{r} x <- "This is a sentence." str_split(x, boundary("word")) str_count(x, boundary("word")) str_extract_all(x, boundary("word")) ``` By convention, `""` is treated as `boundary("character")`: ```{r} str_split(x, "") str_count(x, "") ``` stringr/inst/doc/regular-expressions.html0000644000176200001440000013743313427574667020433 0ustar liggesusers Regular expressions

Regular expressions

Regular expressions are a concise and flexible tool for describing patterns in strings. This vignette describes the key features of stringr’s regular expressions, as implemented by stringi. It is not a tutorial, so if you’re unfamiliar regular expressions, I’d recommend starting at http://r4ds.had.co.nz/strings.html. If you want to master the details, I’d recommend reading the classic Mastering Regular Expressions by Jeffrey E. F. Friedl.

Regular expressions are the default pattern engine in stringr. That means when you use a pattern matching function with a bare string, it’s equivalent to wrapping it in a call to regex():

# The regular call:
str_extract(fruit, "nana")
# Is shorthand for
str_extract(fruit, regex("nana"))

You will need to use regex() explicitly if you want to override the default options, as you’ll see in examples below.

Basic matches

The simplest patterns match exact strings:

x <- c("apple", "banana", "pear")
str_extract(x, "an")
#> [1] NA   "an" NA

You can perform a case-insensitive match using ignore_case = TRUE:

bananas <- c("banana", "Banana", "BANANA")
str_detect(bananas, "banana")
#> [1]  TRUE FALSE FALSE
str_detect(bananas, regex("banana", ignore_case = TRUE))
#> [1] TRUE TRUE TRUE

The next step up in complexity is ., which matches any character except a newline:

str_extract(x, ".a.")
#> [1] NA    "ban" "ear"

You can allow . to match everything, including \n, by setting dotall = TRUE:

str_detect("\nX\n", ".X.")
#> [1] FALSE
str_detect("\nX\n", regex(".X.", dotall = TRUE))
#> [1] TRUE

Escaping

If “.” matches any character, how do you match a literal “.”? You need to use an “escape” to tell the regular expression you want to match it exactly, not use its special behaviour. Like strings, regexps use the backslash, \, to escape special behaviour. So to match an ., you need the regexp \.. Unfortunately this creates a problem. We use strings to represent regular expressions, and \ is also used as an escape symbol in strings. So to create the regular expression \. we need the string "\\.".

# To create the regular expression, we need \\
dot <- "\\."

# But the expression itself only contains one:
writeLines(dot)
#> \.

# And this tells R to look for an explicit .
str_extract(c("abc", "a.c", "bef"), "a\\.c")
#> [1] NA    "a.c" NA

If \ is used as an escape character in regular expressions, how do you match a literal \? Well you need to escape it, creating the regular expression \\. To create that regular expression, you need to use a string, which also needs to escape \. That means to match a literal \ you need to write "\\\\" — you need four backslashes to match one!

x <- "a\\b"
writeLines(x)
#> a\b

str_extract(x, "\\\\")
#> [1] "\\"

In this vignette, I use \. to denote the regular expression, and "\\." to denote the string that represents the regular expression.

An alternative quoting mechanism is \Q...\E: all the characters in ... are treated as exact matches. This is useful if you want to exactly match user input as part of a regular expression.

x <- c("a.b.c.d", "aeb")
starts_with <- "a.b"

str_detect(x, paste0("^", starts_with))
#> [1] TRUE TRUE
str_detect(x, paste0("^\\Q", starts_with, "\\E"))
#> [1]  TRUE FALSE

Special characters

Escapes also allow you to specify individual characters that are otherwise hard to type. You can specify individual unicode characters in five ways, either as a variable number of hex digits (four is most common), or by name:

Similarly, you can specify many common control characters:

(Many of these are only of historical interest and are only included here for the sake of completeness.)

Matching multiple characters

There are a number of patterns that match more than one character. You’ve already seen ., which matches any character (except a newline). A closely related operator is \X, which matches a grapheme cluster, a set of individual elements that form a single symbol. For example, one way of representing “á” is as the letter “a” plus an accent: . will match the component “a”, while \X will match the complete symbol:

x <- "a\u0301"
str_extract(x, ".")
#> [1] "a"
str_extract(x, "\\X")
#> [1] "á"

There are five other escaped pairs that match narrower classes of characters:

You can also create your own character classes using []:

There are a number of pre-built classes that you can use inside []:

These all go inside the [] for character classes, i.e. [[:digit:]AX] matches all digits, A, and X.

You can also using Unicode properties, like [\p{Letter}], and various set operations, like [\p{Letter}--\p{script=latin}]. See ?"stringi-search-charclass" for details.

Alternation

| is the alternation operator, which will pick between one or more possible matches. For example, abc|def will match abc or def.

str_detect(c("abc", "def", "ghi"), "abc|def")
#> [1]  TRUE  TRUE FALSE

Note that the precedence for | is low, so that abc|def matches abc or def not abcyz or abxyz.

Grouping

You can use parentheses to override the default precedence rules:

str_extract(c("grey", "gray"), "gre|ay")
#> [1] "gre" "ay"
str_extract(c("grey", "gray"), "gr(e|a)y")
#> [1] "grey" "gray"

Parenthesis also define “groups” that you can refer to with backreferences, like \1, \2 etc, and can be extracted with str_match(). For example, the following regular expression finds all fruits that have a repeated pair of letters:

pattern <- "(..)\\1"
fruit %>% 
  str_subset(pattern)
#> [1] "banana"      "coconut"     "cucumber"    "jujube"      "papaya"     
#> [6] "salal berry"

fruit %>% 
  str_subset(pattern) %>% 
  str_match(pattern)
#>      [,1]   [,2]
#> [1,] "anan" "an"
#> [2,] "coco" "co"
#> [3,] "cucu" "cu"
#> [4,] "juju" "ju"
#> [5,] "papa" "pa"
#> [6,] "alal" "al"

You can use (?:...), the non-grouping parentheses, to control precedence but not capture the match in a group. This is slightly more efficient than capturing parentheses.

str_match(c("grey", "gray"), "gr(e|a)y")
#>      [,1]   [,2]
#> [1,] "grey" "e" 
#> [2,] "gray" "a"
str_match(c("grey", "gray"), "gr(?:e|a)y")
#>      [,1]  
#> [1,] "grey"
#> [2,] "gray"

This is most useful for more complex cases where you need to capture matches and control precedence independently.

Anchors

By default, regular expressions will match any part of a string. It’s often useful to anchor the regular expression so that it matches from the start or end of the string:

x <- c("apple", "banana", "pear")
str_extract(x, "^a")
#> [1] "a" NA  NA
str_extract(x, "a$")
#> [1] NA  "a" NA

To match a literal “$” or “^”, you need to escape them, \$, and \^.

For multiline strings, you can use regex(multiline = TRUE). This changes the behaviour of ^ and $, and introduces three new operators:

x <- "Line 1\nLine 2\nLine 3\n"
str_extract_all(x, "^Line..")[[1]]
#> [1] "Line 1"
str_extract_all(x, regex("^Line..", multiline = TRUE))[[1]]
#> [1] "Line 1" "Line 2" "Line 3"
str_extract_all(x, regex("\\ALine..", multiline = TRUE))[[1]]
#> [1] "Line 1"

Repetition

You can control how many times a pattern matches with the repetition operators:

x <- "1888 is the longest year in Roman numerals: MDCCCLXXXVIII"
str_extract(x, "CC?")
#> [1] "CC"
str_extract(x, "CC+")
#> [1] "CCC"
str_extract(x, 'C[LX]+')
#> [1] "CLXXX"

Note that the precedence of these operators is high, so you can write: colou?r to match either American or British spellings. That means most uses will need parentheses, like bana(na)+.

You can also specify the number of matches precisely:

str_extract(x, "C{2}")
#> [1] "CC"
str_extract(x, "C{2,}")
#> [1] "CCC"
str_extract(x, "C{2,3}")
#> [1] "CCC"

By default these matches are “greedy”: they will match the longest string possible. You can make them “lazy”, matching the shortest string possible by putting a ? after them:

str_extract(x, c("C{2,3}", "C{2,3}?"))
#> [1] "CCC" "CC"
str_extract(x, c("C[LX]+", "C[LX]+?"))
#> [1] "CLXXX" "CL"

You can also make the matches possessive by putting a + after them, which means that if later parts of the match fail, the repetition will not be re-tried with a smaller number of characters. This is an advanced feature used to improve performance in worst-case scenarios (called “catastrophic backtracking”).

A related concept is the atomic-match parenthesis, (?>...). If a later match fails and the engine needs to back-track, an atomic match is kept as is: it succeeds or fails as a whole. Compare the following two regular expressions:

str_detect("ABC", "(?>A|.B)C")
#> [1] FALSE
str_detect("ABC", "(?:A|.B)C")
#> [1] TRUE

The atomic match fails because it matches A, and then the next character is a C so it fails. The regular match succeeds because it matches A, but then C doesn’t match, so it back-tracks and tries B instead.

Look arounds

These assertions look ahead or behind the current match without “consuming” any characters (i.e. without changing the input position). There are four variants: (?=...) is a positive look-ahead, (?!...) is a negative look-ahead, (?<=...) is a positive look-behind, and (?<!...) is a negative look-behind.

These are useful when you want to check that a pattern exists, but you don’t want to include it in the result:

x <- c("1 piece", "2 pieces", "3")
str_extract(x, "\\d+(?= pieces?)")
#> [1] "1" "2" NA

y <- c("100", "$400")
str_extract(y, "(?<=\\$)\\d+")
#> [1] NA    "400"

Comments

There are two ways to include comments in a regular expression. The first is with (?#...):

str_detect("xyz", "x(?#this is a comment)")
#> [1] TRUE

The second is to use regex(comments = TRUE). This form ignores spaces and newlines, and everything after #. To match a literal space, you’ll need to escape it: "\\ ". This is a useful way of describing complex regular expressions:

phone <- regex("
  \\(?     # optional opening parens
  (\\d{3}) # area code
  [)- ]?   # optional closing parens, dash, or space
  (\\d{3}) # another three numbers
  [ -]?    # optional space or dash
  (\\d{3}) # three more numbers
  ", comments = TRUE)

str_match("514-791-8141", phone)
#>      [,1]          [,2]  [,3]  [,4] 
#> [1,] "514-791-814" "514" "791" "814"
stringr/inst/doc/regular-expressions.Rmd0000644000176200001440000003433113341254174020161 0ustar liggesusers--- title: "Regular expressions" output: rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{Regular expressions} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- ```{r setup, include = FALSE} knitr::opts_chunk$set( collapse = TRUE, comment = "#>" ) library(stringr) ``` Regular expressions are a concise and flexible tool for describing patterns in strings. This vignette describes the key features of stringr's regular expressions, as implemented by [stringi](https://github.com/gagolews/stringi). It is not a tutorial, so if you're unfamiliar regular expressions, I'd recommend starting at . If you want to master the details, I'd recommend reading the classic [_Mastering Regular Expressions_](https://amzn.com/0596528124) by Jeffrey E. F. Friedl. Regular expressions are the default pattern engine in stringr. That means when you use a pattern matching function with a bare string, it's equivalent to wrapping it in a call to `regex()`: ```{r, eval = FALSE} # The regular call: str_extract(fruit, "nana") # Is shorthand for str_extract(fruit, regex("nana")) ``` You will need to use `regex()` explicitly if you want to override the default options, as you'll see in examples below. 
## Basic matches The simplest patterns match exact strings: ```{r} x <- c("apple", "banana", "pear") str_extract(x, "an") ``` You can perform a case-insensitive match using `ignore_case = TRUE`: ```{r} bananas <- c("banana", "Banana", "BANANA") str_detect(bananas, "banana") str_detect(bananas, regex("banana", ignore_case = TRUE)) ``` The next step up in complexity is `.`, which matches any character except a newline: ```{r} str_extract(x, ".a.") ``` You can allow `.` to match everything, including `\n`, by setting `dotall = TRUE`: ```{r} str_detect("\nX\n", ".X.") str_detect("\nX\n", regex(".X.", dotall = TRUE)) ``` ## Escaping If "`.`" matches any character, how do you match a literal "`.`"? You need to use an "escape" to tell the regular expression you want to match it exactly, not use its special behaviour. Like strings, regexps use the backslash, `\`, to escape special behaviour. So to match an `.`, you need the regexp `\.`. Unfortunately this creates a problem. We use strings to represent regular expressions, and `\` is also used as an escape symbol in strings. So to create the regular expression `\.` we need the string `"\\."`. ```{r} # To create the regular expression, we need \\ dot <- "\\." # But the expression itself only contains one: writeLines(dot) # And this tells R to look for an explicit . str_extract(c("abc", "a.c", "bef"), "a\\.c") ``` If `\` is used as an escape character in regular expressions, how do you match a literal `\`? Well you need to escape it, creating the regular expression `\\`. To create that regular expression, you need to use a string, which also needs to escape `\`. That means to match a literal `\` you need to write `"\\\\"` --- you need four backslashes to match one! ```{r} x <- "a\\b" writeLines(x) str_extract(x, "\\\\") ``` In this vignette, I use `\.` to denote the regular expression, and `"\\."` to denote the string that represents the regular expression. 
An alternative quoting mechanism is `\Q...\E`: all the characters in `...` are treated as exact matches. This is useful if you want to exactly match user input as part of a regular expression. ```{r} x <- c("a.b.c.d", "aeb") starts_with <- "a.b" str_detect(x, paste0("^", starts_with)) str_detect(x, paste0("^\\Q", starts_with, "\\E")) ``` ## Special characters Escapes also allow you to specify individual characters that are otherwise hard to type. You can specify individual unicode characters in five ways, either as a variable number of hex digits (four is most common), or by name: * `\xhh`: 2 hex digits. * `\x{hhhh}`: 1-6 hex digits. * `\uhhhh`: 4 hex digits. * `\Uhhhhhhhh`: 8 hex digits. * `\N{name}`, e.g. `\N{grinning face}` matches the basic smiling emoji. Similarly, you can specify many common control characters: * `\a`: bell. * `\cX`: match a control-X character. * `\e`: escape (`\u001B`). * `\f`: form feed (`\u000C`). * `\n`: line feed (`\u000A`). * `\r`: carriage return (`\u000D`). * `\t`: horizontal tabulation (`\u0009`). * `\0ooo` match an octal character. 'ooo' is from one to three octal digits, from 000 to 0377. The leading zero is required. (Many of these are only of historical interest and are only included here for the sake of completeness.) ## Matching multiple characters There are a number of patterns that match more than one character. You've already seen `.`, which matches any character (except a newline). A closely related operator is `\X`, which matches a __grapheme cluster__, a set of individual elements that form a single symbol. For example, one way of representing "á" is as the letter "a" plus an accent: `.` will match the component "a", while `\X` will match the complete symbol: ```{r} x <- "a\u0301" str_extract(x, ".") str_extract(x, "\\X") ``` There are five other escaped pairs that match narrower classes of characters: * `\d`: matches any digit. The complement, `\D`, matches any character that is not a decimal digit. 
```{r} str_extract_all("1 + 2 = 3", "\\d+")[[1]] ``` Technically, `\d` includes any character in the Unicode Category of Nd ("Number, Decimal Digit"), which also includes numeric symbols from other languages: ```{r} # Some Laotian numbers str_detect("១២៣", "\\d") ``` * `\s`: matches any whitespace. This includes tabs, newlines, form feeds, and any character in the Unicode Z Category (which includes a variety of space characters and other separators.). The complement, `\S`, matches any non-whitespace character. ```{r} (text <- "Some \t badly\n\t\tspaced \f text") str_replace_all(text, "\\s+", " ") ``` * `\p{property name}` matches any character with specific unicode property, like `\p{Uppercase}` or `\p{Diacritic}`. The complement, `\P{property name}`, matches all characters without the property. A complete list of unicode properties can be found at . ```{r} (text <- c('"Double quotes"', "«Guillemet»", "“Fancy quotes”")) str_replace_all(text, "\\p{quotation mark}", "'") ``` * `\w` matches any "word" character, which includes alphabetic characters, marks and decimal numbers. The complement, `\W`, matches any non-word character. ```{r} str_extract_all("Don't eat that!", "\\w+")[[1]] str_split("Don't eat that!", "\\W")[[1]] ``` Technically, `\w` also matches connector punctuation, `\u200c` (zero width connector), and `\u200d` (zero width joiner), but these are rarely seen in the wild. * `\b` matches word boundaries, the transition between word and non-word characters. `\B` matches the opposite: boundaries that have either both word or non-word characters on either side. ```{r} str_replace_all("The quick brown fox", "\\b", "_") str_replace_all("The quick brown fox", "\\B", "_") ``` You can also create your own __character classes__ using `[]`: * `[abc]`: matches a, b, or c. * `[a-z]`: matches every character between a and z (in Unicode code point order). * `[^abc]`: matches anything except a, b, or c. * `[\^\-]`: matches `^` or `-`. 
There are a number of pre-built classes that you can use inside `[]`: * `[:punct:]`: punctuation. * `[:alpha:]`: letters. * `[:lower:]`: lowercase letters. * `[:upper:]`: upperclass letters. * `[:digit:]`: digits. * `[:xdigit:]`: hex digits. * `[:alnum:]`: letters and numbers. * `[:cntrl:]`: control characters. * `[:graph:]`: letters, numbers, and punctuation. * `[:print:]`: letters, numbers, punctuation, and whitespace. * `[:space:]`: space characters (basically equivalent to `\s`). * `[:blank:]`: space and tab. These all go inside the `[]` for character classes, i.e. `[[:digit:]AX]` matches all digits, A, and X. You can also using Unicode properties, like `[\p{Letter}]`, and various set operations, like `[\p{Letter}--\p{script=latin}]`. See `?"stringi-search-charclass"` for details. ## Alternation `|` is the __alternation__ operator, which will pick between one or more possible matches. For example, `abc|def` will match `abc` or `def`. ```{r} str_detect(c("abc", "def", "ghi"), "abc|def") ``` Note that the precedence for `|` is low, so that `abc|def` matches `abc` or `def` not `abcyz` or `abxyz`. ## Grouping You can use parentheses to override the default precedence rules: ```{r} str_extract(c("grey", "gray"), "gre|ay") str_extract(c("grey", "gray"), "gr(e|a)y") ``` Parenthesis also define "groups" that you can refer to with __backreferences__, like `\1`, `\2` etc, and can be extracted with `str_match()`. For example, the following regular expression finds all fruits that have a repeated pair of letters: ```{r} pattern <- "(..)\\1" fruit %>% str_subset(pattern) fruit %>% str_subset(pattern) %>% str_match(pattern) ``` You can use `(?:...)`, the non-grouping parentheses, to control precedence but not capture the match in a group. This is slightly more efficient than capturing parentheses. 
```{r} str_match(c("grey", "gray"), "gr(e|a)y") str_match(c("grey", "gray"), "gr(?:e|a)y") ``` This is most useful for more complex cases where you need to capture matches and control precedence independently. ## Anchors By default, regular expressions will match any part of a string. It's often useful to __anchor__ the regular expression so that it matches from the start or end of the string: * `^` matches the start of string. * `$` matches the end of the string. ```{r} x <- c("apple", "banana", "pear") str_extract(x, "^a") str_extract(x, "a$") ``` To match a literal "$" or "^", you need to escape them, `\$`, and `\^`. For multiline strings, you can use `regex(multiline = TRUE)`. This changes the behaviour of `^` and `$`, and introduces three new operators: * `^` now matches the start of each line. * `$` now matches the end of each line. * `\A` matches the start of the input. * `\z` matches the end of the input. * `\Z` matches the end of the input, but before the final line terminator, if it exists. ```{r} x <- "Line 1\nLine 2\nLine 3\n" str_extract_all(x, "^Line..")[[1]] str_extract_all(x, regex("^Line..", multiline = TRUE))[[1]] str_extract_all(x, regex("\\ALine..", multiline = TRUE))[[1]] ``` ## Repetition You can control how many times a pattern matches with the repetition operators: * `?`: 0 or 1. * `+`: 1 or more. * `*`: 0 or more. ```{r} x <- "1888 is the longest year in Roman numerals: MDCCCLXXXVIII" str_extract(x, "CC?") str_extract(x, "CC+") str_extract(x, 'C[LX]+') ``` Note that the precedence of these operators is high, so you can write: `colou?r` to match either American or British spellings. That means most uses will need parentheses, like `bana(na)+`. You can also specify the number of matches precisely: * `{n}`: exactly n * `{n,}`: n or more * `{n,m}`: between n and m ```{r} str_extract(x, "C{2}") str_extract(x, "C{2,}") str_extract(x, "C{2,3}") ``` By default these matches are "greedy": they will match the longest string possible. 
You can make them "lazy", matching the shortest string possible by putting a `?` after them: * `??`: 0 or 1, prefer 0. * `+?`: 1 or more, match as few times as possible. * `*?`: 0 or more, match as few times as possible. * `{n,}?`: n or more, match as few times as possible. * `{n,m}?`: between n and m, , match as few times as possible, but at least n. ```{r} str_extract(x, c("C{2,3}", "C{2,3}?")) str_extract(x, c("C[LX]+", "C[LX]+?")) ``` You can also make the matches possessive by putting a `+` after them, which means that if later parts of the match fail, the repetition will not be re-tried with a smaller number of characters. This is an advanced feature used to improve performance in worst-case scenarios (called "catastrophic backtracking"). * `?+`: 0 or 1, possessive. * `++`: 1 or more, possessive. * `*+`: 0 or more, possessive. * `{n}+`: exactly n, possessive. * `{n,}+`: n or more, possessive. * `{n,m}+`: between n and m, possessive. A related concept is the __atomic-match__ parenthesis, `(?>...)`. If a later match fails and the engine needs to back-track, an atomic match is kept as is: it succeeds or fails as a whole. Compare the following two regular expressions: ```{r} str_detect("ABC", "(?>A|.B)C") str_detect("ABC", "(?:A|.B)C") ``` The atomic match fails because it matches A, and then the next character is a C so it fails. The regular match succeeds because it matches A, but then C doesn't match, so it back-tracks and tries B instead. ## Look arounds These assertions look ahead or behind the current match without "consuming" any characters (i.e. changing the input position). * `(?=...)`: positive look-ahead assertion. Matches if `...` matches at the current input. * `(?!...)`: negative look-ahead assertion. Matches if `...` __does not__ match at the current input. * `(?<=...)`: positive look-behind assertion. 
Matches if `...` matches text preceding the current position, with the last character of the match being the character just before the current position. Length must be bounded (i.e. no `*` or `+`). * `(?% str_c(collapse = " ") %>% str_wrap(0) %>% str_count("\n") expect_equal(n_returns, length(letters) - 1) }) stringr/tests/testthat/test-subset.r0000644000176200001440000000120013413766176017420 0ustar liggesuserscontext("Subsetting character vectors") test_that("basic subsetting for fixed patterns works", { expect_equal(str_subset(c("i", "I"), fixed("i")), "i") expect_equal( str_subset(c("i", "I"), fixed("i", ignore_case = TRUE)), c("i", "I") ) # negation works expect_equal(str_subset(c("i", "I"), fixed("i"), negate = TRUE), "I") }) test_that("str_which is equivalent to grep", { expect_equal( str_which(head(letters), "[aeiou]"), grep("[aeiou]", head(letters)) ) # negation works expect_equal( str_which(head(letters), "[aeiou]", negate = TRUE), grep("[aeiou]", head(letters), invert = TRUE) ) }) stringr/tests/testthat/test-detect.r0000644000176200001440000000270313427104326017360 0ustar liggesuserscontext("Detecting patterns") test_that("special cases are correct", { expect_equal(str_detect(NA, "x"), NA) expect_equal(str_detect(character(), "x"), logical()) }) test_that("vectorised patterns work", { expect_equal(str_detect("ab", c("a", "b", "c")), c(T, T, F)) expect_equal(str_detect(c("ca", "ab"), c("a", "c")), c(T, F)) # negation works expect_equal(str_detect("ab", c("a", "b", "c"), negate = TRUE), c(F, F, T)) }) test_that("modifiers work", { expect_false(str_detect("ab", "AB")) expect_true(str_detect("ab", regex("AB", TRUE))) expect_true(str_detect("abc", "ab[c]")) expect_false(str_detect("abc", fixed("ab[c]"))) expect_true(str_detect("ab[c]", fixed("ab[c]"))) expect_true(str_detect("ab[c]", coll("ab[c]"))) expect_true(str_detect("abc", "(?x)a b c")) }) test_that("str_starts works", { expect_true(str_starts("ab", "a")) expect_false(str_starts("ab", "b")) # negation 
expect_false(str_starts("ab", "a", TRUE)) expect_true(str_starts("ab", "b", TRUE)) # Special typing of patterns. expect_true(str_starts("ab", fixed("A", ignore_case = TRUE))) expect_true(str_starts("ab", regex("A", ignore_case = TRUE))) }) test_that("str_ends works", { expect_true(str_ends("ab", "b")) expect_false(str_ends("ab", "a")) # negation expect_false(str_ends("ab", "b", TRUE)) expect_true(str_ends("ab", "a", TRUE)) # Special typing of patterns. expect_true(str_ends("ab", fixed("B", ignore_case = TRUE))) }) stringr/tests/testthat/test-split.r0000644000176200001440000000457412615701126017252 0ustar liggesuserscontext("Splitting strings") test_that("special cases are correct", { expect_equal(str_split(NA, "")[[1]], NA_character_) expect_equal(str_split(character(), ""), list()) }) test_that("str_split functions as expected", { test <- c("bab", "cac", "dadad") result <- str_split(test, "a") expect_is(result, "list") expect_equal(length(result), 3) lengths <- vapply(result, length, integer(1)) expect_equal(lengths, c(2, 2, 3)) expect_equal(result, list(c("b", "b"), c("c", "c"), c("d", "d", "d")) ) }) test_that("vectors give correct results dealt with correctly", { test <- c("bab", "cac", "dadad", "eae") result <- str_split_fixed(test, "a", 3) expect_is(result, "matrix") expect_equal(nrow(result), 4) expect_equal(ncol(result), 3) expect_equal(result[1, ], c("b", "b", "")) expect_equal(result[3, ], c("d", "d", "d")) expect_equal(result[, 1], c("b", "c", "d", "e")) }) test_that("n sets maximum number of splits in str_split", { test <- "Subject: Roger: his drinking problems" expect_equal(length(str_split(test, ": ")[[1]]), 3) expect_equal(length(str_split(test, ": ", 4)[[1]]), 3) expect_equal(length(str_split(test, ": ", 3)[[1]]), 3) expect_equal(length(str_split(test, ": ", 2)[[1]]), 2) expect_equal(length(str_split(test, ": ", 1)[[1]]), 1) expect_equal( str_split(test, ": ", 3)[[1]], c("Subject", "Roger", "his drinking problems") ) expect_equal( str_split(test, ": 
", 2)[[1]], c("Subject", "Roger: his drinking problems") ) }) test_that("n sets exact number of splits in str_split_fixed", { test <- "Subject: Roger: his drinking problems" expect_equal(ncol(str_split_fixed(test, ": ", 4)), 4) expect_equal(ncol(str_split_fixed(test, ": ", 3)), 3) expect_equal(ncol(str_split_fixed(test, ": ", 2)), 2) expect_equal(ncol(str_split_fixed(test, ": ", 1)), 1) expect_equal( str_split_fixed(test, ": ", 3)[1, ], c("Subject", "Roger", "his drinking problems") ) expect_equal( str_split_fixed(test, ": ", 2)[1, ], c("Subject", "Roger: his drinking problems") ) }) test_that("str_split can split sentences correctly", { test <- "This is a sentence. Is this a sentence? Why, yes it is." expect_equal(length(str_split(test, boundary("sentence"))[[1]]), 3) expect_equal( str_split(test, boundary("sentence")), list(c("This is a sentence. ", "Is this a sentence? ", "Why, yes it is.")) ) }) stringr/tests/testthat/test-match.r0000644000176200001440000000422413274612511017204 0ustar liggesuserscontext("Matching groups") set.seed(1410) num <- matrix(sample(9, 10 * 10, replace = T), ncol = 10) num_flat <- apply(num, 1, str_c, collapse = "") phones <- str_c( "(", num[, 1], num[, 2], num[, 3], ") ", num[, 4], num[, 5], num[, 6], " ", num[, 7], num[, 8], num[, 9], num[, 10]) test_that("empty strings return correct matrix of correct size", { skip_if_not_installed("stringi", "1.2.2") expect_equal(str_match(NA, "(a)"), matrix(NA_character_, 1, 2)) expect_equal(str_match(character(), "(a)"), matrix(character(), 0, 2)) }) test_that("no matching cases returns 1 column matrix", { res <- str_match(c("a", "b"), ".") expect_equal(nrow(res), 2) expect_equal(ncol(res), 1) expect_equal(res[, 1], c("a", "b")) }) test_that("single match works when all match", { matches <- str_match(phones, "\\(([0-9]{3})\\) ([0-9]{3}) ([0-9]{4})") expect_equal(nrow(matches), length(phones)) expect_equal(ncol(matches), 4) expect_equal(matches[, 1], phones) matches_flat <- apply(matches[, -1], 1, 
str_c, collapse = "") expect_equal(matches_flat, num_flat) }) test_that("match returns NA when some inputs don't match", { matches <- str_match(c(phones, "blah", NA), "\\(([0-9]{3})\\) ([0-9]{3}) ([0-9]{4})") expect_equal(nrow(matches), length(phones) + 2) expect_equal(ncol(matches), 4) expect_equal(matches[11, ], rep(NA_character_, 4)) expect_equal(matches[12, ], rep(NA_character_, 4)) }) test_that("match returns NA when optional group doesn't match", { expect_equal(str_match(c("ab", "a"), "(a)(b)?")[, 3], c("b", NA)) }) test_that("match_all returns NA when option group doesn't match", { expect_equal(str_match_all("a", "(a)(b)?")[[1]][1, ], c("a", "a", NA)) }) test_that("multiple match works", { phones_one <- str_c(phones, collapse = " ") multi_match <- str_match_all(phones_one, "\\(([0-9]{3})\\) ([0-9]{3}) ([0-9]{4})") single_matches <- str_match(phones, "\\(([0-9]{3})\\) ([0-9]{3}) ([0-9]{4})") expect_equal(multi_match[[1]], single_matches) }) test_that("match and match_all fail when pattern is not a regex", { expect_error(str_match(phones, fixed("3"))) expect_error(str_match_all(phones, coll("9"))) }) stringr/tests/testthat/test-dup.r0000644000176200001440000000063612615661551016711 0ustar liggesuserscontext("Duplicating strings") test_that("basic duplication works", { expect_equal(str_dup("a", 3), "aaa") expect_equal(str_dup("abc", 2), "abcabc") expect_equal(str_dup(c("a", "b"), 2), c("aa", "bb")) expect_equal(str_dup(c("a", "b"), c(2, 3)), c("aa", "bbb")) }) test_that("0 duplicates equals empty string", { expect_equal(str_dup("a", 0), "") expect_equal(str_dup(c("a", "b"), 0), rep("", 2)) }) stringr/tests/testthat/test-pad.r0000644000176200001440000000107513202640102016641 0ustar liggesuserscontext("Test padding") test_that("long strings are unchanged", { lengths <- sample(40:100, 10) strings <- vapply(lengths, function(x) str_c(letters[sample(26, x, replace = T)], collapse = ""), character(1)) padded <- str_pad(strings, width = 30) 
expect_equal(str_length(padded), str_length(padded)) }) test_that("directions work for simple case", { pad <- function(direction) str_pad("had", direction, width = 10) expect_equal(pad("right"), "had ") expect_equal(pad("left"), " had") expect_equal(pad("both"), " had ") }) stringr/tests/testthat/test-locate.r0000644000176200001440000000174112615677441017373 0ustar liggesuserscontext("Locations") test_that("basic location matching works", { expect_equivalent(str_locate("abc", "a")[1, ], c(1, 1)) expect_equivalent(str_locate("abc", "b")[1, ], c(2, 2)) expect_equivalent(str_locate("abc", "c")[1, ], c(3, 3)) expect_equivalent(str_locate("abc", ".+")[1, ], c(1, 3)) }) test_that("locations are integers", { strings <- c("a b c", "d e f") expect_true(is.integer(str_locate(strings, "[a-z]"))) res <- str_locate_all(strings, "[a-z]")[[1]] expect_true(is.integer(res)) expect_true(is.integer(invert_match(res))) }) test_that("both string and patterns are vectorised", { strings <- c("abc", "def") locs <- str_locate(strings, "a") expect_equal(locs[, "start"], c(1, NA)) locs <- str_locate(strings, c("a", "d")) expect_equal(locs[, "start"], c(1, 1)) expect_equal(locs[, "end"], c(1, 1)) locs <- str_locate_all(c("abab"), c("a", "b")) expect_equal(locs[[1]][, "start"], c(1, 3)) expect_equal(locs[[2]][, "start"], c(2, 4)) }) stringr/tests/testthat/test-count.r0000644000176200001440000000054312435640121017234 0ustar liggesuserscontext("Counting matches") test_that("counts are as expected", { fruit <- c("apple", "banana", "pear", "pineapple") expect_equal(str_count(fruit, "a"), c(1, 3, 1, 1)) expect_equal(str_count(fruit, "p"), c(2, 0, 1, 3)) expect_equal(str_count(fruit, "e"), c(1, 0, 1, 2)) expect_equal(str_count(fruit, c("a", "b", "p", "n")), c(1, 1, 1, 1)) }) stringr/tests/testthat/test-join.r0000644000176200001440000000061312615662420017047 0ustar liggesuserscontext("Joining strings") test_that("basic case works", { test <- c("a", "b", "c") expect_equal(str_c(test), test) 
expect_equal(str_c(test, sep = " "), test) expect_equal(str_c(test, collapse = ""), "abc") }) test_that("NULLs are dropped", { test <- letters[1:3] expect_equal(str_c(test, NULL), test) expect_equal(str_c(test, NULL, "a", sep = " "), c("a a", "b a", "c a")) }) stringr/tests/testthat/test-view.R0000644000176200001440000000131013202637725017020 0ustar liggesuserscontext("view") x <- c("abc", "def", "fgh") test_that("view works", { expect_error(str_view(x, "[aeiou]"), NA) expect_error(str_view_all(x, "d|e"), NA) }) test_that("match argument controls what is shown", { a <- str_view(x, "d|e") expect_equal(str_count(a$x$html, "\\"), 3) a <- str_view(x, "d|e", match = TRUE) expect_equal(str_count(a$x$html, "\\"), 1) a <- str_view(x, "d|e", match = FALSE) expect_equal(str_count(a$x$html, "\\"), 2) }) test_that("view_all shows all matches", { a <- str_view_all(x, "d|e", match = TRUE) expect_equal(str_count(a$x$html, "match"), 2) a <- str_view_all(x, "d|e", match = FALSE) expect_equal(str_count(a$x$html, "match"), 0) }) stringr/tests/testthat/test-flatten.R0000644000176200001440000000022413202640257017500 0ustar liggesuserscontext("test-flatten.R") test_that("equivalent to paste with collapse", { expect_equal(str_flatten(letters), paste0(letters, collapse = "")) }) stringr/tests/testthat/test-case.R0000644000176200001440000000073413427104326016765 0ustar liggesuserscontext("case") x <- "This is a sentence." 
test_that("to_upper and to_lower have equivalent base versions", { expect_identical(str_to_upper(x), toupper(x)) expect_identical(str_to_lower(x), tolower(x)) }) test_that("to_title creates one capital letter per word", { expect_equal(str_count(x, "\\W+"), str_count(str_to_title(x), "[[:upper:]]")) }) test_that("to_sentence capitalizes just the first letter", { expect_identical(str_to_sentence("a Test"), "A test") }) stringr/NAMESPACE0000644000176200001440000000176113427104443013170 0ustar liggesusers# Generated by roxygen2: do not edit by hand export("%>%") export("str_sub<-") export(boundary) export(coll) export(fixed) export(invert_match) export(regex) export(str_c) export(str_conv) export(str_count) export(str_detect) export(str_dup) export(str_ends) export(str_extract) export(str_extract_all) export(str_flatten) export(str_glue) export(str_glue_data) export(str_interp) export(str_length) export(str_locate) export(str_locate_all) export(str_match) export(str_match_all) export(str_order) export(str_pad) export(str_remove) export(str_remove_all) export(str_replace) export(str_replace_all) export(str_replace_na) export(str_sort) export(str_split) export(str_split_fixed) export(str_squish) export(str_starts) export(str_sub) export(str_subset) export(str_to_lower) export(str_to_sentence) export(str_to_title) export(str_to_upper) export(str_trim) export(str_trunc) export(str_view) export(str_view_all) export(str_which) export(str_wrap) export(word) import(stringi) importFrom(magrittr,"%>%") stringr/NEWS.md0000644000176200001440000002222513427574470013060 0ustar liggesusers# stringr 1.4.0 * `str_interp()` now renders lists consistently independent on the presence of additional placeholders (@amhrasmussen). * New `str_starts()` and `str_ends()` functions to detect patterns at the beginning or end of strings (@jonthegeek, #258). 
* `str_subset()`, `str_detect()`, and `str_which()` get `negate` argument, which is useful when you want the elements that do NOT match (#259, @yutannihilation). * New `str_to_sentence()` function to capitalize with sentence case (@jonthegeek, #202). # stringr 1.3.1 * `str_replace_all()` with a named vector now respects modifier functions (#207) * `str_trunc()` is once again vectorised correctly (#203, @austin3dickey). * `str_view()` handles `NA` values more gracefully (#217). I've also tweaked the sizing policy so hopefully it should work better in notebooks, while preserving the existing behaviour in knit documents (#232). # stringr 1.3.0 ## API changes * During package build, you may see `Error : object ‘ignore.case’ is not exported by 'namespace:stringr'`. This is because the long deprecated `str_join()`, `ignore.case()` and `perl()` have now been removed. ## New features * `str_glue()` and `str_glue_data()` provide convenient wrappers around `glue` and `glue_data()` from the [glue](http://glue.tidyverse.org/) package (#157). * `str_flatten()` is a wrapper around `stri_flatten()` and clearly conveys flattening a character vector into a single string (#186). * `str_remove()` and `str_remove_all()` functions. These wrap `str_replace()` and `str_replace_all()` to remove patterns from strings. (@Shians, #178) * `str_squish()` removes spaces from both the left and right side of strings, and also converts multiple space (or space-like characters) to a single space within strings (@stephlocke, #197). * `str_sub()` gains `omit_na` argument for ignoring `NA`. Accordingly, `str_replace()` now ignores `NA`s and keeps the original strings. (@yutannihilation, #164) ## Bug fixes and minor improvements * `str_trunc()` now preserves NAs (@ClaytonJY, #162) * `str_trunc()` now throws an error when `width` is shorter than `ellipsis` (@ClaytonJY, #163). * Long deprecated `str_join()`, `ignore.case()` and `perl()` have now been removed. 
# stringr 1.2.0 ## API changes * `str_match_all()` now returns NA if an optional group doesn't match (previously it returned ""). This is more consistent with `str_match()` and other match failures (#134). ## New features * In `str_replace()`, `replacement` can now be a function that is called once for each match and whose return value is used to replace the match. * New `str_which()` mimics `grep()` (#129). * A new vignette (`vignette("regular-expressions")`) describes the details of the regular expressions supported by stringr. The main vignette (`vignette("stringr")`) has been updated to give a high-level overview of the package. ## Minor improvements and bug fixes * `str_order()` and `str_sort()` gain explicit `numeric` argument for sorting mixed numbers and strings. * `str_replace_all()` now throws an error if `replacement` is not a character vector. If `replacement` is `NA_character_` it replaces the complete string with replaces with `NA` (#124). * All functions that take a locale (e.g. `str_to_lower()` and `str_sort()`) default to "en" (English) to ensure that the default is consistent across platforms. # stringr 1.1.0 * Add sample datasets: `fruit`, `words` and `sentences`. * `fixed()`, `regex()`, and `coll()` now throw an error if you use them with anything other than a plain string (#60). I've clarified that the replacement for `perl()` is `regex()` not `regexp()` (#61). `boundary()` has improved defaults when splitting on non-word boundaries (#58, @lmullen). * `str_detect()` now can detect boundaries (by checking for a `str_count()` > 0) (#120). `str_subset()` works similarly. * `str_extract()` and `str_extract_all()` now work with `boundary()`. This is particularly useful if you want to extract logical constructs like words or sentences. `str_extract_all()` respects the `simplify` argument when used with `fixed()` matches. * `str_subset()` now respects custom options for `fixed()` patterns (#79, @gagolews). 
* `str_replace()` and `str_replace_all()` now behave correctly when a replacement string contains `$`s, `\\\\1`, etc. (#83, #99). * `str_split()` gains a `simplify` argument to match `str_extract_all()` etc. * `str_view()` and `str_view_all()` create HTML widgets that display regular expression matches (#96). * `word()` returns `NA` for indexes greater than number of words (#112). # stringr 1.0.0 * stringr is now powered by [stringi](https://github.com/Rexamine/stringi) instead of base R regular expressions. This improves unicode and support, and makes most operations considerably faster. If you find stringr inadequate for your string processing needs, I highly recommend looking at stringi in more detail. * stringr gains a vignette, currently a straight forward update of the article that appeared in the R Journal. * `str_c()` now returns a zero length vector if any of its inputs are zero length vectors. This is consistent with all other functions, and standard R recycling rules. Similarly, using `str_c("x", NA)` now yields `NA`. If you want `"xNA"`, use `str_replace_na()` on the inputs. * `str_replace_all()` gains a convenient syntax for applying multiple pairs of pattern and replacement to the same vector: ```R input <- c("abc", "def") str_replace_all(input, c("[ad]" = "!", "[cf]" = "?")) ``` * `str_match()` now returns NA if an optional group doesn't match (previously it returned ""). This is more consistent with `str_extract()` and other match failures. * New `str_subset()` keeps values that match a pattern. It's a convenient wrapper for `x[str_detect(x)]` (#21, @jiho). * New `str_order()` and `str_sort()` allow you to sort and order strings in a specified locale. * New `str_conv()` to convert strings from specified encoding to UTF-8. * New modifier `boundary()` allows you to count, locate and split by character, word, line and sentence boundaries. * The documentation got a lot of love, and very similar functions (e.g. 
first and all variants) are now documented together. This should hopefully make it easier to locate the function you need. * `ignore.case(x)` has been deprecated in favour of `fixed|regex|coll(x, ignore.case = TRUE)`, `perl(x)` has been deprecated in favour of `regex(x)`. * `str_join()` is deprecated, please use `str_c()` instead. # stringr 0.6.2 * fixed path in `str_wrap` example so works for more R installations. * remove dependency on plyr # stringr 0.6.1 * Zero input to `str_split_fixed` returns 0 row matrix with `n` columns * Export `str_join` # stringr 0.6 * new modifier `perl` that switches to Perl regular expressions * `str_match` now uses new base function `regmatches` to extract matches - this should hopefully be faster than my previous pure R algorithm # stringr 0.5 * new `str_wrap` function which gives `strwrap` output in a more convenient format * new `word` function extract words from a string given user defined separator (thanks to suggestion by David Cooper) * `str_locate` now returns consistent type when matching empty string (thanks to Stavros Macrakis) * new `str_count` counts number of matches in a string. * `str_pad` and `str_trim` receive performance tweaks - for large vectors this should give at least a two order of magnitude speed up * str_length returns NA for invalid multibyte strings * fix small bug in internal `recyclable` function # stringr 0.4 * all functions now vectorised with respect to string, pattern (and where appropriate) replacement parameters * fixed() function now tells stringr functions to use fixed matching, rather than escaping the regular expression. Should improve performance for large vectors. * new ignore.case() modifier tells stringr functions to ignore case of pattern. * str_replace renamed to str_replace_all and new str_replace function added. This makes str_replace consistent with all functions. 
* new str_sub<- function (analogous to substring<-) for substring replacement * str_sub now understands negative positions as a position from the end of the string. -1 replaces Inf as indicator for string end. * str_pad side argument can be left, right, or both (instead of center) * str_trim gains side argument to better match str_pad * stringr now has a namespace and imports plyr (rather than requiring it) # stringr 0.3 * fixed() now also escapes | * str_join() renamed to str_c() * all functions more carefully check input and return informative error messages if not as expected. * add invert_match() function to convert a matrix of location of matches to locations of non-matches * add fixed() function to allow matching of fixed strings. # stringr 0.2 * str_length now returns correct results when used with factors * str_sub now correctly replaces Inf in end argument with length of string * new function str_split_fixed returns fixed number of splits in a character matrix * str_split no longer uses strsplit to preserve trailing breaks stringr/data/0000755000176200001440000000000012743441637012666 5ustar liggesusersstringr/data/fruit.rda0000644000176200001440000000103412743441637014505 0ustar liggesusersBZh91AY&SYko`[`@P@?@SABFiCCFL#*mj)2*Th`M10Jy i#M4hhtbAbv+{)DZgnB0"$(Cepll &zP"! UB郐^Vk]~gg]OCSYg8D b 7&ay^7N;vӸw/lLVYٞ qfۮIJKjHU_۹&DCx3\Z]L*(gYvke(+5@8}@̸HB@⏚F@5Tx*ݡ#( ̔ XPM!{sg[s,F0rö6$)\D ^v^4hOyZ6M+:͚rsMmq/o\,hi3g3V60=pScq{uD! $ PL4A5<MMGh Ʉ4hd!S 4  @ 4B4 ~2md=Ѡ (S(  I6cԞIF44hBGr 1HGb@B^u B ЅP|_; B![$.PE#4yyH!iދC*3xEؖxdA0 PID"47˫ӗ  ieBUƙ+/8zc Nru}ZF%@`47UL=TQ..)3/WA_^U\48 hAC I0>n:Ӊ'oH5C c&K8#`őhŻ"`l%20pk zqg0t ?ͩ ʤUwnk`7r0$iJΑQ2[;I̵ЦD(mpAk.6-WV'%=H.%H߮Ͳ11HV+ZAs3MK热`M: *KvD\IoS~O1FMK lfl5$|!;X<@Ә7\--<)TQأ%0\khzCh>;s'ЎFy{4!]"eSP SE_4C1xN94ɦ(WtJae//K̝^j,[/^EΩyFK<^|2{r9Аwó/D{SBN<uejcQd6ٝ1لtġaNX8F&jUL gىivM+ \) @όcW~?Ө|yA Le̢DPx,;mu"aV86'p<ftg M7TDD6a˞fÌzϧrcq]//JNۦzBUko<&'<5-e]?sowOɟ ;#*F%ko R喗-EG-/4m߶;Ӛ?c&ӊgn}. 
9*Hxđ%t6I "YB}m #YϧՔ(ߓk^8ab>`=HfVTLHy =7=)Tw1=u{HzƌقIlAr,^;G'#3N^ȩ xє jܘ}|X JEdf1E0hɤjatɀ&w.NYQgO>"=wrPOٱ;듨SZ`zADIW8;i2 !tkS7̦6 ɛc/HPnJ2~ t͍tj|l"b5nHn,ΚLhoX3@?JYKlA誟PƮ^6X~n7>ea]rɭک1@:[X"s|X q$-gaacõ{o P8kyi\"N r>G)K1ZJ/nCpğ(P> JibyɐA2=4Lktg!Md8(k,fw:u ҦO+q/:Ϥ_M4! q4Mn%υgc f: YυmKݠr9oɈX5,tQn6ilt*z^ID(tmԖsmZHQ<]fy:.kvTe;HEY1C񺜳&O0S̕:waQɟ` N:;fag͇S]vO8-rYjh=~+";vzĢ+!$u7=;3wo1"Yw|r{[juv~0as7qM2v=Î/{tsLp6 m!N{g8]AF;M[0khLsy"A EWf5ד|g%4asS:Y!Gy[Gi3o1لyTOnQ5oT~1" ),qfH2IrˠfS2!DLVR/mUM~ڵjg]Eq?~8BզyUFvɷ?g|&7ԲkcuwVżnq,oN5H*oMOGYEuvxXƷetA8H{׻k^-zc7.{5Yup[,Z״pEn(Wvw}xf~KD:Y/[7]5}#Y%_ĭjNMI)MǺ1n7Vq^Fk%CR.q]gCK&$D)Ep eߙࢠKW?Uw޴(BPRNzREK(%α;"r@kRv_#c~X6zZ tlVM eqz{s&?&P EEDS/%] kkQ~w@׿sR:'D{=;#!o\Ы'fG}L MYN>VPdX5d _$tK=lK]_,0EG2%P0_.;+7L'0_/pÍs  **q+ `a;Hr̈tFU>F^EV?$7Y+8(^|V!8LCӨ7x 9Nf.vhH ɉW>S߿k؇hs/m,U @EBl4X?i$q_ Qs]v0~]S[6+L݌Ȣ0|~L2MA"/8L%T0<_|@H5@ZNvnt09ޛ>̿F쇄Aٛgqq~ :eەx?e|`x>RS 3\|kXEˮ' دR#&2 cOiUkJ?RVLb3T3'~J`p,qmˈ.O5xUϰ*]E! -2oeP rMΡz PEQ5B5Fzdbdp66Ζyι=IP#͆'q$>G#Јr^^ݷklLRLb7G7sf5]SԑA~1mp1z;iz]yЇMG l[<ݯEEK2>3<9Ju7QjYmFSe& ZauI#L ʭd3D]-N|g5ip" h9=߱ Gհ0$,?=mwI9ʰYIrYf@@Z@@Z͘hf sVG椔6 aJHi``w3^]'*v^ت&\,X$DPPQCGZ$k.1HkfSUb{;&A7+˖Zdqkme[h3˃m\[L̷bn`8 0K"3A4Pfa#>Om*tfQAATXXXQUDEXRQ"DPPU"T QQT(*5բ(EQUHQj ""*AH=TV* EDbF c2*VE,k,報E1XLUUQT EeFҠ1. 
bV A`EOaߵ10+q 2b.Ge.fքKHR۳j1`jwa5p;&hw#2 pVn$鞺6M$)l|ӘVl WmPtaC$ b"q~2~E~^Йc}?=1+ȊLZiAf!_ɬrԦp?@Ӟ& @~#-RsBv5KH{ g_KхYyijpfmE6YR&='^ڿ=sݫL E+ݱ!AQÜ|~#w7|*7,kVj0Lٍ_6cB^  \y>Vnv+Ѫ@G^6o8ŭ+>*l)_U3c<=teRK&En.ߖ,HGk3Q!GR6=RHCb!z{De))̲ü:Lѹ,A@o%0i̫WIwfl9ֵɏ.7MBcFKiLQlj:;YzWN7@B.RbRRjk3.5fS+.I^irߵLvP#m7*}kn LկN]1wl ":C^sᧉ˩6ɘ|Mzg˾ئp[Gn}6'وWACZaѪMm)R՘m3Y1␍"|ƵX$[5f|#7YrÑˡ/isfvL7x i:k.VdhxTǨU*0̘8p bgѡŦ+kKJkjSQ)tjQr0"52{3@k/VYG*CJ?u~w㞢JcZa0[6G8G영sq,}'(iF/8Mħb/$"mr8-xeh/zyW\f{gl6Ld,7}lX4YT*&x%쒆.V2yhn%*Et0 upD0Okr!d鐄V~)S֋OD!%;JB$H0'd)On3bKm~ZA-&nmOf1ᥥP"R2_6>oVF1 l,qx8ho۷PT*iʙ&STmLlSL-̕j8D:}߯Ms!&m샩VXTUsPaeg\cdR>=R8S 22=xWL̮[5li4U%\9kDaAǴ*gYc X!$JŴޡE?˜b,eEW2]>452dB0J6C/NuSz:iglMCKߓAڸg3YM:c s#r`tL[5V'ot4nj'sӬ_Kb=ĒO+8w(Ne)hS%W;F DqX`% N6TGl,b/)aM4 C!꧛ZintѷlϪVx5 d@hXաIEE~@/=BO]`ˋ@6lա B׳2J0#0rh5]aNL4@0. sq 5~w9<HQi!HF]vS#bD׿>1|T>J vJ,2" ؋ɚA }r,۱i? 3|G(]YR0Ӛtk cymκi11X]zhk SWfT,DZb* E!(@2봧QĻN7?[ LI()Pq$&QTNr l hryemiZd0`^5"%H$o !kJ  YQ$$O4GS˝{ˉ6n$I߮ݲV J1Klw" [YQ; `=C,DfUedW!΂`OX}FQaM,2& 4Ŏpy;:]=y?n2(MvB;zմ\X'à&jBSBAMOfckFPpO7iJ!J3p&^4,n}1RLO:V/u]hWsQFip*{E%@!yR]AnW-/EF(I*9pYI |<7#,--,qbPTDh>fa)q71!ΡS #LZ a("x).jf䴼 (A|!zQVg5.yUVZ!H8yZk/4Í ЫkjA0`řWcbOu]=;4pFmhZzB.,yNp ݸJwlx _'F3á;g9CL":(+7:,tvɾ/ aĴ#$6T6S .ZE~{I+%㉳Qc\jqcGd$v |&1gDfYrnUDe4U(8,MXF/ifc|Uŋ6IޜUIy^Os>˝?R>0nn\n.T.;  "ŷ5IFռ1J p_ {͢10y֊5/7 {0^nACqs4J@ >Ϥv,%uX@IY!S$Gy|x$>vHc.PQDC%S(9BC2ȥʑqeC3t !D".\nsxl o=4]8G}ܹfrSVM!< FRd&dM!;Xr ݗW>1svJ6]ޢ NwPd%@fIn\1e}ge|UX*CLpAȃgFnD \`]W0'9 }nb>tq$ 8KO31p|,;=7Ú=R&[":iMÑ%y M|.-|V\5Gչ(/ q#' cǜ|8s<ԡЈͬL P2GOZ'0vA}_~gCKndWS{G6]J׉CC;nr|"dnzW*P*>jR Rq4 e(=C"uV9'=1܁oN"tabmlߦ`2`Hl~+9Pk;k7z/C܃ w* &'D0(bxT0f]}:ay<"_/7T^p#aQVH&Ѐhz;ب?)lgl& # T#ՒA␺"-.)QpC+&!mh)pX֍,ЎcÒnpڣs`k>y_!rDv3O I73ѹz4zv^fuɠ_,;E/CMQ2Hɤ y7eA3\6"HQ3c ;}lW$<9,4S)!:덯5ϜWx}vL ')/ A"Y}FIoo 1)>r^`)KˆŐ958$)C;#uxJa"x^9㱓6=C$0C< $ \}~OKڮ2i4$dd0j8jZz*YGύV47;lޠ`FK0DhYT|8Xx2¿lzH0]dP dC37`vz=z]'$2` DB;ZP =,2q8#$_˜@7/ݝ_>|0ܔС~xjkE$C Dcf8ͱI䱷Iu؁ -興/EIӇr X#:ƻ f/q#ˌІžS8ak] 'ۯj&w~3g.7(ɧ".,eF"@Sl-#%U'+,d,NarHi.<ζA#XAPk{=_oLɁc=BzmoORSt$ k2xqU-w<\ymsƵvzԌAU~X[9p 
`JBcd.$Z~**SB!BY' òd-q~yFۘ>?cD#͕, }皏W4|fO$!vV|jzs \tQaJeQ6ߺيxA.Π9e1)]1.^`YįVz˜TZĻzb!ͼ ߏ1]3ݼT޸y;ʩ!b; XPH)A q9;ܑN$Xzstringr/R/0000755000176200001440000000000013427104326012145 5ustar liggesusersstringr/R/trunc.R0000644000176200001440000000217013274373310013424 0ustar liggesusers#' Truncate a character string. #' #' @param string A character vector. #' @param width Maximum width of string. #' @param side,ellipsis Location and content of ellipsis that indicates #' content has been removed. #' @seealso [str_pad()] to increase the minimum width of a string. #' @export #' @examples #' x <- "This string is moderately long" #' rbind( #' str_trunc(x, 20, "right"), #' str_trunc(x, 20, "left"), #' str_trunc(x, 20, "center") #' ) #' str_trunc <- function(string, width, side = c("right", "left", "center"), ellipsis = "...") { side <- match.arg(side) too_long <- !is.na(string) & str_length(string) > width width... <- width - str_length(ellipsis) if (width... < 0) stop("`width` is shorter than `ellipsis`", .call = FALSE) string[too_long] <- switch(side, right = str_c(str_sub(string[too_long], 1, width...), ellipsis), left = str_c(ellipsis, str_sub(string[too_long], -width..., -1)), center = str_c( str_sub(string[too_long], 1, ceiling(width... / 2)), ellipsis, str_sub(string[too_long], -floor(width... / 2), -1) ) ) string } stringr/R/utils.R0000644000176200001440000000021312752717364013440 0ustar liggesusers#' Pipe operator #' #' @name %>% #' @rdname pipe #' @keywords internal #' @export #' @importFrom magrittr %>% #' @usage lhs \%>\% rhs NULL stringr/R/wrap.r0000644000176200001440000000241413202620054013272 0ustar liggesusers#' Wrap strings into nicely formatted paragraphs. #' #' This is a wrapper around [stringi::stri_wrap()] which implements #' the Knuth-Plass paragraph wrapping algorithm. #' #' @param string character vector of strings to reformat. #' @param width positive integer giving target line width in characters. 
A #' width less than or equal to 1 will put each word on its own line. #' @param indent non-negative integer giving indentation of first line in #' each paragraph #' @param exdent non-negative integer giving indentation of following lines in #' each paragraph #' @return A character vector of re-wrapped strings. #' @export #' @examples #' thanks_path <- file.path(R.home("doc"), "THANKS") #' thanks <- str_c(readLines(thanks_path), collapse = "\n") #' thanks <- word(thanks, 1, 3, fixed("\n\n")) #' cat(str_wrap(thanks), "\n") #' cat(str_wrap(thanks, width = 40), "\n") #' cat(str_wrap(thanks, width = 60, indent = 2), "\n") #' cat(str_wrap(thanks, width = 60, exdent = 2), "\n") #' cat(str_wrap(thanks, width = 0, exdent = 2), "\n") str_wrap <- function(string, width = 80, indent = 0, exdent = 0) { if (width <= 0) width <- 1 out <- stri_wrap(string, width = width, indent = indent, exdent = exdent, simplify = FALSE) vapply(out, str_c, collapse = "\n", character(1)) } stringr/R/subset.R0000644000176200001440000000316213413766176013613 0ustar liggesusers#' Keep strings matching a pattern, or find positions. #' #' `str_subset()` is a wrapper around `x[str_detect(x, pattern)]`, #' and is equivalent to `grep(pattern, x, value = TRUE)`. #' `str_which()` is a wrapper around `which(str_detect(x, pattern))`, #' and is equivalent to `grep(pattern, x)`. #' See [str_detect()] for an equivalent to `grepl(pattern, x)`. #' #' Vectorised over `string` and `pattern` #' #' @inheritParams str_detect #' @return A character vector. #' @seealso [grep()] with argument `value = TRUE`, #' [stringi::stri_subset()] for the underlying implementation. 
#' @export #' @examples #' fruit <- c("apple", "banana", "pear", "pinapple") #' str_subset(fruit, "a") #' str_which(fruit, "a") #' #' str_subset(fruit, "^a") #' str_subset(fruit, "a$") #' str_subset(fruit, "b") #' str_subset(fruit, "[aeiou]") #' #' # Returns elements that do NOT match #' str_subset(fruit, "^p", negate = TRUE) #' #' # Missings never match #' str_subset(c("a", NA, "b"), ".") #' str_which(c("a", NA, "b"), ".") str_subset <- function(string, pattern, negate = FALSE) { switch(type(pattern), empty = , bound = string[str_detect(string, pattern) & !negate], fixed = stri_subset_fixed(string, pattern, omit_na = TRUE, negate = negate, opts_fixed = opts(pattern)), coll = stri_subset_coll(string, pattern, omit_na = TRUE, negate = negate, opts_collator = opts(pattern)), regex = stri_subset_regex(string, pattern, omit_na = TRUE, negate = negate, opts_regex = opts(pattern)) ) } #' @export #' @rdname str_subset str_which <- function(string, pattern, negate = FALSE) { which(str_detect(string, pattern, negate = negate)) } stringr/R/case.R0000644000176200001440000000205313427104326013203 0ustar liggesusers#' Convert case of a string. #' #' @param string String to modify #' @param locale Locale to use for translations. Defaults to "en" (English) #' to ensure consistent default ordering across platforms. #' @examples #' dog <- "The quick brown dog" #' str_to_upper(dog) #' str_to_lower(dog) #' str_to_title(dog) #' str_to_sentence("the quick brown dog") #' #' # Locale matters! 
#' str_to_upper("i") # English #' str_to_upper("i", "tr") # Turkish #' @name case NULL #' @export #' @rdname case str_to_upper <- function(string, locale = "en") { stri_trans_toupper(string, locale = locale) } #' @export #' @rdname case str_to_lower <- function(string, locale = "en") { stri_trans_tolower(string, locale = locale) } #' @export #' @rdname case str_to_title <- function(string, locale = "en") { stri_trans_totitle(string, opts_brkiter = stri_opts_brkiter(locale = locale)) } #' @export #' @rdname case str_to_sentence <- function(string, locale = "en") { stri_trans_totitle( string, opts_brkiter = stri_opts_brkiter(type = "sentence", locale = locale) ) } stringr/R/split.r0000644000176200001440000000362013202620054013454 0ustar liggesusers#' Split up a string into pieces. #' #' Vectorised over `string` and `pattern`. #' #' @inheritParams str_detect #' @inheritParams str_extract #' @param n number of pieces to return. Default (Inf) uses all #' possible split positions. #' #' For `str_split_fixed`, if n is greater than the number of pieces, #' the result will be padded with empty strings. #' @return For `str_split_fixed`, a character matrix with `n` columns. #' For `str_split`, a list of character vectors. #' @seealso [stri_split()] for the underlying implementation. 
#' @export #' @examples #' fruits <- c( #' "apples and oranges and pears and bananas", #' "pineapples and mangos and guavas" #' ) #' #' str_split(fruits, " and ") #' str_split(fruits, " and ", simplify = TRUE) #' #' # Specify n to restrict the number of possible matches #' str_split(fruits, " and ", n = 3) #' str_split(fruits, " and ", n = 2) #' # If n greater than number of pieces, no padding occurs #' str_split(fruits, " and ", n = 5) #' #' # Use fixed to return a character matrix #' str_split_fixed(fruits, " and ", 3) #' str_split_fixed(fruits, " and ", 4) str_split <- function(string, pattern, n = Inf, simplify = FALSE) { if (identical(n, Inf)) n <- -1L switch(type(pattern), empty = stri_split_boundaries(string, n = n, simplify = simplify, opts_brkiter = opts(pattern)), bound = stri_split_boundaries(string, n = n, simplify = simplify, opts_brkiter = opts(pattern)), fixed = stri_split_fixed(string, pattern, n = n, simplify = simplify, opts_fixed = opts(pattern)), regex = stri_split_regex(string, pattern, n = n, simplify = simplify, opts_regex = opts(pattern)), coll = stri_split_coll(string, pattern, n = n, simplify = simplify, opts_collator = opts(pattern)) ) } #' @export #' @rdname str_split str_split_fixed <- function(string, pattern, n) { out <- str_split(string, pattern, n = n, simplify = TRUE) out[is.na(out)] <- "" out } stringr/R/stringr.R0000644000176200001440000000004113001736403013746 0ustar liggesusers#' @keywords internal "_PACKAGE" stringr/R/remove.r0000644000176200001440000000107213202707126013623 0ustar liggesusers#' Remove matched patterns in a string. #' #' Alias for `str_replace(string, pattern, "")`. #' #' @inheritParams str_detect #' @return A character vector. #' @seealso [str_replace()] for the underlying implementation. 
#' @export #' @examples #' fruits <- c("one apple", "two pears", "three bananas") #' str_remove(fruits, "[aeiou]") #' str_remove_all(fruits, "[aeiou]") str_remove <- function(string, pattern) { str_replace(string, pattern, "") } #' @export #' @rdname str_remove str_remove_all <- function(string, pattern) { str_replace_all(string, pattern, "") } stringr/R/conv.R0000644000176200001440000000102313202620054013221 0ustar liggesusers#' Specify the encoding of a string. #' #' This is a convenient way to override the current encoding of a string. #' #' @param string String to re-encode. #' @param encoding Name of encoding. See [stringi::stri_enc_list()] #' for a complete list. #' @export #' @examples #' # Example from encoding?stringi::stringi #' x <- rawToChar(as.raw(177)) #' x #' str_conv(x, "ISO-8859-2") # Polish "a with ogonek" #' str_conv(x, "ISO-8859-1") # Plus-minus str_conv <- function(string, encoding) { stri_conv(string, encoding, "UTF-8") } stringr/R/length.r0000644000176200001440000000202713202620054013602 0ustar liggesusers#' The length of a string. #' #' Technically this returns the number of "code points", in a string. One #' code point usually corresponds to one character, but not always. For example, #' an u with a umlaut might be represented as a single character or as the #' combination a u and an umlaut. #' #' @inheritParams str_detect #' @return A numeric vector giving number of characters (code points) in each #' element of the character vector. Missing string have missing length. #' @seealso [stringi::stri_length()] which this function wraps. 
#' @export #' @examples #' str_length(letters) #' str_length(NA) #' str_length(factor("abc")) #' str_length(c("i", "like", "programming", NA)) #' #' # Two ways of representing a u with an umlaut #' u1 <- "\u00fc" #' u2 <- stringi::stri_trans_nfd(u1) #' # The print the same: #' u1 #' u2 #' # But have a different length #' str_length(u1) #' str_length(u2) #' # Even though they have the same number of characters #' str_count(u1) #' str_count(u2) str_length <- function(string) { stri_length(string) } stringr/R/modifiers.r0000644000176200001440000001217713274371075014330 0ustar liggesusers#' Control matching behaviour with modifier functions. #' #' \describe{ #' \item{fixed}{Compare literal bytes in the string. This is very fast, but #' not usually what you want for non-ASCII character sets.} #' \item{coll}{Compare strings respecting standard collation rules.} #' \item{regex}{The default. Uses ICU regular expressions.} #' \item{boundary}{Match boundaries between things.} #' } #' #' @param pattern Pattern to modify behaviour. #' @param ignore_case Should case differences be ignored in the match? 
#' @name modifiers #' @examples #' pattern <- "a.b" #' strings <- c("abb", "a.b") #' str_detect(strings, pattern) #' str_detect(strings, fixed(pattern)) #' str_detect(strings, coll(pattern)) #' #' # coll() is useful for locale-aware case-insensitive matching #' i <- c("I", "\u0130", "i") #' i #' str_detect(i, fixed("i", TRUE)) #' str_detect(i, coll("i", TRUE)) #' str_detect(i, coll("i", TRUE, locale = "tr")) #' #' # Word boundaries #' words <- c("These are some words.") #' str_count(words, boundary("word")) #' str_split(words, " ")[[1]] #' str_split(words, boundary("word"))[[1]] #' #' # Regular expression variations #' str_extract_all("The Cat in the Hat", "[a-z]+") #' str_extract_all("The Cat in the Hat", regex("[a-z]+", TRUE)) #' #' str_extract_all("a\nb\nc", "^.") #' str_extract_all("a\nb\nc", regex("^.", multiline = TRUE)) #' #' str_extract_all("a\nb\nc", "a.") #' str_extract_all("a\nb\nc", regex("a.", dotall = TRUE)) NULL #' @export #' @rdname modifiers fixed <- function(pattern, ignore_case = FALSE) { pattern <- as_bare_character(pattern) options <- stri_opts_fixed(case_insensitive = ignore_case) structure( pattern, options = options, class = c("fixed", "pattern", "character") ) } #' @export #' @rdname modifiers #' @param locale Locale to use for comparisons. See #' [stringi::stri_locale_list()] for all possible options. #' Defaults to "en" (English) to ensure that the default collation is #' consistent across platforms. #' @param ... Other less frequently used arguments passed on to #' [stringi::stri_opts_collator()], #' [stringi::stri_opts_regex()], or #' [stringi::stri_opts_brkiter()] coll <- function(pattern, ignore_case = FALSE, locale = "en", ...) { pattern <- as_bare_character(pattern) options <- stri_opts_collator( strength = if (ignore_case) 2L else 3L, locale = locale, ... 
) structure( pattern, options = options, class = c("coll", "pattern", "character") ) } #' @export #' @rdname modifiers #' @param multiline If `TRUE`, `$` and `^` match #' the beginning and end of each line. If `FALSE`, the #' default, only match the start and end of the input. #' @param comments If `TRUE`, white space and comments beginning with #' `#` are ignored. Escape literal spaces with `\\ `. #' @param dotall If `TRUE`, `.` will also match line terminators. regex <- function(pattern, ignore_case = FALSE, multiline = FALSE, comments = FALSE, dotall = FALSE, ...) { pattern <- as_bare_character(pattern) options <- stri_opts_regex( case_insensitive = ignore_case, multiline = multiline, comments = comments, dotall = dotall, ... ) structure( pattern, options = options, class = c("regex", "pattern", "character") ) } #' @param type Boundary type to detect. #' \describe{ #' \item{`character`}{Every character is a boundary.} #' \item{`line_break`}{Boundaries are places where it is acceptable to have #' a line break in the current locale.} #' \item{`sentence`}{The beginnings and ends of sentences are boundaries, #' using intelligent rules to avoid counting abbreviations #' ([details](https://www.unicode.org/reports/tr29/#Sentence_Boundaries)).} #' \item{`word`}{The beginnings and ends of words are boundaries.} #' } #' @param skip_word_none Ignore "words" that don't contain any characters #' or numbers - i.e. punctuation. Default `NA` will skip such "words" #' only when splitting on `word` boundaries. #' #' @seealso [str_wrap()] for breaking text to form paragraphs #' @seealso [`stringi::stringi-search-boundaries`] for more detail on the #' various boundaries #' @export #' @rdname modifiers boundary <- function(type = c("character", "line_break", "sentence", "word"), skip_word_none = NA, ...) { type <- match.arg(type) if (identical(skip_word_none, NA)) { skip_word_none <- type == "word" } options <- stri_opts_brkiter( type = type, skip_word_none = skip_word_none, ... 
) structure( character(), options = options, class = c("boundary", "pattern", "character") ) } opts <- function(x) { if (identical(x, "")) { stri_opts_brkiter(type = "character") } else { attr(x, "options") } } type <- function(x) UseMethod("type") type.boundary <- function(x) "bound" type.regex <- function(x) "regex" type.coll <- function(x) "coll" type.fixed <- function(x) "fixed" type.character <- function(x) if (identical(x, "")) "empty" else "regex" as_bare_character <- function(x) { if (is.character(x) && !is.object(x)) { # All OK! return(x) } warning("Coercing `pattern` to a plain character vector.", call. = FALSE) as.character(x) } stringr/R/flatten.R0000644000176200001440000000051213202623120013710 0ustar liggesusers#' Flatten a string #' #' @param string Character to flatten #' @param collapse String to insert between each piece #' @return A character vector of length 1 #' @export #' @examples #' str_flatten(letters) #' str_flatten(letters, "-") str_flatten <- function(string, collapse = "") { stri_flatten(string, collapse = collapse) } stringr/R/sort.R0000644000176200001440000000275613202620054013261 0ustar liggesusers#' Order or sort a character vector. #' #' @param x A character vector to sort. #' @param decreasing A boolean. If `FALSE`, the default, sorts from #' lowest to highest; if `TRUE` sorts from highest to lowest. #' @param na_last Where should `NA` go? `TRUE` at the end, #' `FALSE` at the beginning, `NA` dropped. #' @param locale In which locale should the sorting occur? Defaults to #' the English. This ensures that code behaves the same way across #' platforms. #' @param numeric If `TRUE`, will sort digits numerically, instead #' of as strings. #' @param ... Other options used to control sorting order. Passed on to #' [stringi::stri_opts_collator()]. #' @seealso [stringi::stri_order()] for the underlying implementation. 
#' @export #' @examples #' str_order(letters) #' str_sort(letters) #' #' str_order(letters, locale = "haw") #' str_sort(letters, locale = "haw") #' #' x <- c("100a10", "100a5", "2b", "2a") #' str_sort(x) #' str_sort(x, numeric = TRUE) str_order <- function(x, decreasing = FALSE, na_last = TRUE, locale = "en", numeric = FALSE, ...) { stri_order(x, decreasing = decreasing, na_last = na_last, opts_collator = stri_opts_collator(locale, numeric = numeric, ...)) } #' @export #' @rdname str_order str_sort <- function(x, decreasing = FALSE, na_last = TRUE, locale = "en", numeric = FALSE, ...) { stri_sort(x, decreasing = decreasing, na_last = na_last, opts_collator = stri_opts_collator(locale, numeric = numeric, ...)) } stringr/R/data.R0000644000176200001440000000117713202620054013177 0ustar liggesusers#' Sample character vectors for practicing string manipulations. #' #' `fruit` and `word` come from the `rcorpora` package #' written by Gabor Csardi; the data was collected by Darius Kazemi #' and made available at \url{https://github.com/dariusk/corpora}. #' `sentences` is a collection of "Harvard sentences" used for #' standardised testing of voice. #' #' @format A character vector. #' @name stringr-data #' @examples #' length(sentences) #' sentences[1:5] #' #' length(fruit) #' fruit[1:5] #' #' length(words) #' words[1:5] NULL #' @rdname stringr-data "sentences" #' @rdname stringr-data "fruit" #' @rdname stringr-data "words" stringr/R/word.r0000644000176200001440000000354113202637725013313 0ustar liggesusers#' Extract words from a sentence. #' #' @param string input character vector. #' @param start integer vector giving position of first word to extract. #' Defaults to first word. If negative, counts backwards from last #' character. #' @param end integer vector giving position of last word to extract. #' Defaults to first word. If negative, counts backwards from last #' character. #' @param sep separator between words. Defaults to single space. 
#' @return character vector of words from `start` to `end` #' (inclusive). Will be length of longest input argument. #' @export #' @examples #' sentences <- c("Jane saw a cat", "Jane sat down") #' word(sentences, 1) #' word(sentences, 2) #' word(sentences, -1) #' word(sentences, 2, -1) #' #' # Also vectorised over start and end #' word(sentences[1], 1:3, -1) #' word(sentences[1], 1, 1:4) #' #' # Can define words by other separators #' str <- 'abc.def..123.4568.999' #' word(str, 1, sep = fixed('..')) #' word(str, 2, sep = fixed('..')) word <- function(string, start = 1L, end = start, sep = fixed(" ")) { n <- max(length(string), length(start), length(end)) string <- rep(string, length.out = n) start <- rep(start, length.out = n) end <- rep(end, length.out = n) breaks <- str_locate_all(string, sep) words <- lapply(breaks, invert_match) # Convert negative values into actual positions len <- vapply(words, nrow, integer(1)) neg_start <- !is.na(start) & start < 0L start[neg_start] <- start[neg_start] + len[neg_start] + 1L neg_end <- !is.na(end) & end < 0L end[neg_end] <- end[neg_end] + len[neg_end] + 1L # Replace indexes past end with NA start[start > len] <- NA end[end > len] <- NA # Extract locations starts <- mapply(function(word, loc) word[loc, "start"], words, start) ends <- mapply(function(word, loc) word[loc, "end"], words, end) str_sub(string, starts, ends) } stringr/R/locate.r0000644000176200001440000000522613202620054013574 0ustar liggesusers#' Locate the position of patterns in a string. #' #' Vectorised over `string` and `pattern`. If the match is of length #' 0, (e.g. from a special match like `$`) end will be one character less #' than start. #' #' @inheritParams str_detect #' @return For `str_locate`, an integer matrix. First column gives start #' postion of match, and second column gives end position. For #' `str_locate_all` a list of integer matrices. 
#' @seealso #' [str_extract()] for a convenient way of extracting matches, #' [stringi::stri_locate()] for the underlying implementation. #' @export #' @examples #' fruit <- c("apple", "banana", "pear", "pineapple") #' str_locate(fruit, "$") #' str_locate(fruit, "a") #' str_locate(fruit, "e") #' str_locate(fruit, c("a", "b", "p", "p")) #' #' str_locate_all(fruit, "a") #' str_locate_all(fruit, "e") #' str_locate_all(fruit, c("a", "b", "p", "p")) #' #' # Find location of every character #' str_locate_all(fruit, "") str_locate <- function(string, pattern) { switch(type(pattern), empty = stri_locate_first_boundaries(string, opts_brkiter = opts(pattern)), bound = stri_locate_first_boundaries(string, opts_brkiter = opts(pattern)), fixed = stri_locate_first_fixed(string, pattern, opts_fixed = opts(pattern)), coll = stri_locate_first_coll(string, pattern, opts_collator = opts(pattern)), regex = stri_locate_first_regex(string, pattern, opts_regex = opts(pattern)) ) } #' @rdname str_locate #' @export str_locate_all <- function(string, pattern) { opts <- opts(pattern) switch(type(pattern), empty = stri_locate_all_boundaries(string, omit_no_match = TRUE, opts_brkiter = opts), bound = stri_locate_all_boundaries(string, omit_no_match = TRUE, opts_brkiter = opts), fixed = stri_locate_all_fixed(string, pattern, omit_no_match = TRUE, opts_fixed = opts), regex = stri_locate_all_regex(string, pattern, omit_no_match = TRUE, opts_regex = opts), coll = stri_locate_all_coll(string, pattern, omit_no_match = TRUE, opts_collator = opts) ) } #' Switch location of matches to location of non-matches. #' #' Invert a matrix of match locations to match the opposite of what was #' previously matched. 
#' #' @param loc matrix of match locations, as from [str_locate_all()] #' @return numeric match giving locations of non-matches #' @export #' @examples #' numbers <- "1 and 2 and 4 and 456" #' num_loc <- str_locate_all(numbers, "[0-9]+")[[1]] #' str_sub(numbers, num_loc[, "start"], num_loc[, "end"]) #' #' text_loc <- invert_match(num_loc) #' str_sub(numbers, text_loc[, "start"], text_loc[, "end"]) invert_match <- function(loc) { cbind( start = c(0L, loc[, "end"] + 1L), end = c(loc[, "start"] - 1L, -1L) ) } stringr/R/sub.r0000644000176200001440000000514313221261010013106 0ustar liggesusers#' Extract and replace substrings from a character vector. #' #' `str_sub` will recycle all arguments to be the same length as the #' longest argument. If any arguments are of length 0, the output will be #' a zero length character vector. #' #' Substrings are inclusive - they include the characters at both start and #' end positions. `str_sub(string, 1, -1)` will return the complete #' substring, from the first character to the last. #' #' @param string input character vector. #' @param start,end Two integer vectors. `start` gives the position #' of the first character (defaults to first), `end` gives the position #' of the last (defaults to last character). Alternatively, pass a two-column #' matrix to `start`. #' #' Negative values count backwards from the last character. #' @param omit_na Single logical value. If `TRUE`, missing values in any of the #' arguments provided will result in an unchanged input. #' @param value replacement string #' @return A character vector of substring from `start` to `end` #' (inclusive). Will be length of longest input argument. 
#' @seealso The underlying implementation in [stringi::stri_sub()] #' @export #' @examples #' hw <- "Hadley Wickham" #' #' str_sub(hw, 1, 6) #' str_sub(hw, end = 6) #' str_sub(hw, 8, 14) #' str_sub(hw, 8) #' str_sub(hw, c(1, 8), c(6, 14)) #' #' # Negative indices #' str_sub(hw, -1) #' str_sub(hw, -7) #' str_sub(hw, end = -7) #' #' # Alternatively, you can pass in a two colum matrix, as in the #' # output from str_locate_all #' pos <- str_locate_all(hw, "[aeio]")[[1]] #' str_sub(hw, pos) #' str_sub(hw, pos[, 1], pos[, 2]) #' #' # Vectorisation #' str_sub(hw, seq_len(str_length(hw))) #' str_sub(hw, end = seq_len(str_length(hw))) #' #' # Replacement form #' x <- "BBCDEF" #' str_sub(x, 1, 1) <- "A"; x #' str_sub(x, -1, -1) <- "K"; x #' str_sub(x, -2, -2) <- "GHIJ"; x #' str_sub(x, 2, -2) <- ""; x #' #' # If you want to keep the original if some argument is NA, #' # use omit_na = TRUE #' x1 <- x2 <- x3 <- x4 <- "AAA" #' str_sub(x1, 1, NA) <- "B" #' str_sub(x2, 1, 2) <- NA #' str_sub(x3, 1, NA, omit_na = TRUE) <- "B" #' str_sub(x4, 1, 2, omit_na = TRUE) <- NA #' x1; x2; x3; x4 str_sub <- function(string, start = 1L, end = -1L) { if (is.matrix(start)) { stri_sub(string, from = start) } else { stri_sub(string, from = start, to = end) } } #' @export #' @rdname str_sub "str_sub<-" <- function(string, start = 1L, end = -1L, omit_na = FALSE, value) { if (is.matrix(start)) { stri_sub(string, from = start, omit_na = omit_na) <- value } else { stri_sub(string, from = start, to = end, omit_na = omit_na) <- value } string } stringr/R/glue.R0000644000176200001440000000246613274371075013243 0ustar liggesusers#' Format and interpolate a string with glue #' #' These functions are wrappers around [glue::glue()] and [glue::glue_data()], #' which provide a powerful and elegant syntax for interpolating strings. #' These wrappers provide a small set of the full options. Use the functions #' directly from glue for more control. 
#' #' @export #' @examples #' name <- "Fred" #' age <- 50 #' anniversary <- as.Date("1991-10-12") #' str_glue( #' "My name is {name}, ", #' "my age next year is {age + 1}, ", #' "and my anniversary is {format(anniversary, '%A, %B %d, %Y')}." #' ) #' #' # single braces can be inserted by doubling them #' str_glue("My name is {name}, not {{name}}.") #' #' # You can also used named arguments #' str_glue( #' "My name is {name}, ", #' "and my age next year is {age + 1}.", #' name = "Joe", #' age = 40 #' ) #' #' # `str_glue_data()` is useful in data pipelines #' mtcars %>% str_glue_data("{rownames(.)} has {hp} hp") str_glue <- function(..., .sep = "", .envir = parent.frame()) { glue::glue(..., .sep = .sep, .envir = .envir) } #' @export #' @rdname str_glue #' @inheritParams glue::glue_data str_glue_data <- function(.x, ..., .sep = "", .envir = parent.frame(), .na = "NA") { glue::glue_data( .x, ..., .sep = .sep, .envir = .envir, .na = .na ) } stringr/R/dup.r0000644000176200001440000000070213202620054013107 0ustar liggesusers#' Duplicate and concatenate strings within a character vector. #' #' Vectorised over `string` and `times`. #' #' @param string Input character vector. #' @param times Number of times to duplicate each string. #' @return A character vector. #' @export #' @examples #' fruit <- c("apple", "pear", "banana") #' str_dup(fruit, 2) #' str_dup(fruit, 1:3) #' str_c("ba", str_dup("na", 0:5)) str_dup <- function(string, times) { stri_dup(string, times) } stringr/R/count.r0000644000176200001440000000201313202620054013444 0ustar liggesusers#' Count the number of matches in a string. #' #' Vectorised over `string` and `pattern`. #' #' @inheritParams str_detect #' @return An integer vector. #' @seealso #' [stringi::stri_count()] which this function wraps. 
#' #' [str_locate()]/[str_locate_all()] to locate position #' of matches #' #' @export #' @examples #' fruit <- c("apple", "banana", "pear", "pineapple") #' str_count(fruit, "a") #' str_count(fruit, "p") #' str_count(fruit, "e") #' str_count(fruit, c("a", "b", "p", "p")) #' #' str_count(c("a.", "...", ".a.a"), ".") #' str_count(c("a.", "...", ".a.a"), fixed(".")) str_count <- function(string, pattern = "") { switch(type(pattern), empty = stri_count_boundaries(string, opts_brkiter = opts(pattern)), bound = stri_count_boundaries(string, opts_brkiter = opts(pattern)), fixed = stri_count_fixed(string, pattern, opts_fixed = opts(pattern)), coll = stri_count_coll(string, pattern, opts_collator = opts(pattern)), regex = stri_count_regex(string, pattern, opts_regex = opts(pattern)) ) } stringr/R/trim.R0000644000176200001440000000176513204625103013246 0ustar liggesusers#' Trim whitespace from a string #' #' `str_trim()` removes whitespace from start and end of string; `str_squish()` #' also reduces repeated whitespace inside a string. #' #' @param string A character vector. #' @param side Side on which to remove whitespace (left, right or both). #' @return A character vector. #' @export #' @seealso [str_pad()] to add whitespace #' @examples #' str_trim(" String with trailing and leading white space\t") #' str_trim("\n\nString with trailing and leading white space\n\n") #' #' str_squish(" String with trailing, middle, and leading white space\t") #' str_squish("\n\nString with excess, trailing and leading white space\n\n") str_trim <- function(string, side = c("both", "left", "right")) { side <- match.arg(side) switch(side, left = stri_trim_left(string), right = stri_trim_right(string), both = stri_trim_both(string) ) } #' @export #' @rdname str_trim str_squish <- function(string) { stri_trim_both(str_replace_all(string,"\\s+"," ")) } stringr/R/interp.R0000644000176200001440000001705313413766176013613 0ustar liggesusers#' String interpolation. 
#' #' String interpolation is a useful way of specifying a character string which #' depends on values in a certain environment. It allows for string creation #' which is easier to read and write when compared to using e.g. #' [paste()] or [sprintf()]. The (template) string can #' include expression placeholders of the form `${expression}` or #' `$[format]{expression}`, where expressions are valid R expressions that #' can be evaluated in the given environment, and `format` is a format #' specification valid for use with [sprintf()]. #' #' @param string A template character string. This function is not vectorised: #' a character vector will be collapsed into a single string. #' @param env The environment in which to evaluate the expressions. #' @seealso [str_glue()] and [str_glue_data()] for alternative approaches to #' the same problem. #' @keywords internal #' @return An interpolated character string. #' @author Stefan Milton Bache #' @export #' @examples #' #' # Using values from the environment, and some formats #' user_name <- "smbache" #' amount <- 6.656 #' account <- 1337 #' str_interp("User ${user_name} (account $[08d]{account}) has $$[.2f]{amount}.") #' #' # Nested brace pairs work inside expressions too, and any braces can be #' # placed outside the expressions. #' str_interp("Works with } nested { braces too: $[.2f]{{{2 + 2}*{amount}}}") #' #' # Values can also come from a list #' str_interp( #' "One value, ${value1}, and then another, ${value2*2}.", #' list(value1 = 10, value2 = 20) #' ) #' #' # Or a data frame #' str_interp( #' "Values are $[.2f]{max(Sepal.Width)} and $[.2f]{min(Sepal.Width)}.", #' iris #' ) #' #' # Use a vector when the string is long: #' max_char <- 80 #' str_interp(c( #' "This particular line is so long that it is hard to write ", #' "without breaking the ${max_char}-char barrier!" #' )) str_interp <- function(string, env = parent.frame()) { if (!is.character(string)) { stop("string argument is not character.", call. 
= FALSE) } string <- str_c(string, collapse = "") # Find expression placeholders matches <- interp_placeholders(string) # Determine if any placeholders were found. if (matches$indices[1] <= 0) { string } else { # Evaluate them to get the replacement strings. replacements <- eval_interp_matches(matches$matches, env) # Replace the expressions by their values and return. `regmatches<-`(string, list(matches$indices), FALSE, list(replacements)) } } #' Match String Interpolation Placeholders #' #' Given a character string a set of expression placeholders are matched. They #' are of the form \code{${...}} or optionally \code{$[f]{...}} where `f` #' is a valid format for [sprintf()]. #' #' @param string character: The string to be interpolated. #' #' @return list containing `indices` (regex match data) and `matches`, #' the string representations of matched expressions. #' #' @noRd #' @author Stefan Milton Bache interp_placeholders <- function(string) { # Find starting position of ${} or $[]{} placeholders. starts <- gregexpr("\\$(\\[.*?\\])?\\{", string)[[1]] # Return immediately if no matches are found. if (starts[1] <= 0) return(list(indices = starts)) # Break up the string in parts parts <- substr(rep(string, length(starts)), start = starts, stop = c(starts[-1L] - 1L, nchar(string))) # If there are nested placeholders, each part will not contain a full # placeholder in which case we report invalid string interpolation template. if (any(!grepl("\\$(\\[.*?\\])?\\{.+\\}", parts))) stop("Invalid template string for interpolation.", call. = FALSE) # For each part, find the opening and closing braces. opens <- lapply(strsplit(parts, ""), function(v) which(v == "{")) closes <- lapply(strsplit(parts, ""), function(v) which(v == "}")) # Identify the positions within the parts of the matching closing braces. # These are the lengths of the placeholder matches. 
lengths <- mapply(match_brace, opens, closes) # Update the `starts` match data with the attr(starts, "match.length") <- lengths # Return both the indices (regex match data) and the actual placeholder # matches (as strings.) list(indices = starts, matches = mapply(substr, starts, starts + lengths - 1, x = string)) } #' Evaluate String Interpolation Matches #' #' The expression part of string interpolation matches are evaluated in a #' specified environment and formatted for replacement in the original string. #' Used internally by [str_interp()]. #' #' @param matches Match data #' #' @param env The environment in which to evaluate the expressions. #' #' @return A character vector of replacement strings. #' #' @noRd #' @author Stefan Milton Bache eval_interp_matches <- function(matches, env) { # Extract expressions from the matches expressions <- extract_expressions(matches) # Evaluate them in the given environment values <- lapply(expressions, eval, envir = env, enclos = if (is.environment(env)) env else environment(env)) # Find the formats to be used formats <- extract_formats(matches) # Format the values and return. mapply(sprintf, formats, values, SIMPLIFY = FALSE) } #' Extract Expression Objects from String Interpolation Matches #' #' An interpolation match object will contain both its wrapping \code{${ }} part #' and possibly a format. This extracts the expression parts and parses them to #' prepare them for evaluation. #' #' @param matches Match data #' #' @return list of R expressions #' #' @noRd #' @author Stefan Milton Bache extract_expressions <- function(matches) { # Parse function for text argument as first argument. parse_text <- function(text) { tryCatch( parse(text = text), error = function(e) stop(conditionMessage(e), call. = FALSE) ) } # string representation of the expressions (without the possible formats). strings <- gsub("\\$(\\[.+?\\])?\\{", "", matches) # Remove the trailing closing brace and parse. 
lapply(substr(strings, 1L, nchar(strings) - 1), parse_text) } #' Extract String Interpolation Formats from Matched Placeholders #' #' An expression placeholder for string interpolation may optionally contain a #' format valid for [sprintf()]. This function will extract such or #' default to "s" the format for strings. #' #' @param matches Match data #' #' @return A character vector of format specifiers. #' #' @noRd #' @author Stefan Milton Bache extract_formats <- function(matches) { # Extract the optional format parts. formats <- gsub("\\$(\\[(.+?)\\])?.*", "\\2", matches) # Use string options "s" as default when not specified. paste0("%", ifelse(formats == "", "s", formats)) } #' Utility Function for Matching a Closing Brace #' #' Given positions of opening and closing braces `match_brace` identifies #' the closing brace matching the first opening brace. #' #' @param opening integer: Vector with positions of opening braces. #' #' @param closing integer: Vector with positions of closing braces. #' #' @return Integer with the posision of the matching brace. #' #' @noRd #' @author Stefan Milton Bache match_brace <- function(opening, closing) { # maximum index for the matching closing brace max_close <- max(closing) # "path" for mapping opening and closing breaces path <- numeric(max_close) # Set openings to 1, and closings to -1 path[opening[opening < max_close]] <- 1 path[closing] <- -1 # Cumulate the path ... cumpath <- cumsum(path) # ... and the first 0 after the first opening identifies the match. min(which(1:max_close > min(which(cumpath == 1)) & cumpath == 0)) } stringr/R/pad.r0000644000176200001440000000212113274402311013063 0ustar liggesusers#' Pad a string. #' #' Vectorised over `string`, `width` and `pad`. #' #' @param string A character vector. #' @param width Minimum width of padded strings. #' @param side Side on which padding character is added (left, right or both). #' @param pad Single padding character (default is a space). 
#' @return A character vector. #' @seealso [str_trim()] to remove whitespace; #' [str_trunc()] to decrease the maximum width of a string. #' @export #' @examples #' rbind( #' str_pad("hadley", 30, "left"), #' str_pad("hadley", 30, "right"), #' str_pad("hadley", 30, "both") #' ) #' #' # All arguments are vectorised except side #' str_pad(c("a", "abc", "abcdef"), 10) #' str_pad("a", c(5, 10, 20)) #' str_pad("a", 10, pad = c("-", "_", " ")) #' #' # Longer strings are returned unchanged #' str_pad("hadley", 3) str_pad <- function(string, width, side = c("left", "right", "both"), pad = " ") { side <- match.arg(side) switch(side, left = stri_pad_left(string, width, pad = pad), right = stri_pad_right(string, width, pad = pad), both = stri_pad_both(string, width, pad = pad) ) } stringr/R/detect.r0000644000176200001440000000761013427104326013604 0ustar liggesusers#' Detect the presence or absence of a pattern in a string. #' #' Vectorised over `string` and `pattern`. #' Equivalent to `grepl(pattern, x)`. #' See [str_which()] for an equivalent to `grep(pattern, x)`. #' #' @param string Input vector. Either a character vector, or something #' coercible to one. #' @param pattern Pattern to look for. #' #' The default interpretation is a regular expression, as described #' in [stringi::stringi-search-regex]. Control options with #' [regex()]. #' #' Match a fixed string (i.e. by comparing only bytes), using #' [fixed()]. This is fast, but approximate. Generally, #' for matching human text, you'll want [coll()] which #' respects character matching rules for the specified locale. #' #' Match character, word, line and sentence boundaries with #' [boundary()]. An empty pattern, "", is equivalent to #' `boundary("character")`. #' #' @param negate If `TRUE`, return non-matching elements. #' @return A logical vector. 
#' @seealso [stringi::stri_detect()] which this function wraps, #' [str_subset()] for a convenient wrapper around #' `x[str_detect(x, pattern)]` #' @export #' @examples #' fruit <- c("apple", "banana", "pear", "pinapple") #' str_detect(fruit, "a") #' str_detect(fruit, "^a") #' str_detect(fruit, "a$") #' str_detect(fruit, "b") #' str_detect(fruit, "[aeiou]") #' #' # Also vectorised over pattern #' str_detect("aecfg", letters) #' #' # Returns TRUE if the pattern do NOT match #' str_detect(fruit, "^p", negate = TRUE) str_detect <- function(string, pattern, negate = FALSE) { switch(type(pattern), empty = , bound = str_count(string, pattern) > 0 & !negate, fixed = stri_detect_fixed(string, pattern, negate = negate, opts_fixed = opts(pattern)), coll = stri_detect_coll(string, pattern, negate = negate, opts_collator = opts(pattern)), regex = stri_detect_regex(string, pattern, negate = negate, opts_regex = opts(pattern)) ) } #' Detect the presence or absence of a pattern at the beginning or end of a #' string. #' #' Vectorised over `string` and `pattern`. #' #' @inheritParams str_detect #' @param pattern Pattern with which the string starts or ends. #' #' The default interpretation is a regular expression, as described in #' [stringi::stringi-search-regex]. Control options with [regex()]. #' #' Match a fixed string (i.e. by comparing only bytes), using [fixed()]. This #' is fast, but approximate. Generally, for matching human text, you'll want #' [coll()] which respects character matching rules for the specified locale. #' #' @return A logical vector. #' @seealso [str_detect()] which this function wraps when pattern is regex. 
#' @export #' @examples #' fruit <- c("apple", "banana", "pear", "pinapple") #' str_starts(fruit, "p") #' str_starts(fruit, "p", negate = TRUE) #' str_ends(fruit, "e") #' str_ends(fruit, "e", negate = TRUE) str_starts <- function(string, pattern, negate = FALSE) { switch( type(pattern), empty = , bound = stop("boundary() patterns are not supported."), fixed = stri_startswith_fixed(string, pattern, negate = negate, opts_fixed = opts(pattern)), coll = stri_startswith_coll(string, pattern, negate = negate, opts_collator = opts(pattern)), regex = { pattern2 <- paste0("^", pattern) attributes(pattern2) <- attributes(pattern) str_detect(string, pattern2, negate) } ) } #' @rdname str_starts #' @export str_ends <- function(string, pattern, negate = FALSE) { switch(type(pattern), empty = , bound = stop("boundary() patterns are not supported."), fixed = stri_endswith_fixed(string, pattern, negate = negate, opts_fixed = opts(pattern)), coll = stri_endswith_coll(string, pattern, negate = negate, opts_collator = opts(pattern)), regex = { pattern2 <- paste0(pattern, "$") attributes(pattern2) <- attributes(pattern) str_detect(string, pattern2, negate) } ) } stringr/R/match.r0000644000176200001440000000332113202620365013420 0ustar liggesusers#' Extract matched groups from a string. #' #' Vectorised over `string` and `pattern`. #' #' @inheritParams str_detect #' @param pattern Pattern to look for, as defined by an ICU regular #' expression. See [stringi::stringi-search-regex] for more details. #' @return For `str_match`, a character matrix. First column is the #' complete match, followed by one column for each capture group. #' For `str_match_all`, a list of character matrices. #' #' @seealso [str_extract()] to extract the complete match, #' [stringi::stri_match()] for the underlying #' implementation. 
#' @export #' @examples #' strings <- c(" 219 733 8965", "329-293-8753 ", "banana", "595 794 7569", #' "387 287 6718", "apple", "233.398.9187 ", "482 952 3315", #' "239 923 8115 and 842 566 4692", "Work: 579-499-7527", "$1000", #' "Home: 543.355.3679") #' phone <- "([2-9][0-9]{2})[- .]([0-9]{3})[- .]([0-9]{4})" #' #' str_extract(strings, phone) #' str_match(strings, phone) #' #' # Extract/match all #' str_extract_all(strings, phone) #' str_match_all(strings, phone) #' #' x <- c(" ", " <>", "", "", NA) #' str_match(x, "<(.*?)> <(.*?)>") #' str_match_all(x, "<(.*?)>") #' #' str_extract(x, "<.*?>") #' str_extract_all(x, "<.*?>") str_match <- function(string, pattern) { if (type(pattern) != "regex") { stop("Can only match regular expressions", call. = FALSE) } stri_match_first_regex(string, pattern, opts_regex = opts(pattern) ) } #' @rdname str_match #' @export str_match_all <- function(string, pattern) { if (type(pattern) != "regex") { stop("Can only match regular expressions", call. = FALSE) } stri_match_all_regex(string, pattern, omit_no_match = TRUE, opts_regex = opts(pattern) ) } stringr/R/view.R0000644000176200001440000000554213427104326013250 0ustar liggesusers#' View HTML rendering of regular expression match. #' #' `str_view` shows the first match; `str_view_all` shows all #' the matches. To build regular expressions interactively, check out the #' [RegExplain RStudio addin](https://www.garrickadenbuie.com/project/regexplain/). #' #' @inheritParams str_detect #' @param match If `TRUE`, shows only strings that match the pattern. #' If `FALSE`, shows only the strings that don't match the pattern. #' Otherwise (the default, `NA`) displays both matches and non-matches. 
#' @export #' @examples #' str_view(c("abc", "def", "fgh"), "[aeiou]") #' str_view(c("abc", "def", "fgh"), "^") #' str_view(c("abc", "def", "fgh"), "..") #' #' # Show all matches with str_view_all #' str_view_all(c("abc", "def", "fgh"), "d|e") #' #' # Use match to control what is shown #' str_view(c("abc", "def", "fgh"), "d|e") #' str_view(c("abc", "def", "fgh"), "d|e", match = TRUE) #' str_view(c("abc", "def", "fgh"), "d|e", match = FALSE) str_view <- function(string, pattern, match = NA) { if (identical(match, TRUE)) { string <- string[str_detect(string, pattern)] } else if (identical(match, FALSE)) { string <- string[!str_detect(string, pattern)] } loc <- str_locate(string, pattern) # How to do escaping? Need to update x and loc has_match <- !is.na(loc[, "start"]) str_sub(string[has_match], loc[has_match, , drop = FALSE]) <- paste0("", str_sub(string[has_match], loc[has_match, , drop = FALSE]), "") str_view_widget(string) } #' @rdname str_view #' @export str_view_all <- function(string, pattern, match = NA) { if (identical(match, TRUE)) { string <- string[str_detect(string, pattern)] } else if (identical(match, FALSE)) { string <- string[!str_detect(string, pattern)] } loc <- str_locate_all(string, pattern) string_list <- Map(loc = loc, string = string, function(loc, string) { if (nrow(loc) == 0) return(string) for (i in rev(seq_len(nrow(loc)))) { str_sub(string, loc[i, , drop = FALSE]) <- paste0("", str_sub(string, loc[i, , drop = FALSE]), "") } string }) string <- unlist(string_list) str_view_widget(string) } str_view_widget <- function(lines) { lines <- str_replace_na(lines) bullets <- str_c( "
    \n", str_c("
  • ", lines, "
  • ", collapse = "\n"), "\n
" ) html <- htmltools::HTML(bullets) if (!requireNamespace("htmlwidgets", quietly = TRUE)) { stop("htmlwidgets package required for str_view(). \nPlease install.packages(\"htmlwidgets\") to use this functionality.", call. = FALSE) } size <- htmlwidgets::sizingPolicy( knitr.figure = FALSE, defaultHeight = pmin(10 * length(lines), 300), knitr.defaultHeight = "100%" ) htmlwidgets::createWidget( "str_view", list(html = html), sizingPolicy = size, package = "stringr" ) } stringr/R/extract.r0000644000176200001440000000454213202620054013777 0ustar liggesusers#' Extract matching patterns from a string. #' #' Vectorised over `string` and `pattern`. #' #' @inheritParams str_detect #' @return A character vector. #' @seealso [str_match()] to extract matched groups; #' [stringi::stri_extract()] for the underlying implementation. #' @param simplify If `FALSE`, the default, returns a list of character #' vectors. If `TRUE` returns a character matrix. #' @export #' @examples #' shopping_list <- c("apples x4", "bag of flour", "bag of sugar", "milk x2") #' str_extract(shopping_list, "\\d") #' str_extract(shopping_list, "[a-z]+") #' str_extract(shopping_list, "[a-z]{1,4}") #' str_extract(shopping_list, "\\b[a-z]{1,4}\\b") #' #' # Extract all matches #' str_extract_all(shopping_list, "[a-z]+") #' str_extract_all(shopping_list, "\\b[a-z]+\\b") #' str_extract_all(shopping_list, "\\d") #' #' # Simplify results into character matrix #' str_extract_all(shopping_list, "\\b[a-z]+\\b", simplify = TRUE) #' str_extract_all(shopping_list, "\\d", simplify = TRUE) #' #' # Extract all words #' str_extract_all("This is, suprisingly, a sentence.", boundary("word")) str_extract <- function(string, pattern) { switch(type(pattern), empty = stri_extract_first_boundaries(string, pattern, opts_brkiter = opts(pattern)), bound = stri_extract_first_boundaries(string, pattern, opts_brkiter = opts(pattern)), fixed = stri_extract_first_fixed(string, pattern, opts_fixed = opts(pattern)), coll = 
stri_extract_first_coll(string, pattern, opts_collator = opts(pattern)), regex = stri_extract_first_regex(string, pattern, opts_regex = opts(pattern)) ) } #' @rdname str_extract #' @export str_extract_all <- function(string, pattern, simplify = FALSE) { switch(type(pattern), empty = stri_extract_all_boundaries(string, pattern, simplify = simplify, omit_no_match = TRUE, opts_brkiter = opts(pattern)), bound = stri_extract_all_boundaries(string, pattern, simplify = simplify, omit_no_match = TRUE, opts_brkiter = opts(pattern)), fixed = stri_extract_all_fixed(string, pattern, simplify = simplify, omit_no_match = TRUE, opts_fixed = opts(pattern)), coll = stri_extract_all_coll(string, pattern, simplify = simplify, omit_no_match = TRUE, opts_collator = opts(pattern)), regex = stri_extract_all_regex(string, pattern, simplify = simplify, omit_no_match = TRUE, opts_regex = opts(pattern)) ) } stringr/R/replace.r0000644000176200001440000001344613274374157013766 0ustar liggesusers#' Replace matched patterns in a string. #' #' Vectorised over `string`, `pattern` and `replacement`. #' #' @inheritParams str_detect #' @param pattern Pattern to look for. #' #' The default interpretation is a regular expression, as described #' in [stringi::stringi-search-regex]. Control options with #' [regex()]. #' #' Match a fixed string (i.e. by comparing only bytes), using #' [fixed()]. This is fast, but approximate. Generally, #' for matching human text, you'll want [coll()] which #' respects character matching rules for the specified locale. #' @param replacement A character vector of replacements. Should be either #' length one, or the same length as `string` or `pattern`. #' References of the form `\1`, `\2`, etc will be replaced with #' the contents of the respective matched group (created by `()`). #' #' To perform multiple replacements in each element of `string`, #' pass a named vector (`c(pattern1 = replacement1)`) to #' `str_replace_all`. 
Alternatively, pass a function to #' `replacement`: it will be called once for each match and its #' return value will be used to replace the match. #' #' To replace the complete string with `NA`, use #' `replacement = NA_character_`. #' @return A character vector. #' @seealso [str_replace_na()] to turn missing values into "NA"; #' [stri_replace()] for the underlying implementation. #' @export #' @examples #' fruits <- c("one apple", "two pears", "three bananas") #' str_replace(fruits, "[aeiou]", "-") #' str_replace_all(fruits, "[aeiou]", "-") #' str_replace_all(fruits, "[aeiou]", toupper) #' str_replace_all(fruits, "b", NA_character_) #' #' str_replace(fruits, "([aeiou])", "") #' str_replace(fruits, "([aeiou])", "\\1\\1") #' str_replace(fruits, "[aeiou]", c("1", "2", "3")) #' str_replace(fruits, c("a", "e", "i"), "-") #' #' # If you want to apply multiple patterns and replacements to the same #' # string, pass a named vector to pattern. #' fruits %>% #' str_c(collapse = "---") %>% #' str_replace_all(c("one" = "1", "two" = "2", "three" = "3")) #' #' # Use a function for more sophisticated replacement. This example #' # replaces colour names with their hex values. #' colours <- str_c("\\b", colors(), "\\b", collapse="|") #' col2hex <- function(col) { #' rgb <- col2rgb(col) #' rgb(rgb["red", ], rgb["green", ], rgb["blue", ], max = 255) #' } #' #' x <- c( #' "Roses are red, violets are blue", #' "My favourite colour is green" #' ) #' str_replace_all(x, colours, col2hex) str_replace <- function(string, pattern, replacement) { if (!missing(replacement) && is.function(replacement)) { return(str_transform(string, pattern, replacement)) } switch(type(pattern), empty = stop("Empty `pattern` not supported", call. = FALSE), bound = stop("Boundary `pattern` not supported", call. 
= FALSE), fixed = stri_replace_first_fixed(string, pattern, replacement, opts_fixed = opts(pattern)), coll = stri_replace_first_coll(string, pattern, replacement, opts_collator = opts(pattern)), regex = stri_replace_first_regex(string, pattern, fix_replacement(replacement), opts_regex = opts(pattern)) ) } #' @export #' @rdname str_replace str_replace_all <- function(string, pattern, replacement) { if (!missing(replacement) && is.function(replacement)) { return(str_transform_all(string, pattern, replacement)) } if (!is.null(names(pattern))) { vec <- FALSE replacement <- unname(pattern) pattern[] <- names(pattern) } else { vec <- TRUE } switch(type(pattern), empty = stop("Empty `pattern`` not supported", call. = FALSE), bound = stop("Boundary `pattern` not supported", call. = FALSE), fixed = stri_replace_all_fixed(string, pattern, replacement, vectorize_all = vec, opts_fixed = opts(pattern)), coll = stri_replace_all_coll(string, pattern, replacement, vectorize_all = vec, opts_collator = opts(pattern)), regex = stri_replace_all_regex(string, pattern, fix_replacement(replacement), vectorize_all = vec, opts_regex = opts(pattern)) ) } fix_replacement <- function(x) { if (!is.character(x)) { stop("`replacement` must be a character vector", call. 
= FALSE) } vapply(x, fix_replacement_one, character(1), USE.NAMES = FALSE) } fix_replacement_one <- function(x) { if (is.na(x)) { return(x) } chars <- str_split(x, "")[[1]] out <- character(length(chars)) escaped <- logical(length(chars)) in_escape <- FALSE for (i in seq_along(chars)) { escaped[[i]] <- in_escape char <- chars[[i]] if (in_escape) { # Escape character not printed previously so must include here if (char == "$") { out[[i]] <- "\\\\$" } else if (char >= "0" && char <= "9") { out[[i]] <- paste0("$", char) } else { out[[i]] <- paste0("\\", char) } in_escape <- FALSE } else { if (char == "$") { out[[i]] <- "\\$" } else if (char == "\\") { in_escape <- TRUE } else { out[[i]] <- char } } } # tibble::tibble(chars, out, escaped) paste0(out, collapse = "") } #' Turn NA into "NA" #' #' @inheritParams str_replace #' @param replacement A single string. #' @export #' @examples #' str_replace_na(c(NA, "abc", "def")) str_replace_na <- function(string, replacement = "NA") { stri_replace_na(string, replacement) } str_transform <- function(string, pattern, replacement) { loc <- str_locate(string, pattern) str_sub(string, loc, omit_na = TRUE) <- replacement(str_sub(string, loc)) string } str_transform_all <- function(string, pattern, replacement) { locs <- str_locate_all(string, pattern) for (i in seq_along(string)) { for (j in rev(seq_len(nrow(locs[[i]])))) { loc <- locs[[i]] str_sub(string[[i]], loc[j, 1], loc[j, 2]) <- replacement(str_sub(string[[i]], loc[j, 1], loc[j, 2])) } } string } stringr/R/c.r0000644000176200001440000000417513427104326012561 0ustar liggesusers#' Join multiple strings into a single string. #' #' Joins two or more vectors element-wise into a single character vector, #' optionally inserting `sep` between input vectors. If `collapse` is not `NULL`, #' it will be inserted between elements of the result, returning a character #' vector of length 1. 
#' #' To understand how `str_c` works, you need to imagine that you are building up #' a matrix of strings. Each input argument forms a column, and is expanded to #' the length of the longest argument, using the usual recyling rules. The #' `sep` string is inserted between each column. If collapse is `NULL` each row #' is collapsed into a single string. If non-`NULL` that string is inserted at #' the end of each row, and the entire matrix collapsed to a single string. #' #' @param ... One or more character vectors. Zero length arguments #' are removed. Short arguments are recycled to the length of the #' longest. #' #' Like most other R functions, missing values are "infectious": whenever #' a missing value is combined with another string the result will always #' be missing. Use [str_replace_na()] to convert `NA` to #' `"NA"` #' @param sep String to insert between input vectors. #' @param collapse Optional string used to combine input vectors into single #' string. #' @return If `collapse = NULL` (the default) a character vector with #' length equal to the longest input string. If `collapse` is #' non-NULL, a character vector of length 1. 
#' @seealso [paste()] for equivalent base R functionality, and #' [stringi::stri_join()] which this function wraps #' @export str_c #' @examples #' str_c("Letter: ", letters) #' str_c("Letter", letters, sep = ": ") #' str_c(letters, " is for", "...") #' str_c(letters[-26], " comes before ", letters[-1]) #' #' str_c(letters, collapse = "") #' str_c(letters, collapse = ", ") #' #' # Missing inputs give missing outputs #' str_c(c("a", NA, "b"), "-d") #' # Use str_replace_NA to display literal NAs: #' str_c(str_replace_na(c("a", NA, "b")), "-d") #' @import stringi str_c <- function(..., sep = "", collapse = NULL) { stri_c(..., sep = sep, collapse = collapse, ignore_null = TRUE) } stringr/vignettes/0000755000176200001440000000000013427574707013772 5ustar liggesusersstringr/vignettes/releases/0000755000176200001440000000000013274371075015566 5ustar liggesusersstringr/vignettes/releases/stringr-1.1.0.Rmd0000644000176200001440000000210113074137667020354 0ustar liggesusers--- title: "stringr 1.1.0" date: "2016-08-24" --- ```{r, echo = FALSE} knitr::opts_chunk$set(comment = "#>", collapse = T) library(stringr) ``` This release is mostly bug fixes, but there are a couple of new features you might care out. * There are three new datasets, `fruit`, `words` and `sentences`, to help you practice your regular expression skills: ```{r} str_subset(fruit, "(..)\\1") head(words) sentences[1] ``` * More functions work with `boundary()`: `str_detect()` and `str_subset()` can detect boundaries, and `str_extract()` and `str_extract_all()` pull out the components between boundaries. This is particularly useful if you want to extract logical constructs like words or sentences. ```{r} x <- "This is harder than you might expect, e.g. punctuation!" x %>% str_extract_all(boundary("word")) %>% .[[1]] x %>% str_extract(boundary("sentence")) ``` * `str_view()` and `str_view_all()` create HTML widgets that display regular expression matches. This is particularly useful for teaching. 
stringr/vignettes/releases/stringr-1.2.0.Rmd0000644000176200001440000000353713274371075020366 0ustar liggesusers--- title: "stringr 1.2.0" date: "2017-02-18" --- ```{r setup, include = TRUE} knitr::opts_chunk$set(comment = "#>", collapse = T) library(stringr) ``` ## API changes This release includes a change to the API: `str_match_all()` now returns NA if an optional group doesn't match (previously it returned ""). This is more consistent with `str_match()` and other match failures. ```{r} x <- c("a=1,b=2", "c=3", "d=") x %>% str_match("(.)=(\\d)?") x %>% str_match_all("(.)=(\\d)?,?") ``` ## New features There are three new features: * In `str_replace()`, `replacement` can now be a function that is called once for each match and who's return value is used to replace the match. ```{r} redact <- function(x) str_dup("-", str_length(x)) x <- c("It cost $500", "We spent $1,200 on stickers") x %>% str_replace_all("\\$[0-9,]+", redact) ``` * New `str_which()` mimics `grep()`: ```{r} fruit <- c("apple", "banana", "pear", "pinapple") # Matching positions str_which(fruit, "p") # Matching values str_subset(fruit, "p") ``` * A new vignette (`vignette("regular-expressions")`) describes the details of the regular expressions supported by stringr. The main vignette (`vignette("stringr")`) has been updated to give a high-level overview of the package. ## Minor improvements and bug fixes There were three other minor improvements and bug fixes: * `str_order()` and `str_sort()` gain explicit `numeric` argument for sorting mixed numbers and strings. * `str_replace_all()` now throws an error if `replacement` is not a character vector. If `replacement` is `NA_character_` it replaces the complete string with `NA`. * All functions that take a locale (e.g. `str_to_lower()` and `str_sort()`) default to "en" (English) to ensure that the default is consistent across platforms. 
stringr/vignettes/releases/stringr-1.0.0.Rmd0000644000176200001440000000570013074137635020356 0ustar liggesusers--- title: "stringr 1.0.0" date: "2015-05-05" --- ```{r, echo = FALSE} knitr::opts_chunk$set(comment = "#>", collapse = T) library(stringr) ``` I'm very excited to announce the 1.0.0 release of the stringr package. If you haven't heard of stringr before, it makes string manipulation easier by: * Using consistent function and argument names: all functions start with `str_`, and the first argument is always the input string This makes stringr easier to learn and easy to use with [the pipe](http://github.com/smbache/magrittr/). * Eliminating options that you don't need 95% of the time. To get started with stringr, check out the [new vignette](http://cran.r-project.org/web/packages/stringr/vignettes/stringr.html). ## What's new? The biggest change in this release is that stringr is now powered by the [stringi](https://github.com/Rexamine/stringi) package instead of base R. This has two big benefits: stringr is now much faster, and has much better unicode support. If you've used stringi before, you might wonder why stringr is still necessary: stringi does everything that stringr does, and much much more. There are two reasons that I think stringr is still important: 1. Lots of people use it already, so this update will give many people a performance boost for free. 1. The smaller API of stringr makes it a little easier to learn. That said, once you've learned stringr, using stringi should be easy, so it's a great place to start if you need a tool that doesn't exist in stringr. 
## New features and functions * `str_replace_all()` gains a convenient syntax for applying multiple pairs of pattern and replacement to the same vector: ```{r} x <- c("abc", "def") str_replace_all(x, c("[ad]" = "!", "[cf]" = "?")) ``` * `str_subset()` keeps values that match a pattern: ```{r} x <- c("abc", "def", "jhi", "klm", "nop") str_subset(x, "[aeiou]") ``` * `str_order()` and `str_sort()` sort and order strings in a specified locale. `str_conv()` to converts strings from specified encoding to UTF-8. ```{r} # The vowels come before the consonants in Hawaiian str_sort(letters[1:10], locale = "haw") ``` * New modifier `boundary()` allows you to count, locate and split by character, word, line and sentence boundaries. ```{r} words <- c("These are some words. Some more words.") str_count(words, boundary("word")) str_split(words, boundary("word")) ``` There were two minor changes to make stringr a little more consistent: * `str_c()` now returns a zero length vector if any of its inputs are zero length vectors. This is consistent with all other functions, and standard R recycling rules. Similarly, using `str_c("x", NA)` now yields `NA`. If you want `"xNA"`, use `str_replace_na()` on the inputs. * `str_match()` now returns NA if an optional group doesn't match (previously it returned ""). This is more consistent with `str_extract()` and other match failures. stringr/vignettes/stringr.Rmd0000644000176200001440000002247513341254174016124 0ustar liggesusers--- title: "Introduction to stringr" output: rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{Introduction to stringr} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- ```{r, include = FALSE} library(stringr) knitr::opts_chunk$set( comment = "#>", collapse = TRUE ) ``` There are four main families of functions in stringr: 1. Character manipulation: these functions allow you to manipulate individual characters within the strings in character vectors. 1. 
Whitespace tools to add, remove, and manipulate whitespace. 1. Locale sensitive operations whose operations will vary from locale to locale. 1. Pattern matching functions. These recognise four engines of pattern description. The most common is regular expressions, but there are three other tools. ## Getting and setting individual characters You can get the length of the string with `str_length()`: ```{r} str_length("abc") ``` This is now equivalent to the base R function `nchar()`. Previously it was needed to work around issues with `nchar()` such as the fact that it returned 2 for `nchar(NA)`. This has been fixed as of R 3.3.0, so it is no longer so important. You can access individual character using `str_sub()`. It takes three arguments: a character vector, a `start` position and an `end` position. Either position can either be a positive integer, which counts from the left, or a negative integer which counts from the right. The positions are inclusive, and if longer than the string, will be silently truncated. ```{r} x <- c("abcdef", "ghifjk") # The 3rd letter str_sub(x, 3, 3) # The 2nd to 2nd-to-last character str_sub(x, 2, -2) ``` You can also use `str_sub()` to modify strings: ```{r} str_sub(x, 3, 3) <- "X" x ``` To duplicate individual strings, you can use `str_dup()`: ```{r} str_dup(x, c(2, 3)) ``` ## Whitespace Three functions add, remove, or modify whitespace: 1. `str_pad()` pads a string to a fixed length by adding extra whitespace on the left, right, or both sides. ```{r} x <- c("abc", "defghi") str_pad(x, 10) # default pads on left str_pad(x, 10, "both") ``` (You can pad with other characters by using the `pad` argument.) `str_pad()` will never make a string shorter: ```{r} str_pad(x, 4) ``` So if you want to ensure that all strings are the same length (often useful for print methods), combine `str_pad()` and `str_trunc()`: ```{r} x <- c("Short", "This is a long string") x %>% str_trunc(10) %>% str_pad(10, "right") ``` 1. 
The opposite of `str_pad()` is `str_trim()`, which removes leading and trailing whitespace: ```{r} x <- c(" a ", "b ", " c") str_trim(x) str_trim(x, "left") ``` 1. You can use `str_wrap()` to modify existing whitespace in order to wrap a paragraph of text, such that the length of each line is as similar as possible. ```{r} jabberwocky <- str_c( "`Twas brillig, and the slithy toves ", "did gyre and gimble in the wabe: ", "All mimsy were the borogoves, ", "and the mome raths outgrabe. " ) cat(str_wrap(jabberwocky, width = 40)) ``` ## Locale sensitive A handful of stringr functions are locale-sensitive: they will perform differently in different regions of the world. These functions are case transformation functions: ```{r} x <- "I like horses." str_to_upper(x) str_to_title(x) str_to_lower(x) # Turkish has two sorts of i: with and without the dot str_to_lower(x, "tr") ``` String ordering and sorting: ```{r} x <- c("y", "i", "k") str_order(x) str_sort(x) # In Lithuanian, y comes between i and k str_sort(x, locale = "lt") ``` The locale always defaults to English to ensure that the default behaviour is identical across systems. Locales always include a two letter ISO-639-1 language code (like "en" for English or "zh" for Chinese), and optionally a ISO-3166 country code (like "en_UK" vs "en_US"). You can see a complete list of available locales by running `stringi::stri_locale_list()`. ## Pattern matching The vast majority of stringr functions work with patterns. These are parameterised by the task they perform and the types of patterns they match. ### Tasks Each pattern matching function has the same first two arguments, a character vector of `string`s to process and a single `pattern` to match. stringr provides pattern matching functions to **detect**, **locate**, **extract**, **match**, **replace**, and **split** strings. 
I'll illustrate how they work with some strings and a regular expression designed to match (US) phone numbers: ```{r} strings <- c( "apple", "219 733 8965", "329-293-8753", "Work: 579-499-7527; Home: 543.355.3679" ) phone <- "([2-9][0-9]{2})[- .]([0-9]{3})[- .]([0-9]{4})" ``` - `str_detect()` detects the presence or absence of a pattern and returns a logical vector (similar to `grepl()`). `str_subset()` returns the elements of a character vector that match a regular expression (similar to `grep()` with `value = TRUE`)`. ```{r} # Which strings contain phone numbers? str_detect(strings, phone) str_subset(strings, phone) ``` - `str_count()` counts the number of matches: ```{r} # How many phone numbers in each string? str_count(strings, phone) ``` - `str_locate()` locates the **first** position of a pattern and returns a numeric matrix with columns start and end. `str_locate_all()` locates all matches, returning a list of numeric matrices. Similar to `regexpr()` and `gregexpr()`. ```{r} # Where in the string is the phone number located? (loc <- str_locate(strings, phone)) str_locate_all(strings, phone) ``` - `str_extract()` extracts text corresponding to the **first** match, returning a character vector. `str_extract_all()` extracts all matches and returns a list of character vectors. ```{r} # What are the phone numbers? str_extract(strings, phone) str_extract_all(strings, phone) str_extract_all(strings, phone, simplify = TRUE) ``` - `str_match()` extracts capture groups formed by `()` from the **first** match. It returns a character matrix with one column for the complete match and one column for each group. `str_match_all()` extracts capture groups from all matches and returns a list of character matrices. Similar to `regmatches()`. ```{r} # Pull out the three components of the match str_match(strings, phone) str_match_all(strings, phone) ``` - `str_replace()` replaces the **first** matched pattern and returns a character vector. 
`str_replace_all()` replaces all matches. Similar to `sub()` and `gsub()`. ```{r} str_replace(strings, phone, "XXX-XXX-XXXX") str_replace_all(strings, phone, "XXX-XXX-XXXX") ``` - `str_split_fixed()` splits a string into a **fixed** number of pieces based on a pattern and returns a character matrix. `str_split()` splits a string into a **variable** number of pieces and returns a list of character vectors. ```{r} str_split("a-b-c", "-") str_split_fixed("a-b-c", "-", n = 2) ``` ### Engines There are four main engines that stringr can use to describe patterns: * Regular expressions, the default, as shown above, and described in `vignette("regular-expressions")`. * Fixed bytewise matching, with `fixed()`. * Locale-sensitive character matching, with `coll()` * Text boundary analysis with `boundary()`. #### Fixed matches `fixed(x)` only matches the exact sequence of bytes specified by `x`. This is a very limited "pattern", but the restriction can make matching much faster. Beware using `fixed()` with non-English data. It is problematic because there are often multiple ways of representing the same character. For example, there are two ways to define "á": either as a single character or as an "a" plus an accent: ```{r} a1 <- "\u00e1" a2 <- "a\u0301" c(a1, a2) a1 == a2 ``` They render identically, but because they're defined differently, `fixed()` doesn't find a match. Instead, you can use `coll()`, explained below, to respect human character comparison rules: ```{r} str_detect(a1, fixed(a2)) str_detect(a1, coll(a2)) ``` #### Collation search `coll(x)` looks for a match to `x` using human-language **coll**ation rules, and is particularly important if you want to do case insensitive matching. Collation rules differ around the world, so you'll also need to supply a `locale` parameter. ```{r} i <- c("I", "İ", "i", "ı") i str_subset(i, coll("i", ignore_case = TRUE)) str_subset(i, coll("i", ignore_case = TRUE, locale = "tr")) ``` The downside of `coll()` is speed. 
Because the rules for recognising which characters are the same are complicated, `coll()` is relatively slow compared to `regex()` and `fixed()`. Note that when both `fixed()` and `regex()` have `ignore_case` arguments, they perform a much simpler comparison than `coll()`. #### Boundary `boundary()` matches boundaries between characters, lines, sentences or words. It's most useful with `str_split()`, but can be used with all pattern matching functions: ```{r} x <- "This is a sentence." str_split(x, boundary("word")) str_count(x, boundary("word")) str_extract_all(x, boundary("word")) ``` By convention, `""` is treated as `boundary("character")`: ```{r} str_split(x, "") str_count(x, "") ``` stringr/vignettes/regular-expressions.Rmd0000644000176200001440000003433113341254174020447 0ustar liggesusers--- title: "Regular expressions" output: rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{Regular expressions} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- ```{r setup, include = FALSE} knitr::opts_chunk$set( collapse = TRUE, comment = "#>" ) library(stringr) ``` Regular expressions are a concise and flexible tool for describing patterns in strings. This vignette describes the key features of stringr's regular expressions, as implemented by [stringi](https://github.com/gagolews/stringi). It is not a tutorial, so if you're unfamiliar regular expressions, I'd recommend starting at . If you want to master the details, I'd recommend reading the classic [_Mastering Regular Expressions_](https://amzn.com/0596528124) by Jeffrey E. F. Friedl. Regular expressions are the default pattern engine in stringr. 
That means when you use a pattern matching function with a bare string, it's equivalent to wrapping it in a call to `regex()`: ```{r, eval = FALSE} # The regular call: str_extract(fruit, "nana") # Is shorthand for str_extract(fruit, regex("nana")) ``` You will need to use `regex()` explicitly if you want to override the default options, as you'll see in examples below. ## Basic matches The simplest patterns match exact strings: ```{r} x <- c("apple", "banana", "pear") str_extract(x, "an") ``` You can perform a case-insensitive match using `ignore_case = TRUE`: ```{r} bananas <- c("banana", "Banana", "BANANA") str_detect(bananas, "banana") str_detect(bananas, regex("banana", ignore_case = TRUE)) ``` The next step up in complexity is `.`, which matches any character except a newline: ```{r} str_extract(x, ".a.") ``` You can allow `.` to match everything, including `\n`, by setting `dotall = TRUE`: ```{r} str_detect("\nX\n", ".X.") str_detect("\nX\n", regex(".X.", dotall = TRUE)) ``` ## Escaping If "`.`" matches any character, how do you match a literal "`.`"? You need to use an "escape" to tell the regular expression you want to match it exactly, not use its special behaviour. Like strings, regexps use the backslash, `\`, to escape special behaviour. So to match an `.`, you need the regexp `\.`. Unfortunately this creates a problem. We use strings to represent regular expressions, and `\` is also used as an escape symbol in strings. So to create the regular expression `\.` we need the string `"\\."`. ```{r} # To create the regular expression, we need \\ dot <- "\\." # But the expression itself only contains one: writeLines(dot) # And this tells R to look for an explicit . str_extract(c("abc", "a.c", "bef"), "a\\.c") ``` If `\` is used as an escape character in regular expressions, how do you match a literal `\`? Well you need to escape it, creating the regular expression `\\`. 
To create that regular expression, you need to use a string, which also needs to escape `\`. That means to match a literal `\` you need to write `"\\\\"` --- you need four backslashes to match one! ```{r} x <- "a\\b" writeLines(x) str_extract(x, "\\\\") ``` In this vignette, I use `\.` to denote the regular expression, and `"\\."` to denote the string that represents the regular expression. An alternative quoting mechanism is `\Q...\E`: all the characters in `...` are treated as exact matches. This is useful if you want to exactly match user input as part of a regular expression. ```{r} x <- c("a.b.c.d", "aeb") starts_with <- "a.b" str_detect(x, paste0("^", starts_with)) str_detect(x, paste0("^\\Q", starts_with, "\\E")) ``` ## Special characters Escapes also allow you to specify individual characters that are otherwise hard to type. You can specify individual unicode characters in five ways, either as a variable number of hex digits (four is most common), or by name: * `\xhh`: 2 hex digits. * `\x{hhhh}`: 1-6 hex digits. * `\uhhhh`: 4 hex digits. * `\Uhhhhhhhh`: 8 hex digits. * `\N{name}`, e.g. `\N{grinning face}` matches the basic smiling emoji. Similarly, you can specify many common control characters: * `\a`: bell. * `\cX`: match a control-X character. * `\e`: escape (`\u001B`). * `\f`: form feed (`\u000C`). * `\n`: line feed (`\u000A`). * `\r`: carriage return (`\u000D`). * `\t`: horizontal tabulation (`\u0009`). * `\0ooo` match an octal character. 'ooo' is from one to three octal digits, from 000 to 0377. The leading zero is required. (Many of these are only of historical interest and are only included here for the sake of completeness.) ## Matching multiple characters There are a number of patterns that match more than one character. You've already seen `.`, which matches any character (except a newline). A closely related operator is `\X`, which matches a __grapheme cluster__, a set of individual elements that form a single symbol. 
For example, one way of representing "á" is as the letter "a" plus an accent: `.` will match the component "a", while `\X` will match the complete symbol: ```{r} x <- "a\u0301" str_extract(x, ".") str_extract(x, "\\X") ``` There are five other escaped pairs that match narrower classes of characters: * `\d`: matches any digit. The complement, `\D`, matches any character that is not a decimal digit. ```{r} str_extract_all("1 + 2 = 3", "\\d+")[[1]] ``` Technically, `\d` includes any character in the Unicode Category of Nd ("Number, Decimal Digit"), which also includes numeric symbols from other languages: ```{r} # Some Laotian numbers str_detect("១២៣", "\\d") ``` * `\s`: matches any whitespace. This includes tabs, newlines, form feeds, and any character in the Unicode Z Category (which includes a variety of space characters and other separators.). The complement, `\S`, matches any non-whitespace character. ```{r} (text <- "Some \t badly\n\t\tspaced \f text") str_replace_all(text, "\\s+", " ") ``` * `\p{property name}` matches any character with specific unicode property, like `\p{Uppercase}` or `\p{Diacritic}`. The complement, `\P{property name}`, matches all characters without the property. A complete list of unicode properties can be found at . ```{r} (text <- c('"Double quotes"', "«Guillemet»", "“Fancy quotes”")) str_replace_all(text, "\\p{quotation mark}", "'") ``` * `\w` matches any "word" character, which includes alphabetic characters, marks and decimal numbers. The complement, `\W`, matches any non-word character. ```{r} str_extract_all("Don't eat that!", "\\w+")[[1]] str_split("Don't eat that!", "\\W")[[1]] ``` Technically, `\w` also matches connector punctuation, `\u200c` (zero width connector), and `\u200d` (zero width joiner), but these are rarely seen in the wild. * `\b` matches word boundaries, the transition between word and non-word characters. `\B` matches the opposite: boundaries that have either both word or non-word characters on either side. 
```{r} str_replace_all("The quick brown fox", "\\b", "_") str_replace_all("The quick brown fox", "\\B", "_") ``` You can also create your own __character classes__ using `[]`: * `[abc]`: matches a, b, or c. * `[a-z]`: matches every character between a and z (in Unicode code point order). * `[^abc]`: matches anything except a, b, or c. * `[\^\-]`: matches `^` or `-`. There are a number of pre-built classes that you can use inside `[]`: * `[:punct:]`: punctuation. * `[:alpha:]`: letters. * `[:lower:]`: lowercase letters. * `[:upper:]`: upperclass letters. * `[:digit:]`: digits. * `[:xdigit:]`: hex digits. * `[:alnum:]`: letters and numbers. * `[:cntrl:]`: control characters. * `[:graph:]`: letters, numbers, and punctuation. * `[:print:]`: letters, numbers, punctuation, and whitespace. * `[:space:]`: space characters (basically equivalent to `\s`). * `[:blank:]`: space and tab. These all go inside the `[]` for character classes, i.e. `[[:digit:]AX]` matches all digits, A, and X. You can also using Unicode properties, like `[\p{Letter}]`, and various set operations, like `[\p{Letter}--\p{script=latin}]`. See `?"stringi-search-charclass"` for details. ## Alternation `|` is the __alternation__ operator, which will pick between one or more possible matches. For example, `abc|def` will match `abc` or `def`. ```{r} str_detect(c("abc", "def", "ghi"), "abc|def") ``` Note that the precedence for `|` is low, so that `abc|def` matches `abc` or `def` not `abcyz` or `abxyz`. ## Grouping You can use parentheses to override the default precedence rules: ```{r} str_extract(c("grey", "gray"), "gre|ay") str_extract(c("grey", "gray"), "gr(e|a)y") ``` Parenthesis also define "groups" that you can refer to with __backreferences__, like `\1`, `\2` etc, and can be extracted with `str_match()`. 
For example, the following regular expression finds all fruits that have a repeated pair of letters: ```{r} pattern <- "(..)\\1" fruit %>% str_subset(pattern) fruit %>% str_subset(pattern) %>% str_match(pattern) ``` You can use `(?:...)`, the non-grouping parentheses, to control precedence but not capture the match in a group. This is slightly more efficient than capturing parentheses. ```{r} str_match(c("grey", "gray"), "gr(e|a)y") str_match(c("grey", "gray"), "gr(?:e|a)y") ``` This is most useful for more complex cases where you need to capture matches and control precedence independently. ## Anchors By default, regular expressions will match any part of a string. It's often useful to __anchor__ the regular expression so that it matches from the start or end of the string: * `^` matches the start of string. * `$` matches the end of the string. ```{r} x <- c("apple", "banana", "pear") str_extract(x, "^a") str_extract(x, "a$") ``` To match a literal "$" or "^", you need to escape them, `\$`, and `\^`. For multiline strings, you can use `regex(multiline = TRUE)`. This changes the behaviour of `^` and `$`, and introduces three new operators: * `^` now matches the start of each line. * `$` now matches the end of each line. * `\A` matches the start of the input. * `\z` matches the end of the input. * `\Z` matches the end of the input, but before the final line terminator, if it exists. ```{r} x <- "Line 1\nLine 2\nLine 3\n" str_extract_all(x, "^Line..")[[1]] str_extract_all(x, regex("^Line..", multiline = TRUE))[[1]] str_extract_all(x, regex("\\ALine..", multiline = TRUE))[[1]] ``` ## Repetition You can control how many times a pattern matches with the repetition operators: * `?`: 0 or 1. * `+`: 1 or more. * `*`: 0 or more. 
```{r} x <- "1888 is the longest year in Roman numerals: MDCCCLXXXVIII" str_extract(x, "CC?") str_extract(x, "CC+") str_extract(x, 'C[LX]+') ``` Note that the precedence of these operators is high, so you can write: `colou?r` to match either American or British spellings. That means most uses will need parentheses, like `bana(na)+`. You can also specify the number of matches precisely: * `{n}`: exactly n * `{n,}`: n or more * `{n,m}`: between n and m ```{r} str_extract(x, "C{2}") str_extract(x, "C{2,}") str_extract(x, "C{2,3}") ``` By default these matches are "greedy": they will match the longest string possible. You can make them "lazy", matching the shortest string possible by putting a `?` after them: * `??`: 0 or 1, prefer 0. * `+?`: 1 or more, match as few times as possible. * `*?`: 0 or more, match as few times as possible. * `{n,}?`: n or more, match as few times as possible. * `{n,m}?`: between n and m, , match as few times as possible, but at least n. ```{r} str_extract(x, c("C{2,3}", "C{2,3}?")) str_extract(x, c("C[LX]+", "C[LX]+?")) ``` You can also make the matches possessive by putting a `+` after them, which means that if later parts of the match fail, the repetition will not be re-tried with a smaller number of characters. This is an advanced feature used to improve performance in worst-case scenarios (called "catastrophic backtracking"). * `?+`: 0 or 1, possessive. * `++`: 1 or more, possessive. * `*+`: 0 or more, possessive. * `{n}+`: exactly n, possessive. * `{n,}+`: n or more, possessive. * `{n,m}+`: between n and m, possessive. A related concept is the __atomic-match__ parenthesis, `(?>...)`. If a later match fails and the engine needs to back-track, an atomic match is kept as is: it succeeds or fails as a whole. Compare the following two regular expressions: ```{r} str_detect("ABC", "(?>A|.B)C") str_detect("ABC", "(?:A|.B)C") ``` The atomic match fails because it matches A, and then the next character is a C so it fails. 
The regular match succeeds because it matches A, but then C doesn't match, so it back-tracks and tries B instead. ## Look arounds These assertions look ahead or behind the current match without "consuming" any characters (i.e. changing the input position). * `(?=...)`: positive look-ahead assertion. Matches if `...` matches at the current input. * `(?!...)`: negative look-ahead assertion. Matches if `...` __does not__ match at the current input. * `(?<=...)`: positive look-behind assertion. Matches if `...` matches text preceding the current position, with the last character of the match being the character just before the current position. Length must be bounded (i.e. no `*` or `+`). * `(? # stringr
[![CRAN status](https://www.r-pkg.org/badges/version/stringr)](https://cran.r-project.org/package=stringr) [![Travis build status](https://travis-ci.org/tidyverse/stringr.svg?branch=master)](https://travis-ci.org/tidyverse/stringr) [![AppVeyor Build Status](https://ci.appveyor.com/api/projects/status/github/tidyverse/stringr?branch=master&svg=true)](https://ci.appveyor.com/project/tidyverse/stringr) [![Codecov test coverage](https://codecov.io/gh/tidyverse/stringr/branch/master/graph/badge.svg)](https://codecov.io/gh/tidyverse/stringr?branch=master) [![Lifecycle: stable](https://img.shields.io/badge/lifecycle-stable-brightgreen.svg)](https://www.tidyverse.org/lifecycle/#stable) ## Overview Strings are not glamorous, high-profile components of R, but they do play a big role in many data cleaning and preparation tasks. The stringr package provide a cohesive set of functions designed to make working with strings as easy as possible. If you’re not familiar with strings, the best place to start is the [chapter on strings](http://r4ds.had.co.nz/strings.html) in R for Data Science. stringr is built on top of [stringi](https://github.com/gagolews/stringi), which uses the [ICU](http://site.icu-project.org) C library to provide fast, correct implementations of common string manipulations. stringr focusses on the most important and commonly used string manipulation functions whereas stringi provides a comprehensive set covering almost anything you can imagine. If you find that stringr is missing a function that you need, try looking in stringi. Both packages share similar conventions, so once you’ve mastered stringr, you should find stringi similarly easy to use. 
## Installation ``` r # Install the released version from CRAN: install.packages("stringr") # Install the cutting edge development version from GitHub: # install.packages("devtools") devtools::install_github("tidyverse/stringr") ``` ## Cheatsheet ## Usage All functions in stringr start with `str_` and take a vector of strings as the first argument. ``` r x <- c("why", "video", "cross", "extra", "deal", "authority") str_length(x) #> [1] 3 5 5 5 4 9 str_c(x, collapse = ", ") #> [1] "why, video, cross, extra, deal, authority" str_sub(x, 1, 2) #> [1] "wh" "vi" "cr" "ex" "de" "au" ``` Most string functions work with regular expressions, a concise language for describing patterns of text. For example, the regular expression `"[aeiou]"` matches any single character that is a vowel: ``` r str_subset(x, "[aeiou]") #> [1] "video" "cross" "extra" "deal" "authority" str_count(x, "[aeiou]") #> [1] 0 3 1 2 2 4 ``` There are seven main verbs that work with patterns: - `str_detect(x, pattern)` tells you if there’s any match to the pattern. ``` r str_detect(x, "[aeiou]") #> [1] FALSE TRUE TRUE TRUE TRUE TRUE ``` - `str_count(x, pattern)` counts the number of patterns. ``` r str_count(x, "[aeiou]") #> [1] 0 3 1 2 2 4 ``` - `str_subset(x, pattern)` extracts the matching components. ``` r str_subset(x, "[aeiou]") #> [1] "video" "cross" "extra" "deal" "authority" ``` - `str_locate(x, pattern)` gives the position of the match. ``` r str_locate(x, "[aeiou]") #> start end #> [1,] NA NA #> [2,] 2 2 #> [3,] 3 3 #> [4,] 1 1 #> [5,] 2 2 #> [6,] 1 1 ``` - `str_extract(x, pattern)` extracts the text of the match. ``` r str_extract(x, "[aeiou]") #> [1] NA "i" "o" "e" "e" "a" ``` - `str_match(x, pattern)` extracts parts of the match defined by parentheses. 
``` r # extract the characters on either side of the vowel str_match(x, "(.)[aeiou](.)") #> [,1] [,2] [,3] #> [1,] NA NA NA #> [2,] "vid" "v" "d" #> [3,] "ros" "r" "s" #> [4,] NA NA NA #> [5,] "dea" "d" "a" #> [6,] "aut" "a" "t" ``` - `str_replace(x, pattern, replacement)` replaces the matches with new text. ``` r str_replace(x, "[aeiou]", "?") #> [1] "why" "v?deo" "cr?ss" "?xtra" "d?al" "?uthority" ``` - `str_split(x, pattern)` splits up a string into multiple pieces. ``` r str_split(c("a,b", "c,d,e"), ",") #> [[1]] #> [1] "a" "b" #> #> [[2]] #> [1] "c" "d" "e" ``` As well as regular expressions (the default), there are three other pattern matching engines: - `fixed()`: match exact bytes - `coll()`: match human letters - `boundary()`: match boundaries ## RStudio Addin The [RegExplain RStudio addin](https://www.garrickadenbuie.com/project/regexplain/) provides a friendly interface for working with regular expressions and functions from stringr. This addin allows you to interactively build your regexp, check the output of common string matching functions, consult the interactive help pages, or use the included resources to learn regular expressions. This addin can easily be installed with devtools: ``` r # install.packages("devtools") devtools::install_github("gadenbuie/regexplain") ``` ## Compared to base R R provides a solid set of string operations, but because they have grown organically over time, they can be inconsistent and a little hard to learn. Additionally, they lag behind the string operations in other programming languages, so that some things that are easy to do in languages like Ruby or Python are rather hard to do in R. - Uses consistent function and argument names. 
The first argument is always the vector of strings to modify, which makes stringr work particularly well in conjunction with the pipe: ``` r letters %>% .[1:10] %>% str_pad(3, "right") %>% str_c(letters[2:11]) #> [1] "a b" "b c" "c d" "d e" "e f" "f g" "g h" "h i" "i j" "j k" ``` - Simplifies string operations by eliminating options that you don’t need 95% of the time. - Produces outputs than can easily be used as inputs. This includes ensuring that missing inputs result in missing outputs, and zero length inputs result in zero length outputs. stringr/MD50000644000176200001440000001347113427716423012271 0ustar liggesusersf1be7aee4819b4767671e5bc8606e4f6 *DESCRIPTION b234ee4d69f5fce4486a80fdaf4a4263 *LICENSE 30eb6e853e082199431ecfaabc386016 *NAMESPACE 5880903268e299adc2ddb4d7724c5869 *NEWS.md 51398132d23aa4ce1115846674ec8a38 *R/c.r bfef1f92e0a2a49a8ef018a02e06c4c1 *R/case.R 20e75c1672eaa40373bbdea873641733 *R/conv.R d423543d3d2f6845c721505c9a024320 *R/count.r 716026feefecac9797d262bc61f4060a *R/data.R 2644b410c94c2fbfd223a6ea4a96900c *R/detect.r ca5f48cd48c4ca831104376e6bbd57c5 *R/dup.r 80bc6b1c219cf9c66bda38e4014278f8 *R/extract.r c531442725e5e3353471b1f81e0b399f *R/flatten.R 7445f931d846d60f74d1f2727a05b6d8 *R/glue.R 482b0bc26f2c5ec1869304099a725cd6 *R/interp.R 4042c049954895a0fbb8fc3706714bc8 *R/length.r 6a4c219f8f72d48ed3f0de657376feea *R/locate.r f2b9e9de01cd37d1d2f387549362c84d *R/match.r 978391d75d3b8846848b797848135981 *R/modifiers.r e5001591cf4127cc9c98fd6b76cb6192 *R/pad.r fb6888e6dd1cd75250bbd831b9bd00bc *R/remove.r 3348670071a0d8eb3f153056490120a4 *R/replace.r fd5e2c7ab2d7b8ec65485edef69b982a *R/sort.R 335af69767447f9a7e9e0f91c1020dfd *R/split.r b4fe12876c37f8a22cde06e9701eecf0 *R/stringr.R 47df4295e32a5d017bc536d5326d54a5 *R/sub.r cbaf25379a8482efa1eef1b431655214 *R/subset.R 44e3f9ed7cb0e731582c23f3a3215837 *R/trim.R 2cef0bb5f8d765fc094728aa6801be1c *R/trunc.R f583f5b5856f7cb5f2c5fbb04f39f8a8 *R/utils.R a0481fe990835444787dc782bcd030a3 *R/view.R 
5ed6815298ddf621046e1595e5111aa6 *R/word.r ede05ec68f460020acce7f054bfe6356 *R/wrap.r 5dfd782a4dfa3796ba55fd2e7cffff5a *README.md 5de96acf442face16796a0c28ef569de *build/vignette.rds 89f0d280160eb4419b23251639a728c2 *data/fruit.rda 7ad07be2e18f2b3459b55adc0c03c498 *data/sentences.rda c99f00d311e24c76bbeabfc8a58b4b50 *data/words.rda baeeda52353d93ad1530a71401b5b323 *inst/doc/regular-expressions.R 25843de7a8010e5a0cc92348841db6da *inst/doc/regular-expressions.Rmd fad49bb8db1547a4a3633c0387bc7c83 *inst/doc/regular-expressions.html aedecc290d9788158540a0911f057218 *inst/doc/stringr.R 2e6abe80c39713fdd5778e6276185408 *inst/doc/stringr.Rmd f2a58e820abeb8ba44a398eea41a2694 *inst/doc/stringr.html 0cce813b2f19d701b1f00d51d42902c1 *inst/htmlwidgets/lib/str_view.css e7c37a495d4ae965400eeb1000dee672 *inst/htmlwidgets/str_view.js 1763429826b7f9745d2e590e4ca4c119 *inst/htmlwidgets/str_view.yaml cfb7654d3c476934d01d29d74c3fcf21 *man/case.Rd 7090c91ac9cc554962bd49006e492c20 *man/figures/logo.png c835b166a235ab40556f275aa89835ac *man/invert_match.Rd ca429cf521e5b4f7827bc2fb5413c495 *man/modifiers.Rd a64a7ea44fcaa33c2d3ad0f7909cbc3e *man/pipe.Rd ee15dcc3fff5c1f0b0991f4618720014 *man/str_c.Rd 77aa731c9047fb922cfef821f9058b8c *man/str_conv.Rd d69c983b870889c27aba996bb1d741cf *man/str_count.Rd 684e054552eee4f4e9d5a3f1633224a6 *man/str_detect.Rd ff1b4f8ff391243b73b7c7d55fdc4570 *man/str_dup.Rd 893cb5a03c74bbda8848a8960f33b68d *man/str_extract.Rd 70687cf5ae8aec9e4961b84b35757729 *man/str_flatten.Rd df6342564d3c9414821c2eb18a582cf1 *man/str_glue.Rd 728bce509a51f27efc5f420eae301352 *man/str_interp.Rd 735dfe7018a5bdcb361c2a886fa4781c *man/str_length.Rd 3abf43a579f1ac816af29182eaa8aba3 *man/str_locate.Rd af448221204c0eea894b8feb13b48555 *man/str_match.Rd 838727f41707dfa2d08a84a5cc18e593 *man/str_order.Rd cd45b7527a1721aa3a7d4b684f2fe49e *man/str_pad.Rd 231338615cd693a517558a16abcb3be1 *man/str_remove.Rd 7ffb74f17d17ea1fefb140d9695a72e1 *man/str_replace.Rd 336d96a35dfaf71efed27c5d0e67e28d 
*man/str_replace_na.Rd b396e80f4c183b95d13424be26cc7058 *man/str_split.Rd f34c7eb93e3c572493386acd7ad8b6ba *man/str_starts.Rd ba09f96c144a3996ae7cfbfd875e778c *man/str_sub.Rd 7c7a3de84089d0f5c288494814d77213 *man/str_subset.Rd f074784c71f52fa385ce1350e0136ae3 *man/str_trim.Rd e5fd22cc488caade748e2d167bfe86dd *man/str_trunc.Rd eab18ee02aa20fd501013e16b782c47a *man/str_view.Rd d2c7dd48f20114652e17628b14a4266a *man/str_wrap.Rd 47b1ce113c0ee06888c55adead1a314e *man/stringr-data.Rd 4bd1d0bbe4dbc4ade056a1995a130549 *man/stringr-package.Rd 80c0945177ea911afede4a8d51b6d390 *man/word.Rd 4ee9d05bd4688270eca8d85299cedcd1 *tests/testthat.R 1e5a2a2ee2f814a6f17ca0ba09a6286f *tests/testthat/test-case.R a4b042c9c43f1e339699262ca6aeb5cd *tests/testthat/test-conv.R 61f9d77768cf9ff813d382f9337178fb *tests/testthat/test-count.r 21817848235fe61702b1a07edd9f6c6c *tests/testthat/test-detect.r 2b336162dd9c107511565144566cf233 *tests/testthat/test-dup.r 3c4271c37fc4b0bf497e7aa0deee19d6 *tests/testthat/test-extract.r 33338d5733d47b09c8664059a7e15e48 *tests/testthat/test-flatten.R b828cd93b173c342b1b29652c0a79886 *tests/testthat/test-glue.R 1c924b1585f9a84d509b051f55795968 *tests/testthat/test-interp.r 6fe2c7933ec6c863e808320c60567a5f *tests/testthat/test-join.r 6f525891e80befb684ff295d1b714f71 *tests/testthat/test-length.r b9c2324c1d46d0efdb1b5d448db09556 *tests/testthat/test-locate.r 06f360cd9e9e217e6862884751f42026 *tests/testthat/test-match.r cb91cdc6d45174b4165a252447c74dca *tests/testthat/test-modifiers.r 9d4f02d9e2458e9ad849a218a3bd9f5c *tests/testthat/test-pad.r f70c240de35db3bb194a670aeb8e067c *tests/testthat/test-remove.r ae9cb8ea84ce2739e45fe6a80ac8e95f *tests/testthat/test-replace.r f52112340250c09a60adc39334d12ef8 *tests/testthat/test-sort.R ed9fce46356a66a829054fd312dcec0d *tests/testthat/test-split.r 452e6c5e06861420a22ecb153766cdd3 *tests/testthat/test-sub.r a4ce3d45929e6ecf8b6ecbfd96a2bb59 *tests/testthat/test-subset.r 21e6cc8d6762a81bb4e306a96016cffe 
*tests/testthat/test-trim.r 498d337c6d7845771609d03a542878d8 *tests/testthat/test-trunc.r 64c1d6ac2d13c879372a3bd3e117ee46 *tests/testthat/test-view.R bf9e1f3e3b9adfb5157cf231126b8c47 *tests/testthat/test-word.r bd8b91e1198e96d16daa8d6378ff8f04 *tests/testthat/test-wrap.r 25843de7a8010e5a0cc92348841db6da *vignettes/regular-expressions.Rmd 83b44443e822faf60ab22b3f595fd21b *vignettes/releases/stringr-1.0.0.Rmd 291466d813823c3c461393d928a54fcc *vignettes/releases/stringr-1.1.0.Rmd 3e322b18ecb9e2c00530383777cd1054 *vignettes/releases/stringr-1.2.0.Rmd 2e6abe80c39713fdd5778e6276185408 *vignettes/stringr.Rmd stringr/build/0000755000176200001440000000000013427574670013060 5ustar liggesusersstringr/build/vignette.rds0000644000176200001440000000036113427574670015417 0ustar liggesusersuP 0M ċ^C&eRiM=d2&o!!&N&33kpxx/]šr`R& bK @FtV(~y_UĘ7X[/O:gzwkB=N3&Aw<8&38V2ذpF20f:KUUn($2(g@>Nw,stringr/DESCRIPTION0000644000176200001440000000236413427716423013466 0ustar liggesusersPackage: stringr Title: Simple, Consistent Wrappers for Common String Operations Version: 1.4.0 Authors@R: c(person(given = "Hadley", family = "Wickham", role = c("aut", "cre", "cph"), email = "hadley@rstudio.com"), person(given = "RStudio", role = c("cph", "fnd"))) Description: A consistent, simple and easy to use set of wrappers around the fantastic 'stringi' package. All function and argument names (and positions) are consistent, all functions deal with "NA"'s and zero length vectors in the same way, and the output from one function is easy to feed into the input of another. 
License: GPL-2 | file LICENSE URL: http://stringr.tidyverse.org, https://github.com/tidyverse/stringr BugReports: https://github.com/tidyverse/stringr/issues Depends: R (>= 3.1) Imports: glue (>= 1.2.0), magrittr, stringi (>= 1.1.7) Suggests: covr, htmltools, htmlwidgets, knitr, rmarkdown, testthat VignetteBuilder: knitr Encoding: UTF-8 LazyData: true RoxygenNote: 6.1.1 NeedsCompilation: no Packaged: 2019-02-09 16:03:19 UTC; hadley Author: Hadley Wickham [aut, cre, cph], RStudio [cph, fnd] Maintainer: Hadley Wickham Repository: CRAN Date/Publication: 2019-02-10 03:40:03 UTC stringr/man/0000755000176200001440000000000013427574674012540 5ustar liggesusersstringr/man/word.Rd0000644000176200001440000000221613202620061013750 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/word.r \name{word} \alias{word} \title{Extract words from a sentence.} \usage{ word(string, start = 1L, end = start, sep = fixed(" ")) } \arguments{ \item{string}{input character vector.} \item{start}{integer vector giving position of first word to extract. Defaults to first word. If negative, counts backwards from last character.} \item{end}{integer vector giving position of last word to extract. Defaults to first word. If negative, counts backwards from last character.} \item{sep}{separator between words. Defaults to single space.} } \value{ character vector of words from \code{start} to \code{end} (inclusive). Will be length of longest input argument. } \description{ Extract words from a sentence. 
} \examples{ sentences <- c("Jane saw a cat", "Jane sat down") word(sentences, 1) word(sentences, 2) word(sentences, -1) word(sentences, 2, -1) # Also vectorised over start and end word(sentences[1], 1:3, -1) word(sentences[1], 1, 1:4) # Can define words by other separators str <- 'abc.def..123.4568.999' word(str, 1, sep = fixed('..')) word(str, 2, sep = fixed('..')) } stringr/man/str_sub.Rd0000644000176200001440000000447313221261010014461 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/sub.r \name{str_sub} \alias{str_sub} \alias{str_sub<-} \title{Extract and replace substrings from a character vector.} \usage{ str_sub(string, start = 1L, end = -1L) str_sub(string, start = 1L, end = -1L, omit_na = FALSE) <- value } \arguments{ \item{string}{input character vector.} \item{start, end}{Two integer vectors. \code{start} gives the position of the first character (defaults to first), \code{end} gives the position of the last (defaults to last character). Alternatively, pass a two-column matrix to \code{start}. Negative values count backwards from the last character.} \item{omit_na}{Single logical value. If \code{TRUE}, missing values in any of the arguments provided will result in an unchanged input.} \item{value}{replacement string} } \value{ A character vector of substring from \code{start} to \code{end} (inclusive). Will be length of longest input argument. } \description{ \code{str_sub} will recycle all arguments to be the same length as the longest argument. If any arguments are of length 0, the output will be a zero length character vector. } \details{ Substrings are inclusive - they include the characters at both start and end positions. \code{str_sub(string, 1, -1)} will return the complete substring, from the first character to the last. 
} \examples{ hw <- "Hadley Wickham" str_sub(hw, 1, 6) str_sub(hw, end = 6) str_sub(hw, 8, 14) str_sub(hw, 8) str_sub(hw, c(1, 8), c(6, 14)) # Negative indices str_sub(hw, -1) str_sub(hw, -7) str_sub(hw, end = -7) # Alternatively, you can pass in a two colum matrix, as in the # output from str_locate_all pos <- str_locate_all(hw, "[aeio]")[[1]] str_sub(hw, pos) str_sub(hw, pos[, 1], pos[, 2]) # Vectorisation str_sub(hw, seq_len(str_length(hw))) str_sub(hw, end = seq_len(str_length(hw))) # Replacement form x <- "BBCDEF" str_sub(x, 1, 1) <- "A"; x str_sub(x, -1, -1) <- "K"; x str_sub(x, -2, -2) <- "GHIJ"; x str_sub(x, 2, -2) <- ""; x # If you want to keep the original if some argument is NA, # use omit_na = TRUE x1 <- x2 <- x3 <- x4 <- "AAA" str_sub(x1, 1, NA) <- "B" str_sub(x2, 1, 2) <- NA str_sub(x3, 1, NA, omit_na = TRUE) <- "B" str_sub(x4, 1, 2, omit_na = TRUE) <- NA x1; x2; x3; x4 } \seealso{ The underlying implementation in \code{\link[stringi:stri_sub]{stringi::stri_sub()}} } stringr/man/str_length.Rd0000644000176200001440000000224013202620061015143 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/length.r \name{str_length} \alias{str_length} \title{The length of a string.} \usage{ str_length(string) } \arguments{ \item{string}{Input vector. Either a character vector, or something coercible to one.} } \value{ A numeric vector giving number of characters (code points) in each element of the character vector. Missing string have missing length. } \description{ Technically this returns the number of "code points", in a string. One code point usually corresponds to one character, but not always. For example, an u with a umlaut might be represented as a single character or as the combination a u and an umlaut. 
} \examples{ str_length(letters) str_length(NA) str_length(factor("abc")) str_length(c("i", "like", "programming", NA)) # Two ways of representing a u with an umlaut u1 <- "\\u00fc" u2 <- stringi::stri_trans_nfd(u1) # The print the same: u1 u2 # But have a different length str_length(u1) str_length(u2) # Even though they have the same number of characters str_count(u1) str_count(u2) } \seealso{ \code{\link[stringi:stri_length]{stringi::stri_length()}} which this function wraps. } stringr/man/str_trunc.Rd0000644000176200001440000000131613202620061015020 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/trunc.R \name{str_trunc} \alias{str_trunc} \title{Truncate a character string.} \usage{ str_trunc(string, width, side = c("right", "left", "center"), ellipsis = "...") } \arguments{ \item{string}{A character vector.} \item{width}{Maximum width of string.} \item{side, ellipsis}{Location and content of ellipsis that indicates content has been removed.} } \description{ Truncate a character string. } \examples{ x <- "This string is moderately long" rbind( str_trunc(x, 20, "right"), str_trunc(x, 20, "left"), str_trunc(x, 20, "center") ) } \seealso{ \code{\link[=str_pad]{str_pad()}} to increase the minimum width of a string. } stringr/man/figures/0000755000176200001440000000000013427574674014204 5ustar liggesusersstringr/man/figures/logo.png0000644000176200001440000006046013414140270015631 0ustar liggesusersPNG  IHDRX?gAMA a cHRMz&u0`:pQ<bKGD pHYs!7!73XztIME 3`IDATxw`Tם?}$*jcC\1ni;Nn^6=ٗoMw'I\bǎ7L fQM?4FHB#OVA:#a0-Ѐ?G>EAD'`08X | ׀)0-* =^t!⑊!FJUT+<|Ba! ǣO }(̧)(HepߠcL˨af$)̿=ыAں?4|y1|!^+p=z0 M!&!/ZiHT̔?B/L!ˉ!@^w.]{m ,v; r9d#JSc pM!' !?^->˴mJ^ σbz R ;hjj3K: 맮CCÇ!ax=/s2iSb&'GnŌ@/n$H$4imm'HfV@:paxe󻖳6?@B( ^@ }`hw Uذ-~$ DdrEF#BSc+pt0. x}ڰ-1 l~j`2J}$I"Jl F<f 0`xy!0MsZ n/H"vTaa[b } \E ?>l#D#47 Ԍ a8p x]Ȳd!p陴+zxGD;P$I"H%8şOa/! 
Ctߗ$&'Ϗ6JIgg^i C= Ip8]v旝őtb$H8DOa[]?lv+9y>>dmW$IBMwsP2 Ee ~N&m f 4#ID"E[K;m!^Y_`7/Eua[/eb8-MmtGPU-S!9_p8dIui@M; ^Ya[d 6OVfڶf!'ׇƤ(m[lo렵IlV\̬"ON-D*fVY@Vǜ<v IFPv#4(ecζ8*+\d;mpyȲ4F *$&1GmqT "6CqFX,f|^~7& &J{[m-$Ƕx]ģֶ8jw3gw&\H,MqL5 vJe<4mY/^6+хn~n9$B_֧9H#:eҶJ_w CIPUPM4naNCeї`6+A Tio *'۶' a 9<46?PLz?ϋ6`|+C\+t7KN($mAԶVpp6?c>wd#Gp H`|gYk[Jfu(.gjJ=B[K;1j[ L߃/Ǎb2 ![;Hf^ )Y`[q!\n~dpysiEg,Np(nYa[1Ji8g9n[`)tq,2(e#ڶxٟޡYzB1y;?vl JY戴-^V݆^EdV\xsT.^-<\d!f!7߇?ߋe܋!G/Uy''PH4B `„ b1 w?xPgH87 XdEFKj8ѝN7wGDAI& ׆3Ⲝ !0 _+34iwPABJEEMZ@}}=iEHJ_a0\m)P,I0XzATEyǶN(eL G 8yms5tW, g44!`/$h~$gΜrQQQbj:oJ! &z}ϲ"cqZ0MMdvrNw_6Qt1L88r(&ݡt7,*ϡbAn7&L R>=%Fz`c'&IS1+XLf-B/-d&ߎ3ρnsJBlÂ5EÔ0c ()50:z:}Õ=>.T*!R$KKj )2jR$?T&~kGdž+߉W۟ =p8B`/r` ` (C -ٯ;?6;, jRH~%8̸N^iԄJ/b$pU;"rD"A ߏb21WW6^a1QBRIϏM6wک6YN BiB K2Rv?dO&8EEE!I@g̶77^b1=( %QEFR bQ:-(&5\o6xs,)>8]6&FUUrssɓ̘>V|blH3(zޛx{ I& B*ާ">Xaȧ~ͣLg[d{'9z(B())a},Z}wڈoJ>c U/$E"Kvʉ& ,#YpӶ} l6>Çsyw&""od.&4jjHӾD8GL "Gw&p1H&'N3ϐH$0LQ{n}#֕ep>gGGs:b jC 'BO wnQWWիeǃ,7}xp]!,@I-9GDM>l^R iBYY999s Q>όBl@F+$d95qY@vkTZ[hqD,,TUU0~xGEe%aIܛCp\4ȒգB3pǏtRYYɖ-[>@nɆ75)`L/ 75as \Rߗ(KhJ;߫'?bv;|~wwm310/W,VB5ńޞ+6:Evo#+2 TLQ$`8+b yH*+`/h>L0ߏnP(Dmm-g;QtiAF*AY7|) P#C钕3pؽ{7{!Ǚ={6T=;GLL/IHi)RZM%$E62@ Hi.=pOtٖ x( m;F0tRRR^(~ǎarERN+j(̞3x];7sb~ejdtuƼJ0Bje_W,Y#<ԩSx<=<'6a2E}>Kr}c{wvvbZ={6۷o|FQz!T"3hY{c !]u߅0wuɶ'_?*رcuuuL8Պ8<6mbҤI,_UV5>/K/$47 з(CZ@!V0r!g /#Ktj$ǵ|B!bnT*E~~>'NdժU!>ǣ>֭[t~r0T!߼G5 Aˎipt !zE^d-MDf x%%%|_&??f, RYYI g}k-`޽f =F:1INZl6q3B1<Q&%><J\hbV+P<ĉ)//8sncL88xÁjeǎTUUa6 N UUP첾% 5DL!]C#=Kr.]fd2ɱcǸkȞ$IȲ{Ñ޲%JaQUɡCd(=W_Xƒb3/b %N<.M’Vw=+1c:h6lʕ+3 {ӧlݺiӦ! 
HYYYwy'8x 8N$Ibjb-pa: HdM5%sU1UĠxG8bo5_I9t555|b-2@ÇS\\L4%77ɔvF3㏳yf}Q|>SLAQf͚hd_XVa7Ӵ֗jpsӟ[|08JIvn7x>28Μ9Ä FHfΝ<6lno&?ϑ$j^/K.ey L)L**f{?͗% 8v8ߌcx%>9i/Ȳ{GSS`EQson7xtt6[nP(Ě5k(--eլYf.\nn;WmWy -LlC<(c@zUoNJuit"I::9 Dv0Br뮻pyf, L>NbgΜo;"//˗?TkK+]rkh/%6okz?ʕ+&Oٰa\V$k6}bl$Yh3)z^/_=jCQTU%cܹL&yMFa/ پ};.+ u|SN̚5ٳgO^^ikk#7L+))۷ҥKq\={0~srXthܱA1%HDǹa"{=fΜINNx^hFnn.#`rrr.n2{oXnRWWWU&LwJJJҫZAbnnԩS$}cV^Mnn.識kr!?[oҥK]x0ƚwX=%G[PwrA R\\L^^%%%W*/ 6Dسg> ӧO JqqnV~0o<%c2Z455r%ۜ8q"w.fG}'|;G?~Csw ۢF}`2"t6w_?!zJOwr:7YYr%O. B| _tTnn.6 ߏ(DQBZZZ0LsJ:%I3 wqVbrxXj@ py wӹrQB_fn"1s'd  6g~6jf3^@/\f ˖-;d2cz!f3[oz EQ(--bo>@ii)۶mc9c_ K.uV={6G宻b̙qs;[xF(8K R?%8XXs8W#ymHr+X,8N .'O >]IZ<䓼|gٲef<SLwyGRRRfJOYY眶f3s;ka,[ 0,V %B֮Y(s\=lܸ1vMxW>}:OxLX00恳 I6/yn?}a֬Y /@cc#.+U@GG{ᓟ$K.St: sΥh4ʱc(//jmHa7 1G! \IqI @x㍴o||D"3g~0Ο?2c <O%۵HL@Xv6 ghM1 r=z믿ȷm`Z5kր4UE2r 0z,APlfډT em6&OL~~>~:G>O/^Laa!Yn݀CiɄ$I465&:i,ធOg$^OOuN{k't:x---X,֯_OEEUUUt( ?0wWRPP4UG./X7G1$arY\*r-x4/ϔcY(E~{9Ə`ժUxxX`ySKD?|_Eˡ`{걌!ౌ;7R)榛o"J*/"H{u)))G{]v1o9xa #σR$IB6)}e7+nbrrrÔ3DA^^'$_W8p>,NЧ~K/qm+y}<䬜d}: MYr͒,3eP[[ٳBHR<͛W555_T*Ŋ+4ii,3wCfd{)..c6B0~xy֮]笠EQ.ؖt'I$L"@AMM ˖-c…̘1 i&2wLG6+:m tav[9."-r-vrrr4p8ٳ|>\.h6innԩSYp!8N^/SNl6eoʅoz!B5W:CK 8dx TOl'}Yu#i@cc#hd2D""D"b&^/Y x^v{ڗTo_zRFO E-ğv0 `0biI5Ӡ理iTiӦQPPlgyg^yR=8 C"Iʙ֠,')-=[I2b bbZl!d׮ᨹ 2d0* CMhYYc=hоƗP&qͷL6 y&b-͜:u}捼sxmy%nHDBGVB}cH%XСFZ6b9dZd^fRXT@TmAϜdI՝Dk= [9&%ne~,^f R$`;e>D/G{XIВ !B56m"0[,$C1R@εaQ<ˆiAV_aL 2}a"4!(uI7P#gkw4 c9 $EQQ`QXd[ρ5ׁb7zG m/?R 9mԮY^|G:N3t! $,y.$I8vxnudb8NFUh 2% ЙcK$Z[ R,S٣9v DdNɭ>Z̲BRKV}4``x3;.JABKZK+ liC\K\p`01<@$a*4aڥFB"ʉdI2;F1܅`?Ð#^$$,l@97܌YkB (E1#K>"߳!dKP*EB鑟 !8[x!#kB(fL USIjst[8l"@Hj)jR? 
j\,0˺5QTj q=G׻X3R-4s~]0-G.VLJ fO# q;fلN?j(sd9XБPJ\{P=u3HHbT3-{͇yxs4 Uڼ]/k`۱mv2# EV`}hBC0WӘ+#A$B05lmCM~~g/enT&ɱy%h2Ɖp;p2$ 3&bUt{zG"ʞq*=wH}mwKJ2_dGhbL XlcEn %|IXXQ,ɡ ݿw|hrRBCB/iz.)fQsH2ZkƦNFфFud4nD׿f*?̍ g;H")$22E0I ~۞G_ <$om)T'tזos#!ӳ[\\͝Uױ68h[5Q)C>9<ڇmK|Gfwa>6^rm^$Ibws 筟J>Z+ YI4Oz-(tOV^Ҩ3f;7\tDZ׃)u@ bav$^wmE;#OEV؋\$^"煡q5;͸ vNms:gY%=Ľ$Z!,#!0Y%Ir\}rƧl+${-.rO!1M@p&7C0^Że!V̂™i,I4MzQutJ`ȒĝUcS,gFij1#` NXΊ%)GhHD[gczB6|BG..hjz5Rzogs! jqVij&rMIH(|[v>1uS#-~r 6-"q}Sҽ*4asn^>aMh3?L?tkdoQ:$I"e~tn\J;2w!t>5uI-uNZMJK!O<'ZFCTjC' ՓRԅmƄ,NKk''9mʃ-58sFqBdפE%';3{J`V&sS1c^bCB!_`dhhi! !ȵyKt~Cm'=1Z>L/Mt˯fcNy/bcoSyյݾi "!Es4tRȝ@G/$Qxe<+o"Ů8i/~'DFolуVr%3hۍ%6U':νY")d+D;o%oIsm]SX?|x.cA~1;W(`=|$ Gю9v䮼S?P"Ȟ#C2E]ƍBcv$I$~'/vP$%rt箊$s(x#T /mqm}k7О*^#V$i@,yɯuΟ%{g=|9 y$$:*ֲ$ogz`7&]l{sI9JDwK?csOvAGEpXBB4yw Ƅ5!HhI_0_d,ondIDG:x]wC=.Odڰ璣2{[s^E)uWRav5Huz7'8'xj'-Nw Ԇynۘn/ I-Em{=O*4OU˰*>/! M!5EHc'—|% Z:i]}I鱗@0;[;g%s&B]yMhd0& NlobU,|rJ9ST&cM)z}jLh’qtj'b]S$6TPBv$ r9j;~)Θ%):.)Ϊk Ј&;-N,k#ȃp»6jYsl} $Q >DG&>F>%fۘdId j/=<9U,&+7/ ØpOŌlK&4:QzfMYt\"/;TNl%޵@zԴ(t^TۣZy W֩˜ ynԮ.;1'o2%@zD:x=RJS,HzFqi~=% 諾@ШLkse+Ԯ:UhN>6&>5m%fŌ6Pзy, eI7S-cx 5&+WMoj `m;CCp$xogo>G] BlSM6^ p2#WS)(\Y8 O1o7ôvcMTxYX<91fNt&doj蓀*[QfKRAǬTzL)N:et%LW4E$p2ƪ#' PIHrh+N}ITyKi!mqT+OM k!^ ՝=}oð IwҙJp;HA8C{ecNWh]HFĂ1dzXۊgCmrӽ37ѵ٥Y}4$)OWX_P/Z%:V+*cS %Y>wqcݑVpv \I(,+; M\p mgr47-`I\ƹ+V]j3V67K9psR l'>&TbNqڝ]I2aXʒġۖaS+dJ(dG>bm4{_y5,+ObEC#q*;D}{>WBM I\NlPKXg$}χ_kqty!Xwrc7 .ổ%hXË*ЙMR4Eۨ +_=+Оֶp{.2sلjCqbhɮڏːK& φe7Ooֺv0۰+ִNIMEOBhX3+x,'8oF{n$.\D"wzg6(Ŏu{sLݭ $҂DG!"Mdv <65 HzU}Cs6Zղ]_TC5i:A#x>`f@S`'HφmE2eߴAft}Ee %9]@6ks@K q:g;_. r@Z~$ }i/ϸNT^2ׄdcob~>ZTkwgwsMvQ}>7DNOtx!vӧ;C2p5zX#@R$. 
65;2 ER/qWPL3Ɋ"uRuTJD8p C4 KȳM>QI4xۢA26v)gS iwCf`9z~|]?d|6dT vG2sy>$fXBf(E?6lb64zLKg-hw$4E-AZH0"!j?O`aj$ ԸF5Ap,7nmqpsF;t fgszIFD#Ă-Ղ/?sKQsoEOo/!7з'?@M$" bVF&mIr+ōlev̑uPcC(m hKhK5rx ?ވn7YCуڋSe x>^ɾZTbcAV~@Ya.F|ܛq}赧U $4A2ڝ(k?Iq(ᱸ(^+DyX"g͇8qk<74=M`u,0VwooЫHF*IA1TqE4&xN>Pv%d"gUh~r.\ Kiٖqێ>e: { 1*aXa[^Z_RJ/twg2TM/GaU,˟ʜdD[w %hiC`Xl~Ũx2{*41A~7(z{wɬeK~F,f -Y6Si&%|!&i48|GNbacNn,ɡ|pryp2J3@v6$Jp1jEö86;N=فIfnTc};|x(nPq}_F~>|Yl~Ũp7-~XLE׊Ͱ-@cr[l;edK{,)c?R$ꨴ "~?z) zM=P M!XD4$űDw[pIfMdws 6/ -Ş#LW([jPUP|-|.AZs9SOl|-Xc˶=[`7ٸj~)J]|tҍCrb3GOTXZT70܋^-^Șmev0;0| {4*67wr:҄x5n~q16 _l[`5Yo;XW%(frm^;WH{[7dWw0EۢZ[)F#|Goql& ^mgM6$fw90>bsЙE1S)bjN%{ZP>j©(o#AW0l~ƶ(cu[.mQA$mav$^>6rLɩ$)X3w-o6mQ\bw[)bIq5rs*܄l]@#GSɎDh%6^N-~-mCk[[='|prgʧL;*=xI06zGo0!-~Ll9E"Ii)~GhltΚclןЄn~c7\$FmQ2S.m9BbW>VL4oB$\fdTO՘ F\%} \alias{\%>\%} \title{Pipe operator} \usage{ lhs \%>\% rhs } \description{ Pipe operator } \keyword{internal} stringr/man/str_trim.Rd0000644000176200001440000000163613204625127014657 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/trim.R \name{str_trim} \alias{str_trim} \alias{str_squish} \title{Trim whitespace from a string} \usage{ str_trim(string, side = c("both", "left", "right")) str_squish(string) } \arguments{ \item{string}{A character vector.} \item{side}{Side on which to remove whitespace (left, right or both).} } \value{ A character vector. } \description{ \code{str_trim()} removes whitespace from start and end of string; \code{str_squish()} also reduces repeated whitespace inside a string. 
} \examples{ str_trim(" String with trailing and leading white space\\t") str_trim("\\n\\nString with trailing and leading white space\\n\\n") str_squish(" String with trailing, middle, and leading white space\\t") str_squish("\\n\\nString with excess, trailing and leading white space\\n\\n") } \seealso{ \code{\link[=str_pad]{str_pad()}} to add whitespace } stringr/man/str_conv.Rd0000644000176200001440000000116713202620061014636 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/conv.R \name{str_conv} \alias{str_conv} \title{Specify the encoding of a string.} \usage{ str_conv(string, encoding) } \arguments{ \item{string}{String to re-encode.} \item{encoding}{Name of encoding. See \code{\link[stringi:stri_enc_list]{stringi::stri_enc_list()}} for a complete list.} } \description{ This is a convenient way to override the current encoding of a string. } \examples{ # Example from encoding?stringi::stringi x <- rawToChar(as.raw(177)) x str_conv(x, "ISO-8859-2") # Polish "a with ogonek" str_conv(x, "ISO-8859-1") # Plus-minus } stringr/man/str_replace_na.Rd0000644000176200001440000000066213031472566016000 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/replace.r \name{str_replace_na} \alias{str_replace_na} \title{Turn NA into "NA"} \usage{ str_replace_na(string, replacement = "NA") } \arguments{ \item{string}{Input vector. 
Either a character vector, or something coercible to one.} \item{replacement}{A single string.} } \description{ Turn NA into "NA" } \examples{ str_replace_na(c(NA, "abc", "def")) } stringr/man/str_subset.Rd0000644000176200001440000000410413413766176015216 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/subset.R \name{str_subset} \alias{str_subset} \alias{str_which} \title{Keep strings matching a pattern, or find positions.} \usage{ str_subset(string, pattern, negate = FALSE) str_which(string, pattern, negate = FALSE) } \arguments{ \item{string}{Input vector. Either a character vector, or something coercible to one.} \item{pattern}{Pattern to look for. The default interpretation is a regular expression, as described in \link[stringi:stringi-search-regex]{stringi::stringi-search-regex}. Control options with \code{\link[=regex]{regex()}}. Match a fixed string (i.e. by comparing only bytes), using \code{\link[=fixed]{fixed()}}. This is fast, but approximate. Generally, for matching human text, you'll want \code{\link[=coll]{coll()}} which respects character matching rules for the specified locale. Match character, word, line and sentence boundaries with \code{\link[=boundary]{boundary()}}. An empty pattern, "", is equivalent to \code{boundary("character")}.} \item{negate}{If \code{TRUE}, return non-matching elements.} } \value{ A character vector. } \description{ \code{str_subset()} is a wrapper around \code{x[str_detect(x, pattern)]}, and is equivalent to \code{grep(pattern, x, value = TRUE)}. \code{str_which()} is a wrapper around \code{which(str_detect(x, pattern))}, and is equivalent to \code{grep(pattern, x)}. See \code{\link[=str_detect]{str_detect()}} for an equivalent to \code{grepl(pattern, x)}. 
} \details{ Vectorised over \code{string} and \code{pattern} } \examples{ fruit <- c("apple", "banana", "pear", "pinapple") str_subset(fruit, "a") str_which(fruit, "a") str_subset(fruit, "^a") str_subset(fruit, "a$") str_subset(fruit, "b") str_subset(fruit, "[aeiou]") # Returns elements that do NOT match str_subset(fruit, "^p", negate = TRUE) # Missings never match str_subset(c("a", NA, "b"), ".") str_which(c("a", NA, "b"), ".") } \seealso{ \code{\link[=grep]{grep()}} with argument \code{value = TRUE}, \code{\link[stringi:stri_subset]{stringi::stri_subset()}} for the underlying implementation. } stringr/man/str_split.Rd0000644000176200001440000000417313202620337015032 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/split.r \name{str_split} \alias{str_split} \alias{str_split_fixed} \title{Split up a string into pieces.} \usage{ str_split(string, pattern, n = Inf, simplify = FALSE) str_split_fixed(string, pattern, n) } \arguments{ \item{string}{Input vector. Either a character vector, or something coercible to one.} \item{pattern}{Pattern to look for. The default interpretation is a regular expression, as described in \link[stringi:stringi-search-regex]{stringi::stringi-search-regex}. Control options with \code{\link[=regex]{regex()}}. Match a fixed string (i.e. by comparing only bytes), using \code{\link[=fixed]{fixed()}}. This is fast, but approximate. Generally, for matching human text, you'll want \code{\link[=coll]{coll()}} which respects character matching rules for the specified locale. Match character, word, line and sentence boundaries with \code{\link[=boundary]{boundary()}}. An empty pattern, "", is equivalent to \code{boundary("character")}.} \item{n}{number of pieces to return. Default (Inf) uses all possible split positions. 
For \code{str_split_fixed}, if n is greater than the number of pieces, the result will be padded with empty strings.} \item{simplify}{If \code{FALSE}, the default, returns a list of character vectors. If \code{TRUE} returns a character matrix.} } \value{ For \code{str_split_fixed}, a character matrix with \code{n} columns. For \code{str_split}, a list of character vectors. } \description{ Vectorised over \code{string} and \code{pattern}. } \examples{ fruits <- c( "apples and oranges and pears and bananas", "pineapples and mangos and guavas" ) str_split(fruits, " and ") str_split(fruits, " and ", simplify = TRUE) # Specify n to restrict the number of possible matches str_split(fruits, " and ", n = 3) str_split(fruits, " and ", n = 2) # If n greater than number of pieces, no padding occurs str_split(fruits, " and ", n = 5) # Use fixed to return a character matrix str_split_fixed(fruits, " and ", 3) str_split_fixed(fruits, " and ", 4) } \seealso{ \code{\link[=stri_split]{stri_split()}} for the underlying implementation. } stringr/man/stringr-data.Rd0000644000176200001440000000135113031472465015410 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/data.R \docType{data} \name{stringr-data} \alias{stringr-data} \alias{sentences} \alias{fruit} \alias{words} \title{Sample character vectors for practicing string manipulations.} \format{A character vector.} \usage{ sentences fruit words } \description{ \code{fruit} and \code{word} come from the \code{rcorpora} package written by Gabor Csardi; the data was collected by Darius Kazemi and made available at \url{https://github.com/dariusk/corpora}. \code{sentences} is a collection of "Harvard sentences" used for standardised testing of voice. 
} \examples{ length(sentences) sentences[1:5] length(fruit) fruit[1:5] length(words) words[1:5] } \keyword{datasets} stringr/man/str_c.Rd0000644000176200001440000000426113427104326014123 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/c.r \name{str_c} \alias{str_c} \title{Join multiple strings into a single string.} \usage{ str_c(..., sep = "", collapse = NULL) } \arguments{ \item{...}{One or more character vectors. Zero length arguments are removed. Short arguments are recycled to the length of the longest. Like most other R functions, missing values are "infectious": whenever a missing value is combined with another string the result will always be missing. Use \code{\link[=str_replace_na]{str_replace_na()}} to convert \code{NA} to \code{"NA"}} \item{sep}{String to insert between input vectors.} \item{collapse}{Optional string used to combine input vectors into single string.} } \value{ If \code{collapse = NULL} (the default) a character vector with length equal to the longest input string. If \code{collapse} is non-NULL, a character vector of length 1. } \description{ Joins two or more vectors element-wise into a single character vector, optionally inserting \code{sep} between input vectors. If \code{collapse} is not \code{NULL}, it will be inserted between elements of the result, returning a character vector of length 1. } \details{ To understand how \code{str_c} works, you need to imagine that you are building up a matrix of strings. Each input argument forms a column, and is expanded to the length of the longest argument, using the usual recyling rules. The \code{sep} string is inserted between each column. If collapse is \code{NULL} each row is collapsed into a single string. If non-\code{NULL} that string is inserted at the end of each row, and the entire matrix collapsed to a single string. 
} \examples{ str_c("Letter: ", letters) str_c("Letter", letters, sep = ": ") str_c(letters, " is for", "...") str_c(letters[-26], " comes before ", letters[-1]) str_c(letters, collapse = "") str_c(letters, collapse = ", ") # Missing inputs give missing outputs str_c(c("a", NA, "b"), "-d") # Use str_replace_NA to display literal NAs: str_c(str_replace_na(c("a", NA, "b")), "-d") } \seealso{ \code{\link[=paste]{paste()}} for equivalent base R functionality, and \code{\link[stringi:stri_join]{stringi::stri_join()}} which this function wraps } stringr/man/stringr-package.Rd0000644000176200001440000000200513427104326016064 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/stringr.R \docType{package} \name{stringr-package} \alias{stringr} \alias{stringr-package} \title{stringr: Simple, Consistent Wrappers for Common String Operations} \description{ \if{html}{\figure{logo.png}{options: align='right'}} A consistent, simple and easy to use set of wrappers around the fantastic 'stringi' package. All function and argument names (and positions) are consistent, all functions deal with "NA"'s and zero length vectors in the same way, and the output from one function is easy to feed into the input of another. } \seealso{ Useful links: \itemize{ \item \url{http://stringr.tidyverse.org} \item \url{https://github.com/tidyverse/stringr} \item Report bugs at \url{https://github.com/tidyverse/stringr/issues} } } \author{ \strong{Maintainer}: Hadley Wickham \email{hadley@rstudio.com} [copyright holder] Other contributors: \itemize{ \item RStudio [copyright holder, funder] } } \keyword{internal} stringr/man/str_count.Rd0000644000176200001440000000277513202620337015035 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/count.r \name{str_count} \alias{str_count} \title{Count the number of matches in a string.} \usage{ str_count(string, pattern = "") } \arguments{ \item{string}{Input vector. 
Either a character vector, or something coercible to one.} \item{pattern}{Pattern to look for. The default interpretation is a regular expression, as described in \link[stringi:stringi-search-regex]{stringi::stringi-search-regex}. Control options with \code{\link[=regex]{regex()}}. Match a fixed string (i.e. by comparing only bytes), using \code{\link[=fixed]{fixed()}}. This is fast, but approximate. Generally, for matching human text, you'll want \code{\link[=coll]{coll()}} which respects character matching rules for the specified locale. Match character, word, line and sentence boundaries with \code{\link[=boundary]{boundary()}}. An empty pattern, "", is equivalent to \code{boundary("character")}.} } \value{ An integer vector. } \description{ Vectorised over \code{string} and \code{pattern}. } \examples{ fruit <- c("apple", "banana", "pear", "pineapple") str_count(fruit, "a") str_count(fruit, "p") str_count(fruit, "e") str_count(fruit, c("a", "b", "p", "p")) str_count(c("a.", "...", ".a.a"), ".") str_count(c("a.", "...", ".a.a"), fixed(".")) } \seealso{ \code{\link[stringi:stri_count]{stringi::stri_count()}} which this function wraps. \code{\link[=str_locate]{str_locate()}}/\code{\link[=str_locate_all]{str_locate_all()}} to locate position of matches } stringr/man/case.Rd0000644000176200001440000000147213427104326013725 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/case.R \name{case} \alias{case} \alias{str_to_upper} \alias{str_to_lower} \alias{str_to_title} \alias{str_to_sentence} \title{Convert case of a string.} \usage{ str_to_upper(string, locale = "en") str_to_lower(string, locale = "en") str_to_title(string, locale = "en") str_to_sentence(string, locale = "en") } \arguments{ \item{string}{String to modify} \item{locale}{Locale to use for translations. Defaults to "en" (English) to ensure consistent default ordering across platforms.} } \description{ Convert case of a string. 
} \examples{ dog <- "The quick brown dog" str_to_upper(dog) str_to_lower(dog) str_to_title(dog) str_to_sentence("the quick brown dog") # Locale matters! str_to_upper("i") # English str_to_upper("i", "tr") # Turkish } stringr/man/str_replace.Rd0000644000176200001440000000547613274374163015335 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/replace.r \name{str_replace} \alias{str_replace} \alias{str_replace_all} \title{Replace matched patterns in a string.} \usage{ str_replace(string, pattern, replacement) str_replace_all(string, pattern, replacement) } \arguments{ \item{string}{Input vector. Either a character vector, or something coercible to one.} \item{pattern}{Pattern to look for. The default interpretation is a regular expression, as described in \link[stringi:stringi-search-regex]{stringi::stringi-search-regex}. Control options with \code{\link[=regex]{regex()}}. Match a fixed string (i.e. by comparing only bytes), using \code{\link[=fixed]{fixed()}}. This is fast, but approximate. Generally, for matching human text, you'll want \code{\link[=coll]{coll()}} which respects character matching rules for the specified locale.} \item{replacement}{A character vector of replacements. Should be either length one, or the same length as \code{string} or \code{pattern}. References of the form \code{\1}, \code{\2}, etc will be replaced with the contents of the respective matched group (created by \code{()}). To perform multiple replacements in each element of \code{string}, pass a named vector (\code{c(pattern1 = replacement1)}) to \code{str_replace_all}. Alternatively, pass a function to \code{replacement}: it will be called once for each match and its return value will be used to replace the match. To replace the complete string with \code{NA}, use \code{replacement = NA_character_}.} } \value{ A character vector. } \description{ Vectorised over \code{string}, \code{pattern} and \code{replacement}. 
} \examples{ fruits <- c("one apple", "two pears", "three bananas") str_replace(fruits, "[aeiou]", "-") str_replace_all(fruits, "[aeiou]", "-") str_replace_all(fruits, "[aeiou]", toupper) str_replace_all(fruits, "b", NA_character_) str_replace(fruits, "([aeiou])", "") str_replace(fruits, "([aeiou])", "\\\\1\\\\1") str_replace(fruits, "[aeiou]", c("1", "2", "3")) str_replace(fruits, c("a", "e", "i"), "-") # If you want to apply multiple patterns and replacements to the same # string, pass a named vector to pattern. fruits \%>\% str_c(collapse = "---") \%>\% str_replace_all(c("one" = "1", "two" = "2", "three" = "3")) # Use a function for more sophisticated replacement. This example # replaces colour names with their hex values. colours <- str_c("\\\\b", colors(), "\\\\b", collapse="|") col2hex <- function(col) { rgb <- col2rgb(col) rgb(rgb["red", ], rgb["green", ], rgb["blue", ], max = 255) } x <- c( "Roses are red, violets are blue", "My favourite colour is green" ) str_replace_all(x, colours, col2hex) } \seealso{ \code{\link[=str_replace_na]{str_replace_na()}} to turn missing values into "NA"; \code{\link[=stri_replace]{stri_replace()}} for the underlying implementation. } stringr/man/str_wrap.Rd0000644000176200001440000000232013202620061014632 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/wrap.r \name{str_wrap} \alias{str_wrap} \title{Wrap strings into nicely formatted paragraphs.} \usage{ str_wrap(string, width = 80, indent = 0, exdent = 0) } \arguments{ \item{string}{character vector of strings to reformat.} \item{width}{positive integer giving target line width in characters. A width less than or equal to 1 will put each word on its own line.} \item{indent}{non-negative integer giving indentation of first line in each paragraph} \item{exdent}{non-negative integer giving indentation of following lines in each paragraph} } \value{ A character vector of re-wrapped strings. 
} \description{ This is a wrapper around \code{\link[stringi:stri_wrap]{stringi::stri_wrap()}} which implements the Knuth-Plass paragraph wrapping algorithm. } \examples{ thanks_path <- file.path(R.home("doc"), "THANKS") thanks <- str_c(readLines(thanks_path), collapse = "\\n") thanks <- word(thanks, 1, 3, fixed("\\n\\n")) cat(str_wrap(thanks), "\\n") cat(str_wrap(thanks, width = 40), "\\n") cat(str_wrap(thanks, width = 60, indent = 2), "\\n") cat(str_wrap(thanks, width = 60, exdent = 2), "\\n") cat(str_wrap(thanks, width = 0, exdent = 2), "\\n") } stringr/man/str_pad.Rd0000644000176200001440000000202013202620061014422 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/pad.r \name{str_pad} \alias{str_pad} \title{Pad a string.} \usage{ str_pad(string, width, side = c("left", "right", "both"), pad = " ") } \arguments{ \item{string}{A character vector.} \item{width}{Minimum width of padded strings.} \item{side}{Side on which padding character is added (left, right or both).} \item{pad}{Single padding character (default is a space).} } \value{ A character vector. } \description{ Vectorised over \code{string}, \code{width} and \code{pad}. } \examples{ rbind( str_pad("hadley", 30, "left"), str_pad("hadley", 30, "right"), str_pad("hadley", 30, "both") ) # All arguments are vectorised except side str_pad(c("a", "abc", "abcdef"), 10) str_pad("a", c(5, 10, 20)) str_pad("a", 10, pad = c("-", "_", " ")) # Longer strings are returned unchanged str_pad("hadley", 3) } \seealso{ \code{\link[=str_trim]{str_trim()}} to remove whitespace; \code{\link[=str_trunc]{str_trunc()}} to decrease the maximum width of a string. 
} stringr/man/str_flatten.Rd0000644000176200001440000000066313202623210015325 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/flatten.R \name{str_flatten} \alias{str_flatten} \title{Flatten a string} \usage{ str_flatten(string, collapse = "") } \arguments{ \item{string}{Character to flatten} \item{collapse}{String to insert between each piece} } \value{ A character vector of length 1 } \description{ Flatten a string } \examples{ str_flatten(letters) str_flatten(letters, "-") } stringr/man/invert_match.Rd0000644000176200001440000000133613202620061015462 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/locate.r \name{invert_match} \alias{invert_match} \title{Switch location of matches to location of non-matches.} \usage{ invert_match(loc) } \arguments{ \item{loc}{matrix of match locations, as from \code{\link[=str_locate_all]{str_locate_all()}}} } \value{ numeric match giving locations of non-matches } \description{ Invert a matrix of match locations to match the opposite of what was previously matched. } \examples{ numbers <- "1 and 2 and 4 and 456" num_loc <- str_locate_all(numbers, "[0-9]+")[[1]] str_sub(numbers, num_loc[, "start"], num_loc[, "end"]) text_loc <- invert_match(num_loc) str_sub(numbers, text_loc[, "start"], text_loc[, "end"]) } stringr/man/str_remove.Rd0000644000176200001440000000247113202707126015175 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/remove.r \name{str_remove} \alias{str_remove} \alias{str_remove_all} \title{Remove matched patterns in a string.} \usage{ str_remove(string, pattern) str_remove_all(string, pattern) } \arguments{ \item{string}{Input vector. Either a character vector, or something coercible to one.} \item{pattern}{Pattern to look for. The default interpretation is a regular expression, as described in \link[stringi:stringi-search-regex]{stringi::stringi-search-regex}. 
Control options with \code{\link[=regex]{regex()}}. Match a fixed string (i.e. by comparing only bytes), using \code{\link[=fixed]{fixed()}}. This is fast, but approximate. Generally, for matching human text, you'll want \code{\link[=coll]{coll()}} which respects character matching rules for the specified locale. Match character, word, line and sentence boundaries with \code{\link[=boundary]{boundary()}}. An empty pattern, "", is equivalent to \code{boundary("character")}.} } \value{ A character vector. } \description{ Alias for \code{str_replace(string, pattern, "")}. } \examples{ fruits <- c("one apple", "two pears", "three bananas") str_remove(fruits, "[aeiou]") str_remove_all(fruits, "[aeiou]") } \seealso{ \code{\link[=str_replace]{str_replace()}} for the underlying implementation. } stringr/man/str_dup.Rd0000644000176200001440000000103613031472465014471 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/dup.r \name{str_dup} \alias{str_dup} \title{Duplicate and concatenate strings within a character vector.} \usage{ str_dup(string, times) } \arguments{ \item{string}{Input character vector.} \item{times}{Number of times to duplicate each string.} } \value{ A character vector. } \description{ Vectorised over \code{string} and \code{times}. } \examples{ fruit <- c("apple", "pear", "banana") str_dup(fruit, 2) str_dup(fruit, 1:3) str_c("ba", str_dup("na", 0:5)) } stringr/man/str_locate.Rd0000644000176200001440000000360113202620337015141 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/locate.r \name{str_locate} \alias{str_locate} \alias{str_locate_all} \title{Locate the position of patterns in a string.} \usage{ str_locate(string, pattern) str_locate_all(string, pattern) } \arguments{ \item{string}{Input vector. Either a character vector, or something coercible to one.} \item{pattern}{Pattern to look for. 
The default interpretation is a regular expression, as described in \link[stringi:stringi-search-regex]{stringi::stringi-search-regex}. Control options with \code{\link[=regex]{regex()}}. Match a fixed string (i.e. by comparing only bytes), using \code{\link[=fixed]{fixed()}}. This is fast, but approximate. Generally, for matching human text, you'll want \code{\link[=coll]{coll()}} which respects character matching rules for the specified locale. Match character, word, line and sentence boundaries with \code{\link[=boundary]{boundary()}}. An empty pattern, "", is equivalent to \code{boundary("character")}.} } \value{ For \code{str_locate}, an integer matrix. First column gives start postion of match, and second column gives end position. For \code{str_locate_all} a list of integer matrices. } \description{ Vectorised over \code{string} and \code{pattern}. If the match is of length 0, (e.g. from a special match like \code{$}) end will be one character less than start. } \examples{ fruit <- c("apple", "banana", "pear", "pineapple") str_locate(fruit, "$") str_locate(fruit, "a") str_locate(fruit, "e") str_locate(fruit, c("a", "b", "p", "p")) str_locate_all(fruit, "a") str_locate_all(fruit, "e") str_locate_all(fruit, c("a", "b", "p", "p")) # Find location of every character str_locate_all(fruit, "") } \seealso{ \code{\link[=str_extract]{str_extract()}} for a convenient way of extracting matches, \code{\link[stringi:stri_locate]{stringi::stri_locate()}} for the underlying implementation. } stringr/man/str_detect.Rd0000644000176200001440000000342713413766176015170 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/detect.r \name{str_detect} \alias{str_detect} \title{Detect the presence or absence of a pattern in a string.} \usage{ str_detect(string, pattern, negate = FALSE) } \arguments{ \item{string}{Input vector. Either a character vector, or something coercible to one.} \item{pattern}{Pattern to look for. 
The default interpretation is a regular expression, as described in \link[stringi:stringi-search-regex]{stringi::stringi-search-regex}. Control options with \code{\link[=regex]{regex()}}. Match a fixed string (i.e. by comparing only bytes), using \code{\link[=fixed]{fixed()}}. This is fast, but approximate. Generally, for matching human text, you'll want \code{\link[=coll]{coll()}} which respects character matching rules for the specified locale. Match character, word, line and sentence boundaries with \code{\link[=boundary]{boundary()}}. An empty pattern, "", is equivalent to \code{boundary("character")}.} \item{negate}{If \code{TRUE}, return non-matching elements.} } \value{ A logical vector. } \description{ Vectorised over \code{string} and \code{pattern}. Equivalent to \code{grepl(pattern, x)}. See \code{\link[=str_which]{str_which()}} for an equivalent to \code{grep(pattern, x)}. } \examples{ fruit <- c("apple", "banana", "pear", "pinapple") str_detect(fruit, "a") str_detect(fruit, "^a") str_detect(fruit, "a$") str_detect(fruit, "b") str_detect(fruit, "[aeiou]") # Also vectorised over pattern str_detect("aecfg", letters) # Returns TRUE if the pattern do NOT match str_detect(fruit, "^p", negate = TRUE) } \seealso{ \code{\link[stringi:stri_detect]{stringi::stri_detect()}} which this function wraps, \code{\link[=str_subset]{str_subset()}} for a convenient wrapper around \code{x[str_detect(x, pattern)]} } stringr/man/str_order.Rd0000644000176200001440000000256613202620061015010 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/sort.R \name{str_order} \alias{str_order} \alias{str_sort} \title{Order or sort a character vector.} \usage{ str_order(x, decreasing = FALSE, na_last = TRUE, locale = "en", numeric = FALSE, ...) str_sort(x, decreasing = FALSE, na_last = TRUE, locale = "en", numeric = FALSE, ...) } \arguments{ \item{x}{A character vector to sort.} \item{decreasing}{A boolean. 
If \code{FALSE}, the default, sorts from lowest to highest; if \code{TRUE} sorts from highest to lowest.} \item{na_last}{Where should \code{NA} go? \code{TRUE} at the end, \code{FALSE} at the beginning, \code{NA} dropped.} \item{locale}{In which locale should the sorting occur? Defaults to the English. This ensures that code behaves the same way across platforms.} \item{numeric}{If \code{TRUE}, will sort digits numerically, instead of as strings.} \item{...}{Other options used to control sorting order. Passed on to \code{\link[stringi:stri_opts_collator]{stringi::stri_opts_collator()}}.} } \description{ Order or sort a character vector. } \examples{ str_order(letters) str_sort(letters) str_order(letters, locale = "haw") str_sort(letters, locale = "haw") x <- c("100a10", "100a5", "2b", "2a") str_sort(x) str_sort(x, numeric = TRUE) } \seealso{ \code{\link[stringi:stri_order]{stringi::stri_order()}} for the underlying implementation. } stringr/man/str_view.Rd0000644000176200001440000000356213427104326014656 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/view.R \name{str_view} \alias{str_view} \alias{str_view_all} \title{View HTML rendering of regular expression match.} \usage{ str_view(string, pattern, match = NA) str_view_all(string, pattern, match = NA) } \arguments{ \item{string}{Input vector. Either a character vector, or something coercible to one.} \item{pattern}{Pattern to look for. The default interpretation is a regular expression, as described in \link[stringi:stringi-search-regex]{stringi::stringi-search-regex}. Control options with \code{\link[=regex]{regex()}}. Match a fixed string (i.e. by comparing only bytes), using \code{\link[=fixed]{fixed()}}. This is fast, but approximate. Generally, for matching human text, you'll want \code{\link[=coll]{coll()}} which respects character matching rules for the specified locale. 
Match character, word, line and sentence boundaries with \code{\link[=boundary]{boundary()}}. An empty pattern, "", is equivalent to \code{boundary("character")}.} \item{match}{If \code{TRUE}, shows only strings that match the pattern. If \code{FALSE}, shows only the strings that don't match the pattern. Otherwise (the default, \code{NA}) displays both matches and non-matches.} } \description{ \code{str_view} shows the first match; \code{str_view_all} shows all the matches. To build regular expressions interactively, check out the \href{https://www.garrickadenbuie.com/project/regexplain/}{RegExplain RStudio addin}. } \examples{ str_view(c("abc", "def", "fgh"), "[aeiou]") str_view(c("abc", "def", "fgh"), "^") str_view(c("abc", "def", "fgh"), "..") # Show all matches with str_view_all str_view_all(c("abc", "def", "fgh"), "d|e") # Use match to control what is shown str_view(c("abc", "def", "fgh"), "d|e") str_view(c("abc", "def", "fgh"), "d|e", match = TRUE) str_view(c("abc", "def", "fgh"), "d|e", match = FALSE) } stringr/man/str_extract.Rd0000644000176200001440000000407613202620337015353 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/extract.r \name{str_extract} \alias{str_extract} \alias{str_extract_all} \title{Extract matching patterns from a string.} \usage{ str_extract(string, pattern) str_extract_all(string, pattern, simplify = FALSE) } \arguments{ \item{string}{Input vector. Either a character vector, or something coercible to one.} \item{pattern}{Pattern to look for. The default interpretation is a regular expression, as described in \link[stringi:stringi-search-regex]{stringi::stringi-search-regex}. Control options with \code{\link[=regex]{regex()}}. Match a fixed string (i.e. by comparing only bytes), using \code{\link[=fixed]{fixed()}}. This is fast, but approximate. Generally, for matching human text, you'll want \code{\link[=coll]{coll()}} which respects character matching rules for the specified locale. 
Match character, word, line and sentence boundaries with \code{\link[=boundary]{boundary()}}. An empty pattern, "", is equivalent to \code{boundary("character")}.} \item{simplify}{If \code{FALSE}, the default, returns a list of character vectors. If \code{TRUE} returns a character matrix.} } \value{ A character vector. } \description{ Vectorised over \code{string} and \code{pattern}. } \examples{ shopping_list <- c("apples x4", "bag of flour", "bag of sugar", "milk x2") str_extract(shopping_list, "\\\\d") str_extract(shopping_list, "[a-z]+") str_extract(shopping_list, "[a-z]{1,4}") str_extract(shopping_list, "\\\\b[a-z]{1,4}\\\\b") # Extract all matches str_extract_all(shopping_list, "[a-z]+") str_extract_all(shopping_list, "\\\\b[a-z]+\\\\b") str_extract_all(shopping_list, "\\\\d") # Simplify results into character matrix str_extract_all(shopping_list, "\\\\b[a-z]+\\\\b", simplify = TRUE) str_extract_all(shopping_list, "\\\\d", simplify = TRUE) # Extract all words str_extract_all("This is, suprisingly, a sentence.", boundary("word")) } \seealso{ \code{\link[=str_match]{str_match()}} to extract matched groups; \code{\link[stringi:stri_extract]{stringi::stri_extract()}} for the underlying implementation. } stringr/man/str_match.Rd0000644000176200001440000000301513202620403014757 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/match.r \name{str_match} \alias{str_match} \alias{str_match_all} \title{Extract matched groups from a string.} \usage{ str_match(string, pattern) str_match_all(string, pattern) } \arguments{ \item{string}{Input vector. Either a character vector, or something coercible to one.} \item{pattern}{Pattern to look for, as defined by an ICU regular expression. See \link[stringi:stringi-search-regex]{stringi::stringi-search-regex} for more details.} } \value{ For \code{str_match}, a character matrix. First column is the complete match, followed by one column for each capture group. 
For \code{str_match_all}, a list of character matrices. } \description{ Vectorised over \code{string} and \code{pattern}. } \examples{ strings <- c(" 219 733 8965", "329-293-8753 ", "banana", "595 794 7569", "387 287 6718", "apple", "233.398.9187 ", "482 952 3315", "239 923 8115 and 842 566 4692", "Work: 579-499-7527", "$1000", "Home: 543.355.3679") phone <- "([2-9][0-9]{2})[- .]([0-9]{3})[- .]([0-9]{4})" str_extract(strings, phone) str_match(strings, phone) # Extract/match all str_extract_all(strings, phone) str_match_all(strings, phone) x <- c(" ", " <>", "", "", NA) str_match(x, "<(.*?)> <(.*?)>") str_match_all(x, "<(.*?)>") str_extract(x, "<.*?>") str_extract_all(x, "<.*?>") } \seealso{ \code{\link[=str_extract]{str_extract()}} to extract the complete match, \code{\link[stringi:stri_match]{stringi::stri_match()}} for the underlying implementation. } stringr/man/modifiers.Rd0000644000176200001440000000710413413770102014765 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/modifiers.r \name{modifiers} \alias{modifiers} \alias{fixed} \alias{coll} \alias{regex} \alias{boundary} \title{Control matching behaviour with modifier functions.} \usage{ fixed(pattern, ignore_case = FALSE) coll(pattern, ignore_case = FALSE, locale = "en", ...) regex(pattern, ignore_case = FALSE, multiline = FALSE, comments = FALSE, dotall = FALSE, ...) boundary(type = c("character", "line_break", "sentence", "word"), skip_word_none = NA, ...) } \arguments{ \item{pattern}{Pattern to modify behaviour.} \item{ignore_case}{Should case differences be ignored in the match?} \item{locale}{Locale to use for comparisons. See \code{\link[stringi:stri_locale_list]{stringi::stri_locale_list()}} for all possible options. 
Defaults to "en" (English) to ensure that the default collation is consistent across platforms.} \item{...}{Other less frequently used arguments passed on to \code{\link[stringi:stri_opts_collator]{stringi::stri_opts_collator()}}, \code{\link[stringi:stri_opts_regex]{stringi::stri_opts_regex()}}, or \code{\link[stringi:stri_opts_brkiter]{stringi::stri_opts_brkiter()}}} \item{multiline}{If \code{TRUE}, \code{$} and \code{^} match the beginning and end of each line. If \code{FALSE}, the default, only match the start and end of the input.} \item{comments}{If \code{TRUE}, white space and comments beginning with \code{#} are ignored. Escape literal spaces with \code{\\ }.} \item{dotall}{If \code{TRUE}, \code{.} will also match line terminators.} \item{type}{Boundary type to detect. \describe{ \item{\code{character}}{Every character is a boundary.} \item{\code{line_break}}{Boundaries are places where it is acceptable to have a line break in the current locale.} \item{\code{sentence}}{The beginnings and ends of sentences are boundaries, using intelligent rules to avoid counting abbreviations (\href{https://www.unicode.org/reports/tr29/#Sentence_Boundaries}{details}).} \item{\code{word}}{The beginnings and ends of words are boundaries.} }} \item{skip_word_none}{Ignore "words" that don't contain any characters or numbers - i.e. punctuation. Default \code{NA} will skip such "words" only when splitting on \code{word} boundaries.} } \description{ \describe{ \item{fixed}{Compare literal bytes in the string. This is very fast, but not usually what you want for non-ASCII character sets.} \item{coll}{Compare strings respecting standard collation rules.} \item{regex}{The default. 
Uses ICU regular expressions.} \item{boundary}{Match boundaries between things.} } } \examples{ pattern <- "a.b" strings <- c("abb", "a.b") str_detect(strings, pattern) str_detect(strings, fixed(pattern)) str_detect(strings, coll(pattern)) # coll() is useful for locale-aware case-insensitive matching i <- c("I", "\\u0130", "i") i str_detect(i, fixed("i", TRUE)) str_detect(i, coll("i", TRUE)) str_detect(i, coll("i", TRUE, locale = "tr")) # Word boundaries words <- c("These are some words.") str_count(words, boundary("word")) str_split(words, " ")[[1]] str_split(words, boundary("word"))[[1]] # Regular expression variations str_extract_all("The Cat in the Hat", "[a-z]+") str_extract_all("The Cat in the Hat", regex("[a-z]+", TRUE)) str_extract_all("a\\nb\\nc", "^.") str_extract_all("a\\nb\\nc", regex("^.", multiline = TRUE)) str_extract_all("a\\nb\\nc", "a.") str_extract_all("a\\nb\\nc", regex("a.", dotall = TRUE)) } \seealso{ \code{\link[=str_wrap]{str_wrap()}} for breaking text to form paragraphs \code{\link[stringi:stringi-search-boundaries]{stringi::stringi-search-boundaries}} for more detail on the various boundaries } stringr/man/str_interp.Rd0000644000176200001440000000405513202642747015210 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/interp.R \name{str_interp} \alias{str_interp} \title{String interpolation.} \usage{ str_interp(string, env = parent.frame()) } \arguments{ \item{string}{A template character string. This function is not vectorised: a character vector will be collapsed into a single string.} \item{env}{The environment in which to evaluate the expressions.} } \value{ An interpolated character string. } \description{ String interpolation is a useful way of specifying a character string which depends on values in a certain environment. It allows for string creation which is easier to read and write when compared to using e.g. \code{\link[=paste]{paste()}} or \code{\link[=sprintf]{sprintf()}}. 
The (template) string can include expression placeholders of the form \code{${expression}} or \code{$[format]{expression}}, where expressions are valid R expressions that can be evaluated in the given environment, and \code{format} is a format specification valid for use with \code{\link[=sprintf]{sprintf()}}. } \examples{ # Using values from the environment, and some formats user_name <- "smbache" amount <- 6.656 account <- 1337 str_interp("User ${user_name} (account $[08d]{account}) has $$[.2f]{amount}.") # Nested brace pairs work inside expressions too, and any braces can be # placed outside the expressions. str_interp("Works with } nested { braces too: $[.2f]{{{2 + 2}*{amount}}}") # Values can also come from a list str_interp( "One value, ${value1}, and then another, ${value2*2}.", list(value1 = 10, value2 = 20) ) # Or a data frame str_interp( "Values are $[.2f]{max(Sepal.Width)} and $[.2f]{min(Sepal.Width)}.", iris ) # Use a vector when the string is long: max_char <- 80 str_interp(c( "This particular line is so long that it is hard to write ", "without breaking the ${max_char}-char barrier!" )) } \seealso{ \code{\link[=str_glue]{str_glue()}} and \code{\link[=str_glue_data]{str_glue_data()}} for alternative approaches to the same problem. 
} \author{ Stefan Milton Bache } \keyword{internal} stringr/man/str_glue.Rd0000644000176200001440000000377413341254174014647 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/glue.R \name{str_glue} \alias{str_glue} \alias{str_glue_data} \title{Format and interpolate a string with glue} \usage{ str_glue(..., .sep = "", .envir = parent.frame()) str_glue_data(.x, ..., .sep = "", .envir = parent.frame(), .na = "NA") } \arguments{ \item{...}{[\code{expressions}]\cr Expressions string(s) to format, multiple inputs are concatenated together before formatting.} \item{.sep}{[\code{character(1)}: \sQuote{""}]\cr Separator used to separate elements.} \item{.envir}{[\code{environment}: \code{parent.frame()}]\cr Environment to evaluate each expression in. Expressions are evaluated from left to right. If \code{.x} is an environment, the expressions are evaluated in that environment and \code{.envir} is ignored.} \item{.x}{[\code{listish}]\cr An environment, list or data frame used to lookup values.} \item{.na}{[\code{character(1)}: \sQuote{NA}]\cr Value to replace NA values with. If \code{NULL} missing values are propagated, that is an \code{NA} result will cause \code{NA} output. Otherwise the value is replaced by the value of \code{.na}.} } \description{ These functions are wrappers around \code{\link[glue:glue]{glue::glue()}} and \code{\link[glue:glue_data]{glue::glue_data()}}, which provide a powerful and elegant syntax for interpolating strings. These wrappers provide a small set of the full options. Use the functions directly from glue for more control. } \examples{ name <- "Fred" age <- 50 anniversary <- as.Date("1991-10-12") str_glue( "My name is {name}, ", "my age next year is {age + 1}, ", "and my anniversary is {format(anniversary, '\%A, \%B \%d, \%Y')}." 
) # single braces can be inserted by doubling them str_glue("My name is {name}, not {{name}}.") # You can also used named arguments str_glue( "My name is {name}, ", "and my age next year is {age + 1}.", name = "Joe", age = 40 ) # `str_glue_data()` is useful in data pipelines mtcars \%>\% str_glue_data("{rownames(.)} has {hp} hp") } stringr/man/str_starts.Rd0000644000176200001440000000254613427104326015225 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/detect.r \name{str_starts} \alias{str_starts} \alias{str_ends} \title{Detect the presence or absence of a pattern at the beginning or end of a string.} \usage{ str_starts(string, pattern, negate = FALSE) str_ends(string, pattern, negate = FALSE) } \arguments{ \item{string}{Input vector. Either a character vector, or something coercible to one.} \item{pattern}{Pattern with which the string starts or ends. The default interpretation is a regular expression, as described in \link[stringi:stringi-search-regex]{stringi::stringi-search-regex}. Control options with \code{\link[=regex]{regex()}}. Match a fixed string (i.e. by comparing only bytes), using \code{\link[=fixed]{fixed()}}. This is fast, but approximate. Generally, for matching human text, you'll want \code{\link[=coll]{coll()}} which respects character matching rules for the specified locale.} \item{negate}{If \code{TRUE}, return non-matching elements.} } \value{ A logical vector. } \description{ Vectorised over \code{string} and \code{pattern}. } \examples{ fruit <- c("apple", "banana", "pear", "pinapple") str_starts(fruit, "p") str_starts(fruit, "p", negate = TRUE) str_ends(fruit, "e") str_ends(fruit, "e", negate = TRUE) } \seealso{ \code{\link[=str_detect]{str_detect()}} which this function wraps when pattern is regex. 
} stringr/LICENSE0000644000176200001440000004325413032561037012757 0ustar liggesusers GNU GENERAL PUBLIC LICENSE Version 2, June 1991 Copyright (C) 1989, 1991 Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed. Preamble The licenses for most software are designed to take away your freedom to share and change it. By contrast, the GNU General Public License is intended to guarantee your freedom to share and change free software--to make sure the software is free for all its users. This General Public License applies to most of the Free Software Foundation's software and to any other program whose authors commit to using it. (Some other Free Software Foundation software is covered by the GNU Lesser General Public License instead.) You can apply it to your programs, too. When we speak of free software, we are referring to freedom, not price. Our General Public Licenses are designed to make sure that you have the freedom to distribute copies of free software (and charge for this service if you wish), that you receive source code or can get it if you want it, that you can change the software or use pieces of it in new free programs; and that you know you can do these things. To protect your rights, we need to make restrictions that forbid anyone to deny you these rights or to ask you to surrender the rights. These restrictions translate to certain responsibilities for you if you distribute copies of the software, or if you modify it. For example, if you distribute copies of such a program, whether gratis or for a fee, you must give the recipients all the rights that you have. You must make sure that they, too, receive or can get the source code. And you must show them these terms so they know their rights. 
We protect your rights with two steps: (1) copyright the software, and (2) offer you this license which gives you legal permission to copy, distribute and/or modify the software. Also, for each author's protection and ours, we want to make certain that everyone understands that there is no warranty for this free software. If the software is modified by someone else and passed on, we want its recipients to know that what they have is not the original, so that any problems introduced by others will not reflect on the original authors' reputations. Finally, any free program is threatened constantly by software patents. We wish to avoid the danger that redistributors of a free program will individually obtain patent licenses, in effect making the program proprietary. To prevent this, we have made it clear that any patent must be licensed for everyone's free use or not licensed at all. The precise terms and conditions for copying, distribution and modification follow. GNU GENERAL PUBLIC LICENSE TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 0. This License applies to any program or other work which contains a notice placed by the copyright holder saying it may be distributed under the terms of this General Public License. The "Program", below, refers to any such program or work, and a "work based on the Program" means either the Program or any derivative work under copyright law: that is to say, a work containing the Program or a portion of it, either verbatim or with modifications and/or translated into another language. (Hereinafter, translation is included without limitation in the term "modification".) Each licensee is addressed as "you". Activities other than copying, distribution and modification are not covered by this License; they are outside its scope. 
The act of running the Program is not restricted, and the output from the Program is covered only if its contents constitute a work based on the Program (independent of having been made by running the Program). Whether that is true depends on what the Program does. 1. You may copy and distribute verbatim copies of the Program's source code as you receive it, in any medium, provided that you conspicuously and appropriately publish on each copy an appropriate copyright notice and disclaimer of warranty; keep intact all the notices that refer to this License and to the absence of any warranty; and give any other recipients of the Program a copy of this License along with the Program. You may charge a fee for the physical act of transferring a copy, and you may at your option offer warranty protection in exchange for a fee. 2. You may modify your copy or copies of the Program or any portion of it, thus forming a work based on the Program, and copy and distribute such modifications or work under the terms of Section 1 above, provided that you also meet all of these conditions: a) You must cause the modified files to carry prominent notices stating that you changed the files and the date of any change. b) You must cause any work that you distribute or publish, that in whole or in part contains or is derived from the Program or any part thereof, to be licensed as a whole at no charge to all third parties under the terms of this License. c) If the modified program normally reads commands interactively when run, you must cause it, when started running for such interactive use in the most ordinary way, to print or display an announcement including an appropriate copyright notice and a notice that there is no warranty (or else, saying that you provide a warranty) and that users may redistribute the program under these conditions, and telling the user how to view a copy of this License. 
(Exception: if the Program itself is interactive but does not normally print such an announcement, your work based on the Program is not required to print an announcement.) These requirements apply to the modified work as a whole. If identifiable sections of that work are not derived from the Program, and can be reasonably considered independent and separate works in themselves, then this License, and its terms, do not apply to those sections when you distribute them as separate works. But when you distribute the same sections as part of a whole which is a work based on the Program, the distribution of the whole must be on the terms of this License, whose permissions for other licensees extend to the entire whole, and thus to each and every part regardless of who wrote it. Thus, it is not the intent of this section to claim rights or contest your rights to work written entirely by you; rather, the intent is to exercise the right to control the distribution of derivative or collective works based on the Program. In addition, mere aggregation of another work not based on the Program with the Program (or with a work based on the Program) on a volume of a storage or distribution medium does not bring the other work under the scope of this License. 3. 
You may copy and distribute the Program (or a work based on it, under Section 2) in object code or executable form under the terms of Sections 1 and 2 above provided that you also do one of the following: a) Accompany it with the complete corresponding machine-readable source code, which must be distributed under the terms of Sections 1 and 2 above on a medium customarily used for software interchange; or, b) Accompany it with a written offer, valid for at least three years, to give any third party, for a charge no more than your cost of physically performing source distribution, a complete machine-readable copy of the corresponding source code, to be distributed under the terms of Sections 1 and 2 above on a medium customarily used for software interchange; or, c) Accompany it with the information you received as to the offer to distribute corresponding source code. (This alternative is allowed only for noncommercial distribution and only if you received the program in object code or executable form with such an offer, in accord with Subsection b above.) The source code for a work means the preferred form of the work for making modifications to it. For an executable work, complete source code means all the source code for all modules it contains, plus any associated interface definition files, plus the scripts used to control compilation and installation of the executable. However, as a special exception, the source code distributed need not include anything that is normally distributed (in either source or binary form) with the major components (compiler, kernel, and so on) of the operating system on which the executable runs, unless that component itself accompanies the executable. 
If distribution of executable or object code is made by offering access to copy from a designated place, then offering equivalent access to copy the source code from the same place counts as distribution of the source code, even though third parties are not compelled to copy the source along with the object code. 4. You may not copy, modify, sublicense, or distribute the Program except as expressly provided under this License. Any attempt otherwise to copy, modify, sublicense or distribute the Program is void, and will automatically terminate your rights under this License. However, parties who have received copies, or rights, from you under this License will not have their licenses terminated so long as such parties remain in full compliance. 5. You are not required to accept this License, since you have not signed it. However, nothing else grants you permission to modify or distribute the Program or its derivative works. These actions are prohibited by law if you do not accept this License. Therefore, by modifying or distributing the Program (or any work based on the Program), you indicate your acceptance of this License to do so, and all its terms and conditions for copying, distributing or modifying the Program or works based on it. 6. Each time you redistribute the Program (or any work based on the Program), the recipient automatically receives a license from the original licensor to copy, distribute or modify the Program subject to these terms and conditions. You may not impose any further restrictions on the recipients' exercise of the rights granted herein. You are not responsible for enforcing compliance by third parties to this License. 7. If, as a consequence of a court judgment or allegation of patent infringement or for any other reason (not limited to patent issues), conditions are imposed on you (whether by court order, agreement or otherwise) that contradict the conditions of this License, they do not excuse you from the conditions of this License. 
If you cannot distribute so as to satisfy simultaneously your obligations under this License and any other pertinent obligations, then as a consequence you may not distribute the Program at all. For example, if a patent license would not permit royalty-free redistribution of the Program by all those who receive copies directly or indirectly through you, then the only way you could satisfy both it and this License would be to refrain entirely from distribution of the Program. If any portion of this section is held invalid or unenforceable under any particular circumstance, the balance of the section is intended to apply and the section as a whole is intended to apply in other circumstances. It is not the purpose of this section to induce you to infringe any patents or other property right claims or to contest validity of any such claims; this section has the sole purpose of protecting the integrity of the free software distribution system, which is implemented by public license practices. Many people have made generous contributions to the wide range of software distributed through that system in reliance on consistent application of that system; it is up to the author/donor to decide if he or she is willing to distribute software through any other system and a licensee cannot impose that choice. This section is intended to make thoroughly clear what is believed to be a consequence of the rest of this License. 8. If the distribution and/or use of the Program is restricted in certain countries either by patents or by copyrighted interfaces, the original copyright holder who places the Program under this License may add an explicit geographical distribution limitation excluding those countries, so that distribution is permitted only in or among countries not thus excluded. In such case, this License incorporates the limitation as if written in the body of this License. 9. 
The Free Software Foundation may publish revised and/or new versions of the General Public License from time to time. Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns. Each version is given a distinguishing version number. If the Program specifies a version number of this License which applies to it and "any later version", you have the option of following the terms and conditions either of that version or of any later version published by the Free Software Foundation. If the Program does not specify a version number of this License, you may choose any version ever published by the Free Software Foundation. 10. If you wish to incorporate parts of the Program into other free programs whose distribution conditions are different, write to the author to ask for permission. For software which is copyrighted by the Free Software Foundation, write to the Free Software Foundation; we sometimes make exceptions for this. Our decision will be guided by the two goals of preserving the free status of all derivatives of our free software and of promoting the sharing and reuse of software generally. NO WARRANTY 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 12. 
IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. END OF TERMS AND CONDITIONS How to Apply These Terms to Your New Programs If you develop a new program, and you want it to be of the greatest possible use to the public, the best way to achieve this is to make it free software which everyone can redistribute and change under these terms. To do so, attach the following notices to the program. It is safest to attach them to the start of each source file to most effectively convey the exclusion of warranty; and each file should have at least the "copyright" line and a pointer to where the full notice is found. Copyright (C) This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. Also add information on how to contact you by electronic and paper mail. 
If the program is interactive, make it output a short notice like this when it starts in an interactive mode: Gnomovision version 69, Copyright (C) year name of author Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. This is free software, and you are welcome to redistribute it under certain conditions; type `show c' for details. The hypothetical commands `show w' and `show c' should show the appropriate parts of the General Public License. Of course, the commands you use may be called something other than `show w' and `show c'; they could even be mouse-clicks or menu items--whatever suits your program. You should also get your employer (if you work as a programmer) or your school, if any, to sign a "copyright disclaimer" for the program, if necessary. Here is a sample; alter the names: Yoyodyne, Inc., hereby disclaims all copyright interest in the program `Gnomovision' (which makes passes at compilers) written by James Hacker. , 1 April 1989 Ty Coon, President of Vice This General Public License does not permit incorporating your program into proprietary programs. If your program is a subroutine library, you may consider it more useful to permit linking proprietary applications with the library. If this is what you want to do, use the GNU Lesser General Public License instead of this License.