stringi/ 0000755 0001762 0000144 00000000000 14771247052 011742 5 ustar ligges users stringi/MD5 0000644 0001762 0000144 00000227026 14771247052 012263 0 ustar ligges users c11040ccc16441809adabbcef6a0e1d8 *DESCRIPTION
5b9bded89d382625aa5194610ef5dc0a *INSTALL
0b274d86f0944f0964b5cbe89f3e1529 *LICENSE
fbccc8447bec59bda28786a6a59b3442 *NAMESPACE
a4bf945a499f378a0a469204291c105b *NEWS
15bf7790e61900b5d9e6b920a05a19ab *R/ICU_settings.R
d97ec069ec10c409e150012b944b3b1b *R/compare.R
938d4c3b5ba033957d059090acf1201e *R/encoding.R
adbd123a02c59f02eeee9eadb13296e1 *R/encoding_conversion.R
02dc3549a8a67a19f2d37bbd79c88a9a *R/encoding_detection.R
4e61d55d8d20c9f997850b3b86637be6 *R/encoding_management.R
1fba31d0a3fde174f089bca14deb1580 *R/escape.R
d88252597a52b12d42d64df95ef1554e *R/files.R
b7361e0556de786d0f9e5fed7ad4dbe8 *R/install.R
66f83400fc39b73e9c5a17c89dac7502 *R/internal_prepare_arg.R
1e2508ee764b3d12a68ea300a03b348f *R/internal_test.R
ddf7920a17df98717a5a8e73c45e228c *R/join.R
9e5a917d9ebfd2c85bc84f18286f306d *R/length.R
3f8b72ec3a6a7a943a0e9dd4d74e4aa1 *R/locale.R
96de209032f3e86d7eef0753ddd19afc *R/locale_management.R
77cd1d9086073a133118c9d610093f17 *R/opts.R
a3dae305f6a8da323a6a8721a61993c1 *R/pad.R
8a5afc91d4ed94569224cc453fedbca9 *R/random.R
41fd0b9bdcb6fd8e2a430a06bc43d47a *R/reverse.R
654242c9eb4adac6376c7dc30318e832 *R/search.R
9cf8f04e46b0747298d2b4fce0066ad8 *R/search_count_4.R
a1a7b9194656386c161e6153d99af703 *R/search_count_bound.R
75bca32414c213f7cf393bd71b8aff04 *R/search_detect_4.R
87294d6151a91130e7219afc551ca7a3 *R/search_extract_4.R
81940054ff415c8a7977f893c65deda3 *R/search_extract_bound.R
909363b0676ab92e279b9c6ef41d51fc *R/search_locate_4.R
81d2c846df4d8c4ab7cb81a12e677d96 *R/search_locate_bound.R
ae12bed93e94cb2d516659d6a298229c *R/search_match_4.R
36c3ec95d3c55f325c71e6aa4666cebf *R/search_replace_4.R
4db6b32065ca48cbc1c7e44898ce0d15 *R/search_split_4.R
84874e582b4bfc60bdf959c280e26702 *R/search_split_bound.R
6c4d8f07cd11a2a882cfc601ea09e86b *R/search_startsendswith_4.R
770a5ea50a71446f86986c609d8aae0e *R/search_subset_4.R
3f5a99a1033d30881ab2ccc9a72a458b *R/sort.R
6bff0145897f7f39aad25b807a860223 *R/sprintf.R
11992cbf4f72d914075190bdf855191b *R/stats.R
600b4ff16b35ec57f7214acbebda28f3 *R/stringi_package.R
cc7b399f1a5d9963e1d8f00d263e3b94 *R/sub.R
1496986a08a1b940534919d0e046ea5b *R/time_calendar.R
115d819cda273048a7785a2eab46464a *R/time_format.R
2aef5f86d5b388da2fff4b16db19daa8 *R/time_symbols.R
4b95cd483be9fc564a057f2b577c1891 *R/time_zone.R
f854804f31d27b1a8f4eaacab241da36 *R/trans_casemap.R
2ffdd4e62fbb0f5f287473cb61783431 *R/trans_normalization.R
4cdcf592ea1a768f84fa8d427ce690dd *R/trans_other.R
a6f889b5a831281623a8f983aabde151 *R/trans_transliterate.R
0d802c7e9daa6c729065d57d962b0128 *R/trim.R
cb18ecf9188d501a56fe13e04013fc5e *R/utils.R
bfe8512f21f7dcddf02b8dd277ffa46f *R/wrap.R
4454e632feb75ce4f508807d50b3b12f *build/partial.rdb
e786a8e3195a353a5aa9105b1b18fb29 *cleanup
994bf59ab98cbce51323d0179a396e8e *configure
1f85dbfcffef6fa6a62a05e3ff85458c *configure.ac
5164e451ddb1b73c1052e9aae743767d *configure.win
0a51dcf4e38b057ee084a85644c6e3a7 *inst/AUTHORS
11940a3690625fea0670cbee73ede58a *inst/CITATION
f281a9a4d5d2a3827941b01abff8dcd5 *man/about_arguments.Rd
facc1aea6b469d0fb66839e6e47dc608 *man/about_encoding.Rd
aef6a1a3225f905c91775bfd62f7bd38 *man/about_locale.Rd
66b0a115b3f4385ab2b20c73e2df001e *man/about_search.Rd
27e0cc3f55c11e8751f316db9524731c *man/about_search_boundaries.Rd
2a41f016ce9daf72523fd18b6f4bd507 *man/about_search_charclass.Rd
bda5beb1839985bba3813a66de887344 *man/about_search_coll.Rd
1bdcbbd57a04b019e98d6917dee1997c *man/about_search_fixed.Rd
eace196448388b85525817253daaeab3 *man/about_search_regex.Rd
681f2c39e73f953d03338d95303267f8 *man/operator_add.Rd
95bc60262e823fbe7eb72e321eab3806 *man/operator_compare.Rd
44b518b7ca674e27b558ed30e09bca50 *man/operator_dollar.Rd
fe12fe54f17be287d1592f5990969a6b *man/stri_compare.Rd
9bc0b4a887d6fd4f8b568a2c21718900 *man/stri_count.Rd
4eb69343856500b71ee3e4136e48c265 *man/stri_count_boundaries.Rd
61d4e9ee5515276335670f859df59f69 *man/stri_datetime_add.Rd
012a6b0ea16680030908916840be3ef4 *man/stri_datetime_create.Rd
63229754aac10e796e6c2042587f969f *man/stri_datetime_fields.Rd
845705290346bb17740337ce2ab2ba57 *man/stri_datetime_format.Rd
94b1dceafe07d86c1a07ccbc23f6c473 *man/stri_datetime_fstr.Rd
c6db33c91376938541c19f2f5c2415c5 *man/stri_datetime_now.Rd
9dcfb5c4422b2b4428ac83c0a647bb87 *man/stri_datetime_symbols.Rd
8fb721a5777d8f81ae42a0a859e56730 *man/stri_detect.Rd
ee907e61319b13c0eb30ed608efc5e64 *man/stri_dup.Rd
0ee4d43c8c0023ef5f3c6b4a6a9d9f7e *man/stri_duplicated.Rd
8139a923d436200a04c19054290be31f *man/stri_enc_detect.Rd
e597449d6687a03932c13071b8a47816 *man/stri_enc_detect2.Rd
dbf2f340a3bac54920f7fa486de4890f *man/stri_enc_fromutf32.Rd
ca659ca78d219cac17d4605d3935d6f3 *man/stri_enc_info.Rd
582d9c1729decaa4c63f8cafbb367d8a *man/stri_enc_isascii.Rd
96ea8addd014e76bd261b55d94a31149 *man/stri_enc_isutf16.Rd
d8f4dbbd72de4e1684d79c5863aa6c43 *man/stri_enc_isutf8.Rd
fd211c6321254aaec9e050cfc8e3fdf7 *man/stri_enc_list.Rd
196d03d91964c93da50c61b785c4a07f *man/stri_enc_mark.Rd
3d76974bfedd87525b3690e4f77eee58 *man/stri_enc_set.Rd
5e2ee6edbaa990278f09bdecc785ea5a *man/stri_enc_toascii.Rd
35126c3527369f3c485ac742c9a38b16 *man/stri_enc_tonative.Rd
6a26e279c6bd7b40038367933ebb933f *man/stri_enc_toutf32.Rd
ee14797f45b703936e5cdac325465e68 *man/stri_enc_toutf8.Rd
f2f5fe4b1742c649de2134dac54b8c81 *man/stri_encode.Rd
09e1574a4a464aa9dc871fc319101590 *man/stri_escape_unicode.Rd
ec85f99ac41fb93b15c98fbb81791306 *man/stri_extract.Rd
86d4f8c26a36e9e62437c3ebc4226d77 *man/stri_extract_boundaries.Rd
a42cb9f6b9e733ff77a30d5f80fab5ea *man/stri_flatten.Rd
8b8c786f646623ce465c3f68a16a543a *man/stri_info.Rd
bd47458781c7368b236c63551277291f *man/stri_isempty.Rd
36c34f25eba4fc6e36bca3519d72010f *man/stri_join.Rd
6879e164dbace74170a0eca74b7ef495 *man/stri_join_list.Rd
0c11e6c3d21c154c076b01a42fd2ffb9 *man/stri_length.Rd
1fc9ab81a1adc8b0a5513625f4765900 *man/stri_list2matrix.Rd
3ce00aa1511217857a1272822916aad7 *man/stri_locale_info.Rd
9554cf49bd74615b59d06dae8f0a4bd4 *man/stri_locale_list.Rd
66d17af3574b16281f3b0e6539c5f8cf *man/stri_locale_set.Rd
d2e839e43f0b35542c27f094ca0a565d *man/stri_locate.Rd
0a1911c2411b0df9f802cc3ee337c14f *man/stri_locate_boundaries.Rd
89e230567b24a55c42a4b5614913d257 *man/stri_match.Rd
4d41393c9df2f66ebe8dfdc8dbb7cd86 *man/stri_na2empty.Rd
38d4763eaccee4cca6b016d8f42f3a21 *man/stri_numbytes.Rd
bbc11a11ab42c552aa2021cee7eb9ec9 *man/stri_opts_brkiter.Rd
fca73c9f559a4ed799dc04f59095eb51 *man/stri_opts_collator.Rd
e34a2aeb898b34837ea72f5415c04338 *man/stri_opts_fixed.Rd
1d61606edd40e0e9a315b7c88a39f691 *man/stri_opts_regex.Rd
f775ccf0ee34e0040447a35e506fcd37 *man/stri_order.Rd
7081ba7863589767e062d5b198df2a19 *man/stri_pad.Rd
98874345b46429d6e7d7c7851183ffb4 *man/stri_rand_lipsum.Rd
86a5426e03c808b1f38275505251370c *man/stri_rand_shuffle.Rd
3ef378b335ff09d5d3c54f8785d16464 *man/stri_rand_strings.Rd
5b0939177b1a8fe25bf72c3eff54afa5 *man/stri_rank.Rd
2355b24ebb992222f5dd5639941575c0 *man/stri_read_lines.Rd
6cf8d9fbf46427d4190ef1e8926db066 *man/stri_read_raw.Rd
739abfe8d7a3e8fa59a661649dd1de57 *man/stri_remove_empty.Rd
8505905d5d0f6fd705211c90a5ed1bc3 *man/stri_replace.Rd
6fe4bf6442b3117a9828a6536989add2 *man/stri_replace_na.Rd
acbd49ed0f7c14780001f2deb734b7f5 *man/stri_replace_rstr.Rd
2779cd55fbd28cdf7a1057eb5e166176 *man/stri_reverse.Rd
4f5b80e5ed84c616229451ffb7c0c981 *man/stri_sort.Rd
31f7dcf193bf440a6597daced93f555d *man/stri_sort_key.Rd
3481f15d7fdd574c80892c0e7896a6b9 *man/stri_split.Rd
ce8aa31b3021a3adf8a8e74d8ffad042 *man/stri_split_boundaries.Rd
0ef215f89103cadb224636f972862695 *man/stri_split_lines.Rd
b519bc5d01579fef7396ca746ae7cd5e *man/stri_sprintf.Rd
4d2b7488aa8f69a3b97fbb520017a60a *man/stri_startsendswith.Rd
ff4e32a409b836191c65cfb83cbdedf8 *man/stri_stats_general.Rd
f43f3590a07eba0c4457551e8a38ecbf *man/stri_stats_latex.Rd
e29c03d4df8979754f7e4a12458fea57 *man/stri_sub.Rd
18957f87ce34a7ed312fac3cdb6c497c *man/stri_sub_all.Rd
4347c5de9c076eb8289ff6da1a366a66 *man/stri_subset.Rd
93276498808607079fa908268fedffc4 *man/stri_timezone_info.Rd
b2b6541f69bcc7813fa24f4dbe5d757a *man/stri_timezone_list.Rd
4b821559050165e62aec074304231247 *man/stri_timezone_set.Rd
abfe9b51d269dcfd3030f4731513c0b1 *man/stri_trans_casemap.Rd
ae2e432a4593322fc458e90000cf6290 *man/stri_trans_char.Rd
d6f92f2a49e46d2f9be7a25450de21cd *man/stri_trans_general.Rd
1580f614404500b90320f4091214d9c0 *man/stri_trans_list.Rd
b6bc64dab3bdaf4c09cdcd79accf05cd *man/stri_trans_nf.Rd
45d5b3bfdc138615c0169d902b2263c4 *man/stri_trim.Rd
0d8f35b569539fcc95bd12bfef50b643 *man/stri_unescape_unicode.Rd
1b175ce46b12eb8655e23bae91899cc0 *man/stri_unique.Rd
21d0d8d4a922e82733b599963e939e05 *man/stri_width.Rd
71ed88e3597f646c586ebf2e033f4003 *man/stri_wrap.Rd
49ebee3c32f7fccc0983a04a6980206a *man/stri_write_lines.Rd
e766e5a948beba086527537587e85058 *man/stringi-package.Rd
027586a16c6ea87f2fb09ecbaee5d132 *src/Makevars.in
285cf51dcc3b31e1866742ff8f552ac8 *src/Makevars.win
a47d0b87576c49d306558f276511d5d6 *src/icu74/LICENSE
5f7b2dd77c1660634264fc3210664da9 *src/icu74/common/appendable.cpp
cee4b1e5cd196870283f2d9dd856a3d7 *src/icu74/common/bmpset.cpp
e1bd25b7cb9b8bd18323925af7be0b29 *src/icu74/common/bmpset.h
07e65db7a78f0c126244a1f1596f1519 *src/icu74/common/brkeng.cpp
e4482ff43625cee25d1709462b6bb066 *src/icu74/common/brkeng.h
aa7938e9a0e00426fdaffc50a317384c *src/icu74/common/brkiter.cpp
7f218959d375deb386b8487723eb4ca2 *src/icu74/common/bytesinkutil.cpp
63ef8f0eb4ee82dc37286d6f5bec5510 *src/icu74/common/bytesinkutil.h
1ffa481b943f6d2d16df024b15321dd2 *src/icu74/common/bytestream.cpp
4fec3a2364e293bfed68d2059e96fa60 *src/icu74/common/bytestrie.cpp
9d34bb7529e5e85db3846149f72e74fb *src/icu74/common/bytestriebuilder.cpp
eec840cb56dd0175ebe0bad6e6624cf2 *src/icu74/common/bytestrieiterator.cpp
5b4f1ae8af9658c3cff0fb411a0bbe73 *src/icu74/common/caniter.cpp
26ce62cdc123a6aa954d0825b4656fcf *src/icu74/common/capi_helper.h
0491d663d61b316c636c20eb14efb339 *src/icu74/common/characterproperties.cpp
7a06610ee4e501d7a6a43a3e392f2eb9 *src/icu74/common/chariter.cpp
5958b5a0b86c9bb79097afbbdc07d658 *src/icu74/common/charstr.cpp
1c5820d511ec39d2ffa24f558754aa21 *src/icu74/common/charstr.h
60539b2a360dd616050daabf0d86fa8f *src/icu74/common/charstrmap.h
56ffd680c0ff86d15828075edf24ff07 *src/icu74/common/cmemory.cpp
abb67941c632bced36e6a4ab4e51ff8d *src/icu74/common/cmemory.h
f20cfd96a1071bb0d9e405cbb8a9e573 *src/icu74/common/cpputils.h
36a23acf28a001f6d3cbad39f38840a7 *src/icu74/common/cstr.cpp
1734c10cd6f567a344a713fd13b6acb3 *src/icu74/common/cstr.h
607e9b62ea855cf96e148756cae39220 *src/icu74/common/cstring.cpp
2f0d93ae0ec0c9ae5b59fe7d28eee3d9 *src/icu74/common/cstring.h
6796dd6d8e6e96d1a7b151b6b095847e *src/icu74/common/cwchar.cpp
d24c0320118915cc2594222412902ce4 *src/icu74/common/cwchar.h
c289eb6c66006c1b1d5832ff8ade8587 *src/icu74/common/dictbe.cpp
297753dacbf0974f3ee4b5fda7a03a9a *src/icu74/common/dictbe.h
e27712eb92c95bcfa9a77ee703fabe8f *src/icu74/common/dictionarydata.cpp
ca105277d120deccaada6f6a5cf17906 *src/icu74/common/dictionarydata.h
4ec2b76aa91e236134e14754aec023fb *src/icu74/common/dtintrv.cpp
c4ec0d188412acad9d36846e8b37be29 *src/icu74/common/edits.cpp
25cd354b078228b5f726d2b12058f4d2 *src/icu74/common/emojiprops.cpp
524ea116206b67d476d8f8f66f993a44 *src/icu74/common/emojiprops.h
d0599f623f055b96f0fc492bb12df8ec *src/icu74/common/errorcode.cpp
d1400cf563a683370d7a4030eac16f9c *src/icu74/common/filteredbrk.cpp
b159de30753bfee01022329494b9b3e0 *src/icu74/common/filterednormalizer2.cpp
3a0890ef8285209b449cd874b0537bfd *src/icu74/common/hash.h
6a57bccc095f3b343e71a3aff9e4ce57 *src/icu74/common/icudataver.cpp
ff31826df391950cc39bd268e86c78d4 *src/icu74/common/icuplug.cpp
916cbee9d0df4a2cd1ee5a4cbc55e61b *src/icu74/common/icuplugimp.h
3eca5df0d53dbfd755596932ea6e1cb0 *src/icu74/common/loadednormalizer2impl.cpp
68bd9585b80223c17b781415e82e51de *src/icu74/common/localebuilder.cpp
16d3ed5b2a8543cdbd5a8bc83ff64df0 *src/icu74/common/localefallback_data.h
91994495dd194e5842228248019bfc33 *src/icu74/common/localematcher.cpp
5b490499d7f5df20a9044282146ab305 *src/icu74/common/localeprioritylist.cpp
37d91ee57184d74b4b9aeae1599433f2 *src/icu74/common/localeprioritylist.h
881f1bc0f31c5b02318b4689b0cd4f61 *src/icu74/common/localsvc.h
693e21fec8d49e356e82f1eb998dbd4e *src/icu74/common/locavailable.cpp
238322c53dc81946c5aaced25b28bcf9 *src/icu74/common/locbased.cpp
255709ed4f38a17483c41f4cbddc07df *src/icu74/common/locbased.h
0c7bb71bcab5a65705d1ed27b78ade0c *src/icu74/common/locdispnames.cpp
69ce0b7b169fc846334d531e226989a3 *src/icu74/common/locdistance.cpp
137447c33703b378fa13a55ae8fbc74d *src/icu74/common/locdistance.h
4c5577d47fa0441556e8e2176e082e49 *src/icu74/common/locdspnm.cpp
5d8cf9683bf024ccbfe0debb65141b2c *src/icu74/common/locid.cpp
ee64134183f676c92e1642def0561096 *src/icu74/common/loclikely.cpp
4b65b120350b7c2e4ec5c32544ffd75c *src/icu74/common/loclikelysubtags.cpp
1401c5171c58d2477bcee374768d52e2 *src/icu74/common/loclikelysubtags.h
45801605a76cb154d247e483d2d63c65 *src/icu74/common/locmap.cpp
b68d350d2e61faa97a8e36e273bf923e *src/icu74/common/locmap.h
eac5ed59ffbae769864f17f49ac4e154 *src/icu74/common/locresdata.cpp
df168377e8e97e760afed0a8d35d655f *src/icu74/common/locutil.cpp
c8de4950b1a1c5f40bb6dea944257653 *src/icu74/common/locutil.h
005c1a13ca32bd917ca0a86bae070f21 *src/icu74/common/lsr.cpp
b354c92d6c6e67232cf4bfee6068c501 *src/icu74/common/lsr.h
d52cc6823bbeb3a4e87481c68deaec9e *src/icu74/common/lstmbe.cpp
2dc55c269a222366fc75bf8bb8d0d225 *src/icu74/common/lstmbe.h
0c91992ff7075338ce8c7d92f3fd4f52 *src/icu74/common/messageimpl.h
70100f80f605af1bfa322ff674b7a337 *src/icu74/common/messagepattern.cpp
62081a8f902ec09940b6f9ad39c61814 *src/icu74/common/mlbe.cpp
f831547533e98363bd2f7089fa3a4cf6 *src/icu74/common/mlbe.h
2ef217357b14d55d974e0e2e8a58356c *src/icu74/common/msvcres.h
7486043406d4e31cbdfa125880e49138 *src/icu74/common/mutex.h
554a68d0c0e05ab883b6d53fce816afe *src/icu74/common/norm2_nfc_data.h
a073cc1261ddf38d44cb44ba75ef6687 *src/icu74/common/norm2allmodes.h
a068c7eda830d2110268b0424dee1b69 *src/icu74/common/normalizer2.cpp
51666d9357a06976271a6eff329662c0 *src/icu74/common/normalizer2impl.cpp
97623fe2c8f24cdf5f088460dee4b788 *src/icu74/common/normalizer2impl.h
4e5eeb5dd1e090d79c0e953b9efddcf1 *src/icu74/common/normlzr.cpp
1fa7ed3bf4f2f971e7a8276d486c7685 *src/icu74/common/parsepos.cpp
c66c8695e7403a37152270a8af566cdb *src/icu74/common/patternprops.cpp
ff6cfc56af708075307c192c001f29da *src/icu74/common/patternprops.h
429c1d63ed0b24581e6c475a8e37f97e *src/icu74/common/pluralmap.cpp
88cb63317c695dc7e8dbacbc449a4d24 *src/icu74/common/pluralmap.h
995698d1a497a8f06163a4a1185492cd *src/icu74/common/propname.cpp
a21161247e5be8b68452a4be78d640c5 *src/icu74/common/propname.h
610551e9e8a70ea372b23559115924a9 *src/icu74/common/propname_data.h
8a8ac4a7ad03bef0489d311dcee53499 *src/icu74/common/propsvec.cpp
e0145d2f8b4915e53973f706ab11405a *src/icu74/common/propsvec.h
e9c3746a1f8b1ccdc45ed6d4309bb899 *src/icu74/common/punycode.cpp
ee3c921b794f955e1a685167dc1836b2 *src/icu74/common/punycode.h
474c4d78e8dc9a8be2d25d791606a11b *src/icu74/common/putil.cpp
6c7a1b6fac1d79bc0d6e703b27abb64c *src/icu74/common/putilimp.h
559cecd2a98a2e81212c90fa1764b68d *src/icu74/common/rbbi.cpp
9370cfebe1239b56188afa60a7c4b317 *src/icu74/common/rbbi_cache.cpp
18c56992a825baa680cb115b400afe7d *src/icu74/common/rbbi_cache.h
136c5ea57bc9d7e6de71cf2c23e4f368 *src/icu74/common/rbbicst.pl
b7c64f3e0edb7f87fb8084385e7b4605 *src/icu74/common/rbbidata.cpp
331f278428e1f8f20fcf7db93455316b *src/icu74/common/rbbidata.h
20697208ff695cdf3f41aa659cd3816a *src/icu74/common/rbbinode.cpp
2c410fe710ab670b07fc52b9100e9962 *src/icu74/common/rbbinode.h
3d51591297788b33e7a9533ca76c7210 *src/icu74/common/rbbirb.cpp
ed463a8c4ae7258a4f29ac3324179228 *src/icu74/common/rbbirb.h
142f77394905d409050aa1478735eedc *src/icu74/common/rbbirpt.h
82112a367a22ebe95522954cfdcbbd81 *src/icu74/common/rbbirpt.txt
cb7f93de24bb83e5665bb0fa3bde694e *src/icu74/common/rbbiscan.cpp
b3d7a14766881f1d391b2b06939ec41d *src/icu74/common/rbbiscan.h
44098e0c661ac118bcfe706cb49ac070 *src/icu74/common/rbbisetb.cpp
808a7c843f02ff3fa60da37e38dff8e9 *src/icu74/common/rbbisetb.h
6f0b0bba0a4dcfc42e87a35bab8efeab *src/icu74/common/rbbistbl.cpp
cb6d87d1ebb8e673ba2abe57b4c981fe *src/icu74/common/rbbitblb.cpp
902c7df825c8c4a6605896ab2e95a5d6 *src/icu74/common/rbbitblb.h
0f812b7faa4664c917377d575b7731e1 *src/icu74/common/resbund.cpp
d97aaf4cb79da04595321bf98df17d2a *src/icu74/common/resbund_cnv.cpp
f30a51b3fc83fa6cdc96f719d0a0063c *src/icu74/common/resource.cpp
9126edf455c55252d0e485523050b9b0 *src/icu74/common/resource.h
80b36a32f7813f145b2bfa38c596243d *src/icu74/common/restrace.cpp
0435c3e8d88f3db1494e923631efc090 *src/icu74/common/restrace.h
901180f194d5b750d0f50c584fccaaba *src/icu74/common/ruleiter.cpp
61b5f69b262730fd0624701400528a49 *src/icu74/common/ruleiter.h
82d7d609b1b1d1b6e4924e0361d91fe4 *src/icu74/common/schriter.cpp
8bffbea5c51a8612ef9eb2ce68e9c1ed *src/icu74/common/serv.cpp
4e8b05d89e357e4e2b97a5417b02c0e6 *src/icu74/common/serv.h
7450c98221dd7d4a7b8e0bc18dad50ef *src/icu74/common/servlk.cpp
13295e5a0082aa3bc89dd161d9469888 *src/icu74/common/servlkf.cpp
ccfa74aaa75e0147f9f40c581af2e363 *src/icu74/common/servloc.h
e8df4698c390c6d2b3cae3d0afe10298 *src/icu74/common/servls.cpp
91d4e00af31b1cab71ef0922bc5a3289 *src/icu74/common/servnotf.cpp
89202af45b72be975f4116ec007d48b6 *src/icu74/common/servnotf.h
871c3011fe2308ae9ebd9cc07cf46588 *src/icu74/common/servrbf.cpp
0b77601f454ca348191969d9981d3778 *src/icu74/common/servslkf.cpp
541134d3e932630c85e9a63397cfbc59 *src/icu74/common/sharedobject.cpp
a439482b2eedd9c133a31f47ba459ec1 *src/icu74/common/sharedobject.h
0cde3c11d51b3f4450488f0523f93a89 *src/icu74/common/simpleformatter.cpp
1c888a5db9bd5bcfd222ed9cc95068bb *src/icu74/common/sprpimpl.h
3719470a87c813e264a588708d94733e *src/icu74/common/static_unicode_sets.cpp
3bd4fcbbe261de5ca4477b845baedc5e *src/icu74/common/static_unicode_sets.h
cad826cf0f88dab6580ba7af11dfff4f *src/icu74/common/stringpiece.cpp
3dd287b973225884eb60f94e4c63ef97 *src/icu74/common/stringtriebuilder.cpp
fcf861c4c72243de74d43b32627af0e3 *src/icu74/common/uarrsort.cpp
99e8141442c17ed5835e3c67139ecfa5 *src/icu74/common/uarrsort.h
ff4398d3757794bfb89b58260f73bf95 *src/icu74/common/uassert.h
56208a271cce741f2767b93340b79a75 *src/icu74/common/ubidi.cpp
fe5f27573e38963032654253ff568955 *src/icu74/common/ubidi_props.cpp
006bb181ca0c004ed2285054938fdd3f *src/icu74/common/ubidi_props.h
c60d0881d1e36ca6ba06716990440972 *src/icu74/common/ubidi_props_data.h
87ddd2f659109bf35f4e59dff039119b *src/icu74/common/ubidiimp.h
d89d8b3468d2f7482d0963fd0718604c *src/icu74/common/ubidiln.cpp
1fcd7f39b31f837af9ee734a71d4139d *src/icu74/common/ubiditransform.cpp
350e8dccda3db31ad4e26cb3b0f1dfd3 *src/icu74/common/ubidiwrt.cpp
d493356e30c2852ab898096290909c1e *src/icu74/common/ubrk.cpp
ac84d19db39d5cba7e99e302dfd15ce0 *src/icu74/common/ubrkimpl.h
52270f84c6d49df8865894204688812a *src/icu74/common/ucase.cpp
5be4dbce2e0be55fcf2ce20468ff22d5 *src/icu74/common/ucase.h
62b2d6e6795298c6877d31ba2b9bdb92 *src/icu74/common/ucase_props_data.h
298fad2cef464d72ddbb603a53ca9624 *src/icu74/common/ucasemap.cpp
decf402e008f58401753fb992fc27b19 *src/icu74/common/ucasemap_imp.h
2ff8413a6860ba42d83d860a45460797 *src/icu74/common/ucasemap_titlecase_brkiter.cpp
12274b70dddfd552794d3abc19de4fda *src/icu74/common/ucat.cpp
e96cb0eb83377cb095c6c0e4f40fff46 *src/icu74/common/uchar.cpp
123a117756cd94fcd34b02f6ec550bbb *src/icu74/common/uchar_props_data.h
8bc853685c9bba2299bed2f8d52f1c7e *src/icu74/common/ucharstrie.cpp
e0db9125b7cb2142da73a2cd8da8c268 *src/icu74/common/ucharstriebuilder.cpp
64ad5b834cfc8d85bc804cadab2fc814 *src/icu74/common/ucharstrieiterator.cpp
3f9063c504d404cbc342b62b5293e94c *src/icu74/common/uchriter.cpp
0124c53dbcd37f4d6c803d3fafd675c9 *src/icu74/common/ucln.h
b372e93b9c9b1e408e9196dda1283da2 *src/icu74/common/ucln_cmn.cpp
d2571e9749dedffc01e77749c09f054a *src/icu74/common/ucln_cmn.h
66e78aa2e4c799a2f161c316cb3cd6ad *src/icu74/common/ucln_imp.h
60b5018b77cf8b59c7c2b6fa4648a2f8 *src/icu74/common/ucmndata.cpp
a0dbc662c402fa1c298087debf7b7991 *src/icu74/common/ucmndata.h
b4d24b159f8f580116117deee6a95c4c *src/icu74/common/ucnv.cpp
f2904e74ea2c17541817db6489160a40 *src/icu74/common/ucnv2022.cpp
6114879505c54e096348aa7b6c423ef1 *src/icu74/common/ucnv_bld.cpp
b816c3ac1b08085d6b364ebb3ed5b401 *src/icu74/common/ucnv_bld.h
f2b205e7d0fb6baaaff2895fda3db13c *src/icu74/common/ucnv_cb.cpp
23d5509e268ff0a5a0ff882379a95e7e *src/icu74/common/ucnv_cnv.cpp
1432a6fb6bc3d24aa3e772bc591f7e5d *src/icu74/common/ucnv_cnv.h
3e3cdadde05fb6d47af9bf9e329a148a *src/icu74/common/ucnv_ct.cpp
13786e2af332d44bd8ddce63dd89fa35 *src/icu74/common/ucnv_err.cpp
9ef771a5b60cdce37f7f266ff3645d6f *src/icu74/common/ucnv_ext.cpp
db686c2369b7d80e71ed7eb3c19db497 *src/icu74/common/ucnv_ext.h
1752788dd8c7cfe061c392dc9b954082 *src/icu74/common/ucnv_imp.h
98e41e8c09680f3f03f3d835273c6852 *src/icu74/common/ucnv_io.cpp
74f38751ccd295e8c3dedd97e40581d5 *src/icu74/common/ucnv_io.h
f537aba247d9dbd5d9ac9a1c71f7b390 *src/icu74/common/ucnv_lmb.cpp
289fd90c341ff15ba04a4727ea0aa4c9 *src/icu74/common/ucnv_set.cpp
b3f9864491df17d013605950e721aba8 *src/icu74/common/ucnv_u16.cpp
0b3cbf386e2509129edb1457e13a5e29 *src/icu74/common/ucnv_u32.cpp
86f8d4b8a3a8aba1173b24e6bce190de *src/icu74/common/ucnv_u7.cpp
0e2f2891f99c435600546ccf62675f75 *src/icu74/common/ucnv_u8.cpp
7e0f2e0074c2fee739eeb3e671d68690 *src/icu74/common/ucnvbocu.cpp
29a32eef3fa52485a783a0e3f2da2046 *src/icu74/common/ucnvdisp.cpp
5d4f87eca974ab59e8e03d7ad73cbd0d *src/icu74/common/ucnvhz.cpp
1433efa24ba41931829b2b4e38436155 *src/icu74/common/ucnvisci.cpp
989ed98c16f43c9ee1cc48f0937deacb *src/icu74/common/ucnvlat1.cpp
e475913e7df724be25e100db5ebc5327 *src/icu74/common/ucnvmbcs.cpp
18542ba892d1625c6bbb65d6ddb050de *src/icu74/common/ucnvmbcs.h
a9bcb8ef179f5beb5e6695cbd457f720 *src/icu74/common/ucnvscsu.cpp
7ba8dba81144e8b66f3bba06f4b1b568 *src/icu74/common/ucnvsel.cpp
a97aee35a90370bf8f1bd5965a65f0d3 *src/icu74/common/ucol_data.h
569dafd910cec82341fb40285ef8d2cd *src/icu74/common/ucol_swp.cpp
9459520e8741fac35b9b8b9e93f903c1 *src/icu74/common/ucol_swp.h
d4607ab10e38123c75fd3b38997bfcdf *src/icu74/common/ucptrie.cpp
a0a26718e2387cc430eb3f3d9eecd761 *src/icu74/common/ucptrie_impl.h
29062aba5e1ab128b99c26f9e3851492 *src/icu74/common/ucurr.cpp
5ac72fa08b0586c3b1c3c056263f82be *src/icu74/common/ucurrimp.h
e95ebeb1b58ca8b12afb9023b6c9ba51 *src/icu74/common/udata.cpp
0355eedf4c6ee082025c36aeef852567 *src/icu74/common/udatamem.cpp
0be15feab9f692d0fec7c6ec7b964c55 *src/icu74/common/udatamem.h
4e7db48495d2dc60291b709ce777bd29 *src/icu74/common/udataswp.cpp
2f05e3cafc57e99c4322c875c201292e *src/icu74/common/udataswp.h
714675c46b3b69d780368d0a9db8b223 *src/icu74/common/uelement.h
61b9cb3bbc082c7778fc0a93311fb0dc *src/icu74/common/uenum.cpp
74ace804f67cb80c3f039ac8cd0f3047 *src/icu74/common/uenumimp.h
06a652e6fb32c8ce6233a072ab4c93ff *src/icu74/common/uhash.cpp
931a33a137f97b9c36a153c3a37f98fd *src/icu74/common/uhash.h
077b3409c8f4ab87b8cf00d92e583b0d *src/icu74/common/uhash_us.cpp
279993de4c5ac5b9d5af75fc48933951 *src/icu74/common/uidna.cpp
c5d287e1eb3f2887f566352bd78c91e4 *src/icu74/common/uinit.cpp
324c8cc797a4c179e52f21c3a0f4124d *src/icu74/common/uinvchar.cpp
a1a13f5b5931115ebc6724fbee4a7e30 *src/icu74/common/uinvchar.h
a81273f16325087549e8ab384fade5f4 *src/icu74/common/uiter.cpp
0f3bddb3773a839160f2c9fa93392ccc *src/icu74/common/ulayout_props.h
3b3b384819214ea5682221067afc0ae0 *src/icu74/common/ulist.cpp
cea9a4d84c3e28d7fa01b1278a02c43c *src/icu74/common/ulist.h
0b9b4f66e78e18733f1d8afe41e11eed *src/icu74/common/uloc.cpp
5c2050809eb2409c849a477b2ecc6476 *src/icu74/common/uloc_keytype.cpp
d7349e6b49f93b6135272e45b552236f *src/icu74/common/uloc_tag.cpp
fb7d2a57b86ff84cbe02c0cab6b8cfed *src/icu74/common/ulocale.cpp
0ea6e309a46f132d1d51eadaca1c80f3 *src/icu74/common/ulocbuilder.cpp
71a3208f9320cbf0f7a470e5c4f7419b *src/icu74/common/ulocimp.h
adb9d86dfd35396f80f0ca1377eb94f6 *src/icu74/common/umapfile.cpp
9f8498d136635f8d3856d29c5ce20dac *src/icu74/common/umapfile.h
eef946c7c54e8f07f5550d4ffe985d29 *src/icu74/common/umath.cpp
6199ba0af7567d62f30ddc83e1baf1ef *src/icu74/common/umutablecptrie.cpp
93124eeb19731ca786f45e6fb81d4d42 *src/icu74/common/umutex.cpp
65a7d296eb467b3bafbf5854a2889180 *src/icu74/common/umutex.h
50cf5c7ab2234b2b5f207116c73e165c *src/icu74/common/unames.cpp
2c738fd96b4efc238076a6801af4252f *src/icu74/common/unifiedcache.cpp
2fd54fd002a9d78ad391539263219e52 *src/icu74/common/unifiedcache.h
16b15efb1af1fa9a625cfc6a1b82b6d8 *src/icu74/common/unifilt.cpp
ea6b8680c985829dfe4fbe9a46c8ca76 *src/icu74/common/unifunct.cpp
d7aa4a918a9cfc6c79c312c7e29b3af4 *src/icu74/common/uniquecharstr.h
0bccbe7880448075e03394c286ab7d61 *src/icu74/common/uniset.cpp
867bd0b3fef0b5af2f82ac4cdf34bc2e *src/icu74/common/uniset_closure.cpp
555a10ca866f036084a58596a07a9e6b *src/icu74/common/uniset_props.cpp
aa75d79e23b9af6a75f8ca18332ca9a0 *src/icu74/common/unisetspan.cpp
ea561c789d88fe9a7a230cf27b4935e3 *src/icu74/common/unisetspan.h
ed434247049f25143e2061b81beb4b22 *src/icu74/common/unistr.cpp
b534afa81a8fa7ec54fd662c77b8f9c9 *src/icu74/common/unistr_case.cpp
ea66a38226c38b16d1fc865fb7808808 *src/icu74/common/unistr_case_locale.cpp
6804634537f1349d5ddc36c5d0949509 *src/icu74/common/unistr_cnv.cpp
177cba781160690d63baeb60a8c9d15c *src/icu74/common/unistr_props.cpp
a1ed95f6af8bd03124dea20ef8a0c821 *src/icu74/common/unistr_titlecase_brkiter.cpp
81845d9908e5228dc19ff5b53df25312 *src/icu74/common/unistrappender.h
863a3a81ddedcbdf621be35654b01d01 *src/icu74/common/unorm.cpp
e94285566619c1869c64c09562eb10c4 *src/icu74/common/unormcmp.cpp
fcc9fb21f7f7e4a86a5c4db6659d07d6 *src/icu74/common/unormimp.h
8c3f26303a50cfa26d38d16592daacb5 *src/icu74/common/uobject.cpp
98a842dc115c9550da1aa39bd18a401c *src/icu74/common/uposixdefs.h
15b7c3c62e8a269905ea22d0cdabf5b5 *src/icu74/common/uprops.cpp
62aa03d20a4677bb4d36cf0fc940e59d *src/icu74/common/uprops.h
649daca03a703e878bae9e14feff7ccd *src/icu74/common/ures_cnv.cpp
c5a1cca06ddb8c74ffda8a06e0bffa5e *src/icu74/common/uresbund.cpp
7d4612dbf7194cb28a64c77b6ec34bdc *src/icu74/common/uresdata.cpp
7a8e310955d46553a7c5bba8c75b51e6 *src/icu74/common/uresdata.h
68328ca14ae0da00afac8171550f97ae *src/icu74/common/uresimp.h
9d1c3398397fcc0c380b549493ac5de8 *src/icu74/common/ureslocs.h
d4721f0e0c6854642777ec23d821c47c *src/icu74/common/usc_impl.cpp
d68c64e47b1338f2ea06ab0d8f3ac9fc *src/icu74/common/usc_impl.h
bfa918431674b85a0308359c7dda5d1d *src/icu74/common/uscript.cpp
dcd63bee366aec3f0b00d40c9d24919a *src/icu74/common/uscript_props.cpp
115ff79c65cf4120b20981d3d1bf5d64 *src/icu74/common/uset.cpp
bf39d3dc51e26aedf66102a9da9e70f3 *src/icu74/common/uset_imp.h
4515d7583242d973f0202f23d72c6c28 *src/icu74/common/uset_props.cpp
c0ed5e972b9cf6998accb9e3ccbdf1e9 *src/icu74/common/usetiter.cpp
7899152e7011492967c14bf7be442ca4 *src/icu74/common/ushape.cpp
5de9494db31ca5476b57bf6c96d10eff *src/icu74/common/usprep.cpp
3f88d88d8345363312d3bb04e9c2e1ba *src/icu74/common/ustack.cpp
0703aabdb61bb50553961d8dbb6c2513 *src/icu74/common/ustr_cnv.cpp
3ec96414f9013aa39acdc09425abb09a *src/icu74/common/ustr_cnv.h
aa91bdcb60b1419b3cbeb9a1faf17d6b *src/icu74/common/ustr_imp.h
6df49081281891caa5364bde0efbb30a *src/icu74/common/ustr_titlecase_brkiter.cpp
accff6321a811a61ec07bf93c5978d9d *src/icu74/common/ustr_wcs.cpp
aecdb259a931f7e3aac3722cd9bd0989 *src/icu74/common/ustrcase.cpp
ebe2ae679ee8ec80a45ed3e4e09d790d *src/icu74/common/ustrcase_locale.cpp
091136e3a83e682d93f4ab6b29d9d01c *src/icu74/common/ustrenum.cpp
6ac428072da3aa7501778af75d112540 *src/icu74/common/ustrenum.h
ec1be0200d6d6e618217694b884fc066 *src/icu74/common/ustrfmt.cpp
8c58fa99afa9a9886e857c080d244142 *src/icu74/common/ustrfmt.h
6341507b1c09d281d8152899b4f5139a *src/icu74/common/ustring.cpp
363dca1fe175c19c7c2d2433c6afa085 *src/icu74/common/ustrtrns.cpp
ad7529b9295fd2991fce3b9fa6804648 *src/icu74/common/utext.cpp
41311318d1bae1c5b005d8603694477d *src/icu74/common/utf_impl.cpp
b0e269c6fe574e497a1547cc09edb199 *src/icu74/common/util.cpp
883d51baa498b48b2b0ec563b117b6ee *src/icu74/common/util.h
8f1393cb34ecbaf48f3f450f61b457a2 *src/icu74/common/util_props.cpp
93334e52979972e6d51941ef1bc3354a *src/icu74/common/utrace.cpp
4749c91038039c73de769530b20325d7 *src/icu74/common/utracimp.h
41b6d141af0cd6d05a29b7e622e36439 *src/icu74/common/utrie.cpp
1f88ef78e5a55b0e29e4aae66acb99da *src/icu74/common/utrie.h
d262b4ce4de01dbec87513adfca7b7dd *src/icu74/common/utrie2.cpp
f1f2817dfd7da87969a091fc8f97360d *src/icu74/common/utrie2.h
fa83f7e77aaea7b8da9aa8af8233d0cc *src/icu74/common/utrie2_builder.cpp
96d79794bd37e8a23ab9103b3b55c43a *src/icu74/common/utrie2_impl.h
da43b5ddee2c00c774aa83fee9583aa9 *src/icu74/common/utrie_swap.cpp
266c6d2f85a12face00e97d2302f3c36 *src/icu74/common/uts46.cpp
e7124ec489a49cffc44f1b3ce723148a *src/icu74/common/utypeinfo.h
dbdda139438671d513593aff4a4bcf58 *src/icu74/common/utypes.cpp
48e7bed955f2c38c97d8b9d84a376afa *src/icu74/common/uvector.cpp
726d69a75555480c8b85cc0669089575 *src/icu74/common/uvector.h
ef978bdffd5bc3bd21b74d9dd5240e58 *src/icu74/common/uvectr32.cpp
7edd1f2067a208d02bca1428282bd69f *src/icu74/common/uvectr32.h
a285a33bdbad4322752f02d05fc6d6b5 *src/icu74/common/uvectr64.cpp
7f960fc52f03556f403798561fa82110 *src/icu74/common/uvectr64.h
99d996d42e83e09075c00302cb8f1a97 *src/icu74/common/wintz.cpp
cc3e99d547dc4b10f1cc752f8ad7ba5d *src/icu74/common/wintz.h
08dc3852df8fffa807301902ad899ff8 *src/icu74/data/LICENSE
20b12254aa9f02c707612fb62e4a43d2 *src/icu74/data/SOURCE
23c26661d62277a88e8ba9d66a88beff *src/icu74/data/icudt74b.dat.md5sum
f7f3e7988145676c45c51bf56e63aea9 *src/icu74/data/icudt74b.dat.sha256sum
d7d69305b5d66fcc60d41e886c480155 *src/icu74/data/icudt74l.dat.md5sum
1541e1efb55443bcd38b349cd88be438 *src/icu74/data/icudt74l.dat.sha256sum
026bc4732611c718d3c08c4e9f5f0656 *src/icu74/data/icudt74l.dat.xz
90e21194f73907fe9dd68ca8491cbea6 *src/icu74/i18n/alphaindex.cpp
e32f21a804407f225e48457b627aa5bd *src/icu74/i18n/anytrans.cpp
d3b7c90f6774e62c7df50954adcc02ee *src/icu74/i18n/anytrans.h
5bc87c0e78d2bb05ee43a12266826700 *src/icu74/i18n/astro.cpp
0f4003d987030dee7d1ab87512943062 *src/icu74/i18n/astro.h
5824ff6d5523118052916abada83fd39 *src/icu74/i18n/basictz.cpp
806d4c338daeff64951e4cb8845fdae8 *src/icu74/i18n/bocsu.cpp
591813bad634eef918147e3b364739a8 *src/icu74/i18n/bocsu.h
e94c43140a7b9a8e9f168212083522e0 *src/icu74/i18n/brktrans.cpp
b23a22e8160fb79f482ae9a29acb8291 *src/icu74/i18n/brktrans.h
6f14e333e4bc190853340a29f4ca6fc1 *src/icu74/i18n/buddhcal.cpp
85a99c65e95ef8d14f2e38529b6c9c3c *src/icu74/i18n/buddhcal.h
12c00296cbb2af3f057ae260717759ec *src/icu74/i18n/calendar.cpp
1cc4e3449a658f09ed5e92316ba7dd74 *src/icu74/i18n/casetrn.cpp
e5ef110faa03e2e92d7ee630739e1bf2 *src/icu74/i18n/casetrn.h
36242f4845f0befec3703d5412b33045 *src/icu74/i18n/cecal.cpp
8229d6a554ed480f8efb4a650ff13b8d *src/icu74/i18n/cecal.h
103e1c08ff6966618a4a29896850bbdb *src/icu74/i18n/chnsecal.cpp
bc0c02f9a43d76cd3587d06169271257 *src/icu74/i18n/chnsecal.h
63f89834ed34b350e4dac944f8a68ba2 *src/icu74/i18n/choicfmt.cpp
28e5c20e350c5f34960a3a445d93f9eb *src/icu74/i18n/coleitr.cpp
ae9988a8fce998084438be92dc6c2796 *src/icu74/i18n/coll.cpp
1bb75bd8d7cd47cea136cef61248af34 *src/icu74/i18n/collation.cpp
4025d092de9e3ba6674535c3b3c7831f *src/icu74/i18n/collation.h
d1ea4a5b241439ce59d033fb943fbd51 *src/icu74/i18n/collationbuilder.cpp
6d7277e221708a5df3842499bf0914b6 *src/icu74/i18n/collationbuilder.h
bad57ed50e7cd389b76ee86944cab0c8 *src/icu74/i18n/collationcompare.cpp
a43cbeda92cf72ce2a8ab9172210bbb2 *src/icu74/i18n/collationcompare.h
16d29d356c9510f81f3e1aa58982303c *src/icu74/i18n/collationdata.cpp
e7157732039ee90675f7f3b1cd34b982 *src/icu74/i18n/collationdata.h
de8c1f2be3cdedd2fc02137a7137cc32 *src/icu74/i18n/collationdatabuilder.cpp
65399521eea51d7fbb9f39be3493aa05 *src/icu74/i18n/collationdatabuilder.h
01c145cdd697165def20af8dfdda508d *src/icu74/i18n/collationdatareader.cpp
0a59dd4d3cb862b2be6646a5092798de *src/icu74/i18n/collationdatareader.h
ef410495845581e2e667da1c90ab188c *src/icu74/i18n/collationdatawriter.cpp
13077c8cee15408ed56f058f68c2b8cf *src/icu74/i18n/collationdatawriter.h
fdb098bc522e1e14dd35c10c054fb0e4 *src/icu74/i18n/collationfastlatin.cpp
78504885e2a9ab29722975ad680f02f6 *src/icu74/i18n/collationfastlatin.h
df88b6af447db5370287178a53556d16 *src/icu74/i18n/collationfastlatinbuilder.cpp
22c30cf23eaa3a3a73473858271d10b7 *src/icu74/i18n/collationfastlatinbuilder.h
85c13c084ebe24944141c0231715dd1b *src/icu74/i18n/collationfcd.cpp
d16a4007fab369418b61224c30c54c78 *src/icu74/i18n/collationfcd.h
1afeb7d364a996e78fba7aa53011666f *src/icu74/i18n/collationiterator.cpp
8518d377a990da3d895eeb727ca70025 *src/icu74/i18n/collationiterator.h
74659b67f2a7ab819b3b6a976c4175ec *src/icu74/i18n/collationkeys.cpp
f45d7a0f68cfdd4a39cda25458399d1d *src/icu74/i18n/collationkeys.h
63559edac2d7833448b7c2b953613429 *src/icu74/i18n/collationroot.cpp
9c74abc9d4b1920f071b47b5c22d2fc5 *src/icu74/i18n/collationroot.h
51a4ccbdb111177a507a64503218bfaa *src/icu74/i18n/collationrootelements.cpp
f29d66fb901acc36ce4c14d4e1fb4e90 *src/icu74/i18n/collationrootelements.h
d0b1b8b51fa09d8653ceb56bebb21826 *src/icu74/i18n/collationruleparser.cpp
5283e6ac91c37ad653b05f3abe29fafc *src/icu74/i18n/collationruleparser.h
c4144f8ef123be110daec6094cd077e7 *src/icu74/i18n/collationsets.cpp
759356c7452d1f5853a156530e39fc4a *src/icu74/i18n/collationsets.h
530a877d6138845f8f8639b2a4101c9e *src/icu74/i18n/collationsettings.cpp
21308670179dfe7ab328ff7b43351ff2 *src/icu74/i18n/collationsettings.h
79b9f7940d77c685b7c938115c9bd4c5 *src/icu74/i18n/collationtailoring.cpp
33c82d5d597c689869ee4e0aef410b80 *src/icu74/i18n/collationtailoring.h
76b8b3b582558850172cd5f9d4fdc401 *src/icu74/i18n/collationweights.cpp
5071775b20b1fc4ade749e30d0481def *src/icu74/i18n/collationweights.h
129fa549890d6dc37c6a6b74faa3e410 *src/icu74/i18n/collunsafe.h
5582e860be31f68194b09671fb2da7cb *src/icu74/i18n/compactdecimalformat.cpp
3afaa912779586044b420924b4a26073 *src/icu74/i18n/coptccal.cpp
384d95da4e51d0193f14741fbab4924c *src/icu74/i18n/coptccal.h
83e08f371f33985f15a539fb50401f33 *src/icu74/i18n/cpdtrans.cpp
e3b3df8c57ffc76602d35b0cc67460fd *src/icu74/i18n/cpdtrans.h
bb8b75b75fc8d7172acefcf0a9c63258 *src/icu74/i18n/csdetect.cpp
7b144abcac10c60e48c297021c6c7e75 *src/icu74/i18n/csdetect.h
0864138c06bc800222587aa582b30589 *src/icu74/i18n/csmatch.cpp
be862da6c6977982550a1853bf0f9c60 *src/icu74/i18n/csmatch.h
80cbffce65c364b000a10f32b866b35f *src/icu74/i18n/csr2022.cpp
b0fac98602d38925ba2c0ac6df7f3258 *src/icu74/i18n/csr2022.h
9053d1c4d19917e6e0d4c90cf01bbedf *src/icu74/i18n/csrecog.cpp
e17d8be1fa152b8526289e636968af82 *src/icu74/i18n/csrecog.h
2855995fabbd4f71478191ca8dedf367 *src/icu74/i18n/csrmbcs.cpp
3cb8aa0d3e5cc644a7b809de76072a74 *src/icu74/i18n/csrmbcs.h
b5137c6b3ea2363a182d4c1551dc4002 *src/icu74/i18n/csrsbcs.cpp
ebf0a695b0a563d3c0e5f69b8df212b0 *src/icu74/i18n/csrsbcs.h
3616d37a7ae3e3d1417647e4930476c4 *src/icu74/i18n/csrucode.cpp
8c41bd77f0c3996717b82b3d0ec5b99a *src/icu74/i18n/csrucode.h
2c9021ddba7947dc66433bcf6ec0dc84 *src/icu74/i18n/csrutf8.cpp
b73f9d41c490a8fe7e056d7026ac89a2 *src/icu74/i18n/csrutf8.h
84ff569a38fe50aba9cf07b129092613 *src/icu74/i18n/curramt.cpp
7478df25773814b0a56abcea635e4891 *src/icu74/i18n/currfmt.cpp
577345a3357e4a78d40bc40361d09a9a *src/icu74/i18n/currfmt.h
f967aaa849b74ef5986b0bb767558d40 *src/icu74/i18n/currpinf.cpp
31e95f882c4450eb396f12924933c0a8 *src/icu74/i18n/currunit.cpp
368d251a8fbf575cdb9ca822da10de30 *src/icu74/i18n/dangical.cpp
41e1c1dbd43193759530c2593900d2da *src/icu74/i18n/dangical.h
a9fc4cce77c1960652cf66b07804bdb6 *src/icu74/i18n/datefmt.cpp
1c70836aa19cd6135642471ab190fae1 *src/icu74/i18n/dayperiodrules.cpp
bc27ce5314b1c5bf9730af55f64c395e *src/icu74/i18n/dayperiodrules.h
bacf1dd3ab7aaff7ed9c3963d0b3f406 *src/icu74/i18n/dcfmtsym.cpp
4119973ef7649a988b9d4c03252b1f5f *src/icu74/i18n/decContext.cpp
596e01fa2e3b5462886bd072a74f27a0 *src/icu74/i18n/decContext.h
70e4b74d496407bd84348c2819813245 *src/icu74/i18n/decNumber.cpp
d444cbd31e4b2d88b8b3caa0d95a2bfd *src/icu74/i18n/decNumber.h
9fc72b97fdca6d16bdeb656738b95eb8 *src/icu74/i18n/decNumberLocal.h
870838f64ca29dfa9a9f198ae0ac4756 *src/icu74/i18n/decimfmt.cpp
035b2353b007a601a03e009b57b19eb5 *src/icu74/i18n/displayoptions.cpp
24b821191d3487139be1daec067c544b *src/icu74/i18n/double-conversion-bignum-dtoa.cpp
6c8773691e230a3aecb41af702c14d7a *src/icu74/i18n/double-conversion-bignum-dtoa.h
52720e041e32513b513e7068bebff605 *src/icu74/i18n/double-conversion-bignum.cpp
73fdb4513c2020fac7d68f80806edb71 *src/icu74/i18n/double-conversion-bignum.h
e0db70de8e5825d89297704b4cf4341a *src/icu74/i18n/double-conversion-cached-powers.cpp
f72851aea7f52fe37f0a12dd8ea671b1 *src/icu74/i18n/double-conversion-cached-powers.h
99b43602fa8d8eca475f790b82f499c7 *src/icu74/i18n/double-conversion-diy-fp.h
a6634bbec4d84f8fb6ceebf8aa533ba4 *src/icu74/i18n/double-conversion-double-to-string.cpp
8889eef01703ceff78d54e99d474e461 *src/icu74/i18n/double-conversion-double-to-string.h
288de63c07216687aadcd7251fc5f60e *src/icu74/i18n/double-conversion-fast-dtoa.cpp
6783de252f77cb093ee152e7cd37b3c8 *src/icu74/i18n/double-conversion-fast-dtoa.h
608e4389548310ddbb97c9f3f18fe47a *src/icu74/i18n/double-conversion-ieee.h
c944fbe5f906c1627eb05764fbc620cd *src/icu74/i18n/double-conversion-string-to-double.cpp
6d6af209112a216d58e477515a2815df *src/icu74/i18n/double-conversion-string-to-double.h
f265a11a189ab3b5ef0c36121384de02 *src/icu74/i18n/double-conversion-strtod.cpp
3e521e3256ff699c3f6bbc15c3fa226e *src/icu74/i18n/double-conversion-strtod.h
88f1d85eff6833f156fd39aac882b09c *src/icu74/i18n/double-conversion-utils.h
b734b48b27dc655feaff3862676843fe *src/icu74/i18n/double-conversion.h
ba5fd1742374a110c6d5bf01f62fd8e5 *src/icu74/i18n/dt_impl.h
0469131ca7becae0ed846fbc95f21f2d *src/icu74/i18n/dtfmtsym.cpp
1e1e17ba31c452f1bb213f00f7674eca *src/icu74/i18n/dtitv_impl.h
3cd89465a31023ffef348ded0518ca14 *src/icu74/i18n/dtitvfmt.cpp
19e4bd6ee24d5ffa8075bfeca2422558 *src/icu74/i18n/dtitvinf.cpp
3e500f44c3f46a146e6293fa0cbd5c72 *src/icu74/i18n/dtptngen.cpp
9185152f4a9321acd6a27c11a67ca08e *src/icu74/i18n/dtptngen_impl.h
a0cd1aaa68703cc9c6be6d04f11a8787 *src/icu74/i18n/dtrule.cpp
4397256b1c81850f77d852b3be861a3c *src/icu74/i18n/erarules.cpp
639bcd84862af3eb8ade10444ac55faf *src/icu74/i18n/erarules.h
d0c415c91a2f75176572fcfbb1745b28 *src/icu74/i18n/esctrn.cpp
4ba8a25d0724205e700e5d23307004de *src/icu74/i18n/esctrn.h
be72dcf9e5d643a21ad9ee46e57c2622 *src/icu74/i18n/ethpccal.cpp
97bda4317c41e08f97d9ccd25eacd628 *src/icu74/i18n/ethpccal.h
aa4219597ac27b44a8aa855053a1d7fa *src/icu74/i18n/fmtable.cpp
731733bdc429695c6d64ef168502496b *src/icu74/i18n/fmtable_cnv.cpp
0f9715c961b9532d2a830fd8efa4c661 *src/icu74/i18n/fmtableimp.h
563a66209b0e298fb807df167ea84c5f *src/icu74/i18n/format.cpp
9f7aff74c34e8ec613bdfbce9289fb5f *src/icu74/i18n/formatted_string_builder.cpp
77362fb1f440a64ff77339a989265bdb *src/icu74/i18n/formatted_string_builder.h
03530a1d2c08eb0154390911a0b5e7af *src/icu74/i18n/formattedval_impl.h
320789a691f33068b87385eadab91c02 *src/icu74/i18n/formattedval_iterimpl.cpp
2011e25d52bfc7378eda9f6fa679a203 *src/icu74/i18n/formattedval_sbimpl.cpp
e5cb3b64477e2b477ac6aff647b44322 *src/icu74/i18n/formattedvalue.cpp
dce4d156e18f26b7867820374d15d23a *src/icu74/i18n/fphdlimp.cpp
1ab3f36c3f6ebc94a3652d7b653bc76c *src/icu74/i18n/fphdlimp.h
24e898a2ff01b9a3974e90fe98adfcbc *src/icu74/i18n/fpositer.cpp
4d65fae82030a1425723a3ec60805a4e *src/icu74/i18n/funcrepl.cpp
0992870e3ce1f279db21fb87c7d7676a *src/icu74/i18n/funcrepl.h
d66a6bacf6786c638cc63bf83789b4d5 *src/icu74/i18n/gender.cpp
de4e55c1c050dac1407cae5522262704 *src/icu74/i18n/gregocal.cpp
592d42fab74507725b80494395d51b30 *src/icu74/i18n/gregoimp.cpp
7616059f7a46284ca0858cb898f4d8fc *src/icu74/i18n/gregoimp.h
540f860f01f830720ebe10b7c0c05518 *src/icu74/i18n/hebrwcal.cpp
d0028ad1c46412b265283992cea8ab15 *src/icu74/i18n/hebrwcal.h
792b7e22539bc2eda65f60af719efab1 *src/icu74/i18n/indiancal.cpp
e738cddfe152d1d4024c941c6686fa5a *src/icu74/i18n/indiancal.h
9ac8ac553a4d79388f31897e51cec72b *src/icu74/i18n/inputext.cpp
ff19ec95809d5b706125a14f83677962 *src/icu74/i18n/inputext.h
d12b0c16c20e7359047c556c94dffc8f *src/icu74/i18n/islamcal.cpp
680950aaa4922b07c3a474b22e623fb7 *src/icu74/i18n/islamcal.h
fbc22c4d41bd3347c666e7a617504eb5 *src/icu74/i18n/iso8601cal.cpp
54800ee228eff4412792a6460a3c1cb5 *src/icu74/i18n/iso8601cal.h
1c224bf8333020e6df833fa584f87371 *src/icu74/i18n/japancal.cpp
8bf5ffa3c3f1f8fdc4ce75cb9a91e6d4 *src/icu74/i18n/japancal.h
ee4f7e72aca491beb219c52020e52eb5 *src/icu74/i18n/listformatter.cpp
eb36140ed7e3ee7f3e3060e59cb19a59 *src/icu74/i18n/measfmt.cpp
cd04a3c355d755733cfb35b4b63959cb *src/icu74/i18n/measunit.cpp
f6c4f6e6a5650b90701e38d45ba26dd9 *src/icu74/i18n/measunit_extra.cpp
ae1f98552badffdcf00e5a0080c943f1 *src/icu74/i18n/measunit_impl.h
143b532eebddfa40470faadeb84aae42 *src/icu74/i18n/measure.cpp
7feb155c754826ce654f9617a663dd21 *src/icu74/i18n/msgfmt.cpp
5e91e4b015acd59fc9d8ae0dc0f4ae99 *src/icu74/i18n/msgfmt_impl.h
7bd6f0cba5fc3716ae03d5bd08705a04 *src/icu74/i18n/name2uni.cpp
40e275d9ea0696bdcf90fcba11343226 *src/icu74/i18n/name2uni.h
8408d8e9bf58b80a0339ec3d2f658250 *src/icu74/i18n/nfrlist.h
17b50dd408e7c6cf4d3dc60bd76c56f8 *src/icu74/i18n/nfrs.cpp
5de3444820766dad63c75387b7115ee6 *src/icu74/i18n/nfrs.h
652e60eed1f5536eb1997ad59218ab02 *src/icu74/i18n/nfrule.cpp
93a6a23ffcebec12ee7af561f0ef7a4f *src/icu74/i18n/nfrule.h
452e9a845585a7553188c4239f55948e *src/icu74/i18n/nfsubs.cpp
cfd3232f443cd2bb45bf5862c5447deb *src/icu74/i18n/nfsubs.h
ce2cb295abbc04551063bbd66a357841 *src/icu74/i18n/nortrans.cpp
384e35eb53a26b00349d5910ed03454e *src/icu74/i18n/nortrans.h
72f0244cd2441bfc684c23af7aff6ba6 *src/icu74/i18n/nultrans.cpp
899bf063c84788aba1c33c33b2bb4cd4 *src/icu74/i18n/nultrans.h
b3792be6321f001a842ccdb9ae71b74a *src/icu74/i18n/number_affixutils.cpp
c72d465ef54714d514e3dc0237865eb8 *src/icu74/i18n/number_affixutils.h
952330f617e93f3c3d6d0e17472dff50 *src/icu74/i18n/number_asformat.cpp
6dc3e7134e5099b53fd10c00c29caaac *src/icu74/i18n/number_asformat.h
00af1a56a211b1b6de01ec7d3e6396b1 *src/icu74/i18n/number_capi.cpp
8b217d085335a612c7a2b86bbeeb9aa1 *src/icu74/i18n/number_compact.cpp
0e626374775553966f159142933c6670 *src/icu74/i18n/number_compact.h
429843cb4961b856ff11ae517389785b *src/icu74/i18n/number_currencysymbols.cpp
7c95d2d371f585387fff30ee9344b5c0 *src/icu74/i18n/number_currencysymbols.h
e9bafbc925414e87a1b86a6e3acc4bee *src/icu74/i18n/number_decimalquantity.cpp
368dd2ae1ba6b306afb493241996c600 *src/icu74/i18n/number_decimalquantity.h
b70dc329d710f421a89b90001b1e4ee5 *src/icu74/i18n/number_decimfmtprops.cpp
634ac5437e87a7bb1046b4f2109bcffa *src/icu74/i18n/number_decimfmtprops.h
a7db7c30b63db78cd19e9eeb65762346 *src/icu74/i18n/number_decnum.h
18d192d5509cd0a6e285f3cd4dec5ea2 *src/icu74/i18n/number_fluent.cpp
e5bfca31ee5fd6bfb14e89db76129300 *src/icu74/i18n/number_formatimpl.cpp
593f69ecc6537aedeb0286ac709a29e1 *src/icu74/i18n/number_formatimpl.h
9555662382ffaa51329c3acdc679d684 *src/icu74/i18n/number_grouping.cpp
39fa6d9de0ac4256d51c316ddd15b172 *src/icu74/i18n/number_integerwidth.cpp
5fbc81adcd146b9a3ff26cd7ce204a6e *src/icu74/i18n/number_longnames.cpp
baa6561cf90e522ff39bd35850b77352 *src/icu74/i18n/number_longnames.h
3c4089b3fa0386ab68a84e718631d964 *src/icu74/i18n/number_mapper.cpp
d133751e0412ebcf38138d0850658d46 *src/icu74/i18n/number_mapper.h
67f4ebfc2e5a4f1d5d904b7c5955e8de *src/icu74/i18n/number_microprops.h
2d7c786159711d7f0d8896e4c919c7e5 *src/icu74/i18n/number_modifiers.cpp
bca3359ee4e1b93037f41352a6cc5678 *src/icu74/i18n/number_modifiers.h
aef5955b95d5bc6be0dc568f391950b4 *src/icu74/i18n/number_multiplier.cpp
faaeeb99e3859c2ce6aded5c8dc52cba *src/icu74/i18n/number_multiplier.h
c87b8fe96ede6d9df126f8b9a0c84c02 *src/icu74/i18n/number_notation.cpp
48e8107253d2ecd75d55b6e0ed744e7b *src/icu74/i18n/number_output.cpp
c93aedc4b3e9304f3deff852c1865327 *src/icu74/i18n/number_padding.cpp
956893cd53d5b3300132d25221bddf49 *src/icu74/i18n/number_patternmodifier.cpp
9de54ab073b3b39e2f45ccea96613532 *src/icu74/i18n/number_patternmodifier.h
eb0c493a9032e9d29a4b640723a141ee *src/icu74/i18n/number_patternstring.cpp
e13a3520702fe4e33f4d2a65cb27e39b *src/icu74/i18n/number_patternstring.h
4ad6568b450149b25f14e682327dc83c *src/icu74/i18n/number_rounding.cpp
4e306a4d94822465bb2fc1491a57cb60 *src/icu74/i18n/number_roundingutils.h
d7f72c4c2298cd0586d2889aa6f00dd6 *src/icu74/i18n/number_scientific.cpp
9d3e6107887c03532b52a801aa8ae8d8 *src/icu74/i18n/number_scientific.h
d453d5b4e031c722a97e4240905145a2 *src/icu74/i18n/number_simple.cpp
0d052d703551f57ef380ae6b98d6d13c *src/icu74/i18n/number_skeletons.cpp
19a995acf95dd14ac94183f136feeba8 *src/icu74/i18n/number_skeletons.h
78aeff8c530099e09caacea791d3d850 *src/icu74/i18n/number_symbolswrapper.cpp
00425da912d4276d8ea39b42fc600a22 *src/icu74/i18n/number_types.h
a0a5aa94ec050a1754bc2cf763bba8ce *src/icu74/i18n/number_usageprefs.cpp
12096dbf814e8456ab5169e4b915d58d *src/icu74/i18n/number_usageprefs.h
8181f5b8a6d5919d664a1de345b85854 *src/icu74/i18n/number_utils.cpp
3e6bcb33fa3f0490fd6437ab340a5b7c *src/icu74/i18n/number_utils.h
13bdac003484ba5361cf0e8b5720c6b3 *src/icu74/i18n/number_utypes.h
fd4c6bf742a3109f3a075a04693b7289 *src/icu74/i18n/numfmt.cpp
8d596b10370e03e93a8c784337c3892c *src/icu74/i18n/numparse_affixes.cpp
15dc0ed3e759d48c99a8403290b146f4 *src/icu74/i18n/numparse_affixes.h
f760db91281f37ffe7373bfeb210a8cb *src/icu74/i18n/numparse_compositions.cpp
0dd20d09fb18f9e7aaa9f825208986b2 *src/icu74/i18n/numparse_compositions.h
867d8cd0f5705a77d96bd98cb68ab5b9 *src/icu74/i18n/numparse_currency.cpp
1a2c07a632970bdf70e3b8e6fee8fc28 *src/icu74/i18n/numparse_currency.h
0bbe0dde27d1675ffe2e204ca34ddfca *src/icu74/i18n/numparse_decimal.cpp
99bf8f6ac32cdabaddc339f97694b97f *src/icu74/i18n/numparse_decimal.h
f99035e8cde5bde79c087ae4b5f165fa *src/icu74/i18n/numparse_impl.cpp
f586b5fb7d1e6be8c17b61d7cd9c8f47 *src/icu74/i18n/numparse_impl.h
5aafcc925efa75f442dcacd4d14a623a *src/icu74/i18n/numparse_parsednumber.cpp
2cd10065dca57d2e2fd34c9d9e167983 *src/icu74/i18n/numparse_scientific.cpp
c4ab41144b0600f20da5288f432c8651 *src/icu74/i18n/numparse_scientific.h
baba2688a4a979e4117b39739b555e01 *src/icu74/i18n/numparse_symbols.cpp
71da71119a0535ac557a65d3e005ccc3 *src/icu74/i18n/numparse_symbols.h
0f6d7be6bc673e6707111ad769f2f520 *src/icu74/i18n/numparse_types.h
1959a7476ef38ca098d6abb6619bd28e *src/icu74/i18n/numparse_utils.h
cec4cdfdfa1cce101919b0748bbc8ca9 *src/icu74/i18n/numparse_validators.cpp
a40c780a7a2649762d6827c13569a911 *src/icu74/i18n/numparse_validators.h
e72ac58b890120161a9a344a8d0c5386 *src/icu74/i18n/numrange_capi.cpp
86f0d63beb6b0f1d8ecb13342a6613f4 *src/icu74/i18n/numrange_fluent.cpp
3b9a7239de549bba1fe72e4904de9539 *src/icu74/i18n/numrange_impl.cpp
92b57cb82ced744e6b00679061c22fc5 *src/icu74/i18n/numrange_impl.h
27bcd721675482d079b948179052970a *src/icu74/i18n/numsys.cpp
a172ec5c557763cd6b092757b2fd462c *src/icu74/i18n/numsys_impl.h
f899886cbc5f633d5f9d46a89d8ff392 *src/icu74/i18n/olsontz.cpp
1218fa0474595656be22063a09e921e3 *src/icu74/i18n/olsontz.h
87100031fd028353243c80ce87d37245 *src/icu74/i18n/persncal.cpp
9b6378f841df0b123aa19f2f0869caab *src/icu74/i18n/persncal.h
ff5c9c9b2a106c9047768dab57e5e6e3 *src/icu74/i18n/pluralranges.cpp
78e4071886f83d932cc819712e3db3b1 *src/icu74/i18n/pluralranges.h
c739d60a64b7dad5c6de92dd27481db4 *src/icu74/i18n/plurfmt.cpp
c843e54a288d1c4dff1be10ff934bdaa *src/icu74/i18n/plurrule.cpp
ef5e4fb83df35b874541c10b87ac2113 *src/icu74/i18n/plurrule_impl.h
738ef02f6e802aa3b7dd486705340c6b *src/icu74/i18n/quant.cpp
0f0d56b6013caf50b06f450635d587ea *src/icu74/i18n/quant.h
0fb69e3829d2628b0ee89fa4d3b9ba5f *src/icu74/i18n/quantityformatter.cpp
0133ad8ffe5881513b82e3d37fb4f85c *src/icu74/i18n/quantityformatter.h
b60ac8bcb4890667f387243003bf255b *src/icu74/i18n/rbnf.cpp
25fdb60dab05421910412e6f4d76c9d5 *src/icu74/i18n/rbt.cpp
79d1ba8d0a1b16d2b74e77dbaa887748 *src/icu74/i18n/rbt.h
8821032e8f87e056c5855948d61addc9 *src/icu74/i18n/rbt_data.cpp
4e2c2c9c674687ae71a1a52f7177de09 *src/icu74/i18n/rbt_data.h
23e3d68c4acf8bee2bf14c9a5d09dfb2 *src/icu74/i18n/rbt_pars.cpp
a041b95f9d99f7bbeca78b5f2c501b5d *src/icu74/i18n/rbt_pars.h
5748e9f794912ccc8db57b1840a99838 *src/icu74/i18n/rbt_rule.cpp
5dfdeb8adce79f2e9b695eb441f6dbe5 *src/icu74/i18n/rbt_rule.h
184376dc3546aa5cf1dd3d7da88a0645 *src/icu74/i18n/rbt_set.cpp
138974c3a39ef8c5fb84d850a5538ff1 *src/icu74/i18n/rbt_set.h
5812e13a3ac120a915c81ac879e570dc *src/icu74/i18n/rbtz.cpp
5d77a27e8db07dd64de7de828bfbb111 *src/icu74/i18n/regexcmp.cpp
cf8010708b2762e77c020a1044a0e114 *src/icu74/i18n/regexcmp.h
4caf2d01afdc843ae32f91dc3c1a8775 *src/icu74/i18n/regexcst.h
3ca092e7ce9e7bd741ab4eb28cfb0695 *src/icu74/i18n/regexcst.pl
7990cbeadf4e0e067b6e1d98895d82d3 *src/icu74/i18n/regexcst.txt
22b65d548d0a21baa1c60bb5de0a73bd *src/icu74/i18n/regeximp.cpp
3f66fa1168b97d7d731faf95733fe68d *src/icu74/i18n/regeximp.h
c25a700ff0102e131078aa983d2418aa *src/icu74/i18n/regexst.cpp
ec2216308bc15bbc9c69ed3836e5c159 *src/icu74/i18n/regexst.h
54adba24f34306a6d9ba04e2c58b3055 *src/icu74/i18n/regextxt.cpp
02e21be1f3c784a681d278201240bc8b *src/icu74/i18n/regextxt.h
df4d106bb8587f902cdb2c284c8bb2c5 *src/icu74/i18n/region.cpp
3e8ecef525426509e2bca8c308d5ccb6 *src/icu74/i18n/region_impl.h
0476d52799f3f5aadebecdd5f8a10218 *src/icu74/i18n/reldatefmt.cpp
82542d6feae419efa6deb05c1d910e90 *src/icu74/i18n/reldtfmt.cpp
e25d17a38cac736c3ae73fae8a81e2cf *src/icu74/i18n/reldtfmt.h
b57d2602241987451644b075a6dd9ed2 *src/icu74/i18n/rematch.cpp
a50149472f47b3dfc9838b4116a270d3 *src/icu74/i18n/remtrans.cpp
68d9094b4a6d70b697f92463fb69c185 *src/icu74/i18n/remtrans.h
a4bcbfcf7ddd7d5c4d5f58a91702fe70 *src/icu74/i18n/repattrn.cpp
002a51bafee19447c957163ffd6c70a8 *src/icu74/i18n/rulebasedcollator.cpp
55b504e415a93a43e9bc29a699f070d6 *src/icu74/i18n/scientificnumberformatter.cpp
7c98b4241a01c1083c78ec99c2cdc7eb *src/icu74/i18n/scriptset.cpp
70163664c605350da257f89405924e63 *src/icu74/i18n/scriptset.h
ec9510445f36dcaa0c9d8c2aea2b2f66 *src/icu74/i18n/search.cpp
7ac6cfe9d5f2d91956d254916d7a04c3 *src/icu74/i18n/selfmt.cpp
ae27723b93891a39cb8894dbad74e243 *src/icu74/i18n/selfmtimpl.h
2565550c239f65d9619ed8a4c668e9e3 *src/icu74/i18n/sharedbreakiterator.cpp
aa975748e44a0c65efc8449f0dfeb976 *src/icu74/i18n/sharedbreakiterator.h
17b1c0c7792a42f5f6249d9775cec741 *src/icu74/i18n/sharedcalendar.h
1386d2906094bd9356ccbd72761429b6 *src/icu74/i18n/shareddateformatsymbols.h
5d1e6b8d6644a32ba9e473e76ffeca99 *src/icu74/i18n/sharednumberformat.h
8f32ec904ffc0ea9f857e2e4b402ec4f *src/icu74/i18n/sharedpluralrules.h
ac06c7c8f0a70bdf3f5fdb52e85357b2 *src/icu74/i18n/simpletz.cpp
5f000a5daa17cbb5fc5f5f5fbaf9f043 *src/icu74/i18n/smpdtfmt.cpp
abf9bfd6cec588b807ff2ef07d715141 *src/icu74/i18n/smpdtfst.cpp
803bce2d5c6ae1cd75295f9f6ab3fef7 *src/icu74/i18n/smpdtfst.h
4a2f0d7a67460c4d63417f1ef2299f35 *src/icu74/i18n/sortkey.cpp
d380ff41e65c118e423c03d7dc3b0262 *src/icu74/i18n/standardplural.cpp
cfa0d8c0219d9b40b23504b04be2a884 *src/icu74/i18n/standardplural.h
3a74ea64b4f5ff0a65bb34cd6ce04dfc *src/icu74/i18n/string_segment.cpp
afbc178c35fe6f1adccb38c03e9a1631 *src/icu74/i18n/string_segment.h
6c464ab583d2cb0c83c4db2a5f73e41d *src/icu74/i18n/strmatch.cpp
2765301e7fbe35a5e1994e9f86c9b1ec *src/icu74/i18n/strmatch.h
3cd0108b1a743d7fc9018487b0528b3d *src/icu74/i18n/strrepl.cpp
cd8c27f855bd94517c0abbfacc299f50 *src/icu74/i18n/strrepl.h
c3164f4b079f5ce10ee4c25e9b16dd99 *src/icu74/i18n/stsearch.cpp
6eb3c3c25dc9a16540f26ede38883c66 *src/icu74/i18n/taiwncal.cpp
ec8c0b7e5b5a2d7a0a4aca486ca3cd8f *src/icu74/i18n/taiwncal.h
dd6395e8cd46205bfa9b7066c2ee8d1b *src/icu74/i18n/timezone.cpp
0a5dc36b7c17d3333b7d895d89c8299f *src/icu74/i18n/titletrn.cpp
a29f132f8b7a8aec85830de60108774c *src/icu74/i18n/titletrn.h
ef2daee21bbf59ad08c30614b90e52cf *src/icu74/i18n/tmunit.cpp
07d8cdfbc2d03813f7c627f8da1655bb *src/icu74/i18n/tmutamt.cpp
0c681c5582802401834b9da9b36ab450 *src/icu74/i18n/tmutfmt.cpp
ac2bbf452dd3783dbac98a5bfea3217a *src/icu74/i18n/tolowtrn.cpp
4d2f04d6c5b9f00a253da9e2df714e56 *src/icu74/i18n/tolowtrn.h
6449b2a314524ec4c0c288d46082ae33 *src/icu74/i18n/toupptrn.cpp
c70ab3b7dc12638c2b27b4204e80f7e2 *src/icu74/i18n/toupptrn.h
370842bfd73fbb33c20647fe3da8d5fc *src/icu74/i18n/translit.cpp
e0e47086f995eea6bb9459f59df55347 *src/icu74/i18n/transreg.cpp
ebca78ee6fb0857067b085818b440316 *src/icu74/i18n/transreg.h
dc7962d26f45145c33c12d59ede9a35d *src/icu74/i18n/tridpars.cpp
9a53b3381bdc7d6cbdc0384e055450c3 *src/icu74/i18n/tridpars.h
73c11b9ea03b9f22f408978591a03bf3 *src/icu74/i18n/tzfmt.cpp
ced689b025fca4dd563d88282fd45c2c *src/icu74/i18n/tzgnames.cpp
08e1df95a27f65421f23df09c8fb0267 *src/icu74/i18n/tzgnames.h
9c063c1a3b0e9dc53a6e2966ab885b84 *src/icu74/i18n/tznames.cpp
6d0e225e55ced3ba250632d91096d5cd *src/icu74/i18n/tznames_impl.cpp
b1fbca10ca71bada19ccdc23a5753681 *src/icu74/i18n/tznames_impl.h
96e4265046612a9f9e45f0e0603143c7 *src/icu74/i18n/tzrule.cpp
8caa658b0790bd2f20d6c045b52dd544 *src/icu74/i18n/tztrans.cpp
7929d2444609f93f42535d8bd4eb7b81 *src/icu74/i18n/ucal.cpp
cf2df4a981de69f553c29760586fd2e8 *src/icu74/i18n/ucln_in.cpp
05eaff445e0ae3e8f3df4a8e2a3b0104 *src/icu74/i18n/ucln_in.h
b75385c005c9405876bd0f39901f2669 *src/icu74/i18n/ucol.cpp
74e2da658fd620e01604a54b13e6a903 *src/icu74/i18n/ucol_imp.h
d416be4f64bba5cfc2c80ade61e1599c *src/icu74/i18n/ucol_res.cpp
a8edef6a8d1a7877b930cfe06bc7fcf8 *src/icu74/i18n/ucol_sit.cpp
a396e5ad795c4fd29a1fd711942165ba *src/icu74/i18n/ucoleitr.cpp
3a1f864549e42c94ae4492b7a95b173e *src/icu74/i18n/ucsdet.cpp
1ec0ea491c3f4e92c49c91d71d33d088 *src/icu74/i18n/udat.cpp
93f90b129ea6ce3c6d0b6e5178168124 *src/icu74/i18n/udateintervalformat.cpp
fcb41c8023b4abb881c515eb3f42b3e1 *src/icu74/i18n/udatpg.cpp
81ca04f0525e9a90631cbd4cc7bb8075 *src/icu74/i18n/ufieldpositer.cpp
8d71927df5786143bc0a67f054e68109 *src/icu74/i18n/uitercollationiterator.cpp
f5dd577dcd177f3f50a76a2355b2d296 *src/icu74/i18n/uitercollationiterator.h
0896ff1f34edc00e33b588db0f73ba76 *src/icu74/i18n/ulistformatter.cpp
8b34721af92e3c1f3b3ab17a57d88e3a *src/icu74/i18n/ulocdata.cpp
6eefc224fd1cdfd1ae446b2f63972487 *src/icu74/i18n/umsg.cpp
014d78caa836affaa85f2e252e39c1fb *src/icu74/i18n/umsg_imp.h
bc1802df6f8b549d67dfc070df3ffd68 *src/icu74/i18n/unesctrn.cpp
853a7e4af8dafb3ff1e9601fbf258daf *src/icu74/i18n/unesctrn.h
f7693b8e7e9e384ca4cee3209c1d9b47 *src/icu74/i18n/uni2name.cpp
f7bfbddb7b0266971da4e66ff2c71955 *src/icu74/i18n/uni2name.h
696752594bb68c28f171fab3674e9820 *src/icu74/i18n/units_complexconverter.cpp
7af7164d8ca6ef0f6a61f32ed02341bb *src/icu74/i18n/units_complexconverter.h
93a52cb93abfd8da0ebbf71e0cc23c8d *src/icu74/i18n/units_converter.cpp
6b77759d83b2ce3307751b72482d01eb *src/icu74/i18n/units_converter.h
05068ed9104ab4edd83ea023dfee6384 *src/icu74/i18n/units_data.cpp
2a9e516c4f92e6194da35044978bfda8 *src/icu74/i18n/units_data.h
bec40009b2a6297feafa7062d2fa2808 *src/icu74/i18n/units_router.cpp
d647205f11d816053621395bf3e56e6b *src/icu74/i18n/units_router.h
cb9c8b40d2d9b74f13281e62cb3d732c *src/icu74/i18n/unum.cpp
e49c6c5347293466fc4ba8f61eb6feca *src/icu74/i18n/unumsys.cpp
b413fb527b94a47d84275231881e864c *src/icu74/i18n/upluralrules.cpp
68931d131908924f671ab5ac202fdba4 *src/icu74/i18n/uregex.cpp
3bdd3210fc12e5e41eab5e0a4c72111f *src/icu74/i18n/uregexc.cpp
4359f9e01178046ea837a4f5c48a3d5c *src/icu74/i18n/uregion.cpp
a59fd255bee8ea6c46a5c4a8e38602ce *src/icu74/i18n/usearch.cpp
6f992808f8a6203496f97d15c6700477 *src/icu74/i18n/uspoof.cpp
1f929ca604e5338ab792d94f7db872cb *src/icu74/i18n/uspoof_build.cpp
f7b4b054fa14efd75a70c215594f2a69 *src/icu74/i18n/uspoof_conf.cpp
84503dab427da29acf1800c7c0eaeb93 *src/icu74/i18n/uspoof_conf.h
076961df215ae0178f75fb464cdff33c *src/icu74/i18n/uspoof_impl.cpp
e3af965ef8796c4587f5342d4b8d563a *src/icu74/i18n/uspoof_impl.h
cb6366ce3299ac312b56102e502f575d *src/icu74/i18n/usrchimp.h
3fa260bb5e98e1454212f6627e64b747 *src/icu74/i18n/utf16collationiterator.cpp
c3f2f7ec37bdb63962e15c0a315e33f1 *src/icu74/i18n/utf16collationiterator.h
dce7ae5ad8236badad99beb6b7d463bb *src/icu74/i18n/utf8collationiterator.cpp
67b7373706e12c98fdabf488a7e3d632 *src/icu74/i18n/utf8collationiterator.h
f4656b9e13a254cadd1315384d60517a *src/icu74/i18n/utmscale.cpp
a8036c12a54f9fe58e3257bead6232a5 *src/icu74/i18n/utrans.cpp
d035f7e39015ccdf8ac0f4199a2aacd6 *src/icu74/i18n/vtzone.cpp
6d05ea39cd1174cdf6eed94050321b0d *src/icu74/i18n/vzone.cpp
252d2a07067bedb3ffe026d5bde9a090 *src/icu74/i18n/vzone.h
0bcc62711eb3576d2ef4c9e915e09047 *src/icu74/i18n/windtfmt.cpp
5318d59e0ac4c0b58655b28cd242dac5 *src/icu74/i18n/windtfmt.h
f1e0b3f778dea398d9edf30b85b248d9 *src/icu74/i18n/winnmfmt.cpp
d5ee2e585d1f0bd75ff717b7dc920ded *src/icu74/i18n/winnmfmt.h
021778c6c0d161f16992a9e1d7d275bc *src/icu74/i18n/wintzimpl.cpp
efa44197993d058a7f1fe304529b0e53 *src/icu74/i18n/wintzimpl.h
b474d1e82f9d838149d766c14ee12687 *src/icu74/i18n/zonemeta.cpp
f1d835a6ef797c289b1c0ab0d167296e *src/icu74/i18n/zonemeta.h
9c1ba436d76330416d8622088bf7cb0e *src/icu74/i18n/zrule.cpp
3499e2f82c86b54ae1df560c68771138 *src/icu74/i18n/zrule.h
a4d52e6c132b080c8a88c2146d9f9b80 *src/icu74/i18n/ztrans.cpp
70d85481c6c331c63c5e97b888e0d2d2 *src/icu74/i18n/ztrans.h
2dbee2e923f1fcc207a4900324ab9ff4 *src/icu74/stubdata/stubdata.cpp
a0a21beebfcdbf8f697c808f18d63360 *src/icu74/stubdata/stubdata.h
ec79ecd11d550d28219edc761ca194c6 *src/icu74/unicode/alphaindex.h
b534307b4927912b07a057ba814a5aff *src/icu74/unicode/appendable.h
b6ba00878a2f1d089810b84e215334f8 *src/icu74/unicode/basictz.h
45759d53352c84e6cc50b27525432137 *src/icu74/unicode/brkiter.h
aa057828d4ac2c5b0391a7ab9fd43099 *src/icu74/unicode/bytestream.h
4150910418a57f6a48d724526b21fedf *src/icu74/unicode/bytestrie.h
3e1a00abe95f389e97aa9ef701b338e5 *src/icu74/unicode/bytestriebuilder.h
08be0538368e51090e6ff1d23dfec6f7 *src/icu74/unicode/calendar.h
641e741b2c01155b2d4fbd9d2713a614 *src/icu74/unicode/caniter.h
e3425f7c74b34cf79026cd19a031f7bb *src/icu74/unicode/casemap.h
6cb8a2e1958b7b7a55726cce5b68a803 *src/icu74/unicode/char16ptr.h
05c18c0f6891a66d16ffc6cb06d4efdd *src/icu74/unicode/chariter.h
c066a51f7d50be3565be48eb25e70441 *src/icu74/unicode/choicfmt.h
db313832874af0e89b4b12b8fd28d0f8 *src/icu74/unicode/coleitr.h
6d0fdc0fa650dda69b0719bb708aa6d5 *src/icu74/unicode/coll.h
510d30046480b513145d1694c7f22293 *src/icu74/unicode/compactdecimalformat.h
33e59e40478f72b464102e8bc8c3c786 *src/icu74/unicode/curramt.h
9593ad2b47287896517deebbf5167390 *src/icu74/unicode/currpinf.h
4bc2e93eee8b039d863aae7deb16c49e *src/icu74/unicode/currunit.h
17a8772d691939702e42bf97973c3454 *src/icu74/unicode/datefmt.h
3d4186ec9549d6693d136257f610ec06 *src/icu74/unicode/dbbi.h
85cb07a5b874370e37d8987e2e9e1fe3 *src/icu74/unicode/dcfmtsym.h
9561cbaedec8cf25002de5554dcd00e8 *src/icu74/unicode/decimfmt.h
e9fc936da341e6bcef7f60b6ddaabcac *src/icu74/unicode/displayoptions.h
9ecff9520f7c9c103fcf2d98071f2820 *src/icu74/unicode/docmain.h
a9a39cdc2820819eca29a17b691a961b *src/icu74/unicode/dtfmtsym.h
354bfd450b52bb73685e60f99df17fe3 *src/icu74/unicode/dtintrv.h
4135d8969428fd463ad91e98e828429b *src/icu74/unicode/dtitvfmt.h
d685ce1d63aa8690a55e0e35fd6144e8 *src/icu74/unicode/dtitvinf.h
c0d29f49d2d4d48c9ec46b217df5ef88 *src/icu74/unicode/dtptngen.h
6a139c62bc78fb7ff84726ec1df8f002 *src/icu74/unicode/dtrule.h
f72eba6fc28806010b354b7367082e3e *src/icu74/unicode/edits.h
3bb75fc43c88682895ae98c02e69810d *src/icu74/unicode/enumset.h
debeda4467a19aaf964fafc73629c8ae *src/icu74/unicode/errorcode.h
321862339b410fb1482231175b6321da *src/icu74/unicode/fieldpos.h
4f05b806ecf8bf8797d7c6ede0851fdf *src/icu74/unicode/filteredbrk.h
45d5addd3877b01d1fb5cec1d2516b58 *src/icu74/unicode/fmtable.h
b00fe8417659b75d47dce7dd3dc731c5 *src/icu74/unicode/format.h
32550933d3feee0fd4e6435174e413c4 *src/icu74/unicode/formattednumber.h
c6f38411864c36f83ed75de4100a8ba8 *src/icu74/unicode/formattedvalue.h
0eb92a7c2025b739075f4891a344a710 *src/icu74/unicode/fpositer.h
d7e955b4dc08fc7dd19a4f699daf1ebf *src/icu74/unicode/gender.h
0c5818fc6f6aca256e4c44eb835c7de7 *src/icu74/unicode/gregocal.h
7f302721a25b90f8db3844963a6db0ed *src/icu74/unicode/icudataver.h
1956da7c9086dcefccc89abf23fc1bc9 *src/icu74/unicode/icuplug.h
80f4736dc94e89d7e0204cd4653c5b04 *src/icu74/unicode/idna.h
f0b534815224ac8fcdb37ebdc85fe36f *src/icu74/unicode/listformatter.h
ab1b8f1cb43634452dad8f3f7df9826f *src/icu74/unicode/localebuilder.h
2e414786f4f07e7690787b46c8054453 *src/icu74/unicode/localematcher.h
c91c88bfc334c7d499f5a41e7af0369d *src/icu74/unicode/localpointer.h
aa260a00e31e970757e3b3cbe00fff3d *src/icu74/unicode/locdspnm.h
e5e8a1f66e28a293c82a9619ed54483e *src/icu74/unicode/locid.h
60f32da233fd51948748d354bad7f8e4 *src/icu74/unicode/measfmt.h
08d500980300a85d60a9978214834f1d *src/icu74/unicode/measunit.h
80785d8036f254a86d9baac0c1af2ed4 *src/icu74/unicode/measure.h
1f576c7e53600371bb451b741f3c9a9d *src/icu74/unicode/messagepattern.h
5e0c21fab16bd123d4ec85e06be97cad *src/icu74/unicode/msgfmt.h
101ff621627f86c93aa2ad3b7325ad11 *src/icu74/unicode/normalizer2.h
c3cbf740fd160ebc483f63ebc98a737a *src/icu74/unicode/normlzr.h
1869765b22f9a6ebab507347f5404cd1 *src/icu74/unicode/nounit.h
80f907ca399d415b1f02eb0c2f74aa01 *src/icu74/unicode/numberformatter.h
68cfbb32bc32dd4309ab9d211570d974 *src/icu74/unicode/numberrangeformatter.h
bde1976339fcd7dafb55ea40b3e18e9d *src/icu74/unicode/numfmt.h
3b99ae8bdb3d640cd23994924ba30cc3 *src/icu74/unicode/numsys.h
ce38831411af01eeaf0bbbb6e1cb0153 *src/icu74/unicode/parseerr.h
cc5ccd6c547dd9091c7f49c751ecb991 *src/icu74/unicode/parsepos.h
3dcb1366225146b55947cd1a0bb7caf9 *src/icu74/unicode/platform.h
483f1161d5dcdd8dabba2ab1c0676472 *src/icu74/unicode/plurfmt.h
d58e612ffa69af52219e7cd9c63f81e4 *src/icu74/unicode/plurrule.h
d74097874c82f77331d8e06a5c59d37f *src/icu74/unicode/ptypes.h
749d40b1814eebb28751c765149de620 *src/icu74/unicode/putil.h
e62b3ce376b276233ca6d5c42b269cf4 *src/icu74/unicode/rbbi.h
c72642843afb410e8608a1854d1ec5e6 *src/icu74/unicode/rbnf.h
f398e5f778c63899a5fb0491a7865e0a *src/icu74/unicode/rbtz.h
792e6d02296f6e2e7aa11c2738ed61ce *src/icu74/unicode/regex.h
4bfb46a34299058158a17293463257ce *src/icu74/unicode/region.h
71f70261239632a1774c662dde7fb15f *src/icu74/unicode/reldatefmt.h
6c6cf6b8bac96615e3e5badfd8e61abf *src/icu74/unicode/rep.h
8916c1460ab58af3dd262b4a4078b689 *src/icu74/unicode/resbund.h
fdffae48b9f5bd15c850ce0adf9b72d0 *src/icu74/unicode/schriter.h
b3f324f5b5909bfdbb44ad017eebfc2a *src/icu74/unicode/scientificnumberformatter.h
0d1d877947274d7d93a978136cd5956b *src/icu74/unicode/search.h
a8673ab1e695133af0e7ba1bfdc65b0d *src/icu74/unicode/selfmt.h
f10649909c461296486a9c0a7dd5857d *src/icu74/unicode/simpleformatter.h
7e988e590297e335b18f3f147a8aaae3 *src/icu74/unicode/simplenumberformatter.h
1f5fe314833142f0a225893d793f77fa *src/icu74/unicode/simpletz.h
51aa79b85d606c79c81900c5eca8ad5a *src/icu74/unicode/smpdtfmt.h
741b50d64482931844d297e126636fa1 *src/icu74/unicode/sortkey.h
bb3b2e28a2ae9b66a60b7d90f8c07512 *src/icu74/unicode/std_string.h
785cf3e673212965b782c4413e6b8222 *src/icu74/unicode/strenum.h
d4929224e8d9df2f272f0305780cbacb *src/icu74/unicode/stringoptions.h
8e783cf9e6e4bf924925ab8a231b4643 *src/icu74/unicode/stringpiece.h
278a1b70f109127ce330b6ce78b10759 *src/icu74/unicode/stringtriebuilder.h
050caaae33d81da3f92391beda9abe6e *src/icu74/unicode/stsearch.h
0a29f6657cdf2f1fb51f571f6ed91188 *src/icu74/unicode/symtable.h
580312a384c454b22fa78efd57d68dc8 *src/icu74/unicode/tblcoll.h
9c085cb49cdb637e1c6ed9135a067a85 *src/icu74/unicode/timezone.h
d4c113ee349bf153446afe6368656eec *src/icu74/unicode/tmunit.h
9b2b3e2a43eaa5de71cb8f48ce70f1cf *src/icu74/unicode/tmutamt.h
e278520847e0ebb7b58047ec4b1938e5 *src/icu74/unicode/tmutfmt.h
1add1a6880730bfb536af6047374930b *src/icu74/unicode/translit.h
09771861d074000b6ddb91670ac0e808 *src/icu74/unicode/tzfmt.h
b999d3f10ebb00b154a9c2cae60dd386 *src/icu74/unicode/tznames.h
3b2958cf9b88d5a66f949d331a87707a *src/icu74/unicode/tzrule.h
7708a0f07a173cfbce2671b54009c9f2 *src/icu74/unicode/tztrans.h
3ab58e9b0b0ab26f8f15b0538ee467c1 *src/icu74/unicode/ubidi.h
06f757bac430c0f6ce0d2f44d090eab8 *src/icu74/unicode/ubiditransform.h
e1cea4823bccb6ca313538a45c75fe40 *src/icu74/unicode/ubrk.h
dc4ad784285da5b246f48f49563af5bf *src/icu74/unicode/ucal.h
86ef4f5a8f0d8708669491ed9467ee1c *src/icu74/unicode/ucasemap.h
b536ccb7b79d1fee71b5b53e7d370f8f *src/icu74/unicode/ucat.h
f2e3aa1c6fea75517962d6d8596aaa94 *src/icu74/unicode/uchar.h
6e75f25a3a7d95f85a104f4800f00e45 *src/icu74/unicode/ucharstrie.h
f45c632aeab5184ef4af16857a096168 *src/icu74/unicode/ucharstriebuilder.h
99952c7f7481d1d6849ca745d7ada609 *src/icu74/unicode/uchriter.h
08c4cdf97d737fb8bed066c4f1411f15 *src/icu74/unicode/uclean.h
7a6379ffa7fb17129c387d51f6a2ed76 *src/icu74/unicode/ucnv.h
b5307a489929d900d1505e3b17546544 *src/icu74/unicode/ucnv_cb.h
a507bbb125e54d860485ec054a4a55ed *src/icu74/unicode/ucnv_err.h
3e19ed11aa05d92bed1e152d75808fbc *src/icu74/unicode/ucnvsel.h
db8f0d1efdcd516cfc72b8f07df02b57 *src/icu74/unicode/ucol.h
82784539ceee31f9117c6a377542ac72 *src/icu74/unicode/ucoleitr.h
eb78be846cee6d512bf6d74c076cea7f *src/icu74/unicode/uconfig.h
fab7a1cb4679b3799a86e331a73e5b6b *src/icu74/unicode/ucpmap.h
b52b06ac993637f99a3c0795019bc616 *src/icu74/unicode/ucptrie.h
115314d31a5b3a0a67a860843bdc5a07 *src/icu74/unicode/ucsdet.h
93df41f8eed0aac934559ba79ec3a4b8 *src/icu74/unicode/ucurr.h
a7a4edb0616190ce137195e18bad95f9 *src/icu74/unicode/udat.h
138bdc88a47b2447778af7b9e7a81405 *src/icu74/unicode/udata.h
b3993a86728da0269bebb9afe03bf40f *src/icu74/unicode/udateintervalformat.h
d3bb32cb07c55a3fced272d0bb320191 *src/icu74/unicode/udatpg.h
2b320b1e6042efb12c0cee35f5c19d7d *src/icu74/unicode/udisplaycontext.h
a3a178d7830dcc01908671bb50f4db31 *src/icu74/unicode/udisplayoptions.h
1c977846ed8fb1ed9a22ef5505b5f74a *src/icu74/unicode/uenum.h
691ad0b0e8795dd2c653a3eba713d8f8 *src/icu74/unicode/ufieldpositer.h
8d5c4936b633b2d697a895b0cba8adfe *src/icu74/unicode/uformattable.h
32927ad9df741fc4f2f30e140e682e56 *src/icu74/unicode/uformattednumber.h
c8cbe1952703b8efa1981fa54ee2449b *src/icu74/unicode/uformattedvalue.h
164eaedd0ecaa7bbe0b4afe99664e4e2 *src/icu74/unicode/ugender.h
1dc6db480aa52a700e6ec4e876783d52 *src/icu74/unicode/uidna.h
43e0c805cd086df8b62aa4110a3a055a *src/icu74/unicode/uiter.h
a70ca9d644ec5a75dcd214cb2972775a *src/icu74/unicode/uldnames.h
a64f637b452dde79f1569d6dd33116e9 *src/icu74/unicode/ulistformatter.h
10e85e2e67a966d8e36ff49d48dae183 *src/icu74/unicode/uloc.h
a4e1ebf51b78f3927af50d001307df7e *src/icu74/unicode/ulocale.h
8a201c407f9d58061ac7f4a54c9a7c9f *src/icu74/unicode/ulocbuilder.h
802fd8f24ae13177edb94e8045293a67 *src/icu74/unicode/ulocdata.h
2d902ff00eac765b305cb113b74d34df *src/icu74/unicode/umachine.h
f01bf0e2ecdad7168cae02051a68c6e0 *src/icu74/unicode/umisc.h
37ee4bc144f6245875d6c16051fd7f79 *src/icu74/unicode/umsg.h
6f10d7a98ed5bb1fa6123d12483a5e9c *src/icu74/unicode/umutablecptrie.h
3c83bb08bd0cdd214f8fb3244a674d79 *src/icu74/unicode/unifilt.h
821f46371d07630960cb18f261e98455 *src/icu74/unicode/unifunct.h
7cc35c453c14958148e3beff36faa28f *src/icu74/unicode/unimatch.h
c551301d7294b366098e863e349a7b40 *src/icu74/unicode/unirepl.h
5cef1e374e075a8e6f4a1a0a0994811f *src/icu74/unicode/uniset.h
462e72372ff841496db258eb69846a7f *src/icu74/unicode/unistr.h
57454a0c2671043683a531c02c3b7784 *src/icu74/unicode/unorm.h
9bf09ab215d2625fc068f9ac4c3ba49b *src/icu74/unicode/unorm2.h
5890f50a4aa9e36955c17fab4bc1ec71 *src/icu74/unicode/unum.h
315dd3c68fa2d700410f8a47bac4098d *src/icu74/unicode/unumberformatter.h
dc5ee8336efcb731c70bc67ef104e69e *src/icu74/unicode/unumberoptions.h
617d50483cf3e0973fca717816e1f8b5 *src/icu74/unicode/unumberrangeformatter.h
18bf54617ad67e194a83ce8de7fec7a8 *src/icu74/unicode/unumsys.h
93cd3606c473f206d9012a1d759aff0f *src/icu74/unicode/uobject.h
b9d4a099b0076de1fe7b5238f6506dbb *src/icu74/unicode/upluralrules.h
f2b961c77896af68f040fee75b96ab1d *src/icu74/unicode/uregex.h
a136613a3d5416b0cb8a38b9a982dcd1 *src/icu74/unicode/uregion.h
2d37d0294471dbfb4303efdcbd817e4e *src/icu74/unicode/ureldatefmt.h
cdfff39775f39a3eddf0df76c90729b9 *src/icu74/unicode/urename.h
6ada98ac1aa68081f10d17b1abadef6e *src/icu74/unicode/urep.h
1c6c3117cde06106cf8d200c0fc4438d *src/icu74/unicode/ures.h
b73724ad86fb127bb7c3ec4ff6075922 *src/icu74/unicode/uscript.h
77ea7c165a8175aeeff168574b81b295 *src/icu74/unicode/usearch.h
05ca5673f7d5b8819c6b8068da5a0fe8 *src/icu74/unicode/uset.h
4b80450fa2e136e639bf86e436cd29ef *src/icu74/unicode/usetiter.h
a30f44d71635b8afa174e2ce1e1188c1 *src/icu74/unicode/ushape.h
13d82c9a8e888e4fd1c1c8a919d52c11 *src/icu74/unicode/usimplenumberformatter.h
8555b71a17076ec05955b8fb3b438834 *src/icu74/unicode/uspoof.h
eabdbc2e85d927e9132a487cceae4d19 *src/icu74/unicode/usprep.h
52148861630982d150b1b08117967604 *src/icu74/unicode/ustring.h
54a28aa9c4e117cd3ebfd0a670324f6c *src/icu74/unicode/ustringtrie.h
eef1213d7071e18534af555eaa6e99a8 *src/icu74/unicode/utext.h
f9990715c6252d369294e99a81b192ab *src/icu74/unicode/utf.h
7ba8c4dd33f399a67a9b2ddd8de5e35d *src/icu74/unicode/utf16.h
a1fdf74f4bd8808f9fb17df371f886d9 *src/icu74/unicode/utf32.h
4cc30aa971e4b8eb7c22c3d0554126ec *src/icu74/unicode/utf8.h
71329d19bae2193dae4d270898718e6f *src/icu74/unicode/utf_old.h
7452fa04c594359438a3240535209eaa *src/icu74/unicode/utmscale.h
1a10170c6f8fefa7f29f28164022fc6a *src/icu74/unicode/utrace.h
48f145f00b2ffaa332a63184a5282981 *src/icu74/unicode/utrans.h
5a4cc77d4aded3026b10df71bfb99dae *src/icu74/unicode/utypes.h
779cdf7824a7dbcb471aa4301ca8c425 *src/icu74/unicode/uvernum.h
2da9700caecc79f6640b9c92c1bc56cb *src/icu74/unicode/uversion.h
0406bfde77988075c026776072385914 *src/icu74/unicode/vtzone.h
a4070f14b37d8f1acd211650f2ded226 *src/icu74_common_cpp.txt
cd0d99d38a7d4ac6f0a4bddd70688de4 *src/icu74_i18n_cpp.txt
d9648c8f0c3712b1f96f6ad9c7d3e683 *src/icu74_stubdata_cpp.txt
6150c6361d82f4c9026aab6b6a87a85f *src/install.libs.R.in
fb5f86e0873efd7e7c549c13699fbb61 *src/stri_ICU_settings.cpp
0ff20e51dff85659d7cd69b56b298ff8 *src/stri_brkiter.cpp
29309d746858edba632a6ac13841ca01 *src/stri_brkiter.h
65c59843fa6ac72a2e8dd0979639462b *src/stri_bytesearch_matcher.h
4536f48e72dcc8f2fd072484c1afc380 *src/stri_callables.cpp
6d7577f17868bbf007b3ab898156adb2 *src/stri_callables.h
999b74134eb9940700f76479b316fbbe *src/stri_collator.cpp
de7839d95c156cb9c0d371ad71909804 *src/stri_common.cpp
a5a327fce8f64462c5641fc28f79ba04 *src/stri_compare.cpp
f4f90e887cee7681ec60c61b18b69cad *src/stri_container_base.cpp
782e20deb4f3275feb4e8f26168640ac *src/stri_container_base.h
ac5ab1c263eaa4d1a022d469af812040 *src/stri_container_bytesearch.cpp
75fd333fc8240055a37e3d071a2b5e9e *src/stri_container_bytesearch.h
edb3cf965229a035067483457c9696e3 *src/stri_container_charclass.h
c14b6e86b694301cf193d323d7b1f075 *src/stri_container_double.h
e34704dc4ed05be96acd19a8ac0ae8e6 *src/stri_container_integer.h
2e5cf0580ed0efda929b9fe8ff5e26ac *src/stri_container_listint.cpp
810a33d2ad7b21d781d8205c9208f596 *src/stri_container_listint.h
76da91deea79a83d9a99d28a35cf94a4 *src/stri_container_listraw.cpp
12001c2617b74ca38cdc83b86829f4b2 *src/stri_container_listraw.h
266a12b65704bb121af10c69a6abfc62 *src/stri_container_listutf8.cpp
1435c851af3349e0aabba8b948161e09 *src/stri_container_listutf8.h
744c0de65919d3a38e55f15edbb235b7 *src/stri_container_logical.h
8131ebae84748a16a124a477be3efcd2 *src/stri_container_regex.cpp
d827d5a7633dd33db76991c6152ce7fb *src/stri_container_regex.h
7f5c9cc952082c5e783d279b50a023e7 *src/stri_container_usearch.cpp
aff54201e0bc936849b79d237e54d93e *src/stri_container_usearch.h
240e0e4d9a445e434edfba60897dc2f3 *src/stri_container_utf16.cpp
5a429ebeeb7648ec50ad732cacd73964 *src/stri_container_utf16.h
253d3abedb7b8809adbb2a68b643ee45 *src/stri_container_utf8.cpp
31dfda69c9dee74a6392cb10a08010fe *src/stri_container_utf8.h
2ad8726ff33daf537a77699017f51998 *src/stri_container_utf8_indexable.cpp
2cb6d113326f3222c745b7f127d9c790 *src/stri_container_utf8_indexable.h
f14bb0884dbf9ffc4982a8c2958db613 *src/stri_cpp.txt
67b4cbe9ecb1f096de83d3b5d513fa96 *src/stri_encoding_conversion.cpp
37c0e82c56933142e81641c96e584ffc *src/stri_encoding_detection.cpp
6fd2ba70aa6412a6e43e84d6779fa51d *src/stri_encoding_management.cpp
7563bcd22b34f1e9ae327805c5f45a45 *src/stri_escape.cpp
40c0157929ab0be4cdc431b274c81f2e *src/stri_exception.cpp
976d5e2faab934bf7c4b9132d0dcb084 *src/stri_exception.h
2fa6be7c409dcae8774302560b258239 *src/stri_exports.h
fcbd9989f52a638e5e48c292c28bb119 *src/stri_external.h
041502f3bd4953d92ace1bd2aa101239 *src/stri_interval.h
10dbc313734a5bb26ab3bfcff8debc57 *src/stri_intvec.h
7d6a57e2fa6abf2615a1cac9ab365be5 *src/stri_join.cpp
ffa3db866c25b86e50b7061df4faff12 *src/stri_length.cpp
46a8b7117cba2e48c12eee0929aa3980 *src/stri_macros.h
bd743a87be2256eeec123d73445dc6ee *src/stri_messages.h
ea2ad4d5742fafa38c206cc4b941949c *src/stri_pad.cpp
152db0a1f2c4caa32fcf5314abcdcda8 *src/stri_prepare_arg.cpp
2adc53a4628f3040aff80ee02f98d41c *src/stri_random.cpp
1cb4ec7d0b6bc0d6adf244e785679826 *src/stri_reverse.cpp
bfba95388e4f32795ba6844ff528277b *src/stri_search_boundaries_count.cpp
18791d35d171aeda92d161063732f734 *src/stri_search_boundaries_extract.cpp
350f53d64ad199495ec530e0cff93d66 *src/stri_search_boundaries_locate.cpp
d48fbabffcea8deccf18d0677bb6b944 *src/stri_search_boundaries_split.cpp
94df35b3edf13dbf9ff4db1d5679b308 *src/stri_search_class_count.cpp
a27d5f514ab03f883d3e6ff4333fc29d *src/stri_search_class_detect.cpp
1bcee41c9ccfab4f8a1dc08f8b8238f8 *src/stri_search_class_extract.cpp
9f05dc7db3735c45079b0451e9862a9d *src/stri_search_class_locate.cpp
bcd00c874fddcf3d11ef0f5214ef9949 *src/stri_search_class_replace.cpp
2301343cc6d94c17a2a437cbdda36db2 *src/stri_search_class_split.cpp
a9f9a80ca84aaf7fc5800db889998fb9 *src/stri_search_class_startsendswith.cpp
dc1e20c339eecdbf64b66edcb30814c6 *src/stri_search_class_subset.cpp
9ad5ca966c1343f6c9a1ec54f2805e09 *src/stri_search_class_trim.cpp
400a34946a62a6665dba6133613aa427 *src/stri_search_coll_count.cpp
68428d5a38f50cb68317c0aa2321f2bc *src/stri_search_coll_detect.cpp
ab8d89132b513b55da86315d5ce321a1 *src/stri_search_coll_extract.cpp
2a64b8ce7c6483ec0e6752ac9b46c4d5 *src/stri_search_coll_locate.cpp
9f5c17d4be8e8b272a40ab22e8167c21 *src/stri_search_coll_replace.cpp
bb8f994bc643fff66afa1ac0bd754226 *src/stri_search_coll_split.cpp
8f9d57bf1e05251a794b3118aa603a7d *src/stri_search_coll_startsendswith.cpp
b279600820fe650272bb17adeb8be3b4 *src/stri_search_coll_subset.cpp
ac697151b8087b75ee766226bf406efc *src/stri_search_common.cpp
5a3cb5f5107f6d5ea557eab429d515cf *src/stri_search_fixed_count.cpp
16d121db52a7f6adcc0f07d6d8f88632 *src/stri_search_fixed_detect.cpp
ade62a6f1dee537a373816d625b05241 *src/stri_search_fixed_extract.cpp
8c82763d282a04b411875cbcf37e21ca *src/stri_search_fixed_locate.cpp
4c467f11a8762e96d72be58e3110dcb4 *src/stri_search_fixed_replace.cpp
0d4e558341ba7d95e531aeb05398f220 *src/stri_search_fixed_split.cpp
39572950e4c8ca0ce5f37791c508dbd7 *src/stri_search_fixed_startsendswith.cpp
f7d990d20ad700912ebd5fdb85b9cf85 *src/stri_search_fixed_subset.cpp
797523b241c0a526e5d0cdf00ac0a036 *src/stri_search_in.cpp
16ec5e9660e1a1ff631d4b387a3242d2 *src/stri_search_other_split.cpp
5801b7e6ff5dca59feee5b8d7be9d59f *src/stri_search_regex_count.cpp
d772307081f0bea44ba25ef90f505ab4 *src/stri_search_regex_detect.cpp
aa632649d54c16dff6caf386d50dfc25 *src/stri_search_regex_extract.cpp
28e44f4781b3f88a3f36830f2aae0e70 *src/stri_search_regex_locate.cpp
98e5b7c40210647bd1c43925f2afa84a *src/stri_search_regex_match.cpp
e33b296680044b6e3204249fdb477d67 *src/stri_search_regex_replace.cpp
50dadf6eaaa7a8759856a824ece614e2 *src/stri_search_regex_split.cpp
33a6024188555ada4d6d7bbafad284e6 *src/stri_search_regex_subset.cpp
c107f666262ea6e772403cee8093d339 *src/stri_sort.cpp
a376d482fdaabe2f0fab36d79aad672c *src/stri_sprintf.cpp
bd4783e910100d5d22e404ec8e019b3f *src/stri_stats.cpp
fd32f7c9b4dda6e83be100fbacd7bf2a *src/stri_string8.cpp
55f899d3255f92a4875e0532be42ec4e *src/stri_string8.h
05a15f74a18447d688094e9b484d4512 *src/stri_string8buf.h
57b37b079e73d9a5ee57c47fc6767c45 *src/stri_stringi.cpp
b31d40f98a7051e0b2fb0843fe95c032 *src/stri_stringi.h
6535842e678905e27cb8a9e0bf6648b4 *src/stri_sub.cpp
c33bce1462f1a48f4dd2ebaab9c1852b *src/stri_test.cpp
188ef6ea0fc5e80bcd23eaf5c1135e14 *src/stri_time_calendar.cpp
e3426a190e0b751ca561080c383ec9a3 *src/stri_time_format.cpp
e47421dc81ae335f8dfc65f94ecb85f6 *src/stri_time_symbols.cpp
dd25cde7f0118f7aeecbe6f7cedde614 *src/stri_time_zone.cpp
ed8b336d61c3eca1be978371efb8f681 *src/stri_trans_casemap.cpp
8c83c91976b44ebcef290131e61c1686 *src/stri_trans_normalization.cpp
f16759c75e1682efd17a50d04e948c66 *src/stri_trans_other.cpp
d296e57c3bba2d0c9ec2fb951e96c13d *src/stri_trans_transliterate.cpp
9764a87dca4729845b5dbf5c7e925d86 *src/stri_ucnv.cpp
0ff791f6303d23e4ac36857c90c588a8 *src/stri_ucnv.h
e77227c5e7acd921504fb129769ce259 *src/stri_uloc.cpp
500df823cc0795f16d15bc411eecfdac *src/stri_utils.cpp
9e121071c528d969cefdc5b02efd5d1d *src/stri_wrap.cpp
bb1dcb7fe7fd5ca35f7ec3626ea59e2f *src/uconfig_local.h.in
7c699ef342589de58701f093695d4ab4 *tools/AC_CXX_HAVE_STL.m4
b58a26e64432c535ed081494fc39783e *tools/AC_CXX_NAMESPACES.m4
stringi/configure.win 0000644 0001762 0000144 00000003501 14750110641 014427 0 ustar ligges users # Copyright (c) 2013-2025, Marek Gagolewski
# This is an architecture-independent configure.win file
# Build settings that get substituted into the *.in templates further below.
ICU_FOUND=0 # use our ICU bundle
# Location of the bundled ICU data and the major version of the bundled ICU.
ICUDT_DIR="icu74/data"
ICU_BUNDLE_VERSION=74
# some systems do not have ResolveLocaleName - this applies to mingw
# on 32-bit windows shipped with older Rtools (R < 4.2)
# The Rscript call below prints 1 if the running R is older than 4.2
# and 0 otherwise; the printed digit becomes the value of the variable.
DISABLE_RESOLVE_LOCALE_NAME=`"${R_HOME}/bin${R_ARCH_BIN}/Rscript.exe" -e '
cat(as.integer(getRversion() < "4.2"))
'`
# Report the chosen settings in the configure output.
echo "ICU_FOUND=${ICU_FOUND}"
echo "ICU_BUNDLE_VERSION=${ICU_BUNDLE_VERSION}"
echo "ICUDT_DIR=${ICUDT_DIR}"
echo "DISABLE_RESOLVE_LOCALE_NAME=${DISABLE_RESOLVE_LOCALE_NAME}"
# Generate src/uconfig_local.h from its .in template: every @NAME@ placeholder
# is replaced by the corresponding value set above, and @ICUDT_ENDIANNESS@
# by the endianness reported by R (.Platform$endian).
# Quoting note: inside the single-quoted R program, the shell quote is closed,
# the shell variable is expanded within double quotes, and the single quote
# is reopened. ICU_FOUND and DISABLE_RESOLVE_LOCALE_NAME are spliced in as
# bare R numbers, whereas ICUDT_DIR and ICU_BUNDLE_VERSION end up inside
# R string literals.
"${R_HOME}/bin${R_ARCH_BIN}/Rscript.exe" -e '
fin <- "src/uconfig_local.h.in";
fout <- "src/uconfig_local.h";
f <- readLines(fin);
f <- gsub("@ICU_FOUND@", '"${ICU_FOUND}"', f, fixed = TRUE);
f <- gsub("@DISABLE_RESOLVE_LOCALE_NAME@", '"${DISABLE_RESOLVE_LOCALE_NAME}"', f, fixed = TRUE);
f <- gsub("@ICUDT_DIR@", "'"${ICUDT_DIR}"'", f, fixed = TRUE);
f <- gsub("@ICU_BUNDLE_VERSION@", "'"${ICU_BUNDLE_VERSION}"'", f, fixed = TRUE);
f <- gsub("@ICUDT_ENDIANNESS@", .Platform$endian, f, fixed = TRUE);
con <- file(fout, "wb") # LF line ending
writeLines(f, con);
close(con)
'
# Generate src/install.libs.R from its .in template using the very same
# placeholder substitutions as for src/uconfig_local.h above.
"${R_HOME}/bin${R_ARCH_BIN}/Rscript.exe" -e '
fin <- "src/install.libs.R.in";
fout <- "src/install.libs.R";
f <- readLines(fin);
f <- gsub("@ICU_FOUND@", '"${ICU_FOUND}"', f, fixed = TRUE);
f <- gsub("@DISABLE_RESOLVE_LOCALE_NAME@", '"${DISABLE_RESOLVE_LOCALE_NAME}"', f, fixed = TRUE);
f <- gsub("@ICUDT_DIR@", "'"${ICUDT_DIR}"'", f, fixed = TRUE);
f <- gsub("@ICU_BUNDLE_VERSION@", "'"${ICU_BUNDLE_VERSION}"'", f, fixed = TRUE);
f <- gsub("@ICUDT_ENDIANNESS@", .Platform$endian, f, fixed = TRUE);
con <- file(fout, "wb") # LF line ending
writeLines(f, con);
close(con)
'
stringi/R/ 0000755 0001762 0000144 00000000000 14771224007 012136 5 ustar ligges users stringi/R/files.R 0000644 0001762 0000144 00000014077 14750110641 013367 0 ustar ligges users # kate: default-dictionary en_US
## This file is part of the 'stringi' package for R.
## Copyright (c) 2013-2025, Marek Gagolewski
## All rights reserved.
##
## Redistribution and use in source and binary forms, with or without
## modification, are permitted provided that the following conditions are met:
##
## 1. Redistributions of source code must retain the above copyright notice,
## this list of conditions and the following disclaimer.
##
## 2. Redistributions in binary form must reproduce the above copyright notice,
## this list of conditions and the following disclaimer in the documentation
## and/or other materials provided with the distribution.
##
## 3. Neither the name of the copyright holder nor the names of its
## contributors may be used to endorse or promote products derived from
## this software without specific prior written permission.
##
## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
## 'AS IS' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING,
## BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
## FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
## HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
## SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
## PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
## OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
## WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
## OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
## EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#' @title
#' Read Text File as Raw
#'
#' @description
#' Reads a text file as-is, with no conversion or text line splitting.
#'
#' @details
#' Once a text file is read into memory,
#' encoding detection (see \code{\link{stri_enc_detect}}),
#' conversion (see \code{\link{stri_encode}}), and/or
#' splitting of text into lines (see \code{\link{stri_split_lines1}})
#' can be performed.
#'
#' The input is fetched in chunks of 4 MiB until a chunk shorter than
#' that is obtained.
#'
#' If \code{con} is a file name, the file is opened in the binary mode
#' and closed when the function exits. A connection object that is not
#' open yet is opened in the binary mode and closed when the function
#' exits. A connection that is already open is read from its current
#' position and is left open.
#'
#' @param con name of the input file or a connection object
#'    (preferably opened in the binary mode)
#' @param fname [DEPRECATED] alias of \code{con}
#'
#' @return
#' Returns a vector of type \code{raw}.
#'
#' @family files
#' @export
stri_read_raw <- function(con, fname = con)
{
    if (!missing(fname) && missing(con)) { # DEPRECATED
        warning("The 'fname' argument in stri_read_raw is a deprecated alias of 'con' and will be removed in a future release of 'stringi'.")
        con <- fname
    }

    if (is.character(con)) {
        con <- file(con, "rb")
        on.exit(close(con))
    } else if (inherits(con, "connection") && !isOpen(con)) {
        # readBin() opens and closes an unopened connection on every call,
        # so each chunk would be read from the very beginning again and
        # the loop below would never finish for inputs of >= bufsize bytes.
        open(con, "rb")
        on.exit(close(con))
    }

    bufsize <- 4194304L  # 4 MiB per chunk
    data <- list()
    n <- 1L
    repeat {
        buf <- readBin(con, what = "raw", size = 1L, n = bufsize)
        data[[n]] <- buf
        n <- n + 1L
        if (length(buf) < bufsize)  # short read: nothing more to fetch
            break
    }

    # concatenate all the chunks into a single raw vector
    do.call(c, data)
}
#' @title
#' Read Text Lines from a Text File
#'
#' @description
#' Reads a text file in its entirety, re-encodes it, and splits it into text lines.
#'
#' @details
#' This aims to be a substitute for the \code{\link{readLines}} function,
#' with the ability to re-encode the input file in a much more robust way,
#' and split the text into lines with \code{\link{stri_split_lines1}}
#' (which conforms with the Unicode guidelines for newline markers).
#'
#' The function calls \code{\link{stri_read_raw}},
#' \code{\link{stri_encode}}, and \code{\link{stri_split_lines1}},
#' in this order.
#'
#' Because of the way this function is currently implemented,
#' maximal file size cannot exceed ~0.67 GB.
#'
#' @param con name of the input file or a connection object
#'    (opened in the binary mode)
#' @param encoding single string; input encoding;
#'    \code{NULL} or \code{''} for the current default encoding.
#' @param fname [DEPRECATED] alias of \code{con}
#'
#' @return
#' Returns a character vector, each text line is a separate string.
#' The output is always marked as UTF-8.
#'
#' An error is raised if \code{encoding} is neither \code{NULL} nor
#' a single non-missing string, or if it is equal to \code{'auto'}.
#'
#' @family files
#' @export
stri_read_lines <- function(con, encoding = NULL,
    fname = con)
{
    if (!missing(fname) && missing(con)) { # DEPRECATED
        warning("The 'fname' argument in stri_read_lines is a deprecated alias of 'con' and will be removed in a future release of 'stringi'.")
        con <- fname
    }

    # `encoding` is used in scalar conditions below; reject vectors of
    # length != 1 and missing values here so that the caller gets a message
    # that names the offending argument
    stopifnot(is.null(encoding) ||
        (is.character(encoding) && length(encoding) == 1L && !is.na(encoding)))

    if (is.null(encoding) || encoding == "")
        encoding <- stri_enc_get()  # this needs to be done manually, see ?stri_encode

    if (encoding == "auto")
        stop("encoding `auto` is no longer supported")  # TODO: remove in the future

    txt <- stri_read_raw(con)
    txt <- stri_encode(txt, encoding, "UTF-8")
    stri_split_lines1(txt)
}
#' @title
#' Write Text Lines to a Text File
#'
#' @description
#' Writes a text file in such a way that each element of a given
#' character vector becomes a separate text line.
#'
#' @details
#' It is a substitute for the \R \code{\link{writeLines}} function,
#' with the ability to easily re-encode the output.
#'
#' We suggest using the UTF-8 encoding for all text files:
#' thus, it is the default one for the output.
#'
#' @param str character vector with data to write
#' @param con name of the output file or a connection object
#'    (opened in the binary mode)
#' @param encoding output encoding, \code{NULL} or \code{''} for
#'    the current default one
#' @param sep newline separator
#' @param fname [DEPRECATED] alias of \code{con}
#'
#' @return
#' This function returns nothing noteworthy.
#'
#' @family files
#' @export
stri_write_lines <- function(str, con, encoding = "UTF-8",
    sep = ifelse(.Platform$OS.type == "windows", "\r\n", "\n"),
    fname = con)
{
    if (missing(con) && !missing(fname)) { # DEPRECATED
        warning("The 'fname' argument in stri_write_lines is a deprecated alias of 'con' and will be removed in a future release of 'stringi'.")
        con <- fname
    }

    stopifnot(is.character(sep), length(sep) == 1)

    # append the separator to every element and glue them into one string
    joined <- stri_join(str, sep, collapse = "")

    # convert from the default encoding to the requested one;
    # the bytes to be written are the first element of the result
    bytes <- stri_encode(joined, "", encoding, to_raw = TRUE)[[1]]

    writeBin(bytes, con, useBytes = TRUE)
    invisible(NULL)
}
stringi/R/encoding_management.R 0000644 0001762 0000144 00000021546 14750110641 016246 0 ustar ligges users # kate: default-dictionary en_US
## This file is part of the 'stringi' package for R.
## Copyright (c) 2013-2025, Marek Gagolewski
## All rights reserved.
##
## Redistribution and use in source and binary forms, with or without
## modification, are permitted provided that the following conditions are met:
##
## 1. Redistributions of source code must retain the above copyright notice,
## this list of conditions and the following disclaimer.
##
## 2. Redistributions in binary form must reproduce the above copyright notice,
## this list of conditions and the following disclaimer in the documentation
## and/or other materials provided with the distribution.
##
## 3. Neither the name of the copyright holder nor the names of its
## contributors may be used to endorse or promote products derived from
## this software without specific prior written permission.
##
## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
## 'AS IS' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING,
## BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
## FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
## HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
## SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
## PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
## OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
## WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
## OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
## EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#' @title
#' List Known Character Encodings
#'
#' @description
#' Gives the list of encodings that are supported by \pkg{ICU}.
#'
#' @details
#' Apart from given encoding identifiers and their aliases,
#' some other specifiers might additionally be available.
#' This is due to the fact that \pkg{ICU} tries to normalize
#' converter names. For instance, \code{'UTF8'} is also valid,
#' see \link{stringi-encoding} for more information.
#'
#' @param simplify single logical value; return a character vector or a
#'    list of character vectors?
#'
#' @return If \code{simplify} is \code{FALSE}, a list of
#' character vectors is returned. Each list element represents a unique
#' character encoding. The \code{name} attribute gives the \pkg{ICU} Canonical
#' Name of an encoding family. The elements (character vectors) are
#' its aliases.
#'
#' If \code{simplify} is \code{TRUE} (the default), then the resulting list
#' is coerced to a character vector and sorted, and returned with
#' removed duplicated entries.
#'
#' @examples
#' stri_enc_list()
#' stri_enc_list(FALSE)
#'
#' @family encoding_management
#' @export
stri_enc_list <- function(simplify=TRUE)
{
    # spelled-out counterpart of isTRUE(simplify)
    as_vector <- is.logical(simplify) && length(simplify) == 1L &&
        !is.na(simplify) && simplify

    encodings <- .Call(C_stri_enc_list)

    if (!as_vector) {
        # order the list by its names, then sort the aliases in each element
        name_order <- stri_order(
            names(encodings), locale="en_US", numeric=TRUE, strength=1
        )
        return(lapply(
            encodings[name_order],
            stri_sort,
            locale="en_US", numeric=TRUE, strength=1
        ))
    }

    # flatten, drop the duplicates, and sort
    identifiers <- unique(unlist(encodings))
    stri_sort(identifiers, locale="en_US", numeric=TRUE, strength=1)
}
#' @title
#' Query a Character Encoding
#'
#' @description
#' Gets basic information on a character encoding.
#'
#' @details
#' An error is raised if the provided encoding is unknown to \pkg{ICU}
#' (see \code{\link{stri_enc_list}} for more details).
#'
#' @param enc \code{NULL} or \code{''} for the default encoding,
#'    or a single string with encoding name
#'
#' @return
#' Returns a list with the following components:
#' \itemize{
#' \item \code{Name.friendly} -- friendly encoding name:
#'    MIME Name or JAVA Name or \pkg{ICU} Canonical Name
#'    (the first of provided ones is selected, see below);
#' \item \code{Name.ICU} -- encoding name as identified by \pkg{ICU};
#' \item \code{Name.*} -- other standardized encoding names,
#'    e.g., \code{Name.UTR22}, \code{Name.IBM}, \code{Name.WINDOWS},
#'    \code{Name.JAVA}, \code{Name.IANA}, \code{Name.MIME} (some of them
#'    may be unavailable for certain encodings);
#' \item \code{ASCII.subset} -- is ASCII a subset of the given encoding?;
#' \item \code{Unicode.1to1} -- for 8-bit encodings only: are all characters
#'    translated to exactly one Unicode code point and is the translation
#'    scheme reversible?;
#' \item \code{CharSize.8bit} -- is this an 8-bit encoding, i.e., do we have
#'    \code{CharSize.min == CharSize.max} and \code{CharSize.min == 1}?;
#' \item \code{CharSize.min} -- minimal number of bytes used
#'    to represent a UChar (in UTF-16, this is not the same as UChar32);
#' \item \code{CharSize.max} -- maximal number of bytes used
#'    to represent a UChar (in UTF-16, this is not the same as UChar32,
#'    i.e., does not reflect the maximal code point representation size).
#' }
#'
#' @family encoding_management
#' @export
stri_enc_info <- function(enc = NULL)
{
    # all the work is delegated to the compiled routine
    return(.Call(C_stri_enc_info, enc))
}
#' @title
#' Set or Get Default Character Encoding in \pkg{stringi}
#'
#' @description
#' \code{stri_enc_set} sets the encoding used to re-encode strings
#' internally (i.e., by \R) declared to be in native encoding,
#' see \link{stringi-encoding} and \code{\link{stri_enc_mark}}.
#' \code{stri_enc_get} returns the currently used default encoding.
#'
#' @details
#' \code{stri_enc_get} is the same as
#' \code{\link{stri_enc_info}(NULL)$Name.friendly}.
#'
#' Note that changing the default encoding may have undesired consequences.
#' Unless you are an expert user and you know what you are doing,
#' \code{stri_enc_set} should only be used if \pkg{ICU} fails to detect
#' your system's encoding correctly (while testing \pkg{stringi}
#' we only encountered such a situation on a very old Solaris machine).
#' Note that \pkg{ICU} tries to match the encoding part of the \code{LC_CTYPE}
#' category as given by \code{\link{Sys.getlocale}}.
#'
#' If you set a default encoding that is neither a superset of ASCII,
#' nor an 8-bit encoding, a warning will be generated,
#' see \link{stringi-encoding} for discussion.
#'
#' \code{stri_enc_set} has no effect if the system ICU assumes that
#' the default charset is always UTF-8 (i.e., where the internal
#' \code{U_CHARSET_IS_UTF8} is defined and set to 1), see
#' \code{\link{stri_info}}.
#'
#' @param enc single string; character encoding name,
#'    see \code{\link{stri_enc_list}} for the list of supported encodings.
#'
#' @return
#' \code{stri_enc_set} returns a string with
#' previously used character encoding, invisibly.
#'
#' \code{stri_enc_get} returns a string with current default character
#' encoding.
#'
#' @family encoding_management
#' @rdname stri_enc_set
#' @export
stri_enc_set <- function(enc)
{
    # remember the current default so that it can be handed back
    old_enc <- stri_enc_get()

    .Call(C_stri_enc_set, enc)

    # We call stri_info, because it generates some warnings,
    # in case any problems are found:
    message(stri_paste("New settings: ", stri_info(short = TRUE)))

    invisible(old_enc)
}
#' @rdname stri_enc_set
#' @export
stri_enc_get <- function()
{
    # query the default encoding and report its friendly name
    default_info <- stri_enc_info(NULL)
    default_info$Name.friendly
}
#' @title
#' Get Declared Encodings of Each String
#'
#' @description
#' Reads declared encodings for each string in a character vector
#' as seen by \pkg{stringi}.
#'
#' @details
#' According to \code{\link{Encoding}},
#' \R has a simple encoding marking mechanism:
#' strings can be declared to be in \code{latin1},
#' \code{UTF-8} or \code{bytes}.
#'
#' Moreover, we may check (via the R/C API) whether
#' a string is in ASCII (\R assumes that this holds if and only if
#' all bytes in a string are not greater than 127,
#' so there is an implicit assumption that your platform uses
#' an encoding that extends ASCII)
#' or in the system's default (a.k.a. \code{unknown} in \code{\link{Encoding}})
#' encoding.
#'
#' Intuitively, the default encoding should be equivalent to
#' the one you use on \code{stdin} (e.g., your 'keyboard').
#' In \pkg{stringi} we assume that such an encoding
#' is equivalent to the one returned by \code{\link{stri_enc_get}}.
#' It is automatically detected by \pkg{ICU}
#' to match -- by default -- the encoding part of the \code{LC_CTYPE} category
#' as given by \code{\link{Sys.getlocale}}.
#'
#' @param str character vector
#'    or an object coercible to a character vector
#'
#' @return Returns a character vector of the same length as \code{str}.
#' Unlike in the \code{\link{Encoding}} function, here the possible encodings are:
#' \code{ASCII}, \code{latin1}, \code{bytes}, \code{native},
#' and \code{UTF-8}. Additionally, missing values are handled properly.
#'
#' This gives exactly the same data that is used by
#' all the functions in \pkg{stringi} to re-encode their inputs.
#'
#' @family encoding_management
#' @export
stri_enc_mark <- function(str)
{
    # all the work is delegated to the compiled routine
    return(.Call(C_stri_enc_mark, str))
}
stringi/R/search_replace_4.R 0000644 0001762 0000144 00000033630 14750110641 015444 0 ustar ligges users # kate: default-dictionary en_US
## This file is part of the 'stringi' package for R.
## Copyright (c) 2013-2025, Marek Gagolewski
## All rights reserved.
##
## Redistribution and use in source and binary forms, with or without
## modification, are permitted provided that the following conditions are met:
##
## 1. Redistributions of source code must retain the above copyright notice,
## this list of conditions and the following disclaimer.
##
## 2. Redistributions in binary form must reproduce the above copyright notice,
## this list of conditions and the following disclaimer in the documentation
## and/or other materials provided with the distribution.
##
## 3. Neither the name of the copyright holder nor the names of its
## contributors may be used to endorse or promote products derived from
## this software without specific prior written permission.
##
## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
## 'AS IS' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING,
## BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
## FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
## HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
## SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
## PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
## OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
## WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
## OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
## EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#' @title
#' Replace Pattern Occurrences
#'
#' @description
#' These functions replace, with the given replacement string, every/first/last
#' substring of the input that matches the specified \code{pattern}.
#'
#' @details
#' By default, all the functions are vectorized over
#' \code{str}, \code{pattern}, \code{replacement} (with recycling
#' of the elements in the shorter vector if necessary).
#' Input that is not part of any match is left unchanged;
#' each match is replaced in the result by the replacement string.
#'
#' However, for \code{stri_replace_all*}, if \code{vectorize_all} is \code{FALSE},
#' then each substring matching any of the supplied \code{pattern}s
#' is replaced by a corresponding \code{replacement} string.
#' In such a case, the vectorization is over \code{str},
#' and - independently - over \code{pattern} and \code{replacement}.
#' In other words, this is equivalent to something like
#' \code{for (i in 1:npatterns) str <- stri_replace_all(str, pattern[i], replacement[i])}.
#' Note that you must set \code{length(pattern) >= length(replacement)}.
#'
#' In case of \code{stri_replace_*_regex},
#' the replacement string may contain references to capture groups
#' (in round parentheses).
#' References are of the form \code{$n}, where \code{n} is the number
#' of the capture group (\code{$1} denotes the first group).
#' For the literal \code{$},
#' escape it with a backslash.
#' Moreover, \code{${name}} are used for named capture groups.
#'
#' Note that \code{stri_replace_last_regex} searches from start to end,
#' but skips overlapping matches, see the example below.
#'
#' \code{stri_replace}, \code{stri_replace_all}, \code{stri_replace_first},
#' and \code{stri_replace_last} are convenience functions; they just call
#' \code{stri_replace_*_*} variants, depending on the arguments used.
#'
#' If you wish to remove white-spaces from the start or end
#' of a string, see \code{\link{stri_trim}}.
#'
#' @param str character vector; strings to search in
#' @param pattern,regex,fixed,coll,charclass character vector;
#' search patterns; for more details refer to \link{stringi-search}
#' @param replacement character vector with replacements for matched patterns
#' @param opts_collator,opts_fixed,opts_regex a named list used to tune up
#' the search engine's settings; see
#' \code{\link{stri_opts_collator}}, \code{\link{stri_opts_fixed}},
#' and \code{\link{stri_opts_regex}}, respectively; \code{NULL}
#' for the defaults
#' @param merge single logical value;
#' should consecutive matches be merged into one string;
#' \code{stri_replace_all_charclass} only
#' @param vectorize_all single logical value;
#' should each occurrence of a pattern in every string
#' be replaced by a corresponding replacement string?;
#' \code{stri_replace_all_*} only
#' @param vectorise_all alias of \code{vectorize_all}
#' @param mode single string;
#' one of: \code{'first'} (the default), \code{'all'}, \code{'last'}
#' @param ... supplementary arguments passed to the underlying functions,
#' including additional settings for \code{opts_collator}, \code{opts_regex},
#' \code{opts_fixed}, and so on
#'
#' @return All the functions return a character vector.
#'
#' @examples
#' stri_replace_all_charclass('aaaa', '[a]', 'b', merge=c(TRUE, FALSE))
#'
#' stri_replace_all_charclass('a\nb\tc d', '\\p{WHITE_SPACE}', ' ')
#' stri_replace_all_charclass('a\nb\tc d', '\\p{WHITE_SPACE}', ' ', merge=TRUE)
#'
#' s <- 'Lorem ipsum dolor sit amet, consectetur adipisicing elit.'
#' stri_replace_all_fixed(s, ' ', '#')
#' stri_replace_all_fixed(s, 'o', '0')
#'
#' stri_replace_all_fixed(c('1', 'NULL', '3'), 'NULL', NA)
#'
#' stri_replace_all_regex(s, ' .*? ', '#')
#' stri_replace_all_regex(s, '(el|s)it', '1234')
#' stri_replace_all_regex('abaca', 'a', c('!', '*'))
#' stri_replace_all_regex('123|456|789', '(\\p{N}).(\\p{N})', '$2-$1')
#' stri_replace_all_regex(c('stringi R', 'REXAMINE', '123'), '( R|R.)', ' r ')
#'
#' # named capture groups are available since ICU 55
#' \dontrun{
#' stri_replace_all_regex('words 123 and numbers 456',
#' '(?<numbers>[0-9]+)', '!${numbers}!')
#' }
#'
#' # Compare the results:
#' stri_replace_all_fixed('The quick brown fox jumped over the lazy dog.',
#' c('quick', 'brown', 'fox'), c('slow', 'black', 'bear'), vectorize_all=TRUE)
#' stri_replace_all_fixed('The quick brown fox jumped over the lazy dog.',
#' c('quick', 'brown', 'fox'), c('slow', 'black', 'bear'), vectorize_all=FALSE)
#'
#' # Compare the results:
#' stri_replace_all_fixed('The quicker brown fox jumped over the lazy dog.',
#' c('quick', 'brown', 'fox'), c('slow', 'black', 'bear'), vectorize_all=FALSE)
#' stri_replace_all_regex('The quicker brown fox jumped over the lazy dog.',
#' '\\b'%s+%c('quick', 'brown', 'fox')%s+%'\\b', c('slow', 'black', 'bear'), vectorize_all=FALSE)
#'
#' # Searching for the last occurrence:
#' # Note the difference - regex searches left to right, with no overlaps.
#' stri_replace_last_fixed("agAGA", "aga", "*", case_insensitive=TRUE)
#' stri_replace_last_regex("agAGA", "aga", "*", case_insensitive=TRUE)
#'
#' @family search_replace
#' @export
#' @rdname stri_replace
stri_replace_all <- function(str, replacement, ..., regex, fixed, coll, charclass)
{
    # Convenience dispatcher: exactly one of `regex`, `fixed`, `coll`,
    # `charclass` must be supplied; it selects the search engine to use.
    has_regex <- !missing(regex)
    has_fixed <- !missing(fixed)
    has_coll <- !missing(coll)
    has_charclass <- !missing(charclass)

    if (has_regex + has_fixed + has_coll + has_charclass != 1)
        stop("you have to specify either `regex`, `fixed`, `coll`, or `charclass`")

    if (has_regex)
        return(stri_replace_all_regex(str, regex, replacement, ...))
    if (has_fixed)
        return(stri_replace_all_fixed(str, fixed, replacement, ...))
    if (has_coll)
        return(stri_replace_all_coll(str, coll, replacement, ...))

    # only `charclass` can be left at this point
    stri_replace_all_charclass(str, charclass, replacement, ...)
}
#' @export
#' @rdname stri_replace
stri_replace_first <- function(str, replacement, ..., regex, fixed, coll, charclass)
{
    # Convenience dispatcher: exactly one of `regex`, `fixed`, `coll`,
    # `charclass` must be supplied; it selects the search engine to use.
    has_regex <- !missing(regex)
    has_fixed <- !missing(fixed)
    has_coll <- !missing(coll)
    has_charclass <- !missing(charclass)

    if (has_regex + has_fixed + has_coll + has_charclass != 1)
        stop("you have to specify either `regex`, `fixed`, `coll`, or `charclass`")

    if (has_regex)
        return(stri_replace_first_regex(str, regex, replacement, ...))
    if (has_fixed)
        return(stri_replace_first_fixed(str, fixed, replacement, ...))
    if (has_coll)
        return(stri_replace_first_coll(str, coll, replacement, ...))

    # only `charclass` can be left at this point
    stri_replace_first_charclass(str, charclass, replacement, ...)
}
#' @export
#' @rdname stri_replace
stri_replace_last <- function(str, replacement, ..., regex, fixed, coll, charclass)
{
    # Convenience dispatcher: exactly one of `regex`, `fixed`, `coll`,
    # `charclass` must be supplied; it selects the search engine to use.
    has_regex <- !missing(regex)
    has_fixed <- !missing(fixed)
    has_coll <- !missing(coll)
    has_charclass <- !missing(charclass)

    if (has_regex + has_fixed + has_coll + has_charclass != 1)
        stop("you have to specify either `regex`, `fixed`, `coll`, or `charclass`")

    if (has_regex)
        return(stri_replace_last_regex(str, regex, replacement, ...))
    if (has_fixed)
        return(stri_replace_last_fixed(str, fixed, replacement, ...))
    if (has_coll)
        return(stri_replace_last_coll(str, coll, replacement, ...))

    # only `charclass` can be left at this point
    stri_replace_last_charclass(str, charclass, replacement, ...)
}
#' @export
#' @rdname stri_replace
stri_replace <- function(str, replacement, ..., regex, fixed, coll, charclass,
    mode = c("first", "all", "last"))
{
    # `first` is the default for compatibility with stringr;
    # note that match.arg() is slow
    mode <- match.arg(mode)

    # The pattern arguments are forwarded as-is, so an argument missing here
    # is still seen as missing by the function that is called.
    if (mode == "first") {
        stri_replace_first(str, replacement, ..., regex = regex,
            fixed = fixed, coll = coll, charclass = charclass)
    } else if (mode == "last") {
        stri_replace_last(str, replacement, ..., regex = regex,
            fixed = fixed, coll = coll, charclass = charclass)
    } else {
        # match.arg() leaves "all" as the only remaining choice
        stri_replace_all(str, replacement, ..., regex = regex,
            fixed = fixed, coll = coll, charclass = charclass)
    }
}
#' @export
#' @rdname stri_replace
stri_replace_all_charclass <- function(str, pattern, replacement, merge = FALSE,
    vectorize_all = TRUE, vectorise_all = vectorize_all)
{
    # an explicitly supplied British-spelling alias takes precedence
    if (!missing(vectorise_all)) {
        vectorize_all <- vectorise_all
    }

    .Call(C_stri_replace_all_charclass, str, pattern, replacement, merge, vectorize_all)
}
#' @export
#' @rdname stri_replace
stri_replace_first_charclass <- function(str, pattern, replacement) {
    # thin wrapper: the whole job is delegated to the compiled routine
    .Call(C_stri_replace_first_charclass, str, pattern, replacement)
}
#' @export
#' @rdname stri_replace
stri_replace_last_charclass <- function(str, pattern, replacement) {
    # thin wrapper: the whole job is delegated to the compiled routine
    .Call(C_stri_replace_last_charclass, str, pattern, replacement)
}
#' @export
#' @rdname stri_replace
stri_replace_all_coll <- function(str, pattern, replacement,
    vectorize_all = TRUE, vectorise_all = vectorize_all, ..., opts_collator = NULL)
{
    # an explicitly supplied British-spelling alias takes precedence
    if (!missing(vectorise_all)) {
        vectorize_all <- vectorise_all
    }

    # extra arguments in `...` are combined with `opts_collator` and passed
    # through stri_opts_collator() to build the final settings
    if (!missing(...)) {
        settings <- as.list(c(opts_collator, ...))
        opts_collator <- do.call(stri_opts_collator, settings)
    }

    .Call(C_stri_replace_all_coll, str, pattern, replacement, vectorize_all, opts_collator)
}
#' @export
#' @rdname stri_replace
stri_replace_first_coll <- function(str, pattern, replacement, ..., opts_collator = NULL)
{
    # extra arguments in `...` are combined with `opts_collator` and passed
    # through stri_opts_collator() to build the final settings
    if (!missing(...)) {
        settings <- as.list(c(opts_collator, ...))
        opts_collator <- do.call(stri_opts_collator, settings)
    }

    .Call(C_stri_replace_first_coll, str, pattern, replacement, opts_collator)
}
#' @export
#' @rdname stri_replace
stri_replace_last_coll <- function(str, pattern, replacement, ..., opts_collator = NULL)
{
    # extra arguments in `...` are combined with `opts_collator` and passed
    # through stri_opts_collator() to build the final settings
    if (!missing(...)) {
        settings <- as.list(c(opts_collator, ...))
        opts_collator <- do.call(stri_opts_collator, settings)
    }

    .Call(C_stri_replace_last_coll, str, pattern, replacement, opts_collator)
}
#' @export
#' @rdname stri_replace
stri_replace_all_fixed <- function(str, pattern, replacement,
    vectorize_all = TRUE, vectorise_all = vectorize_all, ..., opts_fixed = NULL)
{
    # an explicitly supplied British-spelling alias takes precedence
    if (!missing(vectorise_all)) {
        vectorize_all <- vectorise_all
    }

    # extra arguments in `...` are combined with `opts_fixed` and passed
    # through stri_opts_fixed() to build the final settings
    if (!missing(...)) {
        settings <- as.list(c(opts_fixed, ...))
        opts_fixed <- do.call(stri_opts_fixed, settings)
    }

    .Call(C_stri_replace_all_fixed, str, pattern, replacement, vectorize_all, opts_fixed)
}
#' @export
#' @rdname stri_replace
stri_replace_first_fixed <- function(str, pattern, replacement, ..., opts_fixed = NULL)
{
    # extra arguments in `...` are combined with `opts_fixed` and passed
    # through stri_opts_fixed() to build the final settings
    if (!missing(...)) {
        settings <- as.list(c(opts_fixed, ...))
        opts_fixed <- do.call(stri_opts_fixed, settings)
    }

    .Call(C_stri_replace_first_fixed, str, pattern, replacement, opts_fixed)
}
#' @export
#' @rdname stri_replace
stri_replace_last_fixed <- function(str, pattern, replacement, ..., opts_fixed = NULL)
{
    # extra arguments in `...` are combined with `opts_fixed` and passed
    # through stri_opts_fixed() to build the final settings
    if (!missing(...)) {
        settings <- as.list(c(opts_fixed, ...))
        opts_fixed <- do.call(stri_opts_fixed, settings)
    }

    .Call(C_stri_replace_last_fixed, str, pattern, replacement, opts_fixed)
}
#' @export
#' @rdname stri_replace
stri_replace_all_regex <- function(str, pattern, replacement,
    vectorize_all = TRUE, vectorise_all = vectorize_all, ..., opts_regex = NULL)
{
    # an explicitly supplied British-spelling alias takes precedence
    if (!missing(vectorise_all)) {
        vectorize_all <- vectorise_all
    }

    # extra arguments in `...` are combined with `opts_regex` and passed
    # through stri_opts_regex() to build the final settings
    if (!missing(...)) {
        settings <- as.list(c(opts_regex, ...))
        opts_regex <- do.call(stri_opts_regex, settings)
    }

    .Call(C_stri_replace_all_regex, str, pattern, replacement, vectorize_all, opts_regex)
}
#' @export
#' @rdname stri_replace
stri_replace_first_regex <- function(str, pattern, replacement, ..., opts_regex = NULL)
{
    # extra arguments in `...` are combined with `opts_regex` and passed
    # through stri_opts_regex() to build the final settings
    if (!missing(...)) {
        settings <- as.list(c(opts_regex, ...))
        opts_regex <- do.call(stri_opts_regex, settings)
    }

    .Call(C_stri_replace_first_regex, str, pattern, replacement, opts_regex)
}
#' @export
#' @rdname stri_replace
stri_replace_last_regex <- function(str, pattern, replacement, ..., opts_regex = NULL)
{
    # extra arguments in `...` are combined with `opts_regex` and passed
    # through stri_opts_regex() to build the final settings
    if (!missing(...)) {
        settings <- as.list(c(opts_regex, ...))
        opts_regex <- do.call(stri_opts_regex, settings)
    }

    .Call(C_stri_replace_last_regex, str, pattern, replacement, opts_regex)
}
#' Convert gsub-Style Replacement Strings
#'
#' @description
#' Converts \code{\link[base]{gsub}}-style replacement strings
#' to those which can be used in \code{\link{stri_replace}}.
#' In particular, \code{$} becomes \code{\\$} and \code{\\1} becomes \code{$1}.
#'
#' @param x character vector
#'
#' @return Returns a character vector.
#'
#' @family search_replace
#' @export
stri_replace_rstr <- function(x) {
    # thin wrapper: the conversion itself is done by the compiled routine
    .Call(C_stri_replace_rstr, x)
}
stringi/R/search_extract_4.R 0000644 0001762 0000144 00000031335 14750110641 015503 0 ustar ligges users # kate: default-dictionary en_US
## This file is part of the 'stringi' package for R.
## Copyright (c) 2013-2025, Marek Gagolewski
## All rights reserved.
##
## Redistribution and use in source and binary forms, with or without
## modification, are permitted provided that the following conditions are met:
##
## 1. Redistributions of source code must retain the above copyright notice,
## this list of conditions and the following disclaimer.
##
## 2. Redistributions in binary form must reproduce the above copyright notice,
## this list of conditions and the following disclaimer in the documentation
## and/or other materials provided with the distribution.
##
## 3. Neither the name of the copyright holder nor the names of its
## contributors may be used to endorse or promote products derived from
## this software without specific prior written permission.
##
## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
## 'AS IS' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING,
## BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
## FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
## HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
## SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
## PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
## OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
## WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
## OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
## EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#' @title
#' Extract Pattern Occurrences
#'
#' @description
#' These functions extract all substrings matching a given pattern.
#'
#' \code{stri_extract_all_*} extracts all the matches.
#' \code{stri_extract_first_*} and \code{stri_extract_last_*}
#' yield the first or the last matches, respectively.
#'
#' @details
#' Vectorized over \code{str} and \code{pattern} (with recycling
#' of the elements in the shorter vector if necessary). This allows one to,
#' for instance, search for one pattern in each given string,
#' search for each pattern in one given string,
#' and search for the i-th pattern within the i-th string.
#'
#' Check out \code{\link{stri_match}} for the extraction of matches
#' to individual regex capture groups.
#'
#' \code{stri_extract}, \code{stri_extract_all}, \code{stri_extract_first},
#' and \code{stri_extract_last} are convenience functions.
#' They merely call \code{stri_extract_*_*}, depending on the arguments used.
#'
#' @param str character vector; strings to search in
#' @param pattern,regex,fixed,coll,charclass character vector;
#' search patterns; for more details refer to \link{stringi-search}
#' @param opts_collator,opts_fixed,opts_regex a named list to tune up
#' the search engine's settings; see \code{\link{stri_opts_collator}},
#' \code{\link{stri_opts_fixed}}, and \code{\link{stri_opts_regex}},
#' respectively; \code{NULL} for the defaults
#' @param merge single logical value; indicates whether consecutive pattern
#' matches will be merged into one string;
#' \code{stri_extract_all_charclass} only
#' @param simplify single logical value;
#' if \code{TRUE} or \code{NA}, then a character matrix is returned;
#' otherwise (the default), a list of character vectors is given, see Value;
#' \code{stri_extract_all_*} only
#' @param omit_no_match single logical value; if \code{FALSE},
#' then a missing value will indicate that there was no match;
#' \code{stri_extract_all_*} only
#' @param mode single string;
#' one of: \code{'first'} (the default), \code{'all'}, \code{'last'}
#' @param ... supplementary arguments passed to the underlying functions,
#' including additional settings for \code{opts_collator}, \code{opts_regex},
#' and so on
#'
#' @return
#' For \code{stri_extract_all*}, if \code{simplify=FALSE} (the default), then
#' a list of character vectors is returned. Each list element
#' represents the results of a different search scenario.
#' If a pattern is not found and \code{omit_no_match=FALSE},
#' then a character vector of length 1
#' with single \code{NA} value will be generated.
#'
#' Otherwise, i.e., if \code{simplify} is not \code{FALSE},
#' then \code{\link{stri_list2matrix}} with \code{byrow=TRUE} argument
#' is called on the resulting object.
#' In such a case, the function yields a character matrix with an appropriate
#' number of rows (according to the length of \code{str}, \code{pattern}, etc.).
#' Note that \code{\link{stri_list2matrix}}'s \code{fill} argument is set
#' either to an empty string or \code{NA}, depending on
#' whether \code{simplify} is \code{TRUE} or \code{NA}, respectively.
#'
#' \code{stri_extract_first*} and \code{stri_extract_last*}
#' return a character vector. A \code{NA} element indicates a no-match.
#'
#' Note that \code{stri_extract_last_regex} searches from start to end,
#' but skips overlapping matches, see the example below.
#'
#' @examples
#' stri_extract_all('XaaaaX', regex=c('\\p{Ll}', '\\p{Ll}+', '\\p{Ll}{2,3}', '\\p{Ll}{2,3}?'))
#' stri_extract_all('Bartolini', coll='i')
#' stri_extract_all('stringi is so good!', charclass='\\p{Zs}') # all white-spaces
#'
#' stri_extract_all_charclass(c('AbcdeFgHijK', 'abc', 'ABC'), '\\p{Ll}')
#' stri_extract_all_charclass(c('AbcdeFgHijK', 'abc', 'ABC'), '\\p{Ll}', merge=FALSE)
#' stri_extract_first_charclass('AaBbCc', '\\p{Ll}')
#' stri_extract_last_charclass('AaBbCc', '\\p{Ll}')
#'
#' \dontrun{
#' # emoji support available since ICU 57
#' stri_extract_all_charclass(stri_enc_fromutf32(32:55200), '\\p{EMOJI}')
#' }
#'
#' stri_extract_all_coll(c('AaaaaaaA', 'AAAA'), 'a')
#' stri_extract_first_coll(c('Yy\u00FD', 'AAA'), 'y', strength=2, locale='sk_SK')
#' stri_extract_last_coll(c('Yy\u00FD', 'AAA'), 'y', strength=1, locale='sk_SK')
#'
#' stri_extract_all_regex('XaaaaX', c('\\p{Ll}', '\\p{Ll}+', '\\p{Ll}{2,3}', '\\p{Ll}{2,3}?'))
#' stri_extract_first_regex('XaaaaX', c('\\p{Ll}', '\\p{Ll}+', '\\p{Ll}{2,3}', '\\p{Ll}{2,3}?'))
#' stri_extract_last_regex('XaaaaX', c('\\p{Ll}', '\\p{Ll}+', '\\p{Ll}{2,3}', '\\p{Ll}{2,3}?'))
#'
#' stri_list2matrix(stri_extract_all_regex('XaaaaX', c('\\p{Ll}', '\\p{Ll}+')))
#' stri_extract_all_regex('XaaaaX', c('\\p{Ll}', '\\p{Ll}+'), simplify=TRUE)
#' stri_extract_all_regex('XaaaaX', c('\\p{Ll}', '\\p{Ll}+'), simplify=NA)
#'
#' stri_extract_all_fixed('abaBAba', 'Aba', case_insensitive=TRUE)
#' stri_extract_all_fixed('abaBAba', 'Aba', case_insensitive=TRUE, overlap=TRUE)
#'
#' # Searching for the last occurrence:
#' # Note the difference - regex searches left to right, with no overlaps.
#' stri_extract_last_fixed("agAGA", "aga", case_insensitive=TRUE)
#' stri_extract_last_regex("agAGA", "aga", case_insensitive=TRUE)
#'
#' @family search_extract
#'
#' @export
#' @rdname stri_extract
stri_extract_all <- function(str, ..., regex, fixed, coll, charclass)
{
    # Convenience dispatcher: exactly one of `regex`, `fixed`, `coll`,
    # `charclass` must be supplied; it selects the search engine to use.
    has_regex <- !missing(regex)
    has_fixed <- !missing(fixed)
    has_coll <- !missing(coll)
    has_charclass <- !missing(charclass)

    if (has_regex + has_fixed + has_coll + has_charclass != 1)
        stop("you have to specify either `regex`, `fixed`, `coll`, or `charclass`")

    if (has_regex)
        return(stri_extract_all_regex(str, regex, ...))
    if (has_fixed)
        return(stri_extract_all_fixed(str, fixed, ...))
    if (has_coll)
        return(stri_extract_all_coll(str, coll, ...))

    # only `charclass` can be left at this point
    stri_extract_all_charclass(str, charclass, ...)
}
#' @export
#' @rdname stri_extract
stri_extract_first <- function(str, ..., regex, fixed, coll, charclass)
{
    # Convenience dispatcher: exactly one of `regex`, `fixed`, `coll`,
    # `charclass` must be supplied; it selects the search engine to use.
    has_regex <- !missing(regex)
    has_fixed <- !missing(fixed)
    has_coll <- !missing(coll)
    has_charclass <- !missing(charclass)

    if (has_regex + has_fixed + has_coll + has_charclass != 1)
        stop("you have to specify either `regex`, `fixed`, `coll`, or `charclass`")

    if (has_regex)
        return(stri_extract_first_regex(str, regex, ...))
    if (has_fixed)
        return(stri_extract_first_fixed(str, fixed, ...))
    if (has_coll)
        return(stri_extract_first_coll(str, coll, ...))

    # only `charclass` can be left at this point
    stri_extract_first_charclass(str, charclass, ...)
}
#' @export
#' @rdname stri_extract
stri_extract_last <- function(str, ..., regex, fixed, coll, charclass)
{
    # Convenience dispatcher: exactly one of `regex`, `fixed`, `coll`,
    # `charclass` must be supplied; it selects the search engine to use.
    has_regex <- !missing(regex)
    has_fixed <- !missing(fixed)
    has_coll <- !missing(coll)
    has_charclass <- !missing(charclass)

    if (has_regex + has_fixed + has_coll + has_charclass != 1)
        stop("you have to specify either `regex`, `fixed`, `coll`, or `charclass`")

    if (has_regex)
        return(stri_extract_last_regex(str, regex, ...))
    if (has_fixed)
        return(stri_extract_last_fixed(str, fixed, ...))
    if (has_coll)
        return(stri_extract_last_coll(str, coll, ...))

    # only `charclass` can be left at this point
    stri_extract_last_charclass(str, charclass, ...)
}
#' @export
#' @rdname stri_extract
stri_extract <- function(str, ..., regex, fixed, coll, charclass,
    mode = c("first", "all", "last"))
{
    # `first` is the default for compatibility with stringr;
    # note that match.arg() is slow
    mode <- match.arg(mode)

    # The pattern arguments are forwarded as-is, so an argument missing here
    # is still seen as missing by the function that is called.
    if (mode == "first") {
        stri_extract_first(str, ..., regex = regex, fixed = fixed,
            coll = coll, charclass = charclass)
    } else if (mode == "last") {
        stri_extract_last(str, ..., regex = regex,
            fixed = fixed, coll = coll, charclass = charclass)
    } else {
        # match.arg() leaves "all" as the only remaining choice
        stri_extract_all(str, ..., regex = regex, fixed = fixed,
            coll = coll, charclass = charclass)
    }
}
#' @export
#' @rdname stri_extract
stri_extract_all_charclass <- function(str, pattern, merge = TRUE, simplify = FALSE,
    omit_no_match = FALSE) {
    # thin wrapper: the whole job is delegated to the compiled routine
    .Call(C_stri_extract_all_charclass, str, pattern, merge, simplify, omit_no_match)
}
#' @export
#' @rdname stri_extract
stri_extract_first_charclass <- function(str, pattern) {
    # thin wrapper: the whole job is delegated to the compiled routine
    .Call(C_stri_extract_first_charclass, str, pattern)
}
#' @export
#' @rdname stri_extract
stri_extract_last_charclass <- function(str, pattern) {
    # thin wrapper: the whole job is delegated to the compiled routine
    .Call(C_stri_extract_last_charclass, str, pattern)
}
#' @export
#' @rdname stri_extract
stri_extract_all_coll <- function(str, pattern, simplify = FALSE,
    omit_no_match = FALSE, ..., opts_collator = NULL)
{
    # extra arguments in `...` are combined with `opts_collator` and passed
    # through stri_opts_collator() to build the final settings
    if (!missing(...)) {
        settings <- as.list(c(opts_collator, ...))
        opts_collator <- do.call(stri_opts_collator, settings)
    }

    .Call(C_stri_extract_all_coll, str, pattern, simplify, omit_no_match, opts_collator)
}
#' @export
#' @rdname stri_extract
stri_extract_first_coll <- function(str, pattern, ..., opts_collator = NULL)
{
    # extra arguments in `...` are combined with `opts_collator` and passed
    # through stri_opts_collator() to build the final settings
    if (!missing(...)) {
        settings <- as.list(c(opts_collator, ...))
        opts_collator <- do.call(stri_opts_collator, settings)
    }

    .Call(C_stri_extract_first_coll, str, pattern, opts_collator)
}
#' @export
#' @rdname stri_extract
stri_extract_last_coll <- function(str, pattern, ..., opts_collator = NULL)
{
    # extra arguments in `...` are combined with `opts_collator` and passed
    # through stri_opts_collator() to build the final settings
    if (!missing(...)) {
        settings <- as.list(c(opts_collator, ...))
        opts_collator <- do.call(stri_opts_collator, settings)
    }

    .Call(C_stri_extract_last_coll, str, pattern, opts_collator)
}
#' @export
#' @rdname stri_extract
stri_extract_all_regex <- function(str, pattern, simplify = FALSE,
    omit_no_match = FALSE, ..., opts_regex = NULL)
{
    # extra arguments in `...` are combined with `opts_regex` and passed
    # through stri_opts_regex() to build the final settings
    if (!missing(...)) {
        settings <- as.list(c(opts_regex, ...))
        opts_regex <- do.call(stri_opts_regex, settings)
    }

    .Call(C_stri_extract_all_regex, str, pattern, simplify, omit_no_match, opts_regex)
}
#' @export
#' @rdname stri_extract
stri_extract_first_regex <- function(str, pattern, ..., opts_regex = NULL)
{
    # extra arguments in `...` are combined with `opts_regex` and passed
    # through stri_opts_regex() to build the final settings
    if (!missing(...)) {
        settings <- as.list(c(opts_regex, ...))
        opts_regex <- do.call(stri_opts_regex, settings)
    }

    .Call(C_stri_extract_first_regex, str, pattern, opts_regex)
}
#' @export
#' @rdname stri_extract
stri_extract_last_regex <- function(str, pattern, ..., opts_regex = NULL)
{
    # extra arguments in `...` are combined with `opts_regex` and passed
    # through stri_opts_regex() to build the final settings
    if (!missing(...)) {
        settings <- as.list(c(opts_regex, ...))
        opts_regex <- do.call(stri_opts_regex, settings)
    }

    .Call(C_stri_extract_last_regex, str, pattern, opts_regex)
}
#' @export
#' @rdname stri_extract
stri_extract_all_fixed <- function(str, pattern, simplify = FALSE,
    omit_no_match = FALSE, ..., opts_fixed = NULL)
{
    # extra arguments in `...` are combined with `opts_fixed` and passed
    # through stri_opts_fixed() to build the final settings
    if (!missing(...)) {
        settings <- as.list(c(opts_fixed, ...))
        opts_fixed <- do.call(stri_opts_fixed, settings)
    }

    .Call(C_stri_extract_all_fixed, str, pattern, simplify, omit_no_match, opts_fixed)
}
#' @export
#' @rdname stri_extract
stri_extract_first_fixed <- function(str, pattern, ..., opts_fixed = NULL)
{
    # extra arguments in `...` are combined with `opts_fixed` and passed
    # through stri_opts_fixed() to build the final settings
    if (!missing(...)) {
        settings <- as.list(c(opts_fixed, ...))
        opts_fixed <- do.call(stri_opts_fixed, settings)
    }

    .Call(C_stri_extract_first_fixed, str, pattern, opts_fixed)
}
#' @export
#' @rdname stri_extract
stri_extract_last_fixed <- function(str, pattern, ..., opts_fixed = NULL)
{
    # extra arguments in `...` are combined with `opts_fixed` and passed
    # through stri_opts_fixed() to build the final settings
    if (!missing(...)) {
        settings <- as.list(c(opts_fixed, ...))
        opts_fixed <- do.call(stri_opts_fixed, settings)
    }

    .Call(C_stri_extract_last_fixed, str, pattern, opts_fixed)
}
stringi/R/locale_management.R 0000644 0001762 0000144 00000012501 14750110641 015706 0 ustar ligges users # kate: default-dictionary en_US
## This file is part of the 'stringi' package for R.
## Copyright (c) 2013-2025, Marek Gagolewski
## All rights reserved.
##
## Redistribution and use in source and binary forms, with or without
## modification, are permitted provided that the following conditions are met:
##
## 1. Redistributions of source code must retain the above copyright notice,
## this list of conditions and the following disclaimer.
##
## 2. Redistributions in binary form must reproduce the above copyright notice,
## this list of conditions and the following disclaimer in the documentation
## and/or other materials provided with the distribution.
##
## 3. Neither the name of the copyright holder nor the names of its
## contributors may be used to endorse or promote products derived from
## this software without specific prior written permission.
##
## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
## 'AS IS' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING,
## BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
## FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
## HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
## SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
## PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
## OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
## WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
## OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
## EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#' @title
#' List Available Locales
#'
#' @description
#' Creates a character vector with all available locale identifiers.
#'
#' @details
#' Note that some of the services may be unavailable in some locales.
#' Querying for locale-specific services is always performed
#' during the resource request.
#'
#' See \link{stringi-locale} for more information.
#'
#' @return
#' Returns a character vector with locale identifiers
#' that are known to \pkg{ICU}.
#'
#' @examples
#' stri_locale_list()
#'
#' @family locale_management
#' @export
stri_locale_list <- function() {
    # fetch the identifiers from the compiled routine, then sort them
    # with fixed collation settings
    locales <- .Call(C_stri_locale_list)
    stri_sort(locales, locale = "en_US", numeric = TRUE, strength = 1)
}
#' @title
#' Set or Get Default Locale in \pkg{stringi}
#'
#' @description
#' \code{stri_locale_set} changes the default locale for all the functions
#' in the \pkg{stringi} package,
#' i.e., establishes the meaning of the ``\code{NULL} locale'' argument
#' of locale-sensitive functions.
#' \code{stri_locale_get}
#' gives the current default locale.
#'
#' @details
#' See \link{stringi-locale} for more information on the effect of
#' changing the default locale.
#'
#' \code{stri_locale_get} is the same as \code{\link{stri_locale_info}(NULL)$Name}.
#'
#' @param locale single string of the form \code{Language},
#' \code{Language_Country}, or \code{Language_Country_Variant}, e.g.,
#' \code{'en_US'}, see \code{\link{stri_locale_list}}.
#'
#' @return
#' \code{stri_locale_set} returns a string with
#' previously used locale, invisibly.
#'
#' \code{stri_locale_get} returns a string of the form \code{Language},
#' \code{Language_Country}, or \code{Language_Country_Variant},
#' e.g., \code{'en_US'}.
#'
#' @family locale_management
#' @rdname stri_locale_set
#' @examples
#' \dontrun{
#' oldloc <- stri_locale_set('pt_BR')
#' # ... some locale-dependent operations
#' # ... note that you may always modify a locale per-call
#' # ... changing the default locale is convenient if you perform
#' # ... many operations
#' stri_locale_set(oldloc) # restore the previous default locale
#' }
#' @export
stri_locale_set <- function(locale) {
    # remember the current default so that the caller can restore it later
    old_locale <- stri_locale_get()
    .Call(C_stri_locale_set, locale)

    # stri_info() is called because it generates some warnings
    # in case any problems are found
    banner <- stri_paste("You are now working with ", stri_info(short = TRUE))
    message(banner)

    invisible(old_locale)
}
#' @rdname stri_locale_set
#' @export
stri_locale_get <- function() {
    # query the default locale (NULL) and return only its `Name` component
    default_info <- stri_locale_info(NULL)
    default_info$Name
}
#' @title
#' Query Given Locale
#'
#' @description
#' Provides some basic information on a given locale identifier.
#'
#' @details
#' With this function you may obtain some basic information
#' on any provided locale identifier,
#' even if it is unsupported by \pkg{ICU} or if you pass a malformed locale
#' identifier (the one that is not, e.g., of the form Language_Country).
#' See \link{stringi-locale} for discussion.
#'
#' This function does not do anything really complicated. In many
#' cases it is similar to a call to
#' \code{\link{as.list}(\link{stri_split_fixed}(locale, '_', 3L)[[1]])},
#' with \code{locale} case mapped.
#' It may be used, however, to get insight on how ICU understands a given
#' locale identifier.
#'
#' @param locale \code{NULL} or \code{''} for default locale,
#' or a single string with locale identifier.
#'
#' @return
#' Returns a list with the following named character strings:
#' \code{Language}, \code{Country}, \code{Variant}, and
#' \code{Name}, being their underscore separated combination.
#'
#' @examples
#' stri_locale_info('pl_PL')
#' stri_locale_info('Pl_pL') # the same result
#'
#' @family locale_management
#' @export
stri_locale_info <- function(locale = NULL) {
    # thin wrapper: the whole job is delegated to the compiled routine
    .Call(C_stri_locale_info, locale)
}
stringi/R/search_split_4.R 0000644 0001762 0000144 00000022415 14750110641 015163 0 ustar ligges users # kate: default-dictionary en_US
## This file is part of the 'stringi' package for R.
## Copyright (c) 2013-2025, Marek Gagolewski
## All rights reserved.
##
## Redistribution and use in source and binary forms, with or without
## modification, are permitted provided that the following conditions are met:
##
## 1. Redistributions of source code must retain the above copyright notice,
## this list of conditions and the following disclaimer.
##
## 2. Redistributions in binary form must reproduce the above copyright notice,
## this list of conditions and the following disclaimer in the documentation
## and/or other materials provided with the distribution.
##
## 3. Neither the name of the copyright holder nor the names of its
## contributors may be used to endorse or promote products derived from
## this software without specific prior written permission.
##
## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
## 'AS IS' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING,
## BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
## FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
## HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
## SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
## PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
## OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
## WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
## OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
## EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#' @title
#' Split a String By Pattern Matches
#'
#' @description
#' These functions split each element in \code{str} into substrings.
#' \code{pattern} defines the delimiters that separate the inputs into tokens.
#' The input data between the matches become the fields themselves.
#'
#' @details
#' Vectorized over \code{str}, \code{pattern}, \code{n}, and \code{omit_empty}
#' (with recycling of the elements in the shorter vector if necessary).
#'
#' If \code{n} is negative, then all pieces are extracted.
#' Otherwise, if \code{tokens_only} is \code{FALSE} (which is the default),
#' then \code{n-1} tokens are extracted (if possible) and the \code{n}-th string
#' gives the remainder (see Examples).
#' On the other hand, if \code{tokens_only} is \code{TRUE},
#' then only full tokens (up to \code{n} pieces) are extracted.
#'
#' \code{omit_empty} is applied during the split process: if it is set to
#' \code{TRUE}, then tokens of zero length are ignored. Thus, empty strings
#' will never appear in the resulting vector. On the other hand, if
#' \code{omit_empty} is \code{NA}, then empty tokens are substituted with
#' missing strings.
#'
#' Empty search patterns are not supported. If you wish to split a
#' string into individual characters, use, e.g.,
#' \code{\link{stri_split_boundaries}(str, type='character')} for THE Unicode way.
#'
#' \code{stri_split} is a convenience function. It calls either
#' \code{stri_split_regex}, \code{stri_split_fixed}, \code{stri_split_coll},
#' or \code{stri_split_charclass}, depending on the argument used.
#'
#' @param str character vector; strings to search in
#' @param pattern,regex,fixed,coll,charclass character vector;
#' search patterns; for more details refer to \link{stringi-search}
#' @param n integer vector, maximal number of strings to return,
#' and, at the same time, maximal number of text boundaries to look for
#' @param omit_empty logical vector; determines whether empty
#' tokens should be removed from the result (\code{TRUE} or \code{FALSE})
#' or replaced with \code{NA}s (\code{NA})
#' @param tokens_only single logical value;
#' may affect the result if \code{n} is positive, see Details
#' @param simplify single logical value;
#' if \code{TRUE} or \code{NA}, then a character matrix is returned;
#' otherwise (the default), a list of character vectors is given, see Value
#' @param opts_collator,opts_fixed,opts_regex a named list used to tune up
#' the search engine's settings; see
#' \code{\link{stri_opts_collator}}, \code{\link{stri_opts_fixed}},
#' and \code{\link{stri_opts_regex}}, respectively; \code{NULL}
#' for the defaults
#' @param ... supplementary arguments passed to the underlying functions,
#' including additional settings for \code{opts_collator}, \code{opts_regex},
#' \code{opts_fixed}, and so on
#'
#' @return If \code{simplify=FALSE} (the default),
#' then the functions return a list of character vectors.
#'
#' Otherwise, \code{\link{stri_list2matrix}} with \code{byrow=TRUE}
#' and \code{n_min=n} arguments is called on the resulting object.
#' In such a case, a character matrix with an appropriate number of rows
#' (according to the length of \code{str}, \code{pattern}, etc.)
#' is returned. Note that \code{\link{stri_list2matrix}}'s \code{fill} argument
#' is set to an empty string and \code{NA}, for \code{simplify} equal to
#' \code{TRUE} and \code{NA}, respectively.
#'
#' @examples
#' stri_split_fixed('a_b_c_d', '_')
#' stri_split_fixed('a_b_c__d', '_')
#' stri_split_fixed('a_b_c__d', '_', omit_empty=TRUE)
#' stri_split_fixed('a_b_c__d', '_', n=2, tokens_only=FALSE) # 'a' & remainder
#' stri_split_fixed('a_b_c__d', '_', n=2, tokens_only=TRUE) # 'a' & 'b' only
#' stri_split_fixed('a_b_c__d', '_', n=4, omit_empty=TRUE, tokens_only=TRUE)
#' stri_split_fixed('a_b_c__d', '_', n=4, omit_empty=FALSE, tokens_only=TRUE)
#' stri_split_fixed('a_b_c__d', '_', omit_empty=NA)
#' stri_split_fixed(c('ab_c', 'd_ef_g', 'h', ''), '_', n=1, tokens_only=TRUE, omit_empty=TRUE)
#' stri_split_fixed(c('ab_c', 'd_ef_g', 'h', ''), '_', n=2, tokens_only=TRUE, omit_empty=TRUE)
#' stri_split_fixed(c('ab_c', 'd_ef_g', 'h', ''), '_', n=3, tokens_only=TRUE, omit_empty=TRUE)
#'
#' stri_list2matrix(stri_split_fixed(c('ab,c', 'd,ef,g', ',h', ''), ',', omit_empty=TRUE))
#' stri_split_fixed(c('ab,c', 'd,ef,g', ',h', ''), ',', omit_empty=FALSE, simplify=TRUE)
#' stri_split_fixed(c('ab,c', 'd,ef,g', ',h', ''), ',', omit_empty=NA, simplify=TRUE)
#' stri_split_fixed(c('ab,c', 'd,ef,g', ',h', ''), ',', omit_empty=TRUE, simplify=TRUE)
#' stri_split_fixed(c('ab,c', 'd,ef,g', ',h', ''), ',', omit_empty=NA, simplify=NA)
#'
#' stri_split_regex(c('ab,c', 'd,ef , g', ', h', ''),
#' '\\p{WHITE_SPACE}*,\\p{WHITE_SPACE}*', omit_empty=NA, simplify=TRUE)
#'
#' stri_split_charclass('Lorem ipsum dolor sit amet', '\\p{WHITE_SPACE}')
#' stri_split_charclass(' Lorem ipsum dolor', '\\p{WHITE_SPACE}', n=3,
#' omit_empty=c(FALSE, TRUE))
#'
#' stri_split_regex('Lorem ipsum dolor sit amet',
#' '\\p{Z}+') # see also stri_split_charclass
#'
#' @export
#' @rdname stri_split
#' @family search_split
#' @export
stri_split <- function(str, ..., regex, fixed, coll, charclass)
{
    # Record which of the four mutually exclusive pattern arguments
    # the caller has actually supplied.
    supplied <- c(
        regex = !missing(regex),
        fixed = !missing(fixed),
        coll = !missing(coll),
        charclass = !missing(charclass))

    # exactly one search engine must be selected
    if (sum(supplied) != 1)
        stop("you have to specify either `regex`, `fixed`, `coll`, or `charclass`")

    # dispatch on the name of the only supplied argument;
    # everything in `...` is forwarded to the chosen engine
    switch(names(supplied)[supplied],
        regex = stri_split_regex(str, regex, ...),
        fixed = stri_split_fixed(str, fixed, ...),
        coll = stri_split_coll(str, coll, ...),
        charclass = stri_split_charclass(str, charclass, ...))
}
#' @export
#' @rdname stri_split
stri_split_fixed <- function(str, pattern, n = -1L,
    omit_empty = FALSE, tokens_only = FALSE,
    simplify = FALSE, ..., opts_fixed = NULL)
{
    # Both omit_empty and tokens_only default to FALSE
    # for compatibility with the stringr package.

    if (!missing(...)) {
        # settings given via `...` are appended to opts_fixed and the
        # combined list is passed through stri_opts_fixed()
        settings <- as.list(c(opts_fixed, ...))
        opts_fixed <- do.call(stri_opts_fixed, settings)
    }

    .Call(C_stri_split_fixed, str, pattern, n, omit_empty, tokens_only,
        simplify, opts_fixed)
}
#' @export
#' @rdname stri_split
stri_split_regex <- function(str, pattern, n = -1L,
    omit_empty = FALSE, tokens_only = FALSE,
    simplify = FALSE, ..., opts_regex = NULL)
{
    # Both omit_empty and tokens_only default to FALSE
    # for compatibility with the stringr package.

    if (!missing(...)) {
        # settings given via `...` are appended to opts_regex and the
        # combined list is passed through stri_opts_regex()
        settings <- as.list(c(opts_regex, ...))
        opts_regex <- do.call(stri_opts_regex, settings)
    }

    .Call(C_stri_split_regex, str, pattern, n, omit_empty, tokens_only,
        simplify, opts_regex)
}
#' @export
#' @rdname stri_split
stri_split_coll <- function(str, pattern, n = -1L,
    omit_empty = FALSE, tokens_only = FALSE,
    simplify = FALSE, ..., opts_collator = NULL)
{
    # Both omit_empty and tokens_only default to FALSE
    # for compatibility with the stringr package.

    if (!missing(...)) {
        # settings given via `...` are appended to opts_collator and the
        # combined list is passed through stri_opts_collator()
        settings <- as.list(c(opts_collator, ...))
        opts_collator <- do.call(stri_opts_collator, settings)
    }

    .Call(C_stri_split_coll, str, pattern, n, omit_empty, tokens_only,
        simplify, opts_collator)
}
#' @export
#' @rdname stri_split
stri_split_charclass <- function(str, pattern, n = -1L,
omit_empty = FALSE, tokens_only = FALSE,
simplify = FALSE)
{
# omit_empty defaults to FALSE for compatibility with the stringr package
# tokens_only defaults to FALSE for compatibility with the stringr package
# All arguments are forwarded unchanged to the compiled routine; unlike the
# other stri_split_* variants in this file, there is no opts_* list to merge.
.Call(C_stri_split_charclass, str, pattern, n, omit_empty, tokens_only, simplify)
}
stringi/R/install.R 0000644 0001762 0000144 00000011540 14750110641 013723 0 ustar ligges users # kate: default-dictionary en_US
## This file is part of the 'stringi' package for R.
## Copyright (c) 2013-2025, Marek Gagolewski
## All rights reserved.
##
## Redistribution and use in source and binary forms, with or without
## modification, are permitted provided that the following conditions are met:
##
## 1. Redistributions of source code must retain the above copyright notice,
## this list of conditions and the following disclaimer.
##
## 2. Redistributions in binary form must reproduce the above copyright notice,
## this list of conditions and the following disclaimer in the documentation
## and/or other materials provided with the distribution.
##
## 3. Neither the name of the copyright holder nor the names of its
## contributors may be used to endorse or promote products derived from
## this software without specific prior written permission.
##
## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
## 'AS IS' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING,
## BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
## FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
## HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
## SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
## PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
## OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
## WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
## OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
## EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# internal functions used whilst installing stringi
# Lookup table of ICU data library file names. The keys have the form
# paste0(.Platform$endian, icu_bundle_version), which is how
# stri_download_icudt() indexes this vector.
icudt_fname <- c(
little74 = "icudt74l.dat",
big74 = "icudt74b.dat"
)
# This function is not exported; it is called by install.libs.r(.in)
stri_install_icudt <- function(outpath, inpath, icu_bundle_version)
{
    # Obtains the xz-compressed ICU data library via stri_download_icudt(),
    # decompresses it next to the archive, verifies its md5 checksum against
    # the accompanying "<file>.md5sum", and copies the result into outpath.
    #
    # outpath: directory that receives the decompressed data file
    # inpath, icu_bundle_version: passed on to stri_download_icudt()
    #
    # Returns invisible TRUE on success and invisible FALSE if the archive
    # is unavailable, the checksum does not match, or the copy fails.

    # remember about importFrom tools md5sum -> stringi-package.R
    xzpath <- stri_download_icudt(inpath, icu_bundle_version)
    if (identical(xzpath, FALSE) || !file.exists(xzpath)) {
        return(invisible(FALSE))
    }

    # drop the trailing ".xz" to get the name of the decompressed file
    basepath <- substr(xzpath, 1, nchar(xzpath)-3)

    message("decompressing ", xzpath, " to: ", outpath)

    # Stream the archive in 8192-byte chunks. The connections are released
    # via on.exit so that they do not leak if reading or writing fails;
    # both are closed by the time the helper returns, i.e., before
    # the checksum is computed below.
    decompress <- function(src, dest) {
        fin <- xzfile(src, "rb")
        on.exit(close(fin), add=TRUE)
        fout <- file(dest, "wb")
        on.exit(close(fout), add=TRUE)
        repeat {
            chunk <- readBin(fin, raw(), 8192L)
            if (length(chunk) <= 0) break
            writeBin(chunk, fout)
        }
        invisible(NULL)
    }
    decompress(xzpath, basepath)

    # expected checksum: first token of the .md5sum file
    md5ex <- scan(sprintf("%s.md5sum", basepath), what=character(), n=1, quiet=TRUE)
    md5ob <- tools::md5sum(basepath)
    if (is.na(md5ob) || md5ob != md5ex) {
        message(sprintf("md5sum mismatch for %s (%s vs %s)",
            basepath, as.character(md5ob), as.character(md5ex)
        ))
        file.remove(basepath)
        return(invisible(FALSE))
    }

    copied <- file.copy(basepath, file.path(outpath, basename(basepath)), overwrite=TRUE)
    file.remove(basepath)  # the temporary decompressed file is no longer needed

    # file.copy() signals failure through its return value only;
    # do not report success if nothing has been installed
    if (!isTRUE(copied)) {
        message(sprintf("Error: %s could not be copied to %s", basepath, outpath))
        return(invisible(FALSE))
    }

    message(sprintf("%s installed successfully", basepath))
    invisible(TRUE)
}
# This function is not exported;
# it is called by configure(.ac) and stri_install_icudt above
stri_download_icudt <- function(inpath, icu_bundle_version)
{
    # Locates or downloads the xz-compressed ICU data library matching this
    # platform's endianness and the requested ICU bundle version.
    #
    # inpath: directory where the .xz archive is looked up and saved
    #     (created if it does not exist)
    # icu_bundle_version: number selecting the icudt_fname entry and the
    #     icu<version> directory on the mirrors
    #
    # Returns the path to the .xz archive on success
    # and invisible FALSE otherwise.

    key <- paste0(.Platform$endian, icu_bundle_version)
    fname <- icudt_fname[key]
    if (is.na(fname)) {
        # unknown endianness/version combination: without this guard,
        # we would go on to request a file literally named "NA.xz"
        message(sprintf("Error: no ICU data library is registered for %s", key))
        return(invisible(FALSE))
    }

    path <- file.path(inpath, fname)

    # the data files are fetched from a fixed commit of the stringi repository
    commit_id <- "bbe75eca8f9ef4dc72dc5c6e36c8f8306a324b7e"
    mirrors <- sprintf(
        "%s://raw.githubusercontent.com/gagolews/stringi/%s/src/icu%d/data/",
        c("https", "http"),
        commit_id,
        icu_bundle_version
    )

    xzpath <- sprintf("%s.xz", path)

    # an archive that is already in place is reused as-is
    if (file.exists(xzpath)) {
        message(sprintf("%s exists", xzpath))
        return(xzpath)
    }

    # Tries a single mirror; yields TRUE on success
    # or a string describing the failure.
    download_from_mirror <- function(href, fname, xzpath) {
        tryCatch({
            suppressWarnings(file.remove(xzpath))

            # download icudt
            if (
                download.file(
                    paste(href, fname, sep = ""), xzpath, mode = "wb"
                ) != 0
            ) {
                return("download error")
            }

            if (!file.exists(xzpath)) {
                return("download error")
            }

            TRUE
        }, error = function(e) as.character(e))
    }

    message(sprintf("downloading the ICU data library (%s)...", xzpath))

    if (!dir.exists(inpath)) suppressWarnings(dir.create(inpath))

    # mirrors are tried in order until one of them succeeds
    allok <- FALSE
    for (m in mirrors) {
        status <- download_from_mirror(m, sprintf("%s.xz", fname), xzpath)
        if (identical(status, TRUE)) {
            allok <- TRUE
            break
        }
        else message(status)
    }

    if (!allok || !file.exists(xzpath)) {
        # do not leave a partial download behind
        suppressWarnings(file.remove(xzpath))
        message(sprintf("Error: %s could not be downloaded", xzpath))
        return(invisible(FALSE))
    }

    message(sprintf("%s downloaded successfully", xzpath))
    return(xzpath)
}
stringi/R/search_match_4.R 0000644 0001762 0000144 00000016247 14750110641 015132 0 ustar ligges users # kate: default-dictionary en_US
## This file is part of the 'stringi' package for R.
## Copyright (c) 2013-2025, Marek Gagolewski
## All rights reserved.
##
## Redistribution and use in source and binary forms, with or without
## modification, are permitted provided that the following conditions are met:
##
## 1. Redistributions of source code must retain the above copyright notice,
## this list of conditions and the following disclaimer.
##
## 2. Redistributions in binary form must reproduce the above copyright notice,
## this list of conditions and the following disclaimer in the documentation
## and/or other materials provided with the distribution.
##
## 3. Neither the name of the copyright holder nor the names of its
## contributors may be used to endorse or promote products derived from
## this software without specific prior written permission.
##
## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
## 'AS IS' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING,
## BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
## FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
## HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
## SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
## PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
## OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
## WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
## OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
## EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#' @title
#' Extract Regex Pattern Matches, Together with Capture Groups
#'
#' @description
#' These functions extract substrings in \code{str} that
#' match a given regex \code{pattern}. Additionally, they extract matches
#' to every \emph{capture group}, i.e., to all the sub-patterns given
#' in round parentheses.
#'
#' @details
#' Vectorized over \code{str} and \code{pattern} (with recycling
#' of the elements in the shorter vector if necessary). This allows,
#' for instance, searching for one pattern in each given string,
#' searching for each pattern in one given string,
#' and searching for the i-th pattern within the i-th string.
#'
#' If no pattern match is detected and \code{omit_no_match=FALSE},
#' then \code{NA}s are included in the resulting matrix (matrices), see Examples.
#'
#' \code{stri_match}, \code{stri_match_all}, \code{stri_match_first},
#' and \code{stri_match_last} are convenience functions.
#' They merely call \code{stri_match_*_regex} and are
#' provided for consistency with other string searching functions' wrappers,
#' see, among others, \code{\link{stri_extract}}.
#'
#' @param str character vector; strings to search in
#' @param pattern,regex character vector;
#' search patterns; for more details refer to \link{stringi-search}
#' @param opts_regex a named list with \pkg{ICU} Regex settings,
#' see \code{\link{stri_opts_regex}}; \code{NULL}
#' for default settings
#' @param omit_no_match single logical value; if \code{FALSE},
#' then a row with missing values will indicate that there was no match;
#' \code{stri_match_all_*} only
#' @param cg_missing single string to be used if a capture group match
#' is unavailable
#' @param mode single string;
#' one of: \code{'first'} (the default), \code{'all'}, \code{'last'}
#' @param ... supplementary arguments passed to the underlying functions,
#' including additional settings for \code{opts_regex}
#'
#' @return
#' For \code{stri_match_all*},
#' a list of character matrices is returned. Each list element
#' represents the results of a different search scenario.
#'
#' For \code{stri_match_first*} and \code{stri_match_last*}
#' a character matrix is returned.
#' Each row corresponds to a different search result.
#'
#' The first matrix column gives the whole match. The second one corresponds to
#' the first capture group, the third -- the second capture group, and so on.
#'
#' If regular expressions feature a named capture group,
#' the matrix columns will be named accordingly.
#' However, for \code{stri_match_first*} and \code{stri_match_last*}
#' this will only be the case if there is a single pattern.
#'
#'
#' @examples
#' stri_match_all_regex('breakfast=eggs, lunch=pizza, dessert=icecream',
#' '(\\w+)=(\\w+)')
#' stri_match_all_regex(c('breakfast=eggs', 'lunch=pizza', 'no food here'),
#' '(\\w+)=(\\w+)')
#' stri_match_all_regex(c('breakfast=eggs;lunch=pizza',
#' 'breakfast=bacon;lunch=spaghetti', 'no food here'),
#' '(\\w+)=(\\w+)')
#' stri_match_all_regex(c('breakfast=eggs;lunch=pizza',
#' 'breakfast=bacon;lunch=spaghetti', 'no food here'),
#' '(?<when>\\w+)=(?<what>\\w+)') # named capture groups
#' stri_match_first_regex(c('breakfast=eggs;lunch=pizza',
#' 'breakfast=bacon;lunch=spaghetti', 'no food here'),
#' '(\\w+)=(\\w+)')
#' stri_match_last_regex(c('breakfast=eggs;lunch=pizza',
#' 'breakfast=bacon;lunch=spaghetti', 'no food here'),
#' '(\\w+)=(\\w+)')
#'
#' stri_match_first_regex(c('abcd', ':abcd', ':abcd:'), '^(:)?([^:]*)(:)?$')
#' stri_match_first_regex(c('abcd', ':abcd', ':abcd:'), '^(:)?([^:]*)(:)?$', cg_missing='')
#'
#' # Match all the pattern of the form XYX, including overlapping matches:
#' stri_match_all_regex('ACAGAGACTTTAGATAGAGAAGA', '(?=(([ACGT])[ACGT]\\2))')[[1]][,2]
#' # Compare the above to:
#' stri_extract_all_regex('ACAGAGACTTTAGATAGAGAAGA', '([ACGT])[ACGT]\\1')
#'
#' @family search_extract
#' @export
#' @rdname stri_match
stri_match_all <- function(str, ..., regex)
{
# Thin wrapper: `regex` is passed as the second (pattern) argument and
# everything in `...` is handed on to stri_match_all_regex().
stri_match_all_regex(str, regex, ...)
}
#' @export
#' @rdname stri_match
stri_match_first <- function(str, ..., regex)
{
# Thin wrapper: `regex` is passed as the second (pattern) argument and
# everything in `...` is handed on to stri_match_first_regex().
stri_match_first_regex(str, regex, ...)
}
#' @export
#' @rdname stri_match
stri_match_last <- function(str, ..., regex)
{
# Thin wrapper: `regex` is passed as the second (pattern) argument and
# everything in `...` is handed on to stri_match_last_regex().
stri_match_last_regex(str, regex, ...)
}
#' @export
#' @rdname stri_match
stri_match <- function(str, ..., regex, mode = c("first", "all", "last"))
{
    # `first` is default for compatibility with stringr
    mode <- match.arg(mode)  # this is slow

    # match.arg() guarantees that mode is one of the three choices,
    # hence the final branch can only mean "all"
    if (mode == "first") {
        stri_match_first_regex(str, regex, ...)
    } else if (mode == "last") {
        stri_match_last_regex(str, regex, ...)
    } else {
        stri_match_all_regex(str, regex, ...)
    }
}
#' @export
#' @rdname stri_match
stri_match_all_regex <- function(str, pattern,
    omit_no_match = FALSE, cg_missing = NA_character_,
    ..., opts_regex = NULL)
{
    if (!missing(...)) {
        # settings given via `...` are appended to opts_regex and the
        # combined list is passed through stri_opts_regex()
        settings <- as.list(c(opts_regex, ...))
        opts_regex <- do.call(stri_opts_regex, settings)
    }

    .Call(C_stri_match_all_regex, str, pattern, omit_no_match, cg_missing,
        opts_regex)
}
#' @export
#' @rdname stri_match
stri_match_first_regex <- function(str, pattern, cg_missing = NA_character_, ...,
    opts_regex = NULL)
{
    if (!missing(...)) {
        # settings given via `...` are appended to opts_regex and the
        # combined list is passed through stri_opts_regex()
        settings <- as.list(c(opts_regex, ...))
        opts_regex <- do.call(stri_opts_regex, settings)
    }

    .Call(C_stri_match_first_regex, str, pattern, cg_missing, opts_regex)
}
#' @export
#' @rdname stri_match
stri_match_last_regex <- function(str, pattern, cg_missing = NA_character_, ...,
    opts_regex = NULL)
{
    if (!missing(...)) {
        # settings given via `...` are appended to opts_regex and the
        # combined list is passed through stri_opts_regex()
        settings <- as.list(c(opts_regex, ...))
        opts_regex <- do.call(stri_opts_regex, settings)
    }

    .Call(C_stri_match_last_regex, str, pattern, cg_missing, opts_regex)
}
stringi/R/time_format.R 0000644 0001762 0000144 00000033477 14750110641 014600 0 ustar ligges users # kate: default-dictionary en_US
## This file is part of the 'stringi' package for R.
## Copyright (c) 2013-2025, Marek Gagolewski
## All rights reserved.
##
## Redistribution and use in source and binary forms, with or without
## modification, are permitted provided that the following conditions are met:
##
## 1. Redistributions of source code must retain the above copyright notice,
## this list of conditions and the following disclaimer.
##
## 2. Redistributions in binary form must reproduce the above copyright notice,
## this list of conditions and the following disclaimer in the documentation
## and/or other materials provided with the distribution.
##
## 3. Neither the name of the copyright holder nor the names of its
## contributors may be used to endorse or promote products derived from
## this software without specific prior written permission.
##
## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
## 'AS IS' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING,
## BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
## FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
## HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
## SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
## PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
## OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
## WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
## OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
## EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#' @title
#' Date and Time Formatting and Parsing
#'
#' @description
#' These functions convert a given date/time object
#' to a character vector, or vice versa.
#'
#' @details
#' Vectorized over \code{format} and \code{time} or \code{str}.
#'
#' When parsing strings, unspecified date-time fields
#' (e.g., seconds where only hours and minutes are given)
#' are based on today's midnight in the local time zone
#' (for compatibility with \code{\link[base]{strptime}}).
#'
#' By default, \code{stri_datetime_format} (for compatibility
#' with the \code{\link[base]{strftime}} function)
#' formats a date/time object using the current default time zone.
#'
#' \code{format} may be one of \code{DT_STYLE} or \code{DT_relative_STYLE},
#' where \code{DT} is equal to \code{date}, \code{time}, or \code{datetime},
#' and \code{STYLE} is equal to \code{full}, \code{long}, \code{medium},
#' or \code{short}. This gives a locale-dependent date and/or time format.
#' Note that currently \pkg{ICU} does not support \code{relative}
#' \code{time} formats, thus this flag is currently ignored in such a context.
#'
#' Otherwise, \code{format} is a pattern:
#' a string where specific sequences of characters are replaced
#' with date/time data from a calendar when formatting or used
#' to generate data for a calendar when parsing.
#' For example, \code{y} stands for 'year'. Characters
#' may be used multiple times:
#' \code{yy} might produce \code{99}, whereas \code{yyyy} yields \code{1999}.
#' For most numerical fields, the number of characters specifies
#' the field width. For example, if \code{h} is the hour, \code{h} might
#' produce \code{5}, but \code{hh} yields \code{05}.
#' For some characters, the count specifies whether an abbreviated
#' or full form should be used.
#'
#' Two single quotes represent a literal single quote, either
#' inside or outside single quotes. Text within single quotes
#' is not interpreted in any way (except for two adjacent single quotes).
#' Otherwise, all ASCII letters from \code{a} to \code{z} and
#' \code{A} to \code{Z} are reserved as syntax characters, and require quoting
#' if they are to represent literal characters. In addition, certain
#' ASCII punctuation characters may become available in the future
#' (e.g., \code{:} being interpreted as the time separator and \code{/}
#' as a date separator, and replaced by respective
#' locale-sensitive characters in display).
#'
#' \tabular{llll}{
#' \bold{Symbol} \tab \bold{Meaning} \tab \bold{Example(s)} \tab \bold{Output} \cr
#' G \tab era designator \tab G, GG, or GGG \tab AD \cr
#' \tab \tab GGGG \tab Anno Domini \cr
#' \tab \tab GGGGG \tab A \cr
#' y \tab year \tab yy \tab 96 \cr
#' \tab \tab y or yyyy \tab 1996 \cr
# Y \tab year of 'Week of Year' \tab Y \tab 1997 \cr
#' u \tab extended year \tab u \tab 4601 \cr
#' U \tab cyclic year name, as in Chinese lunar calendar \tab U \tab \cr
#' r \tab related Gregorian year \tab r \tab 1996 \cr
#' Q \tab quarter \tab Q or QQ \tab 02 \cr
#' \tab \tab QQQ \tab Q2 \cr
#' \tab \tab QQQQ \tab 2nd quarter \cr
#' \tab \tab QQQQQ \tab 2 \cr
#' q \tab Stand Alone quarter \tab q or qq \tab 02 \cr
#' \tab \tab qqq \tab Q2 \cr
#' \tab \tab qqqq \tab 2nd quarter \cr
#' \tab \tab qqqqq \tab 2 \cr
#' M \tab month in year \tab M or MM \tab 09 \cr
#' \tab \tab MMM \tab Sep \cr
#' \tab \tab MMMM \tab September \cr
#' \tab \tab MMMMM \tab S \cr
#' L \tab Stand Alone month in year \tab L or LL \tab 09 \cr
#' \tab \tab LLL \tab Sep \cr
#' \tab \tab LLLL \tab September \cr
#' \tab \tab LLLLL \tab S \cr
#' w \tab week of year \tab w or ww \tab 27 \cr
#' W \tab week of month \tab W \tab 2 \cr
#' d \tab day in month \tab d \tab 2 \cr
#' \tab \tab dd \tab 02 \cr
#' D \tab day of year \tab D \tab 189 \cr
#' F \tab day of week in month \tab F \tab 2 (2nd Wed in July) \cr
#' g \tab modified Julian day \tab g \tab 2451334 \cr
#' E \tab day of week \tab E, EE, or EEE \tab Tue \cr
#' \tab \tab EEEE \tab Tuesday \cr
#' \tab \tab EEEEE \tab T \cr
#' \tab \tab EEEEEE \tab Tu \cr
#' e \tab local day of week \tab e or ee \tab 2 \cr
#' \tab (example: if Monday is 1st day, Tuesday is 2nd) \tab eee \tab Tue \cr
#' \tab \tab eeee \tab Tuesday \cr
#' \tab \tab eeeee \tab T \cr
#' \tab \tab eeeeee \tab Tu \cr
#' c \tab Stand Alone local day of week \tab c or cc \tab 2 \cr
#' \tab \tab ccc \tab Tue \cr
#' \tab \tab cccc \tab Tuesday \cr
#' \tab \tab ccccc \tab T \cr
#' \tab \tab cccccc \tab Tu \cr
#' a \tab am/pm marker \tab a \tab pm \cr
#' h \tab hour in am/pm (1~12) \tab h \tab 7 \cr
#' \tab \tab hh \tab 07 \cr
#' H \tab hour in day (0~23) \tab H \tab 0 \cr
#' \tab \tab HH \tab 00 \cr
#' k \tab hour in day (1~24) \tab k \tab 24 \cr
#' \tab \tab kk \tab 24 \cr
#' K \tab hour in am/pm (0~11) \tab K \tab 0 \cr
#' \tab \tab KK \tab 00 \cr
#' m \tab minute in hour \tab m \tab 4 \cr
#' \tab \tab mm \tab 04 \cr
#' s \tab second in minute \tab s \tab 5 \cr
#' \tab \tab ss \tab 05 \cr
#' S \tab fractional second - truncates (like other time fields) \tab S \tab 2 \cr
#' \tab to the count of letters when formatting. Appends \tab SS \tab 23 \cr
#' \tab zeros if more than 3 letters specified. Truncates at \tab SSS \tab 235 \cr
#' \tab three significant digits when parsing. \tab SSSS \tab 2350 \cr
#' A \tab milliseconds in day \tab A \tab 61201235 \cr
#' z \tab Time Zone: specific non-location \tab z, zz, or zzz \tab PDT \cr
#' \tab \tab zzzz \tab Pacific Daylight Time \cr
#' Z \tab Time Zone: ISO8601 basic hms? / RFC 822 \tab Z, ZZ, or ZZZ \tab -0800 \cr
#' \tab Time Zone: long localized GMT (=OOOO) \tab ZZZZ \tab GMT-08:00 \cr
#' \tab Time Zone: ISO8601 extended hms? (=XXXXX) \tab ZZZZZ \tab -08:00, -07:52:58, Z \cr
#' O \tab Time Zone: short localized GMT \tab O \tab GMT-8 \cr
#' \tab Time Zone: long localized GMT (=ZZZZ) \tab OOOO \tab GMT-08:00 \cr
#' v \tab Time Zone: generic non-location \tab v \tab PT \cr
#' \tab (falls back first to VVVV) \tab vvvv \tab Pacific Time or Los Angeles Time \cr
#' V \tab Time Zone: short time zone ID \tab V \tab uslax \cr
#' \tab Time Zone: long time zone ID \tab VV \tab America/Los_Angeles \cr
#' \tab Time Zone: time zone exemplar city \tab VVV \tab Los Angeles \cr
#' \tab Time Zone: generic location (falls back to OOOO) \tab VVVV \tab Los Angeles Time \cr
#' X \tab Time Zone: ISO8601 basic hm?, with Z for 0 \tab X \tab -08, +0530, Z \cr
#' \tab Time Zone: ISO8601 basic hm, with Z \tab XX \tab -0800, Z \cr
#' \tab Time Zone: ISO8601 extended hm, with Z \tab XXX \tab -08:00, Z \cr
#' \tab Time Zone: ISO8601 basic hms?, with Z \tab XXXX \tab -0800, -075258, Z \cr
#' \tab Time Zone: ISO8601 extended hms?, with Z \tab XXXXX \tab -08:00, -07:52:58, Z \cr
#' x \tab Time Zone: ISO8601 basic hm?, without Z for 0 \tab x \tab -08, +0530 \cr
#' \tab Time Zone: ISO8601 basic hm, without Z \tab xx \tab -0800 \cr
#' \tab Time Zone: ISO8601 extended hm, without Z \tab xxx \tab -08:00 \cr
#' \tab Time Zone: ISO8601 basic hms?, without Z \tab xxxx \tab -0800, -075258 \cr
#' \tab Time Zone: ISO8601 extended hms?, without Z \tab xxxxx \tab -08:00, -07:52:58 \cr
#' ' \tab escape for text \tab ' \tab (nothing) \cr
#' ' ' \tab two single quotes produce one \tab ' ' \tab '
#' }
#'
#' Note that any characters in the pattern that are not in the ranges
#' of \code{[a-z]} and \code{[A-Z]} will be treated as quoted text.
#' For instance, characters like \code{:}, \code{.}, \code{ } (a space),
#' \code{#} and \code{@@} will appear in the resulting time text
#' even if they are not enclosed within single quotes. The single quote is used
#' to ``escape'' the letters. Two single quotes in a row,
#' inside or outside a quoted sequence, represent a ``real'' single quote.
#'
#'
#' A few examples:
#'
#' \tabular{ll}{
#' \bold{Example Pattern} \tab \bold{Result} \cr
#' yyyy.MM.dd 'at' HH:mm:ss zzz \tab 2015.12.31 at 23:59:59 GMT+1 \cr
#' EEE, MMM d, ''yy \tab czw., gru 31, '15 \cr
#' h:mm a \tab 11:59 PM \cr
#' hh 'o''clock' a, zzzz \tab 11 o'clock PM, GMT+01:00 \cr
#' K:mm a, z \tab 11:59 PM, GMT+1 \cr
#' yyyyy.MMMM.dd GGG hh:mm aaa \tab 2015.grudnia.31 n.e. 11:59 PM \cr
#' uuuu-MM-dd'T'HH:mm:ssZ \tab 2015-12-31T23:59:59+0100 (the ISO 8601 guideline) \cr
#' }
#'
#' @param time an object of class \code{\link{POSIXct}} with date-time data
#' to be formatted
#' (\code{as.POSIXct} will be called on character vectors
#' and objects of class \code{POSIXlt}, \code{Date}, and \code{factor})
#' @param str character vector with strings to be parsed
#' @param format character vector, see Details; see also \code{\link{stri_datetime_fstr}}
#' @param tz \code{NULL} or \code{''} for the default time zone
#' or a single string with a timezone identifier,
#' see \code{\link{stri_timezone_get}} and \code{\link{stri_timezone_list}}
#' @param lenient single logical value; should date/time parsing be lenient?
#' @param locale \code{NULL} or \code{''} for the default locale,
#' or a single string with locale identifier; a non-Gregorian calendar
#' may be specified by setting the \code{@@calendar=name} keyword
#'
#' @return
#' \code{stri_datetime_format} returns a character vector.
#'
#' \code{stri_datetime_parse} returns an object of class \code{\link{POSIXct}}.
#'
#' @references
#' \emph{Formatting Dates and Times} -- ICU User Guide,
#' \url{https://unicode-org.github.io/icu/userguide/format_parse/datetime/}
#'
#'
#' @examples
#' x <- c('2015-02-28', '2015-02-29')
#' stri_datetime_parse(x, 'yyyy-MM-dd')
#' stri_datetime_parse(x, 'yyyy-MM-dd', lenient=TRUE)
#' stri_datetime_parse(x %s+% " 17:13", "yyyy-MM-dd HH:mm")
#' stri_datetime_parse('19 lipca 2015', 'date_long', locale='pl_PL')
#' stri_datetime_format(stri_datetime_now(), 'datetime_relative_medium')
#'
#' @rdname stri_datetime_format
#' @family datetime
#' @export
stri_datetime_format <- function(
time, format = "uuuu-MM-dd HH:mm:ss", tz = NULL, locale = NULL
) {
# All four arguments are passed as-is to the compiled routine.
.Call(C_stri_datetime_format, time, format, tz, locale)
}
#' @export
#' @rdname stri_datetime_format
#' @aliases stri_datetime_format
stri_datetime_parse <- function(
str, format = "uuuu-MM-dd HH:mm:ss",
lenient = FALSE, tz = NULL, locale = NULL
) {
# All five arguments are passed as-is to the compiled routine.
.Call(C_stri_datetime_parse, str, format, lenient, tz, locale)
}
#' @title
#' Convert \code{strptime}-Style Format Strings
#'
#' @description
#' This function converts \code{\link[base]{strptime}} or
#' \code{\link[base]{strftime}}-style
#' format strings to \pkg{ICU} format strings that may be used
#' in \code{\link{stri_datetime_parse}} and \code{\link{stri_datetime_format}}
#' functions.
#'
#' @details
#' For more details on conversion specifiers please refer to
#' the manual page of \code{\link[base]{strptime}}. Most of the formatters
#' of the form \code{\%x}, where \code{x} is a letter, are supported.
#' Moreover, each \code{\%\%} is replaced with \code{\%}.
#'
#' Warnings are given in the case of \code{\%x}, \code{\%X}, \code{\%u},
#' \code{\%w}, \code{\%g}, \code{\%G}, \code{\%c}, \code{\%U}, and \code{\%W}
#' as in such circumstances either \pkg{ICU} does not
#' support the functionality requested using the string format API
#' or there are some inconsistencies between base R and \pkg{ICU}.
#'
#' @param x character vector of date/time format strings
#'
#' @param ignore_special if \code{FALSE}, special identifiers like
#' \code{"datetime_full"} or \code{date_relative_short}
#' (see \code{\link{stri_datetime_format}}) are left as-is
#'
#' @return Returns a character vector.
#'
#' @examples
#' stri_datetime_fstr('%Y-%m-%d %H:%M:%S')
#'
#' @family datetime
#' @export
stri_datetime_fstr <- function(x, ignore_special=TRUE)
{
    x <- .Call(C_stri_datetime_fstr, x)

    # spelled-out equivalent of isTRUE(ignore_special)
    ignore_special <- (
        is.logical(ignore_special) &&
        length(ignore_special) == 1L &&
        !is.na(ignore_special) &&
        ignore_special
    )

    # nothing more to do for empty results or when special identifiers
    # are not to be handled separately
    if (ignore_special || length(x) <= 0)
        return(x)

    # special identifiers: every <kind>_<style> combination of the following
    formats <- outer(
        c("date", "time", "datetime", "date_relative", "datetime_relative"),
        c("full", "long", "medium", "short"),
        stri_paste,
        sep="_"
    )

    # items of x equal to a single-quoted special identifier
    # are replaced with the bare identifier
    idx <- match(x, stringi::stri_sprintf("'%s'", formats))
    hit <- !is.na(idx)  # FALSE for NAs in x and for items not in the list
    x[hit] <- formats[idx[hit]]

    x
}
# ?DateTimeClasses
# cut
# round
# trunc
# time + z
# z + time
# time - z
# time1 lop time2
stringi/R/search_startsendswith_4.R 0000644 0001762 0000144 00000016661 14750110641 017124 0 ustar ligges users # kate: default-dictionary en_US
## This file is part of the 'stringi' package for R.
## Copyright (c) 2013-2025, Marek Gagolewski
## All rights reserved.
##
## Redistribution and use in source and binary forms, with or without
## modification, are permitted provided that the following conditions are met:
##
## 1. Redistributions of source code must retain the above copyright notice,
## this list of conditions and the following disclaimer.
##
## 2. Redistributions in binary form must reproduce the above copyright notice,
## this list of conditions and the following disclaimer in the documentation
## and/or other materials provided with the distribution.
##
## 3. Neither the name of the copyright holder nor the names of its
## contributors may be used to endorse or promote products derived from
## this software without specific prior written permission.
##
## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
## 'AS IS' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING,
## BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
## FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
## HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
## SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
## PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
## OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
## WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
## OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
## EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#' @title
#' Determine if the Start or End of a String Matches a Pattern
#'
#' @description
#' These functions check if a string starts or ends with a match
#' to a given pattern. Also, it is possible to check if there is a match
#' at a specific position.
#'
#' @details
#' Vectorized over \code{str}, \code{pattern},
#' and \code{from} or \code{to} (with recycling
#' of the elements in the shorter vector if necessary).
#'
#' If \code{pattern} is empty, then the result is \code{NA}
#' and a warning is generated.
#'
#' Argument \code{from} controls the start position in \code{str}
#' where there is a match to a \code{pattern}.
#' \code{to} gives the end position.
#'
#' Indexes given by \code{from} or \code{to} are of course 1-based,
#' i.e., an index 1 denotes the first character
#' in a string. This gives a typical R look-and-feel.
#'
#' For negative indexes in \code{from} or \code{to}, counting starts
#' at the end of the string. For instance, index -1 denotes the last code point
#' in the string.
#'
#' If you wish to test for a pattern match at an arbitrary
#' position in \code{str}, use \code{\link{stri_detect}}.
#'
#' \code{stri_startswith} and \code{stri_endswith} are convenience functions.
#' They call either \code{stri_*_fixed}, \code{stri_*_coll},
#' or \code{stri_*_charclass}, depending on the argument used.
#' Relying on these underlying functions directly will make your code run
#' slightly faster.
#'
#' Note that testing for a pattern match at the start or end of a string
#' has not been implemented separately for regex patterns.
#' For that you may use the '\code{^}' and '\code{$}' meta-characters,
#' see \link{stringi-search-regex}.
#'
#' @param str character vector
#' @param pattern,fixed,coll,charclass character vector defining search patterns;
#' for more details refer to \link{stringi-search}
#' @param from integer vector
#' @param to integer vector
#' @param negate single logical value; whether a no-match to a pattern
#' is rather of interest
#' @param opts_collator,opts_fixed a named list used to tune up
#' the search engine's settings; see \code{\link{stri_opts_collator}}
#' and \code{\link{stri_opts_fixed}}, respectively; \code{NULL}
#' for the defaults
#' @param ... supplementary arguments passed to the underlying functions,
#' including additional settings for \code{opts_collator}, \code{opts_fixed},
#' and so on.
#'
#' @return Each function returns a logical vector.
#'
#'
#' @examples
#' stri_startswith_charclass(' trim me! ', '\\p{WSpace}')
#' stri_startswith_fixed(c('a1', 'a2', 'b3', 'a4', 'c5'), 'a')
#' stri_detect_regex(c('a1', 'a2', 'b3', 'a4', 'c5'), '^a')
#' stri_startswith_fixed('ababa', 'ba')
#' stri_startswith_fixed('ababa', 'ba', from=2)
#' stri_startswith_coll(c('a1', 'A2', 'b3', 'A4', 'C5'), 'a', strength=1)
#' pat <- stri_paste('\u0635\u0644\u0649 \u0627\u0644\u0644\u0647 ',
#' '\u0639\u0644\u064a\u0647 \u0648\u0633\u0644\u0645XYZ')
#' stri_endswith_coll('\ufdfa\ufdfa\ufdfaXYZ', pat, strength=1)
#'
#' @family search_detect
#' @export
#' @rdname stri_startsendswith
stri_startswith <- function(str, ..., fixed, coll, charclass)
{
    # Exactly one of the pattern arguments must be supplied;
    # it determines which search engine is dispatched to.
    has_fixed <- !missing(fixed)
    has_coll <- !missing(coll)
    has_charclass <- !missing(charclass)

    if (has_fixed + has_coll + has_charclass != 1)
        stop("you have to specify either `fixed`, `coll`, or `charclass`")

    if (has_fixed)
        return(stri_startswith_fixed(str, fixed, ...))
    if (has_coll)
        return(stri_startswith_coll(str, coll, ...))

    # the only remaining possibility: `charclass` was given
    stri_startswith_charclass(str, charclass, ...)
}
#' @export
#' @rdname stri_startsendswith
stri_endswith <- function(str, ..., fixed, coll, charclass)
{
    # Exactly one of the pattern arguments must be supplied;
    # it determines which search engine is dispatched to.
    has_fixed <- !missing(fixed)
    has_coll <- !missing(coll)
    has_charclass <- !missing(charclass)

    if (has_fixed + has_coll + has_charclass != 1)
        stop("you have to specify either `fixed`, `coll`, or `charclass`")

    if (has_fixed)
        return(stri_endswith_fixed(str, fixed, ...))
    if (has_coll)
        return(stri_endswith_coll(str, coll, ...))

    # the only remaining possibility: `charclass` was given
    stri_endswith_charclass(str, charclass, ...)
}
#' @export
#' @rdname stri_startsendswith
stri_startswith_fixed <- function(str, pattern, from = 1L,
    negate = FALSE, ..., opts_fixed = NULL)
{
    # Settings given directly via `...` are appended to `opts_fixed`
    # and the whole lot is validated by stri_opts_fixed().
    if (!missing(...)) {
        engine_args <- as.list(c(opts_fixed, ...))
        opts_fixed <- do.call(stri_opts_fixed, engine_args)
    }

    .Call(C_stri_startswith_fixed, str, pattern, from, negate, opts_fixed)
}
#' @export
#' @rdname stri_startsendswith
stri_endswith_fixed <- function(str, pattern, to = -1L,
    negate = FALSE, ..., opts_fixed = NULL)
{
    # Settings given directly via `...` are appended to `opts_fixed`
    # and the whole lot is validated by stri_opts_fixed().
    if (!missing(...)) {
        engine_args <- as.list(c(opts_fixed, ...))
        opts_fixed <- do.call(stri_opts_fixed, engine_args)
    }

    .Call(C_stri_endswith_fixed, str, pattern, to, negate, opts_fixed)
}
#' @export
#' @rdname stri_startsendswith
stri_startswith_charclass <- function(str, pattern, from = 1L, negate = FALSE)
{
# Thin wrapper around the native routine; unlike the `fixed` and `coll`
# variants, no `...` or options list is accepted here.
.Call(C_stri_startswith_charclass, str, pattern, from, negate)
}
#' @export
#' @rdname stri_startsendswith
stri_endswith_charclass <- function(str, pattern, to = -1L, negate = FALSE)
{
# Thin wrapper around the native routine; unlike the `fixed` and `coll`
# variants, no `...` or options list is accepted here.
.Call(C_stri_endswith_charclass, str, pattern, to, negate)
}
#' @export
#' @rdname stri_startsendswith
stri_startswith_coll <- function(str, pattern, from = 1L,
    negate = FALSE, ..., opts_collator = NULL)
{
    # Settings given directly via `...` are appended to `opts_collator`
    # and the whole lot is validated by stri_opts_collator().
    if (!missing(...)) {
        collator_args <- as.list(c(opts_collator, ...))
        opts_collator <- do.call(stri_opts_collator, collator_args)
    }

    .Call(C_stri_startswith_coll, str, pattern, from, negate, opts_collator)
}
#' @export
#' @rdname stri_startsendswith
stri_endswith_coll <- function(str, pattern, to = -1L,
    negate = FALSE, ..., opts_collator = NULL)
{
    # Settings given directly via `...` are appended to `opts_collator`
    # and the whole lot is validated by stri_opts_collator().
    if (!missing(...)) {
        collator_args <- as.list(c(opts_collator, ...))
        opts_collator <- do.call(stri_opts_collator, collator_args)
    }

    .Call(C_stri_endswith_coll, str, pattern, to, negate, opts_collator)
}
stringi/R/internal_test.R 0000644 0001762 0000144 00000005162 14750110641 015133 0 ustar ligges users # kate: default-dictionary en_US
## This file is part of the 'stringi' package for R.
## Copyright (c) 2013-2025, Marek Gagolewski
## All rights reserved.
##
## Redistribution and use in source and binary forms, with or without
## modification, are permitted provided that the following conditions are met:
##
## 1. Redistributions of source code must retain the above copyright notice,
## this list of conditions and the following disclaimer.
##
## 2. Redistributions in binary form must reproduce the above copyright notice,
## this list of conditions and the following disclaimer in the documentation
## and/or other materials provided with the distribution.
##
## 3. Neither the name of the copyright holder nor the names of its
## contributors may be used to endorse or promote products derived from
## this software without specific prior written permission.
##
## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
## 'AS IS' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING,
## BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
## FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
## HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
## SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
## PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
## OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
## WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
## OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
## EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# Check R encoding marking [internal, DEBUG only]
#
# This is an internal function (not exported, no manual page) for testing
# how R marks ASCII/LATIN1/UTF8/BYTES encodings (see also \code{?Encoding}).
#
# Results are printed on STDERR.
#
# @param str character vector
# @return whatever the native routine returns, invisibly
#     (not meant to be relied upon)
.stri_test_Rmark <- function(str)
{
invisible(.Call(C_stri_test_Rmark, str))
}
# For testing StriContainerUTF16's performance [internal, DEBUG only]
#
# Note that the underlying native routine is registered under a different
# name, C_stri_test_UnicodeContainer16.
#
# @param str character vector
# @return whatever the native routine returns (not meant to be relied upon)
.stri_test_StriContainerUTF16 <- function(str)
{
.Call(C_stri_test_UnicodeContainer16, str)
}
# For testing StriContainerUTF8's performance [internal, DEBUG only]
#
# Note that the underlying native routine is registered under a different
# name, C_stri_test_UnicodeContainer8.
#
# @param str character vector
# @return whatever the native routine returns (not meant to be relied upon)
.stri_test_StriContainerUTF8 <- function(str)
{
.Call(C_stri_test_UnicodeContainer8, str)
}
# For testing .Call performance [internal, DEBUG only]
#
# @param x some object
# @return the result of the native routine; presumably \code{x} itself,
#     as the routine's name suggests (TODO: confirm in the C sources)
.stri_test_returnasis <- function(x)
{
.Call(C_stri_test_returnasis, x)
}
stringi/R/pad.R 0000644 0001762 0000144 00000011340 14750110641 013017 0 ustar ligges users # kate: default-dictionary en_US
## This file is part of the 'stringi' package for R.
## Copyright (c) 2013-2025, Marek Gagolewski
## All rights reserved.
##
## Redistribution and use in source and binary forms, with or without
## modification, are permitted provided that the following conditions are met:
##
## 1. Redistributions of source code must retain the above copyright notice,
## this list of conditions and the following disclaimer.
##
## 2. Redistributions in binary form must reproduce the above copyright notice,
## this list of conditions and the following disclaimer in the documentation
## and/or other materials provided with the distribution.
##
## 3. Neither the name of the copyright holder nor the names of its
## contributors may be used to endorse or promote products derived from
## this software without specific prior written permission.
##
## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
## 'AS IS' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING,
## BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
## FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
## HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
## SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
## PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
## OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
## WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
## OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
## EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#' @title
#' Pad (Center/Left/Right Align) a String
#'
#' @description
#' Add multiple \code{pad} characters at the given \code{side}(s) of each string
#' so that each output string is of total width of at least \code{width}.
#' These functions may be used to center or left/right-align each string.
#'
#' @details
#' Vectorized over \code{str}, \code{width}, and \code{pad}.
#' Each string in \code{pad} should consist of code points of total width
#' equal to 1 or, if \code{use_length} is \code{TRUE}, exactly one code point.
#'
#' \code{stri_pad} is a convenience function, which dispatches
#' to \code{stri_pad_*}.
#'
#' Note that Unicode code points may have various widths when
#' printed on the console and that, by default, the function takes that
#' into account. By changing the state of the \code{use_length}
#' argument, this function starts acting like each code point
#' was of width 1. This feature should rather be used with
#' text in Latin script.
#'
#' See \code{\link{stri_trim_left}} (among others) for reverse operation.
#' Also check out \code{\link{stri_wrap}} for line wrapping.
#'
#' @param str character vector
#' @param width integer vector giving minimal output string lengths
#' @param side [\code{stri_pad} only] single character string;
#' sides on which padding character is added
#' (\code{left} (default), \code{right}, or \code{both})
#' @param pad character vector giving padding code points
#' @param use_length single logical value; should the number of code
#' points be used instead of the total code point width
#' (see \code{\link{stri_width}})?
#'
#' @return These functions return a character vector.
#'
#' @examples
#' stri_pad_left('stringi', 10, pad='#')
#' stri_pad_both('stringi', 8:12, pad='*')
#' # center on screen:
#' cat(stri_pad_both(c('the', 'string', 'processing', 'package'),
#' getOption('width')*0.9), sep='\n')
#' cat(stri_pad_both(c('\ud6c8\ubbfc\uc815\uc74c', # takes width into account
#' stri_trans_nfkd('\ud6c8\ubbfc\uc815\uc74c'), 'abcd'),
#' width=10), sep='\n')
#'
#' @family length
#' @rdname stri_pad
#' @export
stri_pad_both <- function(str, width = floor(0.9 * getOption("width")), pad = " ",
use_length = FALSE)
{
# All three stri_pad_* variants share one native routine; the third argument
# is the side code: 2L is passed here, stri_pad_left passes 0L, and
# stri_pad_right passes 1L.
.Call(C_stri_pad, str, width, 2L, pad, use_length)
}
#' @rdname stri_pad
#' @export
stri_pad_left <- function(str, width = floor(0.9 * getOption("width")), pad = " ",
use_length = FALSE)
{
# All three stri_pad_* variants share one native routine; the third argument
# is the side code: 0L is passed here, stri_pad_right passes 1L, and
# stri_pad_both passes 2L.
.Call(C_stri_pad, str, width, 0L, pad, use_length)
}
#' @rdname stri_pad
#' @export
stri_pad_right <- function(str, width = floor(0.9 * getOption("width")), pad = " ",
use_length = FALSE)
{
# All three stri_pad_* variants share one native routine; the third argument
# is the side code: 1L is passed here, stri_pad_left passes 0L, and
# stri_pad_both passes 2L.
.Call(C_stri_pad, str, width, 1L, pad, use_length)
}
#' @rdname stri_pad
#' @export
stri_pad <- function(str, width = floor(0.9 * getOption("width")),
    side = c("left", "right", "both"), pad = " ", use_length = FALSE)
{
    # `left` is the default for compatibility with stringr;
    # match.arg() also rejects any value outside the three allowed ones
    # (note that this call is slow)
    side <- match.arg(side)

    if (side == "both")
        stri_pad_both(str, width, pad, use_length)
    else if (side == "left")
        stri_pad_left(str, width, pad, use_length)
    else  # side == "right"
        stri_pad_right(str, width, pad, use_length)
}
stringi/R/opts.R 0000644 0001762 0000144 00000037311 14750110641 013246 0 ustar ligges users # kate: default-dictionary en_US
## This file is part of the 'stringi' package for R.
## Copyright (c) 2013-2025, Marek Gagolewski
## All rights reserved.
##
## Redistribution and use in source and binary forms, with or without
## modification, are permitted provided that the following conditions are met:
##
## 1. Redistributions of source code must retain the above copyright notice,
## this list of conditions and the following disclaimer.
##
## 2. Redistributions in binary form must reproduce the above copyright notice,
## this list of conditions and the following disclaimer in the documentation
## and/or other materials provided with the distribution.
##
## 3. Neither the name of the copyright holder nor the names of its
## contributors may be used to endorse or promote products derived from
## this software without specific prior written permission.
##
## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
## 'AS IS' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING,
## BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
## FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
## HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
## SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
## PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
## OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
## WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
## OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
## EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#' @title
#' Generate a List with Collator Settings
#'
#' @description
#' A convenience function to tune the \pkg{ICU} Collator's behavior,
#' e.g., in \code{\link{stri_compare}}, \code{\link{stri_order}},
#' \code{\link{stri_unique}}, \code{\link{stri_duplicated}},
#' as well as \code{\link{stri_detect_coll}}
#' and other \link{stringi-search-coll} functions.
#'
#'
#' @details
#' \pkg{ICU}'s \emph{collator} performs a locale-aware, natural-language
#' alike string comparison.
#' This is a more reliable way of establishing relationships between
#' strings than the one provided by base \R, and definitely
#' one that is more complex and appropriate than ordinary bytewise
#' comparison.
#'
#'
#' @param locale single string, \code{NULL} or
#' \code{''} for default locale
#' @param strength single integer in \{1,2,3,4\}, which defines collation strength;
#' \code{1} for the most permissive collation rules, \code{4} for the strictest
#' ones
#' @param alternate_shifted single logical value; \code{FALSE}
#' treats all the code points with non-ignorable primary weights in the same way,
#' \code{TRUE} causes code points with primary weights that are equal or below
#' the variable top value to be ignored on primary level and moved to the quaternary level
#' @param french single logical value; used in Canadian French;
#' \code{TRUE} results in secondary weights being considered backwards
#' @param uppercase_first single logical value; \code{NA}
#' orders upper and lower case letters in accordance to their tertiary weights,
#' \code{TRUE} forces upper case letters to sort before lower case letters,
#' \code{FALSE} does the opposite
#' @param case_level single logical value;
#' controls whether an extra case level (positioned before the third level) is generated or not
#' @param normalization
#' single logical value; if \code{TRUE},
#' then incremental check is performed to see whether the input data is in
#' the FCD form. If the data is not in the FCD form, incremental NFD
#' normalization is performed
#' @param normalisation alias of \code{normalization}
#' @param numeric single logical value;
#' when turned on, this attribute generates a collation key for
#' the numeric value of substrings of digits;
#' this is a way to get '100' to sort AFTER '2';
#' note that negative or non-integer numbers will not be ordered properly
#'
#' @return
#' Returns a named list object; missing settings are left with default values.
#'
#' @export
#' @family locale_sensitive
#' @family search_coll
#'
#'
#' @rdname stri_opts_collator
#'
#' @references
#' \emph{Collation} -- ICU User Guide,
#' \url{https://unicode-org.github.io/icu/userguide/collation/}
#'
#' \emph{ICU Collation Service Architecture} -- ICU User Guide,
#' \url{https://unicode-org.github.io/icu/userguide/collation/architecture.html}
#'
#' \emph{\code{icu::Collator} Class Reference} -- ICU4C API Documentation,
#' \url{https://unicode-org.github.io/icu-docs/apidoc/dev/icu4c/classicu_1_1Collator.html}
#'
#' @examples
#' stri_cmp('number100', 'number2')
#' stri_cmp('number100', 'number2', opts_collator=stri_opts_collator(numeric=TRUE))
#' stri_cmp('number100', 'number2', numeric=TRUE) # equivalent
#' stri_cmp('above mentioned', 'above-mentioned')
#' stri_cmp('above mentioned', 'above-mentioned', alternate_shifted=TRUE)
stri_opts_collator <- function(
    locale = NULL, strength = 3L, alternate_shifted = FALSE,
    french = FALSE, uppercase_first = NA, case_level = FALSE, normalization = FALSE,
    normalisation = normalization, numeric = FALSE
) {
    # Only the settings passed explicitly by the caller are stored;
    # the defaults in the signature serve documentation purposes only.
    settings <- list()

    if (!missing(locale)) {
        settings["locale"] <- locale
    }
    if (!missing(strength)) {
        settings["strength"] <- strength
    }
    if (!missing(alternate_shifted)) {
        settings["alternate_shifted"] <- alternate_shifted
    }
    if (!missing(french)) {
        settings["french"] <- french
    }
    if (!missing(uppercase_first)) {
        settings["uppercase_first"] <- uppercase_first
    }
    if (!missing(case_level)) {
        settings["case_level"] <- case_level
    }
    if (!missing(numeric)) {
        settings["numeric"] <- numeric
    }

    # `normalisation` is an alias; `normalization` wins if both are given
    # and the value is always stored under the name "normalization".
    if (!missing(normalization)) {
        settings["normalization"] <- normalization
    } else if (!missing(normalisation)) {
        settings["normalization"] <- normalisation
    }

    settings
}
#' @rdname stri_opts_collator
#' @export
stri_coll <- stri_opts_collator  # shorthand alias: the very same function object
#' @title
#' Generate a List with Regex Matcher Settings
#'
#' @description
#' A convenience function to tune the \pkg{ICU} regular expressions
#' matcher's behavior, e.g., in \code{\link{stri_count_regex}}
#' and other \link{stringi-search-regex} functions.
#'
#' @details
#' Note that some regex settings may be changed using ICU regex flags
#' inside regexes. For example, \code{'(?i)pattern'} performs
#' a case-insensitive match of a given pattern,
#' see the \pkg{ICU} User Guide entry on Regular Expressions
#' in the References section or \link{stringi-search-regex}.
#'
#' @param case_insensitive logical; enables case insensitive matching [regex flag \code{(?i)}]
#' @param comments logical; allows white space and comments within patterns [regex flag \code{(?x)}]
#' @param dotall logical; if set, `\code{.}` matches line terminators,
#' otherwise matching of `\code{.}` stops at a line end [regex flag \code{(?s)}]
#' @param dot_all alias of \code{dotall}
#' @param literal logical; if set, treat the entire pattern as a literal string:
#' metacharacters or escape sequences in the input sequence will be given no special meaning;
#' note that in most cases you would rather use the \link{stringi-search-fixed}
#' facilities in this case
#' @param multiline logical; controls the behavior of `\code{$}` and `\code{^}`.
#' If set, recognize line terminators within a string, otherwise,
#' match only at start and end of input string [regex flag \code{(?m)}]
#' @param multi_line alias of \code{multiline}
#' @param unix_lines logical; Unix-only line endings;
#' when enabled, only \code{U+000a} is recognized as a
#' line ending by `\code{.}`, `\code{$}`, and `\code{^}`.
#' @param uword logical; Unicode word boundaries;
#' if set, uses the Unicode TR 29 definition of word boundaries;
#' warning: Unicode word boundaries are quite different from traditional
#' regex word boundaries. [regex flag \code{(?w)}]
#' See \url{https://unicode.org/reports/tr29/#Word_Boundaries}
#' @param error_on_unknown_escapes logical;
#' whether to generate an error on unrecognized backslash escapes;
#' if set, fail with an error on patterns that contain backslash-escaped ASCII
#' letters without a known special meaning;
#' otherwise, these escaped letters represent themselves
#' @param time_limit integer; processing time limit, in ~milliseconds (but not precisely so,
#' depends on the CPU speed), for match operations;
#' setting a limit is desirable if poorly written regexes are expected on input;
#' 0 for no limit
#' @param stack_limit integer; maximal size, in bytes, of the heap storage available
#' for the match backtracking stack; setting a limit is desirable if poorly
#' written regexes are expected on input; 0 for no limit
#'
#' @return
#' Returns a named list object; missing settings are left with default values.
#'
#' @export
#' @family search_regex
#'
#' @references
#' \emph{\code{enum URegexpFlag}: Constants for Regular Expression Match Modes}
#' -- ICU4C API Documentation,
#' \url{https://unicode-org.github.io/icu-docs/apidoc/dev/icu4c/uregex_8h.html}
#'
#' \emph{Regular Expressions} -- ICU User Guide,
#' \url{https://unicode-org.github.io/icu/userguide/strings/regexp.html}
#'
#' @examples
#' stri_detect_regex('ala', 'ALA') # case-sensitive by default
#' stri_detect_regex('ala', 'ALA', opts_regex=stri_opts_regex(case_insensitive=TRUE))
#' stri_detect_regex('ala', 'ALA', case_insensitive=TRUE) # equivalent
#' stri_detect_regex('ala', '(?i)ALA') # equivalent
stri_opts_regex <- function(
    case_insensitive, comments,
    dotall, dot_all = dotall,
    literal,
    multiline, multi_line = multiline,
    unix_lines, uword, error_on_unknown_escapes,
    time_limit = 0L, stack_limit = 0L
) {
    # Only the settings passed explicitly by the caller are stored.
    # The order of the checks below determines the order of the elements
    # in the resulting list.
    settings <- list()

    if (!missing(case_insensitive)) {
        settings["case_insensitive"] <- case_insensitive
    }
    if (!missing(comments)) {
        settings["comments"] <- comments
    }
    if (!missing(literal)) {
        settings["literal"] <- literal
    }
    if (!missing(unix_lines)) {
        settings["unix_lines"] <- unix_lines
    }
    if (!missing(uword)) {
        settings["uword"] <- uword
    }
    if (!missing(error_on_unknown_escapes)) {
        settings["error_on_unknown_escapes"] <- error_on_unknown_escapes
    }
    if (!missing(stack_limit)) {
        settings["stack_limit"] <- stack_limit
    }
    if (!missing(time_limit)) {
        settings["time_limit"] <- time_limit
    }

    # `dot_all` is an alias of `dotall`; the primary spelling takes precedence.
    if (!missing(dotall)) {
        settings["dotall"] <- dotall
    } else if (!missing(dot_all)) {
        settings["dotall"] <- dot_all
    }

    # `multi_line` is an alias of `multiline`; the primary spelling
    # takes precedence.
    if (!missing(multiline)) {
        settings["multiline"] <- multiline
    } else if (!missing(multi_line)) {
        settings["multiline"] <- multi_line
    }

    settings
}
#' @title
#' Generate a List with BreakIterator Settings
#'
#' @description
#' A convenience function to tune the \pkg{ICU} \code{BreakIterator}'s behavior
#' in some text boundary analysis functions, see
#' \link{stringi-search-boundaries}.
#'
#' @details
#' The \code{skip_*} family of settings may be used to prevent performing
#' any special actions on particular types of text boundaries, e.g.,
#' in case of the \code{\link{stri_locate_all_boundaries}} and
#' \code{\link{stri_split_boundaries}} functions.
#'
#' Note that custom break iterator rules (advanced users only)
#' should be specified as a single string.
#' For a detailed description of the syntax of RBBI rules, please refer
#' to the ICU User Guide on Boundary Analysis.
#'
#' @param type single string; either the break iterator type, one of \code{character},
#' \code{line_break}, \code{sentence}, \code{word},
#' or a custom set of ICU break iteration rules;
#' see \link{stringi-search-boundaries}
#' @param locale single string, \code{NULL} or \code{''} for default locale
#' @param skip_word_none logical; perform no action for 'words' that
#' do not fit into any other categories
#' @param skip_word_number logical; perform no action for words that
#' appear to be numbers
#' @param skip_word_letter logical; perform no action for words that
#' contain letters, excluding hiragana, katakana, or ideographic characters
#' @param skip_word_kana logical; perform no action for words
#' containing kana characters
#' @param skip_word_ideo logical; perform no action for words
#' containing ideographic characters
#' @param skip_line_soft logical; perform no action for soft line breaks,
#' i.e., positions where a line break is acceptable but not required
#' @param skip_line_hard logical; perform no action for hard,
#' or mandatory line breaks
#' @param skip_sentence_term logical; perform no action for sentences
#' ending with a sentence terminator ('\code{.}', '\code{,}', '\code{?}',
#' '\code{!}'), possibly followed by a hard separator
#' (\code{CR}, \code{LF}, \code{PS}, etc.)
#' @param skip_sentence_sep logical; perform no action for sentences
#' that do not contain an ending sentence terminator, but are ended
#' by a hard separator or end of input
#'
#' @return
#' Returns a named list object.
#' Omitted \code{skip_*} values act as they have been set to \code{FALSE}.
#'
#' @export
#' @family text_boundaries
#'
#' @references
#' \emph{\code{ubrk.h} File Reference} -- ICU4C API Documentation,
#' \url{https://unicode-org.github.io/icu-docs/apidoc/dev/icu4c/ubrk_8h.html}
#'
#' \emph{Boundary Analysis} -- ICU User Guide,
#' \url{https://unicode-org.github.io/icu/userguide/boundaryanalysis/}
stri_opts_brkiter <- function(
    type, locale, skip_word_none, skip_word_number,
    skip_word_letter, skip_word_kana, skip_word_ideo, skip_line_soft,
    skip_line_hard, skip_sentence_term, skip_sentence_sep
) {
    # Only the settings passed explicitly by the caller are stored,
    # in the same order as in the signature.
    settings <- list()

    if (!missing(type)) {
        settings["type"] <- type
    }
    if (!missing(locale)) {
        settings["locale"] <- locale
    }

    # word boundaries
    if (!missing(skip_word_none)) {
        settings["skip_word_none"] <- skip_word_none
    }
    if (!missing(skip_word_number)) {
        settings["skip_word_number"] <- skip_word_number
    }
    if (!missing(skip_word_letter)) {
        settings["skip_word_letter"] <- skip_word_letter
    }
    if (!missing(skip_word_kana)) {
        settings["skip_word_kana"] <- skip_word_kana
    }
    if (!missing(skip_word_ideo)) {
        settings["skip_word_ideo"] <- skip_word_ideo
    }

    # line breaks
    if (!missing(skip_line_soft)) {
        settings["skip_line_soft"] <- skip_line_soft
    }
    if (!missing(skip_line_hard)) {
        settings["skip_line_hard"] <- skip_line_hard
    }

    # sentence boundaries
    if (!missing(skip_sentence_term)) {
        settings["skip_sentence_term"] <- skip_sentence_term
    }
    if (!missing(skip_sentence_sep)) {
        settings["skip_sentence_sep"] <- skip_sentence_sep
    }

    settings
}
#' @title
#' Generate a List with Fixed Pattern Search Engine's Settings
#'
#' @description
#' A convenience function used to tune up the behavior of \code{stri_*_fixed}
#' functions, see \link{stringi-search-fixed}.
#'
#' @details
#' Case-insensitive matching uses a simple, single-code point case mapping
#' (via ICU's \code{u_toupper()} function).
#' Full case mappings should be used whenever possible because they produce
#' better results by working on whole strings. They also take into account
#' the string context and the language, see \link{stringi-search-coll}.
#'
#' Searching for overlapping pattern matches is available in
#' \code{\link{stri_extract_all_fixed}}, \code{\link{stri_locate_all_fixed}},
#' and \code{\link{stri_count_fixed}} functions.
#'
#' @param case_insensitive logical; enable simple case insensitive matching
#' @param overlap logical; enable overlapping matches' detection
#'
#' @return
#' Returns a named list object.
#'
#' @export
#' @family search_fixed
#'
#' @references
#' \emph{C/POSIX Migration} -- ICU User Guide,
#' \url{https://unicode-org.github.io/icu/userguide/icu/posix.html}
#'
#' @examples
#' stri_detect_fixed('ala', 'ALA') # case-sensitive by default
#' stri_detect_fixed('ala', 'ALA', opts_fixed=stri_opts_fixed(case_insensitive=TRUE))
#' stri_detect_fixed('ala', 'ALA', case_insensitive=TRUE) # equivalent
stri_opts_fixed <- function(case_insensitive = FALSE, overlap = FALSE)
{
    # Only the settings passed explicitly by the caller are stored;
    # the defaults in the signature serve documentation purposes only.
    settings <- list()

    if (!missing(case_insensitive)) {
        settings["case_insensitive"] <- case_insensitive
    }
    if (!missing(overlap)) {
        settings["overlap"] <- overlap
    }

    settings
}
stringi/R/time_zone.R 0000644 0001762 0000144 00000020057 14750110641 014251 0 ustar ligges users # kate: default-dictionary en_US
## This file is part of the 'stringi' package for R.
## Copyright (c) 2013-2025, Marek Gagolewski
## All rights reserved.
##
## Redistribution and use in source and binary forms, with or without
## modification, are permitted provided that the following conditions are met:
##
## 1. Redistributions of source code must retain the above copyright notice,
## this list of conditions and the following disclaimer.
##
## 2. Redistributions in binary form must reproduce the above copyright notice,
## this list of conditions and the following disclaimer in the documentation
## and/or other materials provided with the distribution.
##
## 3. Neither the name of the copyright holder nor the names of its
## contributors may be used to endorse or promote products derived from
## this software without specific prior written permission.
##
## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
## 'AS IS' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING,
## BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
## FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
## HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
## SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
## PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
## OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
## WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
## OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
## EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#' @title
#' List Available Time Zone Identifiers
#'
#' @description
#' Returns a list of available time zone identifiers.
#'
#' @details
#' If \code{offset} and \code{region} are \code{NA} (the default), then
#' all time zones are returned. Otherwise,
#' only time zone identifiers with a given raw offset from GMT
#' and/or time zones corresponding to a given region are provided.
#' Note that the effect of daylight savings time is ignored.
#'
#' A time zone represents an offset applied to the Greenwich Mean Time (GMT)
#' to obtain local time (Universal Coordinated Time, or UTC, is similar,
#' but not precisely identical, to GMT; in \pkg{ICU} the two terms
#' are used interchangeably since \pkg{ICU} does not concern itself with
#' either leap seconds or historical behavior).
#' The offset might vary throughout the year, if daylight savings time (DST)
#' is used, or might be the same all year long.
#' Typically, regions closer to the equator do not use DST.
#' If DST is in use, then specific rules define the point where
#' the offset changes and the amount by which it changes.
#'
#' If DST is observed, then three additional bits of information are needed:
#' \enumerate{
#' \item The precise date and time during the year when DST begins.
#' In the first half of the year it is in the northern hemisphere,
#' and in the second half of the year it is in the southern hemisphere.
#' \item The precise date and time during the year when DST ends.
#' In the first half of the year it is in the southern hemisphere,
#' and in the second half of the year it is in the northern hemisphere.
#' \item The amount by which the GMT offset changes when DST is in effect.
#' This is almost always one hour.
#' }
#'
#'
#' @param offset single numeric value;
#' a given raw offset from GMT, in hours;
#' \code{NA} for all offsets
#' @param region single string;
#' an ISO 3166 two-letter country code or UN M.49 three-digit area code;
#' \code{NA} for all regions
#'
#' @return Returns a character vector.
#'
#' @references
#' \emph{TimeZone} class -- ICU API Documentation,
#' \url{https://unicode-org.github.io/icu-docs/apidoc/dev/icu4c/classicu_1_1TimeZone.html}
#'
#' \emph{ICU TimeZone classes} -- ICU User Guide,
#' \url{https://unicode-org.github.io/icu/userguide/datetime/timezone/}
#'
#' \emph{Date/Time Services} -- ICU User Guide,
#' \url{https://unicode-org.github.io/icu/userguide/datetime/}
#'
#' @examples
#' stri_timezone_list()
#' stri_timezone_list(offset=1)
#' stri_timezone_list(offset=5.5)
#' stri_timezone_list(offset=5.75)
#' stri_timezone_list(region='PL')
#' stri_timezone_list(region='US', offset=-10)
#'
#' # Fetch information on all time zones
#' do.call(rbind.data.frame,
#' lapply(stri_timezone_list(), function(tz) stri_timezone_info(tz)))
#'
#' @family datetime
#' @family timezone
#' @export
stri_timezone_list <- function(region=NA_character_, offset=NA_integer_)
{
    # fetch the matching identifiers from the native routine ...
    tz_ids <- .Call(C_stri_timezone_list, region, offset)

    # ... and return them sorted with a fixed en_US collator
    # (numeric=TRUE, strength=1)
    stri_sort(tz_ids, locale="en_US", numeric=TRUE, strength=1)
}
#' @title
#' Set or Get Default Time Zone in \pkg{stringi}
#'
#' @description
#' \code{stri_timezone_set} changes the current default time zone for all functions
#' in the \pkg{stringi} package, i.e., establishes the meaning of the
#' ``\code{NULL} time zone'' argument to date/time processing functions.
#'
#' \code{stri_timezone_get} gets the current default time zone.
#'
#' For more information on time zone representation in \pkg{ICU}
#' and \pkg{stringi}, refer to \code{\link{stri_timezone_list}}.
#'
#' @details
#' Unless the default time zone has already been set using
#' \code{stri_timezone_set}, the default time zone is determined
#' by querying the OS with methods in \pkg{ICU}'s internal platform utilities.
#'
#' @param tz single string; time zone identifier
#'
#' @return
#' \code{stri_timezone_set} returns a string with
#' previously used timezone, invisibly.
#'
#' \code{stri_timezone_get} returns a single string
#' with the current default time zone.
#'
#' @references
#' \emph{TimeZone} class -- ICU API Documentation,
#' \url{https://unicode-org.github.io/icu-docs/apidoc/dev/icu4c/classicu_1_1TimeZone.html}
#'
#' @examples
#' \dontrun{
#' oldtz <- stri_timezone_set('Europe/Warsaw')
#' # ... many time zone-dependent operations
#' stri_timezone_set(oldtz) # restore previous default time zone
#' }
#'
#' @export
#' @family datetime
#' @family timezone
#' @rdname stri_timezone_set
#' @export
stri_timezone_get <- function()
{
    # with no arguments, stri_timezone_info() describes the default time zone;
    # only its identifier is of interest here
    default_tz <- stri_timezone_info()
    default_tz$ID
}
#' @rdname stri_timezone_set
#' @export
stri_timezone_set <- function(tz)
{
    # remember the zone in effect before the switch so that
    # the caller is able to restore it later on
    old_tz <- stri_timezone_get()

    .Call(C_stri_timezone_set, tz)

    invisible(old_tz)
}
#' @title
#' Query a Given Time Zone
#'
#' @description
#' Provides some basic information on a given time zone identifier.
#'
#' @details
#' Used to fetch basic information
#' on any supported time zone.
#'
#' For more information on time zone representation in \pkg{ICU},
#' see \code{\link{stri_timezone_list}}.
#'
#' @param tz \code{NULL} or \code{''} for default time zone,
#' or a single string with time zone ID otherwise
#' @param display_type single string;
#' one of \code{'short'}, \code{'long'}, \code{'generic_short'},
#' \code{'generic_long'}, \code{'gmt_short'}, \code{'gmt_long'},
#' \code{'common'}, \code{'generic_location'}
#' @param locale \code{NULL} or \code{''} for default locale,
#' or a single string with locale identifier
#'
#' @return
#' Returns a list with the following named components:
#' \enumerate{
#' \item \code{ID} (time zone identifier),
#' \item \code{Name} (localized human-readable time zone name),
#' \item \code{Name.Daylight} (localized human-readable time zone
#' name when DST is used, if available),
#' \item \code{Name.Windows} (Windows time zone ID, if available),
#' \item \code{RawOffset} (raw GMT offset, in hours, before taking
#' daylight savings into account), and
#' \item \code{UsesDaylightTime} (states whether a time zone uses
#' daylight savings time in the current Gregorian calendar year).
#' }
#'
#' @examples
#' stri_timezone_info()
#' stri_timezone_info(locale='sk_SK')
#' sapply(c('short', 'long', 'generic_short', 'generic_long',
#' 'gmt_short', 'gmt_long', 'common', 'generic_location'),
#' function(e) stri_timezone_info('Europe/London', display_type=e))
#'
#' @family datetime
#' @family timezone
#' @export
stri_timezone_info <- function(tz=NULL, locale=NULL, display_type="long")
{
    # TODO: also report when DST starts
    tz_details <- .Call(C_stri_timezone_info, tz, locale, display_type)
    tz_details
}
stringi/R/internal_prepare_arg.R 0000644 0001762 0000144 00000027231 14750110641 016444 0 ustar ligges users # kate: default-dictionary en_US
## This file is part of the 'stringi' package for R.
## Copyright (c) 2013-2025, Marek Gagolewski
## All rights reserved.
##
## Redistribution and use in source and binary forms, with or without
## modification, are permitted provided that the following conditions are met:
##
## 1. Redistributions of source code must retain the above copyright notice,
## this list of conditions and the following disclaimer.
##
## 2. Redistributions in binary form must reproduce the above copyright notice,
## this list of conditions and the following disclaimer in the documentation
## and/or other materials provided with the distribution.
##
## 3. Neither the name of the copyright holder nor the names of its
## contributors may be used to endorse or promote products derived from
## this software without specific prior written permission.
##
## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
## 'AS IS' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING,
## BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
## FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
## HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
## SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
## PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
## OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
## WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
## OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
## EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#' @title
#' Passing Arguments to Functions in \pkg{stringi}
#'
#' @description
#' Below we explain how \pkg{stringi} deals with its functions' arguments.
#'
#' If some function violates one of the following rules
#' (for a very important reason),
#' this is clearly indicated in its documentation (with discussion).
#'
#' @section Coercion of Arguments:
#'
#' When a character vector argument is expected, factors and other vectors
#' coercible to character vectors are silently converted with
#' \code{\link{as.character}}, otherwise an error is generated.
#' Coercion from a list which does not consist of length-1 atomic vectors
#' issues a warning.
#'
#' When a logical, numeric, or integer vector argument is expected,
#' factors are converted with \code{as.*(\link{as.character}(...))},
#' and other coercible vectors are converted with \code{as.*},
#' otherwise an error is generated.
#'
#'
#' @section Vectorization:
#'
#' Almost all functions are vectorized with respect to all their arguments
#' and the recycling rule is applied whenever necessary.
#' Due to this property you may,
#' for instance, search for one pattern in each given string,
#' search for each pattern in one given string,
#' and search for the i-th pattern within the i-th string.
#'
#' We of course took great care of performance issues:
#' e.g., in regular expression searching, regex matchers are reused
#' from iteration to iteration, as long as it is possible.
#'
#' Functions with some non-vectorized arguments are rare:
#' e.g., regular expression matcher's settings are established
#' once per each call.
#'
#' Some functions
#' assume that a vector with one element is given
#' as an argument (like \code{collapse} in \code{\link{stri_join}}).
#' In such cases, if an empty vector is given, you will get an error,
#' and for vectors with more than one element, a warning will be
#' generated (only the first element will be used).
#'
#' You may find details on vectorization behavior in the man pages
#' on each particular function of your interest.
#'
#' @section Handling Missing Values (\code{NA}s):
#'
#' \pkg{stringi} handles missing values consistently.
#' For any vectorized operation, if at least one vector element is missing,
#' then the corresponding resulting value is also set to \code{NA}.
#'
#'
#' @section Preserving Object Attributes:
#'
#' Generally, all our functions drop input objects' attributes
#' (e.g., \code{\link{names}}, \code{\link{dim}}, etc.).
#' This is due to deep vectorization as well as for efficiency reasons.
#' If the preservation of attributes is needed,
#' important attributes can be manually copied. Alternatively, the notation
#' \code{x[] <- stri_...(x, ...)} can sometimes be used too.
#'
#' @rdname about_arguments
#' @name about_arguments
#' @aliases arguments stringi-arguments stringi-arguments
#' @family stringi_general_topics
#' @family prepare_arg
invisible(NULL)
# @title
# Prepare a String Vector Argument [internal]
#
# @description
# This is an internal function. However, the interested user may play with it
# in order to get more insight on how \pkg{stringi} deals with its
# functions' arguments. See `Value' section for details.
#
# @param x argument to be checked
#
# @return
# If \code{x} is a factor or an object equipped with a \code{class}
# attribute or a list, then \code{\link{as.character}} is called.
# If \code{x} is a string, it is returned with no change.
# If an atomic vector or a matrix is given, it is coerced to a character vector.
# If it is a \code{name} object, a character vector of length 1 is generated.
# Otherwise the function throws an error.
#
# @family prepare_arg
stri_prepare_arg_string <- function(x)
{
    # textual form of the expression the caller supplied for `x`
    # (presumably used by the native code in its diagnostics)
    arg_label <- deparse(substitute(x))
    .Call(C_stri_prepare_arg_string, x, arg_label)
}
# @title
# Prepare a Numeric Vector Argument [internal]
#
# @description
# This is an internal function. However, the interested user may play with it
# in order to get more insight on how \pkg{stringi} deals with its
# functions' arguments. See `Value' section for details.
# TODO: factors_as_strings
#
# @param x argument to be checked
#
# @return
# If \code{x} is a factor, \code{\link{as.character}} is called, and the
# resulting character vector is coerced to numeric.
# If it is an object equipped with a \code{class} attribute or a list,
# \code{as.double} is called.
# If it is a numeric vector, then it is returned with no change.
# If atomic vector or a matrix is given, it is coerced to a numeric vector.
# Otherwise the function throws an error.
#
# @family prepare_arg
stri_prepare_arg_double <- function(x)
{
    # textual form of the expression the caller supplied for `x`
    # (presumably used by the native code in its diagnostics)
    arg_label <- deparse(substitute(x))
    .Call(C_stri_prepare_arg_double, x, arg_label)
}
# @title
# Prepare an Integer Vector Argument [internal]
#
# @description
# This is an internal function. However, the interested user may play with it
# in order to get more insight on how \pkg{stringi} deals with its
# functions' arguments. See `Value' section for details.
# TODO: factors_as_strings
#
# @param x argument to be checked
#
# @return
# If \code{x} is a factor, \code{\link{as.character}} is called, and the
# resulting character vector is coerced to integer.
# If it is an object equipped with a \code{class} attribute or a list,
# \code{as.integer} is called.
# If it is an integer vector, then it is returned with no change.
# If an atomic vector or a matrix is given, it is coerced to an integer vector.
# Otherwise the function throws an error.
#
# @family prepare_arg
stri_prepare_arg_integer <- function(x)
{
    # textual form of the expression the caller supplied for `x`
    # (presumably used by the native code in its diagnostics)
    arg_label <- deparse(substitute(x))
    .Call(C_stri_prepare_arg_integer, x, arg_label)
}
# @title
# Prepare a Logical Vector Argument [internal]
#
# @description
# This is an internal function. However, the interested user may play with it
# in order to get more insight on how \pkg{stringi} deals with its
# functions' arguments. See `Value' section for details.
# TODO: factors_as_strings
#
# @param x argument to be checked
#
# @return
# If \code{x} is a logical vector, it is returned with no change.
# If it is an object equipped with a \code{class} attribute or a list,
# \code{as.logical} is called.
# If \code{x} is a factor, \code{\link{as.character}} is called, and the
# resulting character vector is coerced to logical.
# If atomic vector or a matrix is given, it is coerced to a logical vector.
# Otherwise the function throws an error.
#
# @family prepare_arg
stri_prepare_arg_logical <- function(x)
{
    # textual form of the expression the caller supplied for `x`
    # (presumably used by the native code in its diagnostics)
    arg_label <- deparse(substitute(x))
    .Call(C_stri_prepare_arg_logical, x, arg_label)
}
# @title
# Prepare a Raw Vector Argument [internal]
#
# @description
# This is an internal function. However, the interested user may play with it
# in order to get more insight on how \pkg{stringi} deals with its
# functions' arguments. See `Value' section for details.
# TODO: factors_as_strings
#
# @param x argument to be checked
#
# @return
# If \code{x} is a factor, \code{\link{as.character}} is called, and the
# resulting character vector is coerced to raw.
# If it is an object equipped with a \code{class} attribute or a list,
# \code{as.raw} is called.
# If \code{x} is a raw vector, it is returned with no change.
# If atomic vector or a matrix is given, it is coerced to a raw vector.
# Otherwise the function throws an error.
#
# @family prepare_arg
stri_prepare_arg_raw <- function(x)
{
    # textual form of the expression the caller supplied for `x`
    # (presumably used by the native code in its diagnostics)
    arg_label <- deparse(substitute(x))
    .Call(C_stri_prepare_arg_raw, x, arg_label)
}
# @title
# Prepare a String Vector Argument [Single Value] [internal]
#
# @description
# This is an internal function. However, the interested user may play with it
# in order to get more insight on how \pkg{stringi} deals with its
# functions' arguments. See `Value' section for details.
#
# @param x argument to be checked
# @return
# In the first place, \code{\link{stri_prepare_arg_string}} is called.
# On an empty vector, an error is generated.
# If there are more than 1 elements, a warning is generated.
# A vector with one element (the first in \code{x}) is returned.
#
# @family prepare_arg
stri_prepare_arg_string_1 <- function(x)
{
    # textual form of the expression the caller supplied for `x`
    # (presumably used by the native code in its diagnostics)
    arg_label <- deparse(substitute(x))
    .Call(C_stri_prepare_arg_string_1, x, arg_label)
}
# @title
# Prepare a Numeric Vector Argument [Single Value] [internal]
#
# @description
# This is an internal function. However, the interested user may play with it
# in order to get more insight on how \pkg{stringi} deals with its
# functions' arguments. See `Value' section for details.
# TODO: factors_as_strings
#
# @param x argument to be checked
# @return
# In the first place, \code{\link{stri_prepare_arg_double}} is called.
# On an empty vector, an error is generated.
# If there are more than 1 elements, a warning is generated.
# A vector with one element (the first in \code{x}) is returned.
#
# @family prepare_arg
stri_prepare_arg_double_1 <- function(x)
{
    # textual form of the expression the caller supplied for `x`
    # (presumably used by the native code in its diagnostics)
    arg_label <- deparse(substitute(x))
    .Call(C_stri_prepare_arg_double_1, x, arg_label)
}
# @title
# Prepare an Integer Vector Argument [Single Value] [internal]
#
# @description
# This is an internal function. However, the interested user may play with it
# in order to get more insight on how \pkg{stringi} deals with its
# functions' arguments. See `Value' section for details.
#
# TODO: factors_as_strings
#
# @param x argument to be checked
# @return
# In the first place, \code{\link{stri_prepare_arg_integer}} is called.
# On an empty vector, an error is generated.
# If there are more than 1 elements, a warning is generated.
# A vector with one element (the first in \code{x}) is returned.
#
# @family prepare_arg
stri_prepare_arg_integer_1 <- function(x)
{
    # textual form of the expression the caller supplied for `x`
    # (presumably used by the native code in its diagnostics)
    arg_label <- deparse(substitute(x))
    .Call(C_stri_prepare_arg_integer_1, x, arg_label)
}
# @title
# Prepare a Logical Vector Argument [Single Value] [internal]
#
# @description
# This is an internal function. However, the interested user may play with it
# in order to get more insight on how \pkg{stringi} deals with its
# functions' arguments. See `Value' section for details.
#
# @param x argument to be checked
# @return
# In the first place, \code{\link{stri_prepare_arg_logical}} is called.
# On an empty vector, an error is generated.
# If there are more than 1 elements, a warning is generated.
# A vector with one element (the first in \code{x}) is returned.
#
# @family prepare_arg
stri_prepare_arg_logical_1 <- function(x)
{
    # textual form of the expression the caller supplied for `x`
    # (presumably used by the native code in its diagnostics)
    arg_label <- deparse(substitute(x))
    .Call(C_stri_prepare_arg_logical_1, x, arg_label)
}
stringi/R/wrap.R 0000644 0001762 0000144 00000016265 14750110641 013237 0 ustar ligges users # kate: default-dictionary en_US
## This file is part of the 'stringi' package for R.
## Copyright (c) 2013-2025, Marek Gagolewski
## All rights reserved.
##
## Redistribution and use in source and binary forms, with or without
## modification, are permitted provided that the following conditions are met:
##
## 1. Redistributions of source code must retain the above copyright notice,
## this list of conditions and the following disclaimer.
##
## 2. Redistributions in binary form must reproduce the above copyright notice,
## this list of conditions and the following disclaimer in the documentation
## and/or other materials provided with the distribution.
##
## 3. Neither the name of the copyright holder nor the names of its
## contributors may be used to endorse or promote products derived from
## this software without specific prior written permission.
##
## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
## 'AS IS' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING,
## BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
## FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
## HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
## SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
## PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
## OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
## WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
## OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
## EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#' @title
#' Word Wrap Text to Format Paragraphs
#'
#' @description
#' This function breaks text paragraphs into lines,
#' of total width (if it is possible) at most given \code{width}.
#'
#' @details
#' Vectorized over \code{str}.
#'
#' If \code{whitespace_only} is \code{FALSE},
#' then \pkg{ICU}'s line-\code{BreakIterator} is used to determine
#' text boundaries where a line break is possible.
#' This is a locale-dependent operation.
#' Otherwise, the breaks are only at white-spaces.
#'
#' Note that Unicode code points may have various widths when
#' printed on the console and that this function, by default, takes that
#' into account. By changing the state of the \code{use_length}
#' argument, this function starts to act as if each code point
#' was of width 1.
#'
#' If \code{normalize} is \code{FALSE},
#' then multiple white spaces between the word boundaries are
#' preserved within each wrapped line.
#' In such a case, none of the strings can contain \code{\\r}, \code{\\n},
#' or other new line characters, otherwise you will get an error.
#' You should split the input text into lines
#' or, for example, substitute line breaks with spaces
#' before applying this function.
#'
#' If \code{normalize} is \code{TRUE}, then
#' all consecutive white space (ASCII space, horizontal TAB, CR, LF)
#' sequences are replaced with single ASCII spaces
#' before actual string wrapping. Moreover, \code{\link{stri_split_lines}}
#' and \code{\link{stri_trans_nfc}} are called on the input character vector.
#' This is for compatibility with \code{\link{strwrap}}.
#'
#' The greedy algorithm (for \code{cost_exponent} being non-positive)
#' provides a very simple way for word wrapping.
#' It always puts as many words in each line as possible.
#' This method -- contrary to the dynamic algorithm -- does not minimize
#' the number of spaces left at the end of every line.
#' The dynamic algorithm (a.k.a. Knuth's word wrapping algorithm)
#' is more complex, but it returns text wrapped
#' in a more aesthetic way. This method minimizes the squared
#' (by default, see \code{cost_exponent}) number of spaces (raggedness)
#' at the end of each line, so the text is more evenly arranged.
#' Note that the cost of printing the last line is always zero.
#'
#' @param str character vector of strings to reformat
#' @param width single integer giving the suggested
#' maximal total width/number of code points per line
#' @param cost_exponent single numeric value, values not greater than zero
#' will select a greedy word-wrapping algorithm; otherwise
#' this value denotes the exponent in the cost function
#' of a (more aesthetic) dynamic programming-based algorithm
#' (values in [2, 3] are recommended)
#' @param simplify single logical value, see Value
#' @param normalize single logical value, see Details
#' @param normalise alias of \code{normalize}
#' @param indent single non-negative integer; gives the indentation of the
#' first line in each paragraph
#' @param exdent single non-negative integer; specifies the indentation
#' of subsequent lines in paragraphs
#' @param prefix,initial single strings; \code{prefix} is used as prefix for each
#' line except the first, for which \code{initial} is utilized
#' @param whitespace_only single logical value; allow breaks only at white-spaces?
#' if \code{FALSE}, \pkg{ICU}'s line break iterator is used to split text
#' into words, which is suitable for natural language processing
#' @param locale \code{NULL} or \code{''} for text boundary analysis following
#' the conventions of the default locale, or a single string with
#' locale identifier, see \link{stringi-locale}
#' @param use_length single logical value; should the number of code
#' points be used instead of the total code point width (see \code{\link{stri_width}})?
#'
#' @return
#' If \code{simplify} is \code{TRUE}, then a character vector is returned.
#' Otherwise, you will get a list of \code{length(str)} character vectors.
#'
#' @rdname stri_wrap
#' @family locale_sensitive
#' @family text_boundaries
#' @examples
#' s <- stri_paste(
#' 'Lorem ipsum dolor sit amet, consectetur adipisicing elit. Proin ',
#' 'nibh augue, suscipit a, scelerisque sed, lacinia in, mi. Cras vel ',
#' 'lorem. Etiam pellentesque aliquet tellus.')
#' cat(stri_wrap(s, 20, 0.0), sep='\n') # greedy
#' cat(stri_wrap(s, 20, 2.0), sep='\n') # dynamic
#' cat(stri_pad(stri_wrap(s), side='both'), sep='\n')
#'
#' @references
#' D.E. Knuth, M.F. Plass,
#' Breaking paragraphs into lines, \emph{Software: Practice and Experience} 11(11),
#' 1981, pp. 1119--1184.
#'
#' @export
stri_wrap <- function(str, width = floor(0.9 * getOption("width")),
    cost_exponent = 2,
    simplify = TRUE, normalize = TRUE, normalise = normalize,
    indent = 0, exdent = 0, prefix = "", initial = prefix,
    whitespace_only = FALSE, use_length = FALSE, locale = NULL)
{
    simplify <- as.logical(simplify)

    # `normalise` is merely an alternative spelling; if it was given
    # explicitly, it overrides `normalize`
    if (!missing(normalise)) {
        normalize <- normalise
    }
    normalize <- as.logical(normalize)

    if (normalize) {
        # re-join the lines of every input string using single spaces
        joined <- sapply(stri_split_lines(str), stri_flatten, collapse = " ")

        # squeeze each run of space/CR/LF/TAB into one ASCII space,
        # then drop the white space at both ends
        squeezed <- stri_replace_all_charclass(joined, "[\\u0020\\r\\n\\t]", " ",
            merge = TRUE)
        trimmed <- stri_trim(squeezed)

        str <- stri_trans_nfc(trimmed)
    }

    wrapped <- .Call(C_stri_wrap, str, width, cost_exponent, indent, exdent, prefix,
        initial, whitespace_only, use_length, locale)

    if (simplify) {
        # flatten the per-string results into one character vector
        return(as.character(unlist(wrapped)))
    }

    wrapped
}
stringi/R/reverse.R 0000644 0001762 0000144 00000004524 14750110641 013734 0 ustar ligges users # kate: default-dictionary en_US
## This file is part of the 'stringi' package for R.
## Copyright (c) 2013-2025, Marek Gagolewski
## All rights reserved.
##
## Redistribution and use in source and binary forms, with or without
## modification, are permitted provided that the following conditions are met:
##
## 1. Redistributions of source code must retain the above copyright notice,
## this list of conditions and the following disclaimer.
##
## 2. Redistributions in binary form must reproduce the above copyright notice,
## this list of conditions and the following disclaimer in the documentation
## and/or other materials provided with the distribution.
##
## 3. Neither the name of the copyright holder nor the names of its
## contributors may be used to endorse or promote products derived from
## this software without specific prior written permission.
##
## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
## 'AS IS' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING,
## BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
## FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
## HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
## SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
## PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
## OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
## WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
## OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
## EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#' @title
#' Reverse Each String
#'
#' @description
#' Reverses the order of the code points in every string.
#'
#' @details
#' Note that this operation may result in non-Unicode-normalized
#' strings and may give peculiar outputs for bidirectional strings.
#'
#' See also \code{\link{stri_rand_shuffle}} for a random permutation
#' of code points.
#'
#' @param str character vector
#'
#' @return Returns a character vector.
#'
#' @examples
#' stri_reverse(c('123', 'abc d e f'))
#' stri_reverse('ZXY (\u0105\u0104123$^).')
#' stri_reverse(stri_trans_nfd('\u0105')) == stri_trans_nfd('\u0105') # A, ogonek -> ogonek, A
#'
#' @export
stri_reverse <- function(str)
{
    # all the work is delegated to the native routine
    reversed <- .Call(C_stri_reverse, str)
    reversed
}
stringi/R/search_extract_bound.R 0000644 0001762 0000144 00000013413 14750110641 016444 0 ustar ligges users # kate: default-dictionary en_US
## This file is part of the 'stringi' package for R.
## Copyright (c) 2013-2025, Marek Gagolewski
## All rights reserved.
##
## Redistribution and use in source and binary forms, with or without
## modification, are permitted provided that the following conditions are met:
##
## 1. Redistributions of source code must retain the above copyright notice,
## this list of conditions and the following disclaimer.
##
## 2. Redistributions in binary form must reproduce the above copyright notice,
## this list of conditions and the following disclaimer in the documentation
## and/or other materials provided with the distribution.
##
## 3. Neither the name of the copyright holder nor the names of its
## contributors may be used to endorse or promote products derived from
## this software without specific prior written permission.
##
## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
## 'AS IS' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING,
## BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
## FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
## HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
## SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
## PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
## OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
## WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
## OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
## EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#' @title
#' Extract Data Between Text Boundaries
#'
#' @description
#' These functions extract data between text boundaries.
#'
#' @details
#' Vectorized over \code{str}.
#'
#' For more information on text boundary analysis
#' performed by \pkg{ICU}'s \code{BreakIterator}, see
#' \link{stringi-search-boundaries}.
#'
#' In case of \code{stri_extract_*_words},
#' just like in \code{\link{stri_count_words}},
#' \pkg{ICU}'s word \code{BreakIterator} iterator is used
#' to locate the word boundaries, and all non-word characters
#' (\code{UBRK_WORD_NONE} rule status) are ignored.
#'
#'
#' @param str character vector or an object coercible to one
#' @param omit_no_match single logical value; if \code{FALSE},
#' then a missing value will indicate that there are no words
#' @param simplify single logical value;
#' if \code{TRUE} or \code{NA}, then a character matrix is returned;
#' otherwise (the default), a list of character vectors is given, see Value
#' @param opts_brkiter a named list with \pkg{ICU} BreakIterator's settings,
#' see \code{\link{stri_opts_brkiter}};
#' \code{NULL} for the default break iterator, i.e., \code{line_break}
#' @param ... additional settings for \code{opts_brkiter}
#' @param locale \code{NULL} or \code{''} for text boundary analysis following
#' the conventions of the default locale, or a single string with
#' locale identifier, see \link{stringi-locale}
#'
#' @return
#' For \code{stri_extract_all_*},
#' if \code{simplify=FALSE} (the default), then a
#' list of character vectors is returned. Each string consists of
#' a separate word. In case of \code{omit_no_match=FALSE} and
#' if there are no words or if a string is missing,
#' a single \code{NA} is provided on output.
#'
#' Otherwise, \code{\link{stri_list2matrix}} with \code{byrow=TRUE} argument
#' is called on the resulting object.
#' In such a case, a character matrix with \code{length(str)} rows
#' is returned. Note that \code{\link{stri_list2matrix}}'s \code{fill} argument
#' is set to an empty string and \code{NA},
#' for \code{simplify} \code{TRUE} and \code{NA}, respectively.
#'
#' For \code{stri_extract_first_*} and \code{stri_extract_last_*},
#' a character vector is returned.
#' A \code{NA} element indicates a no-match.
#'
#' @examples
#' stri_extract_all_words('stringi: THE string processing package 123.48...')
#'
#' @export
#' @family search_extract
#' @family locale_sensitive
#' @family text_boundaries
#' @rdname stri_extract_boundaries
stri_extract_all_boundaries <- function(str, simplify = FALSE, omit_no_match = FALSE,
    ..., opts_brkiter = NULL)
{
    # settings given via `...` are appended to `opts_brkiter` and the
    # combined set is passed through stri_opts_brkiter()
    if (!missing(...)) {
        brk_args <- as.list(c(opts_brkiter, ...))
        opts_brkiter <- do.call(stri_opts_brkiter, brk_args)
    }

    .Call(C_stri_extract_all_boundaries, str, simplify, omit_no_match, opts_brkiter)
}
#' @export
#' @rdname stri_extract_boundaries
stri_extract_last_boundaries <- function(str, ..., opts_brkiter = NULL)
{
    # settings given via `...` are appended to `opts_brkiter` and the
    # combined set is passed through stri_opts_brkiter()
    if (!missing(...)) {
        brk_args <- as.list(c(opts_brkiter, ...))
        opts_brkiter <- do.call(stri_opts_brkiter, brk_args)
    }

    .Call(C_stri_extract_last_boundaries, str, opts_brkiter)
}
#' @export
#' @rdname stri_extract_boundaries
stri_extract_first_boundaries <- function(str, ..., opts_brkiter = NULL)
{
    # settings given via `...` are appended to `opts_brkiter` and the
    # combined set is passed through stri_opts_brkiter()
    if (!missing(...)) {
        brk_args <- as.list(c(opts_brkiter, ...))
        opts_brkiter <- do.call(stri_opts_brkiter, brk_args)
    }

    .Call(C_stri_extract_first_boundaries, str, opts_brkiter)
}
#' @export
#' @rdname stri_extract_boundaries
stri_extract_all_words <- function(str, simplify = FALSE, omit_no_match = FALSE,
    locale = NULL)
{
    # delegate to the generic extractor with a word-type break iterator
    # configured with skip_word_none = TRUE
    stri_extract_all_boundaries(
        str = str,
        simplify = simplify,
        omit_no_match = omit_no_match,
        opts_brkiter = stri_opts_brkiter(type = "word", skip_word_none = TRUE,
            locale = locale)
    )
}
#' @export
#' @rdname stri_extract_boundaries
stri_extract_first_words <- function(
    str, locale = NULL
) {
    # Convenience front end: first-boundary extraction with a word-type
    # break iterator and skip_word_none switched on.
    stri_extract_first_boundaries(
        str,
        opts_brkiter = stri_opts_brkiter(type = "word", skip_word_none = TRUE, locale = locale)
    )
}
#' @export
#' @rdname stri_extract_boundaries
stri_extract_last_words <- function(
    str, locale = NULL
) {
    # Convenience front end: last-boundary extraction with a word-type
    # break iterator and skip_word_none switched on.
    stri_extract_last_boundaries(
        str,
        opts_brkiter = stri_opts_brkiter(type = "word", skip_word_none = TRUE, locale = locale)
    )
}
stringi/R/trans_other.R 0000644 0001762 0000144 00000005471 14750110641 014613 0 ustar ligges users # kate: default-dictionary en_US
## This file is part of the 'stringi' package for R.
## Copyright (c) 2013-2025, Marek Gagolewski
## All rights reserved.
##
## Redistribution and use in source and binary forms, with or without
## modification, are permitted provided that the following conditions are met:
##
## 1. Redistributions of source code must retain the above copyright notice,
## this list of conditions and the following disclaimer.
##
## 2. Redistributions in binary form must reproduce the above copyright notice,
## this list of conditions and the following disclaimer in the documentation
## and/or other materials provided with the distribution.
##
## 3. Neither the name of the copyright holder nor the names of its
## contributors may be used to endorse or promote products derived from
## this software without specific prior written permission.
##
## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
## 'AS IS' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING,
## BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
## FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
## HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
## SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
## PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
## OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
## WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
## OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
## EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#' @title
#' Translate Characters
#'
#' @description
#' Translates Unicode code points in each input string.
#'
#' @details
#' Vectorized over \code{str} and with respect to each code point
#' in \code{pattern} and \code{replacement}.
#'
#' If \code{pattern} and \code{replacement} consist of a different number
#' of code points, then the extra code points in the longer of the two
#' are ignored, with a warning.
#'
#' If code points in a given \code{pattern} are not unique, the
#' last corresponding replacement code point is used.
#'
#' Time complexity for each string in \code{str} is
#' O(\code{stri_length(str)*stri_length(pattern)}).
#'
#' @param str character vector
#' @param pattern a single character string providing code points to be translated
#' @param replacement a single character string giving translated code points
#'
#' @return Returns a character vector.
#'
#' @export
#' @family transform
#' @examples
#' stri_trans_char('id.123', '.', '_')
#' stri_trans_char('babaab', 'ab', '01')
#' stri_trans_char('GCUACGGAGCUUCGGAGCUAG', 'ACGT', 'TGCA')
stri_trans_char <- function(str, pattern, replacement)
{
    # No R-level processing: the three arguments go to the native routine as-is.
    .Call(C_stri_trans_char, str, pattern, replacement)
}
stringi/R/search_locate_bound.R 0000644 0001762 0000144 00000014034 14750110641 016241 0 ustar ligges users # kate: default-dictionary en_US
## This file is part of the 'stringi' package for R.
## Copyright (c) 2013-2025, Marek Gagolewski
## All rights reserved.
##
## Redistribution and use in source and binary forms, with or without
## modification, are permitted provided that the following conditions are met:
##
## 1. Redistributions of source code must retain the above copyright notice,
## this list of conditions and the following disclaimer.
##
## 2. Redistributions in binary form must reproduce the above copyright notice,
## this list of conditions and the following disclaimer in the documentation
## and/or other materials provided with the distribution.
##
## 3. Neither the name of the copyright holder nor the names of its
## contributors may be used to endorse or promote products derived from
## this software without specific prior written permission.
##
## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
## 'AS IS' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING,
## BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
## FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
## HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
## SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
## PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
## OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
## WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
## OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
## EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#' @title Locate Text Boundaries
#'
#' @description
#' These functions locate text boundaries
#' (like character, word, line, or sentence boundaries).
#' Use \code{stri_locate_all_*} to locate all the matches.
#' \code{stri_locate_first_*} and \code{stri_locate_last_*}
#' give the first or the last matches, respectively.
#'
#'
#' @details
#' Vectorized over \code{str}.
#'
#' For more information on text boundary analysis
#' performed by \pkg{ICU}'s \code{BreakIterator}, see
#' \link{stringi-search-boundaries}.
#'
#' For \code{stri_locate_*_words},
#' just like in \code{\link{stri_extract_all_words}} and \code{\link{stri_count_words}},
#' \pkg{ICU}'s word \code{BreakIterator} iterator is used
#' to locate the word boundaries, and all non-word characters
#' (\code{UBRK_WORD_NONE} rule status) are ignored.
#' This function is equivalent to a call to
#' \code{stri_locate_*_boundaries(str, type='word', skip_word_none=TRUE, locale=locale)}
#'
#'
#'
#' @param str character vector or an object coercible to one
#'
#' @param omit_no_match single logical value; if \code{TRUE},
#' a no-match will be indicated by a matrix with 0 rows
#' [\code{stri_locate_all_*} only]
#'
#' @param opts_brkiter named list with \pkg{ICU} BreakIterator's settings,
#' see \code{\link{stri_opts_brkiter}};
#' \code{NULL} for default break iterator, i.e., \code{line_break}
#'
#' @param ... additional settings for \code{opts_brkiter}
#'
#' @param locale \code{NULL} or \code{''} for text boundary analysis following
#' the conventions of the default locale, or a single string with
#' locale identifier, see \link{stringi-locale}
#'
#' @param get_length single logical value; if \code{FALSE} (default),
#' generate \emph{from-to} matrices; otherwise, output
#' \emph{from-length} ones
#'
#'
#' @return
#' \code{stri_locate_all_*} yields a list of \code{length(str)}
#' integer matrices.
#' \code{stri_locate_first_*} and \code{stri_locate_last_*}
#' return an integer matrix.
#' See \code{\link{stri_locate}} for more details.
#'
#'
#' @examples
#' test <- 'The\u00a0above-mentioned features are very useful. Spam, spam, eggs, bacon, and spam.'
#' stri_locate_all_words(test)
#' stri_locate_all_boundaries(
#' 'Mr. Jones and Mrs. Brown are very happy. So am I, Prof. Smith.',
#' type='sentence',
#' locale='en_US@ss=standard' # ICU >= 56 only
#' )
#'
#'
#'
#' @export
#' @family search_locate
#' @family indexing
#' @family locale_sensitive
#' @family text_boundaries
#' @rdname stri_locate_boundaries
stri_locate_all_boundaries <- function(
    str, omit_no_match=FALSE, get_length=FALSE, ..., opts_brkiter=NULL
) {
    # Settings given via `...` are appended to `opts_brkiter` and the
    # combined set is passed through stri_opts_brkiter().
    if (!missing(...)) {
        brkiter_args <- as.list(c(opts_brkiter, ...))
        opts_brkiter <- do.call(stri_opts_brkiter, brkiter_args)
    }

    # note the native argument order: opts_brkiter comes before get_length
    .Call(C_stri_locate_all_boundaries, str, omit_no_match, opts_brkiter, get_length)
}
#' @export
#' @rdname stri_locate_boundaries
stri_locate_last_boundaries <- function(
    str, get_length=FALSE, ..., opts_brkiter=NULL
) {
    # Settings given via `...` are appended to `opts_brkiter` and the
    # combined set is passed through stri_opts_brkiter().
    if (!missing(...)) {
        brkiter_args <- as.list(c(opts_brkiter, ...))
        opts_brkiter <- do.call(stri_opts_brkiter, brkiter_args)
    }

    # note the native argument order: opts_brkiter comes before get_length
    .Call(C_stri_locate_last_boundaries, str, opts_brkiter, get_length)
}
#' @export
#' @rdname stri_locate_boundaries
stri_locate_first_boundaries <- function(
    str, get_length=FALSE, ..., opts_brkiter=NULL
) {
    # Settings given via `...` are appended to `opts_brkiter` and the
    # combined set is passed through stri_opts_brkiter().
    if (!missing(...)) {
        brkiter_args <- as.list(c(opts_brkiter, ...))
        opts_brkiter <- do.call(stri_opts_brkiter, brkiter_args)
    }

    # note the native argument order: opts_brkiter comes before get_length
    .Call(C_stri_locate_first_boundaries, str, opts_brkiter, get_length)
}
#' @export
#' @rdname stri_locate_boundaries
stri_locate_all_words <- function(
    str, omit_no_match=FALSE, locale=NULL, get_length=FALSE
) {
    # Convenience front end: all-boundaries location with a word-type
    # break iterator and skip_word_none switched on.
    stri_locate_all_boundaries(
        str,
        omit_no_match=omit_no_match,
        get_length=get_length,
        opts_brkiter=stri_opts_brkiter(type="word", skip_word_none=TRUE, locale=locale)
    )
}
#' @export
#' @rdname stri_locate_boundaries
stri_locate_last_words <- function(str, locale=NULL, get_length=FALSE)
{
    # Convenience front end: last-boundary location with a word-type
    # break iterator and skip_word_none switched on.
    stri_locate_last_boundaries(
        str,
        get_length=get_length,
        opts_brkiter=stri_opts_brkiter(type="word", skip_word_none=TRUE, locale=locale)
    )
}
#' @export
#' @rdname stri_locate_boundaries
stri_locate_first_words <- function(str, locale=NULL, get_length=FALSE)
{
    # Convenience front end: first-boundary location with a word-type
    # break iterator and skip_word_none switched on.
    stri_locate_first_boundaries(
        str,
        get_length=get_length,
        opts_brkiter=stri_opts_brkiter(type="word", skip_word_none=TRUE, locale=locale)
    )
}
stringi/R/time_symbols.R 0000644 0001762 0000144 00000007721 14750110641 014771 0 ustar ligges users # kate: default-dictionary en_US
## This file is part of the 'stringi' package for R.
## Copyright (c) 2013-2025, Marek Gagolewski
## All rights reserved.
##
## Redistribution and use in source and binary forms, with or without
## modification, are permitted provided that the following conditions are met:
##
## 1. Redistributions of source code must retain the above copyright notice,
## this list of conditions and the following disclaimer.
##
## 2. Redistributions in binary form must reproduce the above copyright notice,
## this list of conditions and the following disclaimer in the documentation
## and/or other materials provided with the distribution.
##
## 3. Neither the name of the copyright holder nor the names of its
## contributors may be used to endorse or promote products derived from
## this software without specific prior written permission.
##
## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
## 'AS IS' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING,
## BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
## FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
## HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
## SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
## PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
## OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
## WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
## OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
## EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#' @title
#' List Localizable Date-Time Formatting Data
#'
#' @description
#' Returns a list of all localizable date-time formatting data,
#' including month and weekday names, localized AM/PM strings, etc.
#'
#' @details
#' \code{context} stands for a selector for date formatting context
#' and \code{width} - for date formatting width.
#'
#'
#' @param locale \code{NULL} or \code{''} for default locale,
#' or a single string with locale identifier
#' @param context single string; one of: \code{'format'}, \code{'standalone'}
#' @param width single string; one of: \code{'abbreviated'}, \code{'wide'}, \code{'narrow'}
#'
#' @return Returns a list with the following named components:
#' \enumerate{
#' \item \code{Month} - month names,
#' \item \code{Weekday} - weekday names,
#' \item \code{Quarter} - quarter names,
#' \item \code{AmPm} - AM/PM names,
#' \item \code{Era} - era names.
#' }
#'
#' @examples
#' stri_datetime_symbols() # uses the Gregorian calendar in most locales
#' stri_datetime_symbols('@@calendar=hebrew')
#' stri_datetime_symbols('he_IL@@calendar=hebrew')
#' stri_datetime_symbols('@@calendar=islamic')
#' stri_datetime_symbols('@@calendar=persian')
#' stri_datetime_symbols('@@calendar=indian')
#' stri_datetime_symbols('@@calendar=coptic')
#' stri_datetime_symbols('@@calendar=japanese')
#'
#' stri_datetime_symbols('ja_JP_TRADITIONAL') # uses the Japanese calendar by default
#' stri_datetime_symbols('th_TH_TRADITIONAL') # uses the Buddhist calendar
#'
#' stri_datetime_symbols('pl_PL', context='format')
#' stri_datetime_symbols('pl_PL', context='standalone')
#'
#' stri_datetime_symbols(width='wide')
#' stri_datetime_symbols(width='abbreviated')
#' stri_datetime_symbols(width='narrow')
#'
#' @references
#' \emph{Calendar} - ICU User Guide,
#' \url{https://unicode-org.github.io/icu/userguide/datetime/calendar/}
#'
#' \emph{DateFormatSymbols} class -- ICU API Documentation,
#' \url{https://unicode-org.github.io/icu-docs/apidoc/dev/icu4c/classicu_1_1DateFormatSymbols.html}
#'
#' \emph{Formatting Dates and Times} -- ICU User Guide,
#' \url{https://unicode-org.github.io/icu/userguide/format_parse/datetime/}
#'
#' @family datetime
#' @export
stri_datetime_symbols <- function(
    locale=NULL, context="standalone", width="wide"
) {
    # TODO: get first day of week
    # No R-level processing: the three arguments go to the native routine as-is.
    .Call(C_stri_datetime_symbols, locale, context, width)
}
stringi/R/random.R 0000644 0001762 0000144 00000022722 14750110641 013541 0 ustar ligges users # kate: default-dictionary en_US
## This file is part of the 'stringi' package for R.
## Copyright (c) 2013-2025, Marek Gagolewski
## All rights reserved.
##
## Redistribution and use in source and binary forms, with or without
## modification, are permitted provided that the following conditions are met:
##
## 1. Redistributions of source code must retain the above copyright notice,
## this list of conditions and the following disclaimer.
##
## 2. Redistributions in binary form must reproduce the above copyright notice,
## this list of conditions and the following disclaimer in the documentation
## and/or other materials provided with the distribution.
##
## 3. Neither the name of the copyright holder nor the names of its
## contributors may be used to endorse or promote products derived from
## this software without specific prior written permission.
##
## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
## 'AS IS' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING,
## BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
## FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
## HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
## SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
## PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
## OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
## WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
## OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
## EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#' @title
#' Randomly Shuffle Code Points in Each String
#'
#' @description
#' Generates a (pseudo)random permutation of the code points
#' in each string.
#'
#' @details
#' This operation may result in non-Unicode-normalized
#' strings and may give peculiar outputs in case of bidirectional strings.
#'
#' See also \code{\link{stri_reverse}} for reversing the order of code points.
#'
#' @param str character vector
#'
#' @return Returns a character vector.
#'
#' @examples
#' stri_rand_shuffle(c('abcdefghi', '0123456789'))
#' # you can do better than this with stri_rand_strings:
#' stri_rand_shuffle(rep(stri_paste(letters, collapse=''), 10))
#'
#' @family random
#' @export
stri_rand_shuffle <- function(str) {
    # No R-level processing: `str` goes to the native routine as-is.
    .Call(C_stri_rand_shuffle, str)
}
#' @title
#' Generate Random Strings
#'
#' @description
#' Generates (pseudo)random strings of desired lengths.
#'
#' @details
#' Vectorized over \code{length} and \code{pattern}.
#' If length of \code{length} or \code{pattern} is greater than \code{n},
#' then redundant elements are ignored. Otherwise,
#' these vectors are recycled if necessary.
#'
#' This operation may result in non-Unicode-normalized
#' strings and may give peculiar outputs for bidirectional strings.
#'
#' Sampling of code points from the set specified by \code{pattern}
#' is always done with replacement and each code point appears with equal
#' probability.
#'
#' @param n single integer, number of observations
#' @param length integer vector, desired string lengths
#' @param pattern character vector specifying character classes to draw
#' elements from, see \link{stringi-search-charclass}
#'
#' @return Returns a character vector.
#'
#' @examples
#' stri_rand_strings(5, 10) # 5 strings of length 10
#' stri_rand_strings(5, sample(1:10, 5, replace=TRUE)) # 5 strings of random lengths
#' stri_rand_strings(10, 5, '[\\p{script=latin}&\\p{Ll}]') # small letters from the Latin script
#'
#' # generate n random passwords of length in [8, 14]
#' # consisting of at least one digit, small and big ASCII letter:
#' n <- 10
#' stri_rand_shuffle(stri_paste(
#' stri_rand_strings(n, 1, '[0-9]'),
#' stri_rand_strings(n, 1, '[a-z]'),
#' stri_rand_strings(n, 1, '[A-Z]'),
#' stri_rand_strings(n, sample(5:11, n, replace=TRUE), '[a-zA-Z0-9]')
#' ))
#'
#' @family random
#' @export
stri_rand_strings <- function(
    n, length, pattern = "[A-Za-z0-9]"
) {
    # No R-level processing: the three arguments go to the native routine as-is.
    .Call(C_stri_rand_strings, n, length, pattern)
}
#' @title
#' A Lorem Ipsum Generator
#'
#' @description
#' Generates (pseudo)random \emph{lorem ipsum} text consisting
#' of a given number of text paragraphs.
#'
#' @details
#' \emph{Lorem ipsum} is a dummy text often used as a source
#' of data for string processing and displaying/lay-outing exercises.
#'
#' The current implementation is very simple:
#' words are selected randomly from a Zipf distribution
#' (based on a set of ca. 190 predefined Latin words).
#' The number of words per sentence and sentences per paragraph
#' follows a discretized, truncated normal distribution.
#' No Markov chain modeling, just i.i.d. word selection.
#'
#' @param n_paragraphs single integer, number of paragraphs to generate
#' @param start_lipsum single logical value; should the resulting
#' text start with \emph{Lorem ipsum dolor sit amet}?
#' @param nparagraphs [DEPRECATED] alias of \code{n_paragraphs}
#'
#' @return Returns a character vector of length \code{n_paragraphs}.
#'
#' @examples
#' cat(sapply(
#' stri_wrap(stri_rand_lipsum(10), 80, simplify=FALSE),
#' stri_flatten, collapse='\n'), sep='\n\n')
#' cat(stri_rand_lipsum(10), sep='\n\n')
#'
#' @family random
#' @export
stri_rand_lipsum <- function(n_paragraphs, start_lipsum = TRUE,
    nparagraphs=n_paragraphs)
{
    if (!missing(nparagraphs) && missing(n_paragraphs)) {  # DEPRECATED
        warning("The 'nparagraphs' argument in stri_rand_lipsum is a deprecated alias of 'n_paragraphs' and will be removed in a future release of 'stringi'.")
        n_paragraphs <- nparagraphs
    }

    # Whoa! A pure R function in stringi :)
    # Version 0.3-1 (Marek Gagolewski, 2014-10-16)

    n_paragraphs <- as.integer(n_paragraphs)
    # `n_paragraphs` must be exactly one value: the r*() generators used below
    # treat a longer `n` as "generate length(n) values", which would silently
    # give the wrong number of paragraphs, and a zero-length vector would slip
    # through the remaining (vacuously true) checks.
    stopifnot(length(n_paragraphs) == 1L, is.finite(n_paragraphs), n_paragraphs >= 1)
    # anything other than a plain TRUE disables the fixed opening phrase
    start_lipsum <- identical(start_lipsum, TRUE)

    rwords <- function(n) {
        # generate n random words
        words <- c("SED", "IN", "UT", "ET", "AC", "EU", "NON", "NEC", "AMET", "SIT",
            "VEL", "AT", "MAURIS", "A", "VITAE", "EGET", "QUIS", "NUNC", "NULLA",
            "ID", "VESTIBULUM", "PELLENTESQUE", "TINCIDUNT", "ALIQUAM", "IPSUM",
            "DONEC", "TURPIS", "LIGULA", "EGESTAS", "NIBH", "SAPIEN", "ANTE", "NISL",
            "VELIT", "ERAT", "EROS", "LEO", "MAGNA", "JUSTO", "ENIM", "MI", "PURUS",
            "EST", "LACUS", "LOREM", "QUAM", "DIAM", "RISUS", "DOLOR", "SEM", "AUGUE",
            "NEQUE", "TEMPOR", "DUI", "ARCU", "METUS", "TORTOR", "URNA", "LIBERO",
            "PHARETRA", "TEMPUS", "FAUCIBUS", "LECTUS", "SUSPENDISSE", "FELIS", "ODIO",
            "ORCI", "VARIUS", "MASSA", "TELLUS", "VOLUTPAT", "BLANDIT", "INTERDUM",
            "LOBORTIS", "MAXIMUS", "NISI", "LUCTUS", "PORTTITOR", "AUCTOR", "ELEMENTUM",
            "EX", "MAECENAS", "MALESUADA", "TRISTIQUE", "ULLAMCORPER", "ULTRICES",
            "NULLAM", "CONSEQUAT", "LACINIA", "PHASELLUS", "ACCUMSAN", "DAPIBUS",
            "ELEIFEND", "COMMODO", "DUIS", "EFFICITUR", "ELIT", "IMPERDIET", "AENEAN",
            "IACULIS", "NAM", "CONSECTETUR", "FERMENTUM", "PORTA", "SCELERISQUE",
            "SODALES", "FEUGIAT", "LAOREET", "VULPUTATE", "DICTUM", "QUISQUE", "FACILISIS",
            "FINIBUS", "ORNARE", "PULVINAR", "RHONCUS", "CONDIMENTUM", "MOLLIS",
            "PRETIUM", "ALIQUET", "CONGUE", "POSUERE", "SUSCIPIT", "ULTRICIES", "CURABITUR",
            "GRAVIDA", "MATTIS", "VIVERRA", "CURSUS", "EUISMOD", "RUTRUM", "VENENATIS",
            "CONVALLIS", "PROIN", "VEHICULA", "PLACERAT", "SAGITTIS", "CRAS", "INTEGER",
            "MORBI", "VIVAMUS", "PRAESENT", "BIBENDUM", "MOLESTIE", "SEMPER", "FRINGILLA",
            "FUSCE", "DIGNISSIM", "ETIAM", "HENDRERIT", "SOLLICITUDIN", "PER", "FAMES",
            "POTENTI", "AD", "APTENT", "CLASS", "CONUBIA", "HIMENAEOS", "INCEPTOS",
            "LITORA", "NOSTRA", "SOCIOSQU", "TACITI", "TORQUENT", "HABITANT", "NETUS",
            "SENECTUS", "PRIMIS", "CUM", "DIS", "MAGNIS", "MONTES", "MUS", "NASCETUR",
            "NATOQUE", "PARTURIENT", "PENATIBUS", "RIDICULUS", "SOCIIS", "ADIPISCING",
            "FACILISI", "CUBILIA", "CURAE", "DICTUMST", "HABITASSE", "HAC", "PLATEA")

        # Zipf distribution
        dzipf <- function(k, N, s) 1/k^s/sum(1/(1:N)^s)
        # cumulative probabilities with a leading 0, so that findInterval()
        # maps a uniform draw onto a 1-based index into `words`
        pzipf.y <- c(0, cumsum(dzipf(1:length(words), length(words), 0.5)))
        robs <- findInterval(runif(n), pzipf.y)
        words[robs]
    }

    rtruncnorm <- function(n, a, b, mu, sd) {
        # truncated discretized normal distribution
        x <- round(rnorm(n, mu, sd))
        # rejection step: redraw only the entries that fell outside [a, b]
        while (any(x < a | x > b)) x[x < a | x > b] <- round(rnorm(sum(x < a | x >
            b), mu, sd))
        x
    }

    # number of sentences in each paragraph
    sent_para <- rtruncnorm(n_paragraphs, 7, 20, 11, 3)
    # for each paragraph, the number of words in each of its sentences
    word_sent <- lapply(sent_para, function(numsent) rtruncnorm(numsent, 2, Inf,
        8, 3))
    totwords <- sum(unlist(word_sent))

    words <- rwords(totwords)
    # separator following each word: mostly a space, sometimes a comma
    seps <- sample(c(" ", ", "), replace = TRUE, size = totwords, prob = c(0.9, 0.1))
    seps[cumsum(unlist(word_sent))] <- sample(c(". ", "? ", "! "), size = length(unlist(word_sent)),
        replace = TRUE, prob = c(0.95, 0.025, 0.025))  # end of sentence
    seps[cumsum(sapply(word_sent, sum))] <- ".\n"  # end of para
    seps[totwords] <- "."  # very last sentence in very last para

    if (start_lipsum) {
        words <- c("LOREM", "IPSUM", "DOLOR", "SIT", "AMET", words)
        seps <- c(" ", " ", " ", " ", ", ", seps)
    }

    # glue everything into one string, cut it at the paragraph-ending
    # newlines, then title-case at sentence boundaries
    ret <- stri_split_charclass(stri_paste(words, seps, collapse = ""), "[\\n]")[[1]]
    ret <- stri_trans_totitle(ret, opts_brkiter = stri_opts_brkiter(type = "sentence"))
    ret
}
stringi/R/sub.R 0000644 0001762 0000144 00000032532 14750110641 013052 0 ustar ligges users # kate: default-dictionary en_US
## This file is part of the 'stringi' package for R.
## Copyright (c) 2013-2025, Marek Gagolewski
## All rights reserved.
##
## Redistribution and use in source and binary forms, with or without
## modification, are permitted provided that the following conditions are met:
##
## 1. Redistributions of source code must retain the above copyright notice,
## this list of conditions and the following disclaimer.
##
## 2. Redistributions in binary form must reproduce the above copyright notice,
## this list of conditions and the following disclaimer in the documentation
## and/or other materials provided with the distribution.
##
## 3. Neither the name of the copyright holder nor the names of its
## contributors may be used to endorse or promote products derived from
## this software without specific prior written permission.
##
## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
## 'AS IS' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING,
## BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
## FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
## HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
## SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
## PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
## OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
## WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
## OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
## EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#' @title
#' Extract a Substring From or Replace a Substring In a Character Vector
#'
#' @description
#' \code{stri_sub} extracts particular substrings at code point-based
#' index ranges provided. Its replacement version allows for substituting
#' (in-place) parts of
#' a string with given replacement strings. \code{stri_sub_replace}
#' is its forward pipe operator-friendly variant that returns
#' a copy of the input vector.
#'
#' For extracting/replacing multiple substrings from/within each string, see
#' \code{\link{stri_sub_all}}.
#'
#' @details
#' Vectorized over \code{str}, [\code{value}], \code{from} and
#' (\code{to} or \code{length}). Parameters
#' \code{to} and \code{length} are mutually exclusive.
#'
#' Indexes are 1-based, i.e., the start of a string is at index 1.
#' For negative indexes in \code{from} or \code{to},
#' counting starts at the end of the string.
#' For instance, index -1 denotes the last code point in the string.
#' Non-positive \code{length} gives an empty string.
#'
#' Argument \code{from} gives the start of a substring to extract.
#' Argument \code{to} defines the last index of a substring, inclusive.
#' Alternatively, its \code{length} may be provided.
#'
#' If \code{from} is a two-column matrix, then these two columns are
#' used as \code{from} and \code{to}, respectively,
#' unless the second column is named \code{length}.
#' In such a case anything passed
#' explicitly as \code{to} or \code{length} is ignored.
#' Such types of index matrices are generated by \code{\link{stri_locate_first}}
#' and \code{\link{stri_locate_last}}. If extraction based on
#' \code{\link{stri_locate_all}} is needed, see
#' \code{\link{stri_sub_all}}.
#'
#' In \code{stri_sub}, out-of-bound indexes are silently
#' corrected. If \code{from} > \code{to}, then an empty string is returned.
#' By default, negative \code{length} results in the corresponding output being
#' \code{NA}, see \code{ignore_negative_length}, though.
#'
#' In \code{stri_sub<-}, some configurations of indexes may work as
#' substring 'injection' at the front, back, or in the middle.
#' Negative \code{length} does not alter the corresponding input string.
#'
#' If both \code{to} and \code{length} are provided,
#' \code{length} has priority over \code{to}.
#'
#' Note that for some Unicode strings, the extracted substrings might not
#' be well-formed, especially if input strings are not normalized
#' (see \code{\link{stri_trans_nfc}}),
#' include byte order marks, Bidirectional text marks, and so on.
#' Handle with care.
#'
#'
#'
#'
#' @param str character vector
#'
#' @param from integer vector giving the start indexes; alternatively,
#' if \code{use_matrix=TRUE},
#' a two-column matrix of type \code{cbind(from, to)}
#' (unnamed columns or the 2nd column named other than \code{length})
#' or \code{cbind(from, length=length)} (2nd column named \code{length})
#'
#' @param to integer vector giving the end indexes; mutually exclusive with
#' \code{length} and \code{from} being a matrix
#'
#' @param length integer vector giving the substring lengths;
#' mutually exclusive with \code{to} and \code{from} being a matrix
#'
#' @param omit_na single logical value; indicates whether missing values
#' in any of the indexes or in \code{value} leave the corresponding input string
#' unchanged [replacement function only]
#'
#' @param use_matrix single logical value; see \code{from}
#'
#' @param replacement alias of \code{value} [wherever applicable]
#'
#' @param value a character vector defining the replacement strings
#' [replacement function only]
#'
#' @param ignore_negative_length single logical value; whether
#' negative lengths should be ignored or result in missing values
#'
#' @param ... arguments to be passed to \code{stri_sub<-}
#'
#'
#' @return
#' \code{stri_sub} and \code{stri_sub_replace} return a character vector.
#' \code{stri_sub<-} changes the \code{str} object 'in-place'.
#'
#' @examples
#' s <- c("spam, spam, bacon, and spam", "eggs and spam")
#' stri_sub(s, from=-4)
#' stri_sub(s, from=1, length=c(10, 4))
#' (stri_sub(s, 1, 4) <- 'stringi')
#'
#' x <- c('12 3456 789', 'abc', '', NA, '667')
#' stri_sub(x, stri_locate_first_regex(x, '[0-9]+')) # see stri_extract_first
#' stri_sub(x, stri_locate_last_regex(x, '[0-9]+')) # see stri_extract_last
#'
#' stri_sub_replace(x, stri_locate_first_regex(x, '[0-9]+'),
#' omit_na=TRUE, replacement='***') # see stri_replace_first
#' stri_sub_replace(x, stri_locate_last_regex(x, '[0-9]+'),
#' omit_na=TRUE, replacement='***') # see stri_replace_last
#'
#'
#' \dontrun{x |> stri_sub_replace(1, 5, replacement='new_substring')}
#' @family indexing
#' @rdname stri_sub
#' @export
stri_sub <- function(
    str, from = 1L, to = -1L, length,
    use_matrix=TRUE, ignore_negative_length=FALSE
) {
    # Collapse `use_matrix` to a strict TRUE/FALSE flag (a spelled-out
    # isTRUE()); base::length is explicit because `length` is also a
    # parameter of this function.
    use_matrix <- (
        is.logical(use_matrix) &&
        base::length(use_matrix) == 1L &&
        !is.na(use_matrix) &&
        use_matrix
    )

    if (!missing(length)) {
        # `length` supplied: `to` is never passed on in this branch
        if (!missing(to)) {
            warning("argument `to` is ignored in the current context")
        }
        if (use_matrix && is.matrix(from)) {
            # with a matrix `from`, the explicit `length` is dropped as well
            warning("argument `length` is ignored in the current context")
            length <- NULL
        }
        .Call(C_stri_sub, str, from, NULL, length, use_matrix, ignore_negative_length)
    } else {
        # no `length`: an explicit `to` is dropped when `from` is a matrix
        if (use_matrix && is.matrix(from) && !missing(to)) {
            warning("argument `to` is ignored in the current context")
            to <- NULL
        }
        .Call(C_stri_sub, str, from, to, NULL, use_matrix, ignore_negative_length)
    }
}
#' @rdname stri_sub
#' @export
`stri_sub<-` <- function(
    str, from = 1L, to = -1L, length, omit_na=FALSE, use_matrix=TRUE, value
) {
    # Collapse `use_matrix` to a strict TRUE/FALSE flag (a spelled-out
    # isTRUE()); base::length is explicit because `length` is also a
    # parameter of this function.
    use_matrix <- (
        is.logical(use_matrix) &&
        base::length(use_matrix) == 1L &&
        !is.na(use_matrix) &&
        use_matrix
    )

    if (!missing(length)) {
        # `length` supplied: `to` is never passed on in this branch
        if (!missing(to)) {
            warning("argument `to` is ignored in this context")
        }
        if (use_matrix && is.matrix(from)) {
            # with a matrix `from`, the explicit `length` is dropped as well
            warning("argument `length` is ignored in this context")
            length <- NULL
        }
        .Call(C_stri_sub_replacement, str, from, NULL, length, omit_na, value, use_matrix)
    } else {
        # no `length`: an explicit `to` is dropped when `from` is a matrix
        if (use_matrix && is.matrix(from) && !missing(to)) {
            warning("argument `to` is ignored in this context")
            to <- NULL
        }
        .Call(C_stri_sub_replacement, str, from, to, NULL, omit_na, value, use_matrix)
    }
}
#' @rdname stri_sub
#' @export
stri_sub_replace <- function(..., replacement, value = replacement) {
    # Ordinary-call front end to `stri_sub<-`: everything in `...` is
    # forwarded, and `value` defaults to `replacement` unless given.
    `stri_sub<-`(..., value = value)
}
#' @title
#' Extract or Replace Multiple Substrings
#'
#' @description
#' \code{stri_sub_all} extracts multiple substrings from each string.
#' Its replacement version substitutes (in-place) multiple substrings with the
#' corresponding replacement strings.
#' \code{stri_sub_replace_all} (alias \code{stri_sub_all_replace})
#' is its forward pipe operator-friendly variant, returning
#' a copy of the input vector.
#'
#' For extracting/replacing single substrings from/within each string, see
#' \code{\link{stri_sub}}.
#'
#' @details
#' Vectorized over \code{str}, [\code{value}], \code{from} and
#' (\code{to} or \code{length}). Just like in \code{\link{stri_sub}}, parameters
#' \code{to} and \code{length} are mutually exclusive.
#'
#' In one of the simplest scenarios, \code{stri_sub_all(str, from, to)},
#' the i-th element of the resulting list
#' is generated like \code{stri_sub(str[i], from[[i]], to[[i]])}.
#' As usual, if one of the inputs is shorter than the others,
#' recycling rule is applied.
#'
#'
#' If any of \code{from}, \code{to}, \code{length},
#' or \code{value} is not a list,
#' it is wrapped into a list.
#'
#' If \code{from} consists of a two-column matrix, then these two columns are
#' used as \code{from} and \code{to}, respectively,
#' unless the second column is named \code{length}.
#' Such types of index matrices are generated by
#' \code{\link{stri_locate_all}}.
#' If extraction or replacement based on \code{\link{stri_locate_first}}
#' or \code{\link{stri_locate_last}} is needed, see \code{\link{stri_sub}}.
#'
#' In the replacement function, the index ranges must be sorted
#' with respect to \code{from} and must be mutually disjoint.
#' Negative \code{length} does not result in any altering of the
#' corresponding input string. On the other hand, in \code{stri_sub_all},
#' this makes the corresponding chunk be ignored,
#' see \code{ignore_negative_length}, though.
#'
#' @param str character vector
#'
#' @param from list of integer vectors giving the start indexes; alternatively,
#' if \code{use_matrix=TRUE}, a list of two-column matrices of type
#' \code{cbind(from, to)}
#' (unnamed columns or the 2nd column named other than \code{length})
#' or \code{cbind(from, length=length)} (2nd column named \code{length})
#'
#' @param to list of integer vectors giving the end indexes
#'
#' @param length list of integer vectors giving the substring lengths
#'
#' @param omit_na single logical value; indicates whether missing values
#' in any of the indexes or in \code{value} leave the part of the
#' corresponding input string
#' unchanged [replacement function only]
#'
#' @param use_matrix single logical value; see \code{from}
#'
#' @param replacement alias of \code{value} [wherever applicable]
#'
#' @param value a list of character vectors defining the replacement strings
#' [replacement function only]
#'
#' @param ignore_negative_length single logical value; whether
#' negative lengths should be ignored or result in missing values
#'
#' @param ... arguments to be passed to \code{stri_sub_all<-}
#'
#'
#' @return
#' \code{stri_sub_all} returns a list of character vectors.
#' Its replacement versions modify the input 'in-place'.
#'
#' @examples
#' x <- c('12 3456 789', 'abc', '', NA, '667')
#' stri_sub_all(x, stri_locate_all_regex(x, '[0-9]+')) # see stri_extract_all
#' stri_sub_all(x, stri_locate_all_regex(x, '[0-9]+', omit_no_match=TRUE))
#'
#' stri_sub_all(x, stri_locate_all_regex(x, '[0-9]+', omit_no_match=TRUE)) <- '***'
#' print(x)
#'
#' stri_sub_replace_all('a b c', c(1, 3, 5), c(1, 3, 5), replacement=c('A', 'B', 'C'))
#'
#'
#' @family indexing
#' @rdname stri_sub_all
#' @export
stri_sub_all <- function(
    str, from = list(1L), to = list(-1L), length,
    use_matrix=TRUE, ignore_negative_length=TRUE
) {
    # Index arguments that are not lists are wrapped into one-element lists.
    if (!is.list(from)) {
        from <- list(from)
    }

    if (!missing(length)) {
        # `length` supplied: `to` is never passed on in this branch
        if (!missing(to)) {
            warning("argument `to` is ignored in this context")
        }
        if (!is.list(length)) {
            length <- list(length)
        }
        .Call(C_stri_sub_all, str, from, NULL, length, use_matrix, ignore_negative_length)
    } else {
        # only an explicitly given `to` may need wrapping; the default is a list
        if (!missing(to) && !is.list(to)) {
            to <- list(to)
        }
        .Call(C_stri_sub_all, str, from, to, NULL, use_matrix, ignore_negative_length)
    }
}
#' @rdname stri_sub_all
#' @export
`stri_sub_all<-` <- function(
    str, from = list(1L), to = list(-1L), length,
    omit_na=FALSE, use_matrix=TRUE, value
) {
    # Bare (non-list) inputs are wrapped in one-element lists before the
    # native call.
    if (!is.list(from)) from <- list(from)
    if (!is.list(value)) value <- list(value)

    if (!missing(length)) {
        # Substrings are given by (from, length); `to` plays no role here
        # and NULL is passed in its place.
        if (!missing(to)) warning("argument `to` is ignored in this context")
        if (!is.list(length)) length <- list(length)
        return(.Call(C_stri_sub_replacement_all, str, from, NULL, length, omit_na, value, use_matrix))
    }

    # Substrings are given by (from, to); a defaulted `to` is already a list,
    # so only an explicitly supplied one may need wrapping.
    if (!missing(to) && !is.list(to)) to <- list(to)
    .Call(C_stri_sub_replacement_all, str, from, to, NULL, omit_na, value, use_matrix)
}
#' @rdname stri_sub_all
#' @export
stri_sub_replace_all <- function(..., replacement, value=replacement)
{
    # Functional form of the replacement operator: everything in `...`
    # plus the replacement strings is forwarded to `stri_sub_all<-`.
    `stri_sub_all<-`(..., value=value)
}
#' @rdname stri_sub_all
#' @export
stri_sub_all_replace <- stri_sub_replace_all  # alias: binds the very same function under a second name
stringi/R/sort.R 0000644 0001762 0000144 00000035127 14750110641 013253 0 ustar ligges users # kate: default-dictionary en_US
## This file is part of the 'stringi' package for R.
## Copyright (c) 2013-2025, Marek Gagolewski
## All rights reserved.
##
## Redistribution and use in source and binary forms, with or without
## modification, are permitted provided that the following conditions are met:
##
## 1. Redistributions of source code must retain the above copyright notice,
## this list of conditions and the following disclaimer.
##
## 2. Redistributions in binary form must reproduce the above copyright notice,
## this list of conditions and the following disclaimer in the documentation
## and/or other materials provided with the distribution.
##
## 3. Neither the name of the copyright holder nor the names of its
## contributors may be used to endorse or promote products derived from
## this software without specific prior written permission.
##
## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
## 'AS IS' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING,
## BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
## FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
## HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
## SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
## PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
## OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
## WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
## OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
## EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#' @title String Sorting
#'
#' @description
#' This function sorts a character vector according to a locale-dependent
#' lexicographic order.
#'
#' @details
#' For more information on \pkg{ICU}'s Collator and how to tune it up
#' in \pkg{stringi}, refer to \code{\link{stri_opts_collator}}.
#'
#' As usual in \pkg{stringi}, non-character inputs are coerced to strings,
#' see an example below for a somewhat non-intuitive behavior of lexicographic
#' sorting on numeric inputs.
#'
#' This function uses a stable sort algorithm (\pkg{STL}'s \code{stable_sort}),
#' which performs up to \eqn{N*log^2(N)} element comparisons,
#' where \eqn{N} is the length of \code{str}.
#'
#' @param str a character vector
#' @param decreasing a single logical value; should the sort order
#' be nondecreasing (\code{FALSE}, default, i.e., weakly increasing)
#' or nonincreasing (\code{TRUE})?
#' @param na_last a single logical value; controls the treatment of \code{NA}s
#' in \code{str}. If \code{TRUE}, then missing values in \code{str} are put
#' at the end; if \code{FALSE}, they are put at the beginning;
#' if \code{NA}, then they are removed from the output
#' @param opts_collator a named list with \pkg{ICU} Collator's options,
#' see \code{\link{stri_opts_collator}}, \code{NULL}
#' for default collation options
#' @param ... additional settings for \code{opts_collator}
#'
#' @return
#' The result is a sorted version of \code{str},
#' i.e., a character vector.
#'
#' @references
#' \emph{Collation} - ICU User Guide,
#' \url{https://unicode-org.github.io/icu/userguide/collation/}
#'
#' @family locale_sensitive
#' @export
#' @rdname stri_sort
#'
#' @examples
#' stri_sort(c('hladny', 'chladny'), locale='pl_PL')
#' stri_sort(c('hladny', 'chladny'), locale='sk_SK')
#' stri_sort(sample(LETTERS))
#' stri_sort(c(1, 100, 2, 101, 11, 10)) # lexicographic order
#' stri_sort(c(1, 100, 2, 101, 11, 10), numeric=TRUE) # OK for integers
#' stri_sort(c(0.25, 0.5, 1, -1, -2, -3), numeric=TRUE) # incorrect
stri_sort <- function(str, decreasing = FALSE, na_last = NA, ..., opts_collator = NULL)
{
    # Settings given via `...` are combined with `opts_collator` and passed
    # through stri_opts_collator() to build the final option list.
    if (!missing(...)) {
        collator_args <- as.list(c(opts_collator, ...))
        opts_collator <- do.call(stri_opts_collator, collator_args)
    }

    .Call(C_stri_sort, str, decreasing, na_last, opts_collator)
}
#' @title Ordering Permutation
#'
#' @description
#' This function finds a permutation which rearranges the
#' strings in a given character vector into the ascending or descending
#' locale-dependent lexicographic order.
#'
#' @details
#' For more information on \pkg{ICU}'s Collator and how to tune it up
#' in \pkg{stringi}, refer to \code{\link{stri_opts_collator}}.
#'
#' As usual in \pkg{stringi}, non-character inputs are coerced to strings,
#' see an example below for a somewhat non-intuitive behavior of lexicographic
#' sorting on numeric inputs.
#'
#' This function uses a stable sort algorithm (\pkg{STL}'s \code{stable_sort}),
#' which performs up to \eqn{N*log^2(N)} element comparisons,
#' where \eqn{N} is the length of \code{str}.
#'
#' For ordering with regards to multiple criteria (such as sorting
#' data frames by more than 1 column), see \code{\link{stri_rank}}.
#'
#' @param str a character vector
#' @param decreasing a single logical value; should the sort order
#' be nondecreasing (\code{FALSE}, default)
#' or nonincreasing (\code{TRUE})?
#' @param na_last a single logical value; controls the treatment of \code{NA}s
#' in \code{str}. If \code{TRUE}, then missing values in \code{str} are put
#' at the end; if \code{FALSE}, they are put at the beginning;
#' if \code{NA}, then they are removed from the output
#' @param opts_collator a named list with \pkg{ICU} Collator's options,
#' see \code{\link{stri_opts_collator}}, \code{NULL}
#' for default collation options
#' @param ... additional settings for \code{opts_collator}
#'
#' @return The function yields an integer vector that gives the sort order.
#'
#' @references
#' \emph{Collation} - ICU User Guide,
#' \url{https://unicode-org.github.io/icu/userguide/collation/}
#'
#' @family locale_sensitive
#' @export
#' @rdname stri_order
#'
#' @examples
#' stri_order(c('hladny', 'chladny'), locale='pl_PL')
#' stri_order(c('hladny', 'chladny'), locale='sk_SK')
#'
#' stri_order(c(1, 100, 2, 101, 11, 10)) # lexicographic order
#' stri_order(c(1, 100, 2, 101, 11, 10), numeric=TRUE) # OK for integers
#' stri_order(c(0.25, 0.5, 1, -1, -2, -3), numeric=TRUE) # incorrect
stri_order <- function(str, decreasing = FALSE, na_last = TRUE, ..., opts_collator = NULL)
{
    # Settings given via `...` are combined with `opts_collator` and passed
    # through stri_opts_collator() to build the final option list.
    if (!missing(...)) {
        collator_args <- as.list(c(opts_collator, ...))
        opts_collator <- do.call(stri_opts_collator, collator_args)
    }

    .Call(C_stri_order, str, decreasing, na_last, opts_collator)
}
#' @title Extract Unique Elements
#'
#' @description
#' This function returns a character vector like \code{str},
#' but with duplicate elements removed.
#'
#' @details
#' As usual in \pkg{stringi}, no attributes are copied.
#' Unlike \code{\link{unique}}, this function
#' tests for canonical equivalence of strings (and not
#' whether the strings are just bytewise equal). Such an operation
#' is locale-dependent. Hence, \code{stri_unique} is significantly
#' slower (but much better suited for natural language processing)
#' than its base R counterpart.
#'
#' See also \code{\link{stri_duplicated}} for indicating non-unique elements.
#'
#' @param str a character vector
#' @param opts_collator a named list with \pkg{ICU} Collator's options,
#' see \code{\link{stri_opts_collator}}, \code{NULL}
#' for default collation options
#' @param ... additional settings for \code{opts_collator}
#'
#' @return Returns a character vector.
#'
#' @examples
#' # normalized and non-Unicode-normalized version of the same code point:
#' stri_unique(c('\u0105', stri_trans_nfkd('\u0105')))
#' unique(c('\u0105', stri_trans_nfkd('\u0105')))
#'
#' stri_unique(c('gro\u00df', 'GROSS', 'Gro\u00df', 'Gross'), strength=1)
#'
#' @references
#' \emph{Collation} - ICU User Guide,
#' \url{https://unicode-org.github.io/icu/userguide/collation/}
#'
#' @family locale_sensitive
#' @export
stri_unique <- function(str, ..., opts_collator = NULL)
{
    # Settings given via `...` are combined with `opts_collator` and passed
    # through stri_opts_collator() to build the final option list.
    if (!missing(...)) {
        collator_args <- as.list(c(opts_collator, ...))
        opts_collator <- do.call(stri_opts_collator, collator_args)
    }

    .Call(C_stri_unique, str, opts_collator)
}
#' @title
#' Determine Duplicated Elements
#'
#' @description
#' \code{stri_duplicated()} determines which strings in a character vector
#' are duplicates of other elements.
#'
#' \code{stri_duplicated_any()} determines if there are any duplicated
#' strings in a character vector.
#'
#' @details
#' Missing values are regarded as equal.
#'
#' Unlike \code{\link{duplicated}} and \code{\link{anyDuplicated}},
#' these functions test for canonical equivalence of strings
#' (and not whether the strings are just bytewise equal).
#' Such operations are locale-dependent.
#' Hence, \code{stri_duplicated} and \code{stri_duplicated_any}
#' are significantly slower (but much better suited for natural language
#' processing) than their base R counterparts.
#'
#' See also \code{\link{stri_unique}} for extracting unique elements.
#'
#' @param str a character vector
#' @param from_last a single logical value;
#' indicates whether search should be performed from the last to the
#' first string
#' @param fromLast [DEPRECATED] alias of \code{from_last}
#' @param opts_collator a named list with \pkg{ICU} Collator's options,
#' see \code{\link{stri_opts_collator}}, \code{NULL}
#' for default collation options
#' @param ... additional settings for \code{opts_collator}
#'
#' @return
#' \code{stri_duplicated()} returns a logical vector of the same length
#' as \code{str}. Each of its elements indicates whether a canonically
#' equivalent string was already found in \code{str}.
#'
#' \code{stri_duplicated_any()} returns a single non-negative integer.
#' Value of 0 indicates that all the elements in \code{str} are unique.
#' Otherwise, it gives the index of the first non-unique element.
#'
#' @references
#' \emph{Collation} - ICU User Guide,
#' \url{https://unicode-org.github.io/icu/userguide/collation/}
#'
#' @examples
#' # In the following examples, we have 3 duplicated values,
#' # 'a' - 2 times, NA - 1 time
#' stri_duplicated(c('a', 'b', 'a', NA, 'a', NA))
#' stri_duplicated(c('a', 'b', 'a', NA, 'a', NA), from_last=TRUE)
#' stri_duplicated_any(c('a', 'b', 'a', NA, 'a', NA))
#'
#' # compare the results:
#' stri_duplicated(c('\u0105', stri_trans_nfkd('\u0105')))
#' duplicated(c('\u0105', stri_trans_nfkd('\u0105')))
#'
#' stri_duplicated(c('gro\u00df', 'GROSS', 'Gro\u00df', 'Gross'), strength=1)
#' duplicated(c('gro\u00df', 'GROSS', 'Gro\u00df', 'Gross'))
#'
#' @rdname stri_duplicated
#' @family locale_sensitive
#' @export
stri_duplicated <- function(str, from_last = FALSE,
    fromLast = from_last, ..., opts_collator = NULL) {
    # DEPRECATED: an explicitly supplied `fromLast` still overrides
    # `from_last`, but only after a warning has been raised.
    if (!missing(fromLast)) {
        warning("The 'fromLast' argument in stri_duplicated is a deprecated alias of 'from_last' and will be removed in a future release of 'stringi'.")
        from_last <- fromLast
    }

    # Settings given via `...` are combined with `opts_collator` and passed
    # through stri_opts_collator() to build the final option list.
    if (!missing(...)) {
        collator_args <- as.list(c(opts_collator, ...))
        opts_collator <- do.call(stri_opts_collator, collator_args)
    }

    .Call(C_stri_duplicated, str, from_last, opts_collator)
}
#' @rdname stri_duplicated
#' @export
stri_duplicated_any <- function(str, from_last = FALSE, fromLast = from_last, ...,
    opts_collator = NULL) {
    # DEPRECATED: an explicitly supplied `fromLast` still overrides
    # `from_last`, but only after a warning has been raised.
    if (!missing(fromLast)) {
        warning("The 'fromLast' argument in stri_duplicated_any is a deprecated alias of 'from_last' and will be removed in a future release of 'stringi'.")
        from_last <- fromLast
    }

    # Settings given via `...` are combined with `opts_collator` and passed
    # through stri_opts_collator() to build the final option list.
    if (!missing(...)) {
        collator_args <- as.list(c(opts_collator, ...))
        opts_collator <- do.call(stri_opts_collator, collator_args)
    }

    .Call(C_stri_duplicated_any, str, from_last, opts_collator)
}
#' @title
#' Sort Keys
#'
#' @description
#' This function computes a locale-dependent sort key, which is an alternative
#' character representation of the string that, when ordered in the C locale
#' (which orders using the underlying bytes directly), will give an equivalent
#' ordering to the original string. It is useful for enhancing algorithms
#' that sort only in the C locale (e.g., the \code{strcmp} function in libc)
#' with the ability to be locale-aware.
#'
#' @details
#' For more information on \pkg{ICU}'s Collator and how to tune it up
#' in \pkg{stringi}, refer to \code{\link{stri_opts_collator}}.
#'
#' See also \code{\link{stri_rank}} for ranking strings within a single character
#' vector, i.e., generating relative sort keys.
#'
#' @param str a character vector
#' @param opts_collator a named list with \pkg{ICU} Collator's options,
#' see \code{\link{stri_opts_collator}}, \code{NULL}
#' for default collation options
#' @param ... additional settings for \code{opts_collator}
#'
#' @return
#' The result is a character vector with the same length as \code{str} that
#' contains the sort keys. The output is marked as \code{bytes}-encoded.
#'
#' @references
#' \emph{Collation} - ICU User Guide,
#' \url{https://unicode-org.github.io/icu/userguide/collation/}
#'
#' @examples
#' stri_sort_key(c('hladny', 'chladny'), locale='pl_PL')
#' stri_sort_key(c('hladny', 'chladny'), locale='sk_SK')
#'
#' @family locale_sensitive
#' @export
#' @rdname stri_sort_key
stri_sort_key <- function(str, ..., opts_collator = NULL)
{
    # Settings given via `...` are combined with `opts_collator` and passed
    # through stri_opts_collator() to build the final option list.
    if (!missing(...)) {
        collator_args <- as.list(c(opts_collator, ...))
        opts_collator <- do.call(stri_opts_collator, collator_args)
    }

    .Call(C_stri_sort_key, str, opts_collator)
}
#' @title
#' Ranking
#'
#' @description
#' This function ranks each string in a character vector according to a
#' locale-dependent lexicographic order.
#' It is a portable replacement for the base \code{xtfrm} function.
#'
#' @details
#' Missing values result in missing ranks and tied observations receive
#' the same ranks (based on min).
#'
#' For more information on \pkg{ICU}'s Collator and how to tune it up
#' in \pkg{stringi}, refer to \code{\link{stri_opts_collator}}.
#'
#' @param str a character vector
#' @param opts_collator a named list with \pkg{ICU} Collator's options,
#' see \code{\link{stri_opts_collator}}, \code{NULL}
#' for default collation options
#' @param ... additional settings for \code{opts_collator}
#'
#' @return
#' The result is a vector of ranks corresponding to each
#' string in \code{str}.
#'
#' @references
#' \emph{Collation} -- ICU User Guide,
#' \url{https://unicode-org.github.io/icu/userguide/collation/}
#'
#' @examples
#' stri_rank(c('hladny', 'chladny'), locale='pl_PL')
#' stri_rank(c('hladny', 'chladny'), locale='sk_SK')
#'
#' stri_rank("a" %s+% c(1, 100, 2, 101, 11, 10)) # lexicographic order
#' stri_rank("a" %s+% c(1, 100, 2, 101, 11, 10), numeric=TRUE) # OK
#' stri_rank("a" %s+% c(0.25, 0.5, 1, -1, -2, -3), numeric=TRUE) # incorrect
#'
#' # Ordering a data frame with respect to two criteria:
#' X <- data.frame(a=c("b", NA, "b", "b", NA, "a", "a", "c"), b=runif(8))
#' X[order(stri_rank(X$a), X$b), ]
#'
#' @family locale_sensitive
#' @export
#' @rdname stri_rank
stri_rank <- function(str, ..., opts_collator=NULL)
{
    # Settings given via `...` are combined with `opts_collator` and passed
    # through stri_opts_collator() to build the final option list.
    if (!missing(...)) {
        collator_args <- as.list(c(opts_collator, ...))
        opts_collator <- do.call(stri_opts_collator, collator_args)
    }

    .Call(C_stri_rank, str, opts_collator)
}
stringi/R/search_detect_4.R 0000644 0001762 0000144 00000015440 14750110641 015300 0 ustar ligges users # kate: default-dictionary en_US
## This file is part of the 'stringi' package for R.
## Copyright (c) 2013-2025, Marek Gagolewski
## All rights reserved.
##
## Redistribution and use in source and binary forms, with or without
## modification, are permitted provided that the following conditions are met:
##
## 1. Redistributions of source code must retain the above copyright notice,
## this list of conditions and the following disclaimer.
##
## 2. Redistributions in binary form must reproduce the above copyright notice,
## this list of conditions and the following disclaimer in the documentation
## and/or other materials provided with the distribution.
##
## 3. Neither the name of the copyright holder nor the names of its
## contributors may be used to endorse or promote products derived from
## this software without specific prior written permission.
##
## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
## 'AS IS' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING,
## BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
## FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
## HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
## SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
## PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
## OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
## WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
## OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
## EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#' @title
#' Detect Pattern Occurrences
#'
#' @description
#' These functions determine, for each string in \code{str},
#' if there is at least one match to a corresponding \code{pattern}.
#'
#' @details
#' Vectorized over \code{str} and \code{pattern} (with recycling
#' of the elements in the shorter vector if necessary). This allows to,
#' for instance, search for one pattern in each given string,
#' search for each pattern in one given string,
#' and search for the i-th pattern within the i-th string.
#'
#' If \code{pattern} is empty, then the result is \code{NA}
#' and a warning is generated.
#'
#' \code{stri_detect} is a convenience function.
#' It calls either \code{stri_detect_regex},
#' \code{stri_detect_fixed}, \code{stri_detect_coll},
#' or \code{stri_detect_charclass}, depending on the argument used.
#'
#' See also \code{\link{stri_startswith}} and \code{\link{stri_endswith}}
#' for testing whether a string starts or ends with a match to a given pattern.
#' Moreover, see \code{\link{stri_subset}} for a character vector subsetting.
#'
#' If \code{max_count} is negative, then all strings are examined.
#' Otherwise, searching terminates
#' once \code{max_count} matches (or, if \code{negate} is \code{TRUE},
#' no-matches) are detected. The uninspected cases are marked
#' as missing in the return vector. Be aware that, unless \code{pattern} is a
#' singleton, the elements in \code{str} might be inspected in a
#' non-consecutive order.
#'
#'
#' @param str character vector; strings to search in
#' @param pattern,regex,fixed,coll,charclass character vector;
#' search patterns; for more details refer to \link{stringi-search}
#' @param negate single logical value; whether a no-match to a pattern
#' is rather of interest
#' @param max_count single integer; allows to stop searching once a given
#' number of occurrences is detected; \code{-1} (the default) inspects all
#' elements
#' @param opts_collator,opts_fixed,opts_regex a named list used to tune up
#' the search engine's settings; see
#' \code{\link{stri_opts_collator}}, \code{\link{stri_opts_fixed}},
#' and \code{\link{stri_opts_regex}}, respectively; \code{NULL}
#' for the defaults
#' @param ... supplementary arguments passed to the underlying functions,
#' including additional settings for \code{opts_collator}, \code{opts_regex},
#' \code{opts_fixed}, and so on
#'
#' @return Each function returns a logical vector.
#'
#' @examples
#' stri_detect_fixed(c('stringi R', 'R STRINGI', '123'), c('i', 'R', '0'))
#' stri_detect_fixed(c('stringi R', 'R STRINGI', '123'), 'R')
#'
#' stri_detect_charclass(c('stRRRingi','R STRINGI', '123'),
#' c('\\p{Ll}', '\\p{Lu}', '\\p{Zs}'))
#'
#' stri_detect_regex(c('stringi R', 'R STRINGI', '123'), 'R.')
#' stri_detect_regex(c('stringi R', 'R STRINGI', '123'), '[[:alpha:]]*?')
#' stri_detect_regex(c('stringi R', 'R STRINGI', '123'), '[a-zC1]')
#' stri_detect_regex(c('stringi R', 'R STRINGI', '123'), '( R|RE)')
#' stri_detect_regex('stringi', 'STRING.', case_insensitive=TRUE)
#'
#' stri_detect_regex(c('abc', 'def', '123', 'ghi', '456', '789', 'jkl'),
#' '^[0-9]+$', max_count=1)
#' stri_detect_regex(c('abc', 'def', '123', 'ghi', '456', '789', 'jkl'),
#' '^[0-9]+$', max_count=2)
#' stri_detect_regex(c('abc', 'def', '123', 'ghi', '456', '789', 'jkl'),
#' '^[0-9]+$', negate=TRUE, max_count=3)
#'
#' @family search_detect
#' @export
#' @rdname stri_detect
stri_detect <- function(str, ..., regex, fixed, coll, charclass)
{
    # Record which of the four mutually exclusive pattern arguments was given.
    supplied <- c(
        regex = !missing(regex),
        fixed = !missing(fixed),
        coll = !missing(coll),
        charclass = !missing(charclass)
    )

    if (sum(supplied) != 1)
        stop("you have to specify one of: `regex`, `fixed`, `coll`, or `charclass`")

    # Exactly one flag is set at this point, so its name selects the
    # search-engine-specific function to delegate to.
    switch(names(supplied)[supplied],
        regex = stri_detect_regex(str, regex, ...),
        fixed = stri_detect_fixed(str, fixed, ...),
        coll = stri_detect_coll(str, coll, ...),
        charclass = stri_detect_charclass(str, charclass, ...)
    )
}
#' @export
#' @rdname stri_detect
stri_detect_fixed <- function(
    str, pattern, negate=FALSE, max_count=-1, ...,
    opts_fixed=NULL
) {
    # Settings given via `...` are combined with `opts_fixed` and passed
    # through stri_opts_fixed() to build the final option list.
    if (!missing(...)) {
        fixed_args <- as.list(c(opts_fixed, ...))
        opts_fixed <- do.call(stri_opts_fixed, fixed_args)
    }

    .Call(C_stri_detect_fixed, str, pattern, negate, max_count, opts_fixed)
}
#' @export
#' @rdname stri_detect
stri_detect_charclass <- function(str, pattern, negate = FALSE, max_count = -1)
    .Call(C_stri_detect_charclass, str, pattern, negate, max_count)  # thin wrapper; no options to assemble
#' @export
#' @rdname stri_detect
stri_detect_coll <- function(
    str, pattern, negate = FALSE,
    max_count = -1, ..., opts_collator = NULL
) {
    # Settings given via `...` are combined with `opts_collator` and passed
    # through stri_opts_collator() to build the final option list.
    if (!missing(...)) {
        collator_args <- as.list(c(opts_collator, ...))
        opts_collator <- do.call(stri_opts_collator, collator_args)
    }

    .Call(C_stri_detect_coll, str, pattern, negate, max_count, opts_collator)
}
#' @export
#' @rdname stri_detect
stri_detect_regex <- function(
    str, pattern, negate = FALSE,
    max_count = -1, ...,
    opts_regex = NULL
) {
    # Settings given via `...` are combined with `opts_regex` and passed
    # through stri_opts_regex() to build the final option list.
    if (!missing(...)) {
        regex_args <- as.list(c(opts_regex, ...))
        opts_regex <- do.call(stri_opts_regex, regex_args)
    }

    .Call(C_stri_detect_regex, str, pattern, negate, max_count, opts_regex)
}
stringi/R/encoding_detection.R 0000644 0001762 0000144 00000027245 14750110641 016112 0 ustar ligges users # kate: default-dictionary en_US
## This file is part of the 'stringi' package for R.
## Copyright (c) 2013-2025, Marek Gagolewski
## All rights reserved.
##
## Redistribution and use in source and binary forms, with or without
## modification, are permitted provided that the following conditions are met:
##
## 1. Redistributions of source code must retain the above copyright notice,
## this list of conditions and the following disclaimer.
##
## 2. Redistributions in binary form must reproduce the above copyright notice,
## this list of conditions and the following disclaimer in the documentation
## and/or other materials provided with the distribution.
##
## 3. Neither the name of the copyright holder nor the names of its
## contributors may be used to endorse or promote products derived from
## this software without specific prior written permission.
##
## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
## 'AS IS' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING,
## BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
## FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
## HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
## SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
## PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
## OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
## WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
## OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
## EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#' @title
#' Check If a Data Stream Is Possibly in UTF-16 or UTF-32
#'
#' @description
#' These functions detect whether a given byte stream is
#' valid UTF-16LE, UTF-16BE, UTF-32LE, or UTF-32BE.
#'
#' @details
#' These functions are independent of the way \R marks encodings in
#' character strings (see \link{Encoding} and \link{stringi-encoding}).
#' Most often, these functions act on raw vectors.
#'
#' A result of \code{FALSE} means that a string is surely not valid UTF-16
#' or UTF-32. However, false positives are possible.
#'
#' Also note that a data stream may be sometimes classified
#' as both valid UTF-16LE and UTF-16BE.
#'
#' @param str character vector, a raw vector, or
#' a list of \code{raw} vectors
#'
#' @return Returns a logical vector.
#'
#' @rdname stri_enc_isutf16
#' @family encoding_detection
#' @export
stri_enc_isutf16be <- function(str)
    .Call(C_stri_enc_isutf16be, str)  # thin wrapper around the native routine
#' @rdname stri_enc_isutf16
#' @export
stri_enc_isutf16le <- function(str)
    .Call(C_stri_enc_isutf16le, str)  # thin wrapper around the native routine
#' @rdname stri_enc_isutf16
#' @export
stri_enc_isutf32be <- function(str)
    .Call(C_stri_enc_isutf32be, str)  # thin wrapper around the native routine
#' @rdname stri_enc_isutf16
#' @export
stri_enc_isutf32le <- function(str)
    .Call(C_stri_enc_isutf32le, str)  # thin wrapper around the native routine
#' @title
#' Check If a Data Stream Is Possibly in ASCII
#'
#' @description
#' The function checks whether all bytes in a string are <= 127.
#'
#' @details
#' This function is independent of the way \R marks encodings in
#' character strings (see \link{Encoding} and \link{stringi-encoding}).
#'
#' @param str character vector, a raw vector, or
#' a list of \code{raw} vectors
#'
#' @return Returns a logical vector.
#' The i-th element indicates whether the i-th string
#' corresponds to a valid ASCII byte sequence.
#'
#' @examples
#' stri_enc_isascii(letters[1:3])
#' stri_enc_isascii('\u0105\u0104')
#'
#' @family encoding_detection
#' @export
stri_enc_isascii <- function(str)
    .Call(C_stri_enc_isascii, str)  # thin wrapper around the native routine
#' @title
#' Check If a Data Stream Is Possibly in UTF-8
#'
#' @description
#' The function checks whether a given sequence of bytes forms
#' a proper UTF-8 string.
#'
#' @details
#' \code{FALSE} means that a string is certainly not valid UTF-8.
#' However, false positives are possible. For instance,
#' \code{(c4,85)} represents ('a with ogonek') in UTF-8
#' as well as ('A umlaut', 'Ellipsis') in WINDOWS-1250.
#' Also note that UTF-8, as well as most 8-bit encodings, extend ASCII
#' (note that \code{\link{stri_enc_isascii}} implies
#' \code{\link{stri_enc_isutf8}}).
#'
#' However, the longer the sequence,
#' the greater the possibility that the result
#' is indeed in UTF-8 -- this is because not all sequences of bytes
#' are valid UTF-8.
#'
#' This function is independent of the way \R marks encodings in
#' character strings (see \link{Encoding} and \link{stringi-encoding}).
#'
#' @param str character vector, a raw vector, or
#' a list of \code{raw} vectors
#'
#' @return Returns a logical vector.
#' Its i-th element indicates whether the i-th string
#' corresponds to a valid UTF-8 byte sequence.
#'
#' @examples
#' stri_enc_isutf8(letters[1:3])
#' stri_enc_isutf8('\u0105\u0104')
#' stri_enc_isutf8('\u1234\u0222')
#'
#' @family encoding_detection
#' @export
stri_enc_isutf8 <- function(str)
    .Call(C_stri_enc_isutf8, str)  # thin wrapper around the native routine
#' @title
#' Detect Character Set and Language
#'
#' @description
#' This function uses the \pkg{ICU} engine to determine the character set,
#' or encoding, of character data in an unknown format.
#'
#' @details
#' Vectorized over \code{str} and \code{filter_angle_brackets}.
#'
#' For a character vector input, merging all text lines
#' via \code{\link{stri_flatten}(str, collapse='\n')}
#' might be needed if \code{str} has been obtained via a call to
#' \code{readLines} and in fact represents an image of a single text file.
#'
#' This is, at best, an imprecise operation using statistics and heuristics.
#' Because of this, detection works best if you supply at least a few hundred
#' bytes of character data that is mostly in a single language.
#' However, because the detection only looks at a limited amount of the input
#' data, some of the returned character sets may fail to handle all of the
#' input data. Note that in some cases,
#' the language can be determined along with the encoding.
#'
#' Several different techniques are used for character set detection.
#' For multi-byte encodings, the sequence of bytes is checked for legible
#' patterns. The detected characters are also checked against a list of
#' frequently used characters in that encoding. For single byte encodings,
#' the data is checked against a list of the most commonly occurring three
#' letter groups for each language that can be written using that encoding.
#'
#' The detection process can be configured to optionally ignore
#' HTML or XML style markup (using \pkg{ICU}'s internal facilities),
#' which can interfere with the detection
#' process by changing the statistics.
#'
#' This function should most often be used for byte-marked input strings,
#' especially after loading them from text files and before the main
#' conversion with \code{\link{stri_encode}}.
#' The input encoding is of course not taken into account here, even
#' if marked.
#'
#' The following table shows all the encodings that can be detected:
#'
#' \tabular{ll}{
#' \strong{Character_Set} \tab \strong{Languages}\cr
#' UTF-8 \tab -- \cr
#' UTF-16BE \tab -- \cr
#' UTF-16LE \tab -- \cr
#' UTF-32BE \tab -- \cr
#' UTF-32LE \tab -- \cr
#' Shift_JIS \tab Japanese \cr
#' ISO-2022-JP \tab Japanese \cr
#' ISO-2022-CN \tab Simplified Chinese \cr
#' ISO-2022-KR \tab Korean \cr
#' GB18030 \tab Chinese \cr
#' Big5 \tab Traditional Chinese \cr
#' EUC-JP \tab Japanese \cr
#' EUC-KR \tab Korean \cr
#' ISO-8859-1 \tab Danish, Dutch, English, French, German, Italian, Norwegian, Portuguese, Swedish \cr
#' ISO-8859-2 \tab Czech, Hungarian, Polish, Romanian \cr
#' ISO-8859-5 \tab Russian \cr
#' ISO-8859-6 \tab Arabic \cr
#' ISO-8859-7 \tab Greek \cr
#' ISO-8859-8 \tab Hebrew \cr
#' ISO-8859-9 \tab Turkish \cr
#' windows-1250 \tab Czech, Hungarian, Polish, Romanian \cr
#' windows-1251 \tab Russian \cr
#' windows-1252 \tab Danish, Dutch, English, French, German, Italian, Norwegian, Portuguese, Swedish \cr
#' windows-1253 \tab Greek \cr
#' windows-1254 \tab Turkish \cr
#' windows-1255 \tab Hebrew \cr
#' windows-1256 \tab Arabic \cr
#' KOI8-R \tab Russian \cr
#' IBM420 \tab Arabic \cr
#' IBM424 \tab Hebrew \cr
#' }
#'
#'
#' @param str character vector, a raw vector, or
#' a list of \code{raw} vectors
#'
#' @param filter_angle_brackets logical; If filtering is enabled,
#' text within angle brackets ('<' and '>') will be removed before detection,
#' which will remove most HTML or XML markup.
#'
#' @return Returns a list of length equal to the length of \code{str}.
#' Each list element is a data frame with the following three named vectors
#' representing all the guesses:
#' \itemize{
#' \item \code{Encoding} -- string; guessed encodings; \code{NA} on failure,
#' \item \code{Language} -- string; guessed languages; \code{NA} if the language could
#' not be determined (e.g., in case of UTF-8),
#' \item \code{Confidence} -- numeric in [0,1]; the higher the value,
#' the more confidence there is in the match; \code{NA} on failure.
#' }
#' The guesses are ordered by decreasing confidence.
#'
#' @examples
#' ## Not run:
#' ## f <- rawToChar(readBin('test.txt', 'raw', 100000))
#' ## stri_enc_detect(f)
#'
#' @references
#' \emph{Character Set Detection} -- ICU User Guide,
#' \url{https://unicode-org.github.io/icu/userguide/conversion/detection.html}
#'
#' @family encoding_detection
#' @export
stri_enc_detect <- function(str, filter_angle_brackets = FALSE)
{
    # raw detection results as returned by the compiled routine
    guesses <- .Call(C_stri_enc_detect, str, filter_angle_brackets)

    # convert every element to a data frame; strings stay character vectors
    lapply(guesses, as.data.frame, stringsAsFactors = FALSE)
}
#' @title
#' [DEPRECATED] Detect Locale-Sensitive Character Encoding
#'
#' @description
#' This function tries to detect character encoding
#' in case the language of text is known.
#'
#'
#' @details
#' Vectorized over \code{str}.
#'
#' First, the text is checked whether it is valid
#' UTF-32BE, UTF-32LE, UTF-16BE, UTF-16LE, UTF-8
#' (as in \code{\link{stri_enc_detect}},
#' this is roughly inspired by \pkg{ICU}'s \code{i18n/csrucode.cpp}) or ASCII.
#'
#'
#' If \code{locale} is not \code{NA} and the above fails,
#' the text is checked for the number of occurrences
#' of language-specific code points (data provided by the \pkg{ICU} library)
#' converted to all possible 8-bit encodings
#' that fully cover the indicated language.
#' The encoding is selected based on the greatest number of total
#' byte hits.
#'
#' The guess is of course imprecise,
#' as it is obtained using statistics and heuristics.
#' Because of this, detection works best if you supply at least a few hundred
#' bytes of character data that is in a single language.
#'
#'
#' If you have no initial guess on the language and encoding, try with
#' \code{\link{stri_enc_detect}} (uses \pkg{ICU} facilities).
#'
#' @param str character vector, a raw vector, or
#' a list of \code{raw} vectors
#' @param locale \code{NULL} or \code{''} for the default locale,
#' or a single string with locale identifier.
#'
#' @return
#' Just like \code{\link{stri_enc_detect}},
#' this function returns a list of length equal to the length of \code{str}.
#' Each list element is a data frame with the following three named components:
#' \itemize{
#' \item \code{Encoding} -- string; guessed encodings; \code{NA} on failure
#' (if and only if \code{encodings} is empty),
#' \item \code{Language} -- always \code{NA},
#' \item \code{Confidence} -- numeric in [0,1]; the higher the value,
#' the more confidence there is in the match; \code{NA} on failure.
#' }
#' The guesses are ordered by decreasing confidence.
#'
#' @family locale_sensitive
#' @family encoding_detection
#' @export
stri_enc_detect2 <- function(str, locale = NULL)
{
    # the deprecation notice is always emitted ...
    warning("stri_enc_detect2 is deprecated and will be removed in a future release of 'stringi'.")

    # ... whereas warnings raised while detecting/converting are silenced
    suppressWarnings({
        guesses <- .Call(C_stri_enc_detect2, str, locale)
        lapply(guesses, as.data.frame, stringsAsFactors = FALSE)
    })
}
stringi/R/search_split_bound.R 0000644 0001762 0000144 00000016455 14750110641 016136 0 ustar ligges users # kate: default-dictionary en_US
## This file is part of the 'stringi' package for R.
## Copyright (c) 2013-2025, Marek Gagolewski
## All rights reserved.
##
## Redistribution and use in source and binary forms, with or without
## modification, are permitted provided that the following conditions are met:
##
## 1. Redistributions of source code must retain the above copyright notice,
## this list of conditions and the following disclaimer.
##
## 2. Redistributions in binary form must reproduce the above copyright notice,
## this list of conditions and the following disclaimer in the documentation
## and/or other materials provided with the distribution.
##
## 3. Neither the name of the copyright holder nor the names of its
## contributors may be used to endorse or promote products derived from
## this software without specific prior written permission.
##
## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
## 'AS IS' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING,
## BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
## FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
## HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
## SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
## PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
## OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
## WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
## OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
## EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#' @title
#' Split a String Into Text Lines
#'
#' @description
#' These functions split each character string in a given vector
#' into text lines.
#'
#' @details
#' Vectorized over \code{str} and \code{omit_empty}.
#'
#' \code{omit_empty} is applied when splitting. If set to \code{TRUE},
#' then empty strings will never appear in the resulting vector.
#'
#' Newlines are represented with the Carriage Return
#' (CR, 0x0D), Line Feed (LF, 0x0A), CRLF, or Next Line (NEL, 0x85) characters,
#' depending on the platform.
#' Moreover, the Unicode Standard defines two unambiguous separator characters,
#' the Paragraph Separator (PS, 0x2029) and the Line Separator (LS, 0x2028).
#' Sometimes also the Vertical Tab (VT, 0x0B) and the Form Feed (FF, 0x0C)
#' are used for this purpose.
#'
#' These \pkg{stringi} functions follow UTR#18 rules,
#' where a newline sequence
#' corresponds to the following regular expression:
#' \code{(?:\\u\{D A\}|(?!\\u\{D A\})[\\u\{A\}-\\u\{D\}\\u\{85\}\\u\{2028\}\\u\{2029\}])}.
#' Each match serves as a text line separator.
#'
#'
#' @param str character vector (\code{stri_split_lines})
#' or a single string (\code{stri_split_lines1})
#' @param omit_empty logical vector; determines whether empty
#' strings should be removed from the result
#' [\code{stri_split_lines} only]
#'
#' @return \code{stri_split_lines} returns a list of character vectors.
#' If any input string is \code{NA}, then the corresponding list element
#' is a single \code{NA} string.
#'
#' \code{stri_split_lines1(str)} is equivalent to
#' \code{stri_split_lines(str[1])[[1]]} (with default parameters),
#' therefore it returns a character vector. Moreover, if the input string
#' ends with a newline sequence, the last empty string is omitted from the
#' result. This function may come in handy if you wish to split a text
#' file's contents into text lines.
#'
#' @references
#' \emph{Unicode Newline Guidelines} -- Unicode Technical Report #13,
#' \url{https://www.unicode.org/standard/reports/tr13/tr13-5.html}
#'
#' \emph{Unicode Regular Expressions} -- Unicode Technical Standard #18,
#' \url{https://www.unicode.org/reports/tr18/}
#'
#' @family search_split
#' @family text_boundaries
#' @export
#' @rdname stri_split_lines
#' @aliases stri_split_lines stri_split_lines1
stri_split_lines <- function(str, omit_empty = FALSE) {
# Thin wrapper: both arguments are handed unchanged to the compiled
# routine C_stri_split_lines, whose result is returned as-is.
.Call(C_stri_split_lines, str, omit_empty)
}
#' @rdname stri_split_lines
#' @export
stri_split_lines1 <- function(str) {
# Thin wrapper: `str` is handed unchanged to the compiled routine
# C_stri_split_lines1, whose result is returned as-is.
.Call(C_stri_split_lines1, str)
}
#' @title
#' Split a String at Text Boundaries
#'
#' @description
#' This function locates text boundaries
#' (like character, word, line, or sentence boundaries)
#' and splits strings at the indicated positions.
#'
#' @details
#' Vectorized over \code{str} and \code{n}.
#'
#' If \code{n} is negative (the default), then all text pieces are extracted.
#'
#' Otherwise, if \code{tokens_only} is \code{FALSE} (which is the default),
#' then \code{n-1} tokens are extracted (if possible) and the \code{n}-th string
#' gives the (non-split) remainder (see Examples).
#' On the other hand, if \code{tokens_only} is \code{TRUE},
#' then only full tokens (up to \code{n} pieces) are extracted.
#'
#' For more information on text boundary analysis
#' performed by \pkg{ICU}'s \code{BreakIterator}, see
#' \link{stringi-search-boundaries}.
#'
#' @param str character vector or an object coercible to
#' @param n integer vector, maximal number of strings to return
#' @param tokens_only single logical value; may affect the result if \code{n}
#' is positive, see Details
#' @param simplify single logical value; if \code{TRUE} or \code{NA},
#' then a character matrix is returned; otherwise (the default), a list of
#' character vectors is given, see Value
#' @param opts_brkiter a named list with \pkg{ICU} BreakIterator's settings,
#' see \code{\link{stri_opts_brkiter}}; \code{NULL} for the
#' default break iterator, i.e., \code{line_break}
#' @param ... additional settings for \code{opts_brkiter}
#'
#' @return If \code{simplify=FALSE} (the default),
#' then the functions return a list of character vectors.
#'
#' Otherwise, \code{\link{stri_list2matrix}} with \code{byrow=TRUE}
#' and \code{n_min=n} arguments is called on the resulting object.
#' In such a case, a character matrix with \code{length(str)} rows
#' is returned. Note that \code{\link{stri_list2matrix}}'s \code{fill}
#' argument is set to an empty string and \code{NA},
#' for \code{simplify} equal to \code{TRUE} and \code{NA}, respectively.
#'
#' @examples
#' test <- 'The\u00a0above-mentioned features are very useful. ' %s+%
#' 'Spam, spam, eggs, bacon, and spam. 123 456 789'
#' stri_split_boundaries(test, type='line')
#' stri_split_boundaries(test, type='word')
#' stri_split_boundaries(test, type='word', skip_word_none=TRUE)
#' stri_split_boundaries(test, type='word', skip_word_none=TRUE, skip_word_letter=TRUE)
#' stri_split_boundaries(test, type='word', skip_word_none=TRUE, skip_word_number=TRUE)
#' stri_split_boundaries(test, type='sentence')
#' stri_split_boundaries(test, type='sentence', skip_sentence_sep=TRUE)
#' stri_split_boundaries(test, type='character')
#'
#' # a filtered break iterator with the new ICU:
#' stri_split_boundaries('Mr. Jones and Mrs. Brown are very happy.
#' So am I, Prof. Smith.', type='sentence', locale='en_US@ss=standard') # ICU >= 56 only
#'
#' @export
#' @family search_split
#' @family locale_sensitive
#' @family text_boundaries
stri_split_boundaries <- function(str, n = -1L,
    tokens_only = FALSE, simplify = FALSE,
    ..., opts_brkiter = NULL)
{
    # settings supplied via `...` are combined with `opts_brkiter`
    # and passed on to stri_opts_brkiter()
    if (!missing(...)) {
        settings <- as.list(c(opts_brkiter, ...))
        opts_brkiter <- do.call(stri_opts_brkiter, settings)
    }

    .Call(C_stri_split_boundaries, str, n, tokens_only, simplify, opts_brkiter)
}
stringi/R/utils.R 0000644 0001762 0000144 00000015244 14750110641 013422 0 ustar ligges users # kate: default-dictionary en_US
## This file is part of the 'stringi' package for R.
## Copyright (c) 2013-2025, Marek Gagolewski
## All rights reserved.
##
## Redistribution and use in source and binary forms, with or without
## modification, are permitted provided that the following conditions are met:
##
## 1. Redistributions of source code must retain the above copyright notice,
## this list of conditions and the following disclaimer.
##
## 2. Redistributions in binary form must reproduce the above copyright notice,
## this list of conditions and the following disclaimer in the documentation
## and/or other materials provided with the distribution.
##
## 3. Neither the name of the copyright holder nor the names of its
## contributors may be used to endorse or promote products derived from
## this software without specific prior written permission.
##
## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
## 'AS IS' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING,
## BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
## FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
## HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
## SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
## PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
## OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
## WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
## OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
## EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#' @title
#' Convert a List to a Character Matrix
#'
#' @description
#' This function converts a given list of atomic vectors to
#' a character matrix.
#'
#' @details
#' This function is similar to the built-in \code{\link{simplify2array}}
#' function. However, it always returns a character matrix,
#' even if each element in \code{x} is of length 1
#' or if elements in \code{x} are not of the same lengths.
#' Moreover, the elements in \code{x} are always coerced to character vectors.
#'
#' If \code{byrow} is \code{FALSE}, then a matrix with \code{length(x)}
#' columns is returned.
#' The number of rows is the length of the
#' longest vector in \code{x}, but no less than \code{n_min}. Basically, we have
#' \code{result[i,j] == x[[j]][i]} if \code{i <= length(x[[j]])}
#' and \code{result[i,j] == fill} otherwise, see Examples.
#'
#' If \code{byrow} is \code{TRUE}, then the resulting matrix is
#' a transposition of the above-described one.
#'
#' This function may be useful, e.g., in connection with \code{\link{stri_split}}
#' and \code{\link{stri_extract_all}}.
#'
#' @param x a list of atomic vectors
#' @param byrow a single logical value; should the resulting matrix be
#' transposed?
#' @param fill a single string, see Details
#' @param n_min a single integer value; minimal number of rows (\code{byrow==FALSE})
#' or columns (otherwise) in the resulting matrix
#' @param by_row alias of \code{byrow}
#'
#' @return
#' Returns a character matrix.
#'
#' @examples
#' simplify2array(list(c('a', 'b'), c('c', 'd'), c('e', 'f')))
#' stri_list2matrix(list(c('a', 'b'), c('c', 'd'), c('e', 'f')))
#' stri_list2matrix(list(c('a', 'b'), c('c', 'd'), c('e', 'f')), byrow=TRUE)
#'
#' simplify2array(list('a', c('b', 'c')))
#' stri_list2matrix(list('a', c('b', 'c')))
#' stri_list2matrix(list('a', c('b', 'c')), fill='')
#' stri_list2matrix(list('a', c('b', 'c')), fill='', n_min=5)
#'
#' @family utils
#' @export
stri_list2matrix <- function(x,
    byrow = FALSE, fill = NA_character_, n_min = 0, by_row = byrow)
{
    # `by_row` is an alias of `byrow`; if it was given explicitly, it wins
    transpose <- if (missing(by_row)) byrow else by_row

    # `fill` is converted with stri_enc_toutf8() before the C call
    .Call(C_stri_list2matrix, x, transpose, stri_enc_toutf8(fill), n_min)
}
#' @title
#' Replace NAs with Empty Strings
#'
#' @description
#' This function replaces all missing values with empty strings.
#' See \code{\link{stri_replace_na}} for a generalization.
#'
#' @param x a character vector
#'
#' @return
#' Returns a character vector.
#'
#' @examples
#' stri_na2empty(c('a', NA, '', 'b'))
#'
#' @family utils
#' @export
stri_na2empty <- function(x)
{
    utf8 <- stri_enc_toutf8(x)

    # overwrite every missing element with an empty string
    is_missing <- is.na(utf8)
    utf8[is_missing] <- ""

    utf8
}
#' @title
#' Remove All Empty Strings from a Character Vector
#'
#' @description
#' \code{stri_remove_empty} (alias \code{stri_omit_empty})
#' removes all empty strings from a character vector,
#' and, if \code{na_empty} is \code{TRUE}, also gets rid of all missing
#' values.
#'
#' \code{stri_remove_empty_na} (alias \code{stri_omit_empty_na})
#' removes both empty strings and missing values.
#'
#' \code{stri_remove_na} (alias \code{stri_omit_na})
#' returns a version of \code{x} with missing values removed.
#'
#' @param x a character vector
#' @param na_empty should missing values be treated as empty strings?
#'
#' @return
#' Returns a character vector.
#'
#' @examples
#' stri_remove_empty(stri_na2empty(c('a', NA, '', 'b')))
#' stri_remove_empty(c('a', NA, '', 'b'))
#' stri_remove_empty(c('a', NA, '', 'b'), TRUE)
#'
#' stri_omit_empty_na(c('a', NA, '', 'b'))
#'
#' @family utils
#' @rdname stri_remove_empty
#' @export
stri_remove_empty <- function(x, na_empty = FALSE)
{
    x <- stri_enc_toutf8(x)

    # only a literal TRUE switches on the removal of missing values
    drop_na <- identical(na_empty, TRUE)

    # selection mask: start with the elements that are not empty ...
    keep <- !stri_isempty(x)

    # ... and, on request, additionally require them to be non-missing
    if (drop_na)
        keep <- !is.na(x) & keep

    x[keep]
}
#' @rdname stri_remove_empty
#' @export
stri_omit_empty <- stri_remove_empty  # alias: binds the very same function object
#' @rdname stri_remove_empty
#' @export
stri_remove_empty_na <- function(x)
{
    # same as stri_remove_empty(), with the removal of NAs switched on
    stri_remove_empty(x, na_empty = TRUE)
}
#' @rdname stri_remove_empty
#' @export
stri_omit_empty_na <- stri_remove_empty_na  # alias: binds the very same function object
#' @rdname stri_remove_empty
#' @export
stri_remove_na <- function(x)
{
    utf8 <- stri_enc_toutf8(x)

    # keep only the elements that are not missing
    present <- !is.na(utf8)
    utf8[present]
}
#' @rdname stri_remove_empty
#' @export
stri_omit_na <- stri_remove_na  # alias: binds the very same function object
#' @title
#' Replace Missing Values in a Character Vector
#'
#' @description
#' This function gives a convenient way to replace each missing (\code{NA})
#' value with a given string.
#'
#' @details
#' This function is roughly equivalent to
#' \code{str2 <- stri_enc_toutf8(str);
#' str2[is.na(str2)] <- stri_enc_toutf8(replacement);
#' str2}.
#' It may be used, e.g., wherever the 'plain R' \code{NA} handling is
#' desired, see Examples.
#'
#' @param str character vector or an object coercible to
#' @param replacement single string
#'
#' @return Returns a character vector.
#'
#' @examples
#' x <- c('test', NA)
#' stri_paste(x, 1:2) # 'test1' NA
#' paste(x, 1:2) # 'test 1' 'NA 2'
#' stri_paste(stri_replace_na(x), 1:2, sep=' ') # 'test 1' 'NA 2'
#'
#' @export
#' @family utils
stri_replace_na <- function(str, replacement = "NA")
{
# Thin wrapper: both arguments are handed unchanged to the compiled
# routine C_stri_replace_na, whose result is returned as-is.
.Call(C_stri_replace_na, str, replacement)
}
stringi/R/search_locate_4.R 0000644 0001762 0000144 00000031761 14750110641 015303 0 ustar ligges users # kate: default-dictionary en_US
## This file is part of the 'stringi' package for R.
## Copyright (c) 2013-2025, Marek Gagolewski
## All rights reserved.
##
## Redistribution and use in source and binary forms, with or without
## modification, are permitted provided that the following conditions are met:
##
## 1. Redistributions of source code must retain the above copyright notice,
## this list of conditions and the following disclaimer.
##
## 2. Redistributions in binary form must reproduce the above copyright notice,
## this list of conditions and the following disclaimer in the documentation
## and/or other materials provided with the distribution.
##
## 3. Neither the name of the copyright holder nor the names of its
## contributors may be used to endorse or promote products derived from
## this software without specific prior written permission.
##
## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
## 'AS IS' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING,
## BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
## FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
## HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
## SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
## PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
## OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
## WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
## OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
## EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#' @title Locate Pattern Occurrences
#'
#' @description
#' These functions find the indexes (positions) where
#' there is a match to some pattern.
#' The functions \code{stri_locate_all_*} locate all the matches.
#' \code{stri_locate_first_*} and \code{stri_locate_last_*}
#' give the first and the last matches, respectively.
#'
#' @details
#' Vectorized over \code{str} and \code{pattern} (with recycling
#' of the elements in the shorter vector if necessary). This allows you to,
#' for instance, search for one pattern in each string,
#' search for each pattern in one string,
#' and search for the i-th pattern within the i-th string.
#'
#' The matches may be extracted by calling
#' \code{\link{stri_sub}} or \code{\link{stri_sub_all}}.
#' Alternatively, you may call \code{\link{stri_extract}} directly.
#'
#' \code{stri_locate}, \code{stri_locate_all}, \code{stri_locate_first},
#' and \code{stri_locate_last} are convenience functions.
#' They just call \code{stri_locate_*_*}, depending on the arguments used.
#'
#'
#'
#' @param str character vector; strings to search in
#'
#' @param pattern,regex,fixed,coll,charclass character vector;
#' search patterns; for more details refer to \link{stringi-search}
#'
#' @param opts_collator,opts_fixed,opts_regex named list used to tune up
#' the selected search engine's settings; see
#' \code{\link{stri_opts_collator}}, \code{\link{stri_opts_fixed}},
#' and \code{\link{stri_opts_regex}}, respectively; \code{NULL}
#' for the defaults
#'
#' @param merge single logical value;
#' indicates whether consecutive sequences of indexes in the resulting
#' matrix should be merged; \code{stri_locate_all_charclass} only
#'
#' @param omit_no_match single logical value; if \code{TRUE},
#' a no-match will be indicated by a matrix with 0 rows;
#' \code{stri_locate_all_*} only
#'
#' @param get_length single logical value; if \code{FALSE} (default),
#' generate \emph{from-to} matrices; otherwise, output
#' \emph{from-length} ones
#'
#' @param capture_groups single logical value;
#' whether positions of matches to parenthesized subexpressions
#' should be returned too (as \code{capture_groups} attribute);
#' \code{stri_locate_*_regex} only
#'
#' @param mode single string;
#' one of: \code{'first'} (the default), \code{'all'}, \code{'last'}
#'
#' @param ... supplementary arguments passed to the underlying functions,
#' including additional settings for \code{opts_collator},
#' \code{opts_regex}, \code{opts_fixed}, and so on
#'
#'
#' @return
#' For \code{stri_locate_all_*},
#' a list of integer matrices is returned. Each list element
#' represents the results of a separate search scenario.
#' The first column gives the start positions
#' of the matches, and the second column gives the end positions.
#' Moreover, two \code{NA}s in a row denote \code{NA} arguments
#' or a no-match (the latter only if \code{omit_no_match} is \code{FALSE}).
#'
#' \code{stri_locate_first_*} and \code{stri_locate_last_*}
#' return an integer matrix with
#' two columns, giving the start and end positions of the first
#' or the last matches, respectively, and two \code{NA}s if and
#' only if they are not found.
#'
#' For \code{stri_locate_*_regex}, if the match is of zero length,
#' \code{end} will be one character less than \code{start}.
#' Note that \code{stri_locate_last_regex} searches from start to end,
#' but skips overlapping matches, see the example below.
#'
#' Setting \code{get_length=TRUE} results in the 2nd column representing
#' the length of the match instead of the end position. In this case,
#' negative length denotes a no-match.
#'
#' If \code{capture_groups=TRUE}, then the outputs are equipped with the
#' \code{capture_groups} attribute, which is a list of matrices
#' giving the start-end positions of matches to parenthesized subexpressions.
#' Similarly to \code{stri_match_regex}, capture group names are extracted
#' unless looking for first/last occurrences of many different patterns.
#'
#' @examples
#' stri_locate_all('stringi', fixed='i')
#'
#' stri_locate_first_coll('hladn\u00FD', 'HLADNY', strength=1, locale='sk_SK')
#'
#' stri_locate_all_regex(
#' c('breakfast=eggs;lunch=pizza', 'breakfast=spam', 'no food here'),
#' '(?<when>\\w+)=(?<what>\\w+)',
#' capture_groups=TRUE
#' ) # named capture groups
#'
#' stri_locate_all_fixed("abababa", "ABA", case_insensitive=TRUE, overlap=TRUE)
#' stri_locate_first_fixed("ababa", "aba")
#' stri_locate_last_fixed("ababa", "aba") # starts from end
#' stri_locate_last_regex("ababa", "aba") # no overlaps, from left to right
#'
#' x <- c("yes yes", "no", NA)
#' stri_locate_all_fixed(x, "yes")
#' stri_locate_all_fixed(x, "yes", omit_no_match=TRUE)
#' stri_locate_all_fixed(x, "yes", get_length=TRUE)
#' stri_locate_all_fixed(x, "yes", get_length=TRUE, omit_no_match=TRUE)
#' stri_locate_first_fixed(x, "yes")
#' stri_locate_first_fixed(x, "yes", get_length=TRUE)
#'
#' # Use regex positive-lookahead to locate overlapping pattern matches:
#' stri_locate_all_regex('ACAGAGACTTTAGATAGAGAAGA', '(?=AGA)')
#' # note that start > end here (match of length zero)
#'
#'
#' @family search_locate
#' @family indexing
#'
#' @export
#' @rdname stri_locate
stri_locate_all <- function(str, ..., regex, fixed, coll, charclass)
{
    # flags telling which of the pattern arguments were supplied
    supplied <- c(
        regex=!missing(regex),
        fixed=!missing(fixed),
        coll=!missing(coll),
        charclass=!missing(charclass))

    if (sum(supplied) != 1)
        stop("you have to specify either `regex`, `fixed`, `coll`, or `charclass`")

    # exactly one flag is set here; its name selects the search engine
    switch(names(supplied)[supplied],
        regex=stri_locate_all_regex(str, regex, ...),
        fixed=stri_locate_all_fixed(str, fixed, ...),
        coll=stri_locate_all_coll(str, coll, ...),
        charclass=stri_locate_all_charclass(str, charclass, ...))
}
#' @export
#' @rdname stri_locate
stri_locate_first <- function(str, ..., regex, fixed, coll, charclass)
{
    # flags telling which of the pattern arguments were supplied
    supplied <- c(
        regex=!missing(regex),
        fixed=!missing(fixed),
        coll=!missing(coll),
        charclass=!missing(charclass))

    if (sum(supplied) != 1)
        stop("you have to specify either `regex`, `fixed`, `coll`, or `charclass`")

    # exactly one flag is set here; its name selects the search engine
    switch(names(supplied)[supplied],
        regex=stri_locate_first_regex(str, regex, ...),
        fixed=stri_locate_first_fixed(str, fixed, ...),
        coll=stri_locate_first_coll(str, coll, ...),
        charclass=stri_locate_first_charclass(str, charclass, ...))
}
#' @export
#' @rdname stri_locate
stri_locate_last <- function(str, ..., regex, fixed, coll, charclass)
{
    # flags telling which of the pattern arguments were supplied
    supplied <- c(
        regex=!missing(regex),
        fixed=!missing(fixed),
        coll=!missing(coll),
        charclass=!missing(charclass))

    if (sum(supplied) != 1)
        stop("you have to specify either `regex`, `fixed`, `coll`, or `charclass`")

    # exactly one flag is set here; its name selects the search engine
    switch(names(supplied)[supplied],
        regex=stri_locate_last_regex(str, regex, ...),
        fixed=stri_locate_last_fixed(str, fixed, ...),
        coll=stri_locate_last_coll(str, coll, ...),
        charclass=stri_locate_last_charclass(str, charclass, ...))
}
#' @export
#' @rdname stri_locate
stri_locate <- function(
    str, ..., regex, fixed, coll, charclass,
    mode=c("first", "all", "last")
) {
    # `first` is default for compatibility with stringr
    mode <- match.arg(mode)  # this is slow

    # match.arg() leaves exactly one of 'first', 'all', 'last';
    # the pattern arguments (supplied or not) are forwarded by name
    if (mode == "first") {
        stri_locate_first(str, ..., regex=regex, fixed=fixed,
            coll=coll, charclass=charclass)
    } else if (mode == "last") {
        stri_locate_last(str, ..., regex=regex,
            fixed=fixed, coll=coll, charclass=charclass)
    } else {
        stri_locate_all(str, ..., regex=regex, fixed=fixed,
            coll=coll, charclass=charclass)
    }
}
#' @export
#' @rdname stri_locate
stri_locate_all_charclass <- function(
str, pattern, merge=TRUE, omit_no_match=FALSE, get_length=FALSE
) {
# Thin wrapper: all arguments are handed unchanged to the compiled
# routine C_stri_locate_all_charclass, whose result is returned as-is.
.Call(C_stri_locate_all_charclass, str, pattern, merge, omit_no_match, get_length)
}
#' @export
#' @rdname stri_locate
stri_locate_first_charclass <- function(str, pattern, get_length=FALSE)
{
# Thin wrapper: all arguments are handed unchanged to the compiled
# routine C_stri_locate_first_charclass, whose result is returned as-is.
.Call(C_stri_locate_first_charclass, str, pattern, get_length)
}
#' @export
#' @rdname stri_locate
stri_locate_last_charclass <- function(str, pattern, get_length=FALSE)
{
# Thin wrapper: all arguments are handed unchanged to the compiled
# routine C_stri_locate_last_charclass, whose result is returned as-is.
.Call(C_stri_locate_last_charclass, str, pattern, get_length)
}
#' @export
#' @rdname stri_locate
stri_locate_all_coll <- function(
    str, pattern,
    omit_no_match=FALSE, get_length=FALSE, ..., opts_collator=NULL
) {
    # settings supplied via `...` are combined with `opts_collator`
    # and passed on to stri_opts_collator()
    if (!missing(...)) {
        settings <- as.list(c(opts_collator, ...))
        opts_collator <- do.call(stri_opts_collator, settings)
    }

    .Call(C_stri_locate_all_coll, str, pattern, omit_no_match, opts_collator, get_length)
}
#' @export
#' @rdname stri_locate
stri_locate_first_coll <- function(
    str, pattern, get_length=FALSE, ..., opts_collator=NULL
) {
    # settings supplied via `...` are combined with `opts_collator`
    # and passed on to stri_opts_collator()
    if (!missing(...)) {
        settings <- as.list(c(opts_collator, ...))
        opts_collator <- do.call(stri_opts_collator, settings)
    }

    .Call(C_stri_locate_first_coll, str, pattern, opts_collator, get_length)
}
#' @export
#' @rdname stri_locate
stri_locate_last_coll <- function(
    str, pattern, get_length=FALSE, ..., opts_collator=NULL
) {
    # settings supplied via `...` are combined with `opts_collator`
    # and passed on to stri_opts_collator()
    if (!missing(...)) {
        settings <- as.list(c(opts_collator, ...))
        opts_collator <- do.call(stri_opts_collator, settings)
    }

    .Call(C_stri_locate_last_coll, str, pattern, opts_collator, get_length)
}
#' @export
#' @rdname stri_locate
stri_locate_all_regex <- function(
    str, pattern,
    omit_no_match=FALSE,
    capture_groups=FALSE,
    get_length=FALSE,
    ..., opts_regex=NULL
) {
    # settings supplied via `...` are combined with `opts_regex`
    # and passed on to stri_opts_regex()
    if (!missing(...)) {
        settings <- as.list(c(opts_regex, ...))
        opts_regex <- do.call(stri_opts_regex, settings)
    }

    .Call(C_stri_locate_all_regex, str, pattern, omit_no_match, opts_regex, capture_groups, get_length)
}
#' @export
#' @rdname stri_locate
stri_locate_first_regex <- function(
    str, pattern, capture_groups=FALSE, get_length=FALSE, ..., opts_regex=NULL
) {
    # settings supplied via `...` are combined with `opts_regex`
    # and passed on to stri_opts_regex()
    if (!missing(...)) {
        settings <- as.list(c(opts_regex, ...))
        opts_regex <- do.call(stri_opts_regex, settings)
    }

    .Call(C_stri_locate_first_regex, str, pattern, opts_regex, capture_groups, get_length)
}
#' @export
#' @rdname stri_locate
stri_locate_last_regex <- function(
    str, pattern, capture_groups=FALSE, get_length=FALSE, ..., opts_regex=NULL
) {
    # settings supplied via `...` are combined with `opts_regex`
    # and passed on to stri_opts_regex()
    if (!missing(...)) {
        settings <- as.list(c(opts_regex, ...))
        opts_regex <- do.call(stri_opts_regex, settings)
    }

    .Call(C_stri_locate_last_regex, str, pattern, opts_regex, capture_groups, get_length)
}
#' @export
#' @rdname stri_locate
stri_locate_all_fixed <- function(
    str, pattern, omit_no_match=FALSE, get_length=FALSE, ..., opts_fixed=NULL
) {
    # settings supplied via `...` are combined with `opts_fixed`
    # and passed on to stri_opts_fixed()
    if (!missing(...)) {
        settings <- as.list(c(opts_fixed, ...))
        opts_fixed <- do.call(stri_opts_fixed, settings)
    }

    .Call(C_stri_locate_all_fixed, str, pattern, omit_no_match, opts_fixed, get_length)
}
#' @export
#' @rdname stri_locate
stri_locate_first_fixed <- function(
    str, pattern, get_length=FALSE, ..., opts_fixed=NULL
) {
    # settings supplied via `...` are combined with `opts_fixed`
    # and passed on to stri_opts_fixed()
    if (!missing(...)) {
        settings <- as.list(c(opts_fixed, ...))
        opts_fixed <- do.call(stri_opts_fixed, settings)
    }

    .Call(C_stri_locate_first_fixed, str, pattern, opts_fixed, get_length)
}
#' @export
#' @rdname stri_locate
stri_locate_last_fixed <- function(
    str, pattern, get_length=FALSE, ..., opts_fixed=NULL
) {
    # settings supplied via `...` are combined with `opts_fixed`
    # and passed on to stri_opts_fixed()
    if (!missing(...)) {
        settings <- as.list(c(opts_fixed, ...))
        opts_fixed <- do.call(stri_opts_fixed, settings)
    }

    .Call(C_stri_locate_last_fixed, str, pattern, opts_fixed, get_length)
}
stringi/R/length.R 0000644 0001762 0000144 00000016313 14750110641 013541 0 ustar ligges users # kate: default-dictionary en_US
## This file is part of the 'stringi' package for R.
## Copyright (c) 2013-2025, Marek Gagolewski
## All rights reserved.
##
## Redistribution and use in source and binary forms, with or without
## modification, are permitted provided that the following conditions are met:
##
## 1. Redistributions of source code must retain the above copyright notice,
## this list of conditions and the following disclaimer.
##
## 2. Redistributions in binary form must reproduce the above copyright notice,
## this list of conditions and the following disclaimer in the documentation
## and/or other materials provided with the distribution.
##
## 3. Neither the name of the copyright holder nor the names of its
## contributors may be used to endorse or promote products derived from
## this software without specific prior written permission.
##
## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
## 'AS IS' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING,
## BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
## FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
## HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
## SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
## PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
## OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
## WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
## OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
## EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#' @title
#' Count the Number of Bytes
#'
#' @description
#' Counts the number of bytes needed to store
#' each string in the computer's memory.
#'
#' @details
#' Often, this is not the function you would normally use
#' in your string processing activities. See \code{\link{stri_length}} instead.
#'
#' For 8-bit encoded strings, this is the same as \code{\link{stri_length}}.
#' For UTF-8 strings, the returned values may be greater
#' than the number of code points, as UTF-8 is not a fixed-byte encoding:
#' one code point may be encoded by 1-4 bytes
#' (according to the current Unicode standard).
#'
#' Missing values are handled properly.
#'
#' The strings do not need to be re-encoded to perform this operation.
#'
#' The returned values do not include the trailing NUL bytes,
#' which are used internally to mark the end of string data (in C).
#'
#' @param str character vector or an object coercible to a character vector
#'
#' @return Returns an integer vector of the same length as \code{str}.
#'
#' @examples
#' stri_numbytes(letters)
#' stri_numbytes(c('abc', '123', '\u0105\u0104'))
#'
#' \dontrun{
#' # this used to fail on Windows, where there was no native support
#' # for 4-byte Unicode characters; see, however, stri_unescape_unicode():
#' stri_numbytes('\U001F600') # compare stri_length('\U001F600')
#' }
#'
#' @export
#' @family length
stri_numbytes <- function(str)
    # No R-level processing: `str` is handed as-is to the native routine.
    .Call(C_stri_numbytes, str)
#' @title
#' Count the Number of Code Points
#'
#' @description
#' This function returns the number of code points
#' in each string.
#'
#' @details
#' Note that the number of code points is
#' not the same as the `width` of the string when
#' printed on the console.
#'
#' If a given string is in UTF-8 and has not been properly normalized
#' (e.g., by \code{\link{stri_trans_nfc}}), the returned counts may sometimes be
#' misleading. See \code{\link{stri_count_boundaries}} for a method to count
#' \emph{Unicode characters}. Moreover, if an incorrect UTF-8 byte sequence
#' is detected, then a warning is generated and the corresponding output element
#' is set to \code{NA}, see also \code{\link{stri_enc_toutf8}} for a method
#' to deal with such cases.
#'
#' Missing values are handled properly.
#' For `byte` encodings we get, as usual, an error.
#'
#' @param str character vector or an object coercible to a character vector
#' @return Returns an integer vector of the same length as \code{str}.
#'
#' @examples
#' stri_length(LETTERS)
#' stri_length(c('abc', '123', '\u0105\u0104'))
#' stri_length('\u0105') # length is one, but...
#' stri_numbytes('\u0105') # 2 bytes are used
#' stri_numbytes(stri_trans_nfkd('\u0105')) # 3 bytes here but...
#' stri_length(stri_trans_nfkd('\u0105')) # ...two code points (!)
#' stri_count_boundaries(stri_trans_nfkd('\u0105'), type='character') # ...and one Unicode character
#'
#' @export
#' @family length
stri_length <- function(str)
    # No R-level processing: `str` is handed as-is to the native routine.
    .Call(C_stri_length, str)
#' @title
#' Determine if a String is of Length Zero
#'
#' @description
#' This is the fastest way to find out
#' whether the elements of a character vector are empty strings.
#'
#' @details
#' Missing values are handled properly.
#'
#' @param str character vector or an object coercible to a character vector
#' @return Returns a logical vector of the same length as \code{str}.
#'
#' @examples
#' stri_isempty(letters[1:3])
#' stri_isempty(c(',', '', 'abc', '123', '\u0105\u0104'))
#' stri_isempty(character(1))
#'
#' @export
#' @family length
stri_isempty <- function(str)
    # No R-level processing: `str` is handed as-is to the native routine.
    .Call(C_stri_isempty, str)
#' @title
#' Determine the Width of Code Points
#'
#' @description
#' Approximates the number of text columns the `cat()` function
#' might use to print a string using a mono-spaced font.
#'
#' @details
#' The Unicode standard does not formalize the notion of a character
#' width. Roughly based on \url{http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c},
#' \url{https://github.com/nodejs/node/blob/master/src/node_i18n.cc},
#' and UAX #11 we proceed as follows.
#' The following code points are of width 0:
#' \itemize{
#' \item code points with general category (see \link{stringi-search-charclass})
#' \code{Me}, \code{Mn}, and \code{Cf},
#' \item \code{C0} and \code{C1} control codes (general category \code{Cc})
#' - for compatibility with the \code{\link{nchar}} function,
#' \item Hangul Jamo medial vowels and final consonants
#' (code points with enumerable property \code{UCHAR_HANGUL_SYLLABLE_TYPE}
#' equal to \code{U_HST_VOWEL_JAMO} or \code{U_HST_TRAILING_JAMO};
#' note that applying the NFC normalization with \code{\link{stri_trans_nfc}}
#' is encouraged),
#' \item ZERO WIDTH SPACE (U+200B).
#' }
#'
#' Characters with the \code{UCHAR_EAST_ASIAN_WIDTH} enumerable property
#' equal to \code{U_EA_FULLWIDTH} or \code{U_EA_WIDE} are
#' of width 2.
#'
#' Most emojis and characters with general category So (other symbols)
#' are of width 2.
#'
#' SOFT HYPHEN (U+00AD) (for compatibility with \code{\link{nchar}})
#' as well as any other characters have width 1.
#'
#' @param str character vector or an object coercible to a character vector
#' @return Returns an integer vector of the same length as \code{str}.
#'
#' @examples
#' stri_width(LETTERS[1:5])
#' stri_width(stri_trans_nfkd('\u0105'))
#' stri_width(stri_trans_nfkd('\U0001F606'))
#' stri_width( # Full-width equivalents of ASCII characters:
#' stri_enc_fromutf32(as.list(c(0x3000, 0xFF01:0xFF5E)))
#' )
#' stri_width(stri_trans_nfkd('\ubc1f')) # includes Hangul Jamo medial vowels and final consonants
#' @export
#' @family length
#'
#' @references
#' \emph{East Asian Width} -- Unicode Standard Annex #11,
#' \url{https://www.unicode.org/reports/tr11/}
stri_width <- function(str)
    # No R-level processing: `str` is handed as-is to the native routine.
    .Call(C_stri_width, str)
stringi/R/ICU_settings.R 0000644 0001762 0000144 00000007753 14750110641 014630 0 ustar ligges users # kate: default-dictionary en_US
## This file is part of the 'stringi' package for R.
## Copyright (c) 2013-2025, Marek Gagolewski
## All rights reserved.
##
## Redistribution and use in source and binary forms, with or without
## modification, are permitted provided that the following conditions are met:
##
## 1. Redistributions of source code must retain the above copyright notice,
## this list of conditions and the following disclaimer.
##
## 2. Redistributions in binary form must reproduce the above copyright notice,
## this list of conditions and the following disclaimer in the documentation
## and/or other materials provided with the distribution.
##
## 3. Neither the name of the copyright holder nor the names of its
## contributors may be used to endorse or promote products derived from
## this software without specific prior written permission.
##
## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
## 'AS IS' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING,
## BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
## FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
## HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
## SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
## PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
## OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
## WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
## OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
## EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#' @title
#' Query Default Settings for \pkg{stringi}
#'
#' @description
#' Gives the current default settings used by the \pkg{ICU} library.
#'
#' @param short logical; whether or not the results should be given
#' in a concise form; defaults to \code{FALSE}
#'
#' @return If \code{short} is \code{TRUE}, then a single string providing
#' information on the default character encoding, locale, and Unicode
#' as well as \pkg{ICU} version is returned.
#'
#' Otherwise, a list with the following components is returned:
#' \itemize{
#' \item \code{Unicode.version} -- version of Unicode supported
#' by the \pkg{ICU} library;
#' \item \code{ICU.version} -- \pkg{ICU} library version used;
#' \item \code{Locale} -- contains information on default locale,
#' as returned by \code{\link{stri_locale_info}};
#' \item \code{Charset.internal} -- fixed at \code{c('UTF-8', 'UTF-16')};
#' \item \code{Charset.native} -- information on the default encoding,
#' as returned by \code{\link{stri_enc_info}};
#' \item \code{ICU.system} -- logical; \code{TRUE} indicates that
#' the system \pkg{ICU} libs are used, otherwise \pkg{ICU} was built together
#' with \pkg{stringi};
#' \item \code{ICU.UTF8} -- logical; \code{TRUE} if the internal
#' \code{U_CHARSET_IS_UTF8} flag is defined and set.
#' }
#'
#' @export
#' @family locale
#' @family encoding
stri_info <- function(short = FALSE)
{
    # `short` must be a single, non-missing logical flag. NA is rejected here:
    # otherwise it would pass validation and only fail at `if (!short)` below,
    # after the native call and any encoding warnings had already been issued.
    stopifnot(is.logical(short), length(short) == 1, !is.na(short))

    info <- .Call(C_stri_info)
    locale <- info$Locale$Name
    charset <- info$Charset.native$Name.friendly

    # Warn (at most once) when the native encoding is something other than
    # UTF-8 and is flagged as either not an ASCII superset or not mapping
    # 1-to-1 to Unicode.
    if (charset != "UTF-8") {
        if (!identical(info$Charset.native$ASCII.subset, TRUE))
            warning(stri_paste("Your native character encoding is not a superset of US-ASCII. ",
                "Consider switching to UTF-8."))
        else if (!identical(info$Charset.native$Unicode.1to1, TRUE))
            warning(stri_paste("Your native character encoding does not map to Unicode properly. ",
                "Consider switching to UTF-8."))
    }

    # Full form: the list exactly as returned by the native routine.
    if (!short)
        return(info)

    # Concise form: a single summary string.
    sprintf("stringi_%s (%s.%s; ICU4C %s [%s%s]; Unicode %s)",
        as.character(packageVersion("stringi")),
        locale, charset, info$ICU.version,
        if (info$ICU.system) "system" else "bundle",
        if (info$ICU.UTF8) "#U_CHARSET_IS_UTF8" else "",
        info$Unicode.version)
}
stringi/R/search_subset_4.R 0000644 0001762 0000144 00000020510 14750110641 015327 0 ustar ligges users # kate: default-dictionary en_US
## This file is part of the 'stringi' package for R.
## Copyright (c) 2013-2025, Marek Gagolewski
## All rights reserved.
##
## Redistribution and use in source and binary forms, with or without
## modification, are permitted provided that the following conditions are met:
##
## 1. Redistributions of source code must retain the above copyright notice,
## this list of conditions and the following disclaimer.
##
## 2. Redistributions in binary form must reproduce the above copyright notice,
## this list of conditions and the following disclaimer in the documentation
## and/or other materials provided with the distribution.
##
## 3. Neither the name of the copyright holder nor the names of its
## contributors may be used to endorse or promote products derived from
## this software without specific prior written permission.
##
## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
## 'AS IS' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING,
## BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
## FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
## HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
## SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
## PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
## OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
## WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
## OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
## EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#' @title
#' Select Elements that Match a Given Pattern
#'
#' @description
#' These functions return or modify a sub-vector where there is a match to
#' a given pattern. In other words, they
#' are roughly equivalent (but faster and easier to use) to a call to
#' \code{str[\link{stri_detect}(str, ...)]} or
#' \code{str[\link{stri_detect}(str, ...)] <- value}.
#'
#' @details
#' Vectorized over \code{str} as well as partially over \code{pattern}
#' and \code{value},
#' with recycling of the elements in the shorter vector if necessary.
#' As the aim here is to subset \code{str}, \code{pattern}
#' cannot be longer than the former. Moreover, if the number of
#' items to replace is not a multiple of length of \code{value},
#' a warning is emitted and the unused elements are ignored.
#' Hence, the length of the output will be the same as length of \code{str}.
#'
#' \code{stri_subset} and \code{stri_subset<-} are convenience functions.
#' They call either \code{stri_subset_regex},
#' \code{stri_subset_fixed}, \code{stri_subset_coll},
#' or \code{stri_subset_charclass},
#' depending on the argument used.
#'
#' @param str character vector; strings to search within
#'
#' @param pattern,regex,fixed,coll,charclass character vector;
#' search patterns (no more than the length of \code{str});
#' for more details refer to \link{stringi-search}
#'
#' @param negate single logical value; whether a no-match is rather of interest
#'
#' @param omit_na single logical value; should missing values be excluded
#' from the result?
#'
#' @param opts_collator,opts_fixed,opts_regex a named list used to tune up
#' the search engine's settings; see
#' \code{\link{stri_opts_collator}}, \code{\link{stri_opts_fixed}},
#' and \code{\link{stri_opts_regex}}, respectively; \code{NULL}
#' for the defaults
#'
#' @param ... supplementary arguments passed to the underlying functions,
#' including additional settings for \code{opts_collator}, \code{opts_regex},
#' \code{opts_fixed}, and so on
#'
#' @param value non-empty character vector of replacement strings;
#' replacement function only
#'
#'
#' @return The \code{stri_subset_*} functions return a character vector.
#' As usual, the output encoding is UTF-8.
#'
#' The \code{stri_subset_*<-} functions modify \code{str} 'in-place'.
#'
#'
#' @examples
#' stri_subset_regex(c('stringi R', '123', 'ID456', ''), '^[0-9]+$')
#'
#' x <- c('stringi R', '123', 'ID456', '')
#' `stri_subset_regex<-`(x, '[0-9]+$', negate=TRUE, value=NA) # returns a copy
#' stri_subset_regex(x, '[0-9]+$') <- NA # modifies `x` in-place
#' print(x)
#'
#' @family search_subset
#' @export
#' @rdname stri_subset
stri_subset <- function(str, ..., regex, fixed, coll, charclass)
{
    # Exactly one of the engine-selecting arguments must be supplied;
    # its name decides which stri_subset_* function handles the call.
    supplied <- c(regex = !missing(regex), fixed = !missing(fixed),
        coll = !missing(coll), charclass = !missing(charclass))

    if (sum(supplied) != 1)
        stop("you have to specify either `regex`, `fixed`, `coll`, or `charclass`")

    switch(names(supplied)[supplied],
        regex = stri_subset_regex(str, regex, ...),
        fixed = stri_subset_fixed(str, fixed, ...),
        coll = stri_subset_coll(str, coll, ...),
        charclass = stri_subset_charclass(str, charclass, ...))
}
#' @export
#' @rdname stri_subset
#' @usage stri_subset(str, ..., regex, fixed, coll, charclass) <- value
`stri_subset<-` <- function(str, ..., regex, fixed, coll, charclass, value)
{
    # Exactly one of the engine-selecting arguments must be supplied;
    # its name decides which stri_subset_*<- function handles the call.
    supplied <- c(regex = !missing(regex), fixed = !missing(fixed),
        coll = !missing(coll), charclass = !missing(charclass))

    if (sum(supplied) != 1)
        stop("you have to specify either `regex`, `fixed`, `coll`, or `charclass`")

    switch(names(supplied)[supplied],
        regex = `stri_subset_regex<-`(str, regex, ..., value = value),
        fixed = `stri_subset_fixed<-`(str, fixed, ..., value = value),
        coll = `stri_subset_coll<-`(str, coll, ..., value = value),
        charclass = `stri_subset_charclass<-`(str, charclass, ..., value = value))
}
#' @export
#' @rdname stri_subset
stri_subset_fixed <- function(str, pattern, omit_na = FALSE, negate = FALSE, ...,
    opts_fixed = NULL)
{
    # Settings passed through `...` are combined with `opts_fixed` and
    # run through stri_opts_fixed() before reaching the native routine.
    if (!missing(...)) {
        settings <- as.list(c(opts_fixed, ...))
        opts_fixed <- do.call(stri_opts_fixed, settings)
    }

    .Call(C_stri_subset_fixed, str, pattern, omit_na, negate, opts_fixed)
}
#' @export
#' @rdname stri_subset
#' @usage stri_subset_fixed(str, pattern, negate=FALSE, ..., opts_fixed=NULL) <- value
`stri_subset_fixed<-` <- function(str, pattern, negate = FALSE, ...,
    opts_fixed = NULL, value)
{
    # Settings passed through `...` are combined with `opts_fixed` and
    # run through stri_opts_fixed() before reaching the native routine.
    if (!missing(...)) {
        settings <- as.list(c(opts_fixed, ...))
        opts_fixed <- do.call(stri_opts_fixed, settings)
    }

    .Call(C_stri_subset_fixed_replacement, str, pattern, negate, opts_fixed, value)
}
#' @export
#' @rdname stri_subset
stri_subset_charclass <- function(str, pattern, omit_na = FALSE, negate = FALSE)
    # All arguments are forwarded unchanged to the native routine.
    .Call(C_stri_subset_charclass, str, pattern, omit_na, negate)
#' @export
#' @rdname stri_subset
#' @usage stri_subset_charclass(str, pattern, negate=FALSE) <- value
`stri_subset_charclass<-` <- function(str, pattern, negate = FALSE, value)
    # All arguments are forwarded unchanged to the native routine.
    .Call(C_stri_subset_charclass_replacement, str, pattern, negate, value)
#' @export
#' @rdname stri_subset
stri_subset_coll <- function(str, pattern, omit_na = FALSE, negate = FALSE, ...,
    opts_collator = NULL)
{
    # Settings passed through `...` are combined with `opts_collator` and
    # run through stri_opts_collator() before reaching the native routine.
    if (!missing(...)) {
        settings <- as.list(c(opts_collator, ...))
        opts_collator <- do.call(stri_opts_collator, settings)
    }

    .Call(C_stri_subset_coll, str, pattern, omit_na, negate, opts_collator)
}
#' @export
#' @rdname stri_subset
#' @usage stri_subset_coll(str, pattern, negate=FALSE, ..., opts_collator=NULL) <- value
`stri_subset_coll<-` <- function(str, pattern, negate = FALSE, ..., opts_collator = NULL,
    value)
{
    # Settings passed through `...` are combined with `opts_collator` and
    # run through stri_opts_collator() before reaching the native routine.
    if (!missing(...)) {
        settings <- as.list(c(opts_collator, ...))
        opts_collator <- do.call(stri_opts_collator, settings)
    }

    .Call(C_stri_subset_coll_replacement, str, pattern, negate, opts_collator, value)
}
#' @export
#' @rdname stri_subset
stri_subset_regex <- function(str, pattern, omit_na = FALSE, negate = FALSE, ...,
    opts_regex = NULL)
{
    # Settings passed through `...` are combined with `opts_regex` and
    # run through stri_opts_regex() before reaching the native routine.
    if (!missing(...)) {
        settings <- as.list(c(opts_regex, ...))
        opts_regex <- do.call(stri_opts_regex, settings)
    }

    .Call(C_stri_subset_regex, str, pattern, omit_na, negate, opts_regex)
}
#' @export
#' @rdname stri_subset
#' @usage stri_subset_regex(str, pattern, negate=FALSE, ..., opts_regex=NULL) <- value
`stri_subset_regex<-` <- function(str, pattern, negate = FALSE, ..., opts_regex = NULL,
    value)
{
    # Settings passed through `...` are combined with `opts_regex` and
    # run through stri_opts_regex() before reaching the native routine.
    if (!missing(...)) {
        settings <- as.list(c(opts_regex, ...))
        opts_regex <- do.call(stri_opts_regex, settings)
    }

    .Call(C_stri_subset_regex_replacement, str, pattern, negate, opts_regex, value)
}
stringi/R/stats.R 0000644 0001762 0000144 00000011066 14750110641 013416 0 ustar ligges users # kate: default-dictionary en_US
## This file is part of the 'stringi' package for R.
## Copyright (c) 2013-2025, Marek Gagolewski
## All rights reserved.
##
## Redistribution and use in source and binary forms, with or without
## modification, are permitted provided that the following conditions are met:
##
## 1. Redistributions of source code must retain the above copyright notice,
## this list of conditions and the following disclaimer.
##
## 2. Redistributions in binary form must reproduce the above copyright notice,
## this list of conditions and the following disclaimer in the documentation
## and/or other materials provided with the distribution.
##
## 3. Neither the name of the copyright holder nor the names of its
## contributors may be used to endorse or promote products derived from
## this software without specific prior written permission.
##
## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
## 'AS IS' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING,
## BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
## FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
## HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
## SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
## PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
## OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
## WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
## OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
## EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#' @title
#' General Statistics for a Character Vector
#'
#' @description
#' This function gives general statistics for a character vector,
#' e.g., obtained by loading a text file with the
#' \code{\link{readLines}} or \code{\link{stri_read_lines}} function,
#' where each text line is represented by a separate string.
#'
#' @details
#' None of the strings may contain \code{\\r} or \code{\\n} characters,
#' otherwise you will get an error.
#'
#' Below by `white space` we mean the Unicode binary property
#' \code{WHITE_SPACE}, see \link{stringi-search-charclass}.
#'
#' @param str character vector to be aggregated
#' @return Returns an integer vector with the following named elements:
#' \enumerate{
#' \item \code{Lines} - number of lines (number of
#' non-missing strings in the vector);
#' \item \code{LinesNEmpty} - number of lines with at least
#' one non-\code{WHITE_SPACE} character;
#' \item \code{Chars} - total number of Unicode code points detected;
#' \item \code{CharsNWhite} - number of Unicode code points
#' that are not \code{WHITE_SPACE}s;
#' \item ... (Other stuff that may appear in future releases of \pkg{stringi}).
#' }
#' @examples
#' s <- c('Lorem ipsum dolor sit amet, consectetur adipisicing elit.',
#' 'nibh augue, suscipit a, scelerisque sed, lacinia in, mi.',
#' 'Cras vel lorem. Etiam pellentesque aliquet tellus.',
#' '')
#' stri_stats_general(s)
#'
#' @family stats
#' @export
stri_stats_general <- function(str)
    # No R-level processing: `str` is handed as-is to the native routine.
    .Call(C_stri_stats_general, str)
#' @title
#' Statistics for a Character Vector Containing LaTeX Commands
#'
#' @description
#' This function gives LaTeX-oriented statistics for a character vector,
#' e.g., obtained by loading a text file with the
#' \code{\link{readLines}} function, where each text line
#' is represented by a separate string.
#'
#' @details
#' We use a slightly modified LaTeX Word Count algorithm implemented in
#' Kile 2.1.3, see
#' \url{https://kile.sourceforge.io/team.php} for the original contributors.
#'
#'
#'
#' @param str character vector to be aggregated
#' @return Returns an integer vector with the following named elements:
#' \enumerate{
#' \item \code{CharsWord} - number of word characters;
#' \item \code{CharsCmdEnvir} - number of command and environment characters;
#' \item \code{CharsWhite} - LaTeX white spaces, including \{ and \} in some contexts;
#' \item \code{Words} - number of words;
#' \item \code{Cmds} - number of commands;
#' \item \code{Envirs} - number of environments;
#' \item ... (Other stuff that may appear in future releases of \pkg{stringi}).
#' }
#' @examples
#' s <- c('Lorem \\textbf{ipsum} dolor sit \\textit{amet}, consectetur adipisicing elit.',
#' '\\begin{small}Proin nibh augue,\\end{small} suscipit a, scelerisque sed, lacinia in, mi.',
#' '')
#' stri_stats_latex(s)
#'
#' @family stats
#' @export
stri_stats_latex <- function(str)
    # No R-level processing: `str` is handed as-is to the native routine.
    .Call(C_stri_stats_latex, str)
stringi/R/search.R 0000644 0001762 0000144 00000104550 14750110641 013526 0 ustar ligges users # kate: default-dictionary en_US
## This file is part of the 'stringi' package for R.
## Copyright (c) 2013-2025, Marek Gagolewski
## All rights reserved.
##
## Redistribution and use in source and binary forms, with or without
## modification, are permitted provided that the following conditions are met:
##
## 1. Redistributions of source code must retain the above copyright notice,
## this list of conditions and the following disclaimer.
##
## 2. Redistributions in binary form must reproduce the above copyright notice,
## this list of conditions and the following disclaimer in the documentation
## and/or other materials provided with the distribution.
##
## 3. Neither the name of the copyright holder nor the names of its
## contributors may be used to endorse or promote products derived from
## this software without specific prior written permission.
##
## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
## 'AS IS' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING,
## BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
## FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
## HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
## SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
## PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
## OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
## WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
## OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
## EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#' @title String Searching
#'
#' @description
#' This man page explains how to perform string search-based
#' operations in \pkg{stringi}.
#'
#' @details
#' The following independent string searching engines are available
#' in \pkg{stringi}.
#' \itemize{
#' \item \code{stri_*_regex} -- \pkg{ICU}'s regular expressions (regexes),
#' see \link{about_search_regex},
#' \item \code{stri_*_fixed} -- locale-independent byte-wise pattern matching,
#' see \link{about_search_fixed},
#' \item \code{stri_*_coll} -- \pkg{ICU}'s \code{StringSearch},
#' locale-sensitive, Collator-based pattern search,
#' useful for natural language processing tasks,
#' see \link{about_search_coll},
#' \item \code{stri_*_charclass} -- character classes search,
#' e.g., Unicode General Categories or Binary Properties,
#' see \link{about_search_charclass},
#' \item \code{stri_*_boundaries} -- text boundary analysis,
#' see \link{about_search_boundaries}
#' }
#'
#' Each search engine is able to perform many search-based operations.
#' These may include:
#' \itemize{
#' \item \code{stri_detect_*} - detect if a pattern occurs in a string,
#' see, e.g., \code{\link{stri_detect}},
#' \item \code{stri_count_*} - count the number of pattern occurrences,
#' see, e.g., \code{\link{stri_count}},
#' \item \code{stri_locate_*} - locate all, first, or last occurrences
#' of a pattern, see, e.g., \code{\link{stri_locate}},
#' \item \code{stri_extract_*} - extract all, first, or last occurrences
#' of a pattern, see, e.g., \code{\link{stri_extract}}
#' and, in case of regexes, \code{\link{stri_match}},
#' \item \code{stri_replace_*} - replace all, first, or last occurrences
#' of a pattern, see, e.g., \code{\link{stri_replace}}
#' and also \code{\link{stri_trim}},
#' \item \code{stri_split_*} - split a string into chunks indicated
#' by occurrences of a pattern,
#' see, e.g., \code{\link{stri_split}},
#' \item \code{stri_startswith_*} and \code{stri_endswith_*} detect
#' if a string starts or ends with a pattern match, see,
#' e.g., \code{\link{stri_startswith}},
#' \item \code{stri_subset_*} - return a subset of a character vector
#' with strings that match a given pattern, see, e.g., \code{\link{stri_subset}}.
#' }
#'
#' @name about_search
#' @rdname about_search
#' @aliases about_search search stringi-search
#' @family text_boundaries
#' @family search_regex
#' @family search_fixed
#' @family search_coll
#' @family search_charclass
#' @family search_detect
#' @family search_count
#' @family search_locate
#' @family search_replace
#' @family search_split
#' @family search_subset
#' @family search_extract
#' @family search_in
#' @family stringi_general_topics
invisible(NULL)  # no-op expression; presumably the anchor for the preceding roxygen topic (about_search)
#' @title
#' Regular Expressions in \pkg{stringi}
#'
#' @description
#' A regular expression is a pattern describing, possibly in a very
#' abstract way, a text fragment.
#' With so many regex functions in \pkg{stringi},
#' regular expressions may be a very powerful tool
#' to perform string searching, substring extraction, string splitting, etc.,
#' tasks.
#'
#'
#' @details
#' All \code{stri_*_regex} functions in \pkg{stringi} use
#' the \pkg{ICU} regex engine. Its settings may be tuned up (for example
#' to perform case-insensitive search) via the
#' \code{\link{stri_opts_regex}} function.
#'
#'
#' Regular expression patterns in \pkg{ICU} are quite similar in form and
#' behavior to Perl's regexes. Their implementation is loosely inspired
#' by JDK 1.4 \code{java.util.regex}.
#' \pkg{ICU} Regular Expressions conform to the Unicode Technical Standard #18
#' (see References section) and its features are summarized in
#' the ICU User Guide (see below). A good general introduction
#' to regexes is (Friedl, 2002).
#' Some general topics are also covered in the \R manual, see \link{regex}.
#'
#' @section \pkg{ICU} Regex Operators at a Glance:
#'
#' Here is a list of operators provided by the
#' ICU User Guide on regexes.
#'
#' \describe{
#' \item{\code{|}}{Alternation. \code{A|B} matches either A or B.}
#' \item{\code{*}}{Match 0 or more times. Match as many times as possible.}
#' \item{\code{+}}{Match 1 or more times. Match as many times as possible.}
#' \item{\code{?}}{Match zero or one times. Prefer one.}
#' \item{\code{{n}} }{Match exactly n times.}
#' \item{\code{{n,}} }{Match at least n times. Match as many times as possible.}
#' \item{\code{{n,m}} }{Match between n and m times.
#' Match as many times as possible, but not more than m.}
#' \item{\code{*?}}{Match 0 or more times. Match as few times as possible.}
#' \item{\code{+?}}{Match 1 or more times. Match as few times as possible.}
#' \item{\code{??}}{Match zero or one times. Prefer zero.}
#' \item{\code{{n}?}}{Match exactly n times.}
#' \item{\code{{n,}?}}{Match at least n times, but no more than required
#' for an overall pattern match.}
#' \item{\code{{n,m}?}}{Match between n and m times. Match as few times
#' as possible, but not less than n.}
#' \item{\code{*+}}{Match 0 or more times. Match as many times as possible
#' when first encountered, do not retry with fewer even if overall match fails
#' (Possessive Match).}
#' \item{\code{++}}{Match 1 or more times. Possessive match.}
#' \item{\code{?+}}{Match zero or one times. Possessive match.}
#' \item{\code{{n}+}}{Match exactly n times.}
#' \item{\code{{n,}+}}{Match at least n times. Possessive Match.}
#' \item{\code{{n,m}+}}{Match between n and m times. Possessive Match.}
#' \item{\code{(...)}}{Capturing parentheses. Range of input that matched
#' the parenthesized sub-expression is available after the match,
#' see \code{\link{stri_match}}.}
#' \item{\code{(?:...)}}{Non-capturing parentheses. Groups the included pattern,
#' but does not provide capturing of matching text. Somewhat more efficient
#' than capturing parentheses.}
#' \item{\code{(?>...)}}{Atomic-match parentheses. The first match of the
#' parenthesized sub-expression is the only one tried; if it does not lead to
#' an overall pattern match, back up the search for a match to a position
#' before the \code{(?>}.}
#' \item{\code{(?#...)}}{Free-format comment \code{(?# comment )}.}
#' \item{\code{(?=...)}}{Look-ahead assertion. True if the parenthesized
#' pattern matches at the current input position, but does not advance
#' the input position.}
#' \item{\code{(?!...)}}{Negative look-ahead assertion. True if the
#' parenthesized pattern does not match at the current input position.
#' Does not advance the input position.}
#' \item{\code{(?<=...)}}{Look-behind assertion. True if the parenthesized
#' pattern matches text preceding the current input position, with the last
#' character of the match being the input character just before the current
#' position. Does not alter the input position. The length of possible strings
#' matched by the look-behind pattern must not be unbounded (no \code{*}
#' or \code{+} operators.)}
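#' \item{\code{(?<!...)}}{Negative look-behind assertion. True if the
#' parenthesized pattern does not match text preceding the current input
#' position, with the last character of the match being the input character
#' just before the current position. Does not alter the input position.
#' The length of possible strings matched by the look-behind pattern must
#' not be unbounded (no \code{*} or \code{+} operators.)}
#' \item{\code{(?<name>...)}}{Named capture group, where \code{name}
#' (enclosed within the angle brackets)
#' is a sequence like \code{[A-Za-z][A-Za-z0-9]*}}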
#' \item{\code{(?ismwx-ismwx:...)}}{Flag settings. Evaluate the parenthesized
#' expression with the specified flags enabled or \code{-}disabled,
#' see also \code{\link{stri_opts_regex}}.}
#' \item{\code{(?ismwx-ismwx)}}{Flag settings. Change the flag settings.
#' Changes apply to the portion of the pattern following the setting.
#' For example, \code{(?i)} changes to a case insensitive match,
#' see also \code{\link{stri_opts_regex}}.}
#' }
#'
#'
#' @section \pkg{ICU} Regex Meta-characters at a Glance:
#'
#' Here is a list of meta-characters provided by the
#' ICU User Guide on regexes.
#'
#' \describe{
#' \item{\code{\\a}}{Match a BELL, \code{\\u0007}.}
#' \item{\code{\\A}}{Match at the beginning of the input. Differs from \code{^}
#' in that \code{\\A} will not match after a new line within the input.}
#' \item{\code{\\b}}{Match if the current position is a word boundary.
#' Boundaries occur at the transitions between word (\code{\\w}) and non-word
#' (\code{\\W}) characters, with combining marks ignored. For better word
#' boundaries, see \pkg{ICU} Boundary Analysis, e.g., \code{\link{stri_extract_all_words}}.}
#' \item{\code{\\B}}{Match if the current position is not a word boundary.}
#' \item{\code{\\cX}}{Match a control-\code{X} character.}
#' \item{\code{\\d}}{Match any character with the Unicode General Category of
#' \code{Nd} (Number, Decimal Digit.).}
#' \item{\code{\\D}}{Match any character that is not a decimal digit.}
#' \item{\code{\\e}}{Match an ESCAPE, \code{\\u001B}.}
#' \item{\code{\\E}}{Terminates a \code{\\Q} ... \code{\\E} quoted sequence.}
#' \item{\code{\\f}}{Match a FORM FEED, \code{\\u000C}.}
#' \item{\code{\\G}}{Match if the current position is at the end of the
#' previous match.}
#' \item{\code{\\h}}{Match a Horizontal White Space character.
#' They are characters with Unicode General Category of Space_Separator plus
#' the ASCII tab, \code{\\u0009}. [Since ICU 55]}
#' \item{\code{\\H}}{Match a non-Horizontal White Space character.
#' [Since ICU 55]}
#' \item{\code{\\k<name>}}{Named Capture Back Reference. [Since ICU 55]}
#' \item{\code{\\n}}{Match a LINE FEED, \code{\\u000A}.}
#' \item{\code{\\N{UNICODE CHARACTER NAME}} }{Match the named character.}
#' \item{\code{\\p{UNICODE PROPERTY NAME}} }{Match any character with the
#' specified Unicode Property.}
#' \item{\code{\\P{UNICODE PROPERTY NAME}} }{Match any character not having
#' the specified Unicode Property.}
#' \item{\code{\\Q}}{Quotes all following characters until \code{\\E}.}
#' \item{\code{\\r}}{Match a CARRIAGE RETURN, \code{\\u000D}.}
#' \item{\code{\\s}}{Match a white space character. White space is defined
#' as \code{[\\t\\n\\f\\r\\p{Z}]}.}
#' \item{\code{\\S}}{Match a non-white space character.}
#' \item{\code{\\t}}{Match a HORIZONTAL TABULATION, \code{\\u0009}.}
#' \item{\code{\\uhhhh}}{Match the character with the hex value \code{hhhh}.}
#' \item{\code{\\Uhhhhhhhh}}{Match the character with the hex value \code{hhhhhhhh}.
#' Exactly eight hex digits must be provided, even though the largest
#' Unicode code point is \code{\\U0010ffff}.}
#' \item{\code{\\w}}{Match a word character. Word characters are
#' \code{[\\p{Alphabetic}\\p{Mark}\\p{Decimal_Number}\\p{Connector_Punctuation}\\u200c\\u200d]}.}
#' \item{\code{\\W}}{Match a non-word character.}
#' \item{\code{\\x{hhhh}} }{Match the character with hex value hhhh.
#' From one to six hex digits may be supplied.}
#' \item{\code{\\xhh}}{Match the character with two digit hex value hh }
#' \item{\code{\\X}}{Match a Grapheme Cluster.}
#' \item{\code{\\Z}}{Match if the current position is at the end of input,
#' but before the final line terminator, if one exists.}
#' \item{\code{\\z}}{Match if the current position is at the end of input.}
#' \item{\code{\\n}}{Back Reference. Match whatever the nth capturing
#' group matched. n must be a number >= 1 and <= total number of capture
#' groups in the pattern.}
#' \item{\code{\\0ooo}}{Match an Octal character. \code{'ooo'} is from one to three
#' octal digits. 0377 is the largest allowed Octal character. The leading
#' zero is required; it distinguishes Octal constants from back references.}
#' \item{\code{[pattern]}}{Match any one character from the set.}
#' \item{\code{.}}{Match any character except for - by default - newline, compare \code{\link{stri_opts_regex}}.}
#' \item{\code{^}}{Match at the beginning of a line.}
#' \item{\code{$}}{Match at the end of a line.}
#' \item{\code{\\}}{[outside of sets] Quotes the following character.
#' Characters that must be quoted to be treated as literals are
#' \code{* ? + [ ( ) { } ^ $ | \\ .}.}
#' \item{\code{\\}}{[inside sets] Quotes the following character.
#' Characters that must be quoted to be treated as literals are
#' \code{[ ] \\}; Characters that may need to be quoted, depending
#' on the context are \code{- &}.}
#' }
#'
#' @section Character Classes:
#'
#' The syntax is similar, but not 100\% compatible with the one
#' described in \link{about_search_charclass}. In particular,
#' whitespaces are not ignored and set-theoretic operations are
#' denoted slightly differently. However, other than this
#' \link{about_search_charclass} is a good reference
#' on the capabilities offered.
#'
#' The ICU User Guide on regexes lists what follows.
#'
#' \describe{
#' \item{\code{[abc]}}{Match any of the characters a, b, or c}
#' \item{\code{[^abc]}}{Negation -- match any character except a, b, or c}
#' \item{\code{[A-M]}}{Range -- match any character from A to M (based on Unicode code point ordering)}
#' \item{\code{[\\p{L}]}, \code{[\\p{Letter}]}, \code{[\\p{General_Category=Letter}]}, \code{[:letter:]}}{Characters with Unicode Category = Letter (4 equivalent forms)}
#' \item{\code{[\\P{Letter}]}}{Negated property -- match everything except Letters}
#' \item{\code{[\\p{numeric_value=9}]}}{Match all numbers with a numeric value of 9}
#' \item{\code{[\\p{Letter}&&\\p{script=cyrillic}]}}{Intersection; match the set of all Cyrillic letters}
#' \item{\code{[\\p{Letter}--\\p{script=latin}]}}{Set difference; match all non-Latin letters}
#' \item{\code{[[a-z][A-Z][0-9]]}, \code{[a-zA-Z0-9]}}{Union; match ASCII letters and digits (2 equivalent forms)}
#' }
#'
#'
#' @section Regex Functions in \pkg{stringi}:
#'
#' Note that if a given regex \code{pattern} is empty,
#' then all the functions in \pkg{stringi} give \code{NA} in result
#' and generate a warning.
#' On a syntax error, a quite informative failure message is shown.
#'
#' If you wish to search for a fixed pattern,
#' refer to \link{about_search_coll} or \link{about_search_fixed}.
#' They allow you to perform a locale-aware text lookup,
#' or a very fast exact-byte search, respectively.
#'
#'
#'
#' @references
#' \emph{Regular expressions} -- ICU User Guide,
#' \url{https://unicode-org.github.io/icu/userguide/strings/regexp.html}
#'
#' J.E.F. Friedl, \emph{Mastering Regular Expressions}, O'Reilly, 2002
#'
#' \emph{Unicode Regular Expressions} -- Unicode Technical Standard #18,
#' \url{https://www.unicode.org/reports/tr18/}
#'
#' \emph{Unicode Regular Expressions} -- Regex tutorial,
#' \url{https://www.regular-expressions.info/unicode.html}
#'
#' @name about_search_regex
#' @rdname about_search_regex
#' @aliases about_search_regex search_regex stringi-search-regex
#' @family search_regex
#' @family stringi_general_topics
invisible(NULL)  # defines nothing; placeholder expression following the roxygen block for topic 'about_search_regex'
#' @title
#' Locale-Insensitive Fixed Pattern Matching in \pkg{stringi}
#'
#' @description
#' String searching facilities described here
#' provide a way to locate a specific sequence of bytes in a string.
#' The search engine's settings may be tuned up (for example
#' to perform case-insensitive search) via a call to the
#' \code{\link{stri_opts_fixed}} function.
#'
#'
#' @section Byte Compare:
#'
#' The fast Knuth-Morris-Pratt search algorithm, with worst time complexity of
#' O(n+p) (\code{n == length(str)}, \code{p == length(pattern)})
#' is implemented (with some tweaks for very short search patterns).
#'
#' Be aware that, for natural language processing,
#' fixed pattern searching might not be what
#' you actually require. It is because a bitwise match will
#' not give correct results in cases of:
#' \enumerate{
#' \item accented letters;
#' \item conjoined letters;
#' \item ignorable punctuation;
#' \item ignorable case,
#' }
#' see also \link{about_search_coll}.
#'
#' Note that the conversion of input data
#' to Unicode is done as usual.
#'
#' @name about_search_fixed
#' @rdname about_search_fixed
#' @aliases about_search_fixed search_fixed stringi-search-fixed
#' @family search_fixed
#' @family stringi_general_topics
invisible(NULL)  # defines nothing; placeholder expression following the roxygen block for topic 'about_search_fixed'
#' @title
#' Locale-Sensitive Text Searching in \pkg{stringi}
#'
#' @description
#' String searching facilities described here
#' provide a way to locate a specific piece of
#' text. Interestingly, locale-sensitive searching, especially
#' on a non-English text, is a much more complex process
#' than it seems at first glance.
#'
#'
#'
#' @section Locale-Aware String Search Engine:
#'
#' All \code{stri_*_coll} functions in \pkg{stringi} use
#' \pkg{ICU}'s \code{StringSearch} engine,
#' which implements a locale-sensitive string search algorithm.
#' The matches are defined by using the notion of ``canonical equivalence''
#' between strings.
#'
#' Tuning the Collator's parameters allows you to perform correct matching
#' that properly takes into account accented letters, conjoined letters,
#' ignorable punctuation and letter case.
#'
#' For more information on \pkg{ICU}'s Collator and the search engine
#' and how to tune it up
#' in \pkg{stringi}, refer to \code{\link{stri_opts_collator}}.
#'
#' Please note that \pkg{ICU}'s \code{StringSearch}-based functions
#' are often much slower than those that perform fixed pattern searches.
#'
#'
#' @references
#' \emph{ICU String Search Service} -- ICU User Guide,
#' \url{https://unicode-org.github.io/icu/userguide/collation/string-search.html}
#'
#' L. Werner, \emph{Efficient Text Searching in Java}, 1999,
#' \url{https://icu-project.org/docs/papers/efficient_text_searching_in_java.html}
#'
#' @name about_search_coll
#' @rdname about_search_coll
#' @aliases about_search_coll search_coll stringi-search-coll
#' @family search_coll
#' @family locale_sensitive
#' @family stringi_general_topics
invisible(NULL)  # defines nothing; placeholder expression following the roxygen block for topic 'about_search_coll'
#' @title Character Classes in \pkg{stringi}
#'
#' @description
#' Here we describe how character classes (sets) can be specified
#' in the \pkg{stringi} package. These are useful for defining
#' search patterns (note that the \pkg{ICU} regex engine uses the same
#' scheme for denoting character classes) or, e.g.,
#' generating random code points with \code{\link{stri_rand_strings}}.
#'
#'
#' @details
#' All \code{stri_*_charclass} functions in \pkg{stringi} perform
#' single-character (i.e., Unicode code point) search-based operations.
#' You may obtain the same results using \link{about_search_regex}.
#' However, these very functions aim to be faster.
#'
#' Character classes are defined using \pkg{ICU}'s \code{UnicodeSet}
#' patterns. Below we briefly summarize their syntax.
#' For more details refer to the bibliographic References below.
#'
#'
#' @section \code{UnicodeSet} patterns:
#'
#' A \code{UnicodeSet} represents a subset of Unicode code points
#' (recall that \pkg{stringi} converts strings in your native encoding
#' to Unicode automatically). Legal code points are U+0000 to U+10FFFF,
#' inclusive.
#'
#' Patterns either consist of series of characters bounded by
#' square brackets
#' (such patterns follow a syntax similar to that employed
#' by regular expression character classes)
#' or of Perl-like Unicode property set specifiers.
#'
#' \code{[]} denotes an empty set, \code{[a]} --
#' a set consisting of character ``a'',
#' \code{[\\u0105]} -- a set with character U+0105,
#' and \code{[abc]} -- a set with ``a'', ``b'', and ``c''.
#'
#' \code{[a-z]} denotes a set consisting of characters
#' ``a'' through ``z'' inclusively, in Unicode code point order.
#'
#' Some set-theoretic operations are available.
#' \code{^} denotes the complement, e.g., \code{[^a-z]} contains
#' all characters but ``a'' through ``z''.
#' Moreover, \code{[[pat1][pat2]]},
#' \code{[[pat1]\&[pat2]]}, and \code{[[pat1]-[pat2]]}
#' denote union, intersection, and asymmetric difference of sets
#' specified by \code{pat1} and \code{pat2}, respectively.
#'
#' Note that all white-spaces are ignored unless they are quoted or back-slashed
#' (white spaces can be freely used for clarity, as \code{[a c d-f m]}
#' means the same as \code{[acd-fm]}).
#' \pkg{stringi} does not allow including multi-character strings
#' (see \code{UnicodeSet} API documentation).
#' Also, empty string patterns are disallowed.
#'
#' Any character may be preceded by
#' a backslash in order to remove its special meaning.
#'
#' A malformed pattern always results in an error.
#'
#' Set expressions at a glance
#' (according to \url{https://unicode-org.github.io/icu/userguide/strings/regexp.html}):
#'
#'
#' Some examples:
#'
#' \describe{
#' \item{\code{[abc]}}{Match any of the characters a, b or c.}
#' \item{\code{[^abc]}}{Negation -- match any character except a, b or c.}
#' \item{\code{[A-M]}}{Range -- match any character from A to M. The characters
#' to include are determined by Unicode code point ordering.}
#' \item{\code{[\\u0000-\\U0010ffff]}}{Range -- match all characters.}
#' \item{\code{[\\p{Letter}]} or \code{[\\p{General_Category=Letter}]} or \code{[\\p{L}]}}{
#' Characters with Unicode Category = Letter. All forms shown are equivalent.}
#' \item{\code{[\\P{Letter}]}}{Negated property
#' (Note the upper case \code{\\P}) -- match everything except Letters.}
#' \item{\code{[\\p{numeric_value=9}]}}{Match all numbers with a numeric value of 9.
#' Any Unicode Property may be used in set expressions.}
#' \item{\code{[\\p{Letter}&\\p{script=cyrillic}]}}{Set
#' intersection -- match the set of all Cyrillic letters.}
#' \item{\code{[\\p{Letter}-\\p{script=latin}]}}{Set difference --
#' match all non-Latin letters.}
#' \item{\code{[[a-z][A-Z][0-9]]} or \code{[a-zA-Z0-9]}}{Implicit union of
#' sets -- match ASCII letters and digits (the two forms are equivalent).}
#' \item{\code{[:script=Greek:]}}{Alternative POSIX-like syntax for properties --
#' equivalent to \code{\\p{script=Greek}}.}
#' }
#'
#' @section Unicode properties:
#'
#' Unicode property sets are specified with a POSIX-like syntax,
#' e.g., \code{[:Letter:]},
#' or with an (extended) Perl-style syntax, e.g., \code{\\p{L}}.
#' The complements of the above sets are
#' \code{[:^Letter:]} and \code{\\P{L}}, respectively.
#'
#' The names are normalized before matching
#' (for example, the match is case-insensitive).
#' Moreover, many names have short aliases.
#'
#' Among predefined Unicode properties we find, e.g.:
#' \itemize{
#' \item Unicode General Categories, e.g., \code{Lu} for uppercase letters,
#' \item Unicode Binary Properties, e.g., \code{WHITE_SPACE},
#' }
#' and many more (including Unicode scripts).
#'
#' Each property provides access to the large and comprehensive
#' Unicode Character Database.
#' Generally, the list of properties available in \pkg{ICU}
#' is not well-documented. Please refer to the References section
#' for some links.
#'
#' Please note that some classes might overlap.
#' However, e.g., General Category \code{Z} (some space) and Binary Property
#' \code{WHITE_SPACE} match different character sets.
#'
#'
#' @section Unicode General Categories:
#'
#' The Unicode General Category property of a code point provides the most
#' general classification of that code point.
#' Each code point falls into one and only one Category.
#'
#' \describe{
#' \item{\code{Cc}}{a C0 or C1 control code.}
#' \item{\code{Cf}}{a format control character.}
#' \item{\code{Cn}}{a reserved unassigned code point or a non-character.}
#' \item{\code{Co}}{a private-use character.}
#' \item{\code{Cs}}{a surrogate code point.}
#' \item{\code{Lc}}{the union of Lu, Ll, Lt.}
#' \item{\code{Ll}}{a lowercase letter.}
#' \item{\code{Lm}}{a modifier letter.}
#' \item{\code{Lo}}{other letters, including syllables and ideographs.}
#' \item{\code{Lt}}{a digraphic character, with the first part uppercase.}
#' \item{\code{Lu}}{an uppercase letter.}
#' \item{\code{Mc}}{a spacing combining mark (positive advance width).}
#' \item{\code{Me}}{an enclosing combining mark.}
#' \item{\code{Mn}}{a non-spacing combining mark (zero advance width).}
#' \item{\code{Nd}}{a decimal digit.}
#' \item{\code{Nl}}{a letter-like numeric character.}
#' \item{\code{No}}{a numeric character of other type.}
#' \item{\code{Pd}}{a dash or hyphen punctuation mark.}
#' \item{\code{Ps}}{an opening punctuation mark (of a pair).}
#' \item{\code{Pe}}{a closing punctuation mark (of a pair).}
#' \item{\code{Pc}}{a connecting punctuation mark, like a tie.}
#' \item{\code{Po}}{a punctuation mark of other type.}
#' \item{\code{Pi}}{an initial quotation mark.}
#' \item{\code{Pf}}{a final quotation mark.}
#' \item{\code{Sm}}{a symbol of mathematical use.}
#' \item{\code{Sc}}{a currency sign.}
#' \item{\code{Sk}}{a non-letter-like modifier symbol.}
#' \item{\code{So}}{a symbol of other type.}
#' \item{\code{Zs}}{a space character (of non-zero width).}
#' \item{\code{Zl}}{U+2028 LINE SEPARATOR only.}
#' \item{\code{Zp}}{U+2029 PARAGRAPH SEPARATOR only.}
#' \item{\code{C} }{the union of Cc, Cf, Cs, Co, Cn.}
#' \item{\code{L} }{the union of Lu, Ll, Lt, Lm, Lo.}
#' \item{\code{M} }{the union of Mn, Mc, Me.}
#' \item{\code{N} }{the union of Nd, Nl, No.}
#' \item{\code{P} }{the union of Pc, Pd, Ps, Pe, Pi, Pf, Po.}
#' \item{\code{S} }{the union of Sm, Sc, Sk, So.}
#' \item{\code{Z} }{the union of Zs, Zl, Zp }
#' }
#'
#' @section Unicode Binary Properties:
#'
#' Each character may follow many Binary Properties at a time.
#'
#' Here is a comprehensive list of supported Binary Properties:
#'
#' \describe{
#' \item{\code{ALPHABETIC} }{alphabetic character.}
#' \item{\code{ASCII_HEX_DIGIT}}{a character matching the \code{[0-9A-Fa-f]} charclass.}
#' \item{\code{BIDI_CONTROL} }{a format control that has specific functions
#' in the Bidi (bidirectional text) Algorithm.}
#' \item{\code{BIDI_MIRRORED} }{a character that may change display in right-to-left text.}
#' \item{\code{DASH} }{a kind of a dash character.}
#' \item{\code{DEFAULT_IGNORABLE_CODE_POINT}}{characters that are ignorable in most
#' text processing activities,
#' e.g., <2060..206F, FFF0..FFFB, E0000..E0FFF>.}
#' \item{\code{DEPRECATED} }{a deprecated character according
#' to the current Unicode standard (the usage of deprecated characters
#' is strongly discouraged).}
#' \item{\code{DIACRITIC} }{a character that linguistically modifies
#' the meaning of another character to which it applies.}
#' \item{\code{EXTENDER} }{a character that extends the value
#' or shape of a preceding alphabetic character,
#' e.g., a length and iteration mark.}
#' \item{\code{HEX_DIGIT} }{a character commonly
#' used for hexadecimal numbers,
#' see also \code{ASCII_HEX_DIGIT}.}
#' \item{\code{HYPHEN}}{a dash used to mark connections between
#' pieces of words, plus the Katakana middle dot.}
#' \item{\code{ID_CONTINUE}}{a character that can continue an identifier,
#' \code{ID_START}+\code{Mn}+\code{Mc}+\code{Nd}+\code{Pc}.}
#' \item{\code{ID_START}}{a character that can start an identifier,
#' \code{Lu}+\code{Ll}+\code{Lt}+\code{Lm}+\code{Lo}+\code{Nl}.}
#' \item{\code{IDEOGRAPHIC}}{a CJKV (Chinese-Japanese-Korean-Vietnamese)
#' ideograph.}
#' \item{\code{LOWERCASE}}{...}
#' \item{\code{MATH}}{...}
#' \item{\code{NONCHARACTER_CODE_POINT}}{...}
#' \item{\code{QUOTATION_MARK}}{...}
#' \item{\code{SOFT_DOTTED}}{a character with a ``soft dot'', like i or j,
#' such that an accent placed on this character causes the dot to disappear.}
#' \item{\code{TERMINAL_PUNCTUATION}}{a punctuation character that generally
#' marks the end of textual units.}
#' \item{\code{UPPERCASE}}{...}
#' \item{\code{WHITE_SPACE}}{a space character or TAB or CR or LF or ZWSP or ZWNBSP.}
#' \item{\code{CASE_SENSITIVE}}{...}
#' \item{\code{POSIX_ALNUM}}{...}
#' \item{\code{POSIX_BLANK}}{...}
#' \item{\code{POSIX_GRAPH}}{...}
#' \item{\code{POSIX_PRINT}}{...}
#' \item{\code{POSIX_XDIGIT}}{...}
#' \item{\code{CASED}}{...}
#' \item{\code{CASE_IGNORABLE}}{...}
#' \item{\code{CHANGES_WHEN_LOWERCASED}}{...}
#' \item{\code{CHANGES_WHEN_UPPERCASED}}{...}
#' \item{\code{CHANGES_WHEN_TITLECASED}}{...}
#' \item{\code{CHANGES_WHEN_CASEFOLDED}}{...}
#' \item{\code{CHANGES_WHEN_CASEMAPPED}}{...}
#' \item{\code{CHANGES_WHEN_NFKC_CASEFOLDED}}{...}
#' \item{\code{EMOJI}}{Since ICU 57}
#' \item{\code{EMOJI_PRESENTATION}}{Since ICU 57}
#' \item{\code{EMOJI_MODIFIER}}{Since ICU 57}
#' \item{\code{EMOJI_MODIFIER_BASE}}{Since ICU 57}
#' }
#'
#'
#' @section POSIX Character Classes:
#'
#' Avoid using POSIX character classes,
#' e.g., \code{[:punct:]}. The ICU User Guide (see below)
#' states that in general they are not well-defined, so you may end up
#' with something different than you expect.
#'
#' In particular, in POSIX-like regex engines, \code{[:punct:]} stands for
#' the character class corresponding to the \code{ispunct()} classification
#' function (check out \code{man 3 ispunct} on UNIX-like systems).
#' According to ISO/IEC 9899:1990 (ISO C90), the \code{ispunct()} function
#' tests for any printing character except for space or a character
#' for which \code{isalnum()} is true. However, in a POSIX setting,
#' the details of what characters belong into which class depend
#' on the current locale. So the \code{[:punct:]} class does not lead
#' to a portable code (again, in POSIX-like regex engines).
#'
#' Therefore, a POSIX flavor of \code{[:punct:]} is more like
#' \code{[\\p{P}\\p{S}]} in \pkg{ICU}. You have been warned.
#'
#'
#' @references
#' \emph{The Unicode Character Database} -- Unicode Standard Annex #44,
#' \url{https://www.unicode.org/reports/tr44/}
#'
#' \emph{UnicodeSet} -- ICU User Guide,
#' \url{https://unicode-org.github.io/icu/userguide/strings/unicodeset.html}
#'
#' \emph{Properties} -- ICU User Guide,
#' \url{https://unicode-org.github.io/icu/userguide/strings/properties.html}
#'
#' \emph{C/POSIX Migration} -- ICU User Guide,
#' \url{https://unicode-org.github.io/icu/userguide/icu/posix.html}
#'
#' \emph{Unicode Script Data}, \url{https://www.unicode.org/Public/UNIDATA/Scripts.txt}
#'
#' \emph{icu::Unicodeset Class Reference} -- ICU4C API Documentation,
#' \url{https://unicode-org.github.io/icu-docs/apidoc/dev/icu4c/classicu_1_1UnicodeSet.html}
#'
#' @name about_search_charclass
#' @rdname about_search_charclass
#' @aliases about_search_charclass search_charclass stringi-search-charclass
#' @family search_charclass
#' @family stringi_general_topics
invisible(NULL)  # defines nothing; placeholder expression following the roxygen block for topic 'about_search_charclass'
#' @title
#' Text Boundary Analysis in \pkg{stringi}
#'
#' @description
#' Text boundary analysis is the process of locating linguistic boundaries
#' while formatting and handling text.
#'
#' @details
#' Examples of the boundary analysis process include:
#'
#' \itemize{
#' \item Locating positions to word-wrap text to fit
#' within specific margins while displaying or printing,
#' see \code{\link{stri_wrap}} and \code{\link{stri_split_boundaries}}.
#' \item Counting characters, words, sentences, or paragraphs,
#' see \code{\link{stri_count_boundaries}}.
#' \item Making a list of the unique words in a document,
#' see \code{\link{stri_extract_all_words}} and then \code{\link{stri_unique}}.
#' \item Capitalizing the first letter of each word
#' or sentence, see also \code{\link{stri_trans_totitle}}.
#' \item Locating a particular unit of the text (for example,
#' finding the third word in the document),
#' see \code{\link{stri_locate_all_boundaries}}.
#' }
#'
#' Generally, text boundary analysis is a locale-dependent operation.
#' For example, in Japanese and Chinese one does not separate words with spaces
#' - a line break can occur even in the middle of a word.
#' These languages have punctuation and diacritical
#' marks that cannot start or end a line, so this must also be taken into account.
#'
#' \pkg{stringi} uses \pkg{ICU}'s \code{BreakIterator} to locate specific
#' text boundaries. Note that the \code{BreakIterator}'s behavior
#' may be controlled in some cases, see \code{\link{stri_opts_brkiter}}.
#' \itemize{
#' \item The \code{character} boundary iterator tries to match what a user
#' would think of as a ``character'' -- a basic unit of a writing system
#' for a language -- which may be more than just a single Unicode code point.
#' \item The \code{word} boundary iterator locates the boundaries
#' of words, for purposes such as ``Find whole words'' operations.
#' \item The \code{line_break} iterator locates positions that would
#' be appropriate to wrap lines when displaying the text.
#' \item The break iterator of type \code{sentence}
#' locates sentence boundaries.
#' }
#'
#' For technical details on different classes of text boundaries refer
#' to the \pkg{ICU} User Guide, see below.
#'
#' @references
#' \emph{Boundary Analysis} -- ICU User Guide,
#' \url{https://unicode-org.github.io/icu/userguide/boundaryanalysis/}
#'
#' @name about_search_boundaries
#' @rdname about_search_boundaries
#' @aliases about_search_boundaries search_boundaries stringi-search-boundaries
#' @family locale_sensitive
#' @family text_boundaries
#' @family stringi_general_topics
invisible(NULL)  # defines nothing; placeholder expression following the roxygen block for topic 'about_search_boundaries'
stringi/R/search_count_bound.R 0000644 0001762 0000144 00000010706 14750110641 016124 0 ustar ligges users # kate: default-dictionary en_US
## This file is part of the 'stringi' package for R.
## Copyright (c) 2013-2025, Marek Gagolewski
## All rights reserved.
##
## Redistribution and use in source and binary forms, with or without
## modification, are permitted provided that the following conditions are met:
##
## 1. Redistributions of source code must retain the above copyright notice,
## this list of conditions and the following disclaimer.
##
## 2. Redistributions in binary form must reproduce the above copyright notice,
## this list of conditions and the following disclaimer in the documentation
## and/or other materials provided with the distribution.
##
## 3. Neither the name of the copyright holder nor the names of its
## contributors may be used to endorse or promote products derived from
## this software without specific prior written permission.
##
## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
## 'AS IS' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING,
## BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
## FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
## HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
## SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
## PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
## OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
## WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
## OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
## EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#' @title
#' Count the Number of Text Boundaries
#'
#' @description
#' These functions determine the number of text boundaries
#' (like character, word, line, or sentence boundaries) in a string.
#'
#' @details
#' Vectorized over \code{str}.
#'
#' For more information on text boundary analysis
#' performed by \pkg{ICU}'s \code{BreakIterator}, see
#' \link{stringi-search-boundaries}.
#'
#' In case of \code{stri_count_words},
#' just like in \code{\link{stri_extract_all_words}} and
#' \code{\link{stri_locate_all_words}},
#' \pkg{ICU}'s word \code{BreakIterator} iterator is used
#' to locate the word boundaries, and all non-word characters
#' (\code{UBRK_WORD_NONE} rule status) are ignored.
#' This function is equivalent to a call to
#' \code{\link{stri_count_boundaries}(str, type='word', skip_word_none=TRUE, locale=locale)}.
#'
#' Note that a \code{BreakIterator} of type \code{character}
#' may be used to count the number of \emph{Unicode characters} in a string.
#' The \code{\link{stri_length}} function,
#' which aims to count the number of \emph{Unicode code points},
#' might report different results.
#'
#' Moreover, a \code{BreakIterator} of type \code{sentence}
#' may be used to count the number of sentences in a text piece.
#'
#'
#' @param str character vector or an object coercible to one
#' @param opts_brkiter a named list with \pkg{ICU} BreakIterator's settings,
#' see \code{\link{stri_opts_brkiter}};
#' \code{NULL} for the default break iterator, i.e., \code{line_break}
#' @param ... additional settings for \code{opts_brkiter}
#' @param locale \code{NULL} or \code{''} for text boundary analysis following
#' the conventions of the default locale, or a single string with
#' locale identifier, see \link{stringi-locale}
#'
#' @return
#' Both functions return an integer vector.
#'
#' @examples
#' test <- 'The\u00a0above-mentioned features are very useful. Spam, spam, eggs, bacon, and spam.'
#' stri_count_boundaries(test, type='word')
#' stri_count_boundaries(test, type='sentence')
#' stri_count_boundaries(test, type='character')
#' stri_count_words(test)
#'
#' test2 <- stri_trans_nfkd('\u03c0\u0153\u0119\u00a9\u00df\u2190\u2193\u2192')
#' stri_count_boundaries(test2, type='character')
#' stri_length(test2)
#' stri_numbytes(test2)
#'
#' @export
#' @family search_count
#' @family locale_sensitive
#' @family text_boundaries
#' @rdname stri_count_boundaries
stri_count_boundaries <- function(str, ..., opts_brkiter = NULL)
{
    # Settings supplied through `...` are combined with any explicitly
    # given `opts_brkiter` and passed as arguments to stri_opts_brkiter();
    # when `...` is empty, `opts_brkiter` is forwarded untouched.
    if (!missing(...)) {
        brkiter_args <- as.list(c(opts_brkiter, ...))
        opts_brkiter <- do.call(stri_opts_brkiter, brkiter_args)
    }

    # The counting itself is delegated to the native routine.
    .Call(C_stri_count_boundaries, str, opts_brkiter)
}
#' @export
#' @rdname stri_count_boundaries
stri_count_words <- function(str, locale = NULL)
{
    # Delegates to stri_count_boundaries() with a break iterator of type
    # "word" and skip_word_none = TRUE; `locale` is forwarded as given.
    stri_count_boundaries(
        str = str,
        opts_brkiter = stri_opts_brkiter(
            type = "word",
            skip_word_none = TRUE,
            locale = locale
        )
    )
}
stringi/R/join.R 0000644 0001762 0000144 00000022175 14750110641 013222 0 ustar ligges users # kate: default-dictionary en_US
## This file is part of the 'stringi' package for R.
## Copyright (c) 2013-2025, Marek Gagolewski
## All rights reserved.
##
## Redistribution and use in source and binary forms, with or without
## modification, are permitted provided that the following conditions are met:
##
## 1. Redistributions of source code must retain the above copyright notice,
## this list of conditions and the following disclaimer.
##
## 2. Redistributions in binary form must reproduce the above copyright notice,
## this list of conditions and the following disclaimer in the documentation
## and/or other materials provided with the distribution.
##
## 3. Neither the name of the copyright holder nor the names of its
## contributors may be used to endorse or promote products derived from
## this software without specific prior written permission.
##
## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
## 'AS IS' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING,
## BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
## FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
## HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
## SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
## PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
## OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
## WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
## OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
## EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#' @title
#' Duplicate Strings
#'
#' @description
#' Duplicates each \code{str}(\code{e1}) string \code{times}(\code{e2}) times
#' and concatenates the results.
#'
#' @details
#' Vectorized over all arguments.
#'
#' \code{e1 \%s*\% e2} and \code{e1 \%stri*\% e2} are synonyms
#' for \code{stri_dup(e1, e2)}
#'
#' @param str,e1 a character vector of strings to be duplicated
#' @param times,e2 an integer vector with the numbers of times to duplicate each string
#'
#' @return Returns a character vector.
#'
#' @export
#' @family join
#' @rdname stri_dup
#' @aliases stri_dup operator_multiply oper_multiply
#' @examples
#' stri_dup('a', 1:5)
#' stri_dup(c('a', NA, 'ba'), 4)
#' stri_dup(c('abc', 'pqrst'), c(4, 2))
#' "a" %s*% 5
stri_dup <- function(str, times)
    .Call(C_stri_dup, str, times)  # both arguments go unchanged to the native routine
#' @usage
#' e1 \%s*\% e2
#' @rdname stri_dup
#' @export
`%s*%` <- function(e1, e2)
    .Call(C_stri_dup, e1, e2)  # same native routine as stri_dup(); e1 = strings, e2 = times
#' @usage
#' e1 \%stri*\% e2
#' @rdname stri_dup
#' @export
`%stri*%` <- `%s*%`  # long-form alias: binds the very same function object as `%s*%`
#' @title
#' Concatenate Two Character Vectors
#'
#' @description
#' Binary operators for joining (concatenating) two character vectors,
#' with a typical R look-and-feel.
#'
#' @details
#' Vectorized over \code{e1} and \code{e2}.
#'
#' These operators act like a call to \code{\link{stri_join}(e1, e2, sep='')}.
#' However, note that joining 3 vectors, e.g., \code{e1 \%s+\% e2 \%s+\% e3}
#' is slower than \code{\link{stri_join}(e1, e2, e3, sep='')},
#' because it creates a new (temporary) result vector each time
#' the operator is applied.
#'
#'
#' @param e1 a character vector or an object coercible to a character vector
#' @param e2 a character vector or an object coercible to a character vector
#'
#' @return Returns a character vector.
#'
#'
#' @examples
#' c('abc', '123', 'xy') %s+% letters[1:6]
#' 'ID_' %s+% 1:5
#'
#' @rdname operator_add
#' @aliases oper_plus operator_add operator_plus
#' @family join
#'
#' @usage
#' e1 \%s+\% e2
#'
#' @export
`%s+%` <- function(e1, e2) {
    # Both operands are passed unchanged to the native routine
    # C_stri_join2.
    .Call(C_stri_join2, e1, e2)
}
#' @usage
#' e1 \%stri+\% e2
#' @rdname operator_add
#' @export
`%stri+%` <- `%s+%`  # alias: the very same function bound to a second name
#' @title
#' Concatenate Character Vectors
#'
#' @description
#' These are the \pkg{stringi}'s equivalents of the built-in
#' \code{\link{paste}} function.
#' \code{stri_c} and \code{stri_paste} are aliases for \code{stri_join}.
#'
#' @details
#' Vectorized over each atomic vector in `\code{...}`.
#'
#' Unless \code{collapse} is \code{NULL}, the result will be a single string.
#' Otherwise, you get a character vector of length equal
#' to the length of the longest argument.
#'
#' If any of the arguments in `\code{...}` is a vector of length 0
#' (not to be confused with vectors of empty strings)
#' and \code{ignore_null} is \code{FALSE}, then
#' you will get a 0-length character vector in result.
#'
#' If \code{collapse} or \code{sep} has length greater than 1,
#' then only the first string will be used.
#'
#' In case where there are missing values in any of the input vectors,
#' \code{NA} is set to the corresponding element.
#' Note that this behavior is different from \code{\link{paste}},
#' which treats missing values as ordinary strings like \code{'NA'}.
#' Moreover, as usual in \pkg{stringi}, the resulting strings are
#' always in UTF-8.
#'
#' @param ... character vectors (or objects coercible to character vectors)
#' whose corresponding elements are to be concatenated
#' @param sep a single string; separates terms
#' @param collapse a single string or \code{NULL}; an optional
#' results separator
#' @param ignore_null a single logical value; if \code{TRUE}, then empty
#' vectors provided via \code{...} are silently ignored
#'
#' @return Returns a character vector.
#'
#' @export
#' @examples
#' stri_join(1:13, letters)
#' stri_join(1:13, letters, sep=',')
#' stri_join(1:13, letters, collapse='; ')
#' stri_join(1:13, letters, sep=',', collapse='; ')
#' stri_join(c('abc', '123', 'xyz'),'###', 1:6, sep=',')
#' stri_join(c('abc', '123', 'xyz'),'###', 1:6, sep=',', collapse='; ')
#'
#' @family join
#' @rdname stri_join
stri_join <- function(..., sep = "", collapse = NULL, ignore_null = FALSE) {
    # The vectors supplied via `...` are bundled into one list so that
    # they reach the native routine C_stri_join as a single argument.
    vectors <- list(...)
    .Call(C_stri_join, vectors, sep, collapse, ignore_null)
}
#' @rdname stri_join
#' @export
stri_c <- stri_join  # alias: the very same function bound to a second name
#' @rdname stri_join
#' @export
stri_paste <- stri_join  # alias: the very same function bound to a second name
#' @title
#' Flatten a String
#'
#' @description
#' Joins the elements of a character vector into one string.
#'
#' @details
#' The \code{stri_flatten(str, collapse='XXX')} call
#' is equivalent to \code{\link{paste}(str, collapse='XXX', sep='')}.
#'
#' If you wish to use some more fancy (e.g., differing)
#' separators between flattened strings,
#' call \code{\link{stri_join}(str, separators, collapse='')}.
#'
#' If \code{str} is not empty, then a single string is returned.
#' If \code{collapse} has length > 1, then only the first string
#' will be used.
#'
#' @param str a vector of strings to be coerced to character
#' @param collapse a single string denoting the separator
#' @param na_empty single logical value; should missing values
#' in \code{str} be treated as empty strings (\code{TRUE})
#' or be omitted altogether (\code{NA})?
#' @param omit_empty single logical value; should empty strings
#' in \code{str} be omitted?
#'
#' @return
#' Returns a single string, i.e., a character
#' vector of length 1.
#'
#' @examples
#' stri_flatten(LETTERS)
#' stri_flatten(LETTERS, collapse=',')
#' stri_flatten(stri_dup(letters[1:6], 1:3))
#' stri_flatten(c(NA, '', 'A', '', 'B', NA, 'C'), collapse=',', na_empty=TRUE, omit_empty=TRUE)
#' stri_flatten(c(NA, '', 'A', '', 'B', NA, 'C'), collapse=',', na_empty=NA)
#'
#' @export
#' @family join
stri_flatten <- function(str, collapse = "", na_empty = FALSE, omit_empty = FALSE) {
    # All four arguments are passed unchanged to the native routine
    # C_stri_flatten.
    .Call(C_stri_flatten, str, collapse, na_empty, omit_empty)
}
#' @title
#' Concatenate Strings in a List
#'
#' @description
#' These functions concatenate all the strings in each character vector
#' in a given list.
#' \code{stri_c_list} and \code{stri_paste_list} are aliases for
#' \code{stri_join_list}.
#'
#' @details
#' Unless \code{collapse} is \code{NULL}, the result will be a single string.
#' Otherwise, you get a character vector of length equal
#' to the length of \code{x}.
#'
#' Vectors in \code{x} of length 0 are silently ignored.
#'
#' If \code{collapse} or \code{sep} has length greater than 1,
#' then only the first string will be used.
#'
#' @param x a list consisting of character vectors
#' @param sep a single string; separates strings in each of the character
#' vectors in \code{x}
#' @param collapse a single string or \code{NULL}; an optional
#' results separator
#'
#' @return Returns a character vector.
#'
#' @export
#' @examples
#' stri_join_list(
#' stri_extract_all_words(c('Lorem ipsum dolor sit amet.',
#' 'Spam spam bacon sausage and spam.')),
#' sep=', ')
#'
#' stri_join_list(
#' stri_extract_all_words(c('Lorem ipsum dolor sit amet.',
#' 'Spam spam bacon sausage and spam.')),
#' sep=', ', collapse='. ')
#'
#' stri_join_list(
#' stri_extract_all_regex(
#' c('spam spam bacon', '123 456', 'spam 789 sausage'), '\\p{L}+'
#' ),
#' sep=',')
#'
#' stri_join_list(
#' stri_extract_all_regex(
#' c('spam spam bacon', '123 456', 'spam 789 sausage'), '\\p{L}+',
#' omit_no_match=TRUE
#' ),
#' sep=',', collapse='; ')
#'
#' @family join
#' @rdname stri_join_list
stri_join_list <- function(x, sep = "", collapse = NULL) {
    # The list and both separators are passed unchanged to the native
    # routine C_stri_join_list.
    .Call(C_stri_join_list, x, sep, collapse)
}
#' @rdname stri_join_list
#' @export
stri_c_list <- stri_join_list  # alias: the very same function bound to a second name
#' @rdname stri_join_list
#' @export
stri_paste_list <- stri_join_list  # alias: the very same function bound to a second name
stringi/R/time_calendar.R 0000644 0001762 0000144 00000024577 14770534711 015074 0 ustar ligges users # kate: default-dictionary en_US
## This file is part of the 'stringi' package for R.
## Copyright (c) 2013-2025, Marek Gagolewski
## All rights reserved.
##
## Redistribution and use in source and binary forms, with or without
## modification, are permitted provided that the following conditions are met:
##
## 1. Redistributions of source code must retain the above copyright notice,
## this list of conditions and the following disclaimer.
##
## 2. Redistributions in binary form must reproduce the above copyright notice,
## this list of conditions and the following disclaimer in the documentation
## and/or other materials provided with the distribution.
##
## 3. Neither the name of the copyright holder nor the names of its
## contributors may be used to endorse or promote products derived from
## this software without specific prior written permission.
##
## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
## 'AS IS' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING,
## BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
## FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
## HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
## SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
## PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
## OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
## WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
## OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
## EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#' @title
#' Create a Date-Time Object
#'
#' @description
#' Constructs date-time objects from numeric representations.
#'
#' @details
#' Vectorized over \code{year}, \code{month}, \code{day}, \code{hour},
#' \code{minute}, and \code{second}.
#'
#' @param year integer vector; 0 is 1BCE, -1 is 2BCE, etc.;
#' \code{NULL} for the current year
#' @param month integer vector; months are 1-based;
#' \code{NULL} for the current month
#' @param day integer vector;
#' \code{NULL} for the current day
#' @param hour integer vector;
#' \code{NULL} for the current hour
#' @param minute integer vector;
#' \code{NULL} for the current minute
#' @param second numeric vector; fractional seconds are allowed;
#' \code{NULL} for the current seconds (without milliseconds)
#' @param tz \code{NULL} or \code{''} for the default time zone or
#' a single string with time zone identifier, see \code{\link{stri_timezone_list}}
#' @param lenient single logical value; should the operation be lenient?
#' @param locale \code{NULL} or \code{''} for default locale,
#' or a single string with locale identifier; a non-Gregorian calendar
#' may be specified by setting \code{@@calendar=name} keyword
#'
#' @return
#' Returns an object of class \code{\link{POSIXct}}.
#'
#' @examples
#' stri_datetime_create(2015, 12, 31, 23, 59, 59.999)
#' stri_datetime_create(5775, 8, 1, locale='@@calendar=hebrew') # 1 Nisan 5775 -> 2015-03-21
#' stri_datetime_create(2015, 02, 29)
#' stri_datetime_create(2015, 02, 29, lenient=TRUE)
#' stri_datetime_create(hour=15, minute=59)
#'
#' @family datetime
#' @export
stri_datetime_create <- function(
    year = NULL, month = NULL, day = NULL,
    hour = 0L, minute = 0L, second = 0,
    lenient = FALSE, tz = NULL, locale = NULL
) {
    # Any field given as NULL is replaced by the corresponding field of
    # the current date-time, as seen in the requested time zone and locale.
    # The current date-time is fetched only if at least one field is NULL.
    given <- list(year, month, day, hour, minute, second)
    if (any(vapply(given, is.null, logical(1)))) {
        current <- stri_datetime_fields(stri_datetime_now(), tz = tz, locale = locale)
        if (is.null(year)) {
            year <- current[["Year"]]
        }
        if (is.null(month)) {
            month <- current[["Month"]]
        }
        if (is.null(day)) {
            day <- current[["Day"]]
        }
        if (is.null(hour)) {
            hour <- current[["Hour"]]
        }
        if (is.null(minute)) {
            minute <- current[["Minute"]]
        }
        if (is.null(second)) {
            # Only the "Second" field is read, not "Millisecond".
            second <- current[["Second"]]
        }
    }

    .Call(
        C_stri_datetime_create,
        year, month, day, hour, minute, second,
        lenient, tz, locale
    )
}
#' @title
#' Get Current Date and Time
#'
#' @description
#' Returns the current date and time.
#'
#' @details
#' The current date and time in \pkg{stringi} is represented as the (signed)
#' number of seconds since 1970-01-01 00:00:00 UTC.
#' UTC leap seconds are ignored.
#'
#' @return
#' Returns an object of class \code{\link{POSIXct}}.
#'
#' @family datetime
#' @export
stri_datetime_now <- function() {
    # Takes no arguments; the value comes straight from the native
    # routine C_stri_datetime_now.
    .Call(C_stri_datetime_now)
}
#' @title
#' Get Values for Date and Time Fields
#'
#' @description
#' Computes and returns values for all date and time fields.
#'
#' @details
#' Vectorized over \code{time}.
#'
#'
#' @param time an object of class \code{\link{POSIXct}}
#' (\code{as.POSIXct} will be called on character vectors
#' and objects of class \code{POSIXlt}, \code{Date}, and \code{factor})
#' @param tz \code{NULL} or \code{''} for the default time zone or
#' a single string with time zone identifier, see \code{\link{stri_timezone_list}}
#' @param locale \code{NULL} or \code{''} for the current default locale,
#' or a single string with a locale identifier; a non-Gregorian calendar
#' may be specified by setting \code{@@calendar=name} keyword
#'
#' @return
#' Returns a data frame with the following columns:
#' \enumerate{
#' \item Year (0 is 1BC, -1 is 2BC, etc.)
#' \item Month (1-based, i.e., 1 stands for the first month, e.g., January;
#' note that the number of months depends on the selected calendar,
#' see \code{\link{stri_datetime_symbols}})
#' \item Day
#' \item Hour (24-h clock)
#' \item Minute
#' \item Second
#' \item Millisecond
#' \item WeekOfYear (this is locale-dependent)
#' \item WeekOfMonth (this is locale-dependent)
#' \item DayOfYear
#' \item DayOfWeek (1-based, 1 denotes Sunday; see \code{\link{stri_datetime_symbols}})
#' \item Hour12 (12-h clock)
#' \item AmPm (see \code{\link{stri_datetime_symbols}})
#' \item Era (see \code{\link{stri_datetime_symbols}})
#' }
#'
#' @examples
#' stri_datetime_fields(stri_datetime_now())
#' stri_datetime_fields(stri_datetime_now(), locale='@@calendar=hebrew')
#' stri_datetime_symbols(locale='@@calendar=hebrew')$Month[
#' stri_datetime_fields(stri_datetime_now(), locale='@@calendar=hebrew')$Month
#' ]
#'
#' @family datetime
#' @export
stri_datetime_fields <- function(time, tz = attr(time, "tzone"), locale = NULL) {
    # Candidate fields that might be added in the future (possibly @TODO):
    #   * TimeZone
    #   * GMT Offset: CAL_ZONE_OFFSET + UCAL_DST_OFFSET
    #   * isDST:      UBool inDaylightTime (UErrorCode &status) const =0
    #   * isWeekend:  virtual UBool isWeekend (void) const
    #
    # Whatever the native routine C_stri_datetime_fields returns is
    # converted to a data frame.
    as.data.frame(.Call(C_stri_datetime_fields, time, tz, locale))
}
#' @title
#' Date and Time Arithmetic
#'
#' @description
#' Modifies a date-time object by adding a specific amount of time units.
#'
#' @details
#' Vectorized over \code{time} and \code{value}.
#'
#'
#' Note that, e.g., January, 31 + 1 month = February, 28 or 29.
#'
#' @param time an object of class \code{\link{POSIXct}}
#' (\code{as.POSIXct} will be called on character vectors
#' and objects of class \code{POSIXlt}, \code{Date}, and \code{factor})
#' @param value integer vector; signed number of units to add to \code{time}
#' @param units single string; one of \code{'years'}, \code{'months'},
#' \code{'weeks'}, \code{'days'}, \code{'hours'}, \code{'minutes'},
#' \code{'seconds'}, or \code{'milliseconds'}
#' @param tz \code{NULL} or \code{''} for the default time zone
#' or a single string with a time zone identifier,
#' see \code{\link{stri_timezone_list}}
#' @param locale \code{NULL} or \code{''} for default locale,
#' or a single string with locale identifier; a non-Gregorian calendar
#' may be specified by setting the \code{@@calendar=name} keyword
#'
#' @return
#' Both functions return an object of class \code{\link{POSIXct}}.
#'
#' The replacement version of \code{stri_datetime_add} modifies
#' the state of the \code{time} object.
#'
#' @references
#' \emph{Calendar Classes} - ICU User Guide,
#' \url{https://unicode-org.github.io/icu/userguide/datetime/calendar/}
#'
#'
#' @examples
#' x <- stri_datetime_now()
#' print(x)
#' stri_datetime_add(x, units='months') <- 2
#' print(x)
#'
#' x <- stri_datetime_create(2025, 4, 20)
#' print(x)
#' stri_datetime_add(x, -2, units='months')
#' stri_datetime_add(x, 1, units='years')
#' stri_datetime_add(x, 1, units='years', locale='@@calendar=hebrew')
#'
#' stri_datetime_add(stri_datetime_create(2024, 1, 31), 1, units='months')
#'
#' @family datetime
#' @rdname stri_datetime_add
#' @export
stri_datetime_add <- function(
    time, value = 1L, units = "seconds",
    tz = NULL, locale = NULL
) {
    # All arguments are passed unchanged, in the order of the formals,
    # to the native routine C_stri_datetime_add.
    .Call(C_stri_datetime_add, time, value, units, tz, locale)
}
#' @rdname stri_datetime_add
#' @export
`stri_datetime_add<-` <- function(
    time, units = "seconds",
    tz = NULL, locale = NULL,
    value
) {
    # Calls the native routine C_stri_datetime_add. `value` is the last
    # formal here, yet it is passed as the second argument of the native
    # call.
    .Call(C_stri_datetime_add, time, value, units, tz, locale)
}
# #' @title
# #' Date-Time Objects in \pkg{stringi}
# #'
# #' @description
# #' Date-time objects' representation in \pkg{stringi} may change
# #' in future versions of the package. This is DRAFT API.
# #'
# #' @details
# #' An object of class \code{\link{POSIXst}},
# #' inherits from (for compatibility with other base R functions)
# #' \code{POSIXct} and \code{POSIX} classes.
# #' In fact, it is a numeric vector representing the (signed) number of seconds
# #' since the UNIX Epoch, i.e., 1970-01-01 00:00:00 UTC.
# #' UTC leap seconds are ignored.
# #'
# #' Thanks to this property, standard comparison operators, e.g., \code{<}, \code{==},
# #' etc. or the \code{sort()} function may be used.
# #'
# #' An object of class \code{\link{POSIXst}} may be equipped with
# #' an attribute called \code{tzone}. Its value is used for date/time
# #' formatting (e.g., when objects are printed in the console),
# #' see \code{\link{format.POSIXst}} and \code{\link{stri_datetime_fields}}.
# #'
# #' @param x ...
# #' @param tz \code{NULL} or \code{''} for the default time zone or
# #' a single string with time zone identifier, see \code{\link{stri_timezone_list}}
# #' @param recursive,... further arguments to be passed to or from other methods.
# #'
# #' @return
# #' \code{as.POSIXst} returns an object of class \code{POSIXst}.
# #'
# #' @export
# #' @rdname as.POSIXst
# #' @family datetime
# #' @aliases as.POSIXst POSIXst
# as.POSIXst <- function(x, tz=attr(time, 'tzone'), ...) {
# # UseMethod('as.POSIXct')
# stop('TO DO')
# }
# #' @export
# #' @rdname as.POSIXst
# c.POSIXst <- function (..., recursive=FALSE) {
# stopifnot(identical(recursive, FALSE))
# .Call(C_stri_c_posixst, list(...))
# }
# TO DO: field difference
stringi/R/trans_normalization.R 0000644 0001762 0000144 00000012752 14750110641 016360 0 ustar ligges users # kate: default-dictionary en_US
## This file is part of the 'stringi' package for R.
## Copyright (c) 2013-2025, Marek Gagolewski
## All rights reserved.
##
## Redistribution and use in source and binary forms, with or without
## modification, are permitted provided that the following conditions are met:
##
## 1. Redistributions of source code must retain the above copyright notice,
## this list of conditions and the following disclaimer.
##
## 2. Redistributions in binary form must reproduce the above copyright notice,
## this list of conditions and the following disclaimer in the documentation
## and/or other materials provided with the distribution.
##
## 3. Neither the name of the copyright holder nor the names of its
## contributors may be used to endorse or promote products derived from
## this software without specific prior written permission.
##
## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
## 'AS IS' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING,
## BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
## FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
## HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
## SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
## PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
## OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
## WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
## OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
## EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#' @title
#' Perform or Check For Unicode Normalization
#'
#' @description
#' These functions convert strings to NFC, NFKC, NFD, NFKD, or NFKC_Casefold
#' Unicode Normalization Form or check whether strings are normalized.
#'
#' @details
#' Unicode Normalization Forms are formally defined normalizations of Unicode
#' strings which, e.g., make it possible to determine whether any two
#' strings are equivalent.
#' Essentially, the Unicode Normalization Algorithm puts all combining
#' marks in a specified order, and uses rules for decomposition
#' and composition to transform each string into one of the
#' Unicode Normalization Forms.
#'
#' The following Normalization Forms (NFs) are supported:
#' \itemize{
#' \item NFC (Canonical Decomposition, followed by Canonical Composition),
#' \item NFD (Canonical Decomposition),
#' \item NFKC (Compatibility Decomposition, followed by Canonical Composition),
#' \item NFKD (Compatibility Decomposition),
#' \item NFKC_Casefold (combination of NFKC, case folding, and removing ignorable
#' characters which was introduced with Unicode 5.2).
#' }
#'
#' Note that many W3C Specifications recommend using NFC for all content,
#' because this form avoids potential interoperability problems arising
#' from the use of canonically equivalent, yet different,
#' character sequences in document formats on the Web.
#' Thus, you will rarely need to use these functions in typical
#' string processing activities. Most often you may assume
#' that a string is in NFC, see RFC5198.
#'
#' As usual in \pkg{stringi},
#' if the input character vector is in the native encoding,
#' it will be automatically converted to UTF-8.
#'
#' For more general text transforms refer to \code{\link{stri_trans_general}}.
#'
#'
#' @param str character vector to be encoded
#'
#' @return The \code{stri_trans_nf*} functions return a character vector
#' of the same length as input (the output is always in UTF-8).
#'
#' \code{stri_trans_isnf*} return a logical vector.
#'
#' @references
#' \emph{Unicode Normalization Forms} -- Unicode Standard Annex #15,
#' \url{https://unicode.org/reports/tr15/}
#'
#' \emph{Unicode Format for Network Interchange}
#' -- RFC5198, \url{https://www.rfc-editor.org/rfc/rfc5198}
#'
#' \emph{Character Model for the World Wide Web 1.0: Normalization}
#' -- W3C Working Draft, \url{https://www.w3.org/TR/charmod-norm/}
#'
#' \emph{Normalization} -- ICU User Guide,
#' \url{https://unicode-org.github.io/icu/userguide/transforms/normalization/}
#' (technical details)
#'
#' \emph{Unicode Equivalence} -- Wikipedia,
#' \url{https://en.wikipedia.org/wiki/Unicode_equivalence}
#'
#' @examples
#' stri_trans_nfd('\u0105') # a with ogonek -> a, ogonek
#' stri_trans_nfkc('\ufdfa') # 1 codepoint -> 18 codepoints
#'
#' @export
#' @rdname stri_trans_nf
#' @family transform
stri_trans_nfc <- function(str) {
    # NFC: Canonical Decomposition, followed by Canonical Composition.
    # `str` is passed unchanged to the native routine C_stri_trans_nfc.
    .Call(C_stri_trans_nfc, str)
}
#' @rdname stri_trans_nf
#' @export
stri_trans_nfd <- function(str) {
    # NFD: Canonical Decomposition.
    # `str` is passed unchanged to the native routine C_stri_trans_nfd.
    .Call(C_stri_trans_nfd, str)
}
#' @rdname stri_trans_nf
#' @export
stri_trans_nfkd <- function(str) {
    # NFKD: Compatibility Decomposition.
    # `str` is passed unchanged to the native routine C_stri_trans_nfkd.
    .Call(C_stri_trans_nfkd, str)
}
#' @rdname stri_trans_nf
#' @export
stri_trans_nfkc <- function(str) {
    # NFKC: Compatibility Decomposition, followed by Canonical Composition.
    # `str` is passed unchanged to the native routine C_stri_trans_nfkc.
    .Call(C_stri_trans_nfkc, str)
}
#' @rdname stri_trans_nf
#' @export
stri_trans_nfkc_casefold <- function(str) {
    # NFKC_Casefold: NFKC combined with case folding and the removal of
    # ignorable characters. `str` is passed unchanged to the native
    # routine C_stri_trans_nfkc_casefold.
    .Call(C_stri_trans_nfkc_casefold, str)
}
#' @rdname stri_trans_nf
#' @export
stri_trans_isnfc <- function(str) {
    # NFC check: `str` is passed unchanged to the native routine
    # C_stri_trans_isnfc.
    .Call(C_stri_trans_isnfc, str)
}
#' @rdname stri_trans_nf
#' @export
stri_trans_isnfd <- function(str) {
    # NFD check: `str` is passed unchanged to the native routine
    # C_stri_trans_isnfd.
    .Call(C_stri_trans_isnfd, str)
}
#' @rdname stri_trans_nf
#' @export
stri_trans_isnfkd <- function(str) {
    # NFKD check: `str` is passed unchanged to the native routine
    # C_stri_trans_isnfkd.
    .Call(C_stri_trans_isnfkd, str)
}
#' @rdname stri_trans_nf
#' @export
stri_trans_isnfkc <- function(str) {
    # NFKC check: `str` is passed unchanged to the native routine
    # C_stri_trans_isnfkc.
    .Call(C_stri_trans_isnfkc, str)
}
#' @rdname stri_trans_nf
#' @export
stri_trans_isnfkc_casefold <- function(str) {
    # NFKC_Casefold check: `str` is passed unchanged to the native routine
    # C_stri_trans_isnfkc_casefold.
    .Call(C_stri_trans_isnfkc_casefold, str)
}
stringi/R/search_count_4.R 0000644 0001762 0000144 00000013040 14750110641 015152 0 ustar ligges users # kate: default-dictionary en_US
## This file is part of the 'stringi' package for R.
## Copyright (c) 2013-2025, Marek Gagolewski
## All rights reserved.
##
## Redistribution and use in source and binary forms, with or without
## modification, are permitted provided that the following conditions are met:
##
## 1. Redistributions of source code must retain the above copyright notice,
## this list of conditions and the following disclaimer.
##
## 2. Redistributions in binary form must reproduce the above copyright notice,
## this list of conditions and the following disclaimer in the documentation
## and/or other materials provided with the distribution.
##
## 3. Neither the name of the copyright holder nor the names of its
## contributors may be used to endorse or promote products derived from
## this software without specific prior written permission.
##
## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
## 'AS IS' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING,
## BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
## FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
## HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
## SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
## PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
## OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
## WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
## OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
## EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#' @title
#' Count the Number of Pattern Occurrences
#'
#' @description
#' These functions count the number of occurrences
#' of a pattern in a string.
#'
#' @details
#' Vectorized over \code{str} and \code{pattern} (with recycling
#' of the elements in the shorter vector if necessary). This allows one to,
#' for instance, search for one pattern in each given string,
#' search for each pattern in one given string,
#' and search for the i-th pattern within the i-th string.
#'
#' If \code{pattern} is empty, then the result is \code{NA}
#' and a warning is generated.
#'
#' \code{stri_count} is a convenience function.
#' It calls either \code{stri_count_regex},
#' \code{stri_count_fixed}, \code{stri_count_coll},
#' or \code{stri_count_charclass}, depending on the argument used.
#'
#' @param str character vector; strings to search in
#' @param pattern,regex,fixed,coll,charclass character vector;
#' search patterns; for more details refer to \link{stringi-search}
#' @param opts_collator,opts_fixed,opts_regex a named list used to tune up
#' the search engine's settings; see
#' \code{\link{stri_opts_collator}}, \code{\link{stri_opts_fixed}},
#' and \code{\link{stri_opts_regex}}, respectively; \code{NULL}
#' for the defaults
#' @param ... supplementary arguments passed to the underlying functions,
#' including additional settings for \code{opts_collator}, \code{opts_regex},
#' \code{opts_fixed}, and so on
#'
#' @return All the functions return an integer vector.
#'
#' @examples
#' s <- 'Lorem ipsum dolor sit amet, consectetur adipisicing elit.'
#' stri_count(s, fixed='dolor')
#' stri_count(s, regex='\\p{L}+')
#'
#' stri_count_fixed(s, ' ')
#' stri_count_fixed(s, 'o')
#' stri_count_fixed(s, 'it')
#' stri_count_fixed(s, letters)
#' stri_count_fixed('babab', 'b')
#' stri_count_fixed(c('stringi', '123'), 'string')
#'
#' stri_count_charclass(c('stRRRingi', 'STrrrINGI', '123'),
#' c('\\p{Ll}', '\\p{Lu}', '\\p{Zs}'))
#' stri_count_charclass(' \t\n', '\\p{WHITE_SPACE}') # white space - binary property
#' stri_count_charclass(' \t\n', '\\p{Z}') # white-space - general category (note the difference)
#'
#' stri_count_regex(s, '(s|el)it')
#' stri_count_regex(s, 'i.i')
#' stri_count_regex(s, '.it')
#' stri_count_regex('bab baab baaab', c('b.*?b', 'b.b'))
#' stri_count_regex(c('stringi', '123'), '^(s|1)')
#'
#' @family search_count
#' @export
#' @rdname stri_count
stri_count <- function(str, ..., regex, fixed, coll, charclass) {
    # Record which of the four mutually exclusive pattern arguments
    # the caller actually supplied.
    supplied <- c(
        regex     = !missing(regex),
        fixed     = !missing(fixed),
        coll      = !missing(coll),
        charclass = !missing(charclass)
    )

    # Exactly one search engine must be selected.
    if (sum(supplied) != 1) {
        stop("you have to specify either `regex`, `fixed`, `coll`, or `charclass`")
    }

    # Dispatch on the name of the single supplied argument; `...` is
    # passed on to the chosen function.
    switch(names(which(supplied)),
        regex     = stri_count_regex(str, regex, ...),
        fixed     = stri_count_fixed(str, fixed, ...),
        coll      = stri_count_coll(str, coll, ...),
        charclass = stri_count_charclass(str, charclass, ...)
    )
}
#' @export
#' @rdname stri_count
stri_count_charclass <- function(str, pattern)
{
    # No options are accepted here: both arguments are passed unchanged
    # to the native routine C_stri_count_charclass.
    .Call(C_stri_count_charclass, str, pattern)
}
#' @export
#' @rdname stri_count
stri_count_coll <- function(str, pattern, ..., opts_collator = NULL) {
    # Settings supplied via `...` are combined with `opts_collator` and
    # passed through stri_opts_collator(). Without any, `opts_collator`
    # reaches the native routine exactly as given.
    if (!missing(...)) {
        settings <- as.list(c(opts_collator, ...))
        opts_collator <- do.call(stri_opts_collator, settings)
    }
    .Call(C_stri_count_coll, str, pattern, opts_collator)
}
#' @export
#' @rdname stri_count
stri_count_fixed <- function(str, pattern, ..., opts_fixed = NULL) {
    # Settings supplied via `...` are combined with `opts_fixed` and
    # passed through stri_opts_fixed(). Without any, `opts_fixed`
    # reaches the native routine exactly as given.
    if (!missing(...)) {
        settings <- as.list(c(opts_fixed, ...))
        opts_fixed <- do.call(stri_opts_fixed, settings)
    }
    .Call(C_stri_count_fixed, str, pattern, opts_fixed)
}
#' @export
#' @rdname stri_count
stri_count_regex <- function(str, pattern, ..., opts_regex = NULL) {
    # Settings supplied via `...` are combined with `opts_regex` and
    # passed through stri_opts_regex(). Without any, `opts_regex`
    # reaches the native routine exactly as given.
    if (!missing(...)) {
        settings <- as.list(c(opts_regex, ...))
        opts_regex <- do.call(stri_opts_regex, settings)
    }
    .Call(C_stri_count_regex, str, pattern, opts_regex)
}
stringi/R/encoding_conversion.R 0000644 0001762 0000144 00000026255 14770472035 016333 0 ustar ligges users # kate: default-dictionary en_US
## This file is part of the 'stringi' package for R.
## Copyright (c) 2013-2025, Marek Gagolewski
## All rights reserved.
##
## Redistribution and use in source and binary forms, with or without
## modification, are permitted provided that the following conditions are met:
##
## 1. Redistributions of source code must retain the above copyright notice,
## this list of conditions and the following disclaimer.
##
## 2. Redistributions in binary form must reproduce the above copyright notice,
## this list of conditions and the following disclaimer in the documentation
## and/or other materials provided with the distribution.
##
## 3. Neither the name of the copyright holder nor the names of its
## contributors may be used to endorse or promote products derived from
## this software without specific prior written permission.
##
## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
## 'AS IS' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING,
## BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
## FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
## HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
## SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
## PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
## OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
## WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
## OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
## EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#' @title
#' Convert Strings Between Given Encodings
#'
#' @description
#' These functions convert strings between encodings.
#' They aim to serve as a more portable and faster replacement
#' for \R's own \code{\link{iconv}}.
#'
#' @details
#' \code{stri_conv} is an alias for \code{stri_encode}.
#'
#' Refer to \code{\link{stri_enc_list}} for the list
#' of supported encodings and \link{stringi-encoding}
#' for a general discussion.
#'
#' If \code{from} is either missing, \code{''}, or \code{NULL},
#' and if \code{str} is a character vector
#' then the marked encodings are used
#' (see \code{\link{stri_enc_mark}}) -- in such a case \code{bytes}-declared
#' strings are disallowed.
#' Otherwise, i.e., if \code{str} is a \code{raw}-type vector
#' or a list of raw vectors,
#' we assume that the input encoding is the current default encoding
#' as given by \code{\link{stri_enc_get}}.
#'
#' However, if \code{from} is given explicitly,
#' the internal encoding declarations are always ignored.
#'
#' For \code{to_raw=FALSE}, the output
#' strings always have the encodings marked according to the target converter
#' used (as specified by \code{to}) and the current default Encoding
#' (\code{ASCII}, \code{latin1}, \code{UTF-8}, \code{native},
#' or \code{bytes} in all other cases).
#'
#'
#' Note that some issues might occur if \code{to} indicates, e.g.,
#' UTF-16 or UTF-32, as the output strings may have embedded NULs.
#' In such cases, please use \code{to_raw=TRUE} and consider
#' specifying a byte order marker (BOM) for portability reasons
#' (e.g., set \code{UTF-16} or \code{UTF-32} which automatically
#' adds the BOMs).
#'
#' Note that \code{stri_encode(as.raw(data), 'encodingname')}
#' is a clever substitute for \code{\link{rawToChar}}.
#'
#' In the current version of \pkg{stringi}, if an incorrect code point is found
#' on input, it is replaced with the default (for that target encoding)
#' 'missing/erroneous' character (with a warning), e.g.,
#' the SUBSTITUTE character (U+001A) or the REPLACEMENT one (U+FFFD).
#' Occurrences thereof can be located in the output string to diagnose
#' the problematic sequences, e.g., by calling:
#' \code{stri_locate_all_regex(converted_string, '[\\ufffd\\u001a]')}.
#'
#' Because of the way this function is currently implemented,
#' maximal size of a single string to be converted cannot exceed ~0.67 GB.
#'
#'
#' @param str a character vector, a raw vector, or
#' a list of \code{raw} vectors to be converted
#' @param from input encoding:
#' \code{NULL} or \code{''} for the default encoding
#' or internal encoding marks' usage (see Details);
#' otherwise, a single string with encoding name,
#' see \code{\link{stri_enc_list}}
#' @param to target encoding:
#' \code{NULL} or \code{''} for default encoding
#' (see \code{\link{stri_enc_get}}),
#' or a single string with encoding name
#' @param to_raw a single logical value; indicates whether a list of raw vectors
#' rather than a character vector should be returned
#'
#' @return If \code{to_raw} is \code{FALSE},
#' then a character vector with encoded strings (and appropriate
#' encoding marks) is returned.
#' Otherwise, a list of vectors of type raw is produced.
#'
#' @references
#' \emph{Conversion} -- ICU User Guide,
#' \url{https://unicode-org.github.io/icu/userguide/conversion/}
#'
#' @family encoding_conversion
#' @rdname stri_encode
#' @export
stri_encode <- function(str, from = NULL, to = NULL, to_raw = FALSE) {
    # All four arguments are forwarded unchanged to the native routine.
    .Call(C_stri_encode, str, from, to, to_raw)
}
#' @rdname stri_encode
#' @export
stri_conv <- stri_encode  # alias: binds the same function object as stri_encode
#' @title
#' Convert Strings To UTF-32
#'
#' @description
#' UTF-32 is a 32-bit encoding where each Unicode code point
#' corresponds to exactly one integer value.
#' This function converts a character vector to a list
#' of integer vectors so that, e.g.,
#' individual code points may be easily accessed, changed, etc.
#'
#' @details
#' See \code{\link{stri_enc_fromutf32}} for a dual operation.
#'
#' This function is roughly equivalent to a vectorized call
#' to \code{\link{utf8ToInt}(enc2utf8(str))}.
#' If you want a list of raw vectors on output,
#' use \code{\link{stri_encode}}.
#'
#' Unlike \code{utf8ToInt}, if ill-formed UTF-8 byte sequences are detected,
#' a corresponding element is set to NULL and a warning is generated.
#' To deal with such issues, use, e.g., \code{\link{stri_enc_toutf8}}.
#'
#' @param str a character vector (or an object coercible to)
#' to be converted
#' @return Returns a list of integer vectors.
#' Missing values are converted to \code{NULL}s.
#'
#' @family encoding_conversion
#' @export
stri_enc_toutf32 <- function(str) {
    # The input is forwarded unchanged to the native routine.
    .Call(C_stri_enc_toutf32, str)
}
#' @title
#' Convert From UTF-32
#'
#' @description
#' This function converts integer vectors,
#' representing sequences of UTF-32 code points, to UTF-8 strings.
#'
#' @details
#' UTF-32 is a 32-bit encoding where each Unicode code point
#' corresponds to exactly one integer value.
#'
#' This function is a vectorized version of
#' \code{\link{intToUtf8}}. As usual in \pkg{stringi},
#' it returns character strings in UTF-8.
#' See \code{\link{stri_enc_toutf32}} for a dual operation.
#'
#' If an ill-defined code point is given, a warning is generated
#' and the corresponding string is set to \code{NA}.
#' Note that \code{0}s are not allowed in \code{vec}, as they are used
#' internally to mark the end of a string (in the C API).
#'
#'
#' See also \code{\link{stri_encode}} for decoding arbitrary byte sequences
#' from any given encoding.
#'
#'
#' @param vec a list of integer vectors (or objects coercible to such vectors)
#' or \code{NULL}s. For convenience, a single integer vector can also
#' be given.
#' @return Returns a character vector (in UTF-8).
#' \code{NULL}s in the input list are converted to \code{NA_character_}.
#'
#' @family encoding_conversion
#' @export
stri_enc_fromutf32 <- function(vec) {
    # The input is forwarded unchanged to the native routine.
    .Call(C_stri_enc_fromutf32, vec)
}
#' @title
#' Convert Strings To UTF-8
#'
#' @description
#' Converts character strings with declared (marked) encodings
#' to UTF-8 strings.
#'
#' @details
#' If \code{is_unknown_8bit} is set to \code{FALSE} (the default),
#' then R encoding marks are used, see \code{\link{stri_enc_mark}}.
#' Bytes-marked strings will cause the function to fail.
#'
#' If a string is in UTF-8 and has a byte order mark (BOM),
#' then the BOM will be silently removed from the output string.
#'
#' If the default encoding is UTF-8, see \code{\link{stri_enc_get}},
#' then strings marked with \code{native} are -- for efficiency reasons --
#' returned as-is, i.e., with unchanged markings.
#' A similar behavior is observed when calling \code{\link{enc2utf8}}.
#'
#' For \code{is_unknown_8bit=TRUE}, if a string is declared to be neither
#' in ASCII nor in UTF-8, then all byte codes > 127 are replaced with
#' the Unicode REPLACEMENT CHARACTER (\\Ufffd).
#' Note that the REPLACEMENT CHARACTER may be interpreted as Unicode
#' missing value for single characters.
#' Here a \code{bytes}-marked string is assumed to use an 8-bit encoding
#' that extends the ASCII map.
#'
#' What is more, setting \code{validate} to \code{TRUE}
#' or \code{NA} in both cases validates the resulting UTF-8 byte stream.
#' If \code{validate=TRUE}, then
#' in case of any incorrect byte sequences, they will be
#' replaced with the REPLACEMENT CHARACTER.
#' This option may be used in a case
#' where you want to fix an invalid UTF-8 byte sequence.
#' For \code{NA}, a bogus string will be replaced with a missing value.
#'
#' @param str a character vector to be converted
#' @param is_unknown_8bit a single logical value, see Details
#' @param validate a single logical value (can be \code{NA}), see Details
#' @return Returns a character vector.
#'
#' @family encoding_conversion
#' @export
stri_enc_toutf8 <- function(str, is_unknown_8bit = FALSE, validate = FALSE) {
    # Both flags are forwarded as given, together with the input vector,
    # to the native routine.
    .Call(C_stri_enc_toutf8, str, is_unknown_8bit, validate)
}
#' @title
#' Convert Strings To Native Encoding
#'
#' @description
#' Converts character strings with declared encodings
#' to the current native encoding.
#'
#' @details
#' This function just calls \code{\link{stri_encode}(str, NULL, NULL)}.
#' The current native encoding can be read with \code{\link{stri_enc_get}}.
#' Character strings declared to be in \code{bytes} encoding will fail here.
#'
#' Note that if working in a UTF-8 environment,
#' resulting strings will be marked with \code{UTF-8}
#' and not \code{native}, see \code{\link{stri_enc_mark}}.
#'
#' @param str a character vector to be converted
#' @return Returns a character vector.
#'
#' @family encoding_conversion
#' @export
stri_enc_tonative <- function(str) {
    # NULL for both `from` and `to` is what stri_encode() receives here;
    # no other processing is done at this level.
    stri_encode(str, from = NULL, to = NULL)
}
#' @title
#' Convert To ASCII
#'
#' @description
#' This function converts input strings to ASCII,
#' i.e., to character strings consisting of bytes not greater than 127.
#'
#' @details
#' All code points greater than 127 are replaced with the ASCII SUBSTITUTE
#' CHARACTER (0x1A).
#' \R encoding declarations are always used to determine
#' which encoding is assumed for each input, see \code{\link{stri_enc_mark}}.
#' If ill-formed byte sequences are found in UTF-8 byte
#' streams, a warning is generated.
#'
#' A \code{bytes}-marked string is assumed to be in an 8-bit encoding
#' extending the ASCII map (a common assumption in \R itself).
#'
#' Note that the SUBSTITUTE CHARACTER (\code{\\x1a == \\032}) may be interpreted
#' as the ASCII missing value for single characters.
#'
#' @param str a character vector to be converted
#' @return Returns a character vector.
#'
#' @family encoding_conversion
#' @export
stri_enc_toascii <- function(str) {
    # The input is forwarded unchanged to the native routine.
    .Call(C_stri_enc_toascii, str)
}
stringi/R/trans_casemap.R 0000644 0001762 0000144 00000012004 14750110641 015071 0 ustar ligges users # kate: default-dictionary en_US
## This file is part of the 'stringi' package for R.
## Copyright (c) 2013-2025, Marek Gagolewski
## All rights reserved.
##
## Redistribution and use in source and binary forms, with or without
## modification, are permitted provided that the following conditions are met:
##
## 1. Redistributions of source code must retain the above copyright notice,
## this list of conditions and the following disclaimer.
##
## 2. Redistributions in binary form must reproduce the above copyright notice,
## this list of conditions and the following disclaimer in the documentation
## and/or other materials provided with the distribution.
##
## 3. Neither the name of the copyright holder nor the names of its
## contributors may be used to endorse or promote products derived from
## this software without specific prior written permission.
##
## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
## 'AS IS' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING,
## BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
## FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
## HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
## SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
## PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
## OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
## WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
## OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
## EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#' @title
#' Transform Strings with Case Mapping or Folding
#'
#' @description
#' These functions transform strings either to lower case,
#' UPPER CASE, or Title Case or perform case folding.
#'
#' @details
#' Vectorized over \code{str}.
#'
#' \pkg{ICU} implements full Unicode string case mappings. It is
#' worth noting that, generally, case mapping:
#' \itemize{
#' \item can change the number of code points and/or code units
#' of a string,
#' \item is language-sensitive (results may differ depending on the locale), and
#' \item is context-sensitive (a character in the input string may map
#' differently depending on surrounding characters).
#' }
#'
#' With \code{stri_trans_totitle}, if \code{word} \code{BreakIterator}
#' is used (the default), then the first letter of each word will be capitalized
#' and the rest will be transformed to lower case.
#' With the break iterator of type \code{sentence}, the first letter
#' of each sentence will be capitalized only.
#' Note that according to the \pkg{ICU} User Guide,
#' the string \code{'one. two. three.'} consists of one sentence.
#'
#' Case folding, on the other hand, is locale-independent.
#' Its purpose is to make two pieces of text that differ only in case identical.
#' This may come in handy when comparing strings.
#'
#' For more general (but not locale dependent)
#' text transforms refer to \code{\link{stri_trans_general}}.
#'
#' @param str character vector
#' @param locale \code{NULL} or \code{''} for case mapping following
#' the conventions of the default locale, or a single string with
#' locale identifier, see \link{stringi-locale}.
#' @param opts_brkiter a named list with \pkg{ICU} BreakIterator's settings,
#' see \code{\link{stri_opts_brkiter}};
#' \code{NULL} for default break iterator, i.e., \code{word};
#' \code{stri_trans_totitle} only
#' @param ... additional settings for \code{opts_brkiter}
#'
#' @return
#' Each function returns a character vector.
#'
#' @references
#' \emph{Case Mappings} -- ICU User Guide,
#' \url{https://unicode-org.github.io/icu/userguide/transforms/casemappings.html}
#'
#' @family locale_sensitive
#' @family transform
#' @export
#' @rdname stri_trans_casemap
#' @family text_boundaries
#'
#' @examples
#' stri_trans_toupper('\u00DF', 'de_DE') # small German Eszett / scharfes S
#' stri_cmp_eq(stri_trans_toupper('i', 'en_US'), stri_trans_toupper('i', 'tr_TR'))
#' stri_trans_toupper(c('abc', '123', '\u0105\u0104'))
#' stri_trans_tolower(c('AbC', '123', '\u0105\u0104'))
#' stri_trans_totitle(c('AbC', '123', '\u0105\u0104'))
#' stri_trans_casefold(c('AbC', '123', '\u0105\u0104'))
#' stri_trans_totitle('stringi is a FREE R pAcKaGe. WItH NO StrinGS attached.') # word boundary
#' stri_trans_totitle('stringi is a FREE R pAcKaGe. WItH NO StrinGS attached.', type='sentence')
stri_trans_tolower <- function(str, locale = NULL) {
    # `str` and `locale` are forwarded unchanged to the native routine.
    .Call(C_stri_trans_tolower, str, locale)
}
#' @export
#' @rdname stri_trans_casemap
stri_trans_toupper <- function(str, locale = NULL) {
    # `str` and `locale` are forwarded unchanged to the native routine.
    .Call(C_stri_trans_toupper, str, locale)
}
#' @export
#' @rdname stri_trans_casemap
stri_trans_casefold <- function(str) {
    # No locale is passed here; only `str` goes to the native routine.
    .Call(C_stri_trans_casefold, str)
}
#' @export
#' @rdname stri_trans_casemap
stri_trans_totitle <- function(str, ..., opts_brkiter = NULL) {
    # Settings supplied through `...` are combined with opts_brkiter and
    # turned into the final option list by stri_opts_brkiter().
    if (!missing(...)) {
        brkiter_args <- as.list(c(opts_brkiter, ...))
        opts_brkiter <- do.call(stri_opts_brkiter, brkiter_args)
    }

    .Call(C_stri_trans_totitle, str, opts_brkiter)
}
stringi/R/trans_transliterate.R 0000644 0001762 0000144 00000013753 14750110641 016355 0 ustar ligges users # kate: default-dictionary en_US
## This file is part of the 'stringi' package for R.
## Copyright (c) 2013-2025, Marek Gagolewski
## All rights reserved.
##
## Redistribution and use in source and binary forms, with or without
## modification, are permitted provided that the following conditions are met:
##
## 1. Redistributions of source code must retain the above copyright notice,
## this list of conditions and the following disclaimer.
##
## 2. Redistributions in binary form must reproduce the above copyright notice,
## this list of conditions and the following disclaimer in the documentation
## and/or other materials provided with the distribution.
##
## 3. Neither the name of the copyright holder nor the names of its
## contributors may be used to endorse or promote products derived from
## this software without specific prior written permission.
##
## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
## 'AS IS' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING,
## BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
## FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
## HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
## SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
## PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
## OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
## WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
## OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
## EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#' @title
#' General Text Transforms, Including Transliteration
#'
#' @description
#' \pkg{ICU} General transforms provide different ways
#' for processing Unicode text. They are useful in handling a variety
#' of different tasks, including:
#' \itemize{
#' \item locale-independent upper case, lower case, title case,
#' full/halfwidth conversions,
#' \item normalization,
#' \item hex and character name conversions,
#' \item script to script conversion/transliteration.
#' }
#'
#'
#' @details
#' \pkg{ICU} Transforms were mainly designed to transliterate characters
#' from one script to another (for example, from Greek to Latin,
#' or Japanese Katakana to Latin).
#' However, these services are also capable of handling a much
#' broader range of tasks.
#' In particular, the Transforms include prebuilt transformations
#' for case conversions, for normalization conversions, for the removal
#' of given characters, and also for a variety of language and script
#' transliterations. Transforms can be chained together to perform
#' a series of operations and each step of the process can use a
#' UnicodeSet to restrict the characters that are affected.
#'
#' To get the list of available transforms,
#' call \code{\link{stri_trans_list}}.
#'
#' Note that transliterators are often combined in sequence
#' to achieve a desired transformation.
#' This is analogous to the composition of mathematical functions.
#' For example, given a script that converts lowercase ASCII characters
#' from Latin script to Katakana script, it is convenient to first
#' (1) separate input base characters and accents, and then (2)
#' convert uppercase to lowercase.
#' To achieve this, a compound transform can be specified as follows:
#' \code{NFKD; Lower; Latin-Katakana;} (with the default \code{rules=FALSE}).
#'
#' Custom rule-based transliteration is also supported, see the \pkg{ICU}
#' manual and below for some examples.
#'
#' Transliteration is not dependent on the current locale.
#'
#' @param str character vector
#' @param id a single string with transform identifier,
#' see \code{\link{stri_trans_list}}, or custom transliteration rules
#' @param rules if \code{TRUE}, treat \code{id} as a string with
#' semicolon-separated transliteration rules (see the \pkg{ICU} manual)
#' @param forward transliteration direction (\code{TRUE} for forward,
#' \code{FALSE} for reverse)
#'
#' @return
#' Returns a character vector.
#'
#' @examples
#' stri_trans_general('gro\u00df', 'latin-ascii')
#' stri_trans_general('stringi', 'latin-greek')
#' stri_trans_general('stringi', 'latin-cyrillic')
#' stri_trans_general('stringi', 'upper') # see stri_trans_toupper
#' stri_trans_general('\u0104', 'nfd; lower') # compound id; see stri_trans_nfd
#' stri_trans_general('Marek G\u0105golewski', 'pl-pl_FONIPA')
#' stri_trans_general('\u2620', 'any-name') # character name
#' stri_trans_general('\\N{latin small letter a}', 'name-any') # decode name
#' stri_trans_general('\u2620', 'hex/c') # to hex
#' stri_trans_general("\u201C\u2026\u201D \u0105\u015B\u0107\u017C",
#' "NFKD; NFC; [^\\p{L}] latin-ascii")
#'
#' x <- "\uC885\uB85C\uAD6C \uC0AC\uC9C1\uB3D9"
#' stringi::stri_trans_general(x, "Hangul-Latin")
#' # Deviate from the ICU rules of romanisation of Korean,
#' # see https://en.wikipedia.org/wiki/Romanization_of_Korean
#' id <- "
#' :: NFD;
#' \u11A8 > k;
#' \u11AE > t;
#' \u11B8 > p;
#' \u1105 > r;
#' :: Hangul-Latin;
#' "
#' stringi::stri_trans_general(x, id, rules=TRUE)
#'
#'
#' @references
#' \emph{General Transforms} -- ICU User Guide,
#' \url{https://unicode-org.github.io/icu/userguide/transforms/general/}
#'
#' @family transform
#' @export
stri_trans_general <- function(str, id, rules = FALSE, forward = TRUE) {
    # All four arguments are forwarded unchanged to the native routine.
    .Call(C_stri_trans_general, str, id, rules, forward)
}
#' @title
#' List Available Text Transforms and Transliterators
#'
#' @description
#' Returns a list of available text transform identifiers.
#' Each of them may be used in \code{\link{stri_trans_general}}
#' tasks.
#'
#' @return Returns a character vector.
#'
#' @references
#' \emph{General Transforms} -- ICU User Guide,
#' \url{https://unicode-org.github.io/icu/userguide/transforms/general/}
#'
#' @examples
#' stri_trans_list()
#'
#' @family transform
#' @export
stri_trans_list <- function() {
    # Fetch the identifiers from the native routine, then sort them with
    # fixed collator settings (en_US, numeric ordering, strength 1) so that
    # the ordering does not depend on the default locale.
    transform_ids <- .Call(C_stri_trans_list)
    stri_sort(transform_ids, locale = "en_US", numeric = TRUE, strength = 1)
}
stringi/R/locale.R 0000644 0001762 0000144 00000014265 14750110641 013523 0 ustar ligges users # kate: default-dictionary en_US
## This file is part of the 'stringi' package for R.
## Copyright (c) 2013-2025, Marek Gagolewski
## All rights reserved.
##
## Redistribution and use in source and binary forms, with or without
## modification, are permitted provided that the following conditions are met:
##
## 1. Redistributions of source code must retain the above copyright notice,
## this list of conditions and the following disclaimer.
##
## 2. Redistributions in binary form must reproduce the above copyright notice,
## this list of conditions and the following disclaimer in the documentation
## and/or other materials provided with the distribution.
##
## 3. Neither the name of the copyright holder nor the names of its
## contributors may be used to endorse or promote products derived from
## this software without specific prior written permission.
##
## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
## 'AS IS' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING,
## BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
## FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
## HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
## SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
## PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
## OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
## WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
## OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
## EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#' @title
#' Locales and \pkg{stringi}
#'
#' @description
#' In this section we explain how we specify locales in \pkg{stringi}.
#' Locale is a fundamental concept in \pkg{ICU}.
#' It identifies a specific user community, i.e., a group of users
#' who have similar culture and language expectations
#' for human-computer interaction.
#'
#'
#' @details
#' Because a locale is just an identifier of a region,
#' no validity check is performed when you specify a Locale.
#' \pkg{ICU} is implemented as a set of services.
#' If you want to verify whether particular resources are available
#' in the locale you asked for, you must query those resources.
#' Note: when you ask for a resource for a particular locale, you get back
#' the best available match, not necessarily precisely the one you requested.
#'
#' @section Locale Identifiers:
#'
#' \pkg{ICU} services are parametrized by locale,
#' to deliver culturally correct results.
#' Locales are identified by character strings
#' of the form \code{Language} code,
#' \code{Language_Country} code, or \code{Language_Country_Variant}
#' code, e.g., 'en_US'.
#'
#' The two-letter \code{Language} code uses the ISO-639-1 standard,
#' e.g., 'en' stands for English, 'pl' -- Polish, 'fr' -- French,
#' and 'de' for German.
#'
#' \code{Country} is a two-letter code following the ISO-3166 standard.
#' This is to reflect different language conventions within the same language,
#' for example in US-English ('en_US') and Australian-English ('en_AU').
#'
#' Differences may also appear in language conventions used within
#' the same country. For example, the Euro currency may be used in several European
#' countries while the individual country's currency is still in circulation.
#' In such a case, \pkg{ICU} \code{Variant} '_EURO' could be used for selecting
#' locales that support the Euro currency.
#'
#' The final (optional) element of a locale is a list of
#' keywords together with their values. Keywords must be unique.
#' Their order is not significant. Unknown keywords are ignored.
#' The handling of keywords depends on the specific services that
#' utilize them. Currently, the following keywords are recognized:
#' \code{calendar}, \code{collation}, \code{currency}, and \code{numbers},
#' e.g., \code{fr@@collation=phonebook;}\code{calendar=islamic-civil} is a valid
#' French locale specifier together with keyword arguments. For
#' more information, refer to the ICU user guide.
#'
#' For a list of locales that are recognized by \pkg{ICU},
#' call \code{\link{stri_locale_list}}.
#'
#' Note that in \pkg{stringi}, 'C' is a synonym of `en_US_POSIX`.
#'
#'
#' @section A Note on Default Locales:
#'
#' Each locale-sensitive function in \pkg{stringi}
#' selects the current default locale if an empty string or \code{NULL}
#' is provided as its \code{locale} argument. Default locales are available
#' to all the functions; initially, the system locale on that platform is used,
#' but it may be changed by calling \code{\link{stri_locale_set}}.
#'
#' Your program should avoid changing the default locale.
#' All locale-sensitive functions may request
#' any desired locale per-call (by specifying the \code{locale} argument),
#' i.e., without referring to the default locale.
#' During many tests, however, we did not observe any improper
#' behavior of \pkg{stringi} while using a modified default locale.
#'
#'
#'
#'
#' @section Locale-Sensitive Functions in \pkg{stringi}:
#'
#' One of many examples of locale-dependent services is the Collator, which
#' performs a locale-aware string comparison. It is used for string comparing,
#' ordering, sorting, and searching. See \code{\link{stri_opts_collator}}
#' for the description on how to tune its settings, and its \code{locale}
#' argument in particular.
#'
#' When choosing a resource bundle that is not available in the explicitly
#' requested locale (but not when using the default locale)
#' nor in its more general variants (e.g., `es_ES` vs `es`),
#' a warning is emitted.
#'
#' Other locale-sensitive functions include, e.g.,
#' \code{\link{stri_trans_tolower}} (that does character case mapping).
#'
#' @references
#' \emph{Locale} -- ICU User Guide,
#' \url{https://unicode-org.github.io/icu/userguide/locale/}
#'
#' \emph{ISO 639: Language Codes},
#' \url{https://www.iso.org/iso-639-language-codes.html}
#'
#' \emph{ISO 3166: Country Codes},
#' \url{https://www.iso.org/iso-3166-country-codes.html}
#'
#' @name about_locale
#' @rdname about_locale
#' @aliases about_locale locale stringi-locale
#' @family locale_management
#' @family locale_sensitive
#' @family stringi_general_topics
invisible(NULL)  # placeholder expression; the roxygen block above carries the 'about_locale' topic
stringi/R/compare.R 0000644 0001762 0000144 00000025367 14750110641 013717 0 ustar ligges users # kate: default-dictionary en_US
## This file is part of the 'stringi' package for R.
## Copyright (c) 2013-2025, Marek Gagolewski
## All rights reserved.
##
## Redistribution and use in source and binary forms, with or without
## modification, are permitted provided that the following conditions are met:
##
## 1. Redistributions of source code must retain the above copyright notice,
## this list of conditions and the following disclaimer.
##
## 2. Redistributions in binary form must reproduce the above copyright notice,
## this list of conditions and the following disclaimer in the documentation
## and/or other materials provided with the distribution.
##
## 3. Neither the name of the copyright holder nor the names of its
## contributors may be used to endorse or promote products derived from
## this software without specific prior written permission.
##
## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
## 'AS IS' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING,
## BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
## FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
## HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
## SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
## PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
## OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
## WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
## OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
## EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#' @title
#' Compare Strings with or without Collation
#'
#' @description
#' These functions may be used to determine if two strings
#' are equal, canonically equivalent (this is performed in a much more clever
#' fashion than when testing for equality), or to check whether they are in
#' a specific lexicographic order.
#'
#'
#' @details
#' All the functions listed here are vectorized over \code{e1} and \code{e2}.
#'
#' \code{stri_cmp_eq} tests whether two corresponding strings
#' consist of exactly the same code points, while \code{stri_cmp_neq} checks
#' whether there is any difference between them. These are
#' locale-independent operations: for natural language processing,
#' where the notion of canonical equivalence is more valid, this might
#' not be exactly what you are looking for, see Examples.
#' Please note that \pkg{stringi} always silently removes UTF-8
#' BOMs from input strings, therefore, e.g., \code{stri_cmp_eq} does not take
#' BOMs into account while comparing strings.
#'
#' \code{stri_cmp_equiv} tests for canonical equivalence of two strings
#' and is locale-dependent. Additionally, the \pkg{ICU}'s Collator may be
#' tuned up so that, e.g., the comparison is case-insensitive.
#' To test whether two strings are not canonically equivalent,
#' call \code{stri_cmp_nequiv}.
#'
#' \code{stri_cmp_le} tests whether
#' the elements in the first vector are less than or equal to
#' the corresponding elements in the second vector,
#' \code{stri_cmp_ge} tests whether they are greater or equal,
#' \code{stri_cmp_lt} if less, and \code{stri_cmp_gt} if greater,
#' see also, e.g., \code{\link{\%s<\%}}.
#'
#' \code{stri_compare} is an alias to \code{stri_cmp}. They both
#' perform exactly the same locale-dependent operation.
#' Both functions provide a C library's \code{strcmp()} look-and-feel,
#' see Value for details.
#'
#'
#' For more information on \pkg{ICU}'s Collator and how to tune its settings
#' refer to \code{\link{stri_opts_collator}}.
#' Note that different locale settings may lead to different results
#' (see the examples below).
#'
#'
#' @param e1,e2 character vectors or objects coercible to character vectors
#' @param opts_collator a named list with \pkg{ICU} Collator's options,
#' see \code{\link{stri_opts_collator}}, \code{NULL}
#' for the default collation options.
#' @param ... additional settings for \code{opts_collator}
#'
#' @return The \code{stri_cmp} and \code{stri_compare} functions
#' return an integer vector representing the comparison results:
#' \code{-1} if \code{e1[...] < e2[...]},
#' \code{0} if they are canonically equivalent, and \code{1} if greater.
#'
#' All the other functions return a logical vector that indicates
#' whether a given relation holds between two corresponding elements
#' in \code{e1} and \code{e2}.
#'
#' @references
#' \emph{Collation} -- ICU User Guide,
#' \url{https://unicode-org.github.io/icu/userguide/collation/}
#'
#' @examples
#' # in Polish, ch < h:
#' stri_cmp_lt('hladny', 'chladny', locale='pl_PL')
#'
#' # in Slovak, ch > h:
#' stri_cmp_lt('hladny', 'chladny', locale='sk_SK')
#'
#' # < or > (depends on locale):
#' stri_cmp('hladny', 'chladny')
#'
#' # ignore case differences:
#' stri_cmp_equiv('hladny', 'HLADNY', strength=2)
#'
#' # also ignore diacritical differences:
#' stri_cmp_equiv('hladn\u00FD', 'hladny', strength=1, locale='sk_SK')
#'
#' marios <- c('Mario', 'mario', 'M\\u00e1rio', 'm\\u00e1rio')
#' stri_cmp_equiv(marios, 'mario', case_level=TRUE, strength=2L)
#' stri_cmp_equiv(marios, 'mario', case_level=TRUE, strength=1L)
#' stri_cmp_equiv(marios, 'mario', strength=1L)
#' stri_cmp_equiv(marios, 'mario', strength=2L)
#'
#' # non-Unicode-normalized vs normalized string:
#' stri_cmp_equiv(stri_trans_nfkd('\u0105'), '\u105')
#'
#' # note the difference:
#' stri_cmp_eq(stri_trans_nfkd('\u0105'), '\u105')
#'
#' # ligatures:
#' stri_cmp_equiv('\ufb00', 'ff', strength=2)
#'
#' # phonebook collation
#' stri_cmp_equiv('G\u00e4rtner', 'Gaertner', locale='de_DE@@collation=phonebook', strength=1L)
#' stri_cmp_equiv('G\u00e4rtner', 'Gaertner', locale='de_DE', strength=1L)
#'
#' @family locale_sensitive
#' @export
#' @rdname stri_compare
stri_compare <- function(e1, e2, ..., opts_collator = NULL) {
    # Settings supplied through `...` are combined with opts_collator and
    # turned into the final option list by stri_opts_collator().
    if (!missing(...)) {
        collator_args <- as.list(c(opts_collator, ...))
        opts_collator <- do.call(stri_opts_collator, collator_args)
    }

    .Call(C_stri_cmp, e1, e2, opts_collator)
}
#' @export
#' @rdname stri_compare
stri_cmp <- stri_compare  # alias: binds the same function object as stri_compare
#' @export
#' @rdname stri_compare
stri_cmp_eq <- function(e1, e2) {
    # No collator options are involved; both vectors go straight to the
    # native routine.
    .Call(C_stri_cmp_eq, e1, e2)
}
#' @export
#' @rdname stri_compare
stri_cmp_neq <- function(e1, e2) {
    # No collator options are involved; both vectors go straight to the
    # native routine.
    .Call(C_stri_cmp_neq, e1, e2)
}
#' @export
#' @rdname stri_compare
stri_cmp_equiv <- function(e1, e2, ..., opts_collator = NULL) {
    # Settings supplied through `...` are combined with opts_collator and
    # turned into the final option list by stri_opts_collator().
    if (!missing(...)) {
        collator_args <- as.list(c(opts_collator, ...))
        opts_collator <- do.call(stri_opts_collator, collator_args)
    }

    .Call(C_stri_cmp_equiv, e1, e2, opts_collator)
}
#' @export
#' @rdname stri_compare
stri_cmp_nequiv <- function(e1, e2, ..., opts_collator = NULL) {
    # Settings supplied through `...` are combined with opts_collator and
    # turned into the final option list by stri_opts_collator().
    if (!missing(...)) {
        collator_args <- as.list(c(opts_collator, ...))
        opts_collator <- do.call(stri_opts_collator, collator_args)
    }

    .Call(C_stri_cmp_nequiv, e1, e2, opts_collator)
}
#' @export
#' @rdname stri_compare
stri_cmp_lt <- function(e1, e2, ..., opts_collator = NULL) {
    # Settings supplied through `...` are combined with opts_collator and
    # turned into the final option list by stri_opts_collator().
    if (!missing(...)) {
        collator_args <- as.list(c(opts_collator, ...))
        opts_collator <- do.call(stri_opts_collator, collator_args)
    }

    .Call(C_stri_cmp_lt, e1, e2, opts_collator)
}
#' @export
#' @rdname stri_compare
stri_cmp_gt <- function(e1, e2, ..., opts_collator = NULL) {
    # Settings supplied through `...` are combined with opts_collator and
    # turned into the final option list by stri_opts_collator().
    if (!missing(...)) {
        collator_args <- as.list(c(opts_collator, ...))
        opts_collator <- do.call(stri_opts_collator, collator_args)
    }

    .Call(C_stri_cmp_gt, e1, e2, opts_collator)
}
#' @export
#' @rdname stri_compare
stri_cmp_le <- function(e1, e2, ..., opts_collator = NULL) {
    # Settings supplied through `...` are combined with opts_collator and
    # turned into the final option list by stri_opts_collator().
    if (!missing(...)) {
        collator_args <- as.list(c(opts_collator, ...))
        opts_collator <- do.call(stri_opts_collator, collator_args)
    }

    .Call(C_stri_cmp_le, e1, e2, opts_collator)
}
#' @export
#' @rdname stri_compare
stri_cmp_ge <- function(e1, e2, ..., opts_collator = NULL) {
    # Collator settings given through `...` are combined with `opts_collator`
    # and handed to stri_opts_collator() before the native call.
    if (!missing(...)) {
        collator_args <- as.list(c(opts_collator, ...))
        opts_collator <- do.call(stri_opts_collator, collator_args)
    }
    .Call(C_stri_cmp_ge, e1, e2, opts_collator)
}
#' @title
#' Compare Strings with or without Collation
#'
#' @description
#' Relational operators for comparing corresponding strings in
#' two character vectors, with a typical R look-and-feel.
#'
#' @details
#' These functions call \code{\link{stri_cmp_le}} or its
#' friends, using the default collator options.
#' As a consequence, they are vectorized over \code{e1} and \code{e2}.
#'
#' \code{\%stri==\%} tests for canonical equivalence of strings
#' (see \code{\link{stri_cmp_equiv}}) and is a locale-dependent operation.
#'
#' \code{\%stri===\%} performs a locale-independent,
#' code point-based comparison.
#'
#'
#' @param e1,e2 character vectors or objects coercible to character vectors
#'
#' @return All the functions return a logical vector
#' indicating the result of a pairwise comparison.
#' As usual, the elements of shorter vectors are recycled if necessary.
#'
#'
#' @examples
#' 'a' %stri<% 'b'
#' c('a', 'b', 'c') %stri>=% 'b'
#'
#' @usage
#' e1 \%s<\% e2
#'
#' @family locale_sensitive
#' @rdname operator_compare
#' @aliases operator_compare oper_comparison oper_compare
#' @export
`%s<%` <- function(e1, e2) {
    # infix form of stri_cmp_lt(); no collator options are passed
    stri_cmp_lt(e1, e2)
}
#' @usage
#' e1 \%s<=\% e2
#' @rdname operator_compare
#' @export
`%s<=%` <- function(e1, e2) {
    # infix form of stri_cmp_le(); no collator options are passed
    stri_cmp_le(e1, e2)
}
#' @usage
#' e1 \%s>\% e2
#' @rdname operator_compare
#' @export
`%s>%` <- function(e1, e2) {
    # infix form of stri_cmp_gt(); no collator options are passed
    stri_cmp_gt(e1, e2)
}
#' @usage
#' e1 \%s>=\% e2
#' @rdname operator_compare
#' @export
`%s>=%` <- function(e1, e2) {
    # infix form of stri_cmp_ge(); no collator options are passed
    stri_cmp_ge(e1, e2)
}
#' @usage
#' e1 \%s==\% e2
#' @rdname operator_compare
#' @export
`%s==%` <- function(e1, e2) {
    # infix form of stri_cmp_equiv(); no collator options are passed
    stri_cmp_equiv(e1, e2)
}
#' @usage
#' e1 \%s!=\% e2
#' @rdname operator_compare
#' @export
`%s!=%` <- function(e1, e2) {
    # infix form of stri_cmp_nequiv(); no collator options are passed
    stri_cmp_nequiv(e1, e2)
}
#' @usage
#' e1 \%s===\% e2
#' @rdname operator_compare
#' @export
`%s===%` <- function(e1, e2) {
    # infix form of stri_cmp_eq()
    stri_cmp_eq(e1, e2)
}
#' @usage
#' e1 \%s!==\% e2
#' @rdname operator_compare
#' @export
`%s!==%` <- function(e1, e2) {
    # infix form of stri_cmp_neq()
    stri_cmp_neq(e1, e2)
}
#' @usage
#' e1 \%stri<\% e2
#' @rdname operator_compare
#' @export
`%stri<%` <- function(e1, e2) {
    # long-named infix form of stri_cmp_lt(); no collator options are passed
    stri_cmp_lt(e1, e2)
}
#' @usage
#' e1 \%stri<=\% e2
#' @rdname operator_compare
#' @export
`%stri<=%` <- function(e1, e2) {
    # long-named infix form of stri_cmp_le(); no collator options are passed
    stri_cmp_le(e1, e2)
}
#' @usage
#' e1 \%stri>\% e2
#' @rdname operator_compare
#' @export
`%stri>%` <- function(e1, e2) {
    # long-named infix form of stri_cmp_gt(); no collator options are passed
    stri_cmp_gt(e1, e2)
}
#' @usage
#' e1 \%stri>=\% e2
#' @rdname operator_compare
#' @export
`%stri>=%` <- function(e1, e2) {
    # long-named infix form of stri_cmp_ge(); no collator options are passed
    stri_cmp_ge(e1, e2)
}
#' @usage
#' e1 \%stri==\% e2
#' @rdname operator_compare
#' @export
`%stri==%` <- function(e1, e2) {
    # long-named infix form of stri_cmp_equiv(); no collator options are passed
    stri_cmp_equiv(e1, e2)
}
#' @usage
#' e1 \%stri!=\% e2
#' @rdname operator_compare
#' @export
`%stri!=%` <- function(e1, e2) {
    # long-named infix form of stri_cmp_nequiv(); no collator options are passed
    stri_cmp_nequiv(e1, e2)
}
#' @usage
#' e1 \%stri===\% e2
#' @rdname operator_compare
#' @export
`%stri===%` <- function(e1, e2) {
    # long-named infix form of stri_cmp_eq()
    stri_cmp_eq(e1, e2)
}
#' @usage
#' e1 \%stri!==\% e2
#' @rdname operator_compare
#' @export
`%stri!==%` <- function(e1, e2) {
    # long-named infix form of stri_cmp_neq()
    stri_cmp_neq(e1, e2)
}
stringi/R/stringi_package.R 0000644 0001762 0000144 00000016674 14770530442 015433 0 ustar ligges users # kate: default-dictionary en_US
## This file is part of the R package 'stringi'.
## Copyright (c) 2013-2025, Marek Gagolewski
## All rights reserved.
##
## Redistribution and use in source and binary forms, with or without
## modification, are permitted provided that the following conditions are met:
##
## 1. Redistributions of source code must retain the above copyright notice,
## this list of conditions and the following disclaimer.
##
## 2. Redistributions in binary form must reproduce the above copyright notice,
## this list of conditions and the following disclaimer in the documentation
## and/or other materials provided with the distribution.
##
## 3. Neither the name of the copyright holder nor the names of its
## contributors may be used to endorse or promote products derived from
## this software without specific prior written permission.
##
## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
## 'AS IS' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING,
## BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
## FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
## HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
## SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
## PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
## OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
## WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
## OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
## EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#' @title Fast and Portable Character String Processing in R
#'
#' @description
#' \pkg{stringi} is THE R package for fast, correct, consistent,
#' and convenient string/text manipulation.
#' It gives predictable results on every platform, in each locale,
#' and under any native character encoding.
#'
#' \bold{Keywords}: R, text processing, character strings,
#' internationalization, localization, ICU, ICU4C, i18n, l10n, Unicode.
#'
#' \bold{Homepage}: \url{https://stringi.gagolewski.com/}
#'
#' \bold{License}: The BSD-3-clause license for the package code,
#' the ICU license for the accompanying ICU4C distribution,
#' and the UCD license for the Unicode Character Database.
#' See the COPYRIGHTS and LICENSE files for more details.
#'
#' @details
#' Manual pages on general topics:
#' \itemize{
#' \item \link{about_encoding} -- character encoding issues, including
#' information on encoding management in \pkg{stringi}, as well as
#' on encoding detection and conversion.
#'
#' \item \link{about_locale} -- locale issues, including locale
#' management and specification in \pkg{stringi}, and the list of
#' locale-sensitive operations. In particular, see
#' \code{\link{stri_opts_collator}} for a description of the string
#' collation algorithm, which is used for string comparing, ordering,
#' ranking, sorting, case-folding, and searching.
#'
#' \item \link{about_arguments} -- information on how \pkg{stringi}
#' handles the arguments passed to its functions.
#' }
#'
#'
#' @section Facilities available:
#'
#' Refer to the following:
#' \itemize{
#' \item \link{about_search} for string searching facilities;
#' these include pattern searching, matching, string splitting, and so on.
#' The following independent search engines are provided:
#' \itemize{
#' \item \link{about_search_regex} -- with ICU (Java-like) regular expressions,
#' \item \link{about_search_fixed} -- fast, locale-independent, byte-wise pattern
#' matching,
#' \item \link{about_search_coll} -- locale-aware pattern matching
#' for natural language processing tasks,
#' \item \link{about_search_charclass} -- seeking elements of
#' particular character classes, like ``all white-spaces'' or ``all digits'',
#' \item \link{about_search_boundaries} -- text boundary analysis.
#' }
#'
#' \item \code{\link{stri_datetime_format}} for date/time formatting
#' and parsing. Also refer to the links therein for other date/time/time zone-
#' related operations.
#'
#' \item \code{\link{stri_stats_general}} and \code{\link{stri_stats_latex}}
#' for gathering some fancy statistics on a character vector's contents.
#'
#' \item \code{\link{stri_join}}, \code{\link{stri_dup}}, \code{\link{\%s+\%}},
#' and \code{\link{stri_flatten}} for concatenation-based operations.
#'
#' \item \code{\link{stri_sub}} for extracting and replacing substrings,
#' and \code{\link{stri_reverse}} for a joyful function
#' to reverse all code points in a string.
#'
#' \item \code{\link{stri_length}} (among others) for determining the number
#' of code points in a string. See also \code{\link{stri_count_boundaries}}
#' for counting the number of Unicode characters
#' and \code{\link{stri_width}} for approximating the width of a string.
#'
#' \item \code{\link{stri_trim}} (among others) for
#' trimming characters from the beginning or/and end of a string,
#' see also \link{about_search_charclass}, and \code{\link{stri_pad}}
#' for padding strings so that they are of the same width.
#' Additionally, \code{\link{stri_wrap}} wraps text into lines.
#'
#' \item \code{\link{stri_trans_tolower}} (among others) for case mapping,
#' i.e., conversion to lower, UPPER, or Title Case,
#' \code{\link{stri_trans_nfc}} (among others) for Unicode normalization,
#' \code{\link{stri_trans_char}} for translating individual code points,
#' and \code{\link{stri_trans_general}} for other universal
#' text transforms, including transliteration.
#'
#' \item \code{\link{stri_cmp}}, \code{\link{\%s<\%}}, \code{\link{stri_order}},
#' \code{\link{stri_sort}}, \code{\link{stri_rank}}, \code{\link{stri_unique}},
#' and \code{\link{stri_duplicated}} for collation-based,
#' locale-aware operations, see also \link{about_locale}.
#'
#' \item \code{\link{stri_split_lines}} (among others)
#' to split a string into text lines.
#'
#' \item \code{\link{stri_escape_unicode}} (among others) for escaping
#' some code points.
#'
#' \item \code{\link{stri_rand_strings}}, \code{\link{stri_rand_shuffle}},
#' and \code{\link{stri_rand_lipsum}} for generating (pseudo)random strings.
#'
#' \item \code{\link{stri_read_raw}},
#' \code{\link{stri_read_lines}}, and \code{\link{stri_write_lines}}
#' for reading and writing text files.
#' }
#'
#' Note that each man page provides many further links to other
#' interesting facilities and topics.
#'
#' @docType package
#' @author Marek Gagolewski,
#' with contributions from Bartek Tartanus and many others.
#' ICU4C was developed by IBM, Unicode, Inc., and others.
#'
#' @references
#' \emph{\pkg{stringi} Package Homepage},
#' \url{https://stringi.gagolewski.com/}
#'
#' Gagolewski M., \pkg{stringi}: Fast and portable character string
#' processing in R, \emph{Journal of Statistical Software} 103(2), 2022, 1-59,
#' \doi{10.18637/jss.v103.i02}
#'
#' \emph{ICU -- International Components for Unicode},
#' \url{https://icu.unicode.org/}
#'
#' \emph{ICU4C API Documentation},
#' \url{https://unicode-org.github.io/icu-docs/apidoc/dev/icu4c/}
#'
#' \emph{The Unicode Consortium},
#' \url{https://home.unicode.org/}
#'
#' \emph{UTF-8, A Transformation Format of ISO 10646} -- RFC 3629,
#' \url{https://www.rfc-editor.org/rfc/rfc3629}
#'
#' @family stringi_general_topics
#' @useDynLib stringi, .registration = TRUE
#' @importFrom tools md5sum
#' @importFrom utils packageVersion
#' @importFrom utils download.file
#' @importFrom utils unzip
#' @importFrom stats runif
#' @importFrom stats rnorm
"_PACKAGE"  # bare string that the package-level roxygen block above attaches to (roxygen2 "_PACKAGE" sentinel)
stringi/R/trim.R 0000644 0001762 0000144 00000010574 14750110641 013236 0 ustar ligges users # kate: default-dictionary en_US
## This file is part of the 'stringi' package for R.
## Copyright (c) 2013-2025, Marek Gagolewski
## All rights reserved.
##
## Redistribution and use in source and binary forms, with or without
## modification, are permitted provided that the following conditions are met:
##
## 1. Redistributions of source code must retain the above copyright notice,
## this list of conditions and the following disclaimer.
##
## 2. Redistributions in binary form must reproduce the above copyright notice,
## this list of conditions and the following disclaimer in the documentation
## and/or other materials provided with the distribution.
##
## 3. Neither the name of the copyright holder nor the names of its
## contributors may be used to endorse or promote products derived from
## this software without specific prior written permission.
##
## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
## 'AS IS' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING,
## BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
## FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
## HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
## SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
## PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
## OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
## WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
## OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
## EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#' @title
#' Trim Characters from the Left and/or Right Side of a String
#'
#' @description
#' These functions may be used, e.g., to remove unnecessary
#' white-spaces from strings. Trimming ends at the first or
#' starts at the last \code{pattern} match.
#'
#' @details
#' Vectorized over \code{str} and \code{pattern}.
#'
#' \code{stri_trim} is a convenience wrapper over \code{stri_trim_left}
#' and \code{stri_trim_right}.
#'
#' Contrary to many other string processing libraries,
#' our trimming functions are universal. The class of characters
#' to be retained or trimmed can be adjusted.
#'
#' For replacing pattern matches with
#' an arbitrary replacement string, see \code{\link{stri_replace}}.
#'
#' Trimming can also be used where you would normally rely on
#' regular expressions. For instance, you may get
#' \code{'23.5'} out of \code{'total of 23.5 bitcoins'}.
#'
#' For trimming white-spaces, please note the difference
#' between Unicode binary property `\code{\\p\{Wspace\}}` (more universal)
#' and general character category `\code{\\p\{Z\}}`,
#' see \link{stringi-search-charclass}.
#'
#' @param str a character vector of strings to be trimmed
#' @param pattern a single pattern, specifying the class of characters
#' (see \link{stringi-search-charclass})
#' to be preserved (if \code{negate} is \code{FALSE}; default)
#' or trimmed (otherwise)
#' @param side character [\code{stri_trim} only]; defaults to \code{'both'}
#' @param negate either \code{TRUE} or \code{FALSE}; see \code{pattern}
#'
#'
#' @return
#' All functions return a character vector.
#'
#'
#' @examples
#' stri_trim_left(' aaa')
#' stri_trim_right('r-project.org/', '\\P{P}')
#' stri_trim_both(' Total of 23.5 bitcoins. ', '\\p{N}')
#' stri_trim_both(' Total of 23.5 bitcoins. ', '\\P{N}', negate=TRUE)
#'
#' @aliases stri_trim
#' @family search_replace
#' @family search_charclass
#' @rdname stri_trim
#' @export
stri_trim_both <- function(str, pattern="\\P{Wspace}", negate=FALSE) {
    # the work is delegated to the native C_stri_trim_both routine
    .Call(C_stri_trim_both, str, pattern, negate)
}
#' @rdname stri_trim
#' @export
stri_trim_left <- function(str, pattern="\\P{Wspace}", negate=FALSE) {
    # the work is delegated to the native C_stri_trim_left routine
    .Call(C_stri_trim_left, str, pattern, negate)
}
#' @rdname stri_trim
#' @export
stri_trim_right <- function(str, pattern="\\P{Wspace}", negate=FALSE) {
    # the work is delegated to the native C_stri_trim_right routine
    .Call(C_stri_trim_right, str, pattern, negate)
}
#' @rdname stri_trim
#' @export
stri_trim <- function(str, side=c("both", "left", "right"), pattern="\\P{Wspace}", negate=FALSE) {
    # "both" is listed first, and is hence the default, for compatibility with stringr
    chosen <- match.arg(side)  # note: match.arg is slow

    # match.arg() yields exactly one of the three choices, so the final
    # `else` branch can only be reached for "right"
    if (chosen == "both") {
        stri_trim_both(str, pattern, negate)
    } else if (chosen == "left") {
        stri_trim_left(str, pattern, negate)
    } else {
        stri_trim_right(str, pattern, negate)
    }
}
stringi/R/sprintf.R 0000644 0001762 0000144 00000022206 14750110641 013743 0 ustar ligges users # kate: default-dictionary en_US
## This file is part of the 'stringi' package for R.
## Copyright (c) 2013-2025, Marek Gagolewski
## All rights reserved.
##
## Redistribution and use in source and binary forms, with or without
## modification, are permitted provided that the following conditions are met:
##
## 1. Redistributions of source code must retain the above copyright notice,
## this list of conditions and the following disclaimer.
##
## 2. Redistributions in binary form must reproduce the above copyright notice,
## this list of conditions and the following disclaimer in the documentation
## and/or other materials provided with the distribution.
##
## 3. Neither the name of the copyright holder nor the names of its
## contributors may be used to endorse or promote products derived from
## this software without specific prior written permission.
##
## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
## 'AS IS' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING,
## BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
## FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
## HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
## SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
## PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
## OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
## WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
## OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
## EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#' @title
#' Format Strings
#'
#' @description
#' \code{stri_sprintf} (synonym: \code{stri_string_format})
#' is a Unicode-aware replacement for and enhancement of
#' the built-in \code{\link[base]{sprintf}} function.
#' Moreover, \code{stri_printf} prints formatted strings.
#'
#' @details
#' Vectorized over \code{format} and all vectors passed via \code{...}.
#'
#' Unicode code points may have various widths when
#' printed on the console (compare \code{\link{stri_width}}).
#' These functions, by default (see the \code{use_length} argument), take this
#' into account.
#'
#' These functions are not locale sensitive. For instance, numbers are
#' always formatted in the "POSIX" style, e.g., \code{-123456.789}
#' (no thousands separator, dot as a fractional separator).
#' Such a feature might be added at a later date, though.
#'
#' All arguments passed via \code{...} are evaluated. If some of them
#' are unused, a warning is generated. Too few arguments result in an error.
#'
#' Note that \code{stri_printf} treats missing values in \code{...}
#' as \code{"NA"} strings by default.
#'
#' All format specifiers supported by \code{\link[base]{sprintf}} are
#' also available here. For the formatting of integers and floating-point
#' values, currently the system \code{std::snprintf()} is called, but
#' this may change in the future. Format specifiers are normalized
#' and necessary sanity checks are performed.
#'
#' Supported conversion specifiers: \code{dioxX} (integers),
#' \code{feEgGaA} (floats), and \code{s} (character strings).
#' Supported flags: \code{-} (left-align),
#' \code{+} (force output sign or blank when \code{NaN} or \code{NA}; numeric only),
#' \code{<space>} (output minus or space for a sign; numeric only),
#' \code{0} (pad with 0s; numeric only),
#' \code{#} (alternative output of some numerics).
#'
#'
#' @param format character vector of format strings
#' @param ... vectors (coercible to integer, real, or character)
#' @param na_string single string to represent missing values;
#' if \code{NA}, missing values in \code{...}
#' result in the corresponding outputs be missing too;
#' use \code{"NA"} for compatibility with base R
#' @param inf_string single string to represent the (unsigned) infinity (\code{NA} allowed)
#' @param nan_string single string to represent the not-a-number (\code{NA} allowed)
#' @param use_length single logical value; should the number of code
#' points be used when applying modifiers such as \code{\%20s}
#' instead of the total code point width?
#' @param file see \code{\link[base]{cat}}
#' @param sep see \code{\link[base]{cat}}
#' @param append see \code{\link[base]{cat}}
#'
#' @return
#' \code{stri_printf} is used for its side effect, which is printing
#' text on the standard output or other connection/file. Hence, it returns
#' \code{invisible(NULL)}.
#'
#' The other functions return a character vector.
#'
#'
#' @references
#' \code{printf} in \code{glibc},
#' \url{https://man.archlinux.org/man/printf.3}
#'
#' \code{printf} format strings -- Wikipedia,
#' \url{https://en.wikipedia.org/wiki/Printf_format_string}
#'
#' @examples
#' stri_printf("%4s=%.3f", c("e", "e\u00b2", "\u03c0", "\u03c0\u00b2"),
#' c(exp(1), exp(2), pi, pi^2))
#'
#' x <- c(
#' "xxabcd",
#' "xx\u0105\u0106\u0107\u0108",
#' stri_paste(
#' "\u200b\u200b\u200b\u200b",
#' "\U0001F3F4\U000E0067\U000E0062\U000E0073\U000E0063\U000E0074\U000E007F",
#' "abcd"
#' ))
#' stri_printf("[%10s]", x) # minimum width = 10
#' stri_printf("[%-10.3s]", x) # output of max width = 3, but pad to width of 10
#' stri_printf("[%10s]", x, use_length=TRUE) # minimum number of Unicode code points = 10
#'
#' # vectorization wrt all arguments:
#' p <- runif(10)
#' stri_sprintf(ifelse(p > 0.5, "P(Y=1)=%1$.2f", "P(Y=0)=%2$.2f"), p, 1-p)
#'
#' # using a "preformatted" logical vector:
#' x <- c(TRUE, FALSE, FALSE, NA, TRUE, FALSE)
#' stri_sprintf("%s) %s", letters[seq_along(x)], c("\u2718", "\u2713")[x+1])
#'
#' # custom NA/Inf/NaN strings:
#' stri_printf("%+10.3f", c(-Inf, -0, 0, Inf, NaN, NA_real_),
#' na_string="", nan_string="\U0001F4A9", inf_string="\u221E")
#'
#' stri_sprintf("UNIX time %1$f is %1$s.", Sys.time())
#'
#' # the following do not work in sprintf()
#' stri_sprintf("%1$#- *2$.*3$f", 1.23456, 10, 3) # two asterisks
#' stri_sprintf(c("%s", "%f"), pi) # re-coercion needed
#' stri_sprintf("%1$s is %1$f UNIX time.", Sys.time()) # re-coercion needed
#' stri_sprintf(c("%d", "%s"), factor(11:12)) # re-coercion needed
#' stri_sprintf(c("%s", "%d"), factor(11:12)) # re-coercion needed
#'
#' @rdname stri_sprintf
#' @family length
#' @export
stri_sprintf <- function(format, ..., na_string=NA_character_,
                         inf_string="Inf", nan_string="NaN", use_length=FALSE)
{
    # wrapping the dots in list() evaluates all of them at this point
    .Call(
        C_stri_sprintf,
        format,
        list(...),
        na_string,
        inf_string,
        nan_string,
        use_length
    )
}
#' @rdname stri_sprintf
#' @export
stri_string_format <- stri_sprintf  # synonym: binds the very same function object
#' @rdname stri_sprintf
#' @export
stri_printf <- function(format, ..., file="", sep="\n", append=FALSE,
                        na_string="NA", inf_string="Inf", nan_string="NaN",
                        use_length=FALSE)
{
    # wrapping the dots in list() evaluates all of them at this point
    formatted <- .Call(
        C_stri_sprintf,
        format,
        list(...),
        na_string,
        inf_string,
        nan_string,
        use_length
    )

    # output goes through cat(), which receives file, sep, and append as given
    cat(formatted, file=file, sep=sep, append=append)
}
#' @title
#' C-Style Formatting with \code{\link{stri_sprintf}} as a Binary Operator
#'
#' @description
#' Provides access to \code{\link{stri_sprintf}} in the form of a binary
#' operator in a way similar to Python's \code{\%} overloaded for strings.
#'
#' Missing values and empty vectors are propagated as usual.
#'
#' @details
#' Vectorized over \code{e1} and \code{e2}.
#'
#' \code{e1 \%s$\% atomic_vector} is equivalent to
#' \code{e1 \%s$\% list(atomic_vector)}.
#'
#'
#' @param e1 format strings, see \code{\link{stri_sprintf}} for syntax
#' @param e2 a list of atomic vectors to be passed to \code{\link{stri_sprintf}}
#' or a single atomic vector
#'
#' @return
#' Returns a character vector.
#'
#'
#' @examples
#' "value='%d'" %s$% 3
#' "value='%d'" %s$% 1:3
#' "%s='%d'" %s$% list("value", 3)
#' "%s='%d'" %s$% list("value", 1:3)
#' "%s='%d'" %s$% list(c("a", "b", "c"), 1)
#' "%s='%d'" %s$% list(c("a", "b", "c"), 1:3)
#'
#' x <- c("abcd", "\u00DF\u00B5\U0001F970", "abcdef")
#' cat("[%6s]" %s$% x, sep="\n") # width used, not the number of bytes
#'
#' @rdname operator_dollar
#' @aliases operator_dollar oper_dollar
#' @family length
#'
#' @usage
#' e1 \%s$\% e2
#'
#' @export
`%s$%` <- function(e1, e2) {
    # anything that is not already a list is wrapped into a one-element list
    fmt_args <- if (is.list(e2)) e2 else list(e2)

    # fixed settings: missing-value string NA, "Inf"/"NaN" spelled as is,
    # and FALSE as the last (use_length) argument
    # (an older implementation based on base::sprintf has been superseded
    # by this native call)
    .Call(C_stri_sprintf, e1, fmt_args, NA_character_, "Inf", "NaN", FALSE)
}
#' @usage
#' e1 \%stri$\% e2
#' @rdname operator_dollar
#' @export
`%stri$%` <- `%s$%`  # long-named alias: binds the very same function object
stringi/R/escape.R 0000644 0001762 0000144 00000007313 14750110641 013520 0 ustar ligges users # kate: default-dictionary en_US
## This file is part of the 'stringi' package for R.
## Copyright (c) 2013-2025, Marek Gagolewski
## All rights reserved.
##
## Redistribution and use in source and binary forms, with or without
## modification, are permitted provided that the following conditions are met:
##
## 1. Redistributions of source code must retain the above copyright notice,
## this list of conditions and the following disclaimer.
##
## 2. Redistributions in binary form must reproduce the above copyright notice,
## this list of conditions and the following disclaimer in the documentation
## and/or other materials provided with the distribution.
##
## 3. Neither the name of the copyright holder nor the names of its
## contributors may be used to endorse or promote products derived from
## this software without specific prior written permission.
##
## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
## 'AS IS' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING,
## BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
## FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
## HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
## SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
## PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
## OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
## WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
## OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
## EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#' @title
#' Escape Unicode Code Points
#'
#' @description
#' Generates an ASCII string where all non-printable characters
#' and non-ASCII characters are converted to escape sequences.
#'
#' @details
#'
#' For non-printable and certain special (well-known,
#' see also the R man page \link{Quotes})
#' ASCII characters, the following
#' (also recognized in R) convention is used.
#' We get \code{\\a}, \code{\\b}, \code{\\t}, \code{\\n}, \code{\\v},
#' \code{\\f}, \code{\\r}, \code{\"}, \code{\'}, \code{\\\\}
#' or either \code{\\uXXXX} (4 hex digits) or \code{\\UXXXXXXXX} (8 hex digits)
#' otherwise.
#'
#'
#' As usual in stringi, any input string is converted to Unicode
#' before executing the escape process.
#'
#'
#' @param str character vector
#'
#' @return
#' Returns a character vector.
#'
#' @examples
#' stri_escape_unicode('a\u0105!')
#'
#' @family escape
#' @export
stri_escape_unicode <- function(str) {
    # escaping is carried out by the native C_stri_escape_unicode routine
    .Call(C_stri_escape_unicode, str)
}
#' @title
#' Un-escape All Escape Sequences
#'
#' @description
#' Un-escapes all known escape sequences.
#'
#' @details
#' Uses \pkg{ICU}'s facilities to un-escape Unicode character sequences.
#'
#' The following escape sequences are recognized:
#' \code{\\a}, \code{\\b}, \code{\\t}, \code{\\n}, \code{\\v}, \code{\\?},
#' \code{\\e}, \code{\\f}, \code{\\r}, \code{\"}, \code{\'}, \code{\\\\},
#' \code{\\uXXXX} (4 hex digits),
#' \code{\\UXXXXXXXX} (8 hex digits),
#' \code{\\xXX} (1-2 hex digits),
#' \code{\\ooo} (1-3 octal digits),
#' \code{\\cX} (control-X; X is masked with 0x1F).
#' For \code{\\xXX} and \code{\\ooo}, beware of non-valid UTF-8 byte sequences.
#'
#' Note that some versions of R on Windows cannot handle
#' characters defined with \code{\\UXXXXXXXX}.
#'
#' @param str character vector
#'
#' @return
#' Returns a character vector.
#' If an escape sequence is ill-formed,
#' the result will be \code{NA} and a warning will be given.
#'
#' @examples
#' stri_unescape_unicode('a\\u0105!\\u0032\\n')
#'
#' @family escape
#' @export
stri_unescape_unicode <- function(str) {
    # un-escaping is carried out by the native C_stri_unescape_unicode routine
    .Call(C_stri_unescape_unicode, str)
}
stringi/R/encoding.R 0000644 0001762 0000144 00000027760 14750110641 014056 0 ustar ligges users # kate: default-dictionary en_US
## This file is part of the 'stringi' package for R.
## Copyright (c) 2013-2025, Marek Gagolewski
## All rights reserved.
##
## Redistribution and use in source and binary forms, with or without
## modification, are permitted provided that the following conditions are met:
##
## 1. Redistributions of source code must retain the above copyright notice,
## this list of conditions and the following disclaimer.
##
## 2. Redistributions in binary form must reproduce the above copyright notice,
## this list of conditions and the following disclaimer in the documentation
## and/or other materials provided with the distribution.
##
## 3. Neither the name of the copyright holder nor the names of its
## contributors may be used to endorse or promote products derived from
## this software without specific prior written permission.
##
## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
## 'AS IS' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING,
## BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
## FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
## HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
## SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
## PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
## OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
## WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
## OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
## EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#' @title
#' Character Encodings and \pkg{stringi}
#'
#' @description
#' This manual page explains how \pkg{stringi} deals with character
#' strings in various encodings.
#'
#' In particular we should note that:
#' \itemize{
#' \item \R lets strings in ASCII, UTF-8, and your platform's
#' native encoding coexist. A character vector printed on the console
#' by calling \code{\link{print}} or \code{\link{cat}} is
#' silently re-encoded to the native encoding.
#' \item Functions in \pkg{stringi} process each string internally in
#' Unicode, the most universal character encoding ever.
#' Even if a string is given in the native encoding, i.e., your platform's
#' default one, it will be converted to Unicode (precisely: UTF-8 or UTF-16).
#' \item Most \pkg{stringi} functions always return UTF-8 encoded strings,
#' regardless of the input encoding. What is more, the functions have been
#' optimized for UTF-8/ASCII input (they have competitive, if not better
#' performance, especially when performing more complex operations like
#' string comparison, sorting, and even concatenation). Thus, it is
#' best to rely on cascading calls to \pkg{stringi} operations solely.
#' }
#'
#' @details
#' Quoting the ICU User Guide,
#' 'Hundreds of encodings have been developed over the years, each for small
#' groups of languages and for special purposes. As a result,
#' the interpretation of text, input, sorting, display, and storage
#' depends on the knowledge of all the different types of character sets
#' and their encodings. Programs have been written to handle either
#' one single encoding at a time and switch between them, or to convert
#' between external and internal encodings.'
#'
#' 'Unicode provides a single character set that covers the major
#' languages of the world, and a small number of machine-friendly encoding
#' forms and schemes to fit the needs of existing applications and protocols.
#' It is designed for best interoperability with both ASCII and ISO-8859-1
#' (the most widely used character sets) to make it easier for Unicode to be
#' used in almost all applications and protocols' (see the ICU User Guide).
#'
#' The Unicode Standard determines the way to map any possible character
#' to a numeric value -- a so-called code point. Such code points, however,
#' have to be stored somehow in a computer's memory.
#' The Unicode Standard encodes characters in the range U+0000..U+10FFFF,
#' which amounts to a 21-bit code space. Depending on the encoding
#' form (UTF-8, UTF-16, or UTF-32), each character will
#' then be represented either as a sequence of one to four 8-bit bytes,
#' one or two 16-bit code units, or a single 32-bit integer
#' (compare the ICU FAQ).
#'
#' Unicode can be thought of as a superset of the spectrum of characters
#' supported by any given code page.
#'
#' @section UTF-8 and UTF-16:
#'
#' For portability reasons, the UTF-8 encoding is the most natural choice
#' for representing Unicode character strings in \R. UTF-8 has ASCII as its
#' subset (code points 1--127 represent the same characters in both of them).
#' Code points larger than 127 are represented by multi-byte sequences
#' (from 2 to 4 bytes: Please note that not all sequences of bytes
#' are valid UTF-8, compare \code{\link{stri_enc_isutf8}}).
#'
#' Most of the computations in \pkg{stringi} are performed internally
#' using either UTF-8 or UTF-16 encodings (this depends on the type of service
#' you request: some \pkg{ICU} services are designed only to work with UTF-16).
#' Due to such a choice, with \pkg{stringi} you get the same result on
#' each platform, which is -- unfortunately -- not the case of base \R's
#' functions (for instance, it is known that performing a regular expression
#' search under Linux on some texts may give you a different result
#' from the one obtained under Windows). We really had portability in mind
#' while developing our package!
#'
#' We have observed that \R correctly handles UTF-8 strings regardless of your
#' platform's native encoding (see below). Therefore, we decided that most
#' functions in \pkg{stringi} will output their results in UTF-8
#' -- this speeds up computations on cascading calls to our functions:
#' the strings do not have to be re-encoded each time.
#'
#' Note that some Unicode characters may have an ambiguous representation.
#' For example, ``a with ogonek'' (one character) and ``a''+``ogonek''
#' (two graphemes) are semantically the same. \pkg{stringi} provides functions
#' to normalize character sequences, see \code{\link{stri_trans_nfc}}
#' for discussion. However, it is observed that denormalized strings
#' do appear very rarely in typical string processing activities.
#'
#' Additionally, do note that \pkg{stringi} silently removes byte order marks
#' (BOMs - they may incidentally appear in a string read from a text file)
#' from UTF-8-encoded strings, see \code{\link{stri_enc_toutf8}}.
#'
#'
#' @section Character Encodings in \R:
#'
#' Data in memory are just bytes (small integer
#' values) -- an en\emph{coding} is a way to represent characters with such
#' numbers, it is a semantic 'key' to understand a given byte sequence.
#' For example, in ISO-8859-2 (Central European), the value 177 represents
#' Polish ``a with ogonek'', and in ISO-8859-1 (Western European),
#' the same value denotes the ``plus-minus'' sign. Thus, a character encoding
#' is a translation scheme: we need to communicate with \R somehow,
#' relying on how it represents strings.
#'
#' Overall, \R has a very simple encoding marking mechanism,
#' see \code{\link{stri_enc_mark}}. There is an implicit assumption
#' that your platform's default (native) encoding always extends
#' ASCII -- \pkg{stringi} checks that whenever your native encoding
#' is being detected automatically on \pkg{ICU}'s initialization and each time
#' when you change it manually by calling \code{\link{stri_enc_set}}.
#'
#' Character strings in \R (internally) can be declared to be in:
#' \itemize{
#' \item \code{UTF-8};
#' \item \code{latin1}, i.e., either ISO-8859-1 (Western European on
#' Linux, OS X, and other Unixes) or WINDOWS-1252 (Windows);
#' \item \code{bytes} -- for strings that
#' should be manipulated as sequences of bytes.
#' }
#' Moreover, there are two other cases:
#' \itemize{
#' \item ASCII -- for strings consisting only of byte codes
#' not greater than 127;
#' \item \code{native} (a.k.a. \code{unknown} in \code{\link{Encoding}};
#' quite a misleading name: no explicit encoding mark) -- for
#' strings that are assumed to be in your platform's native (default) encoding.
#' This can represent UTF-8 if you are an OS X user,
#' or some 8-bit Windows code page, for example.
#' The native encoding used by \R may be determined by examining
#' the LC_CTYPE category, see \code{\link{Sys.getlocale}}.
#' }
#'
#' Intuitively, ``native'' strings result from reading
#' a string from stdin (e.g., keyboard input). This makes sense: your operating
#' system works in some encoding and provides \R with some data.
#'
#' Each time when a \pkg{stringi} function encounters a string declared
#' in native encoding, it assumes that the input data should be translated
#' from the default encoding, i.e., the one returned by \code{\link{stri_enc_get}}
#' (unless you know what you are doing, the default encoding should only be
#' changed if the automatic encoding detection process fails on \pkg{stringi}
#' load).
#'
#' Functions which allow \code{'bytes'} encoding markings are very rare in
#' \pkg{stringi}, and were carefully selected. These are:
#' \code{\link{stri_enc_toutf8}} (with argument \code{is_unknown_8bit=TRUE}),
#' \code{\link{stri_enc_toascii}}, and \code{\link{stri_encode}}.
#'
#' Finally, note that \R lets strings in ASCII, UTF-8, and your platform's
#' native encoding coexist. A character vector printed with
#' \code{\link{print}}, \code{\link{cat}}, etc., is silently re-encoded
#' so that it can be properly shown, e.g., on the console.
#'
#'
#' @section Encoding Conversion:
#'
#' Apart from automatic conversion from the native encoding,
#' you may re-encode a string manually, for example
#' when you read it from a file created on a different platform.
#' Call \code{\link{stri_enc_list}} for the list of
#' encodings supported by \pkg{ICU}.
#' Note that converter names are case-insensitive
#' and \pkg{ICU} tries to normalize the encoding specifiers.
#' Leading zeroes are ignored in sequences of digits (if further digits follow),
#' and all non-alphanumeric characters are ignored. Thus the strings
#' 'UTF-8', 'utf_8', 'u*Tf08' and 'Utf 8' are equivalent.
#'
#' The \code{\link{stri_encode}} function
#' allows you to convert between any given encodings
#' (in some cases you will obtain \code{bytes}-marked
#' strings, or even lists of raw vectors, e.g., for UTF-16).
#' There are also some useful more specialized functions,
#' like \code{\link{stri_enc_toutf32}} (converts a character vector to a list
#' of integers, where one code point is exactly one numeric value)
#' or \code{\link{stri_enc_toascii}} (substitutes all non-ASCII
#' bytes with the SUBSTITUTE CHARACTER,
#' which plays a similar role as \R's \code{NA} value).
#'
#' There are also some routines for automated encoding detection,
#' see, e.g., \code{\link{stri_enc_detect}}.
#'
#'
#' @section Encoding Detection:
#'
#' Given a text file, one has to know how to interpret (decode)
#' raw data in order to obtain meaningful information.
#'
#' Encoding detection is always an imprecise operation and
#' needs a considerable amount of data. However, in case of some
#' encodings (like UTF-8, ASCII, or UTF-32) a ``false positive'' byte
#' sequence is quite rare (statistically speaking).
#'
#' Check out \code{\link{stri_enc_detect}} (among others) for a useful
#' function in this category.
#'
#' @name about_encoding
#' @rdname about_encoding
#' @aliases about_encoding stringi-encoding encoding
#' @family stringi_general_topics
#' @family encoding_management
#' @family encoding_detection
#' @family encoding_conversion
#'
#' @references
#' \emph{Unicode Basics} -- ICU User Guide,
#' \url{https://unicode-org.github.io/icu/userguide/icu/unicode.html}
#'
#' \emph{Conversion} -- ICU User Guide,
#' \url{https://unicode-org.github.io/icu/userguide/conversion/}
#'
#' \emph{Converters} -- ICU User Guide,
#' \url{https://unicode-org.github.io/icu/userguide/conversion/converters.html}
#' (technical details)
#'
#' \emph{UTF-8, UTF-16, UTF-32 & BOM} -- ICU FAQ,
#' \url{https://www.unicode.org/faq/utf_bom.html}
invisible(NULL)
stringi/cleanup 0000755 0001762 0000144 00000000126 14771224007 013311 0 ustar ligges users #!/bin/sh
rm -f config.* src/Makevars src/*.o src/uconfig_local.h src/install.libs.R
stringi/NEWS 0000644 0001762 0000144 00000115007 14771217072 012444 0 ustar ligges users # Changelog
## 1.8.7 (2025-03-27)
* [BUGFIX] Fixed build warnings.
* [BUGFIX] #512: Fixed PROTECT stack imbalance in `stri_encode_from_marked`.
## 1.8.4 (2024-05-06)
* [BUILD TIME] [BUGFIX] #508: Fixed build errors on Windows
(thanks to @jeroen and @kalibera).
## 1.8.3 (2023-12-10)
* [BUILD TIME] [BUGFIX] Fixed the *format string is not a string literal
(potentially insecure)* warnings.
## 1.8.2 (2023-11-22)
* [BUILD TIME] [BUGFIX] #501: Fixed failing build on 32-bit Windows
(Windows API `ResolveLocaleName` function not available).
* [BUILD TIME] [BUGFIX] #502: `PKG_CPPFLAGS` are now considered
before other `CPPFLAGS` (the same with other flag types) in
the `configure` script to make it compatible with what happens in `Makevars`.
* [BUILD TIME] [BUGFIX] Support for ICU's `double` conversion on Loongarch
has been restored (see #463).
## 1.8.1 (2023-11-09)
* [GENERAL] ICU bundle updated to version 74.1 (Unicode 15.1, CLDR 44).
* [BACKWARD INCOMPATIBILITY] [BUILD TIME] Support for Solaris has now been
dropped. The package is no longer shipped with the very outdated ICU55 bundle.
A compiler supporting at least C++11 as well as ICU >= 61 are now required.
* [BACKWARD INCOMPATIBILITY] #469: Missing date-time fields in
`stri_datetime_parse` and `stri_datetime_create` now default to today's
midnight local time.
* [BACKWARD INCOMPATIBILITY] Removed the long-deprecated and defunct
`fallback_encoding` parameter of `stri_read_lines` and the ellipsis
parameter of `stri_opts_collator`, `stri_opts_regex`, `stri_opts_fixed`,
`stri_opts_brkiter`, and `stri_opts_regex`.
* [BUILD TIME] As per the suggestion of Prof. Brian Ripley, `icudt74l`
(ICU data - little endian) is now included in the source tarball (compressed
with xz to save space). This allows for building **`stringi`** on systems with
no internet access.
* [NEW FEATURE] #476: In break iterator-, date-time-, and collator-based
operations (e.g., `stri_sort`), a warning is emitted when the *root* ICU
resource bundle is returned when using an *explicitly* requested locale.
This might happen when we pass an 'unknown' `locale` argument to these
functions. Note that when relying on the default `locale=NULL` argument,
no warning is emitted. In such a case, checking
if the default locale as returned by `stri_locale_get` is amongst
those listed in `stri_locale_list` is recommended.
* [NEW FEATURE] The `C` locale identifier now resolves to `en_US_POSIX`.
* [BUGFIX] #469: `stri_datetime_parse` did not reset the `Calendar`
object when parsing multiple dates.
* [BUGFIX] #487: Some functions did not accept ASCII strings longer than
858993457 characters on input.
## 1.7.12 (2023-01-09)
* [BUGFIX] Fixed a few issues reported by `rchk`.
* [NOTE] [BACKWARD INCOMPATIBLE CHANGE IF ICU >= 72]
If building against ICU >= 72, note a backward incompatible change:
`@` is no longer considered a word break; for more details, see
.
## 1.7.8 (2022-07-11)
* [DOCUMENTATION] Paper on **`stringi`** has been published in
the *Journal of Statistical Software*;
see <https://doi.org/10.18637/jss.v103.i02>.
* [BUGFIX] #473, #397: Fixed buffer overflow in `stri_dup`; Also,
`stri_dup`, `stri_paste`, ... fail more gracefully on attempts to
generate strings of length >= 2^31 each.
* [BUILD TIME] #480: Using `Rf_isNull` instead of `isNull`.
* [DOCUMENTATION] #462: That the `numeric=TRUE` collator
does not handle negative numbers correctly is now mentioned in the manual.
## 1.7.6 (2021-11-29)
* [BUILD TIME] #463: Added Loongarch support in ICU's double conversion
(@liuxiang88).
* [BUGFIX] #467: The UCRT build on Windows was not marking strings as `latin1`.
## 1.7.5 (2021-10-04)
* [DOCUMENTATION] Paper on **`stringi`** has been accepted for
publication in the *Journal of Statistical Software*,
see
for a draft version.
* [DOCUMENTATION] The **`stringi`** website at <https://stringi.gagolewski.com/>
now features a comprehensive tutorial based on the aforementioned paper.
* [DOCUMENTATION] The *ICU* Project site has been moved to
<https://icu.unicode.org/>.
* [BUILD TIME] #457: The `autoconf` macros `AC_LANG_CPLUSPLUS`
and `AC_TRY_COMPILE` were obsolete.
* [BUGFIX] #458: Passing ALTREP objects no longer yields
'embedded nul in string' errors.
## 1.7.4 (2021-08-12)
* [BUGFIX] #449: Fixed segfaults generated by `stri_sprintf`.
* [BUILD TIME] No longer defining `USE_RINTERNALS` and `R_NO_REMAP`.
## 1.7.3 (2021-07-15)
* [BUGFIX] Fixed the previous patch of ICU55 causing a build failure on,
amongst others, CRAN's Solaris-based target.
## 1.7.2 (2021-07-14)
* [BUGFIX] Workaround for a bug in `tools::checkFF` failing
when `NA_character_` is passed to `.Call`.
## 1.7.1 (2021-07-14)
* [BACKWARD INCOMPATIBILITY] `%s$%` and `%stri$%` now use the new `stri_sprintf`
(see below) function instead of `base::sprintf`.
* [BACKWARD INCOMPATIBILITY, NEW FEATURE] In `stri_sub<-` and `stri_sub_all<-`,
providing a negative `length` from now on does not result in the corresponding
input string being altered.
* [BACKWARD INCOMPATIBILITY, NEW FEATURE] In `stri_sub` and `stri_sub_all`,
negative `length` results in the corresponding output being `NA`
or not extracted at all, depending on the setting of the new argument
`ignore_negative_length`.
* [BACKWARD INCOMPATIBILITY, BUGFIX, NEW FEATURE] In `stri_subset*`
and their replacement versions, `pattern` and `value` cannot be longer
than `str` (but now they are recycled if necessary).
* [BACKWARD INCOMPATIBILITY, NEW FEATURE] `stri_sub*` now accept the
`from` argument being a matrix like `cbind(from, length=length)`.
Unnamed columns or any other names are still interpreted as `cbind(from, to)`.
Also, the new argument `use_matrix` can be used to disable
the special treatment of such matrices.
* [DOCUMENTATION] It has been clarified that the syntax of `*_charclass`
(e.g., used in `stri_trim*`) differs slightly from regex character
classes.
* [NEW FEATURE] #420: `stri_sprintf` (alias: `stri_string_format`)
is a Unicode-aware replacement for and enhancement of the base `sprintf`:
it adds a customised handling of `NA`s (on demand), computing field size
based on code point width, outputting substrings of at most given width,
variable width and precision (both at the same time), etc. Moreover,
`stri_printf` can be used to display formatted strings conveniently.
* [NEW FEATURE] #153: `stri_match_*_regex` now extract capture group names.
* [NEW FEATURE] #25: `stri_locate_*_regex` now have a new argument,
`capture_groups`, which allows for extracting positions of matches
to parenthesised subexpressions.
* [NEW FEATURE] `stri_locate_*` now have a new argument, `get_length`,
whose setting may result in generating *from-length* matrices
(instead of *from-to* ones).
* [NEW FEATURE] #438: `stri_trans_general` now supports rule-based
as well as reverse-direction transliteration.
* [NEW FEATURE] #434: `stri_datetime_format` and `stri_datetime_parse`
are now vectorised also with respect to the `format` argument.
* [NEW FEATURE] `stri_datetime_fstr` has a new argument, `ignore_special`,
which defaults to `TRUE` for backward compatibility.
* [NEW FEATURE] `stri_datetime_format`, `stri_datetime_add`, and
`stri_datetime_fields` now call `as.POSIXct` more eagerly.
* [NEW FEATURE] `stri_trim*` now have a new argument, `negate`.
* [NEW FEATURE] `stri_replace_rstr` converts `gsub`-style replacement strings
to `stri_replace`-style.
* [INTERNAL] `stri_prepare_arg*` have been refactored, buffer overruns
in the exception handling subsystem are now avoided.
* [BUGFIX] Few functions (`stri_length`, `stri_enc_toutf32`, etc.)
did not throw an exception on an invalid UTF-8
byte sequence (and merely issued a warning instead).
* [BUGFIX] `stri_datetime_fstr` did not honour `NA_character_`
and did not parse format strings such as `"%Y%m%d"` correctly.
It has now been completely rewritten (in C).
* [BUGFIX] `stri_wrap` did not recognise the width of certain Unicode sequences
correctly.
## 1.6.2 (2021-05-14)
* [BACKWARD INCOMPATIBILITY] In `stri_enc_list()`,
`simplify` now defaults to `TRUE`.
* [NEW FEATURE] #425: The outputs of `stri_enc_list()`, `stri_locale_list()`,
`stri_timezone_list()`, and `stri_trans_list()` are now sorted.
* [NEW FEATURE] #428: In `stri_flatten`, `na_empty=NA` now omits missing values.
* [BUILD TIME] #431: Pre-4.9.0 GCC has `::max_align_t`,
but not `std::max_align_t`, added a (possible) workaround, see the `INSTALL`
file.
* [BUGFIX] #429: `stri_width()` misclassified the width of certain
code points (including grave accent, Eszett, etc.);
General category *Sk* (Symbol, modifier) is no longer of width 0,
`UCHAR_EAST_ASIAN_WIDTH` of `U_EA_AMBIGUOUS` is no longer of width 2.
* [BUGFIX] #354: `ALTREP` `CHARSXP`s were not copied, and thus could have been
garbage collected in the so-called meanwhile (with thanks to @jimhester).
## 1.6.1 (2021-05-05)
* [GENERAL] #401: stringi is now bundled with ICU4C 69.1 (upgraded from 61.1),
which is used on most Windows and OS X builds as well as on *nix systems
not equipped with system ICU. However, if the C++11 support is disabled,
stringi will be built against the battle-tested ICU4C 55.1.
The update to ICU brings Unicode 13.0 and CLDR 39 support.
* [DOCUMENTATION] A draft version of a paper on **`stringi`** is now available
at .
* [GENERAL] stringi now requires R >= 3.1 (`CXX_STD` of `CXX11` or `CXX1X`).
* [NEW FEATURE] #408: `stri_trans_casefold()` performs case folding;
this is different from case mapping, which is locale-dependent.
Folding makes two pieces of text that differ only in case identical.
This can come in handy when comparing strings.
* [NEW FEATURE] #421: `stri_rank()` ranks strings in a character vector
(e.g., for ordering data frames with regards to multiple criteria,
the ranks can be passed to `order()`, see #219).
* [NEW FEATURE] #266: `stri_width()` now supports emojis.
* [NEW FEATURE] `%s$%` and `%stri$%` are now vectorised with respect to
both arguments.
* [BUGFIX] `stri_sort_key()` now outputs `bytes`-encoded strings.
* [BUGFIX] #415: `locale=''` was not equivalent to `locale=NULL`
in `stri_opts_collator()`.
* [INTERNAL] #414: Use `LEVELS(x)` macro instead of accessing `(x)->sxpinfo.gp`
directly (@lukaszdaniel).
## 1.5.3 (2020-09-04)
* [DOCUMENTATION] stringi home page has moved to
<https://stringi.gagolewski.com/> and now includes a comprehensive reference
manual.
* [NEW FEATURE] #400: `%s$%` and `%stri$%` are now binary operators
that call base R's `sprintf()`.
* [NEW FEATURE] #399: The `%s*%` and `%stri*%` operators can be used
in addition to `stri_dup()`, for the very same purpose.
* [NEW FEATURE] #355: `stri_opts_regex()` now accepts the `time_limit` and
`stack_limit` options so as to prevent malformed or malicious regexes
from running for too long.
* [NEW FEATURE] #345: `stri_startswith()` and `stri_endswith()` are now equipped
with the `negate` parameter.
* [NEW FEATURE] #382: Incorrect regexes are now reported to ease debugging.
* [DEPRECATION WARNING] #347: Any unknown option passed to `stri_opts_fixed()`,
`stri_opts_regex()`, `stri_opts_coll()`, and `stri_opts_brkiter()` now
generates a warning. In the future, the `...` parameter will be removed,
so that will be an error.
* [DEPRECATION WARNING] `stri_duplicated()`'s `fromLast` argument
has been renamed `from_last`. `fromLast` is now its alias scheduled
for removal in a future version of the package.
* [DEPRECATION WARNING] `stri_enc_detect2()`
is scheduled for removal in a future version of the package.
Use `stri_enc_detect()` or the more targeted `stri_enc_isutf8()`,
`stri_enc_isascii()`, etc., instead.
* [DEPRECATION WARNING] `stri_read_lines()`, `stri_write_lines()`,
`stri_read_raw()`: use `con` argument instead of `fname` now.
The argument `fallback_encoding` is scheduled for removal and is no longer
used. `stri_read_lines()` does not support `encoding="auto"` anymore.
* [DEPRECATION WARNING] `nparagraphs` in `stri_rand_lipsum()` has been renamed
`n_paragraphs`.
* [NEW FEATURE] #398: Alternative, British spelling of function parameters
has been introduced, e.g., `stri_opts_coll()` now supports both
`normalization` and `normalisation`.
* [NEW FEATURE] #393: `stri_read_bin()`, `stri_read_lines()`, and
`stri_write_lines()` are no longer marked as draft API.
* [NEW FEATURE] #187: `stri_read_bin()`, `stri_read_lines()`, and
`stri_write_lines()` now support connection objects as well.
* [NEW FEATURE] #386: New function `stri_sort_key()` for generating
locale-dependent sort keys which can be ordered at the byte level and
return an equivalent ordering to the original string (@DavisVaughan).
* [BUGFIX] #138: `stri_encode()` and `stri_rand_strings()`
now can generate strings of much larger lengths.
* [BUGFIX] `stri_wrap()` did not honour `indent` correctly when
`use_width` was `TRUE`.
## 1.4.6 (2020-02-17)
* [BACKWARD INCOMPATIBILITY] #369: `stri_c()` now returns an empty string
when input is empty and `collapse` is set.
* [BUGFIX] #370: fixed an issue in `stri_prepare_arg_POSIXct()`
reported by rchk.
* [DOCUMENTATION] #372: documented arguments not in `\usage` in
documentation object `stri_datetime_format`: `...`
## 1.4.5 (2020-01-11)
* [BUGFIX] #366: fix for #363 required ICU >= 55 .
## 1.4.4 (2020-01-06)
* [BUGFIX] #348: Avoid copying 0 bytes to a nil-buffer in `stri_sub_all()`.
* [BUGFIX] #362: Removed `configure` variable `CXXCPP` as it is now deprecated.
* [BUGFIX] #318: PROTECTing objects from gcing as reported by `rchk`.
* [BUGFIX] #344, #364: Removed compiler warnings in icu61/common/cstring.h.
* [BUGFIX] #363: Status of `RegexMatcher` is now checked after its use.
## 1.4.3 (2019-03-12)
* [NEW FEATURE] #30: New function `stri_sub_all()` - a version of
`stri_sub()` accepting list `from`/`to`/`length` arguments for extracting
multiple substrings from each string in a character vector.
* [NEW FEATURE] #30: New function `stri_sub_all<-()` (and its `%<%`-friendly
version, `stri_sub_replace_all()`) - for replacing multiple substrings
with corresponding replacement strings.
* [NEW FEATURE] In `stri_sub_replace()`, `value` parameter
has a new alias, `replacement`.
* [NEW FEATURE] New convenience functions based on `stri_remove_empty()`:
`stri_omit_empty_na()`, `stri_remove_empty_na()`, `stri_omit_empty()`,
and also `stri_remove_na()`, `stri_omit_na()`.
* [BUGFIX] #343: `stri_trans_char()` did not yield correct results
for overlapping pattern and replacement strings.
* [WARNFIX] #205: `configure.ac` is now included in the source bundle.
## 1.3.1 (2019-02-10)
* [BACKWARD INCOMPATIBILITY] #335: A fix to #314 prevented (by design) the use
of the system ICU if the library had been compiled with `U_CHARSET_IS_UTF8=1`.
However, this is the default setting in `libicu`>=61. From now on, in such
cases the system ICU is used more eagerly, but `stri_enc_set()` issues
a warning stating that the default (UTF-8) encoding cannot be changed.
* [NEW FEATURE] #232: All `stri_detect_*` functions now have the `max_count`
argument that allows for, e.g., stopping at the first pattern occurrence.
* [NEW FEATURE] #338: `stri_sub_replace()` is now an alias for `stri_sub<-()`
which makes it much more easily pipable (@yutannihilation, @BastienFR).
* [NEW FEATURE] #334: Added missing `icudt61b.dat` to support big-endian
platforms (thanks to Dimitri John Ledkov @xnox).
* [BUGFIX] #296: Out-of-the box build used to fail on CentOS 6, upgraded
`configure` to `--disable-cxx11` more eagerly at an early stage.
* [BUGFIX] #341: Fixed possible buffer overflows when calling `strncpy()`
from within ICU 61.
* [BUGFIX] #325: Made `configure` more portable so that it works
under `/bin/dash` now.
* [BUGFIX] #319: Fixed overflow in `stri_rand_shuffle()`.
* [BUGFIX] #337: Empty search patterns in search functions (e.g.,
`stri_split_regex()` and `stri_count_fixed()`) used to raise
too many warnings on empty search patterns.
## 1.2.4 (2018-07-20)
* [BUGFIX] #314: Testing `U_CHARSET_IS_UTF8` in `configure` when
using `pkg-build`.
* [BUILD TIME] #317: Included `icudt61l.zip` in the source bundle to solve
the frequent `icudt download failed` error (also on CRAN's `windows-release`
and `windows-oldrel`). (reverted in version 1.3.1, the `winbuilder`
errors were caused by a build chain bug).
## 1.2.3 (2018-05-16)
* [BUGFIX] #296: Fixed the behaviour of the `configure` script on CentOS 6.
* [BUGFIX] Fixed broken Windows build by updating the `icudt` mirror list.
## 1.2.2 (2018-05-01)
* [GENERAL] #193: stringi is now bundled with ICU4C 61.1,
which is used on most Windows and OS X builds as well as on *nix systems
not equipped with ICU. However, if the C++11 support is disabled,
stringi will be built against ICU4C 55.1. The update to ICU brings
Unicode 10.0 support, including new emoji characters.
* [BUGFIX] #288: `stri_match()` did not return the correct number of columns
when input was empty.
* [NEW FEATURE] #188: `stri_enc_detect()` now returns a list of data frames.
* [NEW FEATURE] #289: `stri_flatten()` now has `na_empty` and `omit_empty`
arguments.
* [NEW FEATURE] New functions: `stri_remove_empty()`, `stri_na2empty()`.
* [NEW FEATURE] #285: Coercion from a non-trivial list (one that consists
of atomic vectors, each of length 1) to an atomic vector now issues a warning.
* [WARN] Removed `-Wparentheses` warnings in `icu55/common/cstring.h:38:63`
and `icu55/i18n/windtfmt.cpp` in the ICU4C 55.1 bundle.
## 1.1.7 (2018-03-06)
* [BUGFIX] Fixed ICU4C 55.1 generating some *significant warnings*
(`icu55/i18n/winnmfmt.cpp`) and *suppressing important diagnostics*
(`src/icu55/i18n/decNumber.c`).
## 1.1.6 (2017-11-10)
* [WINDOWS SPECIFIC] #270: Strings marked with `latin1` encoding
are now converted internally to UTF-8 using the WINDOWS-1252 codec.
This fixes problems with - among others - displaying the Euro sign.
* [NEW FEATURE] #263: Added support for custom rule-based break iteration,
see `?stri_opts_brkiter`.
* [NEW FEATURE] #267: `omit_na=TRUE` in `stri_sub<-()` now ignores missing
values in any of the arguments provided.
* [BUGFIX] Fixed unPROTECTed variable names and stack imbalances
as reported by `rchk`.
## 1.1.5 (2017-04-07)
* [GENERAL] stringi now requires ICU4C >= 52.
* [BUGFIX] Fixed errors pointed out by `clang-UBSAN` in `stri_brkiter.h`.
* [GENERAL] stringi now requires R >= 2.14.
* [BUILD TIME] #238, #220: Now trying *standard* ICU4C build flags if a call
to `pkg-config` fails.
* [BUILD TIME] #258: Use `CXX11` instead of `CXX1X` on R >= 3.4.
* [BUILD TIME, BUGFIX] #254: `dir.exists()` is R >= 3.2.
## 1.1.3 (2017-03-21)
* [REMOVE DEPRECATED] `stri_install_check()` and `stri_install_icudt()`
marked as deprecated in stringi 0.5-5 are no longer being exported.
* [BUGFIX] #227: Incorrect behaviour of `stri_sub()` and `stri_sub<-()`
if the empty string was the result.
* [BUILD TIME] #231: The `configure` (Linux/Unix only) script now reads the
following environment variables: `STRINGI_CFLAGS`, `STRINGI_CPPFLAGS`,
`STRINGI_CXXFLAGS`, `STRINGI_LDFLAGS`, `STRINGI_LIBS`,
`STRINGI_DISABLE_CXX11`, `STRINGI_DISABLE_ICU_BUNDLE`,
`STRINGI_DISABLE_PKG_CONFIG`, `PKG_CONFIG`,
see `INSTALL` for more information.
* [BUILD TIME] #253: Call to `R_useDynamicSymbols()` added.
* [BUILD TIME] #230: `icudt` is now being downloaded by
`configure` (*NIX only) *before* building.
* [BUILD TIME] #242: `_COUNT/_LIMIT` enum constants have been deprecated
as of ICU 58.2, stringi code has been upgraded accordingly.
## 1.1.2 (2016-09-30)
* [BUGFIX] `round()` and `snprintf()` are not C++98.
## 1.1.1 (2016-05-25)
* [BUGFIX] #214: Allow a regex pattern like `.*` to match an empty string.
* [BUGFIX] #210: `stri_replace_all_fixed(c("1", "NULL"), "NULL", NA)`
now results in `c("1", NA)`.
* [NEW FEATURE] #199: `stri_sub<-()` now allows for ignoring `NA` locations
(a new `omit_na` argument added).
* [NEW FEATURE] #207: `stri_sub<-()` now allows for substring insertions
(via `length=0`).
* [NEW FUNCTION] #124: `stri_subset<-()` functions added.
* [NEW FEATURE] #216: `stri_detect()`, `stri_subset()`, `stri_subset<-()`
now all have the `negate` argument.
* [NEW FUNCTION] #175: `stri_join_list()` concatenates all strings
in a list of character vectors. Useful in conjunction with, e.g.,
`stri_extract_all_regex()`, `stri_extract_all_words()`, etc.
## 1.0-1 (2015-10-22)
* [GENERAL] #88: C API is now available for use in, e.g., Rcpp packages, see
<https://github.com/gagolews/ExampleRcppStringi> for an example.
* [BUGFIX] #183: Floating point exception raised in `stri_sub()` and
`stri_sub<-()` when `to` or `length` was a zero-length numeric vector.
* [BUGFIX] #180: `stri_c()` warned incorrectly (recycling rule) when using more
than two elements.
## 0.5-5 (2015-06-28)
* [BACKWARD INCOMPATIBILITY] `stri_install_check()` and `stri_install_icudt()`
are now deprecated. From now on they are supposed to be used only
by the stringi installer.
* [BUGFIX] #176: A patch for `sys/feature_tests.h` no longer included
(the original file was copyrighted by Sun Microsystems); fixed the *Compiler
or options invalid for pre-Unix 03 X/Open applications and pre-2001 POSIX
applications* error by forcing (conditionally) `_XPG6` conformance.
* [BUGFIX] #174: `stri_paste()` did not generate any warning when
the recycling rule is violated and `sep==""`.
* [BUGFIX] #170: `icu::setDataDirectory` is no longer called if our ICU
source bundle is not used (this used to cause build problems on openSUSE).
* [BUILD TIME] #169: `configure` now tries to switch to the *standard*
C++ compiler if a C++11 one is not configured correctly.
* [BUILD TIME] `configure.win` (`Biarch: TRUE`) now mimics `autoconf`'s
`AC_SUBST` and `AC_CONFIG_FILES` so that the build process is now
more similar across different platforms.
* [NEW FEATURE] `stri_info()` now also gives information about which version
of ICU4C is in use (system or bundle).
## 0.5-2 (2015-06-21)
* [BACKWARD INCOMPATIBILITY] The second argument to `stri_pad_*()` has
been renamed `width`.
* [GENERAL] #69: stringi is now bundled with ICU4C 55.1.
* [NEW FUNCTIONS] `stri_extract_*_boundaries()` extract text between text
boundaries.
* [NEW FUNCTION] #46: `stri_trans_char()` is a stringi-flavoured
`chartr()` equivalent.
* [NEW FUNCTION] #8: `stri_width()` approximates the *width* of a string
in a more Unicode-ish fashion than `nchar(..., "width")`
* [NEW FEATURE] #149: `stri_pad()` and `stri_wrap()` are now (by default)
based on code point widths instead of the number of code points.
Moreover, the default behaviour of `stri_wrap()` is now such that it
does not get rid of non-breaking, zero width, etc., spaces.
* [NEW FEATURE] #133: `stri_wrap()` silently allows for `width <= 0`
(for compatibility with `strwrap()`).
* [NEW FEATURE] #139: `stri_wrap()` gained a new argument: `whitespace_only`.
* [NEW FUNCTIONS] #137: Date-time formatting/parsing:
* `stri_timezone_list()` - lists all known time zone identifiers;
* `stri_timezone_set()`, `stri_timezone_get()` - manage the current
default time zone;
* `stri_timezone_info()` - basic information on a given time zone;
* `stri_datetime_symbols()` - gives localizable date-time formatting data;
* `stri_datetime_fstr()` - converts a `strptime`-like format string
to an ICU date/time format string;
* `stri_datetime_format()` - converts date/time to string;
* `stri_datetime_parse()` - converts string to date/time object;
* `stri_datetime_create()` - constructs date-time objects
from numeric representations;
* `stri_datetime_now()` - returns current date-time;
* `stri_datetime_fields()` - returns date-time fields' values;
* `stri_datetime_add()` - adds specific number of date-time units
to a date-time object.
* [GENERAL] #144: Performance improvements in handling ASCII strings
(these affect `stri_sub()`, `stri_locate()` and other string index-based
operations)
* [GENERAL] #143: Searching for short fixed patterns (`stri_*_fixed()`) now
relies on the current `libC`'s implementation of `strchr()` and `strstr()`.
This is very fast, e.g., on `glibc` using the `SSE2/3/4` instruction set.
* [BUILD TIME] #141: A local copy of `icudt*.zip` may be used on package
install; see the `INSTALL` file for more information.
* [BUILD TIME] #165: The `configure` option `--disable-icu-bundle`
forces the use of system ICU when building the package.
* [BUGFIX] Locale specifiers are now normalized in a more intelligent way:
e.g., `@calendar=gregorian` expands to `DEFAULT_LOCALE@calendar=gregorian`.
* [BUGFIX] #134: `stri_extract_all_words()` did not accept `simplify=NA`.
* [BUGFIX] #132: Incorrect behaviour in `stri_locate_regex()` for matches
of zero lengths.
* [BUGFIX] stringr/#73: `stri_wrap()` returned `CHARSXP` instead of `STRSXP`
on empty string input with `simplify=FALSE` argument.
* [BUGFIX] #164: Using `libicu-dev` failed on Ubuntu
(`LIBS` shall be passed after `LDFLAGS` and the list of `.o` files).
* [BUGFIX] #168: Build now fails if `icudt` is not available.
* [BUGFIX] #135: C++11 is now used by default (see the `INSTALL` file,
however) to build stringi from sources. This is because ICU4C uses the
`long long` type which is not part of the C++98 standard.
* [BUGFIX] #154: Dates and other objects with a custom class attribute
were not coerced to the character type correctly.
* [BUGFIX] Force ICU `u_init()` call on the stringi dynlib load.
* [BUGFIX] #157: Many overfull `hbox`es in the package PDF manual have been
corrected.
## 0.4-1 (2014-12-11)
* [IMPORTANT CHANGE] `n_max` argument in `stri_split_*()` has been renamed `n`.
* [IMPORTANT CHANGE] `simplify=FALSE` in `stri_extract_all_*()` and
`stri_split_*()` now calls `stri_list2matrix()` with `fill=""`.
`fill=NA_character_` may be obtained by using `simplify=NA`.
* [IMPORTANT CHANGE, NEW FUNCTIONS] #120: `stri_extract_words()` has been
renamed `stri_extract_all_words()` and `stri_locate_boundaries()` -
`stri_locate_all_boundaries()` as well as `stri_locate_words()` -
`stri_locate_all_words()`. New functions are now available:
`stri_locate_first_boundaries()`, `stri_locate_last_boundaries()`,
`stri_locate_first_words()`, `stri_locate_last_words()`,
`stri_extract_first_words()`, `stri_extract_last_words()`.
* [IMPORTANT CHANGE] #111: `opts_regex`, `opts_collator`, `opts_fixed`, and
`opts_brkiter` can now be supplied individually via `...`.
In other words, you may now simply call, e.g.,
`stri_detect_regex(str, pattern, case_insensitive=TRUE)` instead of
`stri_detect_regex(str, pattern,
opts_regex=stri_opts_regex(case_insensitive=TRUE))`.
* [NEW FEATURE] #110: Fixed pattern search engine's settings can
now be supplied via `opts_fixed` argument in `stri_*_fixed()`,
see `stri_opts_fixed()`. A simple (not suitable for natural language
processing) yet very fast `case_insensitive` pattern matching can be
performed now. `stri_extract_*_fixed()` is again available.
* [NEW FEATURE] #23: `stri_extract_all_fixed()`, `stri_count()`, and
`stri_locate_all_fixed()` may now also look for overlapping pattern
matches, see `?stri_opts_fixed`.
* [NEW FEATURE] #129: `stri_match_*_regex()` gained a `cg_missing` argument.
* [NEW FEATURE] #117: `stri_extract_all_*()`, `stri_locate_all_*()`,
`stri_match_all_*()` gained a new argument: `omit_no_match`.
Setting it to `TRUE` makes these functions compatible with their
**`stringr`** equivalents.
* [NEW FEATURE] #118: `stri_wrap()` gained `indent`, `exdent`, `initial`,
and `prefix` arguments. Moreover, Knuth's dynamic word wrapping algorithm
now assumes that the cost of printing the last line is zero, see #128.
* [NEW FEATURE] #122: `stri_subset()` gained an `omit_na` argument.
* [NEW FEATURE] `stri_list2matrix()` gained an `n_min` argument.
* [NEW FEATURE] #126: `stri_split()` is now also able to act
just like `stringr::str_split_fixed()`.
* [NEW FEATURE] #119: `stri_split_boundaries()` now has
`n`, `tokens_only`, and `simplify` arguments. Additionally,
`stri_extract_all_words()` is now equipped with `simplify` arg.
* [NEW FEATURE] #116: `stri_paste()` gained a new argument:
`ignore_null`. Setting it to `TRUE` makes this function more compatible
with `paste()`.
* [OTHER] #123: `useDynLib` is used to speed up symbol look-up in
the compiled dynamic library.
* [BUGFIX] #114: `stri_paste()`: could return result in an incorrect order.
* [BUGFIX] #94: Run-time errors on Solaris caused by setting
`-DU_DISABLE_RENAMING=1` - memory allocation errors in, among others,
the ICU `UnicodeString`. This setting also caused some `ASAN` sanity check
failures within ICU code.
## 0.3-1 (2014-11-06)
* [IMPORTANT CHANGE] #87: `%>%` overlapped with the pipe operator from
the `magrittr` package; now each operator like `%>%` has been renamed `%s>%`.
* [IMPORTANT CHANGE] #108: Now the `BreakIterator` (for text boundary analysis)
may be more easily controlled via `stri_opts_brkiter()` (see options `type`
and `locale` which aim to replace now-removed `boundary` and `locale`
parameters to `stri_locate_boundaries()`, `stri_split_boundaries()`,
`stri_trans_totitle()`, `stri_extract_words()`, and `stri_locate_words()`).
* [NEW FUNCTIONS] #109: `stri_count_boundaries()` and `stri_count_words()`
count the number of text boundaries in a string.
* [NEW FUNCTIONS] #41: `stri_startswith_*()` and `stri_endswith_*()`
determine whether a string starts or ends with a given pattern.
* [NEW FEATURE] #102: `stri_replace_all_*()` now all have the `vectorize_all`
parameter, which defaults to `TRUE` for backward compatibility.
* [NEW FUNCTION] #91: Added `stri_subset_*()` - a convenient and more efficient
substitute for `str[stri_detect_*(str, ...)]`.
* [NEW FEATURE] #100: `stri_split_fixed()`, `stri_split_charclass()`,
`stri_split_regex()`, `stri_split_coll()` gained a `tokens_only` parameter,
which defaults to `FALSE` for backward compatibility.
* [NEW FUNCTION] #105: `stri_list2matrix()` converts lists of atomic vectors
to character matrices, useful in conjunction with `stri_split()`
and `stri_extract()`.
* [NEW FEATURE] #107: `stri_split_*()` now allow
setting an `omit_empty=NA` argument.
* [NEW FEATURE] #106: `stri_split()` and `stri_extract_all()`
gained a `simplify` argument
(if `TRUE`, then `stri_list2matrix(..., byrow=TRUE)`
is called on the resulting list).
* [NEW FUNCTION] #77: `stri_rand_lipsum()` generates
a (pseudo)random dummy *lorem ipsum* text.
* [NEW FEATURE] #98: `stri_trans_totitle()` gained a `opts_brkiter`
parameter; it indicates which ICU `BreakIterator` should be used when
case mapping.
* [NEW FEATURE] `stri_wrap()` gained a new parameter: `normalize`.
* [BUGFIX] #86: `stri_*_fixed()`, `stri_*_coll()`, and `stri_*_regex()` could
give incorrect results if one of the search strings was of length 0.
* [BUGFIX] #99: `stri_replace_all()` did not use the `replacement` arg.
* [BUGFIX] #112: Some of the objects were not PROTECTed from
garbage collection - this could have led to spontaneous SEGFAULTS.
* [BUGFIX] Some collator's options were not passed correctly to ICU services.
* [BUGFIX] Memory leaks as detected by
`valgrind --tool=memcheck --leak-check=full` have been removed.
* [DOCUMENTATION] Significant extensions/clean ups in the stringi manual.
## 0.2-5 (2014-05-16)
* Some examples are no longer run if `icudt` is not available
(this was, however, reverted in a later version).
## 0.2-4 (2014-05-15)
* [BUGFIX] Fixed issues with loading of misaligned addresses
in `stri_*_fixed()`.
## 0.2-3 (2014-05-14)
* [IMPORTANT CHANGE] `stri_cmp*()` now do not allow for passing
`opts_collator=NA`. From now on, `stri_cmp_eq()`, `stri_cmp_neq()`,
and the new operators `%===%`, `%!==%`, `%stri===%`, and `%stri!==%`
are locale-independent operations, which are based on code point comparisons.
New functions `stri_cmp_equiv()` and `stri_cmp_nequiv()`
(and from now on also `%==%`, `%!=%`, `%stri==%`, and `%stri!=%`)
test for canonical equivalence.
* [IMPORTANT CHANGE] `stri_*_fixed()` search functions now perform
a locale-independent exact (byte-wise, of course after conversion to UTF-8)
pattern search. All the `Collator`-based, locale-dependent search routines
are now available via `stri_*_coll()`. The reason behind this is that
ICU's `USearch` has currently very poor performance. What is more,
in many search tasks exact pattern matching is sufficient anyway.
* [GENERAL] `stri_*_fixed` now use a tweaked Knuth-Morris-Pratt search
algorithm which improves the search performance drastically.
* [IMPORTANT CHANGE] `stri_enc_nf*()` and `stri_enc_isnf*()` function families
have been renamed `stri_trans_nf*()` and `stri_trans_isnf*()`,
respectively -- they deal with text transforming,
and not with character encoding. Note that all of these may
be performed by ICU's `Transliterator` too (see below).
* [NEW FUNCTION] `stri_trans_general()` and `stri_trans_list()` give access
to ICU's `Transliterator`: they may be used to perform some generic
text transforms, like Unicode normalisation, case folding, etc.
* [NEW FUNCTION] `stri_split_boundaries()` uses ICU's `BreakIterator`
to split strings at specific text boundaries. Moreover,
`stri_locate_boundaries()` indicates positions of these boundaries.
* [NEW FUNCTION] `stri_extract_words()` uses ICU's `BreakIterator` to
extract all words from a text. Additionally, `stri_locate_words()`
locates start and end positions of words in a text.
* [NEW FUNCTION] `stri_pad()`, `stri_pad_left()`, `stri_pad_right()`,
and `stri_pad_both()` pad a string with a specific code point.
* [NEW FUNCTION] `stri_wrap()` breaks paragraphs of text into lines.
Two algorithms (greedy and minimal raggedness) are available.
* [IMPORTANT CHANGE] `stri_*_charclass()` search functions now
rely solely on ICU's `UnicodeSet` patterns. All the previously accepted
charclass identifiers became invalid. However, new patterns
should now be more familiar to the users (they are regex-like).
Moreover, we observe a very nice performance gain.
* [IMPORTANT CHANGE] `stri_sort()` now does not include `NA`s
in output vectors by default, for compatibility with `sort()`.
Moreover, currently none of the input vector's attributes are preserved.
* [NEW FUNCTION] `stri_unique()` extracts unique elements from
a character vector.
* [NEW FUNCTIONS] `stri_duplicated()` and `stri_duplicated_any()`
determine duplicate elements in a character vector.
* [NEW FUNCTION] `stri_replace_na()` replaces `NA`s in a character vector
with a given string, useful for emulating, e.g., R's `paste()` behaviour.
* [NEW FUNCTION] `stri_rand_shuffle()` generates a random permutation
of code points in a string.
* [NEW FUNCTION] `stri_rand_strings()` generates random strings.
* [NEW FUNCTIONS] New functions and binary operators for string comparison:
`stri_cmp_eq()`, `stri_cmp_neq()`, `stri_cmp_lt()`, `stri_cmp_le()`,
`stri_cmp_gt()`, `stri_cmp_ge()`, `%==%`, `%!=%`, `%<%`, `%<=%`,
`%>%`, `%>=%`.
* [NEW FUNCTION] `stri_enc_mark()` reads declared encodings of character
strings as seen by stringi.
* [NEW FUNCTION] `stri_enc_tonative(str)` is an alias to
`stri_encode(str, NULL, NULL)`.
* [NEW FEATURE] `stri_order()` and `stri_sort()` now have an additional
argument `na_last` (defaults to `TRUE` and `NA`, respectively).
* [NEW FEATURE] `stri_replace_all_charclass()`, `stri_extract_all_charclass()`,
and `stri_locate_all_charclass()` now have a new argument, `merge`
(defaults to `FALSE` for backward-compatibility). It may be used
to, e.g., replace sequences of white spaces with a single space.
* [NEW FEATURE] `stri_enc_toutf8()` now has a new `validate` argument
(which defaults to `FALSE` for backward-compatibility). It may be used
in a (rare) case where a user wants to fix an invalid UTF-8 byte sequence.
`stri_length()` (among others) now detects invalid UTF-8 byte sequences.
* [NEW FEATURE] All binary operators `%???%` now also have aliases `%stri???%`.
* [GENERAL] Performance improvements in `StriContainerUTF8`
and `StriContainerUTF16` (they affect most other functions).
* [GENERAL] Significant performance improvements in `stri_join()`,
`stri_flatten()`, `stri_cmp()`, `stri_trans_to*()`, and others.
* [GENERAL] Added 3rd mirror site for our `icudt` binary distribution.
* `U_MISSING_RESOURCE_ERROR` message in `StriException` now suggests
calling `stri_install_check()`.
* [BUGFIX] UTF-8 BOMs are now silently removed from input strings.
* [BUGFIX] No more attempts to re-encode UTF-8 encoded strings
if native encoding is UTF-8 in `StriContainerUTF8`.
* [BUGFIX] Possible memory leaks when throwing errors via `Rf_error()`.
* [BUGFIX] `stri_order()` and `stri_cmp()` could return incorrect results
for `opts_collator=NA`.
* [BUGFIX] `stri_sort()` did not guarantee to return strings in UTF-8.
## 0.1-25 (2014-03-12)
* LICENSE tweaks.
* First CRAN release.
## 0.1-24 (2014-03-11)
* Fixed bugs detected with `ASAN` and `UBSAN`,
e.g., fixed `CharClass::gcmask` type (`enum` -> `uint32_t`)
(reported by `UBSAN`).
* Fixed array over-runs detected with `valgrind` in `string8.h`.
* Fixed uninitialised class fields in `StriContainerUTF8`
(reported by `valgrind`).
## 0.1-23 (2014-03-11)
* License changed to BSD-3-clause, COPYRIGHTS updated.
* `icudt` is not shipped with stringi anymore;
it is now downloaded in `install.libs.R` from one of our servers.
* New functions: `stri_install_check()`, `stri_install_icudt()`.
## 0.1-22 (2014-02-20)
* System ICU is used on systems which do have one (version >= 50 needed).
ICU is auto-detected with `pkg-config` in `configure`.
Pass `'--disable-pkg-config'` to `configure` to force building
ICU from sources.
* `icudt52b` (custom subset) is now shipped with stringi
(for big-endian, ASCII systems).
## 0.1-21 (2014-02-19)
* Fixed some issues on Solaris while preparing stringi
for CRAN submission.
## 0.1-20 (2014-02-17)
* ICU4C 52.1 sources included (common, i18n, stubdata + `icu52dt.dat`
loaded dynamically). Compilation via Makevars.
* stringi does not depend on any external libraries anymore.
## 0.1-11 (2013-11-16)
* ICU4C is now statically linked on Windows.
* First OS X binary build.
* The package is being intensively tested by our students at Warsaw
University of Technology.
## 0.1-10 (2013-11-13)
* Using `pkg-config` via `configure` to look for ICU4C libs.
## 0.1-6 (2013-07-05)
* First Windows binary build.
* Compilation passed on Oracle Sun Studio compiler collection.
* By now we have implemented most of the functionality
scheduled for milestone 0.1.
## 0.1-1 (2013-01-05)
* The stringi project has been started.
stringi/src/ 0000755 0001762 0000144 00000000000 14771224007 012524 5 ustar ligges users stringi/src/stri_callables.cpp 0000644 0001762 0000144 00000003727 14750143456 016231 0 ustar ligges users /* This file is part of the 'stringi' project.
* Copyright (c) 2013-2025, Marek Gagolewski
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING,
* BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
* OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
* EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "stri_stringi.h"
#include "stri_callables.h"
/* Table of R_CallMethodDef entries that maps routine names to function
 * pointers. The argument-count field is not used (always 0) and the list
 * is terminated by an all-NULL entry. */
extern const R_CallMethodDef stri_callables[] = {
    {
        "stric_u_hasBinaryProperty",
        (DL_FUNC)(void (*)(void))(stric_u_hasBinaryProperty),
        0 /*unused*/
    },
    { NULL, NULL, 0 }
};
int stric_u_hasBinaryProperty(int c, int which)
{
return (int)u_hasBinaryProperty((UChar32)c, (UProperty)which);
}
stringi/src/stri_ICU_settings.cpp 0000644 0001762 0000144 00000007355 14770540074 016647 0 ustar ligges users /* This file is part of the 'stringi' project.
* Copyright (c) 2013-2025, Marek Gagolewski
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING,
* BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
* OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
* EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "stri_stringi.h"
#ifndef STRI_ICU_FOUND
#include "uconfig_local.h"
#endif
/** Get current-default ICU locale and charset information
*
* @return an R named list with 7 components:
* \code{Unicode.version} == ICU Unicode version,
* \code{ICU.version} == U_ICU_VERSION
* \code{Locale} == \code{stri_locale_info()},
* \code{Charset.internal} == \code{"UTF-8", "UTF-16"},
* \code{Charset.native} == \code{stri_enc_info()},
* \code{ICU.system} == is system ICU used?
* \code{ICU.UTF8} == is U_CHARSET_IS_UTF8 set?
*
* @version 0.1-?? (Marek Gagolewski)
*
* @version 0.1-?? (Marek Gagolewski, 2013-06-16)
* make StriException friendly
*
* @version 0.1-?? (Marek Gagolewski, 2013-11-17)
* added U_ICU_VERSION
*
* @version 0.3-1 (Marek Gagolewski, 2014-11-04)
* Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
*
* @version 0.5-3 (Marek Gagolewski, 2015-06-24)
* new retval field: ICU.system
*
* @version 1.3.1 (Marek Gagolewski, 2019-02-06)
* new retval field: ICU.UTF8
*
*/
SEXP stri_info()
{
    STRI__ERROR_HANDLER_BEGIN(0)

    // ICU.UTF8 is TRUE only if U_CHARSET_IS_UTF8 is defined and non-zero
#if defined(U_CHARSET_IS_UTF8) && U_CHARSET_IS_UTF8
    const int icu_charset_is_utf8 = 1;
#else
    const int icu_charset_is_utf8 = 0;
#endif

    const R_len_t infosize = 7;
    SEXP vals;
    STRI__PROTECT(vals = Rf_allocVector(VECSXP, infosize));

    // version strings
    SET_VECTOR_ELT(vals, 0, Rf_mkString(U_UNICODE_VERSION));
    SET_VECTOR_ELT(vals, 1, Rf_mkString(U_ICU_VERSION));

    // current default locale
    SET_VECTOR_ELT(vals, 2, stri_locale_info(R_NilValue)); // may call Rf_error

    // internal charsets (fixed strings)
    SET_VECTOR_ELT(vals, 3,
        stri__make_character_vector_char_ptr(2, "UTF-8", "UTF-16"));

    // native charset
    SET_VECTOR_ELT(vals, 4, stri_enc_info(R_NilValue)); // may call Rf_error

    // build-time flags
    SET_VECTOR_ELT(vals, 5, Rf_ScalarLogical(STRI_ICU_FOUND));
    SET_VECTOR_ELT(vals, 6, Rf_ScalarLogical(icu_charset_is_utf8));

    stri__set_names(vals, infosize,
        "Unicode.version", "ICU.version", "Locale",
        "Charset.internal", "Charset.native", "ICU.system", "ICU.UTF8");

    STRI__UNPROTECT_ALL
    return vals;

    STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */)
}
stringi/src/stri_brkiter.h 0000644 0001762 0000144 00000022256 14770541312 015407 0 ustar ligges users /* This file is part of the 'stringi' project.
* Copyright (c) 2013-2025, Marek Gagolewski
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING,
* BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
* OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
* EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __stri_brkiter_h
#define __stri_brkiter_h
#include "stri_stringi.h"
#include
#include
#include
#include
#include
#include
/**
* A class to manage a break iterator's options
*
* @version 0.4-1 (Marek Gagolewski, 2014-12-02)
*
* @version 1.1.3 (Marek Gagolewski, 2017-01-07) UBRK_COUNT deprecated
*
* @version 1.1.6 (Marek Gagolewski, 2017-04-22) Add support for RBBI
*/
class StriBrkIterOptions {
protected:
    const char* locale;       // R_alloc'd
    UnicodeString rules;      // empty unless custom rules are in use
    UBreakIteratorType type;
    int32_t* skip_rules;      // R_alloc'd
    R_len_t skip_size;        // number of elements in skip_rules

private:
    /** Resets the plain-data fields to their defaults:
     *  no locale, character-type iterator, no skip rules.
     *  (`rules` is left as is.) */
    void setEmptyOpts()
    {
        skip_size = 0;
        skip_rules = NULL;
        type = UBRK_CHARACTER;
        locale = NULL;
    }

    // option parsers; defined out of line
    void setType(SEXP opts_brkiter, const char* default_type);
    void setLocale(SEXP opts_brkiter);
    void setSkipRuleStatus(SEXP opts_brkiter);

public:
    /** Creates an options object holding the defaults only. */
    StriBrkIterOptions()
    {
        setEmptyOpts();
    }

    /**
     * Creates an options object from a user-supplied settings object.
     *
     * Starts from the defaults and then applies, in this order,
     * the locale, the skip-rule-status and the type settings.
     *
     * @param opts_brkiter  settings object handed to the set*() parsers
     * @param default_type  forwarded to setType()
     */
    StriBrkIterOptions(SEXP opts_brkiter, const char* default_type)
    {
        setEmptyOpts();
        setLocale(opts_brkiter);
        setSkipRuleStatus(opts_brkiter);
        setType(opts_brkiter, default_type);
    }
};
/**
* A class to manage a break iterator
*
* @version 0.3-1 (Marek Gagolewski, 2014-10-30)
*
* @version 0.4-1 (Marek Gagolewski, 2014-12-02) separate class
*
* @version 1.1.3 (Marek Gagolewski, 2017-01-07) UBRK_COUNT deprecated
*
* @version 1.1.6 (Marek Gagolewski, 2017-04-22) Add support for RBBI
*
* @version 1.8.1 (Marek Gagolewski, 2023-11-09)
* warn if resource bundle for an explicitly set locale is unavailable
*/
class StriUBreakIterator : public StriBrkIterOptions {
private:
UBreakIterator* uiterator;
void open() {
#ifndef NDEBUG
if (uiterator) throw StriException("!NDEBUG: StriUBreakIterator::open()");
#endif
UErrorCode status = U_ZERO_ERROR;
if (!rules.isEmpty()) {
UParseError parseErr;
uiterator = ubrk_openRules(rules.getTerminatedBuffer(),
-1/*null-terminated*/, NULL, 0,
&parseErr, &status);
}
else {
switch (type) {
case UBRK_CHARACTER: // character
uiterator = ubrk_open(UBRK_CHARACTER, locale, NULL, 0, &status);
break;
case UBRK_LINE: // line_break
uiterator = ubrk_open(UBRK_LINE, locale, NULL, 0, &status);
break;
case UBRK_SENTENCE: // sentence
uiterator = ubrk_open(UBRK_SENTENCE, locale, NULL, 0, &status);
break;
case UBRK_WORD: // word
uiterator = ubrk_open(UBRK_WORD, locale, NULL, 0, &status);
break;
default:
throw StriException(MSG__INTERNAL_ERROR);
}
}
STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})
if (status == U_USING_DEFAULT_WARNING && uiterator && locale) {
UErrorCode status2 = U_ZERO_ERROR;
const char* valid_locale = ubrk_getLocaleByType(uiterator, ULOC_VALID_LOCALE, &status2);
if (valid_locale && !strcmp(valid_locale, "root"))
Rf_warning("%s", ICUError::getICUerrorName(status));
}
}
public:
StriUBreakIterator()
: StriBrkIterOptions() {
uiterator = NULL;
}
StriUBreakIterator(const StriBrkIterOptions& bropt)
: StriBrkIterOptions(bropt) {
uiterator = NULL;
}
StriUBreakIterator& operator=(const StriBrkIterOptions& bropt) {
this->~StriUBreakIterator();
(StriBrkIterOptions&) (*this) = (StriBrkIterOptions&)bropt;
uiterator = NULL;
return *this;
}
~StriUBreakIterator() {
if (uiterator) {
ubrk_close(uiterator);
uiterator = NULL;
}
}
void free(bool dealloc=true) {
if (uiterator && dealloc) {
ubrk_close(uiterator);
}
uiterator = NULL;
}
UBreakIterator* getIterator() {
if (!uiterator) open();
return uiterator;
}
const char* getLocale() {
return locale;
}
};
/**
* A class to manage a break iterator
*
* @version 0.3-1 (Marek Gagolewski, 2014-10-30)
*
* @version 0.4-1 (Marek Gagolewski, 2014-12-02)
* separate class
*
* @version 1.1.6 (Marek Gagolewski, 2017-04-22) Add support for RBBI
*
* @version 1.8.1 (Marek Gagolewski, 2023-11-09)
* warn if resource bundle for an explicitly set locale is unavailable
*/
class StriRuleBasedBreakIterator : public StriBrkIterOptions {
private:
BreakIterator* rbiterator;
UText* searchText;
R_len_t searchPos; // may be BreakIterator::DONE
const char* searchStr; // owned by caller
R_len_t searchLen; // in bytes
void setEmptyOpts() {
rbiterator = NULL;
searchText = NULL;
searchPos = BreakIterator::DONE;
searchStr = NULL;
searchLen = 0;
}
void open() {
UErrorCode status = U_ZERO_ERROR;
Locale loc = Locale::createFromName(locale);
if (!rules.isEmpty()) {
UParseError parseErr;
rbiterator = (BreakIterator*) new RuleBasedBreakIterator(
UnicodeString(rules), parseErr, status
);
}
else {
switch (type) {
case UBRK_CHARACTER: // character
rbiterator = (BreakIterator*)BreakIterator::createCharacterInstance(loc, status);
break;
case UBRK_LINE: // line_break
rbiterator = (BreakIterator*)BreakIterator::createLineInstance(loc, status);
break;
case UBRK_SENTENCE: // sentence
rbiterator = (BreakIterator*)BreakIterator::createSentenceInstance(loc, status);
break;
case UBRK_WORD: // word
rbiterator = (BreakIterator*)BreakIterator::createWordInstance(loc, status);
break;
default:
throw StriException(MSG__INTERNAL_ERROR);
}
}
STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})
if (status == U_USING_DEFAULT_WARNING && rbiterator && locale) {
UErrorCode status2 = U_ZERO_ERROR;
const char* valid_locale = rbiterator->getLocaleID(ULOC_VALID_LOCALE, status2);
if (valid_locale && !strcmp(valid_locale, "root"))
Rf_warning("%s", ICUError::getICUerrorName(status));
}
}
bool ignoreBoundary();
public:
StriRuleBasedBreakIterator()
: StriBrkIterOptions() {
setEmptyOpts();
}
StriRuleBasedBreakIterator(const StriBrkIterOptions& bropt)
: StriBrkIterOptions(bropt) {
setEmptyOpts();
}
StriRuleBasedBreakIterator& operator=(const StriBrkIterOptions& bropt) {
this->~StriRuleBasedBreakIterator();
(StriBrkIterOptions&) (*this) = (StriBrkIterOptions&)bropt;
setEmptyOpts();
return *this;
}
~StriRuleBasedBreakIterator() {
if (rbiterator) {
delete rbiterator;
rbiterator = NULL;
}
if (searchText) {
utext_close(searchText);
searchText = NULL;
}
}
void setupMatcher(const char* searchStr, R_len_t searchLen);
void first();
bool next();
bool next(std::pair& bdr);
void last();
bool previous(std::pair& bdr);
};
#endif
stringi/src/stri_search_regex_split.cpp 0000644 0001762 0000144 00000022225 14770541312 020146 0 ustar ligges users /* This file is part of the 'stringi' project.
* Copyright (c) 2013-2025, Marek Gagolewski
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING,
* BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
* OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
* EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "stri_stringi.h"
#include "stri_container_utf8.h"
#include "stri_container_integer.h"
#include "stri_container_logical.h"
#include "stri_container_regex.h"
#include
#include
using namespace std;
/**
* Split a string into parts.
*
* The pattern matches identify delimiters that separate the input into fields.
* The input data between the matches becomes the fields themselves.
*
* @param str character vector
* @param pattern character vector
* @param n integer vector
* @param opts_regex
* @param tokens_only single logical value
* @param simplify single logical value
*
* @return list of character vectors or character matrix
*
* @version 0.1-?? (Marek Gagolewski, 2013-06-21)
*
* @version 0.1-?? (Marek Gagolewski, 2013-07-10)
* BUGFIX: wrong behavior on empty str
*
* @version 0.1-24 (Marek Gagolewski, 2014-03-11)
* Added missing utext_close call to avoid memleaks
*
* @version 0.3-1 (Marek Gagolewski, 2014-10-19)
* added tokens_only param
*
* @version 0.3-1 (Marek Gagolewski, 2014-10-23)
* added split param
*
* @version 0.3-1 (Marek Gagolewski, 2014-10-24)
* allow omit_empty=NA
*
* @version 0.3-1 (Marek Gagolewski, 2014-11-05)
* Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
*
* @version 0.4-1 (Marek Gagolewski, 2014-12-04)
* allow `simplify=NA`; FR #126: pass n to stri_list2matrix
*
* @version 1.4.7 (Marek Gagolewski, 2020-08-24)
* Use StriContainerRegexPattern::getRegexOptions
*/
SEXP stri_split_regex(SEXP str, SEXP pattern, SEXP n, SEXP omit_empty,
SEXP tokens_only, SEXP simplify, SEXP opts_regex)
{
bool tokens_only1 = stri__prepare_arg_logical_1_notNA(tokens_only, "tokens_only");
PROTECT(str = stri__prepare_arg_string(str, "str"));
PROTECT(pattern = stri__prepare_arg_string(pattern, "pattern"));
PROTECT(n = stri__prepare_arg_integer(n, "n"));
PROTECT(omit_empty = stri__prepare_arg_logical(omit_empty, "omit_empty"));
PROTECT(simplify = stri__prepare_arg_logical_1(simplify, "simplify"));
R_len_t vectorize_length = stri__recycling_rule(true, 4,
LENGTH(str), LENGTH(pattern), LENGTH(n), LENGTH(omit_empty));
StriRegexMatcherOptions pattern_opts =
StriContainerRegexPattern::getRegexOptions(opts_regex);
UText* str_text = NULL; // may potentially be slower, but definitely is more convenient!
STRI__ERROR_HANDLER_BEGIN(5)
StriContainerUTF8 str_cont(str, vectorize_length);
StriContainerInteger n_cont(n, vectorize_length);
StriContainerLogical omit_empty_cont(omit_empty, vectorize_length);
StriContainerRegexPattern pattern_cont(pattern, vectorize_length, pattern_opts);
SEXP ret;
STRI__PROTECT(ret = Rf_allocVector(VECSXP, vectorize_length));
for (R_len_t i = pattern_cont.vectorize_init();
i != pattern_cont.vectorize_end();
i = pattern_cont.vectorize_next(i))
{
if (n_cont.isNA(i)) {
SET_VECTOR_ELT(ret, i, stri__vector_NA_strings(1));
continue;
}
int n_cur = n_cont.get(i);
int omit_empty_cur = !omit_empty_cont.isNA(i) && omit_empty_cont.get(i);
STRI__CONTINUE_ON_EMPTY_OR_NA_STR_PATTERN(str_cont, pattern_cont,
SET_VECTOR_ELT(ret, i, stri__vector_NA_strings(1));,
SET_VECTOR_ELT(ret, i,
(omit_empty_cont.isNA(i))?stri__vector_NA_strings(1):
stri__vector_empty_strings((omit_empty_cur || n_cur == 0)?0:1));)
R_len_t str_cur_n = str_cont.get(i).length();
const char* str_cur_s = str_cont.get(i).c_str();
if (n_cur >= INT_MAX-1)
throw StriException(MSG__INCORRECT_NAMED_ARG "; " MSG__EXPECTED_SMALLER, "n");
else if (n_cur < 0)
n_cur = INT_MAX;
else if (n_cur == 0) {
SET_VECTOR_ELT(ret, i, Rf_allocVector(STRSXP, 0));
continue;
}
else if (tokens_only1)
n_cur++; // we need to do one split ahead here
UErrorCode status = U_ZERO_ERROR;
RegexMatcher *matcher = pattern_cont.getMatcher(i); // will be deleted automatically
str_text = utext_openUTF8(str_text, str_cont.get(i).c_str(), str_cont.get(i).length(), &status);
STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})
matcher->reset(str_text);
R_len_t k;
deque< pair > fields; // byte based-indices
fields.push_back(pair(0,0));
for (k=1; k < n_cur; ) {
int m_res = (int)matcher->find(status);
STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})
if (!m_res) break;
R_len_t s1 = (R_len_t)matcher->start(status);
STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})
R_len_t s2 = (R_len_t)matcher->end(status);
STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})
if (omit_empty_cur && fields.back().first == s1)
fields.back().first = s2; // don't start any new field
else {
fields.back().second = s1;
fields.push_back(pair(s2, s2)); // start a new field here
++k; // another field
}
}
fields.back().second = str_cur_n;
if (omit_empty_cur && fields.back().first == fields.back().second)
fields.pop_back();
if (tokens_only1 && n_cur < INT_MAX) {
n_cur--; // one split ahead could have been made, see above
while (fields.size() > (size_t)n_cur)
fields.pop_back(); // get rid of the remainder
}
SEXP ans;
STRI__PROTECT(ans = Rf_allocVector(STRSXP, fields.size()));
deque< pair >::iterator iter = fields.begin();
for (k = 0; iter != fields.end(); ++iter, ++k) {
pair curoccur = *iter;
if (curoccur.second == curoccur.first && omit_empty_cont.isNA(i))
SET_STRING_ELT(ans, k, NA_STRING);
else
SET_STRING_ELT(ans, k,
Rf_mkCharLenCE(str_cur_s+curoccur.first, curoccur.second-curoccur.first, CE_UTF8));
}
SET_VECTOR_ELT(ret, i, ans);
STRI__UNPROTECT(1);
}
if (str_text) {
utext_close(str_text);
str_text = NULL;
}
if (LOGICAL(simplify)[0] == NA_LOGICAL || LOGICAL(simplify)[0]) {
R_len_t n_min = 0;
R_len_t n_length = LENGTH(n);
int* n_tab = INTEGER(n);
for (R_len_t i=0; i