stringi/0000755000176200001440000000000014771247052011742 5ustar liggesusersstringi/MD50000644000176200001440000022702614771247052012263 0ustar liggesusersc11040ccc16441809adabbcef6a0e1d8 *DESCRIPTION 5b9bded89d382625aa5194610ef5dc0a *INSTALL 0b274d86f0944f0964b5cbe89f3e1529 *LICENSE fbccc8447bec59bda28786a6a59b3442 *NAMESPACE a4bf945a499f378a0a469204291c105b *NEWS 15bf7790e61900b5d9e6b920a05a19ab *R/ICU_settings.R d97ec069ec10c409e150012b944b3b1b *R/compare.R 938d4c3b5ba033957d059090acf1201e *R/encoding.R adbd123a02c59f02eeee9eadb13296e1 *R/encoding_conversion.R 02dc3549a8a67a19f2d37bbd79c88a9a *R/encoding_detection.R 4e61d55d8d20c9f997850b3b86637be6 *R/encoding_management.R 1fba31d0a3fde174f089bca14deb1580 *R/escape.R d88252597a52b12d42d64df95ef1554e *R/files.R b7361e0556de786d0f9e5fed7ad4dbe8 *R/install.R 66f83400fc39b73e9c5a17c89dac7502 *R/internal_prepare_arg.R 1e2508ee764b3d12a68ea300a03b348f *R/internal_test.R ddf7920a17df98717a5a8e73c45e228c *R/join.R 9e5a917d9ebfd2c85bc84f18286f306d *R/length.R 3f8b72ec3a6a7a943a0e9dd4d74e4aa1 *R/locale.R 96de209032f3e86d7eef0753ddd19afc *R/locale_management.R 77cd1d9086073a133118c9d610093f17 *R/opts.R a3dae305f6a8da323a6a8721a61993c1 *R/pad.R 8a5afc91d4ed94569224cc453fedbca9 *R/random.R 41fd0b9bdcb6fd8e2a430a06bc43d47a *R/reverse.R 654242c9eb4adac6376c7dc30318e832 *R/search.R 9cf8f04e46b0747298d2b4fce0066ad8 *R/search_count_4.R a1a7b9194656386c161e6153d99af703 *R/search_count_bound.R 75bca32414c213f7cf393bd71b8aff04 *R/search_detect_4.R 87294d6151a91130e7219afc551ca7a3 *R/search_extract_4.R 81940054ff415c8a7977f893c65deda3 *R/search_extract_bound.R 909363b0676ab92e279b9c6ef41d51fc *R/search_locate_4.R 81d2c846df4d8c4ab7cb81a12e677d96 *R/search_locate_bound.R ae12bed93e94cb2d516659d6a298229c *R/search_match_4.R 36c3ec95d3c55f325c71e6aa4666cebf *R/search_replace_4.R 4db6b32065ca48cbc1c7e44898ce0d15 *R/search_split_4.R 84874e582b4bfc60bdf959c280e26702 *R/search_split_bound.R 6c4d8f07cd11a2a882cfc601ea09e86b 
*R/search_startsendswith_4.R 770a5ea50a71446f86986c609d8aae0e *R/search_subset_4.R 3f5a99a1033d30881ab2ccc9a72a458b *R/sort.R 6bff0145897f7f39aad25b807a860223 *R/sprintf.R 11992cbf4f72d914075190bdf855191b *R/stats.R 600b4ff16b35ec57f7214acbebda28f3 *R/stringi_package.R cc7b399f1a5d9963e1d8f00d263e3b94 *R/sub.R 1496986a08a1b940534919d0e046ea5b *R/time_calendar.R 115d819cda273048a7785a2eab46464a *R/time_format.R 2aef5f86d5b388da2fff4b16db19daa8 *R/time_symbols.R 4b95cd483be9fc564a057f2b577c1891 *R/time_zone.R f854804f31d27b1a8f4eaacab241da36 *R/trans_casemap.R 2ffdd4e62fbb0f5f287473cb61783431 *R/trans_normalization.R 4cdcf592ea1a768f84fa8d427ce690dd *R/trans_other.R a6f889b5a831281623a8f983aabde151 *R/trans_transliterate.R 0d802c7e9daa6c729065d57d962b0128 *R/trim.R cb18ecf9188d501a56fe13e04013fc5e *R/utils.R bfe8512f21f7dcddf02b8dd277ffa46f *R/wrap.R 4454e632feb75ce4f508807d50b3b12f *build/partial.rdb e786a8e3195a353a5aa9105b1b18fb29 *cleanup 994bf59ab98cbce51323d0179a396e8e *configure 1f85dbfcffef6fa6a62a05e3ff85458c *configure.ac 5164e451ddb1b73c1052e9aae743767d *configure.win 0a51dcf4e38b057ee084a85644c6e3a7 *inst/AUTHORS 11940a3690625fea0670cbee73ede58a *inst/CITATION f281a9a4d5d2a3827941b01abff8dcd5 *man/about_arguments.Rd facc1aea6b469d0fb66839e6e47dc608 *man/about_encoding.Rd aef6a1a3225f905c91775bfd62f7bd38 *man/about_locale.Rd 66b0a115b3f4385ab2b20c73e2df001e *man/about_search.Rd 27e0cc3f55c11e8751f316db9524731c *man/about_search_boundaries.Rd 2a41f016ce9daf72523fd18b6f4bd507 *man/about_search_charclass.Rd bda5beb1839985bba3813a66de887344 *man/about_search_coll.Rd 1bdcbbd57a04b019e98d6917dee1997c *man/about_search_fixed.Rd eace196448388b85525817253daaeab3 *man/about_search_regex.Rd 681f2c39e73f953d03338d95303267f8 *man/operator_add.Rd 95bc60262e823fbe7eb72e321eab3806 *man/operator_compare.Rd 44b518b7ca674e27b558ed30e09bca50 *man/operator_dollar.Rd fe12fe54f17be287d1592f5990969a6b *man/stri_compare.Rd 9bc0b4a887d6fd4f8b568a2c21718900 *man/stri_count.Rd 
4eb69343856500b71ee3e4136e48c265 *man/stri_count_boundaries.Rd 61d4e9ee5515276335670f859df59f69 *man/stri_datetime_add.Rd 012a6b0ea16680030908916840be3ef4 *man/stri_datetime_create.Rd 63229754aac10e796e6c2042587f969f *man/stri_datetime_fields.Rd 845705290346bb17740337ce2ab2ba57 *man/stri_datetime_format.Rd 94b1dceafe07d86c1a07ccbc23f6c473 *man/stri_datetime_fstr.Rd c6db33c91376938541c19f2f5c2415c5 *man/stri_datetime_now.Rd 9dcfb5c4422b2b4428ac83c0a647bb87 *man/stri_datetime_symbols.Rd 8fb721a5777d8f81ae42a0a859e56730 *man/stri_detect.Rd ee907e61319b13c0eb30ed608efc5e64 *man/stri_dup.Rd 0ee4d43c8c0023ef5f3c6b4a6a9d9f7e *man/stri_duplicated.Rd 8139a923d436200a04c19054290be31f *man/stri_enc_detect.Rd e597449d6687a03932c13071b8a47816 *man/stri_enc_detect2.Rd dbf2f340a3bac54920f7fa486de4890f *man/stri_enc_fromutf32.Rd ca659ca78d219cac17d4605d3935d6f3 *man/stri_enc_info.Rd 582d9c1729decaa4c63f8cafbb367d8a *man/stri_enc_isascii.Rd 96ea8addd014e76bd261b55d94a31149 *man/stri_enc_isutf16.Rd d8f4dbbd72de4e1684d79c5863aa6c43 *man/stri_enc_isutf8.Rd fd211c6321254aaec9e050cfc8e3fdf7 *man/stri_enc_list.Rd 196d03d91964c93da50c61b785c4a07f *man/stri_enc_mark.Rd 3d76974bfedd87525b3690e4f77eee58 *man/stri_enc_set.Rd 5e2ee6edbaa990278f09bdecc785ea5a *man/stri_enc_toascii.Rd 35126c3527369f3c485ac742c9a38b16 *man/stri_enc_tonative.Rd 6a26e279c6bd7b40038367933ebb933f *man/stri_enc_toutf32.Rd ee14797f45b703936e5cdac325465e68 *man/stri_enc_toutf8.Rd f2f5fe4b1742c649de2134dac54b8c81 *man/stri_encode.Rd 09e1574a4a464aa9dc871fc319101590 *man/stri_escape_unicode.Rd ec85f99ac41fb93b15c98fbb81791306 *man/stri_extract.Rd 86d4f8c26a36e9e62437c3ebc4226d77 *man/stri_extract_boundaries.Rd a42cb9f6b9e733ff77a30d5f80fab5ea *man/stri_flatten.Rd 8b8c786f646623ce465c3f68a16a543a *man/stri_info.Rd bd47458781c7368b236c63551277291f *man/stri_isempty.Rd 36c34f25eba4fc6e36bca3519d72010f *man/stri_join.Rd 6879e164dbace74170a0eca74b7ef495 *man/stri_join_list.Rd 0c11e6c3d21c154c076b01a42fd2ffb9 
*man/stri_length.Rd 1fc9ab81a1adc8b0a5513625f4765900 *man/stri_list2matrix.Rd 3ce00aa1511217857a1272822916aad7 *man/stri_locale_info.Rd 9554cf49bd74615b59d06dae8f0a4bd4 *man/stri_locale_list.Rd 66d17af3574b16281f3b0e6539c5f8cf *man/stri_locale_set.Rd d2e839e43f0b35542c27f094ca0a565d *man/stri_locate.Rd 0a1911c2411b0df9f802cc3ee337c14f *man/stri_locate_boundaries.Rd 89e230567b24a55c42a4b5614913d257 *man/stri_match.Rd 4d41393c9df2f66ebe8dfdc8dbb7cd86 *man/stri_na2empty.Rd 38d4763eaccee4cca6b016d8f42f3a21 *man/stri_numbytes.Rd bbc11a11ab42c552aa2021cee7eb9ec9 *man/stri_opts_brkiter.Rd fca73c9f559a4ed799dc04f59095eb51 *man/stri_opts_collator.Rd e34a2aeb898b34837ea72f5415c04338 *man/stri_opts_fixed.Rd 1d61606edd40e0e9a315b7c88a39f691 *man/stri_opts_regex.Rd f775ccf0ee34e0040447a35e506fcd37 *man/stri_order.Rd 7081ba7863589767e062d5b198df2a19 *man/stri_pad.Rd 98874345b46429d6e7d7c7851183ffb4 *man/stri_rand_lipsum.Rd 86a5426e03c808b1f38275505251370c *man/stri_rand_shuffle.Rd 3ef378b335ff09d5d3c54f8785d16464 *man/stri_rand_strings.Rd 5b0939177b1a8fe25bf72c3eff54afa5 *man/stri_rank.Rd 2355b24ebb992222f5dd5639941575c0 *man/stri_read_lines.Rd 6cf8d9fbf46427d4190ef1e8926db066 *man/stri_read_raw.Rd 739abfe8d7a3e8fa59a661649dd1de57 *man/stri_remove_empty.Rd 8505905d5d0f6fd705211c90a5ed1bc3 *man/stri_replace.Rd 6fe4bf6442b3117a9828a6536989add2 *man/stri_replace_na.Rd acbd49ed0f7c14780001f2deb734b7f5 *man/stri_replace_rstr.Rd 2779cd55fbd28cdf7a1057eb5e166176 *man/stri_reverse.Rd 4f5b80e5ed84c616229451ffb7c0c981 *man/stri_sort.Rd 31f7dcf193bf440a6597daced93f555d *man/stri_sort_key.Rd 3481f15d7fdd574c80892c0e7896a6b9 *man/stri_split.Rd ce8aa31b3021a3adf8a8e74d8ffad042 *man/stri_split_boundaries.Rd 0ef215f89103cadb224636f972862695 *man/stri_split_lines.Rd b519bc5d01579fef7396ca746ae7cd5e *man/stri_sprintf.Rd 4d2b7488aa8f69a3b97fbb520017a60a *man/stri_startsendswith.Rd ff4e32a409b836191c65cfb83cbdedf8 *man/stri_stats_general.Rd f43f3590a07eba0c4457551e8a38ecbf *man/stri_stats_latex.Rd 
e29c03d4df8979754f7e4a12458fea57 *man/stri_sub.Rd 18957f87ce34a7ed312fac3cdb6c497c *man/stri_sub_all.Rd 4347c5de9c076eb8289ff6da1a366a66 *man/stri_subset.Rd 93276498808607079fa908268fedffc4 *man/stri_timezone_info.Rd b2b6541f69bcc7813fa24f4dbe5d757a *man/stri_timezone_list.Rd 4b821559050165e62aec074304231247 *man/stri_timezone_set.Rd abfe9b51d269dcfd3030f4731513c0b1 *man/stri_trans_casemap.Rd ae2e432a4593322fc458e90000cf6290 *man/stri_trans_char.Rd d6f92f2a49e46d2f9be7a25450de21cd *man/stri_trans_general.Rd 1580f614404500b90320f4091214d9c0 *man/stri_trans_list.Rd b6bc64dab3bdaf4c09cdcd79accf05cd *man/stri_trans_nf.Rd 45d5b3bfdc138615c0169d902b2263c4 *man/stri_trim.Rd 0d8f35b569539fcc95bd12bfef50b643 *man/stri_unescape_unicode.Rd 1b175ce46b12eb8655e23bae91899cc0 *man/stri_unique.Rd 21d0d8d4a922e82733b599963e939e05 *man/stri_width.Rd 71ed88e3597f646c586ebf2e033f4003 *man/stri_wrap.Rd 49ebee3c32f7fccc0983a04a6980206a *man/stri_write_lines.Rd e766e5a948beba086527537587e85058 *man/stringi-package.Rd 027586a16c6ea87f2fb09ecbaee5d132 *src/Makevars.in 285cf51dcc3b31e1866742ff8f552ac8 *src/Makevars.win a47d0b87576c49d306558f276511d5d6 *src/icu74/LICENSE 5f7b2dd77c1660634264fc3210664da9 *src/icu74/common/appendable.cpp cee4b1e5cd196870283f2d9dd856a3d7 *src/icu74/common/bmpset.cpp e1bd25b7cb9b8bd18323925af7be0b29 *src/icu74/common/bmpset.h 07e65db7a78f0c126244a1f1596f1519 *src/icu74/common/brkeng.cpp e4482ff43625cee25d1709462b6bb066 *src/icu74/common/brkeng.h aa7938e9a0e00426fdaffc50a317384c *src/icu74/common/brkiter.cpp 7f218959d375deb386b8487723eb4ca2 *src/icu74/common/bytesinkutil.cpp 63ef8f0eb4ee82dc37286d6f5bec5510 *src/icu74/common/bytesinkutil.h 1ffa481b943f6d2d16df024b15321dd2 *src/icu74/common/bytestream.cpp 4fec3a2364e293bfed68d2059e96fa60 *src/icu74/common/bytestrie.cpp 9d34bb7529e5e85db3846149f72e74fb *src/icu74/common/bytestriebuilder.cpp eec840cb56dd0175ebe0bad6e6624cf2 *src/icu74/common/bytestrieiterator.cpp 5b4f1ae8af9658c3cff0fb411a0bbe73 
*src/icu74/common/caniter.cpp 26ce62cdc123a6aa954d0825b4656fcf *src/icu74/common/capi_helper.h 0491d663d61b316c636c20eb14efb339 *src/icu74/common/characterproperties.cpp 7a06610ee4e501d7a6a43a3e392f2eb9 *src/icu74/common/chariter.cpp 5958b5a0b86c9bb79097afbbdc07d658 *src/icu74/common/charstr.cpp 1c5820d511ec39d2ffa24f558754aa21 *src/icu74/common/charstr.h 60539b2a360dd616050daabf0d86fa8f *src/icu74/common/charstrmap.h 56ffd680c0ff86d15828075edf24ff07 *src/icu74/common/cmemory.cpp abb67941c632bced36e6a4ab4e51ff8d *src/icu74/common/cmemory.h f20cfd96a1071bb0d9e405cbb8a9e573 *src/icu74/common/cpputils.h 36a23acf28a001f6d3cbad39f38840a7 *src/icu74/common/cstr.cpp 1734c10cd6f567a344a713fd13b6acb3 *src/icu74/common/cstr.h 607e9b62ea855cf96e148756cae39220 *src/icu74/common/cstring.cpp 2f0d93ae0ec0c9ae5b59fe7d28eee3d9 *src/icu74/common/cstring.h 6796dd6d8e6e96d1a7b151b6b095847e *src/icu74/common/cwchar.cpp d24c0320118915cc2594222412902ce4 *src/icu74/common/cwchar.h c289eb6c66006c1b1d5832ff8ade8587 *src/icu74/common/dictbe.cpp 297753dacbf0974f3ee4b5fda7a03a9a *src/icu74/common/dictbe.h e27712eb92c95bcfa9a77ee703fabe8f *src/icu74/common/dictionarydata.cpp ca105277d120deccaada6f6a5cf17906 *src/icu74/common/dictionarydata.h 4ec2b76aa91e236134e14754aec023fb *src/icu74/common/dtintrv.cpp c4ec0d188412acad9d36846e8b37be29 *src/icu74/common/edits.cpp 25cd354b078228b5f726d2b12058f4d2 *src/icu74/common/emojiprops.cpp 524ea116206b67d476d8f8f66f993a44 *src/icu74/common/emojiprops.h d0599f623f055b96f0fc492bb12df8ec *src/icu74/common/errorcode.cpp d1400cf563a683370d7a4030eac16f9c *src/icu74/common/filteredbrk.cpp b159de30753bfee01022329494b9b3e0 *src/icu74/common/filterednormalizer2.cpp 3a0890ef8285209b449cd874b0537bfd *src/icu74/common/hash.h 6a57bccc095f3b343e71a3aff9e4ce57 *src/icu74/common/icudataver.cpp ff31826df391950cc39bd268e86c78d4 *src/icu74/common/icuplug.cpp 916cbee9d0df4a2cd1ee5a4cbc55e61b *src/icu74/common/icuplugimp.h 3eca5df0d53dbfd755596932ea6e1cb0 
*src/icu74/common/loadednormalizer2impl.cpp 68bd9585b80223c17b781415e82e51de *src/icu74/common/localebuilder.cpp 16d3ed5b2a8543cdbd5a8bc83ff64df0 *src/icu74/common/localefallback_data.h 91994495dd194e5842228248019bfc33 *src/icu74/common/localematcher.cpp 5b490499d7f5df20a9044282146ab305 *src/icu74/common/localeprioritylist.cpp 37d91ee57184d74b4b9aeae1599433f2 *src/icu74/common/localeprioritylist.h 881f1bc0f31c5b02318b4689b0cd4f61 *src/icu74/common/localsvc.h 693e21fec8d49e356e82f1eb998dbd4e *src/icu74/common/locavailable.cpp 238322c53dc81946c5aaced25b28bcf9 *src/icu74/common/locbased.cpp 255709ed4f38a17483c41f4cbddc07df *src/icu74/common/locbased.h 0c7bb71bcab5a65705d1ed27b78ade0c *src/icu74/common/locdispnames.cpp 69ce0b7b169fc846334d531e226989a3 *src/icu74/common/locdistance.cpp 137447c33703b378fa13a55ae8fbc74d *src/icu74/common/locdistance.h 4c5577d47fa0441556e8e2176e082e49 *src/icu74/common/locdspnm.cpp 5d8cf9683bf024ccbfe0debb65141b2c *src/icu74/common/locid.cpp ee64134183f676c92e1642def0561096 *src/icu74/common/loclikely.cpp 4b65b120350b7c2e4ec5c32544ffd75c *src/icu74/common/loclikelysubtags.cpp 1401c5171c58d2477bcee374768d52e2 *src/icu74/common/loclikelysubtags.h 45801605a76cb154d247e483d2d63c65 *src/icu74/common/locmap.cpp b68d350d2e61faa97a8e36e273bf923e *src/icu74/common/locmap.h eac5ed59ffbae769864f17f49ac4e154 *src/icu74/common/locresdata.cpp df168377e8e97e760afed0a8d35d655f *src/icu74/common/locutil.cpp c8de4950b1a1c5f40bb6dea944257653 *src/icu74/common/locutil.h 005c1a13ca32bd917ca0a86bae070f21 *src/icu74/common/lsr.cpp b354c92d6c6e67232cf4bfee6068c501 *src/icu74/common/lsr.h d52cc6823bbeb3a4e87481c68deaec9e *src/icu74/common/lstmbe.cpp 2dc55c269a222366fc75bf8bb8d0d225 *src/icu74/common/lstmbe.h 0c91992ff7075338ce8c7d92f3fd4f52 *src/icu74/common/messageimpl.h 70100f80f605af1bfa322ff674b7a337 *src/icu74/common/messagepattern.cpp 62081a8f902ec09940b6f9ad39c61814 *src/icu74/common/mlbe.cpp f831547533e98363bd2f7089fa3a4cf6 *src/icu74/common/mlbe.h 
2ef217357b14d55d974e0e2e8a58356c *src/icu74/common/msvcres.h 7486043406d4e31cbdfa125880e49138 *src/icu74/common/mutex.h 554a68d0c0e05ab883b6d53fce816afe *src/icu74/common/norm2_nfc_data.h a073cc1261ddf38d44cb44ba75ef6687 *src/icu74/common/norm2allmodes.h a068c7eda830d2110268b0424dee1b69 *src/icu74/common/normalizer2.cpp 51666d9357a06976271a6eff329662c0 *src/icu74/common/normalizer2impl.cpp 97623fe2c8f24cdf5f088460dee4b788 *src/icu74/common/normalizer2impl.h 4e5eeb5dd1e090d79c0e953b9efddcf1 *src/icu74/common/normlzr.cpp 1fa7ed3bf4f2f971e7a8276d486c7685 *src/icu74/common/parsepos.cpp c66c8695e7403a37152270a8af566cdb *src/icu74/common/patternprops.cpp ff6cfc56af708075307c192c001f29da *src/icu74/common/patternprops.h 429c1d63ed0b24581e6c475a8e37f97e *src/icu74/common/pluralmap.cpp 88cb63317c695dc7e8dbacbc449a4d24 *src/icu74/common/pluralmap.h 995698d1a497a8f06163a4a1185492cd *src/icu74/common/propname.cpp a21161247e5be8b68452a4be78d640c5 *src/icu74/common/propname.h 610551e9e8a70ea372b23559115924a9 *src/icu74/common/propname_data.h 8a8ac4a7ad03bef0489d311dcee53499 *src/icu74/common/propsvec.cpp e0145d2f8b4915e53973f706ab11405a *src/icu74/common/propsvec.h e9c3746a1f8b1ccdc45ed6d4309bb899 *src/icu74/common/punycode.cpp ee3c921b794f955e1a685167dc1836b2 *src/icu74/common/punycode.h 474c4d78e8dc9a8be2d25d791606a11b *src/icu74/common/putil.cpp 6c7a1b6fac1d79bc0d6e703b27abb64c *src/icu74/common/putilimp.h 559cecd2a98a2e81212c90fa1764b68d *src/icu74/common/rbbi.cpp 9370cfebe1239b56188afa60a7c4b317 *src/icu74/common/rbbi_cache.cpp 18c56992a825baa680cb115b400afe7d *src/icu74/common/rbbi_cache.h 136c5ea57bc9d7e6de71cf2c23e4f368 *src/icu74/common/rbbicst.pl b7c64f3e0edb7f87fb8084385e7b4605 *src/icu74/common/rbbidata.cpp 331f278428e1f8f20fcf7db93455316b *src/icu74/common/rbbidata.h 20697208ff695cdf3f41aa659cd3816a *src/icu74/common/rbbinode.cpp 2c410fe710ab670b07fc52b9100e9962 *src/icu74/common/rbbinode.h 3d51591297788b33e7a9533ca76c7210 *src/icu74/common/rbbirb.cpp 
ed463a8c4ae7258a4f29ac3324179228 *src/icu74/common/rbbirb.h 142f77394905d409050aa1478735eedc *src/icu74/common/rbbirpt.h 82112a367a22ebe95522954cfdcbbd81 *src/icu74/common/rbbirpt.txt cb7f93de24bb83e5665bb0fa3bde694e *src/icu74/common/rbbiscan.cpp b3d7a14766881f1d391b2b06939ec41d *src/icu74/common/rbbiscan.h 44098e0c661ac118bcfe706cb49ac070 *src/icu74/common/rbbisetb.cpp 808a7c843f02ff3fa60da37e38dff8e9 *src/icu74/common/rbbisetb.h 6f0b0bba0a4dcfc42e87a35bab8efeab *src/icu74/common/rbbistbl.cpp cb6d87d1ebb8e673ba2abe57b4c981fe *src/icu74/common/rbbitblb.cpp 902c7df825c8c4a6605896ab2e95a5d6 *src/icu74/common/rbbitblb.h 0f812b7faa4664c917377d575b7731e1 *src/icu74/common/resbund.cpp d97aaf4cb79da04595321bf98df17d2a *src/icu74/common/resbund_cnv.cpp f30a51b3fc83fa6cdc96f719d0a0063c *src/icu74/common/resource.cpp 9126edf455c55252d0e485523050b9b0 *src/icu74/common/resource.h 80b36a32f7813f145b2bfa38c596243d *src/icu74/common/restrace.cpp 0435c3e8d88f3db1494e923631efc090 *src/icu74/common/restrace.h 901180f194d5b750d0f50c584fccaaba *src/icu74/common/ruleiter.cpp 61b5f69b262730fd0624701400528a49 *src/icu74/common/ruleiter.h 82d7d609b1b1d1b6e4924e0361d91fe4 *src/icu74/common/schriter.cpp 8bffbea5c51a8612ef9eb2ce68e9c1ed *src/icu74/common/serv.cpp 4e8b05d89e357e4e2b97a5417b02c0e6 *src/icu74/common/serv.h 7450c98221dd7d4a7b8e0bc18dad50ef *src/icu74/common/servlk.cpp 13295e5a0082aa3bc89dd161d9469888 *src/icu74/common/servlkf.cpp ccfa74aaa75e0147f9f40c581af2e363 *src/icu74/common/servloc.h e8df4698c390c6d2b3cae3d0afe10298 *src/icu74/common/servls.cpp 91d4e00af31b1cab71ef0922bc5a3289 *src/icu74/common/servnotf.cpp 89202af45b72be975f4116ec007d48b6 *src/icu74/common/servnotf.h 871c3011fe2308ae9ebd9cc07cf46588 *src/icu74/common/servrbf.cpp 0b77601f454ca348191969d9981d3778 *src/icu74/common/servslkf.cpp 541134d3e932630c85e9a63397cfbc59 *src/icu74/common/sharedobject.cpp a439482b2eedd9c133a31f47ba459ec1 *src/icu74/common/sharedobject.h 0cde3c11d51b3f4450488f0523f93a89 
*src/icu74/common/simpleformatter.cpp 1c888a5db9bd5bcfd222ed9cc95068bb *src/icu74/common/sprpimpl.h 3719470a87c813e264a588708d94733e *src/icu74/common/static_unicode_sets.cpp 3bd4fcbbe261de5ca4477b845baedc5e *src/icu74/common/static_unicode_sets.h cad826cf0f88dab6580ba7af11dfff4f *src/icu74/common/stringpiece.cpp 3dd287b973225884eb60f94e4c63ef97 *src/icu74/common/stringtriebuilder.cpp fcf861c4c72243de74d43b32627af0e3 *src/icu74/common/uarrsort.cpp 99e8141442c17ed5835e3c67139ecfa5 *src/icu74/common/uarrsort.h ff4398d3757794bfb89b58260f73bf95 *src/icu74/common/uassert.h 56208a271cce741f2767b93340b79a75 *src/icu74/common/ubidi.cpp fe5f27573e38963032654253ff568955 *src/icu74/common/ubidi_props.cpp 006bb181ca0c004ed2285054938fdd3f *src/icu74/common/ubidi_props.h c60d0881d1e36ca6ba06716990440972 *src/icu74/common/ubidi_props_data.h 87ddd2f659109bf35f4e59dff039119b *src/icu74/common/ubidiimp.h d89d8b3468d2f7482d0963fd0718604c *src/icu74/common/ubidiln.cpp 1fcd7f39b31f837af9ee734a71d4139d *src/icu74/common/ubiditransform.cpp 350e8dccda3db31ad4e26cb3b0f1dfd3 *src/icu74/common/ubidiwrt.cpp d493356e30c2852ab898096290909c1e *src/icu74/common/ubrk.cpp ac84d19db39d5cba7e99e302dfd15ce0 *src/icu74/common/ubrkimpl.h 52270f84c6d49df8865894204688812a *src/icu74/common/ucase.cpp 5be4dbce2e0be55fcf2ce20468ff22d5 *src/icu74/common/ucase.h 62b2d6e6795298c6877d31ba2b9bdb92 *src/icu74/common/ucase_props_data.h 298fad2cef464d72ddbb603a53ca9624 *src/icu74/common/ucasemap.cpp decf402e008f58401753fb992fc27b19 *src/icu74/common/ucasemap_imp.h 2ff8413a6860ba42d83d860a45460797 *src/icu74/common/ucasemap_titlecase_brkiter.cpp 12274b70dddfd552794d3abc19de4fda *src/icu74/common/ucat.cpp e96cb0eb83377cb095c6c0e4f40fff46 *src/icu74/common/uchar.cpp 123a117756cd94fcd34b02f6ec550bbb *src/icu74/common/uchar_props_data.h 8bc853685c9bba2299bed2f8d52f1c7e *src/icu74/common/ucharstrie.cpp e0db9125b7cb2142da73a2cd8da8c268 *src/icu74/common/ucharstriebuilder.cpp 64ad5b834cfc8d85bc804cadab2fc814 
*src/icu74/common/ucharstrieiterator.cpp 3f9063c504d404cbc342b62b5293e94c *src/icu74/common/uchriter.cpp 0124c53dbcd37f4d6c803d3fafd675c9 *src/icu74/common/ucln.h b372e93b9c9b1e408e9196dda1283da2 *src/icu74/common/ucln_cmn.cpp d2571e9749dedffc01e77749c09f054a *src/icu74/common/ucln_cmn.h 66e78aa2e4c799a2f161c316cb3cd6ad *src/icu74/common/ucln_imp.h 60b5018b77cf8b59c7c2b6fa4648a2f8 *src/icu74/common/ucmndata.cpp a0dbc662c402fa1c298087debf7b7991 *src/icu74/common/ucmndata.h b4d24b159f8f580116117deee6a95c4c *src/icu74/common/ucnv.cpp f2904e74ea2c17541817db6489160a40 *src/icu74/common/ucnv2022.cpp 6114879505c54e096348aa7b6c423ef1 *src/icu74/common/ucnv_bld.cpp b816c3ac1b08085d6b364ebb3ed5b401 *src/icu74/common/ucnv_bld.h f2b205e7d0fb6baaaff2895fda3db13c *src/icu74/common/ucnv_cb.cpp 23d5509e268ff0a5a0ff882379a95e7e *src/icu74/common/ucnv_cnv.cpp 1432a6fb6bc3d24aa3e772bc591f7e5d *src/icu74/common/ucnv_cnv.h 3e3cdadde05fb6d47af9bf9e329a148a *src/icu74/common/ucnv_ct.cpp 13786e2af332d44bd8ddce63dd89fa35 *src/icu74/common/ucnv_err.cpp 9ef771a5b60cdce37f7f266ff3645d6f *src/icu74/common/ucnv_ext.cpp db686c2369b7d80e71ed7eb3c19db497 *src/icu74/common/ucnv_ext.h 1752788dd8c7cfe061c392dc9b954082 *src/icu74/common/ucnv_imp.h 98e41e8c09680f3f03f3d835273c6852 *src/icu74/common/ucnv_io.cpp 74f38751ccd295e8c3dedd97e40581d5 *src/icu74/common/ucnv_io.h f537aba247d9dbd5d9ac9a1c71f7b390 *src/icu74/common/ucnv_lmb.cpp 289fd90c341ff15ba04a4727ea0aa4c9 *src/icu74/common/ucnv_set.cpp b3f9864491df17d013605950e721aba8 *src/icu74/common/ucnv_u16.cpp 0b3cbf386e2509129edb1457e13a5e29 *src/icu74/common/ucnv_u32.cpp 86f8d4b8a3a8aba1173b24e6bce190de *src/icu74/common/ucnv_u7.cpp 0e2f2891f99c435600546ccf62675f75 *src/icu74/common/ucnv_u8.cpp 7e0f2e0074c2fee739eeb3e671d68690 *src/icu74/common/ucnvbocu.cpp 29a32eef3fa52485a783a0e3f2da2046 *src/icu74/common/ucnvdisp.cpp 5d4f87eca974ab59e8e03d7ad73cbd0d *src/icu74/common/ucnvhz.cpp 1433efa24ba41931829b2b4e38436155 *src/icu74/common/ucnvisci.cpp 
989ed98c16f43c9ee1cc48f0937deacb *src/icu74/common/ucnvlat1.cpp e475913e7df724be25e100db5ebc5327 *src/icu74/common/ucnvmbcs.cpp 18542ba892d1625c6bbb65d6ddb050de *src/icu74/common/ucnvmbcs.h a9bcb8ef179f5beb5e6695cbd457f720 *src/icu74/common/ucnvscsu.cpp 7ba8dba81144e8b66f3bba06f4b1b568 *src/icu74/common/ucnvsel.cpp a97aee35a90370bf8f1bd5965a65f0d3 *src/icu74/common/ucol_data.h 569dafd910cec82341fb40285ef8d2cd *src/icu74/common/ucol_swp.cpp 9459520e8741fac35b9b8b9e93f903c1 *src/icu74/common/ucol_swp.h d4607ab10e38123c75fd3b38997bfcdf *src/icu74/common/ucptrie.cpp a0a26718e2387cc430eb3f3d9eecd761 *src/icu74/common/ucptrie_impl.h 29062aba5e1ab128b99c26f9e3851492 *src/icu74/common/ucurr.cpp 5ac72fa08b0586c3b1c3c056263f82be *src/icu74/common/ucurrimp.h e95ebeb1b58ca8b12afb9023b6c9ba51 *src/icu74/common/udata.cpp 0355eedf4c6ee082025c36aeef852567 *src/icu74/common/udatamem.cpp 0be15feab9f692d0fec7c6ec7b964c55 *src/icu74/common/udatamem.h 4e7db48495d2dc60291b709ce777bd29 *src/icu74/common/udataswp.cpp 2f05e3cafc57e99c4322c875c201292e *src/icu74/common/udataswp.h 714675c46b3b69d780368d0a9db8b223 *src/icu74/common/uelement.h 61b9cb3bbc082c7778fc0a93311fb0dc *src/icu74/common/uenum.cpp 74ace804f67cb80c3f039ac8cd0f3047 *src/icu74/common/uenumimp.h 06a652e6fb32c8ce6233a072ab4c93ff *src/icu74/common/uhash.cpp 931a33a137f97b9c36a153c3a37f98fd *src/icu74/common/uhash.h 077b3409c8f4ab87b8cf00d92e583b0d *src/icu74/common/uhash_us.cpp 279993de4c5ac5b9d5af75fc48933951 *src/icu74/common/uidna.cpp c5d287e1eb3f2887f566352bd78c91e4 *src/icu74/common/uinit.cpp 324c8cc797a4c179e52f21c3a0f4124d *src/icu74/common/uinvchar.cpp a1a13f5b5931115ebc6724fbee4a7e30 *src/icu74/common/uinvchar.h a81273f16325087549e8ab384fade5f4 *src/icu74/common/uiter.cpp 0f3bddb3773a839160f2c9fa93392ccc *src/icu74/common/ulayout_props.h 3b3b384819214ea5682221067afc0ae0 *src/icu74/common/ulist.cpp cea9a4d84c3e28d7fa01b1278a02c43c *src/icu74/common/ulist.h 0b9b4f66e78e18733f1d8afe41e11eed *src/icu74/common/uloc.cpp 
5c2050809eb2409c849a477b2ecc6476 *src/icu74/common/uloc_keytype.cpp d7349e6b49f93b6135272e45b552236f *src/icu74/common/uloc_tag.cpp fb7d2a57b86ff84cbe02c0cab6b8cfed *src/icu74/common/ulocale.cpp 0ea6e309a46f132d1d51eadaca1c80f3 *src/icu74/common/ulocbuilder.cpp 71a3208f9320cbf0f7a470e5c4f7419b *src/icu74/common/ulocimp.h adb9d86dfd35396f80f0ca1377eb94f6 *src/icu74/common/umapfile.cpp 9f8498d136635f8d3856d29c5ce20dac *src/icu74/common/umapfile.h eef946c7c54e8f07f5550d4ffe985d29 *src/icu74/common/umath.cpp 6199ba0af7567d62f30ddc83e1baf1ef *src/icu74/common/umutablecptrie.cpp 93124eeb19731ca786f45e6fb81d4d42 *src/icu74/common/umutex.cpp 65a7d296eb467b3bafbf5854a2889180 *src/icu74/common/umutex.h 50cf5c7ab2234b2b5f207116c73e165c *src/icu74/common/unames.cpp 2c738fd96b4efc238076a6801af4252f *src/icu74/common/unifiedcache.cpp 2fd54fd002a9d78ad391539263219e52 *src/icu74/common/unifiedcache.h 16b15efb1af1fa9a625cfc6a1b82b6d8 *src/icu74/common/unifilt.cpp ea6b8680c985829dfe4fbe9a46c8ca76 *src/icu74/common/unifunct.cpp d7aa4a918a9cfc6c79c312c7e29b3af4 *src/icu74/common/uniquecharstr.h 0bccbe7880448075e03394c286ab7d61 *src/icu74/common/uniset.cpp 867bd0b3fef0b5af2f82ac4cdf34bc2e *src/icu74/common/uniset_closure.cpp 555a10ca866f036084a58596a07a9e6b *src/icu74/common/uniset_props.cpp aa75d79e23b9af6a75f8ca18332ca9a0 *src/icu74/common/unisetspan.cpp ea561c789d88fe9a7a230cf27b4935e3 *src/icu74/common/unisetspan.h ed434247049f25143e2061b81beb4b22 *src/icu74/common/unistr.cpp b534afa81a8fa7ec54fd662c77b8f9c9 *src/icu74/common/unistr_case.cpp ea66a38226c38b16d1fc865fb7808808 *src/icu74/common/unistr_case_locale.cpp 6804634537f1349d5ddc36c5d0949509 *src/icu74/common/unistr_cnv.cpp 177cba781160690d63baeb60a8c9d15c *src/icu74/common/unistr_props.cpp a1ed95f6af8bd03124dea20ef8a0c821 *src/icu74/common/unistr_titlecase_brkiter.cpp 81845d9908e5228dc19ff5b53df25312 *src/icu74/common/unistrappender.h 863a3a81ddedcbdf621be35654b01d01 *src/icu74/common/unorm.cpp 
e94285566619c1869c64c09562eb10c4 *src/icu74/common/unormcmp.cpp fcc9fb21f7f7e4a86a5c4db6659d07d6 *src/icu74/common/unormimp.h 8c3f26303a50cfa26d38d16592daacb5 *src/icu74/common/uobject.cpp 98a842dc115c9550da1aa39bd18a401c *src/icu74/common/uposixdefs.h 15b7c3c62e8a269905ea22d0cdabf5b5 *src/icu74/common/uprops.cpp 62aa03d20a4677bb4d36cf0fc940e59d *src/icu74/common/uprops.h 649daca03a703e878bae9e14feff7ccd *src/icu74/common/ures_cnv.cpp c5a1cca06ddb8c74ffda8a06e0bffa5e *src/icu74/common/uresbund.cpp 7d4612dbf7194cb28a64c77b6ec34bdc *src/icu74/common/uresdata.cpp 7a8e310955d46553a7c5bba8c75b51e6 *src/icu74/common/uresdata.h 68328ca14ae0da00afac8171550f97ae *src/icu74/common/uresimp.h 9d1c3398397fcc0c380b549493ac5de8 *src/icu74/common/ureslocs.h d4721f0e0c6854642777ec23d821c47c *src/icu74/common/usc_impl.cpp d68c64e47b1338f2ea06ab0d8f3ac9fc *src/icu74/common/usc_impl.h bfa918431674b85a0308359c7dda5d1d *src/icu74/common/uscript.cpp dcd63bee366aec3f0b00d40c9d24919a *src/icu74/common/uscript_props.cpp 115ff79c65cf4120b20981d3d1bf5d64 *src/icu74/common/uset.cpp bf39d3dc51e26aedf66102a9da9e70f3 *src/icu74/common/uset_imp.h 4515d7583242d973f0202f23d72c6c28 *src/icu74/common/uset_props.cpp c0ed5e972b9cf6998accb9e3ccbdf1e9 *src/icu74/common/usetiter.cpp 7899152e7011492967c14bf7be442ca4 *src/icu74/common/ushape.cpp 5de9494db31ca5476b57bf6c96d10eff *src/icu74/common/usprep.cpp 3f88d88d8345363312d3bb04e9c2e1ba *src/icu74/common/ustack.cpp 0703aabdb61bb50553961d8dbb6c2513 *src/icu74/common/ustr_cnv.cpp 3ec96414f9013aa39acdc09425abb09a *src/icu74/common/ustr_cnv.h aa91bdcb60b1419b3cbeb9a1faf17d6b *src/icu74/common/ustr_imp.h 6df49081281891caa5364bde0efbb30a *src/icu74/common/ustr_titlecase_brkiter.cpp accff6321a811a61ec07bf93c5978d9d *src/icu74/common/ustr_wcs.cpp aecdb259a931f7e3aac3722cd9bd0989 *src/icu74/common/ustrcase.cpp ebe2ae679ee8ec80a45ed3e4e09d790d *src/icu74/common/ustrcase_locale.cpp 091136e3a83e682d93f4ab6b29d9d01c *src/icu74/common/ustrenum.cpp 
6ac428072da3aa7501778af75d112540 *src/icu74/common/ustrenum.h ec1be0200d6d6e618217694b884fc066 *src/icu74/common/ustrfmt.cpp 8c58fa99afa9a9886e857c080d244142 *src/icu74/common/ustrfmt.h 6341507b1c09d281d8152899b4f5139a *src/icu74/common/ustring.cpp 363dca1fe175c19c7c2d2433c6afa085 *src/icu74/common/ustrtrns.cpp ad7529b9295fd2991fce3b9fa6804648 *src/icu74/common/utext.cpp 41311318d1bae1c5b005d8603694477d *src/icu74/common/utf_impl.cpp b0e269c6fe574e497a1547cc09edb199 *src/icu74/common/util.cpp 883d51baa498b48b2b0ec563b117b6ee *src/icu74/common/util.h 8f1393cb34ecbaf48f3f450f61b457a2 *src/icu74/common/util_props.cpp 93334e52979972e6d51941ef1bc3354a *src/icu74/common/utrace.cpp 4749c91038039c73de769530b20325d7 *src/icu74/common/utracimp.h 41b6d141af0cd6d05a29b7e622e36439 *src/icu74/common/utrie.cpp 1f88ef78e5a55b0e29e4aae66acb99da *src/icu74/common/utrie.h d262b4ce4de01dbec87513adfca7b7dd *src/icu74/common/utrie2.cpp f1f2817dfd7da87969a091fc8f97360d *src/icu74/common/utrie2.h fa83f7e77aaea7b8da9aa8af8233d0cc *src/icu74/common/utrie2_builder.cpp 96d79794bd37e8a23ab9103b3b55c43a *src/icu74/common/utrie2_impl.h da43b5ddee2c00c774aa83fee9583aa9 *src/icu74/common/utrie_swap.cpp 266c6d2f85a12face00e97d2302f3c36 *src/icu74/common/uts46.cpp e7124ec489a49cffc44f1b3ce723148a *src/icu74/common/utypeinfo.h dbdda139438671d513593aff4a4bcf58 *src/icu74/common/utypes.cpp 48e7bed955f2c38c97d8b9d84a376afa *src/icu74/common/uvector.cpp 726d69a75555480c8b85cc0669089575 *src/icu74/common/uvector.h ef978bdffd5bc3bd21b74d9dd5240e58 *src/icu74/common/uvectr32.cpp 7edd1f2067a208d02bca1428282bd69f *src/icu74/common/uvectr32.h a285a33bdbad4322752f02d05fc6d6b5 *src/icu74/common/uvectr64.cpp 7f960fc52f03556f403798561fa82110 *src/icu74/common/uvectr64.h 99d996d42e83e09075c00302cb8f1a97 *src/icu74/common/wintz.cpp cc3e99d547dc4b10f1cc752f8ad7ba5d *src/icu74/common/wintz.h 08dc3852df8fffa807301902ad899ff8 *src/icu74/data/LICENSE 20b12254aa9f02c707612fb62e4a43d2 *src/icu74/data/SOURCE 
23c26661d62277a88e8ba9d66a88beff *src/icu74/data/icudt74b.dat.md5sum f7f3e7988145676c45c51bf56e63aea9 *src/icu74/data/icudt74b.dat.sha256sum d7d69305b5d66fcc60d41e886c480155 *src/icu74/data/icudt74l.dat.md5sum 1541e1efb55443bcd38b349cd88be438 *src/icu74/data/icudt74l.dat.sha256sum 026bc4732611c718d3c08c4e9f5f0656 *src/icu74/data/icudt74l.dat.xz 90e21194f73907fe9dd68ca8491cbea6 *src/icu74/i18n/alphaindex.cpp e32f21a804407f225e48457b627aa5bd *src/icu74/i18n/anytrans.cpp d3b7c90f6774e62c7df50954adcc02ee *src/icu74/i18n/anytrans.h 5bc87c0e78d2bb05ee43a12266826700 *src/icu74/i18n/astro.cpp 0f4003d987030dee7d1ab87512943062 *src/icu74/i18n/astro.h 5824ff6d5523118052916abada83fd39 *src/icu74/i18n/basictz.cpp 806d4c338daeff64951e4cb8845fdae8 *src/icu74/i18n/bocsu.cpp 591813bad634eef918147e3b364739a8 *src/icu74/i18n/bocsu.h e94c43140a7b9a8e9f168212083522e0 *src/icu74/i18n/brktrans.cpp b23a22e8160fb79f482ae9a29acb8291 *src/icu74/i18n/brktrans.h 6f14e333e4bc190853340a29f4ca6fc1 *src/icu74/i18n/buddhcal.cpp 85a99c65e95ef8d14f2e38529b6c9c3c *src/icu74/i18n/buddhcal.h 12c00296cbb2af3f057ae260717759ec *src/icu74/i18n/calendar.cpp 1cc4e3449a658f09ed5e92316ba7dd74 *src/icu74/i18n/casetrn.cpp e5ef110faa03e2e92d7ee630739e1bf2 *src/icu74/i18n/casetrn.h 36242f4845f0befec3703d5412b33045 *src/icu74/i18n/cecal.cpp 8229d6a554ed480f8efb4a650ff13b8d *src/icu74/i18n/cecal.h 103e1c08ff6966618a4a29896850bbdb *src/icu74/i18n/chnsecal.cpp bc0c02f9a43d76cd3587d06169271257 *src/icu74/i18n/chnsecal.h 63f89834ed34b350e4dac944f8a68ba2 *src/icu74/i18n/choicfmt.cpp 28e5c20e350c5f34960a3a445d93f9eb *src/icu74/i18n/coleitr.cpp ae9988a8fce998084438be92dc6c2796 *src/icu74/i18n/coll.cpp 1bb75bd8d7cd47cea136cef61248af34 *src/icu74/i18n/collation.cpp 4025d092de9e3ba6674535c3b3c7831f *src/icu74/i18n/collation.h d1ea4a5b241439ce59d033fb943fbd51 *src/icu74/i18n/collationbuilder.cpp 6d7277e221708a5df3842499bf0914b6 *src/icu74/i18n/collationbuilder.h bad57ed50e7cd389b76ee86944cab0c8 
*src/icu74/i18n/collationcompare.cpp a43cbeda92cf72ce2a8ab9172210bbb2 *src/icu74/i18n/collationcompare.h 16d29d356c9510f81f3e1aa58982303c *src/icu74/i18n/collationdata.cpp e7157732039ee90675f7f3b1cd34b982 *src/icu74/i18n/collationdata.h de8c1f2be3cdedd2fc02137a7137cc32 *src/icu74/i18n/collationdatabuilder.cpp 65399521eea51d7fbb9f39be3493aa05 *src/icu74/i18n/collationdatabuilder.h 01c145cdd697165def20af8dfdda508d *src/icu74/i18n/collationdatareader.cpp 0a59dd4d3cb862b2be6646a5092798de *src/icu74/i18n/collationdatareader.h ef410495845581e2e667da1c90ab188c *src/icu74/i18n/collationdatawriter.cpp 13077c8cee15408ed56f058f68c2b8cf *src/icu74/i18n/collationdatawriter.h fdb098bc522e1e14dd35c10c054fb0e4 *src/icu74/i18n/collationfastlatin.cpp 78504885e2a9ab29722975ad680f02f6 *src/icu74/i18n/collationfastlatin.h df88b6af447db5370287178a53556d16 *src/icu74/i18n/collationfastlatinbuilder.cpp 22c30cf23eaa3a3a73473858271d10b7 *src/icu74/i18n/collationfastlatinbuilder.h 85c13c084ebe24944141c0231715dd1b *src/icu74/i18n/collationfcd.cpp d16a4007fab369418b61224c30c54c78 *src/icu74/i18n/collationfcd.h 1afeb7d364a996e78fba7aa53011666f *src/icu74/i18n/collationiterator.cpp 8518d377a990da3d895eeb727ca70025 *src/icu74/i18n/collationiterator.h 74659b67f2a7ab819b3b6a976c4175ec *src/icu74/i18n/collationkeys.cpp f45d7a0f68cfdd4a39cda25458399d1d *src/icu74/i18n/collationkeys.h 63559edac2d7833448b7c2b953613429 *src/icu74/i18n/collationroot.cpp 9c74abc9d4b1920f071b47b5c22d2fc5 *src/icu74/i18n/collationroot.h 51a4ccbdb111177a507a64503218bfaa *src/icu74/i18n/collationrootelements.cpp f29d66fb901acc36ce4c14d4e1fb4e90 *src/icu74/i18n/collationrootelements.h d0b1b8b51fa09d8653ceb56bebb21826 *src/icu74/i18n/collationruleparser.cpp 5283e6ac91c37ad653b05f3abe29fafc *src/icu74/i18n/collationruleparser.h c4144f8ef123be110daec6094cd077e7 *src/icu74/i18n/collationsets.cpp 759356c7452d1f5853a156530e39fc4a *src/icu74/i18n/collationsets.h 530a877d6138845f8f8639b2a4101c9e *src/icu74/i18n/collationsettings.cpp 
21308670179dfe7ab328ff7b43351ff2 *src/icu74/i18n/collationsettings.h 79b9f7940d77c685b7c938115c9bd4c5 *src/icu74/i18n/collationtailoring.cpp 33c82d5d597c689869ee4e0aef410b80 *src/icu74/i18n/collationtailoring.h 76b8b3b582558850172cd5f9d4fdc401 *src/icu74/i18n/collationweights.cpp 5071775b20b1fc4ade749e30d0481def *src/icu74/i18n/collationweights.h 129fa549890d6dc37c6a6b74faa3e410 *src/icu74/i18n/collunsafe.h 5582e860be31f68194b09671fb2da7cb *src/icu74/i18n/compactdecimalformat.cpp 3afaa912779586044b420924b4a26073 *src/icu74/i18n/coptccal.cpp 384d95da4e51d0193f14741fbab4924c *src/icu74/i18n/coptccal.h 83e08f371f33985f15a539fb50401f33 *src/icu74/i18n/cpdtrans.cpp e3b3df8c57ffc76602d35b0cc67460fd *src/icu74/i18n/cpdtrans.h bb8b75b75fc8d7172acefcf0a9c63258 *src/icu74/i18n/csdetect.cpp 7b144abcac10c60e48c297021c6c7e75 *src/icu74/i18n/csdetect.h 0864138c06bc800222587aa582b30589 *src/icu74/i18n/csmatch.cpp be862da6c6977982550a1853bf0f9c60 *src/icu74/i18n/csmatch.h 80cbffce65c364b000a10f32b866b35f *src/icu74/i18n/csr2022.cpp b0fac98602d38925ba2c0ac6df7f3258 *src/icu74/i18n/csr2022.h 9053d1c4d19917e6e0d4c90cf01bbedf *src/icu74/i18n/csrecog.cpp e17d8be1fa152b8526289e636968af82 *src/icu74/i18n/csrecog.h 2855995fabbd4f71478191ca8dedf367 *src/icu74/i18n/csrmbcs.cpp 3cb8aa0d3e5cc644a7b809de76072a74 *src/icu74/i18n/csrmbcs.h b5137c6b3ea2363a182d4c1551dc4002 *src/icu74/i18n/csrsbcs.cpp ebf0a695b0a563d3c0e5f69b8df212b0 *src/icu74/i18n/csrsbcs.h 3616d37a7ae3e3d1417647e4930476c4 *src/icu74/i18n/csrucode.cpp 8c41bd77f0c3996717b82b3d0ec5b99a *src/icu74/i18n/csrucode.h 2c9021ddba7947dc66433bcf6ec0dc84 *src/icu74/i18n/csrutf8.cpp b73f9d41c490a8fe7e056d7026ac89a2 *src/icu74/i18n/csrutf8.h 84ff569a38fe50aba9cf07b129092613 *src/icu74/i18n/curramt.cpp 7478df25773814b0a56abcea635e4891 *src/icu74/i18n/currfmt.cpp 577345a3357e4a78d40bc40361d09a9a *src/icu74/i18n/currfmt.h f967aaa849b74ef5986b0bb767558d40 *src/icu74/i18n/currpinf.cpp 31e95f882c4450eb396f12924933c0a8 *src/icu74/i18n/currunit.cpp 
368d251a8fbf575cdb9ca822da10de30 *src/icu74/i18n/dangical.cpp 41e1c1dbd43193759530c2593900d2da *src/icu74/i18n/dangical.h a9fc4cce77c1960652cf66b07804bdb6 *src/icu74/i18n/datefmt.cpp 1c70836aa19cd6135642471ab190fae1 *src/icu74/i18n/dayperiodrules.cpp bc27ce5314b1c5bf9730af55f64c395e *src/icu74/i18n/dayperiodrules.h bacf1dd3ab7aaff7ed9c3963d0b3f406 *src/icu74/i18n/dcfmtsym.cpp 4119973ef7649a988b9d4c03252b1f5f *src/icu74/i18n/decContext.cpp 596e01fa2e3b5462886bd072a74f27a0 *src/icu74/i18n/decContext.h 70e4b74d496407bd84348c2819813245 *src/icu74/i18n/decNumber.cpp d444cbd31e4b2d88b8b3caa0d95a2bfd *src/icu74/i18n/decNumber.h 9fc72b97fdca6d16bdeb656738b95eb8 *src/icu74/i18n/decNumberLocal.h 870838f64ca29dfa9a9f198ae0ac4756 *src/icu74/i18n/decimfmt.cpp 035b2353b007a601a03e009b57b19eb5 *src/icu74/i18n/displayoptions.cpp 24b821191d3487139be1daec067c544b *src/icu74/i18n/double-conversion-bignum-dtoa.cpp 6c8773691e230a3aecb41af702c14d7a *src/icu74/i18n/double-conversion-bignum-dtoa.h 52720e041e32513b513e7068bebff605 *src/icu74/i18n/double-conversion-bignum.cpp 73fdb4513c2020fac7d68f80806edb71 *src/icu74/i18n/double-conversion-bignum.h e0db70de8e5825d89297704b4cf4341a *src/icu74/i18n/double-conversion-cached-powers.cpp f72851aea7f52fe37f0a12dd8ea671b1 *src/icu74/i18n/double-conversion-cached-powers.h 99b43602fa8d8eca475f790b82f499c7 *src/icu74/i18n/double-conversion-diy-fp.h a6634bbec4d84f8fb6ceebf8aa533ba4 *src/icu74/i18n/double-conversion-double-to-string.cpp 8889eef01703ceff78d54e99d474e461 *src/icu74/i18n/double-conversion-double-to-string.h 288de63c07216687aadcd7251fc5f60e *src/icu74/i18n/double-conversion-fast-dtoa.cpp 6783de252f77cb093ee152e7cd37b3c8 *src/icu74/i18n/double-conversion-fast-dtoa.h 608e4389548310ddbb97c9f3f18fe47a *src/icu74/i18n/double-conversion-ieee.h c944fbe5f906c1627eb05764fbc620cd *src/icu74/i18n/double-conversion-string-to-double.cpp 6d6af209112a216d58e477515a2815df *src/icu74/i18n/double-conversion-string-to-double.h 
f265a11a189ab3b5ef0c36121384de02 *src/icu74/i18n/double-conversion-strtod.cpp 3e521e3256ff699c3f6bbc15c3fa226e *src/icu74/i18n/double-conversion-strtod.h 88f1d85eff6833f156fd39aac882b09c *src/icu74/i18n/double-conversion-utils.h b734b48b27dc655feaff3862676843fe *src/icu74/i18n/double-conversion.h ba5fd1742374a110c6d5bf01f62fd8e5 *src/icu74/i18n/dt_impl.h 0469131ca7becae0ed846fbc95f21f2d *src/icu74/i18n/dtfmtsym.cpp 1e1e17ba31c452f1bb213f00f7674eca *src/icu74/i18n/dtitv_impl.h 3cd89465a31023ffef348ded0518ca14 *src/icu74/i18n/dtitvfmt.cpp 19e4bd6ee24d5ffa8075bfeca2422558 *src/icu74/i18n/dtitvinf.cpp 3e500f44c3f46a146e6293fa0cbd5c72 *src/icu74/i18n/dtptngen.cpp 9185152f4a9321acd6a27c11a67ca08e *src/icu74/i18n/dtptngen_impl.h a0cd1aaa68703cc9c6be6d04f11a8787 *src/icu74/i18n/dtrule.cpp 4397256b1c81850f77d852b3be861a3c *src/icu74/i18n/erarules.cpp 639bcd84862af3eb8ade10444ac55faf *src/icu74/i18n/erarules.h d0c415c91a2f75176572fcfbb1745b28 *src/icu74/i18n/esctrn.cpp 4ba8a25d0724205e700e5d23307004de *src/icu74/i18n/esctrn.h be72dcf9e5d643a21ad9ee46e57c2622 *src/icu74/i18n/ethpccal.cpp 97bda4317c41e08f97d9ccd25eacd628 *src/icu74/i18n/ethpccal.h aa4219597ac27b44a8aa855053a1d7fa *src/icu74/i18n/fmtable.cpp 731733bdc429695c6d64ef168502496b *src/icu74/i18n/fmtable_cnv.cpp 0f9715c961b9532d2a830fd8efa4c661 *src/icu74/i18n/fmtableimp.h 563a66209b0e298fb807df167ea84c5f *src/icu74/i18n/format.cpp 9f7aff74c34e8ec613bdfbce9289fb5f *src/icu74/i18n/formatted_string_builder.cpp 77362fb1f440a64ff77339a989265bdb *src/icu74/i18n/formatted_string_builder.h 03530a1d2c08eb0154390911a0b5e7af *src/icu74/i18n/formattedval_impl.h 320789a691f33068b87385eadab91c02 *src/icu74/i18n/formattedval_iterimpl.cpp 2011e25d52bfc7378eda9f6fa679a203 *src/icu74/i18n/formattedval_sbimpl.cpp e5cb3b64477e2b477ac6aff647b44322 *src/icu74/i18n/formattedvalue.cpp dce4d156e18f26b7867820374d15d23a *src/icu74/i18n/fphdlimp.cpp 1ab3f36c3f6ebc94a3652d7b653bc76c *src/icu74/i18n/fphdlimp.h 24e898a2ff01b9a3974e90fe98adfcbc 
*src/icu74/i18n/fpositer.cpp 4d65fae82030a1425723a3ec60805a4e *src/icu74/i18n/funcrepl.cpp 0992870e3ce1f279db21fb87c7d7676a *src/icu74/i18n/funcrepl.h d66a6bacf6786c638cc63bf83789b4d5 *src/icu74/i18n/gender.cpp de4e55c1c050dac1407cae5522262704 *src/icu74/i18n/gregocal.cpp 592d42fab74507725b80494395d51b30 *src/icu74/i18n/gregoimp.cpp 7616059f7a46284ca0858cb898f4d8fc *src/icu74/i18n/gregoimp.h 540f860f01f830720ebe10b7c0c05518 *src/icu74/i18n/hebrwcal.cpp d0028ad1c46412b265283992cea8ab15 *src/icu74/i18n/hebrwcal.h 792b7e22539bc2eda65f60af719efab1 *src/icu74/i18n/indiancal.cpp e738cddfe152d1d4024c941c6686fa5a *src/icu74/i18n/indiancal.h 9ac8ac553a4d79388f31897e51cec72b *src/icu74/i18n/inputext.cpp ff19ec95809d5b706125a14f83677962 *src/icu74/i18n/inputext.h d12b0c16c20e7359047c556c94dffc8f *src/icu74/i18n/islamcal.cpp 680950aaa4922b07c3a474b22e623fb7 *src/icu74/i18n/islamcal.h fbc22c4d41bd3347c666e7a617504eb5 *src/icu74/i18n/iso8601cal.cpp 54800ee228eff4412792a6460a3c1cb5 *src/icu74/i18n/iso8601cal.h 1c224bf8333020e6df833fa584f87371 *src/icu74/i18n/japancal.cpp 8bf5ffa3c3f1f8fdc4ce75cb9a91e6d4 *src/icu74/i18n/japancal.h ee4f7e72aca491beb219c52020e52eb5 *src/icu74/i18n/listformatter.cpp eb36140ed7e3ee7f3e3060e59cb19a59 *src/icu74/i18n/measfmt.cpp cd04a3c355d755733cfb35b4b63959cb *src/icu74/i18n/measunit.cpp f6c4f6e6a5650b90701e38d45ba26dd9 *src/icu74/i18n/measunit_extra.cpp ae1f98552badffdcf00e5a0080c943f1 *src/icu74/i18n/measunit_impl.h 143b532eebddfa40470faadeb84aae42 *src/icu74/i18n/measure.cpp 7feb155c754826ce654f9617a663dd21 *src/icu74/i18n/msgfmt.cpp 5e91e4b015acd59fc9d8ae0dc0f4ae99 *src/icu74/i18n/msgfmt_impl.h 7bd6f0cba5fc3716ae03d5bd08705a04 *src/icu74/i18n/name2uni.cpp 40e275d9ea0696bdcf90fcba11343226 *src/icu74/i18n/name2uni.h 8408d8e9bf58b80a0339ec3d2f658250 *src/icu74/i18n/nfrlist.h 17b50dd408e7c6cf4d3dc60bd76c56f8 *src/icu74/i18n/nfrs.cpp 5de3444820766dad63c75387b7115ee6 *src/icu74/i18n/nfrs.h 652e60eed1f5536eb1997ad59218ab02 *src/icu74/i18n/nfrule.cpp 
93a6a23ffcebec12ee7af561f0ef7a4f *src/icu74/i18n/nfrule.h 452e9a845585a7553188c4239f55948e *src/icu74/i18n/nfsubs.cpp cfd3232f443cd2bb45bf5862c5447deb *src/icu74/i18n/nfsubs.h ce2cb295abbc04551063bbd66a357841 *src/icu74/i18n/nortrans.cpp 384e35eb53a26b00349d5910ed03454e *src/icu74/i18n/nortrans.h 72f0244cd2441bfc684c23af7aff6ba6 *src/icu74/i18n/nultrans.cpp 899bf063c84788aba1c33c33b2bb4cd4 *src/icu74/i18n/nultrans.h b3792be6321f001a842ccdb9ae71b74a *src/icu74/i18n/number_affixutils.cpp c72d465ef54714d514e3dc0237865eb8 *src/icu74/i18n/number_affixutils.h 952330f617e93f3c3d6d0e17472dff50 *src/icu74/i18n/number_asformat.cpp 6dc3e7134e5099b53fd10c00c29caaac *src/icu74/i18n/number_asformat.h 00af1a56a211b1b6de01ec7d3e6396b1 *src/icu74/i18n/number_capi.cpp 8b217d085335a612c7a2b86bbeeb9aa1 *src/icu74/i18n/number_compact.cpp 0e626374775553966f159142933c6670 *src/icu74/i18n/number_compact.h 429843cb4961b856ff11ae517389785b *src/icu74/i18n/number_currencysymbols.cpp 7c95d2d371f585387fff30ee9344b5c0 *src/icu74/i18n/number_currencysymbols.h e9bafbc925414e87a1b86a6e3acc4bee *src/icu74/i18n/number_decimalquantity.cpp 368dd2ae1ba6b306afb493241996c600 *src/icu74/i18n/number_decimalquantity.h b70dc329d710f421a89b90001b1e4ee5 *src/icu74/i18n/number_decimfmtprops.cpp 634ac5437e87a7bb1046b4f2109bcffa *src/icu74/i18n/number_decimfmtprops.h a7db7c30b63db78cd19e9eeb65762346 *src/icu74/i18n/number_decnum.h 18d192d5509cd0a6e285f3cd4dec5ea2 *src/icu74/i18n/number_fluent.cpp e5bfca31ee5fd6bfb14e89db76129300 *src/icu74/i18n/number_formatimpl.cpp 593f69ecc6537aedeb0286ac709a29e1 *src/icu74/i18n/number_formatimpl.h 9555662382ffaa51329c3acdc679d684 *src/icu74/i18n/number_grouping.cpp 39fa6d9de0ac4256d51c316ddd15b172 *src/icu74/i18n/number_integerwidth.cpp 5fbc81adcd146b9a3ff26cd7ce204a6e *src/icu74/i18n/number_longnames.cpp baa6561cf90e522ff39bd35850b77352 *src/icu74/i18n/number_longnames.h 3c4089b3fa0386ab68a84e718631d964 *src/icu74/i18n/number_mapper.cpp d133751e0412ebcf38138d0850658d46 
*src/icu74/i18n/number_mapper.h 67f4ebfc2e5a4f1d5d904b7c5955e8de *src/icu74/i18n/number_microprops.h 2d7c786159711d7f0d8896e4c919c7e5 *src/icu74/i18n/number_modifiers.cpp bca3359ee4e1b93037f41352a6cc5678 *src/icu74/i18n/number_modifiers.h aef5955b95d5bc6be0dc568f391950b4 *src/icu74/i18n/number_multiplier.cpp faaeeb99e3859c2ce6aded5c8dc52cba *src/icu74/i18n/number_multiplier.h c87b8fe96ede6d9df126f8b9a0c84c02 *src/icu74/i18n/number_notation.cpp 48e8107253d2ecd75d55b6e0ed744e7b *src/icu74/i18n/number_output.cpp c93aedc4b3e9304f3deff852c1865327 *src/icu74/i18n/number_padding.cpp 956893cd53d5b3300132d25221bddf49 *src/icu74/i18n/number_patternmodifier.cpp 9de54ab073b3b39e2f45ccea96613532 *src/icu74/i18n/number_patternmodifier.h eb0c493a9032e9d29a4b640723a141ee *src/icu74/i18n/number_patternstring.cpp e13a3520702fe4e33f4d2a65cb27e39b *src/icu74/i18n/number_patternstring.h 4ad6568b450149b25f14e682327dc83c *src/icu74/i18n/number_rounding.cpp 4e306a4d94822465bb2fc1491a57cb60 *src/icu74/i18n/number_roundingutils.h d7f72c4c2298cd0586d2889aa6f00dd6 *src/icu74/i18n/number_scientific.cpp 9d3e6107887c03532b52a801aa8ae8d8 *src/icu74/i18n/number_scientific.h d453d5b4e031c722a97e4240905145a2 *src/icu74/i18n/number_simple.cpp 0d052d703551f57ef380ae6b98d6d13c *src/icu74/i18n/number_skeletons.cpp 19a995acf95dd14ac94183f136feeba8 *src/icu74/i18n/number_skeletons.h 78aeff8c530099e09caacea791d3d850 *src/icu74/i18n/number_symbolswrapper.cpp 00425da912d4276d8ea39b42fc600a22 *src/icu74/i18n/number_types.h a0a5aa94ec050a1754bc2cf763bba8ce *src/icu74/i18n/number_usageprefs.cpp 12096dbf814e8456ab5169e4b915d58d *src/icu74/i18n/number_usageprefs.h 8181f5b8a6d5919d664a1de345b85854 *src/icu74/i18n/number_utils.cpp 3e6bcb33fa3f0490fd6437ab340a5b7c *src/icu74/i18n/number_utils.h 13bdac003484ba5361cf0e8b5720c6b3 *src/icu74/i18n/number_utypes.h fd4c6bf742a3109f3a075a04693b7289 *src/icu74/i18n/numfmt.cpp 8d596b10370e03e93a8c784337c3892c *src/icu74/i18n/numparse_affixes.cpp 
15dc0ed3e759d48c99a8403290b146f4 *src/icu74/i18n/numparse_affixes.h f760db91281f37ffe7373bfeb210a8cb *src/icu74/i18n/numparse_compositions.cpp 0dd20d09fb18f9e7aaa9f825208986b2 *src/icu74/i18n/numparse_compositions.h 867d8cd0f5705a77d96bd98cb68ab5b9 *src/icu74/i18n/numparse_currency.cpp 1a2c07a632970bdf70e3b8e6fee8fc28 *src/icu74/i18n/numparse_currency.h 0bbe0dde27d1675ffe2e204ca34ddfca *src/icu74/i18n/numparse_decimal.cpp 99bf8f6ac32cdabaddc339f97694b97f *src/icu74/i18n/numparse_decimal.h f99035e8cde5bde79c087ae4b5f165fa *src/icu74/i18n/numparse_impl.cpp f586b5fb7d1e6be8c17b61d7cd9c8f47 *src/icu74/i18n/numparse_impl.h 5aafcc925efa75f442dcacd4d14a623a *src/icu74/i18n/numparse_parsednumber.cpp 2cd10065dca57d2e2fd34c9d9e167983 *src/icu74/i18n/numparse_scientific.cpp c4ab41144b0600f20da5288f432c8651 *src/icu74/i18n/numparse_scientific.h baba2688a4a979e4117b39739b555e01 *src/icu74/i18n/numparse_symbols.cpp 71da71119a0535ac557a65d3e005ccc3 *src/icu74/i18n/numparse_symbols.h 0f6d7be6bc673e6707111ad769f2f520 *src/icu74/i18n/numparse_types.h 1959a7476ef38ca098d6abb6619bd28e *src/icu74/i18n/numparse_utils.h cec4cdfdfa1cce101919b0748bbc8ca9 *src/icu74/i18n/numparse_validators.cpp a40c780a7a2649762d6827c13569a911 *src/icu74/i18n/numparse_validators.h e72ac58b890120161a9a344a8d0c5386 *src/icu74/i18n/numrange_capi.cpp 86f0d63beb6b0f1d8ecb13342a6613f4 *src/icu74/i18n/numrange_fluent.cpp 3b9a7239de549bba1fe72e4904de9539 *src/icu74/i18n/numrange_impl.cpp 92b57cb82ced744e6b00679061c22fc5 *src/icu74/i18n/numrange_impl.h 27bcd721675482d079b948179052970a *src/icu74/i18n/numsys.cpp a172ec5c557763cd6b092757b2fd462c *src/icu74/i18n/numsys_impl.h f899886cbc5f633d5f9d46a89d8ff392 *src/icu74/i18n/olsontz.cpp 1218fa0474595656be22063a09e921e3 *src/icu74/i18n/olsontz.h 87100031fd028353243c80ce87d37245 *src/icu74/i18n/persncal.cpp 9b6378f841df0b123aa19f2f0869caab *src/icu74/i18n/persncal.h ff5c9c9b2a106c9047768dab57e5e6e3 *src/icu74/i18n/pluralranges.cpp 78e4071886f83d932cc819712e3db3b1 
*src/icu74/i18n/pluralranges.h c739d60a64b7dad5c6de92dd27481db4 *src/icu74/i18n/plurfmt.cpp c843e54a288d1c4dff1be10ff934bdaa *src/icu74/i18n/plurrule.cpp ef5e4fb83df35b874541c10b87ac2113 *src/icu74/i18n/plurrule_impl.h 738ef02f6e802aa3b7dd486705340c6b *src/icu74/i18n/quant.cpp 0f0d56b6013caf50b06f450635d587ea *src/icu74/i18n/quant.h 0fb69e3829d2628b0ee89fa4d3b9ba5f *src/icu74/i18n/quantityformatter.cpp 0133ad8ffe5881513b82e3d37fb4f85c *src/icu74/i18n/quantityformatter.h b60ac8bcb4890667f387243003bf255b *src/icu74/i18n/rbnf.cpp 25fdb60dab05421910412e6f4d76c9d5 *src/icu74/i18n/rbt.cpp 79d1ba8d0a1b16d2b74e77dbaa887748 *src/icu74/i18n/rbt.h 8821032e8f87e056c5855948d61addc9 *src/icu74/i18n/rbt_data.cpp 4e2c2c9c674687ae71a1a52f7177de09 *src/icu74/i18n/rbt_data.h 23e3d68c4acf8bee2bf14c9a5d09dfb2 *src/icu74/i18n/rbt_pars.cpp a041b95f9d99f7bbeca78b5f2c501b5d *src/icu74/i18n/rbt_pars.h 5748e9f794912ccc8db57b1840a99838 *src/icu74/i18n/rbt_rule.cpp 5dfdeb8adce79f2e9b695eb441f6dbe5 *src/icu74/i18n/rbt_rule.h 184376dc3546aa5cf1dd3d7da88a0645 *src/icu74/i18n/rbt_set.cpp 138974c3a39ef8c5fb84d850a5538ff1 *src/icu74/i18n/rbt_set.h 5812e13a3ac120a915c81ac879e570dc *src/icu74/i18n/rbtz.cpp 5d77a27e8db07dd64de7de828bfbb111 *src/icu74/i18n/regexcmp.cpp cf8010708b2762e77c020a1044a0e114 *src/icu74/i18n/regexcmp.h 4caf2d01afdc843ae32f91dc3c1a8775 *src/icu74/i18n/regexcst.h 3ca092e7ce9e7bd741ab4eb28cfb0695 *src/icu74/i18n/regexcst.pl 7990cbeadf4e0e067b6e1d98895d82d3 *src/icu74/i18n/regexcst.txt 22b65d548d0a21baa1c60bb5de0a73bd *src/icu74/i18n/regeximp.cpp 3f66fa1168b97d7d731faf95733fe68d *src/icu74/i18n/regeximp.h c25a700ff0102e131078aa983d2418aa *src/icu74/i18n/regexst.cpp ec2216308bc15bbc9c69ed3836e5c159 *src/icu74/i18n/regexst.h 54adba24f34306a6d9ba04e2c58b3055 *src/icu74/i18n/regextxt.cpp 02e21be1f3c784a681d278201240bc8b *src/icu74/i18n/regextxt.h df4d106bb8587f902cdb2c284c8bb2c5 *src/icu74/i18n/region.cpp 3e8ecef525426509e2bca8c308d5ccb6 *src/icu74/i18n/region_impl.h 
0476d52799f3f5aadebecdd5f8a10218 *src/icu74/i18n/reldatefmt.cpp 82542d6feae419efa6deb05c1d910e90 *src/icu74/i18n/reldtfmt.cpp e25d17a38cac736c3ae73fae8a81e2cf *src/icu74/i18n/reldtfmt.h b57d2602241987451644b075a6dd9ed2 *src/icu74/i18n/rematch.cpp a50149472f47b3dfc9838b4116a270d3 *src/icu74/i18n/remtrans.cpp 68d9094b4a6d70b697f92463fb69c185 *src/icu74/i18n/remtrans.h a4bcbfcf7ddd7d5c4d5f58a91702fe70 *src/icu74/i18n/repattrn.cpp 002a51bafee19447c957163ffd6c70a8 *src/icu74/i18n/rulebasedcollator.cpp 55b504e415a93a43e9bc29a699f070d6 *src/icu74/i18n/scientificnumberformatter.cpp 7c98b4241a01c1083c78ec99c2cdc7eb *src/icu74/i18n/scriptset.cpp 70163664c605350da257f89405924e63 *src/icu74/i18n/scriptset.h ec9510445f36dcaa0c9d8c2aea2b2f66 *src/icu74/i18n/search.cpp 7ac6cfe9d5f2d91956d254916d7a04c3 *src/icu74/i18n/selfmt.cpp ae27723b93891a39cb8894dbad74e243 *src/icu74/i18n/selfmtimpl.h 2565550c239f65d9619ed8a4c668e9e3 *src/icu74/i18n/sharedbreakiterator.cpp aa975748e44a0c65efc8449f0dfeb976 *src/icu74/i18n/sharedbreakiterator.h 17b1c0c7792a42f5f6249d9775cec741 *src/icu74/i18n/sharedcalendar.h 1386d2906094bd9356ccbd72761429b6 *src/icu74/i18n/shareddateformatsymbols.h 5d1e6b8d6644a32ba9e473e76ffeca99 *src/icu74/i18n/sharednumberformat.h 8f32ec904ffc0ea9f857e2e4b402ec4f *src/icu74/i18n/sharedpluralrules.h ac06c7c8f0a70bdf3f5fdb52e85357b2 *src/icu74/i18n/simpletz.cpp 5f000a5daa17cbb5fc5f5f5fbaf9f043 *src/icu74/i18n/smpdtfmt.cpp abf9bfd6cec588b807ff2ef07d715141 *src/icu74/i18n/smpdtfst.cpp 803bce2d5c6ae1cd75295f9f6ab3fef7 *src/icu74/i18n/smpdtfst.h 4a2f0d7a67460c4d63417f1ef2299f35 *src/icu74/i18n/sortkey.cpp d380ff41e65c118e423c03d7dc3b0262 *src/icu74/i18n/standardplural.cpp cfa0d8c0219d9b40b23504b04be2a884 *src/icu74/i18n/standardplural.h 3a74ea64b4f5ff0a65bb34cd6ce04dfc *src/icu74/i18n/string_segment.cpp afbc178c35fe6f1adccb38c03e9a1631 *src/icu74/i18n/string_segment.h 6c464ab583d2cb0c83c4db2a5f73e41d *src/icu74/i18n/strmatch.cpp 2765301e7fbe35a5e1994e9f86c9b1ec 
*src/icu74/i18n/strmatch.h 3cd0108b1a743d7fc9018487b0528b3d *src/icu74/i18n/strrepl.cpp cd8c27f855bd94517c0abbfacc299f50 *src/icu74/i18n/strrepl.h c3164f4b079f5ce10ee4c25e9b16dd99 *src/icu74/i18n/stsearch.cpp 6eb3c3c25dc9a16540f26ede38883c66 *src/icu74/i18n/taiwncal.cpp ec8c0b7e5b5a2d7a0a4aca486ca3cd8f *src/icu74/i18n/taiwncal.h dd6395e8cd46205bfa9b7066c2ee8d1b *src/icu74/i18n/timezone.cpp 0a5dc36b7c17d3333b7d895d89c8299f *src/icu74/i18n/titletrn.cpp a29f132f8b7a8aec85830de60108774c *src/icu74/i18n/titletrn.h ef2daee21bbf59ad08c30614b90e52cf *src/icu74/i18n/tmunit.cpp 07d8cdfbc2d03813f7c627f8da1655bb *src/icu74/i18n/tmutamt.cpp 0c681c5582802401834b9da9b36ab450 *src/icu74/i18n/tmutfmt.cpp ac2bbf452dd3783dbac98a5bfea3217a *src/icu74/i18n/tolowtrn.cpp 4d2f04d6c5b9f00a253da9e2df714e56 *src/icu74/i18n/tolowtrn.h 6449b2a314524ec4c0c288d46082ae33 *src/icu74/i18n/toupptrn.cpp c70ab3b7dc12638c2b27b4204e80f7e2 *src/icu74/i18n/toupptrn.h 370842bfd73fbb33c20647fe3da8d5fc *src/icu74/i18n/translit.cpp e0e47086f995eea6bb9459f59df55347 *src/icu74/i18n/transreg.cpp ebca78ee6fb0857067b085818b440316 *src/icu74/i18n/transreg.h dc7962d26f45145c33c12d59ede9a35d *src/icu74/i18n/tridpars.cpp 9a53b3381bdc7d6cbdc0384e055450c3 *src/icu74/i18n/tridpars.h 73c11b9ea03b9f22f408978591a03bf3 *src/icu74/i18n/tzfmt.cpp ced689b025fca4dd563d88282fd45c2c *src/icu74/i18n/tzgnames.cpp 08e1df95a27f65421f23df09c8fb0267 *src/icu74/i18n/tzgnames.h 9c063c1a3b0e9dc53a6e2966ab885b84 *src/icu74/i18n/tznames.cpp 6d0e225e55ced3ba250632d91096d5cd *src/icu74/i18n/tznames_impl.cpp b1fbca10ca71bada19ccdc23a5753681 *src/icu74/i18n/tznames_impl.h 96e4265046612a9f9e45f0e0603143c7 *src/icu74/i18n/tzrule.cpp 8caa658b0790bd2f20d6c045b52dd544 *src/icu74/i18n/tztrans.cpp 7929d2444609f93f42535d8bd4eb7b81 *src/icu74/i18n/ucal.cpp cf2df4a981de69f553c29760586fd2e8 *src/icu74/i18n/ucln_in.cpp 05eaff445e0ae3e8f3df4a8e2a3b0104 *src/icu74/i18n/ucln_in.h b75385c005c9405876bd0f39901f2669 *src/icu74/i18n/ucol.cpp 
74e2da658fd620e01604a54b13e6a903 *src/icu74/i18n/ucol_imp.h d416be4f64bba5cfc2c80ade61e1599c *src/icu74/i18n/ucol_res.cpp a8edef6a8d1a7877b930cfe06bc7fcf8 *src/icu74/i18n/ucol_sit.cpp a396e5ad795c4fd29a1fd711942165ba *src/icu74/i18n/ucoleitr.cpp 3a1f864549e42c94ae4492b7a95b173e *src/icu74/i18n/ucsdet.cpp 1ec0ea491c3f4e92c49c91d71d33d088 *src/icu74/i18n/udat.cpp 93f90b129ea6ce3c6d0b6e5178168124 *src/icu74/i18n/udateintervalformat.cpp fcb41c8023b4abb881c515eb3f42b3e1 *src/icu74/i18n/udatpg.cpp 81ca04f0525e9a90631cbd4cc7bb8075 *src/icu74/i18n/ufieldpositer.cpp 8d71927df5786143bc0a67f054e68109 *src/icu74/i18n/uitercollationiterator.cpp f5dd577dcd177f3f50a76a2355b2d296 *src/icu74/i18n/uitercollationiterator.h 0896ff1f34edc00e33b588db0f73ba76 *src/icu74/i18n/ulistformatter.cpp 8b34721af92e3c1f3b3ab17a57d88e3a *src/icu74/i18n/ulocdata.cpp 6eefc224fd1cdfd1ae446b2f63972487 *src/icu74/i18n/umsg.cpp 014d78caa836affaa85f2e252e39c1fb *src/icu74/i18n/umsg_imp.h bc1802df6f8b549d67dfc070df3ffd68 *src/icu74/i18n/unesctrn.cpp 853a7e4af8dafb3ff1e9601fbf258daf *src/icu74/i18n/unesctrn.h f7693b8e7e9e384ca4cee3209c1d9b47 *src/icu74/i18n/uni2name.cpp f7bfbddb7b0266971da4e66ff2c71955 *src/icu74/i18n/uni2name.h 696752594bb68c28f171fab3674e9820 *src/icu74/i18n/units_complexconverter.cpp 7af7164d8ca6ef0f6a61f32ed02341bb *src/icu74/i18n/units_complexconverter.h 93a52cb93abfd8da0ebbf71e0cc23c8d *src/icu74/i18n/units_converter.cpp 6b77759d83b2ce3307751b72482d01eb *src/icu74/i18n/units_converter.h 05068ed9104ab4edd83ea023dfee6384 *src/icu74/i18n/units_data.cpp 2a9e516c4f92e6194da35044978bfda8 *src/icu74/i18n/units_data.h bec40009b2a6297feafa7062d2fa2808 *src/icu74/i18n/units_router.cpp d647205f11d816053621395bf3e56e6b *src/icu74/i18n/units_router.h cb9c8b40d2d9b74f13281e62cb3d732c *src/icu74/i18n/unum.cpp e49c6c5347293466fc4ba8f61eb6feca *src/icu74/i18n/unumsys.cpp b413fb527b94a47d84275231881e864c *src/icu74/i18n/upluralrules.cpp 68931d131908924f671ab5ac202fdba4 *src/icu74/i18n/uregex.cpp 
3bdd3210fc12e5e41eab5e0a4c72111f *src/icu74/i18n/uregexc.cpp 4359f9e01178046ea837a4f5c48a3d5c *src/icu74/i18n/uregion.cpp a59fd255bee8ea6c46a5c4a8e38602ce *src/icu74/i18n/usearch.cpp 6f992808f8a6203496f97d15c6700477 *src/icu74/i18n/uspoof.cpp 1f929ca604e5338ab792d94f7db872cb *src/icu74/i18n/uspoof_build.cpp f7b4b054fa14efd75a70c215594f2a69 *src/icu74/i18n/uspoof_conf.cpp 84503dab427da29acf1800c7c0eaeb93 *src/icu74/i18n/uspoof_conf.h 076961df215ae0178f75fb464cdff33c *src/icu74/i18n/uspoof_impl.cpp e3af965ef8796c4587f5342d4b8d563a *src/icu74/i18n/uspoof_impl.h cb6366ce3299ac312b56102e502f575d *src/icu74/i18n/usrchimp.h 3fa260bb5e98e1454212f6627e64b747 *src/icu74/i18n/utf16collationiterator.cpp c3f2f7ec37bdb63962e15c0a315e33f1 *src/icu74/i18n/utf16collationiterator.h dce7ae5ad8236badad99beb6b7d463bb *src/icu74/i18n/utf8collationiterator.cpp 67b7373706e12c98fdabf488a7e3d632 *src/icu74/i18n/utf8collationiterator.h f4656b9e13a254cadd1315384d60517a *src/icu74/i18n/utmscale.cpp a8036c12a54f9fe58e3257bead6232a5 *src/icu74/i18n/utrans.cpp d035f7e39015ccdf8ac0f4199a2aacd6 *src/icu74/i18n/vtzone.cpp 6d05ea39cd1174cdf6eed94050321b0d *src/icu74/i18n/vzone.cpp 252d2a07067bedb3ffe026d5bde9a090 *src/icu74/i18n/vzone.h 0bcc62711eb3576d2ef4c9e915e09047 *src/icu74/i18n/windtfmt.cpp 5318d59e0ac4c0b58655b28cd242dac5 *src/icu74/i18n/windtfmt.h f1e0b3f778dea398d9edf30b85b248d9 *src/icu74/i18n/winnmfmt.cpp d5ee2e585d1f0bd75ff717b7dc920ded *src/icu74/i18n/winnmfmt.h 021778c6c0d161f16992a9e1d7d275bc *src/icu74/i18n/wintzimpl.cpp efa44197993d058a7f1fe304529b0e53 *src/icu74/i18n/wintzimpl.h b474d1e82f9d838149d766c14ee12687 *src/icu74/i18n/zonemeta.cpp f1d835a6ef797c289b1c0ab0d167296e *src/icu74/i18n/zonemeta.h 9c1ba436d76330416d8622088bf7cb0e *src/icu74/i18n/zrule.cpp 3499e2f82c86b54ae1df560c68771138 *src/icu74/i18n/zrule.h a4d52e6c132b080c8a88c2146d9f9b80 *src/icu74/i18n/ztrans.cpp 70d85481c6c331c63c5e97b888e0d2d2 *src/icu74/i18n/ztrans.h 2dbee2e923f1fcc207a4900324ab9ff4 
*src/icu74/stubdata/stubdata.cpp a0a21beebfcdbf8f697c808f18d63360 *src/icu74/stubdata/stubdata.h ec79ecd11d550d28219edc761ca194c6 *src/icu74/unicode/alphaindex.h b534307b4927912b07a057ba814a5aff *src/icu74/unicode/appendable.h b6ba00878a2f1d089810b84e215334f8 *src/icu74/unicode/basictz.h 45759d53352c84e6cc50b27525432137 *src/icu74/unicode/brkiter.h aa057828d4ac2c5b0391a7ab9fd43099 *src/icu74/unicode/bytestream.h 4150910418a57f6a48d724526b21fedf *src/icu74/unicode/bytestrie.h 3e1a00abe95f389e97aa9ef701b338e5 *src/icu74/unicode/bytestriebuilder.h 08be0538368e51090e6ff1d23dfec6f7 *src/icu74/unicode/calendar.h 641e741b2c01155b2d4fbd9d2713a614 *src/icu74/unicode/caniter.h e3425f7c74b34cf79026cd19a031f7bb *src/icu74/unicode/casemap.h 6cb8a2e1958b7b7a55726cce5b68a803 *src/icu74/unicode/char16ptr.h 05c18c0f6891a66d16ffc6cb06d4efdd *src/icu74/unicode/chariter.h c066a51f7d50be3565be48eb25e70441 *src/icu74/unicode/choicfmt.h db313832874af0e89b4b12b8fd28d0f8 *src/icu74/unicode/coleitr.h 6d0fdc0fa650dda69b0719bb708aa6d5 *src/icu74/unicode/coll.h 510d30046480b513145d1694c7f22293 *src/icu74/unicode/compactdecimalformat.h 33e59e40478f72b464102e8bc8c3c786 *src/icu74/unicode/curramt.h 9593ad2b47287896517deebbf5167390 *src/icu74/unicode/currpinf.h 4bc2e93eee8b039d863aae7deb16c49e *src/icu74/unicode/currunit.h 17a8772d691939702e42bf97973c3454 *src/icu74/unicode/datefmt.h 3d4186ec9549d6693d136257f610ec06 *src/icu74/unicode/dbbi.h 85cb07a5b874370e37d8987e2e9e1fe3 *src/icu74/unicode/dcfmtsym.h 9561cbaedec8cf25002de5554dcd00e8 *src/icu74/unicode/decimfmt.h e9fc936da341e6bcef7f60b6ddaabcac *src/icu74/unicode/displayoptions.h 9ecff9520f7c9c103fcf2d98071f2820 *src/icu74/unicode/docmain.h a9a39cdc2820819eca29a17b691a961b *src/icu74/unicode/dtfmtsym.h 354bfd450b52bb73685e60f99df17fe3 *src/icu74/unicode/dtintrv.h 4135d8969428fd463ad91e98e828429b *src/icu74/unicode/dtitvfmt.h d685ce1d63aa8690a55e0e35fd6144e8 *src/icu74/unicode/dtitvinf.h c0d29f49d2d4d48c9ec46b217df5ef88 
*src/icu74/unicode/dtptngen.h 6a139c62bc78fb7ff84726ec1df8f002 *src/icu74/unicode/dtrule.h f72eba6fc28806010b354b7367082e3e *src/icu74/unicode/edits.h 3bb75fc43c88682895ae98c02e69810d *src/icu74/unicode/enumset.h debeda4467a19aaf964fafc73629c8ae *src/icu74/unicode/errorcode.h 321862339b410fb1482231175b6321da *src/icu74/unicode/fieldpos.h 4f05b806ecf8bf8797d7c6ede0851fdf *src/icu74/unicode/filteredbrk.h 45d5addd3877b01d1fb5cec1d2516b58 *src/icu74/unicode/fmtable.h b00fe8417659b75d47dce7dd3dc731c5 *src/icu74/unicode/format.h 32550933d3feee0fd4e6435174e413c4 *src/icu74/unicode/formattednumber.h c6f38411864c36f83ed75de4100a8ba8 *src/icu74/unicode/formattedvalue.h 0eb92a7c2025b739075f4891a344a710 *src/icu74/unicode/fpositer.h d7e955b4dc08fc7dd19a4f699daf1ebf *src/icu74/unicode/gender.h 0c5818fc6f6aca256e4c44eb835c7de7 *src/icu74/unicode/gregocal.h 7f302721a25b90f8db3844963a6db0ed *src/icu74/unicode/icudataver.h 1956da7c9086dcefccc89abf23fc1bc9 *src/icu74/unicode/icuplug.h 80f4736dc94e89d7e0204cd4653c5b04 *src/icu74/unicode/idna.h f0b534815224ac8fcdb37ebdc85fe36f *src/icu74/unicode/listformatter.h ab1b8f1cb43634452dad8f3f7df9826f *src/icu74/unicode/localebuilder.h 2e414786f4f07e7690787b46c8054453 *src/icu74/unicode/localematcher.h c91c88bfc334c7d499f5a41e7af0369d *src/icu74/unicode/localpointer.h aa260a00e31e970757e3b3cbe00fff3d *src/icu74/unicode/locdspnm.h e5e8a1f66e28a293c82a9619ed54483e *src/icu74/unicode/locid.h 60f32da233fd51948748d354bad7f8e4 *src/icu74/unicode/measfmt.h 08d500980300a85d60a9978214834f1d *src/icu74/unicode/measunit.h 80785d8036f254a86d9baac0c1af2ed4 *src/icu74/unicode/measure.h 1f576c7e53600371bb451b741f3c9a9d *src/icu74/unicode/messagepattern.h 5e0c21fab16bd123d4ec85e06be97cad *src/icu74/unicode/msgfmt.h 101ff621627f86c93aa2ad3b7325ad11 *src/icu74/unicode/normalizer2.h c3cbf740fd160ebc483f63ebc98a737a *src/icu74/unicode/normlzr.h 1869765b22f9a6ebab507347f5404cd1 *src/icu74/unicode/nounit.h 80f907ca399d415b1f02eb0c2f74aa01 
*src/icu74/unicode/numberformatter.h 68cfbb32bc32dd4309ab9d211570d974 *src/icu74/unicode/numberrangeformatter.h bde1976339fcd7dafb55ea40b3e18e9d *src/icu74/unicode/numfmt.h 3b99ae8bdb3d640cd23994924ba30cc3 *src/icu74/unicode/numsys.h ce38831411af01eeaf0bbbb6e1cb0153 *src/icu74/unicode/parseerr.h cc5ccd6c547dd9091c7f49c751ecb991 *src/icu74/unicode/parsepos.h 3dcb1366225146b55947cd1a0bb7caf9 *src/icu74/unicode/platform.h 483f1161d5dcdd8dabba2ab1c0676472 *src/icu74/unicode/plurfmt.h d58e612ffa69af52219e7cd9c63f81e4 *src/icu74/unicode/plurrule.h d74097874c82f77331d8e06a5c59d37f *src/icu74/unicode/ptypes.h 749d40b1814eebb28751c765149de620 *src/icu74/unicode/putil.h e62b3ce376b276233ca6d5c42b269cf4 *src/icu74/unicode/rbbi.h c72642843afb410e8608a1854d1ec5e6 *src/icu74/unicode/rbnf.h f398e5f778c63899a5fb0491a7865e0a *src/icu74/unicode/rbtz.h 792e6d02296f6e2e7aa11c2738ed61ce *src/icu74/unicode/regex.h 4bfb46a34299058158a17293463257ce *src/icu74/unicode/region.h 71f70261239632a1774c662dde7fb15f *src/icu74/unicode/reldatefmt.h 6c6cf6b8bac96615e3e5badfd8e61abf *src/icu74/unicode/rep.h 8916c1460ab58af3dd262b4a4078b689 *src/icu74/unicode/resbund.h fdffae48b9f5bd15c850ce0adf9b72d0 *src/icu74/unicode/schriter.h b3f324f5b5909bfdbb44ad017eebfc2a *src/icu74/unicode/scientificnumberformatter.h 0d1d877947274d7d93a978136cd5956b *src/icu74/unicode/search.h a8673ab1e695133af0e7ba1bfdc65b0d *src/icu74/unicode/selfmt.h f10649909c461296486a9c0a7dd5857d *src/icu74/unicode/simpleformatter.h 7e988e590297e335b18f3f147a8aaae3 *src/icu74/unicode/simplenumberformatter.h 1f5fe314833142f0a225893d793f77fa *src/icu74/unicode/simpletz.h 51aa79b85d606c79c81900c5eca8ad5a *src/icu74/unicode/smpdtfmt.h 741b50d64482931844d297e126636fa1 *src/icu74/unicode/sortkey.h bb3b2e28a2ae9b66a60b7d90f8c07512 *src/icu74/unicode/std_string.h 785cf3e673212965b782c4413e6b8222 *src/icu74/unicode/strenum.h d4929224e8d9df2f272f0305780cbacb *src/icu74/unicode/stringoptions.h 8e783cf9e6e4bf924925ab8a231b4643 
*src/icu74/unicode/stringpiece.h 278a1b70f109127ce330b6ce78b10759 *src/icu74/unicode/stringtriebuilder.h 050caaae33d81da3f92391beda9abe6e *src/icu74/unicode/stsearch.h 0a29f6657cdf2f1fb51f571f6ed91188 *src/icu74/unicode/symtable.h 580312a384c454b22fa78efd57d68dc8 *src/icu74/unicode/tblcoll.h 9c085cb49cdb637e1c6ed9135a067a85 *src/icu74/unicode/timezone.h d4c113ee349bf153446afe6368656eec *src/icu74/unicode/tmunit.h 9b2b3e2a43eaa5de71cb8f48ce70f1cf *src/icu74/unicode/tmutamt.h e278520847e0ebb7b58047ec4b1938e5 *src/icu74/unicode/tmutfmt.h 1add1a6880730bfb536af6047374930b *src/icu74/unicode/translit.h 09771861d074000b6ddb91670ac0e808 *src/icu74/unicode/tzfmt.h b999d3f10ebb00b154a9c2cae60dd386 *src/icu74/unicode/tznames.h 3b2958cf9b88d5a66f949d331a87707a *src/icu74/unicode/tzrule.h 7708a0f07a173cfbce2671b54009c9f2 *src/icu74/unicode/tztrans.h 3ab58e9b0b0ab26f8f15b0538ee467c1 *src/icu74/unicode/ubidi.h 06f757bac430c0f6ce0d2f44d090eab8 *src/icu74/unicode/ubiditransform.h e1cea4823bccb6ca313538a45c75fe40 *src/icu74/unicode/ubrk.h dc4ad784285da5b246f48f49563af5bf *src/icu74/unicode/ucal.h 86ef4f5a8f0d8708669491ed9467ee1c *src/icu74/unicode/ucasemap.h b536ccb7b79d1fee71b5b53e7d370f8f *src/icu74/unicode/ucat.h f2e3aa1c6fea75517962d6d8596aaa94 *src/icu74/unicode/uchar.h 6e75f25a3a7d95f85a104f4800f00e45 *src/icu74/unicode/ucharstrie.h f45c632aeab5184ef4af16857a096168 *src/icu74/unicode/ucharstriebuilder.h 99952c7f7481d1d6849ca745d7ada609 *src/icu74/unicode/uchriter.h 08c4cdf97d737fb8bed066c4f1411f15 *src/icu74/unicode/uclean.h 7a6379ffa7fb17129c387d51f6a2ed76 *src/icu74/unicode/ucnv.h b5307a489929d900d1505e3b17546544 *src/icu74/unicode/ucnv_cb.h a507bbb125e54d860485ec054a4a55ed *src/icu74/unicode/ucnv_err.h 3e19ed11aa05d92bed1e152d75808fbc *src/icu74/unicode/ucnvsel.h db8f0d1efdcd516cfc72b8f07df02b57 *src/icu74/unicode/ucol.h 82784539ceee31f9117c6a377542ac72 *src/icu74/unicode/ucoleitr.h eb78be846cee6d512bf6d74c076cea7f *src/icu74/unicode/uconfig.h 
fab7a1cb4679b3799a86e331a73e5b6b *src/icu74/unicode/ucpmap.h b52b06ac993637f99a3c0795019bc616 *src/icu74/unicode/ucptrie.h 115314d31a5b3a0a67a860843bdc5a07 *src/icu74/unicode/ucsdet.h 93df41f8eed0aac934559ba79ec3a4b8 *src/icu74/unicode/ucurr.h a7a4edb0616190ce137195e18bad95f9 *src/icu74/unicode/udat.h 138bdc88a47b2447778af7b9e7a81405 *src/icu74/unicode/udata.h b3993a86728da0269bebb9afe03bf40f *src/icu74/unicode/udateintervalformat.h d3bb32cb07c55a3fced272d0bb320191 *src/icu74/unicode/udatpg.h 2b320b1e6042efb12c0cee35f5c19d7d *src/icu74/unicode/udisplaycontext.h a3a178d7830dcc01908671bb50f4db31 *src/icu74/unicode/udisplayoptions.h 1c977846ed8fb1ed9a22ef5505b5f74a *src/icu74/unicode/uenum.h 691ad0b0e8795dd2c653a3eba713d8f8 *src/icu74/unicode/ufieldpositer.h 8d5c4936b633b2d697a895b0cba8adfe *src/icu74/unicode/uformattable.h 32927ad9df741fc4f2f30e140e682e56 *src/icu74/unicode/uformattednumber.h c8cbe1952703b8efa1981fa54ee2449b *src/icu74/unicode/uformattedvalue.h 164eaedd0ecaa7bbe0b4afe99664e4e2 *src/icu74/unicode/ugender.h 1dc6db480aa52a700e6ec4e876783d52 *src/icu74/unicode/uidna.h 43e0c805cd086df8b62aa4110a3a055a *src/icu74/unicode/uiter.h a70ca9d644ec5a75dcd214cb2972775a *src/icu74/unicode/uldnames.h a64f637b452dde79f1569d6dd33116e9 *src/icu74/unicode/ulistformatter.h 10e85e2e67a966d8e36ff49d48dae183 *src/icu74/unicode/uloc.h a4e1ebf51b78f3927af50d001307df7e *src/icu74/unicode/ulocale.h 8a201c407f9d58061ac7f4a54c9a7c9f *src/icu74/unicode/ulocbuilder.h 802fd8f24ae13177edb94e8045293a67 *src/icu74/unicode/ulocdata.h 2d902ff00eac765b305cb113b74d34df *src/icu74/unicode/umachine.h f01bf0e2ecdad7168cae02051a68c6e0 *src/icu74/unicode/umisc.h 37ee4bc144f6245875d6c16051fd7f79 *src/icu74/unicode/umsg.h 6f10d7a98ed5bb1fa6123d12483a5e9c *src/icu74/unicode/umutablecptrie.h 3c83bb08bd0cdd214f8fb3244a674d79 *src/icu74/unicode/unifilt.h 821f46371d07630960cb18f261e98455 *src/icu74/unicode/unifunct.h 7cc35c453c14958148e3beff36faa28f *src/icu74/unicode/unimatch.h 
c551301d7294b366098e863e349a7b40 *src/icu74/unicode/unirepl.h 5cef1e374e075a8e6f4a1a0a0994811f *src/icu74/unicode/uniset.h 462e72372ff841496db258eb69846a7f *src/icu74/unicode/unistr.h 57454a0c2671043683a531c02c3b7784 *src/icu74/unicode/unorm.h 9bf09ab215d2625fc068f9ac4c3ba49b *src/icu74/unicode/unorm2.h 5890f50a4aa9e36955c17fab4bc1ec71 *src/icu74/unicode/unum.h 315dd3c68fa2d700410f8a47bac4098d *src/icu74/unicode/unumberformatter.h dc5ee8336efcb731c70bc67ef104e69e *src/icu74/unicode/unumberoptions.h 617d50483cf3e0973fca717816e1f8b5 *src/icu74/unicode/unumberrangeformatter.h 18bf54617ad67e194a83ce8de7fec7a8 *src/icu74/unicode/unumsys.h 93cd3606c473f206d9012a1d759aff0f *src/icu74/unicode/uobject.h b9d4a099b0076de1fe7b5238f6506dbb *src/icu74/unicode/upluralrules.h f2b961c77896af68f040fee75b96ab1d *src/icu74/unicode/uregex.h a136613a3d5416b0cb8a38b9a982dcd1 *src/icu74/unicode/uregion.h 2d37d0294471dbfb4303efdcbd817e4e *src/icu74/unicode/ureldatefmt.h cdfff39775f39a3eddf0df76c90729b9 *src/icu74/unicode/urename.h 6ada98ac1aa68081f10d17b1abadef6e *src/icu74/unicode/urep.h 1c6c3117cde06106cf8d200c0fc4438d *src/icu74/unicode/ures.h b73724ad86fb127bb7c3ec4ff6075922 *src/icu74/unicode/uscript.h 77ea7c165a8175aeeff168574b81b295 *src/icu74/unicode/usearch.h 05ca5673f7d5b8819c6b8068da5a0fe8 *src/icu74/unicode/uset.h 4b80450fa2e136e639bf86e436cd29ef *src/icu74/unicode/usetiter.h a30f44d71635b8afa174e2ce1e1188c1 *src/icu74/unicode/ushape.h 13d82c9a8e888e4fd1c1c8a919d52c11 *src/icu74/unicode/usimplenumberformatter.h 8555b71a17076ec05955b8fb3b438834 *src/icu74/unicode/uspoof.h eabdbc2e85d927e9132a487cceae4d19 *src/icu74/unicode/usprep.h 52148861630982d150b1b08117967604 *src/icu74/unicode/ustring.h 54a28aa9c4e117cd3ebfd0a670324f6c *src/icu74/unicode/ustringtrie.h eef1213d7071e18534af555eaa6e99a8 *src/icu74/unicode/utext.h f9990715c6252d369294e99a81b192ab *src/icu74/unicode/utf.h 7ba8c4dd33f399a67a9b2ddd8de5e35d *src/icu74/unicode/utf16.h a1fdf74f4bd8808f9fb17df371f886d9 
*src/icu74/unicode/utf32.h 4cc30aa971e4b8eb7c22c3d0554126ec *src/icu74/unicode/utf8.h 71329d19bae2193dae4d270898718e6f *src/icu74/unicode/utf_old.h 7452fa04c594359438a3240535209eaa *src/icu74/unicode/utmscale.h 1a10170c6f8fefa7f29f28164022fc6a *src/icu74/unicode/utrace.h 48f145f00b2ffaa332a63184a5282981 *src/icu74/unicode/utrans.h 5a4cc77d4aded3026b10df71bfb99dae *src/icu74/unicode/utypes.h 779cdf7824a7dbcb471aa4301ca8c425 *src/icu74/unicode/uvernum.h 2da9700caecc79f6640b9c92c1bc56cb *src/icu74/unicode/uversion.h 0406bfde77988075c026776072385914 *src/icu74/unicode/vtzone.h a4070f14b37d8f1acd211650f2ded226 *src/icu74_common_cpp.txt cd0d99d38a7d4ac6f0a4bddd70688de4 *src/icu74_i18n_cpp.txt d9648c8f0c3712b1f96f6ad9c7d3e683 *src/icu74_stubdata_cpp.txt 6150c6361d82f4c9026aab6b6a87a85f *src/install.libs.R.in fb5f86e0873efd7e7c549c13699fbb61 *src/stri_ICU_settings.cpp 0ff20e51dff85659d7cd69b56b298ff8 *src/stri_brkiter.cpp 29309d746858edba632a6ac13841ca01 *src/stri_brkiter.h 65c59843fa6ac72a2e8dd0979639462b *src/stri_bytesearch_matcher.h 4536f48e72dcc8f2fd072484c1afc380 *src/stri_callables.cpp 6d7577f17868bbf007b3ab898156adb2 *src/stri_callables.h 999b74134eb9940700f76479b316fbbe *src/stri_collator.cpp de7839d95c156cb9c0d371ad71909804 *src/stri_common.cpp a5a327fce8f64462c5641fc28f79ba04 *src/stri_compare.cpp f4f90e887cee7681ec60c61b18b69cad *src/stri_container_base.cpp 782e20deb4f3275feb4e8f26168640ac *src/stri_container_base.h ac5ab1c263eaa4d1a022d469af812040 *src/stri_container_bytesearch.cpp 75fd333fc8240055a37e3d071a2b5e9e *src/stri_container_bytesearch.h edb3cf965229a035067483457c9696e3 *src/stri_container_charclass.h c14b6e86b694301cf193d323d7b1f075 *src/stri_container_double.h e34704dc4ed05be96acd19a8ac0ae8e6 *src/stri_container_integer.h 2e5cf0580ed0efda929b9fe8ff5e26ac *src/stri_container_listint.cpp 810a33d2ad7b21d781d8205c9208f596 *src/stri_container_listint.h 76da91deea79a83d9a99d28a35cf94a4 *src/stri_container_listraw.cpp 12001c2617b74ca38cdc83b86829f4b2 
*src/stri_container_listraw.h 266a12b65704bb121af10c69a6abfc62 *src/stri_container_listutf8.cpp 1435c851af3349e0aabba8b948161e09 *src/stri_container_listutf8.h 744c0de65919d3a38e55f15edbb235b7 *src/stri_container_logical.h 8131ebae84748a16a124a477be3efcd2 *src/stri_container_regex.cpp d827d5a7633dd33db76991c6152ce7fb *src/stri_container_regex.h 7f5c9cc952082c5e783d279b50a023e7 *src/stri_container_usearch.cpp aff54201e0bc936849b79d237e54d93e *src/stri_container_usearch.h 240e0e4d9a445e434edfba60897dc2f3 *src/stri_container_utf16.cpp 5a429ebeeb7648ec50ad732cacd73964 *src/stri_container_utf16.h 253d3abedb7b8809adbb2a68b643ee45 *src/stri_container_utf8.cpp 31dfda69c9dee74a6392cb10a08010fe *src/stri_container_utf8.h 2ad8726ff33daf537a77699017f51998 *src/stri_container_utf8_indexable.cpp 2cb6d113326f3222c745b7f127d9c790 *src/stri_container_utf8_indexable.h f14bb0884dbf9ffc4982a8c2958db613 *src/stri_cpp.txt 67b4cbe9ecb1f096de83d3b5d513fa96 *src/stri_encoding_conversion.cpp 37c0e82c56933142e81641c96e584ffc *src/stri_encoding_detection.cpp 6fd2ba70aa6412a6e43e84d6779fa51d *src/stri_encoding_management.cpp 7563bcd22b34f1e9ae327805c5f45a45 *src/stri_escape.cpp 40c0157929ab0be4cdc431b274c81f2e *src/stri_exception.cpp 976d5e2faab934bf7c4b9132d0dcb084 *src/stri_exception.h 2fa6be7c409dcae8774302560b258239 *src/stri_exports.h fcbd9989f52a638e5e48c292c28bb119 *src/stri_external.h 041502f3bd4953d92ace1bd2aa101239 *src/stri_interval.h 10dbc313734a5bb26ab3bfcff8debc57 *src/stri_intvec.h 7d6a57e2fa6abf2615a1cac9ab365be5 *src/stri_join.cpp ffa3db866c25b86e50b7061df4faff12 *src/stri_length.cpp 46a8b7117cba2e48c12eee0929aa3980 *src/stri_macros.h bd743a87be2256eeec123d73445dc6ee *src/stri_messages.h ea2ad4d5742fafa38c206cc4b941949c *src/stri_pad.cpp 152db0a1f2c4caa32fcf5314abcdcda8 *src/stri_prepare_arg.cpp 2adc53a4628f3040aff80ee02f98d41c *src/stri_random.cpp 1cb4ec7d0b6bc0d6adf244e785679826 *src/stri_reverse.cpp bfba95388e4f32795ba6844ff528277b *src/stri_search_boundaries_count.cpp 
18791d35d171aeda92d161063732f734 *src/stri_search_boundaries_extract.cpp 350f53d64ad199495ec530e0cff93d66 *src/stri_search_boundaries_locate.cpp d48fbabffcea8deccf18d0677bb6b944 *src/stri_search_boundaries_split.cpp 94df35b3edf13dbf9ff4db1d5679b308 *src/stri_search_class_count.cpp a27d5f514ab03f883d3e6ff4333fc29d *src/stri_search_class_detect.cpp 1bcee41c9ccfab4f8a1dc08f8b8238f8 *src/stri_search_class_extract.cpp 9f05dc7db3735c45079b0451e9862a9d *src/stri_search_class_locate.cpp bcd00c874fddcf3d11ef0f5214ef9949 *src/stri_search_class_replace.cpp 2301343cc6d94c17a2a437cbdda36db2 *src/stri_search_class_split.cpp a9f9a80ca84aaf7fc5800db889998fb9 *src/stri_search_class_startsendswith.cpp dc1e20c339eecdbf64b66edcb30814c6 *src/stri_search_class_subset.cpp 9ad5ca966c1343f6c9a1ec54f2805e09 *src/stri_search_class_trim.cpp 400a34946a62a6665dba6133613aa427 *src/stri_search_coll_count.cpp 68428d5a38f50cb68317c0aa2321f2bc *src/stri_search_coll_detect.cpp ab8d89132b513b55da86315d5ce321a1 *src/stri_search_coll_extract.cpp 2a64b8ce7c6483ec0e6752ac9b46c4d5 *src/stri_search_coll_locate.cpp 9f5c17d4be8e8b272a40ab22e8167c21 *src/stri_search_coll_replace.cpp bb8f994bc643fff66afa1ac0bd754226 *src/stri_search_coll_split.cpp 8f9d57bf1e05251a794b3118aa603a7d *src/stri_search_coll_startsendswith.cpp b279600820fe650272bb17adeb8be3b4 *src/stri_search_coll_subset.cpp ac697151b8087b75ee766226bf406efc *src/stri_search_common.cpp 5a3cb5f5107f6d5ea557eab429d515cf *src/stri_search_fixed_count.cpp 16d121db52a7f6adcc0f07d6d8f88632 *src/stri_search_fixed_detect.cpp ade62a6f1dee537a373816d625b05241 *src/stri_search_fixed_extract.cpp 8c82763d282a04b411875cbcf37e21ca *src/stri_search_fixed_locate.cpp 4c467f11a8762e96d72be58e3110dcb4 *src/stri_search_fixed_replace.cpp 0d4e558341ba7d95e531aeb05398f220 *src/stri_search_fixed_split.cpp 39572950e4c8ca0ce5f37791c508dbd7 *src/stri_search_fixed_startsendswith.cpp f7d990d20ad700912ebd5fdb85b9cf85 *src/stri_search_fixed_subset.cpp 797523b241c0a526e5d0cdf00ac0a036 
*src/stri_search_in.cpp 16ec5e9660e1a1ff631d4b387a3242d2 *src/stri_search_other_split.cpp 5801b7e6ff5dca59feee5b8d7be9d59f *src/stri_search_regex_count.cpp d772307081f0bea44ba25ef90f505ab4 *src/stri_search_regex_detect.cpp aa632649d54c16dff6caf386d50dfc25 *src/stri_search_regex_extract.cpp 28e44f4781b3f88a3f36830f2aae0e70 *src/stri_search_regex_locate.cpp 98e5b7c40210647bd1c43925f2afa84a *src/stri_search_regex_match.cpp e33b296680044b6e3204249fdb477d67 *src/stri_search_regex_replace.cpp 50dadf6eaaa7a8759856a824ece614e2 *src/stri_search_regex_split.cpp 33a6024188555ada4d6d7bbafad284e6 *src/stri_search_regex_subset.cpp c107f666262ea6e772403cee8093d339 *src/stri_sort.cpp a376d482fdaabe2f0fab36d79aad672c *src/stri_sprintf.cpp bd4783e910100d5d22e404ec8e019b3f *src/stri_stats.cpp fd32f7c9b4dda6e83be100fbacd7bf2a *src/stri_string8.cpp 55f899d3255f92a4875e0532be42ec4e *src/stri_string8.h 05a15f74a18447d688094e9b484d4512 *src/stri_string8buf.h 57b37b079e73d9a5ee57c47fc6767c45 *src/stri_stringi.cpp b31d40f98a7051e0b2fb0843fe95c032 *src/stri_stringi.h 6535842e678905e27cb8a9e0bf6648b4 *src/stri_sub.cpp c33bce1462f1a48f4dd2ebaab9c1852b *src/stri_test.cpp 188ef6ea0fc5e80bcd23eaf5c1135e14 *src/stri_time_calendar.cpp e3426a190e0b751ca561080c383ec9a3 *src/stri_time_format.cpp e47421dc81ae335f8dfc65f94ecb85f6 *src/stri_time_symbols.cpp dd25cde7f0118f7aeecbe6f7cedde614 *src/stri_time_zone.cpp ed8b336d61c3eca1be978371efb8f681 *src/stri_trans_casemap.cpp 8c83c91976b44ebcef290131e61c1686 *src/stri_trans_normalization.cpp f16759c75e1682efd17a50d04e948c66 *src/stri_trans_other.cpp d296e57c3bba2d0c9ec2fb951e96c13d *src/stri_trans_transliterate.cpp 9764a87dca4729845b5dbf5c7e925d86 *src/stri_ucnv.cpp 0ff791f6303d23e4ac36857c90c588a8 *src/stri_ucnv.h e77227c5e7acd921504fb129769ce259 *src/stri_uloc.cpp 500df823cc0795f16d15bc411eecfdac *src/stri_utils.cpp 9e121071c528d969cefdc5b02efd5d1d *src/stri_wrap.cpp bb1dcb7fe7fd5ca35f7ec3626ea59e2f *src/uconfig_local.h.in 7c699ef342589de58701f093695d4ab4 
*tools/AC_CXX_HAVE_STL.m4 b58a26e64432c535ed081494fc39783e *tools/AC_CXX_NAMESPACES.m4 stringi/configure.win0000644000176200001440000000350114750110641014427 0ustar liggesusers# Copyright (c) 2013-2025, Marek Gagolewski # This is an architecture-independent configure.win file ICU_FOUND=0 # use our ICU bundle ICUDT_DIR="icu74/data" ICU_BUNDLE_VERSION=74 # some systems do not have ResolveLocaleName - this applies to mingw # on 32-bit windows shipped with older Rtools (R < 4.2) DISABLE_RESOLVE_LOCALE_NAME=`"${R_HOME}/bin${R_ARCH_BIN}/Rscript.exe" -e ' cat(as.integer(getRversion() < "4.2")) '` echo "ICU_FOUND=${ICU_FOUND}" echo "ICU_BUNDLE_VERSION=${ICU_BUNDLE_VERSION}" echo "ICUDT_DIR=${ICUDT_DIR}" echo "DISABLE_RESOLVE_LOCALE_NAME=${DISABLE_RESOLVE_LOCALE_NAME}" "${R_HOME}/bin${R_ARCH_BIN}/Rscript.exe" -e ' fin <- "src/uconfig_local.h.in"; fout <- "src/uconfig_local.h"; f <- readLines(fin); f <- gsub("@ICU_FOUND@", '"${ICU_FOUND}"', f, fixed = TRUE); f <- gsub("@DISABLE_RESOLVE_LOCALE_NAME@", '"${DISABLE_RESOLVE_LOCALE_NAME}"', f, fixed = TRUE); f <- gsub("@ICUDT_DIR@", "'"${ICUDT_DIR}"'", f, fixed = TRUE); f <- gsub("@ICU_BUNDLE_VERSION@", "'"${ICU_BUNDLE_VERSION}"'", f, fixed = TRUE); f <- gsub("@ICUDT_ENDIANNESS@", .Platform$endian, f, fixed = TRUE); con <- file(fout, "wb") # LF line ending writeLines(f, con); close(con) ' "${R_HOME}/bin${R_ARCH_BIN}/Rscript.exe" -e ' fin <- "src/install.libs.R.in"; fout <- "src/install.libs.R"; f <- readLines(fin); f <- gsub("@ICU_FOUND@", '"${ICU_FOUND}"', f, fixed = TRUE); f <- gsub("@DISABLE_RESOLVE_LOCALE_NAME@", '"${DISABLE_RESOLVE_LOCALE_NAME}"', f, fixed = TRUE); f <- gsub("@ICUDT_DIR@", "'"${ICUDT_DIR}"'", f, fixed = TRUE); f <- gsub("@ICU_BUNDLE_VERSION@", "'"${ICU_BUNDLE_VERSION}"'", f, fixed = TRUE); f <- gsub("@ICUDT_ENDIANNESS@", .Platform$endian, f, fixed = TRUE); con <- file(fout, "wb") # LF line ending writeLines(f, con); close(con) ' stringi/R/0000755000176200001440000000000014771224007012136 5ustar 
liggesusersstringi/R/files.R0000644000176200001440000001407714750110641013367 0ustar liggesusers# kate: default-dictionary en_US ## This file is part of the 'stringi' package for R. ## Copyright (c) 2013-2025, Marek Gagolewski ## All rights reserved. ## ## Redistribution and use in source and binary forms, with or without ## modification, are permitted provided that the following conditions are met: ## ## 1. Redistributions of source code must retain the above copyright notice, ## this list of conditions and the following disclaimer. ## ## 2. Redistributions in binary form must reproduce the above copyright notice, ## this list of conditions and the following disclaimer in the documentation ## and/or other materials provided with the distribution. ## ## 3. Neither the name of the copyright holder nor the names of its ## contributors may be used to endorse or promote products derived from ## this software without specific prior written permission. ## ## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ## 'AS IS' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, ## BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS ## FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT ## HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, ## SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ## PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; ## OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, ## WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE ## OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, ## EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #' @title #' Read Text File as Raw #' #' @description #' Reads a text file as-is, with no conversion or text line splitting. 
#'
#' @details
#' Once a text file is read into memory,
#' encoding detection (see \code{\link{stri_enc_detect}}),
#' conversion (see \code{\link{stri_encode}}), and/or
#' splitting of text into lines (see \code{\link{stri_split_lines1}})
#' can be performed.
#'
#' @param con name of the input file or a connection object
#'        (opened in the binary mode)
#' @param fname [DEPRECATED] alias of \code{con}
#'
#' @return
#' Returns a vector of type \code{raw}.
#'
#' @family files
#' @export
stri_read_raw <- function(con, fname = con)
{
    # DEPRECATED: `fname` is honoured only when `con` itself was not supplied
    if (missing(con) && !missing(fname)) {
        warning("The 'fname' argument in stri_read_raw is a deprecated alias of 'con' and will be removed in a future release of 'stringi'.")
        con <- fname
    }

    # a file name is opened here, hence it is also closed here on exit;
    # a connection passed by the caller is left open
    if (is.character(con)) {
        con <- file(con, "rb")
        on.exit(close(con))
    }

    chunk_size <- 4194304L  # bytes requested per readBin call (4 MiB)
    chunks <- list()

    repeat {
        chunk <- readBin(con, what = "raw", size = 1L, n = chunk_size)
        chunks[[length(chunks) + 1L]] <- chunk
        # a short (possibly empty) read ends the loop
        if (length(chunk) < chunk_size) break
    }

    # glue all the portions into a single raw vector
    do.call(c, chunks)
}


#' @title
#' Read Text Lines from a Text File
#'
#' @description
#' Reads a text file in its entirety, re-encodes it, and splits it into text lines.
#'
#' @details
#' This aims to be a substitute for the \code{\link{readLines}} function,
#' with the ability to re-encode the input file in a much more robust way,
#' and split the text into lines with \code{\link{stri_split_lines1}}
#' (which conforms with the Unicode guidelines for newline markers).
#'
#' The function calls \code{\link{stri_read_raw}},
#' \code{\link{stri_encode}}, and \code{\link{stri_split_lines1}},
#' in this order.
#'
#' Because of the way this function is currently implemented,
#' maximal file size cannot exceed ~0.67 GB.
#'
#' @param con name of the input file or a connection object
#'        (opened in the binary mode)
#' @param encoding single string; input encoding;
#' \code{NULL} or \code{''} for the current default encoding.
#' @param fname [DEPRECATED] alias of \code{con}
#'
#' @return
#' Returns a character vector, each text line is a separate string.
#' The output is always marked as UTF-8.
#'
#' @family files
#' @export
stri_read_lines <- function(con, encoding = NULL, fname = con)
{
    # DEPRECATED: `fname` is honoured only when `con` itself was not supplied
    if (missing(con) && !missing(fname)) {
        warning("The 'fname' argument in stri_read_lines is a deprecated alias of 'con' and will be removed in a future release of 'stringi'.")
        con <- fname
    }

    stopifnot(is.null(encoding) || is.character(encoding))

    # this needs to be done manually, see ?stri_encode
    use_default <- is.null(encoding) || encoding == ""
    if (use_default)
        encoding <- stri_enc_get()

    if (encoding == "auto")
        stop("encoding `auto` is no longer supported")  # TODO: remove in the future

    # read everything, convert to UTF-8, split into lines -- in this order
    bytes <- stri_read_raw(con)
    text <- stri_encode(bytes, encoding, "UTF-8")
    stri_split_lines1(text)
}


#' @title
#' Write Text Lines to a Text File
#'
#' @description
#' Writes a text file in such a way that each element of a given
#' character vector becomes a separate text line.
#'
#'
#' @details
#' It is a substitute for the \R \code{\link{writeLines}} function,
#' with the ability to easily re-encode the output.
#'
#' We suggest using the UTF-8 encoding for all text files:
#' thus, it is the default one for the output.
#'
#' @param str character vector with data to write
#' @param con name of the output file or a connection object
#'        (opened in the binary mode)
#' @param encoding output encoding, \code{NULL} or \code{''} for
#'        the current default one
#' @param sep newline separator
#' @param fname [DEPRECATED] alias of \code{con}
#'
#' @return
#' This function returns nothing noteworthy.
#'
#' @family files
#' @export
stri_write_lines <- function(str, con, encoding = "UTF-8", sep = ifelse(.Platform$OS.type == "windows", "\r\n", "\n"), fname = con)
{
    # DEPRECATED: `fname` is honoured only when `con` itself was not supplied
    if (missing(con) && !missing(fname)) {
        warning("The 'fname' argument in stri_write_lines is a deprecated alias of 'con' and will be removed in a future release of 'stringi'.")
        con <- fname
    }

    stopifnot(is.character(sep), length(sep) == 1)

    # every element gets `sep` appended; all of them are glued into one string
    joined <- stri_join(str, sep, collapse = "")

    # convert to the requested encoding; the result is written as raw bytes
    bytes <- stri_encode(joined, "", encoding, to_raw = TRUE)[[1]]
    writeBin(bytes, con, useBytes = TRUE)

    invisible(NULL)
}
stringi/R/encoding_management.R0000644000176200001440000002154614750110641016246 0ustar liggesusers# kate: default-dictionary en_US

## This file is part of the 'stringi' package for R.
## Copyright (c) 2013-2025, Marek Gagolewski
## All rights reserved.
##
## Redistribution and use in source and binary forms, with or without
## modification, are permitted provided that the following conditions are met:
##
## 1. Redistributions of source code must retain the above copyright notice,
## this list of conditions and the following disclaimer.
##
## 2. Redistributions in binary form must reproduce the above copyright notice,
## this list of conditions and the following disclaimer in the documentation
## and/or other materials provided with the distribution.
##
## 3. Neither the name of the copyright holder nor the names of its
## contributors may be used to endorse or promote products derived from
## this software without specific prior written permission.
##
## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
## 'AS IS' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING,
## BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
## FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT ## HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, ## SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ## PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; ## OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, ## WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE ## OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, ## EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #' @title #' List Known Character Encodings #' #' @description #' Gives the list of encodings that are supported by \pkg{ICU}. #' #' @details #' Apart from given encoding identifiers and their aliases, #' some other specifiers might additionally be available. #' This is due to the fact that \pkg{ICU} tries to normalize #' converter names. For instance, \code{'UTF8'} is also valid, #' see \link{stringi-encoding} for more information. #' #' @param simplify single logical value; return a character vector or a #' list of character vectors? #' #' @return If \code{simplify} is \code{FALSE}, a list of #' character vectors is returned. Each list element represents a unique #' character encoding. The \code{name} attribute gives the \pkg{ICU} Canonical #' Name of an encoding family. The elements (character vectors) are #' its aliases. #' #' If \code{simplify} is \code{TRUE} (the default), then the resulting list #' is coerced to a character vector and sorted, and returned with #' removed duplicated entries. 
#'
#' @examples
#' stri_enc_list()
#' stri_enc_list(FALSE)
#'
#' @family encoding_management
#' @export
stri_enc_list <- function(simplify=TRUE)
{
    # hand-written equivalent of isTRUE(simplify)
    as_vector <- (is.logical(simplify) && length(simplify) == 1L && !is.na(simplify) && simplify)

    encodings <- .Call(C_stri_enc_list)

    if (!as_vector) {
        # list output: order the elements by their names,
        # then sort the aliases inside each element
        idx <- stri_order(names(encodings), locale="en_US", numeric=TRUE, strength=1)
        return(lapply(encodings[idx], stri_sort, locale="en_US", numeric=TRUE, strength=1))
    }

    # vector output: flatten, drop duplicates, sort
    flat <- unique(unlist(encodings))
    stri_sort(flat, locale="en_US", numeric=TRUE, strength=1)
}


#' @title
#' Query a Character Encoding
#'
#' @description
#' Gets basic information on a character encoding.
#'
#' @details
#' An error is raised if the provided encoding is unknown to \pkg{ICU}
#' (see \code{\link{stri_enc_list}} for more details).
#'
#'
#' @param enc \code{NULL} or \code{''} for the default encoding,
#'    or a single string with encoding name
#'
#' @return
#' Returns a list with the following components:
#' \itemize{
#' \item \code{Name.friendly} -- friendly encoding name:
#' MIME Name or JAVA Name or \pkg{ICU} Canonical Name
#' (the first of provided ones is selected, see below);
#' \item \code{Name.ICU} -- encoding name as identified by \pkg{ICU};
#' \item \code{Name.*} -- other standardized encoding names,
#' e.g., \code{Name.UTR22}, \code{Name.IBM}, \code{Name.WINDOWS},
#' \code{Name.JAVA}, \code{Name.IANA}, \code{Name.MIME} (some of them
#' may be unavailable for all the encodings);
#' \item \code{ASCII.subset} -- is ASCII a subset of the given encoding?;
#' \item \code{Unicode.1to1} -- for 8-bit encodings only: are all characters
#' translated to exactly one Unicode code point and is the translation
#' scheme reversible?;
#' \item \code{CharSize.8bit} -- is this an 8-bit encoding, i.e., do we have
#' \code{CharSize.min == CharSize.max} and \code{CharSize.min == 1}?;
#' \item \code{CharSize.min} -- minimal number of bytes used
#' to represent a UChar (in UTF-16, this is not the same as UChar32)
#' \item \code{CharSize.max}
#' -- maximal number of bytes used
#' to represent a UChar (in UTF-16, this is not the same as UChar32,
#' i.e., does not reflect the maximal code point representation size)
#' }
#'
#' @family encoding_management
#' @export
stri_enc_info <- function(enc = NULL)
{
    # `enc` is handed over as-is to the compiled routine,
    # whose result is returned unchanged
    .Call(C_stri_enc_info, enc)
}


#' @title
#' Set or Get Default Character Encoding in \pkg{stringi}
#'
#' @description
#' \code{stri_enc_set} sets the encoding used to re-encode strings
#' internally (i.e., by \R) declared to be in native encoding,
#' see \link{stringi-encoding} and \code{\link{stri_enc_mark}}.
#' \code{stri_enc_get} returns the currently used default encoding.
#'
#' @details
#' \code{stri_enc_get} is the same as
#' \code{\link{stri_enc_info}(NULL)$Name.friendly}.
#'
#' Note that changing the default encoding may have undesired consequences.
#' Unless you are an expert user and you know what you are doing,
#' \code{stri_enc_set} should only be used if \pkg{ICU} fails to detect
#' your system's encoding correctly (while testing \pkg{stringi}
#' we only encountered such a situation on a very old Solaris machine).
#' Note that \pkg{ICU} tries to match the encoding part of the \code{LC_CTYPE}
#' category as given by \code{\link{Sys.getlocale}}.
#'
#' If you set a default encoding that is neither a superset of ASCII,
#' nor an 8-bit encoding, a warning will be generated,
#' see \link{stringi-encoding} for discussion.
#'
#' \code{stri_enc_set} has no effect if the system ICU assumes that
#' the default charset is always UTF-8 (i.e., where the internal
#' \code{U_CHARSET_IS_UTF8} is defined and set to 1), see
#' \code{\link{stri_info}}.
#'
#' @param enc single string; character encoding name,
#' see \code{\link{stri_enc_list}} for the list of supported encodings.
#'
#' @return
#' \code{stri_enc_set} returns a string with
#' previously used character encoding, invisibly.
#'
#' \code{stri_enc_get} returns a string with current default character
#' encoding.
#'
#' @family encoding_management
#' @rdname stri_enc_set
#' @export
stri_enc_set <- function(enc)
{
    # remember the current default so that it can be handed back to the caller
    old_encoding <- stri_enc_get()

    .Call(C_stri_enc_set, enc)

    # stri_info is called on purpose: it generates some warnings
    # in case any problems with the new setting are found
    summary_txt <- stri_paste("New settings: ", stri_info(short = TRUE))
    message(summary_txt)

    invisible(old_encoding)
}


#' @rdname stri_enc_set
#' @export
stri_enc_get <- function()
{
    # the default encoding is queried by passing NULL
    info <- stri_enc_info(NULL)
    info$Name.friendly
}


#' @title
#' Get Declared Encodings of Each String
#'
#' @description
#' Reads declared encodings for each string in a character vector
#' as seen by \pkg{stringi}.
#'
#' @details
#' According to \code{\link{Encoding}},
#' \R has a simple encoding marking mechanism:
#' strings can be declared to be in \code{latin1},
#' \code{UTF-8} or \code{bytes}.
#'
#' Moreover, we may check (via the R/C API) whether
#' a string is in ASCII (\R assumes that this holds if and only if
#' all bytes in a string are not greater than 127,
#' so there is an implicit assumption that your platform uses
#' an encoding that extends ASCII)
#' or in the system's default (a.k.a. \code{unknown} in \code{\link{Encoding}})
#' encoding.
#'
#' Intuitively, the default encoding should be equivalent to
#' the one you use on \code{stdin} (e.g., your 'keyboard').
#' In \pkg{stringi} we assume that such an encoding
#' is equivalent to the one returned by \code{\link{stri_enc_get}}.
#' It is automatically detected by \pkg{ICU}
#' to match -- by default -- the encoding part of the \code{LC_CTYPE} category
#' as given by \code{\link{Sys.getlocale}}.
#'
#'
#'
#' @param str character vector
#' or an object coercible to a character vector
#'
#' @return Returns a character vector of the same length as \code{str}.
#' Unlike in the \code{\link{Encoding}} function, here the possible encodings are:
#' \code{ASCII}, \code{latin1}, \code{bytes}, \code{native},
#' and \code{UTF-8}. Additionally, missing values are handled properly.
#'
#' This gives exactly the same data that is used by
#' all the functions in \pkg{stringi} to re-encode their inputs.
#'
#' @family encoding_management
#' @export
stri_enc_mark <- function(str)
{
    # `str` is handed over as-is to the compiled routine,
    # whose result is returned unchanged
    .Call(C_stri_enc_mark, str)
}
stringi/R/search_replace_4.R0000644000176200001440000003363014750110641015444 0ustar liggesusers# kate: default-dictionary en_US

## This file is part of the 'stringi' package for R.
## Copyright (c) 2013-2025, Marek Gagolewski
## All rights reserved.
##
## Redistribution and use in source and binary forms, with or without
## modification, are permitted provided that the following conditions are met:
##
## 1. Redistributions of source code must retain the above copyright notice,
## this list of conditions and the following disclaimer.
##
## 2. Redistributions in binary form must reproduce the above copyright notice,
## this list of conditions and the following disclaimer in the documentation
## and/or other materials provided with the distribution.
##
## 3. Neither the name of the copyright holder nor the names of its
## contributors may be used to endorse or promote products derived from
## this software without specific prior written permission.
##
## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
## 'AS IS' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING,
## BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
## FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
## HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
## SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
## PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
## OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
## WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
## OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
## EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#' @title #' Replace Pattern Occurrences #' #' @description #' These functions replace, with the given replacement string, every/first/last #' substring of the input that matches the specified \code{pattern}. #' #' @details #' By default, all the functions are vectorized over #' \code{str}, \code{pattern}, \code{replacement} (with recycling #' of the elements in the shorter vector if necessary). #' Input that is not part of any match is left unchanged; #' each match is replaced in the result by the replacement string. #' #' However, for \code{stri_replace_all*}, if \code{vectorize_all} is \code{FALSE}, #' then each substring matching any of the supplied \code{pattern}s #' is replaced by a corresponding \code{replacement} string. #' In such a case, the vectorization is over \code{str}, #' and - independently - over \code{pattern} and \code{replacement}. #' In other words, this is equivalent to something like #' \code{for (i in 1:npatterns) str <- stri_replace_all(str, pattern[i], replacement[i])}. #' Note that you must set \code{length(pattern) >= length(replacement)}. #' #' In case of \code{stri_replace_*_regex}, #' the replacement string may contain references to capture groups #' (in round parentheses). #' References are of the form \code{$n}, where \code{n} is the number #' of the capture group (\code{$1} denotes the first group). #' For the literal \code{$}, #' escape it with a backslash. #' Moreover, \code{${name}} are used for named capture groups. #' #' Note that \code{stri_replace_last_regex} searches from start to end, #' but skips overlapping matches, see the example below. #' #' \code{stri_replace}, \code{stri_replace_all}, \code{stri_replace_first}, #' and \code{stri_replace_last} are convenience functions; they just call #' \code{stri_replace_*_*} variants, depending on the arguments used. #' #' If you wish to remove white-spaces from the start or end #' of a string, see \code{\link{stri_trim}}.
#'
#' @param str character vector; strings to search in
#' @param pattern,regex,fixed,coll,charclass character vector;
#' search patterns; for more details refer to \link{stringi-search}
#' @param replacement character vector with replacements for matched patterns
#' @param opts_collator,opts_fixed,opts_regex a named list used to tune up
#' the search engine's settings; see
#' \code{\link{stri_opts_collator}}, \code{\link{stri_opts_fixed}},
#' and \code{\link{stri_opts_regex}}, respectively; \code{NULL}
#' for the defaults
#' @param merge single logical value;
#' should consecutive matches be merged into one string;
#' \code{stri_replace_all_charclass} only
#' @param vectorize_all single logical value;
#' should each occurrence of a pattern in every string
#' be replaced by a corresponding replacement string?;
#' \code{stri_replace_all_*} only
#' @param vectorise_all alias of \code{vectorize_all}
#' @param mode single string;
#' one of: \code{'first'} (the default), \code{'all'}, \code{'last'}
#' @param ... supplementary arguments passed to the underlying functions,
#' including additional settings for \code{opts_collator}, \code{opts_regex},
#' \code{opts_fixed}, and so on
#'
#' @return All the functions return a character vector.
#'
#' @examples
#' stri_replace_all_charclass('aaaa', '[a]', 'b', merge=c(TRUE, FALSE))
#'
#' stri_replace_all_charclass('a\nb\tc d', '\\p{WHITE_SPACE}', ' ')
#' stri_replace_all_charclass('a\nb\tc d', '\\p{WHITE_SPACE}', ' ', merge=TRUE)
#'
#' s <- 'Lorem ipsum dolor sit amet, consectetur adipisicing elit.'
#' stri_replace_all_fixed(s, ' ', '#')
#' stri_replace_all_fixed(s, 'o', '0')
#'
#' stri_replace_all_fixed(c('1', 'NULL', '3'), 'NULL', NA)
#'
#' stri_replace_all_regex(s, ' .*? ', '#')
#' stri_replace_all_regex(s, '(el|s)it', '1234')
#' stri_replace_all_regex('abaca', 'a', c('!', '*'))
#' stri_replace_all_regex('123|456|789', '(\\p{N}).(\\p{N})', '$2-$1')
#' stri_replace_all_regex(c('stringi R', 'REXAMINE', '123'), '( R|R.)', ' r ')
#'
#' # named capture groups are available since ICU 55
#' \dontrun{
#' stri_replace_all_regex('words 123 and numbers 456',
#'    '(?<numbers>[0-9]+)', '!${numbers}!')
#' }
#'
#' # Compare the results:
#' stri_replace_all_fixed('The quick brown fox jumped over the lazy dog.',
#'      c('quick', 'brown', 'fox'), c('slow',  'black', 'bear'), vectorize_all=TRUE)
#' stri_replace_all_fixed('The quick brown fox jumped over the lazy dog.',
#'      c('quick', 'brown', 'fox'), c('slow',  'black', 'bear'), vectorize_all=FALSE)
#'
#' # Compare the results:
#' stri_replace_all_fixed('The quicker brown fox jumped over the lazy dog.',
#'      c('quick', 'brown', 'fox'), c('slow',  'black', 'bear'), vectorize_all=FALSE)
#' stri_replace_all_regex('The quicker brown fox jumped over the lazy dog.',
#'      '\\b'%s+%c('quick', 'brown', 'fox')%s+%'\\b', c('slow',  'black', 'bear'), vectorize_all=FALSE)
#'
#' # Searching for the last occurrence:
#' # Note the difference - regex searches left to right, with no overlaps.
#' stri_replace_last_fixed("agAGA", "aga", "*", case_insensitive=TRUE)
#' stri_replace_last_regex("agAGA", "aga", "*", case_insensitive=TRUE)
#'
#' @family search_replace
#' @export
#' @rdname stri_replace
stri_replace_all <- function(str, replacement, ..., regex, fixed, coll, charclass)
{
    # exactly one of the four pattern arguments has to be supplied
    supplied <- c(
        regex     = !missing(regex),
        fixed     = !missing(fixed),
        coll      = !missing(coll),
        charclass = !missing(charclass)
    )

    if (sum(supplied) != 1)
        stop("you have to specify either `regex`, `fixed`, `coll`, or `charclass`")

    # dispatch on the name of the only argument that was given
    switch(names(which(supplied)),
        regex     = stri_replace_all_regex(str, regex, replacement, ...),
        fixed     = stri_replace_all_fixed(str, fixed, replacement, ...),
        coll      = stri_replace_all_coll(str, coll, replacement, ...),
        charclass = stri_replace_all_charclass(str, charclass, replacement, ...)
    )
}


#' @export
#' @rdname stri_replace
stri_replace_first <- function(str, replacement, ..., regex, fixed, coll, charclass)
{
    # exactly one of the four pattern arguments has to be supplied
    supplied <- c(
        regex     = !missing(regex),
        fixed     = !missing(fixed),
        coll      = !missing(coll),
        charclass = !missing(charclass)
    )

    if (sum(supplied) != 1)
        stop("you have to specify either `regex`, `fixed`, `coll`, or `charclass`")

    # dispatch on the name of the only argument that was given
    switch(names(which(supplied)),
        regex     = stri_replace_first_regex(str, regex, replacement, ...),
        fixed     = stri_replace_first_fixed(str, fixed, replacement, ...),
        coll      = stri_replace_first_coll(str, coll, replacement, ...),
        charclass = stri_replace_first_charclass(str, charclass, replacement, ...)
    )
}


#' @export
#' @rdname stri_replace
stri_replace_last <- function(str, replacement, ..., regex, fixed, coll, charclass)
{
    # exactly one of the four pattern arguments has to be supplied
    supplied <- c(
        regex     = !missing(regex),
        fixed     = !missing(fixed),
        coll      = !missing(coll),
        charclass = !missing(charclass)
    )

    if (sum(supplied) != 1)
        stop("you have to specify either `regex`, `fixed`, `coll`, or `charclass`")

    # dispatch on the name of the only argument that was given
    switch(names(which(supplied)),
        regex     = stri_replace_last_regex(str, regex, replacement, ...),
        fixed     = stri_replace_last_fixed(str, fixed, replacement, ...),
        coll      = stri_replace_last_coll(str, coll, replacement, ...),
        charclass = stri_replace_last_charclass(str, charclass, replacement, ...)
    )
}


#' @export
#' @rdname stri_replace
stri_replace <- function(str, replacement, ..., regex, fixed, coll, charclass,
    mode = c("first", "all", "last"))
{
    # `first` is default for compatibility with stringr
    mode <- match.arg(mode)  # this is slow

    # the pattern arguments are forwarded as they are; the stri_replace_*
    # function selected below verifies which one of them was supplied
    if (mode == "first") {
        stri_replace_first(str, replacement, ...,
            regex = regex, fixed = fixed, coll = coll, charclass = charclass)
    } else if (mode == "last") {
        stri_replace_last(str, replacement, ...,
            regex = regex, fixed = fixed, coll = coll, charclass = charclass)
    } else {
        stri_replace_all(str, replacement, ...,
            regex = regex, fixed = fixed, coll = coll, charclass = charclass)
    }
}


#' @export
#' @rdname stri_replace
stri_replace_all_charclass <- function(str, pattern, replacement, merge = FALSE,
    vectorize_all = TRUE, vectorise_all = vectorize_all)
{
    # an explicitly given `vectorise_all` overrides `vectorize_all`
    if (!missing(vectorise_all)) {
        vectorize_all <- vectorise_all
    }

    .Call(C_stri_replace_all_charclass, str, pattern, replacement, merge, vectorize_all)
}


#' @export
#' @rdname stri_replace
stri_replace_first_charclass <- function(str, pattern, replacement)
{
    .Call(C_stri_replace_first_charclass, str, pattern, replacement)
}


#' @export
#' @rdname stri_replace
stri_replace_last_charclass <- function(str, pattern, replacement)
{
    .Call(C_stri_replace_last_charclass, str, pattern, replacement)
}


#' @export
#' @rdname stri_replace
stri_replace_all_coll <- function(str, pattern, replacement,
    vectorize_all = TRUE, vectorise_all = vectorize_all, ..., opts_collator = NULL)
{
    # an explicitly given `vectorise_all` overrides `vectorize_all`
    if (!missing(vectorise_all)) {
        vectorize_all <- vectorise_all
    }

    # settings given via `...` are combined with `opts_collator`
    # and handed over to stri_opts_collator()
    if (!missing(...)) {
        settings <- as.list(c(opts_collator, ...))
        opts_collator <- do.call(stri_opts_collator, settings)
    }

    .Call(C_stri_replace_all_coll, str, pattern, replacement, vectorize_all, opts_collator)
}


#' @export
#' @rdname stri_replace
stri_replace_first_coll <- function(str, pattern, replacement, ..., opts_collator = NULL)
{
    # settings given via `...` are combined with `opts_collator`
    # and handed over to stri_opts_collator()
    if (!missing(...)) {
        settings <- as.list(c(opts_collator, ...))
        opts_collator <- do.call(stri_opts_collator, settings)
    }

    .Call(C_stri_replace_first_coll, str, pattern, replacement, opts_collator)
}


#' @export
#' @rdname stri_replace
stri_replace_last_coll <- function(str, pattern, replacement, ..., opts_collator = NULL)
{
    # settings given via `...` are combined with `opts_collator`
    # and handed over to stri_opts_collator()
    if (!missing(...)) {
        settings <- as.list(c(opts_collator, ...))
        opts_collator <- do.call(stri_opts_collator, settings)
    }

    .Call(C_stri_replace_last_coll, str, pattern, replacement, opts_collator)
}


#' @export
#' @rdname stri_replace
stri_replace_all_fixed <- function(str, pattern, replacement,
    vectorize_all = TRUE, vectorise_all = vectorize_all, ..., opts_fixed = NULL)
{
    # an explicitly given `vectorise_all` overrides `vectorize_all`
    if (!missing(vectorise_all)) {
        vectorize_all <- vectorise_all
    }

    # settings given via `...` are combined with `opts_fixed`
    # and handed over to stri_opts_fixed()
    if (!missing(...)) {
        settings <- as.list(c(opts_fixed, ...))
        opts_fixed <- do.call(stri_opts_fixed, settings)
    }

    .Call(C_stri_replace_all_fixed, str, pattern, replacement, vectorize_all, opts_fixed)
}


#' @export
#' @rdname stri_replace
stri_replace_first_fixed <- function(str, pattern, replacement, ..., opts_fixed = NULL)
{
    # settings given via `...` are combined with `opts_fixed`
    # and handed over to stri_opts_fixed()
    if (!missing(...)) {
        settings <- as.list(c(opts_fixed, ...))
        opts_fixed <- do.call(stri_opts_fixed, settings)
    }

    .Call(C_stri_replace_first_fixed, str, pattern, replacement, opts_fixed)
}


#' @export
#' @rdname stri_replace
stri_replace_last_fixed <- function(str, pattern, replacement, ..., opts_fixed = NULL)
{
    # settings given via `...` are combined with `opts_fixed`
    # and handed over to stri_opts_fixed()
    if (!missing(...)) {
        settings <- as.list(c(opts_fixed, ...))
        opts_fixed <- do.call(stri_opts_fixed, settings)
    }

    .Call(C_stri_replace_last_fixed, str, pattern, replacement, opts_fixed)
}


#' @export
#' @rdname stri_replace
stri_replace_all_regex <- function(str, pattern, replacement,
    vectorize_all = TRUE, vectorise_all = vectorize_all, ..., opts_regex = NULL)
{
    # an explicitly given `vectorise_all` overrides `vectorize_all`
    if (!missing(vectorise_all)) {
        vectorize_all <- vectorise_all
    }

    # settings given via `...` are combined with `opts_regex`
    # and handed over to stri_opts_regex()
    if (!missing(...)) {
        settings <- as.list(c(opts_regex, ...))
        opts_regex <- do.call(stri_opts_regex, settings)
    }

    .Call(C_stri_replace_all_regex, str, pattern, replacement, vectorize_all, opts_regex)
}


#' @export
#' @rdname stri_replace
stri_replace_first_regex <- function(str, pattern, replacement, ..., opts_regex = NULL)
{
    # settings given via `...` are combined with `opts_regex`
    # and handed over to stri_opts_regex()
    if (!missing(...)) {
        settings <- as.list(c(opts_regex, ...))
        opts_regex <- do.call(stri_opts_regex, settings)
    }

    .Call(C_stri_replace_first_regex, str, pattern, replacement, opts_regex)
}


#' @export
#' @rdname stri_replace
stri_replace_last_regex <- function(str, pattern, replacement, ..., opts_regex = NULL)
{
    # settings given via `...` are combined with `opts_regex`
    # and handed over to stri_opts_regex()
    if (!missing(...)) {
        settings <- as.list(c(opts_regex, ...))
        opts_regex <- do.call(stri_opts_regex, settings)
    }

    .Call(C_stri_replace_last_regex, str, pattern, replacement, opts_regex)
}


#' Convert gsub-Style Replacement Strings
#'
#' @description
#' Converts \code{\link[base]{gsub}}-style replacement strings
#' to those which can be used in \code{\link{stri_replace}}.
#' In particular, \code{$} becomes \code{\\$} and \code{\\1} becomes \code{$1}.
#'
#' @param x character vector
#'
#' @return Returns a character vector.
#'
#' @family search_replace
#' @export
stri_replace_rstr <- function(x)
{
    .Call(C_stri_replace_rstr, x)
}
stringi/R/search_extract_4.R0000644000176200001440000003133514750110641015503 0ustar liggesusers# kate: default-dictionary en_US ## This file is part of the 'stringi' package for R. ## Copyright (c) 2013-2025, Marek Gagolewski ## All rights reserved. ## ## Redistribution and use in source and binary forms, with or without ## modification, are permitted provided that the following conditions are met: ## ## 1. Redistributions of source code must retain the above copyright notice, ## this list of conditions and the following disclaimer. ## ## 2. Redistributions in binary form must reproduce the above copyright notice, ## this list of conditions and the following disclaimer in the documentation ## and/or other materials provided with the distribution. ## ## 3. Neither the name of the copyright holder nor the names of its ## contributors may be used to endorse or promote products derived from ## this software without specific prior written permission. ## ## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ## 'AS IS' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, ## BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS ## FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT ## HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, ## SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ## PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; ## OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, ## WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE ## OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, ## EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #' @title #' Extract Pattern Occurrences #' #' @description #' These functions extract all substrings matching a given pattern. #' #' \code{stri_extract_all_*} extracts all the matches. #' \code{stri_extract_first_*} and \code{stri_extract_last_*} #' yield the first or the last matches, respectively. #' #' @details #' Vectorized over \code{str} and \code{pattern} (with recycling #' of the elements in the shorter vector if necessary). This allows to, #' for instance, search for one pattern in each given string, #' search for each pattern in one given string, #' and search for the i-th pattern within the i-th string. #' #' Check out \code{\link{stri_match}} for the extraction of matches #' to individual regex capture groups. #' #' \code{stri_extract}, \code{stri_extract_all}, \code{stri_extract_first}, #' and \code{stri_extract_last} are convenience functions. #' They merely call \code{stri_extract_*_*}, depending on the arguments used. 
#' #' @param str character vector; strings to search in #' @param pattern,regex,fixed,coll,charclass character vector; #' search patterns; for more details refer to \link{stringi-search} #' @param opts_collator,opts_fixed,opts_regex a named list to tune up #' the search engine's settings; see \code{\link{stri_opts_collator}}, #' \code{\link{stri_opts_fixed}}, and \code{\link{stri_opts_regex}}, #' respectively; \code{NULL} for the defaults #' @param merge single logical value; indicates whether consecutive pattern #' matches will be merged into one string; #' \code{stri_extract_all_charclass} only #' @param simplify single logical value; #' if \code{TRUE} or \code{NA}, then a character matrix is returned; #' otherwise (the default), a list of character vectors is given, see Value; #' \code{stri_extract_all_*} only #' @param omit_no_match single logical value; if \code{FALSE}, #' then a missing value will indicate that there was no match; #' \code{stri_extract_all_*} only #' @param mode single string; #' one of: \code{'first'} (the default), \code{'all'}, \code{'last'} #' @param ... supplementary arguments passed to the underlying functions, #' including additional settings for \code{opts_collator}, \code{opts_regex}, #' and so on #' #' @return #' For \code{stri_extract_all*}, if \code{simplify=FALSE} (the default), then #' a list of character vectors is returned. Each list element #' represents the results of a different search scenario. #' If a pattern is not found and \code{omit_no_match=FALSE}, #' then a character vector of length 1 #' with single \code{NA} value will be generated. #' #' Otherwise, i.e., if \code{simplify} is not \code{FALSE}, #' then \code{\link{stri_list2matrix}} with \code{byrow=TRUE} argument #' is called on the resulting object. #' In such a case, the function yields a character matrix with an appropriate #' number of rows (according to the length of \code{str}, \code{pattern}, etc.). 
#' Note that \code{\link{stri_list2matrix}}'s \code{fill} argument is set
#' either to an empty string or \code{NA}, depending on
#' whether \code{simplify} is \code{TRUE} or \code{NA}, respectively.
#'
#' \code{stri_extract_first*} and \code{stri_extract_last*}
#' return a character vector. A \code{NA} element indicates a no-match.
#'
#' Note that \code{stri_extract_last_regex} searches from start to end,
#' but skips overlapping matches, see the example below.
#'
#' @examples
#' stri_extract_all('XaaaaX', regex=c('\\p{Ll}', '\\p{Ll}+', '\\p{Ll}{2,3}', '\\p{Ll}{2,3}?'))
#' stri_extract_all('Bartolini', coll='i')
#' stri_extract_all('stringi is so good!', charclass='\\p{Zs}') # all white-spaces
#'
#' stri_extract_all_charclass(c('AbcdeFgHijK', 'abc', 'ABC'), '\\p{Ll}')
#' stri_extract_all_charclass(c('AbcdeFgHijK', 'abc', 'ABC'), '\\p{Ll}', merge=FALSE)
#' stri_extract_first_charclass('AaBbCc', '\\p{Ll}')
#' stri_extract_last_charclass('AaBbCc', '\\p{Ll}')
#'
#' \dontrun{
#' # emoji support available since ICU 57
#' stri_extract_all_charclass(stri_enc_fromutf32(32:55200), '\\p{EMOJI}')
#' }
#'
#' stri_extract_all_coll(c('AaaaaaaA', 'AAAA'), 'a')
#' stri_extract_first_coll(c('Yy\u00FD', 'AAA'), 'y', strength=2, locale='sk_SK')
#' stri_extract_last_coll(c('Yy\u00FD', 'AAA'), 'y', strength=1, locale='sk_SK')
#'
#' stri_extract_all_regex('XaaaaX', c('\\p{Ll}', '\\p{Ll}+', '\\p{Ll}{2,3}', '\\p{Ll}{2,3}?'))
#' stri_extract_first_regex('XaaaaX', c('\\p{Ll}', '\\p{Ll}+', '\\p{Ll}{2,3}', '\\p{Ll}{2,3}?'))
#' stri_extract_last_regex('XaaaaX', c('\\p{Ll}', '\\p{Ll}+', '\\p{Ll}{2,3}', '\\p{Ll}{2,3}?'))
#'
#' stri_list2matrix(stri_extract_all_regex('XaaaaX', c('\\p{Ll}', '\\p{Ll}+')))
#' stri_extract_all_regex('XaaaaX', c('\\p{Ll}', '\\p{Ll}+'), simplify=TRUE)
#' stri_extract_all_regex('XaaaaX', c('\\p{Ll}', '\\p{Ll}+'), simplify=NA)
#'
#' stri_extract_all_fixed('abaBAba', 'Aba', case_insensitive=TRUE)
#' stri_extract_all_fixed('abaBAba', 'Aba', case_insensitive=TRUE, overlap=TRUE)
#'
#' # Searching for the last occurrence:
#' # Note the difference - regex searches left to right, with no overlaps.
#' stri_extract_last_fixed("agAGA", "aga", case_insensitive=TRUE)
#' stri_extract_last_regex("agAGA", "aga", case_insensitive=TRUE)
#'
#' @family search_extract
#'
#' @export
#' @rdname stri_extract
stri_extract_all <- function(str, ..., regex, fixed, coll, charclass)
{
    # exactly one of the four pattern arguments has to be supplied
    supplied <- c(
        regex     = !missing(regex),
        fixed     = !missing(fixed),
        coll      = !missing(coll),
        charclass = !missing(charclass)
    )

    if (sum(supplied) != 1)
        stop("you have to specify either `regex`, `fixed`, `coll`, or `charclass`")

    # dispatch on the name of the only argument that was given
    switch(names(which(supplied)),
        regex     = stri_extract_all_regex(str, regex, ...),
        fixed     = stri_extract_all_fixed(str, fixed, ...),
        coll      = stri_extract_all_coll(str, coll, ...),
        charclass = stri_extract_all_charclass(str, charclass, ...)
    )
}


#' @export
#' @rdname stri_extract
stri_extract_first <- function(str, ..., regex, fixed, coll, charclass)
{
    # exactly one of the four pattern arguments has to be supplied
    supplied <- c(
        regex     = !missing(regex),
        fixed     = !missing(fixed),
        coll      = !missing(coll),
        charclass = !missing(charclass)
    )

    if (sum(supplied) != 1)
        stop("you have to specify either `regex`, `fixed`, `coll`, or `charclass`")

    # dispatch on the name of the only argument that was given
    switch(names(which(supplied)),
        regex     = stri_extract_first_regex(str, regex, ...),
        fixed     = stri_extract_first_fixed(str, fixed, ...),
        coll      = stri_extract_first_coll(str, coll, ...),
        charclass = stri_extract_first_charclass(str, charclass, ...)
    )
}


#' @export
#' @rdname stri_extract
stri_extract_last <- function(str, ..., regex, fixed, coll, charclass)
{
    # exactly one of the four pattern arguments has to be supplied
    supplied <- c(
        regex     = !missing(regex),
        fixed     = !missing(fixed),
        coll      = !missing(coll),
        charclass = !missing(charclass)
    )

    if (sum(supplied) != 1)
        stop("you have to specify either `regex`, `fixed`, `coll`, or `charclass`")

    # dispatch on the name of the only argument that was given
    switch(names(which(supplied)),
        regex     = stri_extract_last_regex(str, regex, ...),
        fixed     = stri_extract_last_fixed(str, fixed, ...),
        coll      = stri_extract_last_coll(str, coll, ...),
        charclass = stri_extract_last_charclass(str, charclass, ...)
    )
}


#' @export
#' @rdname stri_extract
stri_extract <- function(str, ..., regex, fixed, coll, charclass,
    mode = c("first", "all", "last"))
{
    # `first` is default for compatibility with stringr
    mode <- match.arg(mode)  # this is slow

    # the pattern arguments are forwarded as they are; the stri_extract_*
    # function selected below verifies which one of them was supplied
    if (mode == "first") {
        stri_extract_first(str, ...,
            regex = regex, fixed = fixed, coll = coll, charclass = charclass)
    } else if (mode == "last") {
        stri_extract_last(str, ...,
            regex = regex, fixed = fixed, coll = coll, charclass = charclass)
    } else {
        stri_extract_all(str, ...,
            regex = regex, fixed = fixed, coll = coll, charclass = charclass)
    }
}


#' @export
#' @rdname stri_extract
stri_extract_all_charclass <- function(str, pattern, merge = TRUE,
    simplify = FALSE, omit_no_match = FALSE)
{
    .Call(C_stri_extract_all_charclass, str, pattern, merge, simplify, omit_no_match)
}


#' @export
#' @rdname stri_extract
stri_extract_first_charclass <- function(str, pattern)
{
    .Call(C_stri_extract_first_charclass, str, pattern)
}


#' @export
#' @rdname stri_extract
stri_extract_last_charclass <- function(str, pattern)
{
    .Call(C_stri_extract_last_charclass, str, pattern)
}


#' @export
#' @rdname stri_extract
stri_extract_all_coll <- function(str, pattern, simplify = FALSE,
    omit_no_match = FALSE, ..., opts_collator = NULL)
{
    # settings given via `...` are combined with `opts_collator`
    # and handed over to stri_opts_collator()
    if (!missing(...)) {
        settings <- as.list(c(opts_collator, ...))
        opts_collator <- do.call(stri_opts_collator, settings)
    }

    .Call(C_stri_extract_all_coll, str, pattern, simplify, omit_no_match, opts_collator)
}


#' @export
#' @rdname stri_extract
stri_extract_first_coll <- function(str, pattern, ..., opts_collator = NULL)
{
    # settings given via `...` are combined with `opts_collator`
    # and handed over to stri_opts_collator()
    if (!missing(...)) {
        settings <- as.list(c(opts_collator, ...))
        opts_collator <- do.call(stri_opts_collator, settings)
    }

    .Call(C_stri_extract_first_coll, str, pattern, opts_collator)
}


#' @export
#' @rdname stri_extract
stri_extract_last_coll <- function(str, pattern, ..., opts_collator = NULL)
{
    # settings given via `...` are combined with `opts_collator`
    # and handed over to stri_opts_collator()
    if (!missing(...)) {
        settings <- as.list(c(opts_collator, ...))
        opts_collator <- do.call(stri_opts_collator, settings)
    }

    .Call(C_stri_extract_last_coll, str, pattern, opts_collator)
}


#' @export
#' @rdname stri_extract
stri_extract_all_regex <- function(str, pattern, simplify = FALSE,
    omit_no_match = FALSE, ..., opts_regex = NULL)
{
    # settings given via `...` are combined with `opts_regex`
    # and handed over to stri_opts_regex()
    if (!missing(...)) {
        settings <- as.list(c(opts_regex, ...))
        opts_regex <- do.call(stri_opts_regex, settings)
    }

    .Call(C_stri_extract_all_regex, str, pattern, simplify, omit_no_match, opts_regex)
}


#' @export
#' @rdname stri_extract
stri_extract_first_regex <- function(str, pattern, ..., opts_regex = NULL)
{
    # settings given via `...` are combined with `opts_regex`
    # and handed over to stri_opts_regex()
    if (!missing(...)) {
        settings <- as.list(c(opts_regex, ...))
        opts_regex <- do.call(stri_opts_regex, settings)
    }

    .Call(C_stri_extract_first_regex, str, pattern, opts_regex)
}


#' @export
#' @rdname stri_extract
stri_extract_last_regex <- function(str, pattern, ..., opts_regex = NULL)
{
    # settings given via `...` are combined with `opts_regex`
    # and handed over to stri_opts_regex()
    if (!missing(...)) {
        settings <- as.list(c(opts_regex, ...))
        opts_regex <- do.call(stri_opts_regex, settings)
    }

    .Call(C_stri_extract_last_regex, str, pattern, opts_regex)
}


#' @export
#' @rdname stri_extract
stri_extract_all_fixed <- function(str, pattern, simplify = FALSE,
    omit_no_match = FALSE, ..., opts_fixed = NULL)
{
    # settings given via `...` are combined with `opts_fixed`
    # and handed over to stri_opts_fixed()
    if (!missing(...)) {
        settings <- as.list(c(opts_fixed, ...))
        opts_fixed <- do.call(stri_opts_fixed, settings)
    }

    .Call(C_stri_extract_all_fixed, str, pattern, simplify, omit_no_match, opts_fixed)
}


#' @export
#' @rdname stri_extract
stri_extract_first_fixed <- function(str, pattern, ..., opts_fixed = NULL)
{
    # settings given via `...` are combined with `opts_fixed`
    # and handed over to stri_opts_fixed()
    if (!missing(...)) {
        settings <- as.list(c(opts_fixed, ...))
        opts_fixed <- do.call(stri_opts_fixed, settings)
    }

    .Call(C_stri_extract_first_fixed, str, pattern, opts_fixed)
}


#' @export
#' @rdname stri_extract
stri_extract_last_fixed <- function(str, pattern, ..., opts_fixed = NULL)
{
    # settings given via `...` are combined with `opts_fixed`
    # and handed over to stri_opts_fixed()
    if (!missing(...)) {
        settings <- as.list(c(opts_fixed, ...))
        opts_fixed <- do.call(stri_opts_fixed, settings)
    }

    .Call(C_stri_extract_last_fixed, str, pattern, opts_fixed)
}
stringi/R/locale_management.R0000644000176200001440000001250114750110641015706 0ustar liggesusers# kate: default-dictionary en_US
## This file is part of the 'stringi' package for R. ## Copyright (c) 2013-2025, Marek Gagolewski ## All rights reserved. ## ## Redistribution and use in source and binary forms, with or without ## modification, are permitted provided that the following conditions are met: ## ## 1. Redistributions of source code must retain the above copyright notice, ## this list of conditions and the following disclaimer. ## ## 2. Redistributions in binary form must reproduce the above copyright notice, ## this list of conditions and the following disclaimer in the documentation ## and/or other materials provided with the distribution. ## ## 3. Neither the name of the copyright holder nor the names of its ## contributors may be used to endorse or promote products derived from ## this software without specific prior written permission. ## ## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ## 'AS IS' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, ## BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS ## FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT ## HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, ## SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ## PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; ## OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, ## WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE ## OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, ## EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #' @title #' List Available Locales #' #' @description #' Creates a character vector with all available locale identifiers. #' #' @details #' Note that some of the services may be unavailable in some locales. #' Querying for locale-specific services is always performed #' during the resource request. #' #' See \link{stringi-locale} for more information.
#'
#' @return
#' Returns a character vector with locale identifiers
#' that are known to \pkg{ICU}.
#'
#' @examples
#' stri_locale_list()
#'
#' @family locale_management
#' @export
stri_locale_list <- function()
{
    # the identifiers obtained from the native routine are sorted
    # with fixed collator settings before being returned
    identifiers <- .Call(C_stri_locale_list)
    stri_sort(identifiers, locale = "en_US", numeric = TRUE, strength = 1)
}


#' @title
#' Set or Get Default Locale in \pkg{stringi}
#'
#' @description
#' \code{stri_locale_set} changes the default locale for all the functions
#' in the \pkg{stringi} package,
#' i.e., establishes the meaning of the ``\code{NULL} locale'' argument
#' of locale-sensitive functions.
#' \code{stri_locale_get}
#' gives the current default locale.
#'
#' @details
#' See \link{stringi-locale} for more information on the effect of
#' changing the default locale.
#'
#' \code{stri_locale_get} is the same as \code{\link{stri_locale_info}(NULL)$Name}.
#'
#' @param locale single string of the form \code{Language},
#' \code{Language_Country}, or \code{Language_Country_Variant}, e.g.,
#' \code{'en_US'}, see \code{\link{stri_locale_list}}.
#'
#' @return
#' \code{stri_locale_set} returns a string with
#' previously used locale, invisibly.
#'
#' \code{stri_locale_get} returns a string of the form \code{Language},
#' \code{Language_Country}, or \code{Language_Country_Variant},
#' e.g., \code{'en_US'}.
#'
#' @family locale_management
#' @rdname stri_locale_set
#' @examples
#' \dontrun{
#' oldloc <- stri_locale_set('pt_BR')
#' # ... some locale-dependent operations
#' # ... note that you may always modify a locale per-call
#' # ... changing the default locale is convenient if you perform
#' # ... many operations
#' stri_locale_set(oldloc) # restore the previous default locale
#' }
#' @export
stri_locale_set <- function(locale)
{
    # remember the current default so that it can be handed back to the caller
    old_locale <- stri_locale_get()

    .Call(C_stri_locale_set, locale)

    # We call stri_info, because it generates some warnings,
    # in case any problems are found:
    banner <- stri_paste("You are now working with ", stri_info(short = TRUE))
    message(banner)

    invisible(old_locale)
}


#' @rdname stri_locale_set
#' @export
stri_locale_get <- function()
{
    # NULL stands for the default locale in stri_locale_info()
    default_info <- stri_locale_info(NULL)
    default_info$Name
}


#' @title
#' Query Given Locale
#'
#' @description
#' Provides some basic information on a given locale identifier.
#'
#' @details
#' With this function you may obtain some basic information
#' on any provided locale identifier,
#' even if it is unsupported by \pkg{ICU} or if you pass a malformed locale
#' identifier (the one that is not, e.g., of the form Language_Country).
#' See \link{stringi-locale} for discussion.
#'
#' This function does not do anything really complicated. In many
#' cases it is similar to a call to
#' \code{\link{as.list}(\link{stri_split_fixed}(locale, '_', 3L)[[1]])},
#' with \code{locale} case mapped.
#' It may be used, however, to get insight on how ICU understands a given
#' locale identifier.
#'
#' @param locale \code{NULL} or \code{''} for default locale,
#' or a single string with locale identifier.
#'
#' @return
#' Returns a list with the following named character strings:
#' \code{Language}, \code{Country}, \code{Variant}, and
#' \code{Name}, being their underscore separated combination.
#'
#' @examples
#' stri_locale_info('pl_PL')
#' stri_locale_info('Pl_pL') # the same result
#'
#' @family locale_management
#' @export
stri_locale_info <- function(locale = NULL)
{
    .Call(C_stri_locale_info, locale)
}
stringi/R/search_split_4.R0000644000176200001440000002241514750110641015163 0ustar liggesusers# kate: default-dictionary en_US ## This file is part of the 'stringi' package for R. ## Copyright (c) 2013-2025, Marek Gagolewski ## All rights reserved.
## ## Redistribution and use in source and binary forms, with or without ## modification, are permitted provided that the following conditions are met: ## ## 1. Redistributions of source code must retain the above copyright notice, ## this list of conditions and the following disclaimer. ## ## 2. Redistributions in binary form must reproduce the above copyright notice, ## this list of conditions and the following disclaimer in the documentation ## and/or other materials provided with the distribution. ## ## 3. Neither the name of the copyright holder nor the names of its ## contributors may be used to endorse or promote products derived from ## this software without specific prior written permission. ## ## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ## 'AS IS' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, ## BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS ## FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT ## HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, ## SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ## PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; ## OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, ## WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE ## OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, ## EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #' @title #' Split a String By Pattern Matches #' #' @description #' These functions split each element in \code{str} into substrings. #' \code{pattern} defines the delimiters that separate the inputs into tokens. #' The input data between the matches become the fields themselves. #' #' @details #' Vectorized over \code{str}, \code{pattern}, \code{n}, and \code{omit_empty} #' (with recycling of the elements in the shorter vector if necessary). 
#' #' If \code{n} is negative, then all pieces are extracted. #' Otherwise, if \code{tokens_only} is \code{FALSE} (which is the default), #' then \code{n-1} tokens are extracted (if possible) and the \code{n}-th string #' gives the remainder (see Examples). #' On the other hand, if \code{tokens_only} is \code{TRUE}, #' then only full tokens (up to \code{n} pieces) are extracted. #' #' \code{omit_empty} is applied during the split process: if it is set to #' \code{TRUE}, then tokens of zero length are ignored. Thus, empty strings #' will never appear in the resulting vector. On the other hand, if #' \code{omit_empty} is \code{NA}, then empty tokens are substituted with #' missing strings. #' #' Empty search patterns are not supported. If you wish to split a #' string into individual characters, use, e.g., #' \code{\link{stri_split_boundaries}(str, type='character')} for THE Unicode way. #' #' \code{stri_split} is a convenience function. It calls either #' \code{stri_split_regex}, \code{stri_split_fixed}, \code{stri_split_coll}, #' or \code{stri_split_charclass}, depending on the argument used. 
#' #' @param str character vector; strings to search in #' @param pattern,regex,fixed,coll,charclass character vector; #' search patterns; for more details refer to \link{stringi-search} #' @param n integer vector, maximal number of strings to return, #' and, at the same time, maximal number of text boundaries to look for #' @param omit_empty logical vector; determines whether empty #' tokens should be removed from the result (\code{TRUE} or \code{FALSE}) #' or replaced with \code{NA}s (\code{NA}) #' @param tokens_only single logical value; #' may affect the result if \code{n} is positive, see Details #' @param simplify single logical value; #' if \code{TRUE} or \code{NA}, then a character matrix is returned; #' otherwise (the default), a list of character vectors is given, see Value #' @param opts_collator,opts_fixed,opts_regex a named list used to tune up #' the search engine's settings; see #' \code{\link{stri_opts_collator}}, \code{\link{stri_opts_fixed}}, #' and \code{\link{stri_opts_regex}}, respectively; \code{NULL} #' for the defaults #' @param ... supplementary arguments passed to the underlying functions, #' including additional settings for \code{opts_collator}, \code{opts_regex}, #' \code{opts_fixed}, and so on #' #' @return If \code{simplify=FALSE} (the default), #' then the functions return a list of character vectors. #' #' Otherwise, \code{\link{stri_list2matrix}} with \code{byrow=TRUE} #' and \code{n_min=n} arguments is called on the resulting object. #' In such a case, a character matrix with an appropriate number of rows #' (according to the length of \code{str}, \code{pattern}, etc.) #' is returned. Note that \code{\link{stri_list2matrix}}'s \code{fill} argument #' is set to an empty string and \code{NA}, for \code{simplify} equal to #' \code{TRUE} and \code{NA}, respectively. 
#'
#' @examples
#' stri_split_fixed('a_b_c_d', '_')
#' stri_split_fixed('a_b_c__d', '_')
#' stri_split_fixed('a_b_c__d', '_', omit_empty=TRUE)
#' stri_split_fixed('a_b_c__d', '_', n=2, tokens_only=FALSE) # 'a' & remainder
#' stri_split_fixed('a_b_c__d', '_', n=2, tokens_only=TRUE) # 'a' & 'b' only
#' stri_split_fixed('a_b_c__d', '_', n=4, omit_empty=TRUE, tokens_only=TRUE)
#' stri_split_fixed('a_b_c__d', '_', n=4, omit_empty=FALSE, tokens_only=TRUE)
#' stri_split_fixed('a_b_c__d', '_', omit_empty=NA)
#' stri_split_fixed(c('ab_c', 'd_ef_g', 'h', ''), '_', n=1, tokens_only=TRUE, omit_empty=TRUE)
#' stri_split_fixed(c('ab_c', 'd_ef_g', 'h', ''), '_', n=2, tokens_only=TRUE, omit_empty=TRUE)
#' stri_split_fixed(c('ab_c', 'd_ef_g', 'h', ''), '_', n=3, tokens_only=TRUE, omit_empty=TRUE)
#'
#' stri_list2matrix(stri_split_fixed(c('ab,c', 'd,ef,g', ',h', ''), ',', omit_empty=TRUE))
#' stri_split_fixed(c('ab,c', 'd,ef,g', ',h', ''), ',', omit_empty=FALSE, simplify=TRUE)
#' stri_split_fixed(c('ab,c', 'd,ef,g', ',h', ''), ',', omit_empty=NA, simplify=TRUE)
#' stri_split_fixed(c('ab,c', 'd,ef,g', ',h', ''), ',', omit_empty=TRUE, simplify=TRUE)
#' stri_split_fixed(c('ab,c', 'd,ef,g', ',h', ''), ',', omit_empty=NA, simplify=NA)
#'
#' stri_split_regex(c('ab,c', 'd,ef , g', ', h', ''),
#'    '\\p{WHITE_SPACE}*,\\p{WHITE_SPACE}*', omit_empty=NA, simplify=TRUE)
#'
#' stri_split_charclass('Lorem ipsum dolor sit amet', '\\p{WHITE_SPACE}')
#' stri_split_charclass(' Lorem ipsum dolor', '\\p{WHITE_SPACE}', n=3,
#'    omit_empty=c(FALSE, TRUE))
#'
#' stri_split_regex('Lorem ipsum dolor sit amet',
#'    '\\p{Z}+') # see also stri_split_charclass
#'
#' @rdname stri_split
#' @family search_split
#' @export
stri_split <- function(str, ..., regex, fixed, coll, charclass)
{
    # Record which of the four pattern arguments the caller supplied;
    # the one that is present selects the search engine below.
    providedarg <- c(
        regex = !missing(regex),
        fixed = !missing(fixed),
        coll = !missing(coll),
        charclass = !missing(charclass))

    # exactly one pattern argument is allowed
    if (sum(providedarg) != 1)
        stop("you have to specify either `regex`, `fixed`, `coll`, or `charclass`")

    # delegate to the engine-specific function, forwarding `...` unchanged
    if (providedarg["regex"])
        stri_split_regex(str, regex, ...)
    else if (providedarg["fixed"])
        stri_split_fixed(str, fixed, ...)
    else if (providedarg["coll"])
        stri_split_coll(str, coll, ...)
    else if (providedarg["charclass"])
        stri_split_charclass(str, charclass, ...)
}


#' @export
#' @rdname stri_split
stri_split_fixed <- function(str, pattern, n = -1L,
    omit_empty = FALSE, tokens_only = FALSE, simplify = FALSE,
    ..., opts_fixed = NULL)
{
    # omit_empty defaults to FALSE for compatibility with the stringr package
    # tokens_only defaults to FALSE for compatibility with the stringr package
    # settings passed via `...` are combined with `opts_fixed`
    # and handed to stri_opts_fixed()
    if (!missing(...))
        opts_fixed <- do.call(stri_opts_fixed, as.list(c(opts_fixed, ...)))
    .Call(C_stri_split_fixed, str, pattern, n, omit_empty, tokens_only, simplify, opts_fixed)
}


#' @export
#' @rdname stri_split
stri_split_regex <- function(str, pattern, n = -1L,
    omit_empty = FALSE, tokens_only = FALSE, simplify = FALSE,
    ..., opts_regex = NULL)
{
    # omit_empty defaults to FALSE for compatibility with the stringr package
    # tokens_only defaults to FALSE for compatibility with the stringr package
    # settings passed via `...` are combined with `opts_regex`
    # and handed to stri_opts_regex()
    if (!missing(...))
        opts_regex <- do.call(stri_opts_regex, as.list(c(opts_regex, ...)))
    .Call(C_stri_split_regex, str, pattern, n, omit_empty, tokens_only, simplify, opts_regex)
}


#' @export
#' @rdname stri_split
stri_split_coll <- function(str, pattern, n = -1L,
    omit_empty = FALSE, tokens_only = FALSE, simplify = FALSE,
    ..., opts_collator = NULL)
{
    # omit_empty defaults to FALSE for compatibility with the stringr package
    # tokens_only defaults to FALSE for compatibility with the stringr package
    # settings passed via `...` are combined with `opts_collator`
    # and handed to stri_opts_collator()
    if (!missing(...))
        opts_collator <- do.call(stri_opts_collator, as.list(c(opts_collator, ...)))
    .Call(C_stri_split_coll, str, pattern, n, omit_empty, tokens_only, simplify, opts_collator)
}


#' @export
#' @rdname stri_split
stri_split_charclass <- function(str, pattern, n = -1L,
    omit_empty = FALSE, tokens_only = FALSE, simplify = FALSE)
{
    # omit_empty defaults to FALSE for compatibility with the stringr package
    # tokens_only defaults to FALSE for compatibility with the stringr package
    # (this variant takes no engine options, hence no `...`)
    .Call(C_stri_split_charclass, str, pattern, n, omit_empty, tokens_only, simplify)
}
stringi/R/install.R0000644000176200001440000001154014750110641013723 0ustar liggesusers# kate: default-dictionary en_US ## This file is part of the 'stringi' package for R. ## Copyright (c) 2013-2025, Marek Gagolewski ## All rights reserved. ## ## Redistribution and use in source and binary forms, with or without ## modification, are permitted provided that the following conditions are met: ## ## 1. Redistributions of source code must retain the above copyright notice, ## this list of conditions and the following disclaimer. ## ## 2. Redistributions in binary form must reproduce the above copyright notice, ## this list of conditions and the following disclaimer in the documentation ## and/or other materials provided with the distribution. ## ## 3. Neither the name of the copyright holder nor the names of its ## contributors may be used to endorse or promote products derived from ## this software without specific prior written permission. ## ## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ## 'AS IS' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, ## BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS ## FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT ## HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, ## SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ## PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; ## OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, ## WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE ## OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, ## EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# internal functions used whilst installing stringi

# ICU data file names, keyed by paste0(.Platform$endian, icu_bundle_version)
# (this is how stri_download_icudt() below looks them up)
icudt_fname <- c(
    little74 = "icudt74l.dat",
    big74 = "icudt74b.dat"
)


# This function is not exported; it is called by install.libs.r(.in)
#
# Obtains the xz-compressed ICU data file via stri_download_icudt(),
# decompresses it next to the archive, verifies its MD5 checksum against
# the accompanying `<file>.md5sum`, and copies the result into `outpath`.
#
# Returns invisible(TRUE) on success and invisible(FALSE) if the archive
# could not be obtained, the checksum does not match, or the copy failed.
stri_install_icudt <- function(outpath, inpath, icu_bundle_version)
{
    # remember about importFrom tools md5sum -> stringi-package.R

    xzpath <- stri_download_icudt(inpath, icu_bundle_version)

    if (identical(xzpath, FALSE) || !file.exists(xzpath)) {
        return(invisible(FALSE))
    }

    basepath <- substr(xzpath, 1, nchar(xzpath)-3)  # strip the ".xz" suffix

    message("decompressing ", xzpath, " to: ", outpath)

    # Stream the archive in 8 KiB chunks.  Both connections are closed even
    # if opening the output file or reading/writing a chunk fails, so that
    # a corrupt archive does not leak open connections.
    fin <- xzfile(xzpath, "rb")
    tryCatch({
        fout <- file(basepath, "wb")
        tryCatch({
            repeat {
                chunk <- readBin(fin, raw(), 8192L)
                if (length(chunk) <= 0) break
                writeBin(chunk, fout)
            }
        }, finally = close(fout))
    }, finally = close(fin))

    md5ex <- scan(sprintf("%s.md5sum", basepath), what=character(), n=1, quiet=TRUE)
    # an empty checksum file yields character(0); treat it as a mismatch
    # instead of letting `if` fail on a zero-length condition
    if (length(md5ex) != 1L) md5ex <- NA_character_
    md5ob <- tools::md5sum(basepath)
    if (is.na(md5ob) || is.na(md5ex) || md5ob != md5ex) {
        message(sprintf("md5sum mismatch for %s (%s vs %s)",
            basepath,
            as.character(md5ob),
            as.character(md5ex)
        ))
        file.remove(basepath)
        return(invisible(FALSE))
    }

    copied <- file.copy(basepath, file.path(outpath, basename(basepath)), overwrite=TRUE)
    file.remove(basepath)  # the decompressed temporary is no longer needed

    # do not report success if the data file never reached `outpath`
    if (!all(copied)) {
        message(sprintf("Error: %s could not be copied to %s", basepath, outpath))
        return(invisible(FALSE))
    }

    message(sprintf("%s installed successfully", basepath))
    invisible(TRUE)
}


# This function is not exported;
# it is called by configure(.ac) and stri_install_icudt above
#
# Returns the path to `<inpath>/<data file>.xz`, downloading the file from
# the first mirror that works if it is not already there; returns
# invisible(FALSE) if every mirror fails.
stri_download_icudt <- function(inpath, icu_bundle_version)
{
    fname <- icudt_fname[paste0(.Platform$endian, icu_bundle_version)]

    path <- file.path(inpath, fname)

    # the data files are fetched from a fixed commit of the repository
    commit_id <- "bbe75eca8f9ef4dc72dc5c6e36c8f8306a324b7e"
    mirrors <- sprintf(
        "%s://raw.githubusercontent.com/gagolews/stringi/%s/src/icu%d/data/",
        c("https", "http"),
        commit_id,
        icu_bundle_version
    )

    xzpath <- sprintf("%s.xz", path)
    if (file.exists(xzpath)) {
        message(sprintf("%s exists", xzpath))
        return(xzpath)
    }

    # returns TRUE on success or a character string describing the failure
    download_from_mirror <- function(href, fname, xzpath) {
        tryCatch({
            suppressWarnings(file.remove(xzpath))
            # download icudt
            if (
                download.file(
                    paste(href, fname, sep = ""),
                    xzpath,
                    mode = "wb"
                ) != 0
            ) {
                return("download error")
            }
            if (!file.exists(xzpath)) {
                return("download error")
            }
            TRUE
        }, error = function(e) as.character(e))
    }

    message(sprintf("downloading the ICU data library (%s)...", xzpath))

    if (!dir.exists(inpath)) suppressWarnings(dir.create(inpath))

    # try the mirrors in order, stopping at the first success
    allok <- FALSE
    for (m in mirrors) {
        status <- download_from_mirror(m, sprintf("%s.xz", fname), xzpath)
        if (identical(status, TRUE)) {
            allok <- TRUE
            break
        } else message(status)
    }

    if (!allok || !file.exists(xzpath)) {
        suppressWarnings(file.remove(xzpath))  # drop any partial download
        message(sprintf("Error: %s could not be downloaded", xzpath))
        return(invisible(FALSE))
    }

    message(sprintf("%s downloaded successfully", xzpath))
    return(xzpath)
}
stringi/R/search_match_4.R0000644000176200001440000001624714750110641015132 0ustar liggesusers# kate: default-dictionary en_US ## This file is part of the 'stringi' package for R. ## Copyright (c) 2013-2025, Marek Gagolewski ## All rights reserved. ## ## Redistribution and use in source and binary forms, with or without ## modification, are permitted provided that the following conditions are met: ## ## 1. Redistributions of source code must retain the above copyright notice, ## this list of conditions and the following disclaimer. ## ## 2. Redistributions in binary form must reproduce the above copyright notice, ## this list of conditions and the following disclaimer in the documentation ## and/or other materials provided with the distribution. ## ## 3. Neither the name of the copyright holder nor the names of its ## contributors may be used to endorse or promote products derived from ## this software without specific prior written permission. ## ## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ## 'AS IS' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, ## BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS ## FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT ## HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, ## SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ## PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; ## OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, ## WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE ## OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, ## EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #' @title #' Extract Regex Pattern Matches, Together with Capture Groups #' #' @description #' These functions extract substrings in \code{str} that #' match a given regex \code{pattern}. Additionally, they extract matches #' to every \emph{capture group}, i.e., to all the sub-patterns given #' in round parentheses. #' #' @details #' Vectorized over \code{str} and \code{pattern} (with recycling #' of the elements in the shorter vector if necessary). This allows to, #' for instance, search for one pattern in each given string, #' search for each pattern in one given string, #' and search for the i-th pattern within the i-th string. #' #' If no pattern match is detected and \code{omit_no_match=FALSE}, #' then \code{NA}s are included in the resulting matrix (matrices), see Examples. #' #' \code{stri_match}, \code{stri_match_all}, \code{stri_match_first}, #' and \code{stri_match_last} are convenience functions. #' They merely call \code{stri_match_*_regex} and are #' provided for consistency with other string searching functions' wrappers, #' see, among others, \code{\link{stri_extract}}. 
#' #' @param str character vector; strings to search in #' @param pattern,regex character vector; #' search patterns; for more details refer to \link{stringi-search} #' @param opts_regex a named list with \pkg{ICU} Regex settings, #' see \code{\link{stri_opts_regex}}; \code{NULL} #' for default settings #' @param omit_no_match single logical value; if \code{FALSE}, #' then a row with missing values will indicate that there was no match; #' \code{stri_match_all_*} only #' @param cg_missing single string to be used if a capture group match #' is unavailable #' @param mode single string; #' one of: \code{'first'} (the default), \code{'all'}, \code{'last'} #' @param ... supplementary arguments passed to the underlying functions, #' including additional settings for \code{opts_regex} #' #' @return #' For \code{stri_match_all*}, #' a list of character matrices is returned. Each list element #' represents the results of a different search scenario. #' #' For \code{stri_match_first*} and \code{stri_match_last*} #' a character matrix is returned. #' Each row corresponds to a different search result. #' #' The first matrix column gives the whole match. The second one corresponds to #' the first capture group, the third -- the second capture group, and so on. #' #' If regular expressions feature a named capture group, #' the matrix columns will be named accordingly. #' However, for \code{stri_match_first*} and \code{stri_match_last*} #' this will only be the case if there is a single pattern. 
#'
#'
#' @examples
#' stri_match_all_regex('breakfast=eggs, lunch=pizza, dessert=icecream',
#'    '(\\w+)=(\\w+)')
#' stri_match_all_regex(c('breakfast=eggs', 'lunch=pizza', 'no food here'),
#'    '(\\w+)=(\\w+)')
#' stri_match_all_regex(c('breakfast=eggs;lunch=pizza',
#'    'breakfast=bacon;lunch=spaghetti', 'no food here'),
#'    '(\\w+)=(\\w+)')
#' stri_match_all_regex(c('breakfast=eggs;lunch=pizza',
#'    'breakfast=bacon;lunch=spaghetti', 'no food here'),
#'    '(?<when>\\w+)=(?<what>\\w+)') # named capture groups
#' stri_match_first_regex(c('breakfast=eggs;lunch=pizza',
#'    'breakfast=bacon;lunch=spaghetti', 'no food here'),
#'    '(\\w+)=(\\w+)')
#' stri_match_last_regex(c('breakfast=eggs;lunch=pizza',
#'    'breakfast=bacon;lunch=spaghetti', 'no food here'),
#'    '(\\w+)=(\\w+)')
#'
#' stri_match_first_regex(c('abcd', ':abcd', ':abcd:'), '^(:)?([^:]*)(:)?$')
#' stri_match_first_regex(c('abcd', ':abcd', ':abcd:'), '^(:)?([^:]*)(:)?$', cg_missing='')
#'
#' # Match all the pattern of the form XYX, including overlapping matches:
#' stri_match_all_regex('ACAGAGACTTTAGATAGAGAAGA', '(?=(([ACGT])[ACGT]\\2))')[[1]][,2]
#' # Compare the above to:
#' stri_extract_all_regex('ACAGAGACTTTAGATAGAGAAGA', '([ACGT])[ACGT]\\1')
#'
#' @family search_extract
#' @export
#' @rdname stri_match
stri_match_all <- function(str, ..., regex)
{
    # convenience wrapper: forwards to the regex-specific function
    stri_match_all_regex(str, regex, ...)
}


#' @export
#' @rdname stri_match
stri_match_first <- function(str, ..., regex)
{
    # convenience wrapper: forwards to the regex-specific function
    stri_match_first_regex(str, regex, ...)
}


#' @export
#' @rdname stri_match
stri_match_last <- function(str, ..., regex)
{
    # convenience wrapper: forwards to the regex-specific function
    stri_match_last_regex(str, regex, ...)
}


#' @export
#' @rdname stri_match
stri_match <- function(str, ..., regex, mode = c("first", "all", "last"))
{
    # `first` is default for compatibility with stringr
    mode <- match.arg(mode) # this is slow

    # dispatch on the validated mode; `...` is forwarded unchanged
    switch(mode,
        first = stri_match_first_regex(str, regex, ...),
        last = stri_match_last_regex(str, regex, ...),
        all = stri_match_all_regex(str, regex, ...))
}


#' @export
#' @rdname stri_match
stri_match_all_regex <- function(str, pattern, omit_no_match = FALSE,
    cg_missing = NA_character_, ..., opts_regex = NULL)
{
    # settings passed via `...` are combined with `opts_regex`
    # and handed to stri_opts_regex()
    if (!missing(...))
        opts_regex <- do.call(stri_opts_regex, as.list(c(opts_regex, ...)))
    .Call(C_stri_match_all_regex, str, pattern, omit_no_match, cg_missing, opts_regex)
}


#' @export
#' @rdname stri_match
stri_match_first_regex <- function(str, pattern, cg_missing = NA_character_,
    ..., opts_regex = NULL)
{
    # settings passed via `...` are combined with `opts_regex`
    # and handed to stri_opts_regex()
    if (!missing(...))
        opts_regex <- do.call(stri_opts_regex, as.list(c(opts_regex, ...)))
    .Call(C_stri_match_first_regex, str, pattern, cg_missing, opts_regex)
}


#' @export
#' @rdname stri_match
stri_match_last_regex <- function(str, pattern, cg_missing = NA_character_,
    ..., opts_regex = NULL)
{
    # settings passed via `...` are combined with `opts_regex`
    # and handed to stri_opts_regex()
    if (!missing(...))
        opts_regex <- do.call(stri_opts_regex, as.list(c(opts_regex, ...)))
    .Call(C_stri_match_last_regex, str, pattern, cg_missing, opts_regex)
}
stringi/R/time_format.R0000644000176200001440000003347714750110641014600 0ustar liggesusers# kate: default-dictionary en_US ## This file is part of the 'stringi' package for R. ## Copyright (c) 2013-2025, Marek Gagolewski ## All rights reserved. ## ## Redistribution and use in source and binary forms, with or without ## modification, are permitted provided that the following conditions are met: ## ## 1. Redistributions of source code must retain the above copyright notice, ## this list of conditions and the following disclaimer. ## ## 2.
Redistributions in binary form must reproduce the above copyright notice, ## this list of conditions and the following disclaimer in the documentation ## and/or other materials provided with the distribution. ## ## 3. Neither the name of the copyright holder nor the names of its ## contributors may be used to endorse or promote products derived from ## this software without specific prior written permission. ## ## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ## 'AS IS' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, ## BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS ## FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT ## HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, ## SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ## PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; ## OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, ## WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE ## OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, ## EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #' @title #' Date and Time Formatting and Parsing #' #' @description #' These functions convert a given date/time object #' to a character vector, or vice versa. #' #' @details #' Vectorized over \code{format} and \code{time} or \code{str}. #' #' When parsing strings, unspecified date-time fields #' (e.g., seconds where only hours and minutes are given) #' are based on today's midnight in the local time zone #' (for compatibility with \code{\link[base]{strptime}}). #' #' By default, \code{stri_datetime_format} (for compatibility #' with the \code{\link[base]{strftime}} function) #' formats a date/time object using the current default time zone. 
#' #' \code{format} may be one of \code{DT_STYLE} or \code{DT_relative_STYLE}, #' where \code{DT} is equal to \code{date}, \code{time}, or \code{datetime}, #' and \code{STYLE} is equal to \code{full}, \code{long}, \code{medium}, #' or \code{short}. This gives a locale-dependent date and/or time format. #' Note that currently \pkg{ICU} does not support \code{relative} #' \code{time} formats, thus this flag is currently ignored in such a context. #' #' Otherwise, \code{format} is a pattern: #' a string where specific sequences of characters are replaced #' with date/time data from a calendar when formatting or used #' to generate data for a calendar when parsing. #' For example, \code{y} stands for 'year'. Characters #' may be used multiple times: #' \code{yy} might produce \code{99}, whereas \code{yyyy} yields \code{1999}. #' For most numerical fields, the number of characters specifies #' the field width. For example, if \code{h} is the hour, \code{h} might #' produce \code{5}, but \code{hh} yields \code{05}. #' For some characters, the count specifies whether an abbreviated #' or full form should be used. #' #' Two single quotes represent a literal single quote, either #' inside or outside single quotes. Text within single quotes #' is not interpreted in any way (except for two adjacent single quotes). #' Otherwise, all ASCII letters from \code{a} to \code{z} and #' \code{A} to \code{Z} are reserved as syntax characters, and require quoting #' if they are to represent literal characters. In addition, certain #' ASCII punctuation characters may become available in the future #' (e.g., \code{:} being interpreted as the time separator and \code{/} #' as a date separator, and replaced by respective #' locale-sensitive characters in display). 
#' #' \tabular{llll}{ #' \bold{Symbol} \tab \bold{Meaning} \tab \bold{Example(s)} \tab \bold{Output} \cr #' G \tab era designator \tab G, GG, or GGG \tab AD \cr #' \tab \tab GGGG \tab Anno Domini \cr #' \tab \tab GGGGG \tab A \cr #' y \tab year \tab yy \tab 96 \cr #' \tab \tab y or yyyy \tab 1996 \cr # Y \tab year of 'Week of Year' \tab Y \tab 1997 \cr #' u \tab extended year \tab u \tab 4601 \cr #' U \tab cyclic year name, as in Chinese lunar calendar \tab U \tab \cr #' r \tab related Gregorian year \tab r \tab 1996 \cr #' Q \tab quarter \tab Q or QQ \tab 02 \cr #' \tab \tab QQQ \tab Q2 \cr #' \tab \tab QQQQ \tab 2nd quarter \cr #' \tab \tab QQQQQ \tab 2 \cr #' q \tab Stand Alone quarter \tab q or qq \tab 02 \cr #' \tab \tab qqq \tab Q2 \cr #' \tab \tab qqqq \tab 2nd quarter \cr #' \tab \tab qqqqq \tab 2 \cr #' M \tab month in year \tab M or MM \tab 09 \cr #' \tab \tab MMM \tab Sep \cr #' \tab \tab MMMM \tab September \cr #' \tab \tab MMMMM \tab S \cr #' L \tab Stand Alone month in year \tab L or LL \tab 09 \cr #' \tab \tab LLL \tab Sep \cr #' \tab \tab LLLL \tab September \cr #' \tab \tab LLLLL \tab S \cr #' w \tab week of year \tab w or ww \tab 27 \cr #' W \tab week of month \tab W \tab 2 \cr #' d \tab day in month \tab d \tab 2 \cr #' \tab \tab dd \tab 02 \cr #' D \tab day of year \tab D \tab 189 \cr #' F \tab day of week in month \tab F \tab 2 (2nd Wed in July) \cr #' g \tab modified Julian day \tab g \tab 2451334 \cr #' E \tab day of week \tab E, EE, or EEE \tab Tue \cr #' \tab \tab EEEE \tab Tuesday \cr #' \tab \tab EEEEE \tab T \cr #' \tab \tab EEEEEE \tab Tu \cr #' e \tab local day of week \tab e or ee \tab 2 \cr #' \tab example: if Monday is 1st day, Tuesday is 2nd ) \tab eee \tab Tue \cr #' \tab \tab eeee \tab Tuesday \cr #' \tab \tab eeeee \tab T \cr #' \tab \tab eeeeee \tab Tu \cr #' c \tab Stand Alone local day of week \tab c or cc \tab 2 \cr #' \tab \tab ccc \tab Tue \cr #' \tab \tab cccc \tab Tuesday \cr #' \tab \tab ccccc \tab T \cr #' \tab \tab 
cccccc \tab Tu \cr #' a \tab am/pm marker \tab a \tab pm \cr #' h \tab hour in am/pm (1~12) \tab h \tab 7 \cr #' \tab \tab hh \tab 07 \cr #' H \tab hour in day (0~23) \tab H \tab 0 \cr #' \tab \tab HH \tab 00 \cr #' k \tab hour in day (1~24) \tab k \tab 24 \cr #' \tab \tab kk \tab 24 \cr #' K \tab hour in am/pm (0~11) \tab K \tab 0 \cr #' \tab \tab KK \tab 00 \cr #' m \tab minute in hour \tab m \tab 4 \cr #' \tab \tab mm \tab 04 \cr #' s \tab second in minute \tab s \tab 5 \cr #' \tab \tab ss \tab 05 \cr #' S \tab fractional second - truncates (like other time fields) \tab S \tab 2 \cr #' \tab to the count of letters when formatting. Appends \tab SS \tab 23 \cr #' \tab zeros if more than 3 letters specified. Truncates at \tab SSS \tab 235 \cr #' \tab three significant digits when parsing. \tab SSSS \tab 2350 \cr #' A \tab milliseconds in day \tab A \tab 61201235 \cr #' z \tab Time Zone: specific non-location \tab z, zz, or zzz \tab PDT \cr #' \tab \tab zzzz \tab Pacific Daylight Time \cr #' Z \tab Time Zone: ISO8601 basic hms? / RFC 822 \tab Z, ZZ, or ZZZ \tab -0800 \cr #' \tab Time Zone: long localized GMT (=OOOO) \tab ZZZZ \tab GMT-08:00 \cr #' \tab Time Zone: ISO8601 extended hms? 
(=XXXXX) \tab ZZZZZ \tab -08:00, -07:52:58, Z \cr #' O \tab Time Zone: short localized GMT \tab O \tab GMT-8 \cr #' \tab Time Zone: long localized GMT (=ZZZZ) \tab OOOO \tab GMT-08:00 \cr #' v \tab Time Zone: generic non-location \tab v \tab PT \cr #' \tab (falls back first to VVVV) \tab vvvv \tab Pacific Time or Los Angeles Time \cr #' V \tab Time Zone: short time zone ID \tab V \tab uslax \cr #' \tab Time Zone: long time zone ID \tab VV \tab America/Los_Angeles \cr #' \tab Time Zone: time zone exemplar city \tab VVV \tab Los Angeles \cr #' \tab Time Zone: generic location (falls back to OOOO) \tab VVVV \tab Los Angeles Time \cr #' X \tab Time Zone: ISO8601 basic hm?, with Z for 0 \tab X \tab -08, +0530, Z \cr #' \tab Time Zone: ISO8601 basic hm, with Z \tab XX \tab -0800, Z \cr #' \tab Time Zone: ISO8601 extended hm, with Z \tab XXX \tab -08:00, Z \cr #' \tab Time Zone: ISO8601 basic hms?, with Z \tab XXXX \tab -0800, -075258, Z \cr #' \tab Time Zone: ISO8601 extended hms?, with Z \tab XXXXX \tab -08:00, -07:52:58, Z \cr #' x \tab Time Zone: ISO8601 basic hm?, without Z for 0 \tab x \tab -08, +0530 \cr #' \tab Time Zone: ISO8601 basic hm, without Z \tab xx \tab -0800 \cr #' \tab Time Zone: ISO8601 extended hm, without Z \tab xxx \tab -08:00 \cr #' \tab Time Zone: ISO8601 basic hms?, without Z \tab xxxx \tab -0800, -075258 \cr #' \tab Time Zone: ISO8601 extended hms?, without Z \tab xxxxx \tab -08:00, -07:52:58 \cr #' ' \tab escape for text \tab ' \tab (nothing) \cr #' ' ' \tab two single quotes produce one \tab ' ' \tab ' #' } #' #' Note that any characters in the pattern that are not in the ranges #' of \code{[a-z]} and \code{[A-Z]} will be treated as quoted text. #' For instance, characters like \code{:}, \code{.}, \code{ } (a space), #' \code{#} and \code{@@} will appear in the resulting time text #' even if they are not enclosed within single quotes. The single quote is used #' to ``escape'' the letters. 
Two single quotes in a row, #' inside or outside a quoted sequence, represent a ``real'' single quote. #' #' #' A few examples: #' #' \tabular{ll}{ #' \bold{Example Pattern} \tab \bold{Result} \cr #' yyyy.MM.dd 'at' HH:mm:ss zzz \tab 2015.12.31 at 23:59:59 GMT+1 \cr #' EEE, MMM d, ''yy \tab czw., gru 31, '15 \cr #' h:mm a \tab 11:59 PM \cr #' hh 'o''clock' a, zzzz \tab 11 o'clock PM, GMT+01:00 \cr #' K:mm a, z \tab 11:59 PM, GMT+1 \cr #' yyyyy.MMMM.dd GGG hh:mm aaa \tab 2015.grudnia.31 n.e. 11:59 PM \cr #' uuuu-MM-dd'T'HH:mm:ssZ \tab 2015-12-31T23:59:59+0100 (the ISO 8601 guideline) \cr #' } #' #' @param time an object of class \code{\link{POSIXct}} with date-time data #' to be formatted #' (\code{as.POSIXct} will be called on character vectors #' and objects of class \code{POSIXlt}, \code{Date}, and \code{factor}) #' @param str character vector with strings to be parsed #' @param format character vector, see Details; see also \code{\link{stri_datetime_fstr}} #' @param tz \code{NULL} or \code{''} for the default time zone #' or a single string with a timezone identifier, #' see \code{\link{stri_timezone_get}} and \code{\link{stri_timezone_list}} #' @param lenient single logical value; should date/time parsing be lenient? #' @param locale \code{NULL} or \code{''} for the default locale, #' or a single string with locale identifier; a non-Gregorian calendar #' may be specified by setting the \code{@@calendar=name} keyword #' #' @return #' \code{stri_datetime_format} returns a character vector. #' #' \code{stri_datetime_parse} returns an object of class \code{\link{POSIXct}}. 
#'
#' @references
#' \emph{Formatting Dates and Times} -- ICU User Guide,
#' \url{https://unicode-org.github.io/icu/userguide/format_parse/datetime/}
#'
#'
#' @examples
#' x <- c('2015-02-28', '2015-02-29')
#' stri_datetime_parse(x, 'yyyy-MM-dd')
#' stri_datetime_parse(x, 'yyyy-MM-dd', lenient=TRUE)
#' stri_datetime_parse(x %s+% " 17:13", "yyyy-MM-dd HH:mm")
#' stri_datetime_parse('19 lipca 2015', 'date_long', locale='pl_PL')
#' stri_datetime_format(stri_datetime_now(), 'datetime_relative_medium')
#'
#' @rdname stri_datetime_format
#' @family datetime
#' @export
stri_datetime_format <- function(
    time, format = "uuuu-MM-dd HH:mm:ss", tz = NULL, locale = NULL
) {
    # all argument handling is done on the C side
    .Call(C_stri_datetime_format, time, format, tz, locale)
}


#' @export
#' @rdname stri_datetime_format
#' @aliases stri_datetime_format
stri_datetime_parse <- function(
    str, format = "uuuu-MM-dd HH:mm:ss", lenient = FALSE, tz = NULL, locale = NULL
) {
    # all argument handling is done on the C side
    .Call(C_stri_datetime_parse, str, format, lenient, tz, locale)
}


#' @title
#' Convert \code{strptime}-Style Format Strings
#'
#' @description
#' This function converts \code{\link[base]{strptime}} or
#' \code{\link[base]{strftime}}-style
#' format strings to \pkg{ICU} format strings that may be used
#' in \code{\link{stri_datetime_parse}} and \code{\link{stri_datetime_format}}
#' functions.
#'
#' @details
#' For more details on conversion specifiers please refer to
#' the manual page of \code{\link[base]{strptime}}. Most of the formatters
#' of the form \code{\%x}, where \code{x} is a letter, are supported.
#' Moreover, each \code{\%\%} is replaced with \code{\%}.
#'
#' Warnings are given in the case of \code{\%x}, \code{\%X}, \code{\%u},
#' \code{\%w}, \code{\%g}, \code{\%G}, \code{\%c}, \code{\%U}, and \code{\%W}
#' as in such circumstances either \pkg{ICU} does not
#' support the functionality requested using the string format API
#' or there are some inconsistencies between base R and \pkg{ICU}.
#'
#' @param x character vector of date/time format strings
#'
#' @param ignore_special if \code{FALSE}, special identifiers like
#' \code{"datetime_full"} or \code{"date_relative_short"}
#' (see \code{\link{stri_datetime_format}}) are left as-is
#'
#' @return Returns a character vector.
#'
#' @examples
#' stri_datetime_fstr('%Y-%m-%d %H:%M:%S')
#'
#' @family datetime
#' @export
stri_datetime_fstr <- function(x, ignore_special=TRUE)
{
    x <- .Call(C_stri_datetime_fstr, x)

    # anything other than a single, non-missing TRUE counts as FALSE
    ignore_special <- (is.logical(ignore_special) && length(ignore_special) == 1L && !is.na(ignore_special) && ignore_special) # isTRUE(ignore_special)

    if (length(x) > 0 && !ignore_special) {
        # all combinations <kind>_<style>, e.g. "date_full", "datetime_relative_short"
        formats <- outer(
            c("date", "time", "datetime", "date_relative", "datetime_relative"),
            c("full", "long", "medium", "short"),
            stri_paste,
            sep="_"
        )
        # find converted strings that are exactly a single-quoted special
        # identifier and replace them with the bare identifier
        which_p <- match(x, stringi::stri_sprintf("'%s'", formats))
        # works for NAs and no items from the above list too
        x[which(!is.na(which_p))] <- formats[which_p[!is.na(which_p)]]
    }

    x
}
# ?DateTimeClasses # cut # round # trunc # time + z # z + time # time - z # time1 lop time2 stringi/R/search_startsendswith_4.R0000644000176200001440000001666114750110641017124 0ustar liggesusers# kate: default-dictionary en_US ## This file is part of the 'stringi' package for R. ## Copyright (c) 2013-2025, Marek Gagolewski ## All rights reserved. ## ## Redistribution and use in source and binary forms, with or without ## modification, are permitted provided that the following conditions are met: ## ## 1. Redistributions of source code must retain the above copyright notice, ## this list of conditions and the following disclaimer. ## ## 2. Redistributions in binary form must reproduce the above copyright notice, ## this list of conditions and the following disclaimer in the documentation ## and/or other materials provided with the distribution. ## ## 3.
Neither the name of the copyright holder nor the names of its ## contributors may be used to endorse or promote products derived from ## this software without specific prior written permission. ## ## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ## 'AS IS' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, ## BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS ## FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT ## HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, ## SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ## PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; ## OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, ## WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE ## OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, ## EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #' @title #' Determine if the Start or End of a String Matches a Pattern #' #' @description #' These functions check if a string starts or ends with a match #' to a given pattern. Also, it is possible to check if there is a match #' at a specific position. #' #' @details #' Vectorized over \code{str}, \code{pattern}, #' and \code{from} or \code{to} (with recycling #' of the elements in the shorter vector if necessary). #' #' If \code{pattern} is empty, then the result is \code{NA} #' and a warning is generated. #' #' Argument \code{from} controls the start position in \code{str} #' where there is a match to a \code{pattern}. #' \code{to} gives the end position. #' #' Indexes given by \code{from} or \code{to} are of course 1-based, #' i.e., an index 1 denotes the first character #' in a string. This gives a typical R look-and-feel. #' #' For negative indexes in \code{from} or \code{to}, counting starts #' at the end of the string. For instance, index -1 denotes the last code point #' in the string.
#' #' If you wish to test for a pattern match at an arbitrary #' position in \code{str}, use \code{\link{stri_detect}}. #' #' \code{stri_startswith} and \code{stri_endswith} are convenience functions. #' They call either \code{stri_*_fixed}, \code{stri_*_coll}, #' or \code{stri_*_charclass}, depending on the argument used. #' Relying on these underlying functions directly will make your code run #' slightly faster. #' #' Note that testing for a pattern match at the start or end of a string #' has not been implemented separately for regex patterns. #' For that you may use the '\code{^}' and '\code{$}' meta-characters, #' see \link{stringi-search-regex}. #' #' @param str character vector #' @param pattern,fixed,coll,charclass character vector defining search patterns; #' for more details refer to \link{stringi-search} #' @param from integer vector #' @param to integer vector #' @param negate single logical value; whether a no-match to a pattern #' is rather of interest #' @param opts_collator,opts_fixed a named list used to tune up #' the search engine's settings; see \code{\link{stri_opts_collator}} #' and \code{\link{stri_opts_fixed}}, respectively; \code{NULL} #' for the defaults #' @param ... supplementary arguments passed to the underlying functions, #' including additional settings for \code{opts_collator}, \code{opts_fixed}, #' and so on. #' #' @return Each function returns a logical vector. #' #' #' @examples #' stri_startswith_charclass(' trim me! 
#' ', '\\p{WSpace}')
#' stri_startswith_fixed(c('a1', 'a2', 'b3', 'a4', 'c5'), 'a')
#' stri_detect_regex(c('a1', 'a2', 'b3', 'a4', 'c5'), '^a')
#' stri_startswith_fixed('ababa', 'ba')
#' stri_startswith_fixed('ababa', 'ba', from=2)
#' stri_startswith_coll(c('a1', 'A2', 'b3', 'A4', 'C5'), 'a', strength=1)
#' pat <- stri_paste('\u0635\u0644\u0649 \u0627\u0644\u0644\u0647 ',
#' '\u0639\u0644\u064a\u0647 \u0648\u0633\u0644\u0645XYZ')
#' stri_endswith_coll('\ufdfa\ufdfa\ufdfaXYZ', pat, strength=1)
#'
#' @family search_detect
#' @export
#' @rdname stri_startsendswith
stri_startswith <- function(str, ..., fixed, coll, charclass)
{
    # record which of the three pattern arguments the caller supplied
    given <- c(
        fixed = !missing(fixed),
        coll = !missing(coll),
        charclass = !missing(charclass)
    )

    # exactly one search engine has to be selected
    if (sum(given) != 1) {
        stop("you have to specify either `fixed`, `coll`, or `charclass`")
    }

    # dispatch to the engine-specific implementation
    if (given[["fixed"]]) {
        stri_startswith_fixed(str, fixed, ...)
    } else if (given[["coll"]]) {
        stri_startswith_coll(str, coll, ...)
    } else {
        stri_startswith_charclass(str, charclass, ...)
    }
}


#' @export
#' @rdname stri_startsendswith
stri_endswith <- function(str, ..., fixed, coll, charclass)
{
    # record which of the three pattern arguments the caller supplied
    given <- c(
        fixed = !missing(fixed),
        coll = !missing(coll),
        charclass = !missing(charclass)
    )

    # exactly one search engine has to be selected
    if (sum(given) != 1) {
        stop("you have to specify either `fixed`, `coll`, or `charclass`")
    }

    # dispatch to the engine-specific implementation
    if (given[["fixed"]]) {
        stri_endswith_fixed(str, fixed, ...)
    } else if (given[["coll"]]) {
        stri_endswith_coll(str, coll, ...)
    } else {
        stri_endswith_charclass(str, charclass, ...)
    }
}


#' @export
#' @rdname stri_startsendswith
stri_startswith_fixed <- function(str, pattern, from = 1L, negate = FALSE, ..., opts_fixed = NULL)
{
    # settings given via `...` are merged with `opts_fixed`
    # and passed through stri_opts_fixed()
    if (!missing(...)) {
        merged <- as.list(c(opts_fixed, ...))
        opts_fixed <- do.call(stri_opts_fixed, merged)
    }

    .Call(C_stri_startswith_fixed, str, pattern, from, negate, opts_fixed)
}


#' @export
#' @rdname stri_startsendswith
stri_endswith_fixed <- function(str, pattern, to = -1L, negate = FALSE, ..., opts_fixed = NULL)
{
    # settings given via `...` are merged with `opts_fixed`
    # and passed through stri_opts_fixed()
    if (!missing(...)) {
        merged <- as.list(c(opts_fixed, ...))
        opts_fixed <- do.call(stri_opts_fixed, merged)
    }

    .Call(C_stri_endswith_fixed, str, pattern, to, negate, opts_fixed)
}


#' @export
#' @rdname stri_startsendswith
stri_startswith_charclass <- function(str, pattern, from = 1L, negate = FALSE)
{
    # no tunable options here: all arguments go straight to the native routine
    .Call(C_stri_startswith_charclass, str, pattern, from, negate)
}


#' @export
#' @rdname stri_startsendswith
stri_endswith_charclass <- function(str, pattern, to = -1L, negate = FALSE)
{
    # no tunable options here: all arguments go straight to the native routine
    .Call(C_stri_endswith_charclass, str, pattern, to, negate)
}


#' @export
#' @rdname stri_startsendswith
stri_startswith_coll <- function(str, pattern, from = 1L, negate = FALSE, ..., opts_collator = NULL)
{
    # settings given via `...` are merged with `opts_collator`
    # and passed through stri_opts_collator()
    if (!missing(...)) {
        merged <- as.list(c(opts_collator, ...))
        opts_collator <- do.call(stri_opts_collator, merged)
    }

    .Call(C_stri_startswith_coll, str, pattern, from, negate, opts_collator)
}


#' @export
#' @rdname stri_startsendswith
stri_endswith_coll <- function(str, pattern, to = -1L, negate = FALSE, ..., opts_collator = NULL)
{
    # settings given via `...` are merged with `opts_collator`
    # and passed through stri_opts_collator()
    if (!missing(...)) {
        merged <- as.list(c(opts_collator, ...))
        opts_collator <- do.call(stri_opts_collator, merged)
    }

    .Call(C_stri_endswith_coll, str, pattern, to, negate, opts_collator)
}
stringi/R/internal_test.R0000644000176200001440000000516214750110641015133 0ustar liggesusers# kate: default-dictionary en_US
## This file is part of the 'stringi' package for R.
## Copyright (c) 2013-2025, Marek Gagolewski
## All rights reserved.
## ## Redistribution and use in source and binary forms, with or without ## modification, are permitted provided that the following conditions are met: ## ## 1. Redistributions of source code must retain the above copyright notice, ## this list of conditions and the following disclaimer. ## ## 2. Redistributions in binary form must reproduce the above copyright notice, ## this list of conditions and the following disclaimer in the documentation ## and/or other materials provided with the distribution. ## ## 3. Neither the name of the copyright holder nor the names of its ## contributors may be used to endorse or promote products derived from ## this software without specific prior written permission. ## ## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ## 'AS IS' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, ## BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS ## FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT ## HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, ## SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ## PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; ## OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, ## WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE ## OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, ## EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # Check R encoding marking [internal, DEBUG only] # # This is an internal function (no-export & no-manual) - test how R marks # ASCII/LATIN1/UTF8/BYTES encodings (see also \code{?Encoding}). 
#
# Results are printed on STDERR
#
# @param str character vector
# @return value of no interest (returned invisibly)
.stri_test_Rmark <- function(str)
{
    invisible(.Call(C_stri_test_Rmark, str))
}


# For testing StriContainerUTF16's performance [internal, DEBUG only]
#
# @param str character vector
# @return value of no interest
.stri_test_StriContainerUTF16 <- function(str)
{
    .Call(C_stri_test_UnicodeContainer16, str)
}


# For testing StriContainerUTF8's performance [internal, DEBUG only]
#
# @param str character vector
# @return value of no interest
.stri_test_StriContainerUTF8 <- function(str)
{
    .Call(C_stri_test_UnicodeContainer8, str)
}


# For testing .Call performance [internal, DEBUG only]
#
# @param x some object
# @return \code{x}
.stri_test_returnasis <- function(x)
{
    .Call(C_stri_test_returnasis, x)
}
stringi/R/pad.R0000644000176200001440000001134014750110641013017 0ustar liggesusers# kate: default-dictionary en_US
## This file is part of the 'stringi' package for R.
## Copyright (c) 2013-2025, Marek Gagolewski
## All rights reserved.
##
## Redistribution and use in source and binary forms, with or without
## modification, are permitted provided that the following conditions are met:
##
## 1. Redistributions of source code must retain the above copyright notice,
## this list of conditions and the following disclaimer.
##
## 2. Redistributions in binary form must reproduce the above copyright notice,
## this list of conditions and the following disclaimer in the documentation
## and/or other materials provided with the distribution.
##
## 3. Neither the name of the copyright holder nor the names of its
## contributors may be used to endorse or promote products derived from
## this software without specific prior written permission.
##
## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
## 'AS IS' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING,
## BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
## FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT ## HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, ## SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ## PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; ## OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, ## WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE ## OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, ## EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #' @title #' Pad (Center/Left/Right Align) a String #' #' @description #' Add multiple \code{pad} characters at the given \code{side}(s) of each string #' so that each output string is of total width of at least \code{width}. #' These functions may be used to center or left/right-align each string. #' #' @details #' Vectorized over \code{str}, \code{width}, and \code{pad}. #' Each string in \code{pad} should consist of code points of total width #' equal to 1 or, if \code{use_length} is \code{TRUE}, exactly one code point. #' #' \code{stri_pad} is a convenience function, which dispatches #' to \code{stri_pad_*}. #' #' Note that Unicode code points may have various widths when #' printed on the console and that, by default, the function takes that #' into account. By changing the state of the \code{use_length} #' argument, this function starts acting like each code point #' was of width 1. This feature should rather be used with #' text in Latin script. #' #' See \code{\link{stri_trim_left}} (among others) for reverse operation. #' Also check out \code{\link{stri_wrap}} for line wrapping.
#'
#' @param str character vector
#' @param width integer vector giving minimal output string lengths
#' @param side [\code{stri_pad} only] single character string;
#' sides on which padding character is added
#' (\code{left} (default), \code{right}, or \code{both})
#' @param pad character vector giving padding code points
#' @param use_length single logical value; should the number of code
#' points be used instead of the total code point width
#' (see \code{\link{stri_width}})?
#'
#' @return These functions return a character vector.
#'
#' @examples
#' stri_pad_left('stringi', 10, pad='#')
#' stri_pad_both('stringi', 8:12, pad='*')
#' # center on screen:
#' cat(stri_pad_both(c('the', 'string', 'processing', 'package'),
#' getOption('width')*0.9), sep='\n')
#' cat(stri_pad_both(c('\ud6c8\ubbfc\uc815\uc74c', # takes width into account
#' stri_trans_nfkd('\ud6c8\ubbfc\uc815\uc74c'), 'abcd'),
#' width=10), sep='\n')
#'
#' @family length
#' @rdname stri_pad
#' @export
stri_pad_both <- function(
    str,
    width = floor(0.9 * getOption("width")),
    pad = " ",
    use_length = FALSE
) {
    # third argument of the native routine is the side code; 2L is used for `both`
    .Call(C_stri_pad, str, width, 2L, pad, use_length)
}


#' @rdname stri_pad
#' @export
stri_pad_left <- function(
    str,
    width = floor(0.9 * getOption("width")),
    pad = " ",
    use_length = FALSE
) {
    # third argument of the native routine is the side code; 0L is used for `left`
    .Call(C_stri_pad, str, width, 0L, pad, use_length)
}


#' @rdname stri_pad
#' @export
stri_pad_right <- function(
    str,
    width = floor(0.9 * getOption("width")),
    pad = " ",
    use_length = FALSE
) {
    # third argument of the native routine is the side code; 1L is used for `right`
    .Call(C_stri_pad, str, width, 1L, pad, use_length)
}


#' @rdname stri_pad
#' @export
stri_pad <- function(
    str,
    width = floor(0.9 * getOption("width")),
    side = c("left", "right", "both"),
    pad = " ",
    use_length = FALSE
) {
    # `left` is the default for compatibility with stringr
    side <- match.arg(side)  # this is slow

    # match.arg() guarantees that `side` is one of the three values below
    if (side == "both") {
        stri_pad_both(str, width, pad, use_length)
    } else if (side == "left") {
        stri_pad_left(str, width, pad, use_length)
    } else {
        stri_pad_right(str, width, pad, use_length)
    }
}
stringi/R/opts.R0000644000176200001440000003731114750110641013246 0ustar liggesusers# kate: default-dictionary en_US ## This file is part of the 'stringi' package for R. ## Copyright (c) 2013-2025, Marek Gagolewski ## All rights reserved. ## ## Redistribution and use in source and binary forms, with or without ## modification, are permitted provided that the following conditions are met: ## ## 1. Redistributions of source code must retain the above copyright notice, ## this list of conditions and the following disclaimer. ## ## 2. Redistributions in binary form must reproduce the above copyright notice, ## this list of conditions and the following disclaimer in the documentation ## and/or other materials provided with the distribution. ## ## 3. Neither the name of the copyright holder nor the names of its ## contributors may be used to endorse or promote products derived from ## this software without specific prior written permission. ## ## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ## 'AS IS' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, ## BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS ## FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT ## HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, ## SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ## PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; ## OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, ## WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE ## OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, ## EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#' @title #' Generate a List with Collator Settings #' #' @description #' A convenience function to tune the \pkg{ICU} Collator's behavior, #' e.g., in \code{\link{stri_compare}}, \code{\link{stri_order}}, #' \code{\link{stri_unique}}, \code{\link{stri_duplicated}}, #' as well as \code{\link{stri_detect_coll}} #' and other \link{stringi-search-coll} functions. #' #' #' @details #' \pkg{ICU}'s \emph{collator} performs a locale-aware, natural-language #' alike string comparison. #' This is a more reliable way of establishing relationships between #' strings than the one provided by base \R, and definitely #' one that is more complex and appropriate than ordinary bytewise #' comparison. #' #' #' @param locale single string, \code{NULL} or #' \code{''} for default locale #' @param strength single integer in \{1,2,3,4\}, which defines collation strength; #' \code{1} for the most permissive collation rules, \code{4} for the strictest #' ones #' @param alternate_shifted single logical value; \code{FALSE} #' treats all the code points with non-ignorable primary weights in the same way, #' \code{TRUE} causes code points with primary weights that are equal or below #' the variable top value to be ignored on primary level and moved to the quaternary level #' @param french single logical value; used in Canadian French; #' \code{TRUE} results in secondary weights being considered backwards #' @param uppercase_first single logical value; \code{NA} #' orders upper and lower case letters in accordance to their tertiary weights, #' \code{TRUE} forces upper case letters to sort before lower case letters, #' \code{FALSE} does the opposite #' @param case_level single logical value; #' controls whether an extra case level (positioned before the third level) is generated or not #' @param normalization #' single logical value; if \code{TRUE}, #' then incremental check is performed to see whether the input data is in #' the FCD form. 
#' If the data is not in the FCD form, incremental NFD
#' normalization is performed
#' @param normalisation alias of \code{normalization}
#' @param numeric single logical value;
#' when turned on, this attribute generates a collation key for
#' the numeric value of substrings of digits;
#' this is a way to get '100' to sort AFTER '2';
#' note that negative or non-integer numbers will not be ordered properly
#'
#' @return
#' Returns a named list object; missing settings are left with default values.
#'
#' @export
#' @family locale_sensitive
#' @family search_coll
#'
#'
#' @rdname stri_opts_collator
#'
#' @references
#' \emph{Collation} -- ICU User Guide,
#' \url{https://unicode-org.github.io/icu/userguide/collation/}
#'
#' \emph{ICU Collation Service Architecture} -- ICU User Guide,
#' \url{https://unicode-org.github.io/icu/userguide/collation/architecture.html}
#'
#' \emph{\code{icu::Collator} Class Reference} -- ICU4C API Documentation,
#' \url{https://unicode-org.github.io/icu-docs/apidoc/dev/icu4c/classicu_1_1Collator.html}
#'
#' @examples
#' stri_cmp('number100', 'number2')
#' stri_cmp('number100', 'number2', opts_collator=stri_opts_collator(numeric=TRUE))
#' stri_cmp('number100', 'number2', numeric=TRUE) # equivalent
#' stri_cmp('above mentioned', 'above-mentioned')
#' stri_cmp('above mentioned', 'above-mentioned', alternate_shifted=TRUE)
stri_opts_collator <- function(
    locale = NULL,
    strength = 3L,
    alternate_shifted = FALSE,
    french = FALSE,
    uppercase_first = NA,
    case_level = FALSE,
    normalization = FALSE,
    normalisation = normalization,
    numeric = FALSE
) {
    # only the settings explicitly passed by the caller are recorded;
    # the order of the assignments fixes the order of the output elements
    settings <- list()

    if (!missing(locale)) {
        settings["locale"] <- locale
    }
    if (!missing(strength)) {
        settings["strength"] <- strength
    }
    if (!missing(alternate_shifted)) {
        settings["alternate_shifted"] <- alternate_shifted
    }
    if (!missing(french)) {
        settings["french"] <- french
    }
    if (!missing(uppercase_first)) {
        settings["uppercase_first"] <- uppercase_first
    }
    if (!missing(case_level)) {
        settings["case_level"] <- case_level
    }
    if (!missing(numeric)) {
        settings["numeric"] <- numeric
    }

    # `normalization` takes precedence over its alias `normalisation`
    if (!missing(normalization)) {
        settings["normalization"] <- normalization
    } else if (!missing(normalisation)) {
        settings["normalization"] <- normalisation
    }

    settings
}


#' @rdname stri_opts_collator
#' @export
stri_coll <- stri_opts_collator


#' @title
#' Generate a List with Regex Matcher Settings
#'
#' @description
#' A convenience function to tune the \pkg{ICU} regular expressions
#' matcher's behavior, e.g., in \code{\link{stri_count_regex}}
#' and other \link{stringi-search-regex} functions.
#'
#' @details
#' Note that some regex settings may be changed using ICU regex flags
#' inside regexes. For example, \code{'(?i)pattern'} performs
#' a case-insensitive match of a given pattern,
#' see the \pkg{ICU} User Guide entry on Regular Expressions
#' in the References section or \link{stringi-search-regex}.
#'
#' @param case_insensitive logical; enables case insensitive matching [regex flag \code{(?i)}]
#' @param comments logical; allows white space and comments within patterns [regex flag \code{(?x)}]
#' @param dotall logical; if set, `\code{.}` matches line terminators,
#' otherwise matching of `\code{.}` stops at a line end [regex flag \code{(?s)}]
#' @param dot_all alias of \code{dotall}
#' @param literal logical; if set, treat the entire pattern as a literal string:
#' metacharacters or escape sequences in the input sequence will be given no special meaning;
#' note that in most cases you would rather use the \link{stringi-search-fixed}
#' facilities in this case
#' @param multiline logical; controls the behavior of `\code{$}` and `\code{^}`.
#' If set, recognize line terminators within a string, otherwise,
#' match only at start and end of input string [regex flag \code{(?m)}]
#' @param multi_line alias of \code{multiline}
#' @param unix_lines logical; Unix-only line endings;
#' when enabled, only \code{U+000a} is recognized as a
#' line ending by `\code{.}`, `\code{$}`, and `\code{^}`.
#' @param uword logical; Unicode word boundaries; #' if set, uses the Unicode TR 29 definition of word boundaries; #' warning: Unicode word boundaries are quite different from traditional #' regex word boundaries. [regex flag \code{(?w)}] #' See \url{https://unicode.org/reports/tr29/#Word_Boundaries} #' @param error_on_unknown_escapes logical; #' whether to generate an error on unrecognized backslash escapes; #' if set, fail with an error on patterns that contain backslash-escaped ASCII #' letters without a known special meaning; #' otherwise, these escaped letters represent themselves #' @param time_limit integer; processing time limit, in ~milliseconds (but not precisely so, #' depends on the CPU speed), for match operations; #' setting a limit is desirable if poorly written regexes are expected on input; #' 0 for no limit #' @param stack_limit integer; maximal size, in bytes, of the heap storage available #' for the match backtracking stack; setting a limit is desirable if poorly #' written regexes are expected on input; 0 for no limit #' #' @return #' Returns a named list object; missing settings are left with default values. 
#'
#' @export
#' @family search_regex
#'
#' @references
#' \emph{\code{enum URegexpFlag}: Constants for Regular Expression Match Modes}
#' -- ICU4C API Documentation,
#' \url{https://unicode-org.github.io/icu-docs/apidoc/dev/icu4c/uregex_8h.html}
#'
#' \emph{Regular Expressions} -- ICU User Guide,
#' \url{https://unicode-org.github.io/icu/userguide/strings/regexp.html}
#'
#' @examples
#' stri_detect_regex('ala', 'ALA') # case-sensitive by default
#' stri_detect_regex('ala', 'ALA', opts_regex=stri_opts_regex(case_insensitive=TRUE))
#' stri_detect_regex('ala', 'ALA', case_insensitive=TRUE) # equivalent
#' stri_detect_regex('ala', '(?i)ALA') # equivalent
stri_opts_regex <- function(
    case_insensitive,
    comments,
    dotall,
    dot_all = dotall,
    literal,
    multiline,
    multi_line = multiline,
    unix_lines,
    uword,
    error_on_unknown_escapes,
    time_limit = 0L,
    stack_limit = 0L
) {
    # only the settings explicitly passed by the caller are recorded;
    # the order of the assignments fixes the order of the output elements
    settings <- list()

    if (!missing(case_insensitive)) {
        settings["case_insensitive"] <- case_insensitive
    }
    if (!missing(comments)) {
        settings["comments"] <- comments
    }
    if (!missing(literal)) {
        settings["literal"] <- literal
    }
    if (!missing(unix_lines)) {
        settings["unix_lines"] <- unix_lines
    }
    if (!missing(uword)) {
        settings["uword"] <- uword
    }
    if (!missing(error_on_unknown_escapes)) {
        settings["error_on_unknown_escapes"] <- error_on_unknown_escapes
    }
    if (!missing(stack_limit)) {
        settings["stack_limit"] <- stack_limit
    }
    if (!missing(time_limit)) {
        settings["time_limit"] <- time_limit
    }

    # `dotall` takes precedence over its alias `dot_all`
    if (!missing(dotall)) {
        settings["dotall"] <- dotall
    } else if (!missing(dot_all)) {
        settings["dotall"] <- dot_all
    }

    # `multiline` takes precedence over its alias `multi_line`
    if (!missing(multiline)) {
        settings["multiline"] <- multiline
    } else if (!missing(multi_line)) {
        settings["multiline"] <- multi_line
    }

    settings
}


#' @title
#' Generate a List with BreakIterator Settings
#'
#' @description
#' A convenience function to tune the \pkg{ICU} \code{BreakIterator}'s behavior
#' in some text boundary analysis functions, see
#' \link{stringi-search-boundaries}.
#' #' @details #' The \code{skip_*} family of settings may be used to prevent performing #' any special actions on particular types of text boundaries, e.g., #' in case of the \code{\link{stri_locate_all_boundaries}} and #' \code{\link{stri_split_boundaries}} functions. #' #' Note that custom break iterator rules (advanced users only) #' should be specified as a single string. #' For a detailed description of the syntax of RBBI rules, please refer #' to the ICU User Guide on Boundary Analysis. #' #' @param type single string; either the break iterator type, one of \code{character}, #' \code{line_break}, \code{sentence}, \code{word}, #' or a custom set of ICU break iteration rules; #' see \link{stringi-search-boundaries} #' @param locale single string, \code{NULL} or \code{''} for default locale #' @param skip_word_none logical; perform no action for 'words' that #' do not fit into any other categories #' @param skip_word_number logical; perform no action for words that #' appear to be numbers #' @param skip_word_letter logical; perform no action for words that #' contain letters, excluding hiragana, katakana, or ideographic characters #' @param skip_word_kana logical; perform no action for words #' containing kana characters #' @param skip_word_ideo logical; perform no action for words #' containing ideographic characters #' @param skip_line_soft logical; perform no action for soft line breaks, #' i.e., positions where a line break is acceptable but not required #' @param skip_line_hard logical; perform no action for hard, #' or mandatory line breaks #' @param skip_sentence_term logical; perform no action for sentences #' ending with a sentence terminator ('\code{.}', '\code{,}', '\code{?}', #' '\code{!}'), possibly followed by a hard separator #' (\code{CR}, \code{LF}, \code{PS}, etc.) 
#' @param skip_sentence_sep logical; perform no action for sentences
#' that do not contain an ending sentence terminator, but are ended
#' by a hard separator or end of input
#'
#' @return
#' Returns a named list object.
#' Omitted \code{skip_*} values act as if they have been set to \code{FALSE}.
#'
#' @export
#' @family text_boundaries
#'
#' @references
#' \emph{\code{ubrk.h} File Reference} -- ICU4C API Documentation,
#' \url{https://unicode-org.github.io/icu-docs/apidoc/dev/icu4c/ubrk_8h.html}
#'
#' \emph{Boundary Analysis} -- ICU User Guide,
#' \url{https://unicode-org.github.io/icu/userguide/boundaryanalysis/}
stri_opts_brkiter <- function(
    type,
    locale,
    skip_word_none,
    skip_word_number,
    skip_word_letter,
    skip_word_kana,
    skip_word_ideo,
    skip_line_soft,
    skip_line_hard,
    skip_sentence_term,
    skip_sentence_sep
) {
    # only the settings explicitly passed by the caller are recorded;
    # the order of the assignments fixes the order of the output elements
    settings <- list()

    if (!missing(type)) {
        settings["type"] <- type
    }
    if (!missing(locale)) {
        settings["locale"] <- locale
    }
    if (!missing(skip_word_none)) {
        settings["skip_word_none"] <- skip_word_none
    }
    if (!missing(skip_word_number)) {
        settings["skip_word_number"] <- skip_word_number
    }
    if (!missing(skip_word_letter)) {
        settings["skip_word_letter"] <- skip_word_letter
    }
    if (!missing(skip_word_kana)) {
        settings["skip_word_kana"] <- skip_word_kana
    }
    if (!missing(skip_word_ideo)) {
        settings["skip_word_ideo"] <- skip_word_ideo
    }
    if (!missing(skip_line_soft)) {
        settings["skip_line_soft"] <- skip_line_soft
    }
    if (!missing(skip_line_hard)) {
        settings["skip_line_hard"] <- skip_line_hard
    }
    if (!missing(skip_sentence_term)) {
        settings["skip_sentence_term"] <- skip_sentence_term
    }
    if (!missing(skip_sentence_sep)) {
        settings["skip_sentence_sep"] <- skip_sentence_sep
    }

    settings
}


#' @title
#' Generate a List with Fixed Pattern Search Engine's Settings
#'
#' @description
#' A convenience function used to tune up the behavior of \code{stri_*_fixed}
#' functions, see \link{stringi-search-fixed}.
#'
#' @details
#' Case-insensitive matching uses a simple, single-code point case mapping
#' (via ICU's \code{u_toupper()} function).
#' Full case mappings should be used whenever possible because they produce
#' better results by working on whole strings. They also take into account
#' the string context and the language, see \link{stringi-search-coll}.
#'
#' Searching for overlapping pattern matches is available in
#' \code{\link{stri_extract_all_fixed}}, \code{\link{stri_locate_all_fixed}},
#' and \code{\link{stri_count_fixed}} functions.
#'
#' @param case_insensitive logical; enable simple case insensitive matching
#' @param overlap logical; enable overlapping matches' detection
#'
#' @return
#' Returns a named list object.
#'
#' @export
#' @family search_fixed
#'
#' @references
#' \emph{C/POSIX Migration} -- ICU User Guide,
#' \url{https://unicode-org.github.io/icu/userguide/icu/posix.html}
#'
#' @examples
#' stri_detect_fixed('ala', 'ALA') # case-sensitive by default
#' stri_detect_fixed('ala', 'ALA', opts_fixed=stri_opts_fixed(case_insensitive=TRUE))
#' stri_detect_fixed('ala', 'ALA', case_insensitive=TRUE) # equivalent
stri_opts_fixed <- function(case_insensitive = FALSE, overlap = FALSE)
{
    # only the settings explicitly passed by the caller are recorded
    settings <- list()

    if (!missing(case_insensitive)) {
        settings["case_insensitive"] <- case_insensitive
    }
    if (!missing(overlap)) {
        settings["overlap"] <- overlap
    }

    settings
}
stringi/R/time_zone.R0000644000176200001440000002005714750110641014251 0ustar liggesusers# kate: default-dictionary en_US
## This file is part of the 'stringi' package for R.
## Copyright (c) 2013-2025, Marek Gagolewski
## All rights reserved.
##
## Redistribution and use in source and binary forms, with or without
## modification, are permitted provided that the following conditions are met:
##
## 1. Redistributions of source code must retain the above copyright notice,
## this list of conditions and the following disclaimer.
##
## 2. Redistributions in binary form must reproduce the above copyright notice,
## this list of conditions and the following disclaimer in the documentation
## and/or other materials provided with the distribution.
##
## 3.
Neither the name of the copyright holder nor the names of its ## contributors may be used to endorse or promote products derived from ## this software without specific prior written permission. ## ## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ## 'AS IS' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, ## BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS ## FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT ## HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, ## SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ## PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; ## OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, ## WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE ## OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, ## EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #' @title #' List Available Time Zone Identifiers #' #' @description #' Returns a list of available time zone identifiers. #' #' @details #' If \code{offset} and \code{region} are \code{NA} (the default), then #' all time zones are returned. Otherwise, #' only time zone identifiers with a given raw offset from GMT #' and/or time zones corresponding to a given region are provided. #' Note that the effect of daylight savings time is ignored. #' #' A time zone represents an offset applied to the Greenwich Mean Time (GMT) #' to obtain local time (Universal Coordinated Time, or UTC, is similar, #' but not precisely identical, to GMT; in \pkg{ICU} the two terms #' are used interchangeably since \pkg{ICU} does not concern itself with #' either leap seconds or historical behavior). #' The offset might vary throughout the year, if daylight savings time (DST) #' is used, or might be the same all year long. #' Typically, regions closer to the equator do not use DST. 
#' If DST is in use, then specific rules define the point where #' the offset changes and the amount by which it changes. #' #' If DST is observed, then three additional bits of information are needed: #' \enumerate{ #' \item The precise date and time during the year when DST begins. #' In the first half of the year it is in the northern hemisphere, #' and in the second half of the year it is in the southern hemisphere. #' \item The precise date and time during the year when DST ends. #' In the first half of the year it is in the southern hemisphere, #' and in the second half of the year it is in the northern hemisphere. #' \item The amount by which the GMT offset changes when DST is in effect. #' This is almost always one hour. #' } #' #' #' @param offset single numeric value; #' a given raw offset from GMT, in hours; #' \code{NA} for all offsets #' @param region single string; #' an ISO 3166 two-letter country code or UN M.49 three-digit area code; #' \code{NA} for all regions #' #' @return Returns a character vector.
#'
#' @references
#' \emph{TimeZone} class -- ICU API Documentation,
#' \url{https://unicode-org.github.io/icu-docs/apidoc/dev/icu4c/classicu_1_1TimeZone.html}
#'
#' \emph{ICU TimeZone classes} -- ICU User Guide,
#' \url{https://unicode-org.github.io/icu/userguide/datetime/timezone/}
#'
#' \emph{Date/Time Services} -- ICU User Guide,
#' \url{https://unicode-org.github.io/icu/userguide/datetime/}
#'
#' @examples
#' stri_timezone_list()
#' stri_timezone_list(offset=1)
#' stri_timezone_list(offset=5.5)
#' stri_timezone_list(offset=5.75)
#' stri_timezone_list(region='PL')
#' stri_timezone_list(region='US', offset=-10)
#'
#' # Fetch information on all time zones
#' do.call(rbind.data.frame,
#' lapply(stri_timezone_list(), function(tz) stri_timezone_info(tz)))
#'
#' @family datetime
#' @family timezone
#' @export
stri_timezone_list <- function(region=NA_character_, offset=NA_integer_)
{
    # fetch the identifiers from the native routine first ...
    tz_ids <- .Call(C_stri_timezone_list, region, offset)

    # ... then order them with a fixed-locale, numeric-aware collator
    stri_sort(tz_ids, locale="en_US", numeric=TRUE, strength=1)
}


#' @title
#' Set or Get Default Time Zone in \pkg{stringi}
#'
#' @description
#' \code{stri_timezone_set} changes the current default time zone for all functions
#' in the \pkg{stringi} package, i.e., establishes the meaning of the
#' ``\code{NULL} time zone'' argument to date/time processing functions.
#'
#' \code{stri_timezone_get} gets the current default time zone.
#'
#' For more information on time zone representation in \pkg{ICU}
#' and \pkg{stringi}, refer to \code{\link{stri_timezone_list}}.
#'
#' @details
#' Unless the default time zone has already been set using
#' \code{stri_timezone_set}, the default time zone is determined
#' by querying the OS with methods in \pkg{ICU}'s internal platform utilities.
#'
#' @param tz single string; time zone identifier
#'
#' @return
#' \code{stri_timezone_set} returns a string with
#' previously used timezone, invisibly.
#'
#' \code{stri_timezone_get} returns a single string
#' with the current default time zone.
#'
#' @references
#' \emph{TimeZone} class -- ICU API Documentation,
#' \url{https://unicode-org.github.io/icu-docs/apidoc/dev/icu4c/classicu_1_1TimeZone.html}
#'
#' @examples
#' \dontrun{
#' oldtz <- stri_timezone_set('Europe/Warsaw')
#' # ... many time zone-dependent operations
#' stri_timezone_set(oldtz) # restore previous default time zone
#' }
#'
#' @export
#' @family datetime
#' @family timezone
#' @rdname stri_timezone_set
stri_timezone_get <- function()
{
    # the default time zone is the one reported when no `tz` is given
    tz_info <- stri_timezone_info()
    tz_info$ID
}


#' @rdname stri_timezone_set
#' @export
stri_timezone_set <- function(tz)
{
    # remember the current default so that the caller can restore it later
    old_tz <- stri_timezone_get()
    .Call(C_stri_timezone_set, tz)
    invisible(old_tz)
}


#' @title
#' Query a Given Time Zone
#'
#' @description
#' Provides some basic information on a given time zone identifier.
#'
#' @details
#' Used to fetch basic information
#' on any supported time zone.
#'
#' For more information on time zone representation in \pkg{ICU},
#' see \code{\link{stri_timezone_list}}.
#'
#' @param tz \code{NULL} or \code{''} for default time zone,
#' or a single string with time zone ID otherwise
#' @param display_type single string;
#' one of \code{'short'}, \code{'long'}, \code{'generic_short'},
#' \code{'generic_long'}, \code{'gmt_short'}, \code{'gmt_long'},
#' \code{'common'}, \code{'generic_location'}
#' @param locale \code{NULL} or \code{''} for default locale,
#' or a single string with locale identifier
#'
#' @return
#' Returns a list with the following named components:
#' \enumerate{
#' \item \code{ID} (time zone identifier),
#' \item \code{Name} (localized human-readable time zone name),
#' \item \code{Name.Daylight} (localized human-readable time zone
#' name when DST is used, if available),
#' \item \code{Name.Windows} (Windows time zone ID, if available),
#' \item \code{RawOffset} (raw GMT offset, in hours, before taking
#' daylight savings into account), and
#' \item \code{UsesDaylightTime} (states whether a time zone uses
#' daylight savings time in the current
Gregorian calendar year).
#' }
#'
#' @examples
#' stri_timezone_info()
#' stri_timezone_info(locale='sk_SK')
#' sapply(c('short', 'long', 'generic_short', 'generic_long',
#'          'gmt_short', 'gmt_long', 'common', 'generic_location'),
#'     function(e) stri_timezone_info('Europe/London', display_type=e))
#'
#' @family datetime
#' @family timezone
#' @export
stri_timezone_info <- function(tz=NULL, locale=NULL, display_type="long") {
    # TODO: when does DST start???
    # The three arguments are handed over unchanged to the compiled routine;
    # no validation or defaulting beyond the formals above happens in R.
    .Call(C_stri_timezone_info, tz, locale, display_type)
}
stringi/R/internal_prepare_arg.R0000644000176200001440000002723114750110641016444 0ustar liggesusers# kate: default-dictionary en_US
## This file is part of the 'stringi' package for R.
## Copyright (c) 2013-2025, Marek Gagolewski
## All rights reserved.
##
## Redistribution and use in source and binary forms, with or without
## modification, are permitted provided that the following conditions are met:
##
## 1. Redistributions of source code must retain the above copyright notice,
## this list of conditions and the following disclaimer.
##
## 2. Redistributions in binary form must reproduce the above copyright notice,
## this list of conditions and the following disclaimer in the documentation
## and/or other materials provided with the distribution.
##
## 3. Neither the name of the copyright holder nor the names of its
## contributors may be used to endorse or promote products derived from
## this software without specific prior written permission.
##
## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
## 'AS IS' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING,
## BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
## FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT ## HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, ## SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ## PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; ## OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, ## WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE ## OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, ## EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #' @title #' Passing Arguments to Functions in \pkg{stringi} #' #' @description #' Below we explain how \pkg{stringi} deals with its functions' arguments. #' #' If some function violates one of the following rules #' (for a very important reason), #' this is clearly indicated in its documentation (with discussion). #' #' @section Coercion of Arguments: #' #' When a character vector argument is expected, factors and other vectors #' coercible to character vectors are silently converted with #' \code{\link{as.character}}, otherwise an error is generated. #' Coercion from a list which does not consist of length-1 atomic vectors #' issues a warning. #' #' When a logical, numeric, or integer vector argument is expected, #' factors are converted with \code{as.*(\link{as.character}(...))}, #' and other coercible vectors are converted with \code{as.*}, #' otherwise an error is generated. #' #' #' @section Vectorization: #' #' Almost all functions are vectorized with respect to all their arguments #' and the recycling rule is applied whenever necessary. #' Due to this property you may, #' for instance, search for one pattern in each given string, #' search for each pattern in one given string, #' and search for the i-th pattern within the i-th string. #' #' We of course took great care of performance issues: #' e.g., in regular expression searching, regex matchers are reused #' from iteration to iteration, as long as it is possible.
#' #' Functions with some non-vectorized arguments are rare: #' e.g., regular expression matcher's settings are established #' once per each call. #' #' Some functions #' assume that a vector with one element is given #' as an argument (like \code{collapse} in \code{\link{stri_join}}). #' In such cases, if an empty vector is given you will get an error #' and for vectors with more than one element, a warning will be #' generated (only the first element will be used). #' #' You may find details on vectorization behavior in the man pages #' on each particular function of your interest. #' #' @section Handling Missing Values (\code{NA}s): #' #' \pkg{stringi} handles missing values consistently. #' For any vectorized operation, if at least one vector element is missing, #' then the corresponding resulting value is also set to \code{NA}. #' #' #' @section Preserving Object Attributes: #' #' Generally, all our functions drop input objects' attributes #' (e.g., \code{\link{names}}, \code{\link{dim}}, etc.). #' This is due to deep vectorization as well as for efficiency reasons. #' If the preservation of attributes is needed, #' important attributes can be manually copied. Alternatively, the notation #' \code{x[] <- stri_...(x, ...)} can sometimes be used too. #' #' @rdname about_arguments #' @name about_arguments #' @aliases arguments stringi-arguments #' @family stringi_general_topics #' @family prepare_arg invisible(NULL) # @title # Prepare a String Vector Argument [internal] # # @description # This is an internal function. However, the interested user may play with it # in order to get more insight on how \pkg{stringi} deals with its # functions' arguments. See `Value' section for details. # # @param x argument to be checked # # @return # If \code{x} is a factor or an object equipped with a \code{class} # attribute or a list, then \code{\link{as.character}} is called. # If \code{x} is a string, it is returned with no change.
# If an atomic vector or a matrix is given, it is coerced to a character vector.
# If it is a \code{name} object, a character vector of length 1 is generated.
# Otherwise the function throws an error.
#
# @family prepare_arg
stri_prepare_arg_string <- function(x)
{
    # text of the expression the caller supplied for `x`
    # (presumably used by the compiled routine in its diagnostics -- confirm)
    arg_expr <- deparse(substitute(x))
    .Call(C_stri_prepare_arg_string, x, arg_expr)
}


# @title
# Prepare a Numeric Vector Argument [internal]
#
# @description
# This is an internal function. However, the interested user may play with it
# in order to get more insight on how \pkg{stringi} deals with its
# functions' arguments. See `Value' section for details.
# TODO: factors_as_strings
#
# @param x argument to be checked
#
# @return
# If \code{x} is a factor, \code{\link{as.character}} is called, and the
# resulting character vector is coerced to numeric.
# If it is an object equipped with a \code{class} attribute or a list,
# \code{as.double} is called.
# If it is a numeric vector, then it is returned with no change.
# If an atomic vector or a matrix is given, it is coerced to a numeric vector.
# Otherwise the function throws an error.
#
# @family prepare_arg
stri_prepare_arg_double <- function(x)
{
    # text of the expression the caller supplied for `x`
    # (presumably used by the compiled routine in its diagnostics -- confirm)
    arg_expr <- deparse(substitute(x))
    .Call(C_stri_prepare_arg_double, x, arg_expr)
}


# @title
# Prepare an Integer Vector Argument [internal]
#
# @description
# This is an internal function. However, the interested user may play with it
# in order to get more insight on how \pkg{stringi} deals with its
# functions' arguments. See `Value' section for details.
# TODO: factors_as_strings
#
# @param x argument to be checked
#
# @return
# If \code{x} is a factor, \code{\link{as.character}} is called, and the
# resulting character vector is coerced to integer.
# If it is an object equipped with a \code{class} attribute or a list,
# \code{as.integer} is called.
# If it is an integer vector, then it is returned with no change.
# If an atomic vector or a matrix is given, it is coerced to an integer vector.
# Otherwise the function throws an error.
#
# @family prepare_arg
stri_prepare_arg_integer <- function(x)
{
    # text of the expression the caller supplied for `x`
    # (presumably used by the compiled routine in its diagnostics -- confirm)
    arg_expr <- deparse(substitute(x))
    .Call(C_stri_prepare_arg_integer, x, arg_expr)
}


# @title
# Prepare a Logical Vector Argument [internal]
#
# @description
# This is an internal function. However, the interested user may play with it
# in order to get more insight on how \pkg{stringi} deals with its
# functions' arguments. See `Value' section for details.
# TODO: factors_as_strings
#
# @param x argument to be checked
#
# @return
# If \code{x} is a logical vector, it is returned with no change.
# If it is an object equipped with a \code{class} attribute or a list,
# \code{as.logical} is called.
# If \code{x} is a factor, \code{\link{as.character}} is called, and the
# resulting character vector is coerced to logical.
# If an atomic vector or a matrix is given, it is coerced to a logical vector.
# Otherwise the function throws an error.
#
# @family prepare_arg
stri_prepare_arg_logical <- function(x)
{
    # text of the expression the caller supplied for `x`
    # (presumably used by the compiled routine in its diagnostics -- confirm)
    arg_expr <- deparse(substitute(x))
    .Call(C_stri_prepare_arg_logical, x, arg_expr)
}


# @title
# Prepare a Raw Vector Argument [internal]
#
# @description
# This is an internal function. However, the interested user may play with it
# in order to get more insight on how \pkg{stringi} deals with its
# functions' arguments. See `Value' section for details.
# TODO: factors_as_strings
#
# @param x argument to be checked
#
# @return
# If \code{x} is a factor, \code{\link{as.character}} is called, and the
# resulting character vector is coerced to raw.
# If it is an object equipped with a \code{class} attribute or a list,
# \code{as.raw} is called.
# If \code{x} is a raw vector, it is returned with no change.
# If an atomic vector or a matrix is given, it is coerced to a raw vector.
# Otherwise the function throws an error.
#
# @family prepare_arg
stri_prepare_arg_raw <- function(x)
{
    # text of the expression the caller supplied for `x`
    # (presumably used by the compiled routine in its diagnostics -- confirm)
    arg_expr <- deparse(substitute(x))
    .Call(C_stri_prepare_arg_raw, x, arg_expr)
}


# @title
# Prepare a String Vector Argument [Single Value] [internal]
#
# @description
# This is an internal function.
However, the interested user may play with it
# in order to get more insight on how \pkg{stringi} deals with its
# functions' arguments. See `Value' section for details.
#
# @param x argument to be checked
# @return
# In the first place, \code{\link{stri_prepare_arg_string}} is called.
# On an empty vector, an error is generated.
# If there is more than one element, a warning is generated.
# A vector with one element (the first in \code{x}) is returned.
#
# @family prepare_arg
stri_prepare_arg_string_1 <- function(x)
{
    # text of the expression the caller supplied for `x`
    # (presumably used by the compiled routine in its diagnostics -- confirm)
    arg_expr <- deparse(substitute(x))
    .Call(C_stri_prepare_arg_string_1, x, arg_expr)
}


# @title
# Prepare a Numeric Vector Argument [Single Value] [internal]
#
# @description
# This is an internal function. However, the interested user may play with it
# in order to get more insight on how \pkg{stringi} deals with its
# functions' arguments. See `Value' section for details.
# TODO: factors_as_strings
#
# @param x argument to be checked
# @return
# In the first place, \code{\link{stri_prepare_arg_double}} is called.
# On an empty vector, an error is generated.
# If there is more than one element, a warning is generated.
# A vector with one element (the first in \code{x}) is returned.
#
# @family prepare_arg
stri_prepare_arg_double_1 <- function(x)
{
    # text of the expression the caller supplied for `x`
    # (presumably used by the compiled routine in its diagnostics -- confirm)
    arg_expr <- deparse(substitute(x))
    .Call(C_stri_prepare_arg_double_1, x, arg_expr)
}


# @title
# Prepare an Integer Vector Argument [Single Value] [internal]
#
# @description
# This is an internal function. However, the interested user may play with it
# in order to get more insight on how \pkg{stringi} deals with its
# functions' arguments. See `Value' section for details.
#
# TODO: factors_as_strings
#
# @param x argument to be checked
# @return
# In the first place, \code{\link{stri_prepare_arg_integer}} is called.
# On an empty vector, an error is generated.
# If there is more than one element, a warning is generated.
# A vector with one element (the first in \code{x}) is returned.
#
# @family prepare_arg
stri_prepare_arg_integer_1 <- function(x)
{
    # text of the expression the caller supplied for `x`
    # (presumably used by the compiled routine in its diagnostics -- confirm)
    arg_expr <- deparse(substitute(x))
    .Call(C_stri_prepare_arg_integer_1, x, arg_expr)
}


# @title
# Prepare a Logical Vector Argument [Single Value] [internal]
#
# @description
# This is an internal function. However, the interested user may play with it
# in order to get more insight on how \pkg{stringi} deals with its
# functions' arguments. See `Value' section for details.
#
# @param x argument to be checked
# @return
# In the first place, \code{\link{stri_prepare_arg_logical}} is called.
# On an empty vector, an error is generated.
# If there is more than one element, a warning is generated.
# A vector with one element (the first in \code{x}) is returned.
#
# @family prepare_arg
stri_prepare_arg_logical_1 <- function(x)
{
    # text of the expression the caller supplied for `x`
    # (presumably used by the compiled routine in its diagnostics -- confirm)
    arg_expr <- deparse(substitute(x))
    .Call(C_stri_prepare_arg_logical_1, x, arg_expr)
}
stringi/R/wrap.R0000644000176200001440000001626514750110641013237 0ustar liggesusers# kate: default-dictionary en_US
## This file is part of the 'stringi' package for R.
## Copyright (c) 2013-2025, Marek Gagolewski
## All rights reserved.
##
## Redistribution and use in source and binary forms, with or without
## modification, are permitted provided that the following conditions are met:
##
## 1. Redistributions of source code must retain the above copyright notice,
## this list of conditions and the following disclaimer.
##
## 2. Redistributions in binary form must reproduce the above copyright notice,
## this list of conditions and the following disclaimer in the documentation
## and/or other materials provided with the distribution.
##
## 3. Neither the name of the copyright holder nor the names of its
## contributors may be used to endorse or promote products derived from
## this software without specific prior written permission.
## ## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ## 'AS IS' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, ## BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS ## FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT ## HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, ## SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ## PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; ## OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, ## WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE ## OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, ## EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #' @title #' Word Wrap Text to Format Paragraphs #' #' @description #' This function breaks text paragraphs into lines, #' of total width (if it is possible) at most given \code{width}. #' #' @details #' Vectorized over \code{str}. #' #' If \code{whitespace_only} is \code{FALSE}, #' then \pkg{ICU}'s line-\code{BreakIterator} is used to determine #' text boundaries where a line break is possible. #' This is a locale-dependent operation. #' Otherwise, the breaks are only at white-spaces. #' #' Note that Unicode code points may have various widths when #' printed on the console and that this function, by default, takes that #' into account. By changing the state of the \code{use_length} #' argument, this function starts to act as if each code point #' was of width 1. #' #' If \code{normalize} is \code{FALSE}, #' then multiple white spaces between the word boundaries are #' preserved within each wrapped line. #' In such a case, none of the strings can contain \code{\\r}, \code{\\n}, #' or other new line characters, otherwise you will get an error. #' You should split the input text into lines #' or, for example, substitute line breaks with spaces #' before applying this function. 
#' #' If \code{normalize} is \code{TRUE}, then #' all consecutive white space (ASCII space, horizontal TAB, CR, LF) #' sequences are replaced with single ASCII spaces #' before actual string wrapping. Moreover, \code{\link{stri_split_lines}} #' and \code{\link{stri_trans_nfc}} are called on the input character vector. #' This is for compatibility with \code{\link{strwrap}}. #' #' The greedy algorithm (for \code{cost_exponent} being non-positive) #' provides a very simple way for word wrapping. #' It always puts as many words in each line as possible. #' This method -- contrary to the dynamic algorithm -- does not minimize #' the number of spaces left at the end of every line. #' The dynamic algorithm (a.k.a. Knuth's word wrapping algorithm) #' is more complex, but it returns text wrapped #' in a more aesthetic way. This method minimizes the squared #' (by default, see \code{cost_exponent}) number of spaces (raggedness) #' at the end of each line, so the text is more evenly arranged. #' Note that the cost of printing the last line is always zero.
#'
#' @param str character vector of strings to reformat
#' @param width single integer giving the suggested
#' maximal total width/number of code points per line
#' @param cost_exponent single numeric value, values not greater than zero
#' will select a greedy word-wrapping algorithm; otherwise
#' this value denotes the exponent in the cost function
#' of a (more aesthetic) dynamic programming-based algorithm
#' (values in [2, 3] are recommended)
#' @param simplify single logical value, see Value
#' @param normalize single logical value, see Details
#' @param normalise alias of \code{normalize}
#' @param indent single non-negative integer; gives the indentation of the
#' first line in each paragraph
#' @param exdent single non-negative integer; specifies the indentation
#' of subsequent lines in paragraphs
#' @param prefix,initial single strings; \code{prefix} is used as prefix for each
#' line except the first, for which \code{initial} is utilized
#' @param whitespace_only single logical value; allow breaks only at white-spaces?
#' if \code{FALSE}, \pkg{ICU}'s line break iterator is used to split text
#' into words, which is suitable for natural language processing
#' @param locale \code{NULL} or \code{''} for text boundary analysis following
#' the conventions of the default locale, or a single string with
#' locale identifier, see \link{stringi-locale}
#' @param use_length single logical value; should the number of code
#' points be used instead of the total code point width (see \code{\link{stri_width}})?
#'
#' @return
#' If \code{simplify} is \code{TRUE}, then a character vector is returned.
#' Otherwise, you will get a list of \code{length(str)} character vectors.
#'
#' @rdname stri_wrap
#' @family locale_sensitive
#' @family text_boundaries
#' @examples
#' s <- stri_paste(
#'    'Lorem ipsum dolor sit amet, consectetur adipisicing elit. Proin ',
#'    'nibh augue, suscipit a, scelerisque sed, lacinia in, mi. Cras vel ',
#'    'lorem. Etiam pellentesque aliquet tellus.')
#' cat(stri_wrap(s, 20, 0.0), sep='\n') # greedy
#' cat(stri_wrap(s, 20, 2.0), sep='\n') # dynamic
#' cat(stri_pad(stri_wrap(s), side='both'), sep='\n')
#'
#' @references
#' D.E. Knuth, M.F. Plass,
#' Breaking paragraphs into lines, \emph{Software: Practice and Experience} 11(11),
#' 1981, pp. 1119--1184.
#'
#' @export
stri_wrap <- function(str, width = floor(0.9 * getOption("width")), cost_exponent = 2,
    simplify = TRUE, normalize = TRUE, normalise = normalize, indent = 0,
    exdent = 0, prefix = "", initial = prefix, whitespace_only = FALSE,
    use_length = FALSE, locale = NULL)
{
    simplify <- as.logical(simplify)

    # `normalise` is a spelling alias of `normalize`;
    # when it is given explicitly, it takes precedence
    if (!missing(normalise))
        normalize <- normalise
    normalize <- as.logical(normalize)

    if (normalize) {
        # this will give an informative warning or error if sth is wrong
        # Every input string is split into lines which are then re-joined
        # with single spaces. vapply() (instead of sapply()) guarantees
        # a character vector with one string per element of `str`,
        # also for a zero-length input, where sapply() yields an empty list.
        str <- vapply(
            stri_split_lines(str),
            function(s) stri_flatten(s, collapse = " "),
            character(1)
        )
        # collapse runs of spaces/CR/LF/TAB to single spaces, then trim
        str <- stri_trim(stri_replace_all_charclass(str, "[\\u0020\\r\\n\\t]", " ",
            merge = TRUE))
        str <- stri_trans_nfc(str)
    }

    ret <- .Call(C_stri_wrap, str, width, cost_exponent, indent, exdent, prefix,
        initial, whitespace_only, use_length, locale)

    if (simplify) {
        # this will give an informative warning or error if sth is wrong
        as.character(unlist(ret))
    } else ret
}
stringi/R/reverse.R0000644000176200001440000000452414750110641013734 0ustar liggesusers# kate: default-dictionary en_US
## This file is part of the 'stringi' package for R.
## Copyright (c) 2013-2025, Marek Gagolewski
## All rights reserved.
##
## Redistribution and use in source and binary forms, with or without
## modification, are permitted provided that the following conditions are met:
##
## 1. Redistributions of source code must retain the above copyright notice,
## this list of conditions and the following disclaimer.
##
## 2.
Redistributions in binary form must reproduce the above copyright notice, ## this list of conditions and the following disclaimer in the documentation ## and/or other materials provided with the distribution. ## ## 3. Neither the name of the copyright holder nor the names of its ## contributors may be used to endorse or promote products derived from ## this software without specific prior written permission. ## ## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ## 'AS IS' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, ## BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS ## FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT ## HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, ## SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ## PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; ## OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, ## WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE ## OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, ## EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #' @title #' Reverse Each String #' #' @description #' Reverses the order of the code points in every string. #' #' @details #' Note that this operation may result in non-Unicode-normalized #' strings and may give peculiar outputs for bidirectional strings. #' #' See also \code{\link{stri_rand_shuffle}} for a random permutation #' of code points. #' #' @param str character vector #' #' @return Returns a character vector. 
#'
#' @examples
#' stri_reverse(c('123', 'abc d e f'))
#' stri_reverse('ZXY (\u0105\u0104123$^).')
#' stri_reverse(stri_trans_nfd('\u0105')) == stri_trans_nfd('\u0105') # A, ogonek -> ogonek, A
#'
#' @export
stri_reverse <- function(str) {
    # `str` is handed over unchanged; the reversal itself is done
    # by the compiled routine
    .Call(C_stri_reverse, str)
}
stringi/R/search_extract_bound.R0000644000176200001440000001341314750110641016444 0ustar liggesusers# kate: default-dictionary en_US
## This file is part of the 'stringi' package for R.
## Copyright (c) 2013-2025, Marek Gagolewski
## All rights reserved.
##
## Redistribution and use in source and binary forms, with or without
## modification, are permitted provided that the following conditions are met:
##
## 1. Redistributions of source code must retain the above copyright notice,
## this list of conditions and the following disclaimer.
##
## 2. Redistributions in binary form must reproduce the above copyright notice,
## this list of conditions and the following disclaimer in the documentation
## and/or other materials provided with the distribution.
##
## 3. Neither the name of the copyright holder nor the names of its
## contributors may be used to endorse or promote products derived from
## this software without specific prior written permission.
##
## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
## 'AS IS' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING,
## BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
## FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT ## HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, ## SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ## PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; ## OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, ## WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE ## OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, ## EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #' @title #' Extract Data Between Text Boundaries #' #' @description #' These functions extract data between text boundaries. #' #' @details #' Vectorized over \code{str}. #' #' For more information on text boundary analysis #' performed by \pkg{ICU}'s \code{BreakIterator}, see #' \link{stringi-search-boundaries}. #' #' In case of \code{stri_extract_*_words}, #' just like in \code{\link{stri_count_words}}, #' \pkg{ICU}'s word \code{BreakIterator} iterator is used #' to locate the word boundaries, and all non-word characters #' (\code{UBRK_WORD_NONE} rule status) are ignored. #' #' #' @param str character vector or an object coercible to #' @param omit_no_match single logical value; if \code{FALSE}, #' then a missing value will indicate that there are no words #' @param simplify single logical value; #' if \code{TRUE} or \code{NA}, then a character matrix is returned; #' otherwise (the default), a list of character vectors is given, see Value #' @param opts_brkiter a named list with \pkg{ICU} BreakIterator's settings, #' see \code{\link{stri_opts_brkiter}}; #' \code{NULL} for the default break iterator, i.e., \code{line_break} #' @param ... 
additional settings for \code{opts_brkiter} #' @param locale \code{NULL} or \code{''} for text boundary analysis following #' the conventions of the default locale, or a single string with #' locale identifier, see \link{stringi-locale} #' #' @return #' For \code{stri_extract_all_*}, #' if \code{simplify=FALSE} (the default), then a #' list of character vectors is returned. Each string consists of #' a separate word. In case of \code{omit_no_match=FALSE} and #' if there are no words or if a string is missing, #' a single \code{NA} is provided on output. #' #' Otherwise, \code{\link{stri_list2matrix}} with \code{byrow=TRUE} argument #' is called on the resulting object. #' In such a case, a character matrix with \code{length(str)} rows #' is returned. Note that \code{\link{stri_list2matrix}}'s \code{fill} argument #' is set to an empty string and \code{NA}, #' for \code{simplify} \code{TRUE} and \code{NA}, respectively. #' #' For \code{stri_extract_first_*} and \code{stri_extract_last_*}, #' a character vector is returned. #' A \code{NA} element indicates a no-match. 
#'
#' @examples
#' stri_extract_all_words('stringi: THE string processing package 123.48...')
#'
#' @export
#' @family search_extract
#' @family locale_sensitive
#' @family text_boundaries
#' @rdname stri_extract_boundaries
stri_extract_all_boundaries <- function(str, simplify = FALSE, omit_no_match = FALSE,
    ..., opts_brkiter = NULL)
{
    # settings given through `...` are combined with `opts_brkiter` and
    # passed on to stri_opts_brkiter(); otherwise `opts_brkiter` is used as is
    if (!missing(...)) {
        brkiter_args <- as.list(c(opts_brkiter, ...))
        opts_brkiter <- do.call(stri_opts_brkiter, brkiter_args)
    }

    .Call(C_stri_extract_all_boundaries, str, simplify, omit_no_match, opts_brkiter)
}


#' @export
#' @rdname stri_extract_boundaries
stri_extract_last_boundaries <- function(str, ..., opts_brkiter = NULL)
{
    # settings given through `...` are combined with `opts_brkiter` and
    # passed on to stri_opts_brkiter(); otherwise `opts_brkiter` is used as is
    if (!missing(...)) {
        brkiter_args <- as.list(c(opts_brkiter, ...))
        opts_brkiter <- do.call(stri_opts_brkiter, brkiter_args)
    }

    .Call(C_stri_extract_last_boundaries, str, opts_brkiter)
}


#' @export
#' @rdname stri_extract_boundaries
stri_extract_first_boundaries <- function(str, ..., opts_brkiter = NULL)
{
    # settings given through `...` are combined with `opts_brkiter` and
    # passed on to stri_opts_brkiter(); otherwise `opts_brkiter` is used as is
    if (!missing(...)) {
        brkiter_args <- as.list(c(opts_brkiter, ...))
        opts_brkiter <- do.call(stri_opts_brkiter, brkiter_args)
    }

    .Call(C_stri_extract_first_boundaries, str, opts_brkiter)
}


#' @export
#' @rdname stri_extract_boundaries
stri_extract_all_words <- function(str, simplify = FALSE, omit_no_match = FALSE,
    locale = NULL)
{
    # word-type break iterator that skips the non-word (UBRK_WORD_NONE) segments
    stri_extract_all_boundaries(
        str,
        simplify = simplify,
        omit_no_match = omit_no_match,
        opts_brkiter = stri_opts_brkiter(type = "word", skip_word_none = TRUE,
            locale = locale)
    )
}


#' @export
#' @rdname stri_extract_boundaries
stri_extract_first_words <- function(str, locale = NULL)
{
    # word-type break iterator that skips the non-word (UBRK_WORD_NONE) segments
    stri_extract_first_boundaries(
        str,
        opts_brkiter = stri_opts_brkiter(type = "word", skip_word_none = TRUE,
            locale = locale)
    )
}


#' @export
#' @rdname stri_extract_boundaries
stri_extract_last_words <- function(str, locale = NULL)
{
    # word-type break iterator that skips the non-word (UBRK_WORD_NONE) segments
    stri_extract_last_boundaries(
        str,
        opts_brkiter = stri_opts_brkiter(type = "word", skip_word_none = TRUE,
            locale = locale)
    )
}
stringi/R/trans_other.R0000644000176200001440000000547114750110641014613 0ustar liggesusers# kate: default-dictionary en_US
## This file is part of the 'stringi' package for R.
## Copyright (c) 2013-2025, Marek Gagolewski ## All rights reserved. ## ## Redistribution and use in source and binary forms, with or without ## modification, are permitted provided that the following conditions are met: ## ## 1. Redistributions of source code must retain the above copyright notice, ## this list of conditions and the following disclaimer. ## ## 2. Redistributions in binary form must reproduce the above copyright notice, ## this list of conditions and the following disclaimer in the documentation ## and/or other materials provided with the distribution. ## ## 3. Neither the name of the copyright holder nor the names of its ## contributors may be used to endorse or promote products derived from ## this software without specific prior written permission. ## ## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ## 'AS IS' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, ## BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS ## FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT ## HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, ## SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ## PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; ## OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, ## WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE ## OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, ## EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #' @title #' Translate Characters #' #' @description #' Translates Unicode code points in each input string. #' #' @details #' Vectorized over \code{str} and with respect to each code point #' in \code{pattern} and \code{replacement}. #' #' If \code{pattern} and \code{replacement} consist of a different number #' of code points, then the extra code points in the longer of the two #' are ignored, with a warning. 
#'
#' If code points in a given \code{pattern} are not unique, the
#' last corresponding replacement code point is used.
#'
#' Time complexity for each string in \code{str} is
#' O(\code{stri_length(str)*stri_length(pattern)}).
#'
#' @param str character vector
#' @param pattern a single character string providing code points to be translated
#' @param replacement a single character string giving translated code points
#'
#' @return Returns a character vector.
#'
#' @export
#' @family transform
#' @examples
#' stri_trans_char('id.123', '.', '_')
#' stri_trans_char('babaab', 'ab', '01')
#' stri_trans_char('GCUACGGAGCUUCGGAGCUAG', 'ACGT', 'TGCA')
stri_trans_char <- function(str, pattern, replacement) {
    # all three arguments are handed over unchanged; the code point
    # translation itself is done by the compiled routine
    .Call(C_stri_trans_char, str, pattern, replacement)
}
stringi/R/search_locate_bound.R0000644000176200001440000001403414750110641016241 0ustar liggesusers# kate: default-dictionary en_US
## This file is part of the 'stringi' package for R.
## Copyright (c) 2013-2025, Marek Gagolewski
## All rights reserved.
##
## Redistribution and use in source and binary forms, with or without
## modification, are permitted provided that the following conditions are met:
##
## 1. Redistributions of source code must retain the above copyright notice,
## this list of conditions and the following disclaimer.
##
## 2. Redistributions in binary form must reproduce the above copyright notice,
## this list of conditions and the following disclaimer in the documentation
## and/or other materials provided with the distribution.
##
## 3. Neither the name of the copyright holder nor the names of its
## contributors may be used to endorse or promote products derived from
## this software without specific prior written permission.
##
## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
## 'AS IS' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING,
## BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
## FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT ## HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, ## SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ## PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; ## OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, ## WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE ## OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, ## EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #' @title Locate Text Boundaries #' #' @description #' These functions locate text boundaries #' (like character, word, line, or sentence boundaries). #' Use \code{stri_locate_all_*} to locate all the matches. #' \code{stri_locate_first_*} and \code{stri_locate_last_*} #' give the first or the last matches, respectively. #' #' #' @details #' Vectorized over \code{str}. #' #' For more information on text boundary analysis #' performed by \pkg{ICU}'s \code{BreakIterator}, see #' \link{stringi-search-boundaries}. #' #' For \code{stri_locate_*_words}, #' just like in \code{\link{stri_extract_all_words}} and \code{\link{stri_count_words}}, #' \pkg{ICU}'s word \code{BreakIterator} is used #' to locate the word boundaries, and all non-word characters #' (\code{UBRK_WORD_NONE} rule status) are ignored. #' These functions are equivalent to a call to #' \code{stri_locate_*_boundaries(str, type='word', skip_word_none=TRUE, locale=locale)}. #' #' #' #' @param str character vector or an object coercible to one #' #' @param omit_no_match single logical value; if \code{TRUE}, #' a no-match will be indicated by a matrix with 0 rows #' [\code{stri_locate_all_*} only] #' #' @param opts_brkiter named list with \pkg{ICU} BreakIterator's settings, #' see \code{\link{stri_opts_brkiter}}; #' \code{NULL} for default break iterator, i.e., \code{line_break} #' #' @param ...
additional settings for \code{opts_brkiter}
#'
#' @param locale \code{NULL} or \code{''} for text boundary analysis following
#' the conventions of the default locale, or a single string with
#' locale identifier, see \link{stringi-locale}
#'
#' @param get_length single logical value; if \code{FALSE} (default),
#' generate \emph{from-to} matrices; otherwise, output
#' \emph{from-length} ones
#'
#'
#' @return
#' \code{stri_locate_all_*} yields a list of \code{length(str)}
#' integer matrices.
#' \code{stri_locate_first_*} and \code{stri_locate_last_*}
#' return an integer matrix.
#' See \code{\link{stri_locate}} for more details.
#'
#'
#' @examples
#' test <- 'The\u00a0above-mentioned features are very useful. Spam, spam, eggs, bacon, and spam.'
#' stri_locate_all_words(test)
#' stri_locate_all_boundaries(
#'     'Mr. Jones and Mrs. Brown are very happy. So am I, Prof. Smith.',
#'     type='sentence',
#'     locale='en_US@ss=standard' # ICU >= 56 only
#' )
#'
#'
#'
#' @export
#' @family search_locate
#' @family indexing
#' @family locale_sensitive
#' @family text_boundaries
#' @rdname stri_locate_boundaries
stri_locate_all_boundaries <- function(
    str, omit_no_match=FALSE, get_length=FALSE, ..., opts_brkiter=NULL
) {
    # settings supplied via `...` are combined with `opts_brkiter` and
    # passed to stri_opts_brkiter(); without them `opts_brkiter` is used as given
    if (!missing(...))
        opts_brkiter <- do.call(stri_opts_brkiter, as.list(c(opts_brkiter, ...)))

    .Call(C_stri_locate_all_boundaries, str, omit_no_match, opts_brkiter, get_length)
}


#' @export
#' @rdname stri_locate_boundaries
stri_locate_last_boundaries <- function(
    str, get_length=FALSE, ..., opts_brkiter=NULL
) {
    # same handling of `...` as in stri_locate_all_boundaries
    if (!missing(...))
        opts_brkiter <- do.call(stri_opts_brkiter, as.list(c(opts_brkiter, ...)))

    .Call(C_stri_locate_last_boundaries, str, opts_brkiter, get_length)
}


#' @export
#' @rdname stri_locate_boundaries
stri_locate_first_boundaries <- function(
    str, get_length=FALSE, ..., opts_brkiter=NULL
) {
    # same handling of `...` as in stri_locate_all_boundaries
    if (!missing(...))
        opts_brkiter <- do.call(stri_opts_brkiter, as.list(c(opts_brkiter, ...)))

    .Call(C_stri_locate_first_boundaries, str, opts_brkiter, get_length)
}


#' @export
#' @rdname stri_locate_boundaries
stri_locate_all_words <- function(
    str, omit_no_match=FALSE, locale=NULL, get_length=FALSE
) {
    # delegates to the generic boundary locator with the break iterator
    # options fixed to type="word" and skip_word_none=TRUE
    stri_locate_all_boundaries(
        str,
        omit_no_match=omit_no_match,
        get_length=get_length,
        opts_brkiter=stri_opts_brkiter(
            type="word", skip_word_none=TRUE, locale=locale
        )
    )
}


#' @export
#' @rdname stri_locate_boundaries
stri_locate_last_words <- function(
    str, locale=NULL, get_length=FALSE
) {
    # delegates to stri_locate_last_boundaries with type="word", skip_word_none=TRUE
    stri_locate_last_boundaries(
        str,
        get_length=get_length,
        opts_brkiter=stri_opts_brkiter(
            type="word", skip_word_none=TRUE, locale=locale
        )
    )
}


#' @export
#' @rdname stri_locate_boundaries
stri_locate_first_words <- function(
    str, locale=NULL, get_length=FALSE
) {
    # delegates to stri_locate_first_boundaries with type="word", skip_word_none=TRUE
    stri_locate_first_boundaries(
        str,
        get_length=get_length,
        opts_brkiter=stri_opts_brkiter(
            type="word", skip_word_none=TRUE, locale=locale
        )
    )
}
stringi/R/time_symbols.R0000644000176200001440000000772114750110641014771 0ustar liggesusers# kate: default-dictionary en_US ## This file is part of the 'stringi' package for R. ## Copyright (c) 2013-2025, Marek Gagolewski ## All rights reserved. ## ## Redistribution and use in source and binary forms, with or without ## modification, are permitted provided that the following conditions are met: ## ## 1. Redistributions of source code must retain the above copyright notice, ## this list of conditions and the following disclaimer. ## ## 2. Redistributions in binary form must reproduce the above copyright notice, ## this list of conditions and the following disclaimer in the documentation ## and/or other materials provided with the distribution. ## ## 3. Neither the name of the copyright holder nor the names of its ## contributors may be used to endorse or promote products derived from ## this software without specific prior written permission.
## ## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ## 'AS IS' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, ## BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS ## FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT ## HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, ## SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ## PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; ## OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, ## WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE ## OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, ## EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #' @title #' List Localizable Date-Time Formatting Data #' #' @description #' Returns a list of all localizable date-time formatting data, #' including month and weekday names, localized AM/PM strings, etc. #' #' @details #' \code{context} stands for a selector for date formatting context #' and \code{width} - for date formatting width. #' #' #' @param locale \code{NULL} or \code{''} for default locale, #' or a single string with locale identifier #' @param context single string; one of: \code{'format'}, \code{'standalone'} #' @param width single string; one of: \code{'abbreviated'}, \code{'wide'}, \code{'narrow'} #' #' @return Returns a list with the following named components: #' \enumerate{ #' \item \code{Month} - month names, #' \item \code{Weekday} - weekday names, #' \item \code{Quarter} - quarter names, #' \item \code{AmPm} - AM/PM names, #' \item \code{Era} - era names. 
#' }
#'
#' @examples
#' stri_datetime_symbols() # uses the Gregorian calendar in most locales
#' stri_datetime_symbols('@@calendar=hebrew')
#' stri_datetime_symbols('he_IL@@calendar=hebrew')
#' stri_datetime_symbols('@@calendar=islamic')
#' stri_datetime_symbols('@@calendar=persian')
#' stri_datetime_symbols('@@calendar=indian')
#' stri_datetime_symbols('@@calendar=coptic')
#' stri_datetime_symbols('@@calendar=japanese')
#'
#' stri_datetime_symbols('ja_JP_TRADITIONAL') # uses the Japanese calendar by default
#' stri_datetime_symbols('th_TH_TRADITIONAL') # uses the Buddhist calendar
#'
#' stri_datetime_symbols('pl_PL', context='format')
#' stri_datetime_symbols('pl_PL', context='standalone')
#'
#' stri_datetime_symbols(width='wide')
#' stri_datetime_symbols(width='abbreviated')
#' stri_datetime_symbols(width='narrow')
#'
#' @references
#' \emph{Calendar} - ICU User Guide,
#' \url{https://unicode-org.github.io/icu/userguide/datetime/calendar/}
#'
#' \emph{DateFormatSymbols} class -- ICU API Documentation,
#' \url{https://unicode-org.github.io/icu-docs/apidoc/dev/icu4c/classicu_1_1DateFormatSymbols.html}
#'
#' \emph{Formatting Dates and Times} -- ICU User Guide,
#' \url{https://unicode-org.github.io/icu/userguide/format_parse/datetime/}
#'
#' @family datetime
#' @export
stri_datetime_symbols <- function(locale=NULL, context="standalone", width="wide")
{
    # TODO: get first day of week
    # `locale`, `context` and `width` are forwarded unchanged to the
    # compiled routine
    .Call(C_stri_datetime_symbols, locale, context, width)
}
stringi/R/random.R0000644000176200001440000002272214750110641013541 0ustar liggesusers# kate: default-dictionary en_US ## This file is part of the 'stringi' package for R. ## Copyright (c) 2013-2025, Marek Gagolewski ## All rights reserved. ## ## Redistribution and use in source and binary forms, with or without ## modification, are permitted provided that the following conditions are met: ## ## 1. Redistributions of source code must retain the above copyright notice, ## this list of conditions and the following disclaimer. ## ## 2.
Redistributions in binary form must reproduce the above copyright notice, ## this list of conditions and the following disclaimer in the documentation ## and/or other materials provided with the distribution. ## ## 3. Neither the name of the copyright holder nor the names of its ## contributors may be used to endorse or promote products derived from ## this software without specific prior written permission. ## ## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ## 'AS IS' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, ## BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS ## FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT ## HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, ## SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ## PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; ## OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, ## WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE ## OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, ## EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #' @title #' Randomly Shuffle Code Points in Each String #' #' @description #' Generates a (pseudo)random permutation of the code points #' in each string. #' #' @details #' This operation may result in non-Unicode-normalized #' strings and may give peculiar outputs in case of bidirectional strings. #' #' See also \code{\link{stri_reverse}} for reversing the order of code points. #' #' @param str character vector #' #' @return Returns a character vector. 
#'
#' @examples
#' stri_rand_shuffle(c('abcdefghi', '0123456789'))
#' # you can do better than this with stri_rand_strings:
#' stri_rand_shuffle(rep(stri_paste(letters, collapse=''), 10))
#'
#' @family random
#' @export
stri_rand_shuffle <- function(str)
{
    # `str` is forwarded unchanged to the compiled routine
    .Call(C_stri_rand_shuffle, str)
}


#' @title
#' Generate Random Strings
#'
#' @description
#' Generates (pseudo)random strings of desired lengths.
#'
#' @details
#' Vectorized over \code{length} and \code{pattern}.
#' If length of \code{length} or \code{pattern} is greater than \code{n},
#' then redundant elements are ignored. Otherwise,
#' these vectors are recycled if necessary.
#'
#' This operation may result in non-Unicode-normalized
#' strings and may give peculiar outputs for bidirectional strings.
#'
#' Sampling of code points from the set specified by \code{pattern}
#' is always done with replacement and each code point appears with equal
#' probability.
#'
#' @param n single integer, number of observations
#' @param length integer vector, desired string lengths
#' @param pattern character vector specifying character classes to draw
#' elements from, see \link{stringi-search-charclass}
#'
#' @return Returns a character vector.
#'
#' @examples
#' stri_rand_strings(5, 10) # 5 strings of length 10
#' stri_rand_strings(5, sample(1:10, 5, replace=TRUE)) # 5 strings of random lengths
#' stri_rand_strings(10, 5, '[\\p{script=latin}&\\p{Ll}]') # small letters from the Latin script
#'
#' # generate n random passwords of length in [8, 14]
#' # consisting of at least one digit, small and big ASCII letter:
#' n <- 10
#' stri_rand_shuffle(stri_paste(
#'     stri_rand_strings(n, 1, '[0-9]'),
#'     stri_rand_strings(n, 1, '[a-z]'),
#'     stri_rand_strings(n, 1, '[A-Z]'),
#'     stri_rand_strings(n, sample(5:11, n, replace=TRUE), '[a-zA-Z0-9]')
#' ))
#'
#' @family random
#' @export
stri_rand_strings <- function(n, length, pattern = "[A-Za-z0-9]")
{
    # `n`, `length` and `pattern` are forwarded unchanged to the
    # compiled routine
    .Call(C_stri_rand_strings, n, length, pattern)
}


#' @title
#' A Lorem Ipsum Generator
#'
#' @description
#' Generates (pseudo)random \emph{lorem ipsum} text consisting
#' of a given number of text paragraphs.
#'
#' @details
#' \emph{Lorem ipsum} is a dummy text often used as a source
#' of data for string processing and displaying/lay-outing exercises.
#'
#' The current implementation is very simple:
#' words are selected randomly from a Zipf distribution
#' (based on a set of ca. 190 predefined Latin words).
#' The number of words per sentence and sentences per paragraph
#' follows a discretized, truncated normal distribution.
#' No Markov chain modeling, just i.i.d. word selection.
#'
#' @param n_paragraphs single integer, number of paragraphs to generate
#' @param start_lipsum single logical value; should the resulting
#' text start with \emph{Lorem ipsum dolor sit amet}?
#' @param nparagraphs [DEPRECATED] alias of \code{n_paragraphs}
#'
#' @return Returns a character vector of length \code{n_paragraphs}.
#'
#' @examples
#' cat(sapply(
#'     stri_wrap(stri_rand_lipsum(10), 80, simplify=FALSE),
#'     stri_flatten, collapse='\n'), sep='\n\n')
#' cat(stri_rand_lipsum(10), sep='\n\n')
#'
#' @family random
#' @export
stri_rand_lipsum <- function(n_paragraphs, start_lipsum = TRUE, nparagraphs=n_paragraphs)
{
    if (!missing(nparagraphs) && missing(n_paragraphs)) {  # DEPRECATED
        warning("The 'nparagraphs' argument in stri_rand_lipsum is a deprecated alias of 'n_paragraphs' and will be removed in a future release of 'stringi'.")
        n_paragraphs <- nparagraphs
    }

    # Whoa! A pure R function in stringi :)
    # Version 0.3-1 (Marek Gagolewski, 2014-10-16)

    n_paragraphs <- as.integer(n_paragraphs)
    # `n_paragraphs` must be a single value: rnorm() (used below) interprets
    # a longer vector as "generate length(n) values", which would silently
    # produce the wrong number of paragraphs; a zero-length input would fail
    # later with an unrelated error message
    stopifnot(length(n_paragraphs) == 1L, is.finite(n_paragraphs), n_paragraphs >= 1)
    start_lipsum <- identical(start_lipsum, TRUE)

    rwords <- function(n) {  # generate n random words
        words <- c("SED", "IN", "UT", "ET", "AC", "EU", "NON", "NEC", "AMET", "SIT",
            "VEL", "AT", "MAURIS", "A", "VITAE", "EGET", "QUIS", "NUNC", "NULLA", "ID",
            "VESTIBULUM", "PELLENTESQUE", "TINCIDUNT", "ALIQUAM", "IPSUM", "DONEC",
            "TURPIS", "LIGULA", "EGESTAS", "NIBH", "SAPIEN", "ANTE", "NISL", "VELIT",
            "ERAT", "EROS", "LEO", "MAGNA", "JUSTO", "ENIM", "MI", "PURUS", "EST",
            "LACUS", "LOREM", "QUAM", "DIAM", "RISUS", "DOLOR", "SEM", "AUGUE", "NEQUE",
            "TEMPOR", "DUI", "ARCU", "METUS", "TORTOR", "URNA", "LIBERO", "PHARETRA",
            "TEMPUS", "FAUCIBUS", "LECTUS", "SUSPENDISSE", "FELIS", "ODIO", "ORCI",
            "VARIUS", "MASSA", "TELLUS", "VOLUTPAT", "BLANDIT", "INTERDUM", "LOBORTIS",
            "MAXIMUS", "NISI", "LUCTUS", "PORTTITOR", "AUCTOR", "ELEMENTUM", "EX",
            "MAECENAS", "MALESUADA", "TRISTIQUE", "ULLAMCORPER", "ULTRICES", "NULLAM",
            "CONSEQUAT", "LACINIA", "PHASELLUS", "ACCUMSAN", "DAPIBUS", "ELEIFEND",
            "COMMODO", "DUIS", "EFFICITUR", "ELIT", "IMPERDIET", "AENEAN", "IACULIS",
            "NAM", "CONSECTETUR", "FERMENTUM", "PORTA", "SCELERISQUE", "SODALES",
            "FEUGIAT", "LAOREET", "VULPUTATE", "DICTUM", "QUISQUE", "FACILISIS",
            "FINIBUS", "ORNARE", "PULVINAR", "RHONCUS", "CONDIMENTUM", "MOLLIS",
            "PRETIUM", "ALIQUET", "CONGUE", "POSUERE", "SUSCIPIT", "ULTRICIES",
            "CURABITUR", "GRAVIDA", "MATTIS", "VIVERRA", "CURSUS", "EUISMOD", "RUTRUM",
            "VENENATIS", "CONVALLIS", "PROIN", "VEHICULA", "PLACERAT", "SAGITTIS",
            "CRAS", "INTEGER", "MORBI", "VIVAMUS", "PRAESENT", "BIBENDUM", "MOLESTIE",
            "SEMPER", "FRINGILLA", "FUSCE", "DIGNISSIM", "ETIAM", "HENDRERIT",
            "SOLLICITUDIN", "PER", "FAMES", "POTENTI", "AD", "APTENT", "CLASS",
            "CONUBIA", "HIMENAEOS", "INCEPTOS", "LITORA", "NOSTRA", "SOCIOSQU",
            "TACITI", "TORQUENT", "HABITANT", "NETUS", "SENECTUS", "PRIMIS", "CUM",
            "DIS", "MAGNIS", "MONTES", "MUS", "NASCETUR", "NATOQUE", "PARTURIENT",
            "PENATIBUS", "RIDICULUS", "SOCIIS", "ADIPISCING", "FACILISI", "CUBILIA",
            "CURAE", "DICTUMST", "HABITASSE", "HAC", "PLATEA")

        # Zipf distribution
        dzipf <- function(k, N, s) 1/k^s/sum(1/(1:N)^s)
        pzipf.y <- c(0, cumsum(dzipf(seq_along(words), length(words), 0.5)))
        # all.inside=TRUE keeps every index within seq_along(words) even if
        # rounding leaves the last cumulative probability slightly below 1
        robs <- findInterval(runif(n), pzipf.y, all.inside = TRUE)
        words[robs]
    }

    rtruncnorm <- function(n, a, b, mu, sd) {
        # truncated discretized normal distribution:
        # redraw the out-of-range values until all of them fall into [a, b]
        x <- round(rnorm(n, mu, sd))
        while (any(x < a | x > b))
            x[x < a | x > b] <- round(rnorm(sum(x < a | x > b), mu, sd))
        x
    }

    # number of sentences in each paragraph
    sent_para <- rtruncnorm(n_paragraphs, 7, 20, 11, 3)
    # number of words in each sentence, grouped by paragraph
    word_sent <- lapply(sent_para, function(numsent) rtruncnorm(numsent, 2, Inf, 8, 3))
    sent_lengths <- unlist(word_sent)
    totwords <- sum(sent_lengths)

    words <- rwords(totwords)
    seps <- sample(c(" ", ", "), replace = TRUE, size = totwords, prob = c(0.9, 0.1))
    seps[cumsum(sent_lengths)] <- sample(c(". ", "? ", "! "),
        size = length(sent_lengths), replace = TRUE,
        prob = c(0.95, 0.025, 0.025))  # end of sentence
    seps[cumsum(vapply(word_sent, sum, numeric(1)))] <- ".\n"  # end of para
    seps[totwords] <- "."  # very last sentence in very last para

    if (start_lipsum) {
        words <- c("LOREM", "IPSUM", "DOLOR", "SIT", "AMET", words)
        seps <- c(" ", " ", " ", " ", ", ", seps)
    }

    # paragraphs were terminated with "\n" above; split on it and
    # title-case the first word of every sentence
    ret <- stri_split_charclass(stri_paste(words, seps, collapse = ""), "[\\n]")[[1]]
    ret <- stri_trans_totitle(ret, opts_brkiter = stri_opts_brkiter(type = "sentence"))
    ret
}
stringi/R/sub.R0000644000176200001440000003253214750110641013052 0ustar liggesusers# kate: default-dictionary en_US ## This file is part of the 'stringi' package for R. ## Copyright (c) 2013-2025, Marek Gagolewski ## All rights reserved. ## ## Redistribution and use in source and binary forms, with or without ## modification, are permitted provided that the following conditions are met: ## ## 1. Redistributions of source code must retain the above copyright notice, ## this list of conditions and the following disclaimer. ## ## 2. Redistributions in binary form must reproduce the above copyright notice, ## this list of conditions and the following disclaimer in the documentation ## and/or other materials provided with the distribution. ## ## 3. Neither the name of the copyright holder nor the names of its ## contributors may be used to endorse or promote products derived from ## this software without specific prior written permission. ## ## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ## 'AS IS' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, ## BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS ## FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT ## HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, ## SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ## PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; ## OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, ## WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE ## OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, ## EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #' @title #' Extract a Substring From or Replace a Substring In a Character Vector #' #' @description #' \code{stri_sub} extracts particular substrings at code point-based #' index ranges provided. Its replacement version allows to substitute #' (in-place) parts of #' a string with given replacement strings. \code{stri_sub_replace} #' is its forward pipe operator-friendly variant that returns #' a copy of the input vector. #' #' For extracting/replacing multiple substrings from/within each string, see #' \code{\link{stri_sub_all}}. #' #' @details #' Vectorized over \code{str}, [\code{value}], \code{from} and #' (\code{to} or \code{length}). Parameters #' \code{to} and \code{length} are mutually exclusive. #' #' Indexes are 1-based, i.e., the start of a string is at index 1. #' For negative indexes in \code{from} or \code{to}, #' counting starts at the end of the string. #' For instance, index -1 denotes the last code point in the string. #' Non-positive \code{length} gives an empty string. #' #' Argument \code{from} gives the start of a substring to extract. #' Argument \code{to} defines the last index of a substring, inclusive. #' Alternatively, its \code{length} may be provided. #' #' If \code{from} is a two-column matrix, then these two columns are #' used as \code{from} and \code{to}, respectively, #' unless the second column is named \code{length}. 
#' In such a case anything passed #' explicitly as \code{to} or \code{length} is ignored. #' Such types of index matrices are generated by \code{\link{stri_locate_first}} #' and \code{\link{stri_locate_last}}. If extraction based on #' \code{\link{stri_locate_all}} is needed, see #' \code{\link{stri_sub_all}}. #' #' In \code{stri_sub}, out-of-bound indexes are silently #' corrected. If \code{from} > \code{to}, then an empty string is returned. #' By default, negative \code{length} results in the corresponding output being #' \code{NA}, see \code{ignore_negative_length}, though. #' #' In \code{stri_sub<-}, some configurations of indexes may work as #' substring 'injection' at the front, back, or in middle. #' Negative \code{length} does not alter the corresponding input string. #' #' If both \code{to} and \code{length} are provided, #' \code{length} has priority over \code{to}. #' #' Note that for some Unicode strings, the extracted substrings might not #' be well-formed, especially if input strings are not normalized #' (see \code{\link{stri_trans_nfc}}), #' include byte order marks, Bidirectional text marks, and so on. #' Handle with care. 
#' #' #' #' #' @param str character vector #' #' @param from integer vector giving the start indexes; alternatively, #' if \code{use_matrix=TRUE}, #' a two-column matrix of type \code{cbind(from, to)} #' (unnamed columns or the 2nd column named other than \code{length}) #' or \code{cbind(from, length=length)} (2nd column named \code{length}) #' #' @param to integer vector giving the end indexes; mutually exclusive with #' \code{length} and \code{from} being a matrix #' #' @param length integer vector giving the substring lengths; #' mutually exclusive with \code{to} and \code{from} being a matrix #' #' @param omit_na single logical value; indicates whether missing values #' in any of the indexes or in \code{value} leave the corresponding input string #' unchanged [replacement function only] #' #' @param use_matrix single logical value; see \code{from} #' #' @param replacement alias of \code{value} [wherever applicable] #' #' @param value a character vector defining the replacement strings #' [replacement function only] #' #' @param ignore_negative_length single logical value; whether #' negative lengths should be ignored or result in missing values #' #' @param ... arguments to be passed to \code{stri_sub<-} #' #' #' @return #' \code{stri_sub} and \code{stri_sub_replace} return a character vector. #' \code{stri_sub<-} changes the \code{str} object 'in-place'. 
#'
#' @examples
#' s <- c("spam, spam, bacon, and spam", "eggs and spam")
#' stri_sub(s, from=-4)
#' stri_sub(s, from=1, length=c(10, 4))
#' (stri_sub(s, 1, 4) <- 'stringi')
#'
#' x <- c('12 3456 789', 'abc', '', NA, '667')
#' stri_sub(x, stri_locate_first_regex(x, '[0-9]+')) # see stri_extract_first
#' stri_sub(x, stri_locate_last_regex(x, '[0-9]+')) # see stri_extract_last
#'
#' stri_sub_replace(x, stri_locate_first_regex(x, '[0-9]+'),
#'     omit_na=TRUE, replacement='***') # see stri_replace_first
#' stri_sub_replace(x, stri_locate_last_regex(x, '[0-9]+'),
#'     omit_na=TRUE, replacement='***') # see stri_replace_last
#'
#'
#' \dontrun{x |> stri_sub_replace(1, 5, replacement='new_substring')}
#' @family indexing
#' @rdname stri_sub
#' @export
stri_sub <- function(
    str, from = 1L, to = -1L, length,
    use_matrix=TRUE, ignore_negative_length=FALSE
) {
    # anything other than a single, non-missing TRUE disables matrix handling
    # (same as isTRUE(use_matrix))
    use_matrix <- (
        is.logical(use_matrix) &&
        base::length(use_matrix) == 1L &&
        !is.na(use_matrix) &&
        use_matrix
    )

    if (!missing(length)) {
        # `length` given: an explicit `to` cannot be honored
        if (!missing(to))
            warning("argument `to` is ignored in the current context")

        # ... and a matrix `from` overrides `length` as well
        if (use_matrix && is.matrix(from)) {
            warning("argument `length` is ignored in the current context")
            length <- NULL
        }

        return(.Call(C_stri_sub, str, from, NULL, length, use_matrix, ignore_negative_length))
    }

    # `length` not given: a matrix `from` overrides an explicit `to`
    if (use_matrix && is.matrix(from) && !missing(to)) {
        warning("argument `to` is ignored in the current context")
        to <- NULL
    }

    .Call(C_stri_sub, str, from, to, NULL, use_matrix, ignore_negative_length)
}


#' @rdname stri_sub
#' @export
`stri_sub<-` <- function(
    str, from = 1L, to = -1L, length,
    omit_na=FALSE, use_matrix=TRUE, value
) {
    # anything other than a single, non-missing TRUE disables matrix handling
    # (same as isTRUE(use_matrix))
    use_matrix <- (
        is.logical(use_matrix) &&
        base::length(use_matrix) == 1L &&
        !is.na(use_matrix) &&
        use_matrix
    )

    if (!missing(length)) {
        # `length` given: an explicit `to` cannot be honored
        if (!missing(to))
            warning("argument `to` is ignored in this context")

        # ... and a matrix `from` overrides `length` as well
        if (use_matrix && is.matrix(from)) {
            warning("argument `length` is ignored in this context")
            length <- NULL
        }

        return(.Call(C_stri_sub_replacement, str, from, NULL, length, omit_na, value, use_matrix))
    }

    # `length` not given: a matrix `from` overrides an explicit `to`
    if (use_matrix && is.matrix(from) && !missing(to)) {
        warning("argument `to` is ignored in this context")
        to <- NULL
    }

    .Call(C_stri_sub_replacement, str, from, to, NULL, omit_na, value, use_matrix)
}


#' @rdname stri_sub
#' @export
stri_sub_replace <- function(..., replacement, value = replacement)
{
    # pipe-friendly front end: `replacement` is merely the default for `value`
    `stri_sub<-`(..., value = value)
}


#' @title
#' Extract or Replace Multiple Substrings
#'
#' @description
#' \code{stri_sub_all} extracts multiple substrings from each string.
#' Its replacement version substitutes (in-place) multiple substrings with the
#' corresponding replacement strings.
#' \code{stri_sub_replace_all} (alias \code{stri_sub_all_replace})
#' is its forward pipe operator-friendly variant, returning
#' a copy of the input vector.
#'
#' For extracting/replacing single substrings from/within each string, see
#' \code{\link{stri_sub}}.
#'
#' @details
#' Vectorized over \code{str}, [\code{value}], \code{from} and
#' (\code{to} or \code{length}). Just like in \code{\link{stri_sub}}, parameters
#' \code{to} and \code{length} are mutually exclusive.
#'
#' In one of the simplest scenarios, \code{stri_sub_all(str, from, to)},
#' the i-th element of the resulting list
#' is generated like \code{stri_sub(str[i], from[[i]], to[[i]])}.
#' As usual, if one of the inputs is shorter than the others,
#' the recycling rule is applied.
#'
#'
#' If any of \code{from}, \code{to}, \code{length},
#' or \code{value} is not a list,
#' it is wrapped into a list.
#'
#' If \code{from} consists of a two-column matrix, then these two columns are
#' used as \code{from} and \code{to}, respectively,
#' unless the second column is named \code{length}.
#' Such types of index matrices are generated by
#' \code{\link{stri_locate_all}}.
#' If extraction or replacement based on \code{\link{stri_locate_first}}
#' or \code{\link{stri_locate_last}} is needed, see \code{\link{stri_sub}}.
#' #' In the replacement function, the index ranges must be sorted #' with respect to \code{from} and must be mutually disjoint. #' Negative \code{length} does not result in any altering of the #' corresponding input string. On the other hand, in \code{stri_sub_all}, #' this makes the corresponding chunk be ignored, #' see \code{ignore_negative_length}, though. #' #' @param str character vector #' #' @param from list of integer vectors giving the start indexes; alternatively, #' if \code{use_matrix=TRUE}, a list of two-column matrices of type #' \code{cbind(from, to)} #' (unnamed columns or the 2nd column named other than \code{length}) #' or \code{cbind(from, length=length)} (2nd column named \code{length}) #' #' @param to list of integer vectors giving the end indexes #' #' @param length list of integer vectors giving the substring lengths #' #' @param omit_na single logical value; indicates whether missing values #' in any of the indexes or in \code{value} leave the part of the #' corresponding input string #' unchanged [replacement function only] #' #' @param use_matrix single logical value; see \code{from} #' #' @param replacement alias of \code{value} [wherever applicable] #' #' @param value a list of character vectors defining the replacement strings #' [replacement function only] #' #' @param ignore_negative_length single logical value; whether #' negative lengths should be ignored or result in missing values #' #' @param ... arguments to be passed to \code{stri_sub_all<-} #' #' #' @return #' \code{stri_sub_all} returns a list of character vectors. #' Its replacement versions modify the input 'in-place'.
#'
#' @examples
#' x <- c('12 3456 789', 'abc', '', NA, '667')
#' stri_sub_all(x, stri_locate_all_regex(x, '[0-9]+')) # see stri_extract_all
#' stri_sub_all(x, stri_locate_all_regex(x, '[0-9]+', omit_no_match=TRUE))
#'
#' stri_sub_all(x, stri_locate_all_regex(x, '[0-9]+', omit_no_match=TRUE)) <- '***'
#' print(x)
#'
#' stri_sub_replace_all('a b c', c(1, 3, 5), c(1, 3, 5), replacement=c('A', 'B', 'C'))
#'
#'
#' @family indexing
#' @rdname stri_sub_all
#' @export
stri_sub_all <- function(
    str, from = list(1L), to = list(-1L), length,
    use_matrix=TRUE, ignore_negative_length=TRUE
) {
    # index arguments that are not lists are wrapped into one-element lists
    if (!is.list(from))
        from <- list(from)

    if (!missing(length)) {
        # `length` given: an explicit `to` cannot be honored
        if (!missing(to))
            warning("argument `to` is ignored in this context")

        if (!is.list(length))
            length <- list(length)

        return(.Call(C_stri_sub_all, str, from, NULL, length, use_matrix, ignore_negative_length))
    }

    if (!missing(to) && !is.list(to))
        to <- list(to)

    .Call(C_stri_sub_all, str, from, to, NULL, use_matrix, ignore_negative_length)
}


#' @rdname stri_sub_all
#' @export
`stri_sub_all<-` <- function(
    str, from = list(1L), to = list(-1L), length,
    omit_na=FALSE, use_matrix=TRUE, value
) {
    # index and replacement arguments that are not lists are wrapped
    # into one-element lists
    if (!is.list(from))
        from <- list(from)
    if (!is.list(value))
        value <- list(value)

    if (!missing(length)) {
        # `length` given: an explicit `to` cannot be honored
        if (!missing(to))
            warning("argument `to` is ignored in this context")

        if (!is.list(length))
            length <- list(length)

        return(.Call(C_stri_sub_replacement_all, str, from, NULL, length, omit_na, value, use_matrix))
    }

    if (!missing(to) && !is.list(to))
        to <- list(to)

    .Call(C_stri_sub_replacement_all, str, from, to, NULL, omit_na, value, use_matrix)
}


#' @rdname stri_sub_all
#' @export
stri_sub_replace_all <- function(..., replacement, value=replacement)
{
    # pipe-friendly front end: `replacement` is merely the default for `value`
    `stri_sub_all<-`(..., value=value)
}


#' @rdname stri_sub_all
#' @export
stri_sub_all_replace <- stri_sub_replace_all
stringi/R/sort.R0000644000176200001440000003512714750110641013253 0ustar liggesusers# kate: default-dictionary en_US ## This file is part of the 'stringi' package for R.
## Copyright (c) 2013-2025, Marek Gagolewski ## All rights reserved. ## ## Redistribution and use in source and binary forms, with or without ## modification, are permitted provided that the following conditions are met: ## ## 1. Redistributions of source code must retain the above copyright notice, ## this list of conditions and the following disclaimer. ## ## 2. Redistributions in binary form must reproduce the above copyright notice, ## this list of conditions and the following disclaimer in the documentation ## and/or other materials provided with the distribution. ## ## 3. Neither the name of the copyright holder nor the names of its ## contributors may be used to endorse or promote products derived from ## this software without specific prior written permission. ## ## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ## 'AS IS' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, ## BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS ## FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT ## HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, ## SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ## PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; ## OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, ## WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE ## OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, ## EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #' @title String Sorting #' #' @description #' This function sorts a character vector according to a locale-dependent #' lexicographic order. #' #' @details #' For more information on \pkg{ICU}'s Collator and how to tune it up #' in \pkg{stringi}, refer to \code{\link{stri_opts_collator}}. 
#' #' As usual in \pkg{stringi}, non-character inputs are coerced to strings, #' see an example below for a somewhat non-intuitive behavior of lexicographic #' sorting on numeric inputs. #' #' This function uses a stable sort algorithm (\pkg{STL}'s \code{stable_sort}), #' which performs up to \eqn{N*log^2(N)} element comparisons, #' where \eqn{N} is the length of \code{str}. #' #' @param str a character vector #' @param decreasing a single logical value; should the sort order #' be nondecreasing (\code{FALSE}, default, i.e., weakly increasing) #' or nonincreasing (\code{TRUE})? #' @param na_last a single logical value; controls the treatment of \code{NA}s #' in \code{str}. If \code{TRUE}, then missing values in \code{str} are put #' at the end; if \code{FALSE}, they are put at the beginning; #' if \code{NA}, then they are removed from the output #' @param opts_collator a named list with \pkg{ICU} Collator's options, #' see \code{\link{stri_opts_collator}}, \code{NULL} #' for default collation options #' @param ... additional settings for \code{opts_collator} #' #' @return #' The result is a sorted version of \code{str}, #' i.e., a character vector. 
#'
#' @references
#' \emph{Collation} - ICU User Guide,
#' \url{https://unicode-org.github.io/icu/userguide/collation/}
#'
#' @family locale_sensitive
#' @export
#' @rdname stri_sort
#'
#' @examples
#' stri_sort(c('hladny', 'chladny'), locale='pl_PL')
#' stri_sort(c('hladny', 'chladny'), locale='sk_SK')
#' stri_sort(sample(LETTERS))
#' stri_sort(c(1, 100, 2, 101, 11, 10)) # lexicographic order
#' stri_sort(c(1, 100, 2, 101, 11, 10), numeric=TRUE) # OK for integers
#' stri_sort(c(0.25, 0.5, 1, -1, -2, -3), numeric=TRUE) # incorrect
stri_sort <- function(str, decreasing = FALSE, na_last = NA, ..., opts_collator = NULL)
{
    # settings passed via `...` are merged with `opts_collator`
    if (!missing(...)) {
        collator_args <- as.list(c(opts_collator, ...))
        opts_collator <- do.call(stri_opts_collator, collator_args)
    }

    .Call(C_stri_sort, str, decreasing, na_last, opts_collator)
}


#' @title Ordering Permutation
#'
#' @description
#' This function finds a permutation which rearranges the
#' strings in a given character vector into the ascending or descending
#' locale-dependent lexicographic order.
#'
#' @details
#' For more information on \pkg{ICU}'s Collator and how to tune it up
#' in \pkg{stringi}, refer to \code{\link{stri_opts_collator}}.
#'
#' As usual in \pkg{stringi}, non-character inputs are coerced to strings,
#' see an example below for a somewhat non-intuitive behavior of lexicographic
#' sorting on numeric inputs.
#'
#' This function uses a stable sort algorithm (\pkg{STL}'s \code{stable_sort}),
#' which performs up to \eqn{N*log^2(N)} element comparisons,
#' where \eqn{N} is the length of \code{str}.
#'
#' For ordering with regards to multiple criteria (such as sorting
#' data frames by more than 1 column), see \code{\link{stri_rank}}.
#'
#' @param str a character vector
#' @param decreasing a single logical value; should the sort order
#' be nondecreasing (\code{FALSE}, default)
#' or nonincreasing (\code{TRUE})?
#' @param na_last a single logical value; controls the treatment of \code{NA}s
#' in \code{str}.
If \code{TRUE}, then missing values in \code{str} are put
#' at the end; if \code{FALSE}, they are put at the beginning;
#' if \code{NA}, then they are removed from the output
#' @param opts_collator a named list with \pkg{ICU} Collator's options,
#' see \code{\link{stri_opts_collator}}, \code{NULL}
#' for default collation options
#' @param ... additional settings for \code{opts_collator}
#'
#' @return The function yields an integer vector that gives the sort order.
#'
#' @references
#' \emph{Collation} - ICU User Guide,
#' \url{https://unicode-org.github.io/icu/userguide/collation/}
#'
#' @family locale_sensitive
#' @export
#' @rdname stri_order
#'
#' @examples
#' stri_order(c('hladny', 'chladny'), locale='pl_PL')
#' stri_order(c('hladny', 'chladny'), locale='sk_SK')
#'
#' stri_order(c(1, 100, 2, 101, 11, 10)) # lexicographic order
#' stri_order(c(1, 100, 2, 101, 11, 10), numeric=TRUE) # OK for integers
#' stri_order(c(0.25, 0.5, 1, -1, -2, -3), numeric=TRUE) # incorrect
stri_order <- function(str, decreasing = FALSE, na_last = TRUE, ..., opts_collator = NULL)
{
    # settings passed via `...` are merged with `opts_collator`
    if (!missing(...)) {
        collator_args <- as.list(c(opts_collator, ...))
        opts_collator <- do.call(stri_opts_collator, collator_args)
    }

    .Call(C_stri_order, str, decreasing, na_last, opts_collator)
}


#' @title Extract Unique Elements
#'
#' @description
#' This function returns a character vector like \code{str},
#' but with duplicate elements removed.
#'
#' @details
#' As usual in \pkg{stringi}, no attributes are copied.
#' Unlike \code{\link{unique}}, this function
#' tests for canonical equivalence of strings (and not
#' whether the strings are just bytewise equal). Such an operation
#' is locale-dependent. Hence, \code{stri_unique} is significantly
#' slower (but much better suited for natural language processing)
#' than its base R counterpart.
#'
#' See also \code{\link{stri_duplicated}} for indicating non-unique elements.
#'
#' @param str a character vector
#' @param opts_collator a named list with \pkg{ICU} Collator's options,
#' see \code{\link{stri_opts_collator}}, \code{NULL}
#' for default collation options
#' @param ... additional settings for \code{opts_collator}
#'
#' @return Returns a character vector.
#'
#' @examples
#' # normalized and non-Unicode-normalized version of the same code point:
#' stri_unique(c('\u0105', stri_trans_nfkd('\u0105')))
#' unique(c('\u0105', stri_trans_nfkd('\u0105')))
#'
#' stri_unique(c('gro\u00df', 'GROSS', 'Gro\u00df', 'Gross'), strength=1)
#'
#' @references
#' \emph{Collation} - ICU User Guide,
#' \url{https://unicode-org.github.io/icu/userguide/collation/}
#'
#' @family locale_sensitive
#' @export
stri_unique <- function(str, ..., opts_collator = NULL)
{
    # settings passed via `...` are merged with `opts_collator`
    if (!missing(...)) {
        collator_args <- as.list(c(opts_collator, ...))
        opts_collator <- do.call(stri_opts_collator, collator_args)
    }

    .Call(C_stri_unique, str, opts_collator)
}


#' @title
#' Determine Duplicated Elements
#'
#' @description
#' \code{stri_duplicated()} determines which strings in a character vector
#' are duplicates of other elements.
#'
#' \code{stri_duplicated_any()} determines if there are any duplicated
#' strings in a character vector.
#'
#' @details
#' Missing values are regarded as equal.
#'
#' Unlike \code{\link{duplicated}} and \code{\link{anyDuplicated}},
#' these functions test for canonical equivalence of strings
#' (and not whether the strings are just bytewise equal)
#' Such operations are locale-dependent.
#' Hence, \code{stri_duplicated} and \code{stri_duplicated_any}
#' are significantly slower (but much better suited for natural language
#' processing) than their base R counterparts.
#'
#' See also \code{\link{stri_unique}} for extracting unique elements.
#'
#' @param str a character vector
#' @param from_last a single logical value;
#' indicates whether search should be performed from the last to the
#' first string
#' @param fromLast [DEPRECATED] alias of \code{from_last}
#' @param opts_collator a named list with \pkg{ICU} Collator's options,
#' see \code{\link{stri_opts_collator}}, \code{NULL}
#' for default collation options
#' @param ... additional settings for \code{opts_collator}
#'
#' @return
#' \code{stri_duplicated()} returns a logical vector of the same length
#' as \code{str}. Each of its elements indicates whether a canonically
#' equivalent string was already found in \code{str}.
#'
#' \code{stri_duplicated_any()} returns a single non-negative integer.
#' Value of 0 indicates that all the elements in \code{str} are unique.
#' Otherwise, it gives the index of the first non-unique element.
#'
#' @references
#' \emph{Collation} - ICU User Guide,
#' \url{https://unicode-org.github.io/icu/userguide/collation/}
#'
#' @examples
#' # In the following examples, we have 3 duplicated values,
#' # 'a' - 2 times, NA - 1 time
#' stri_duplicated(c('a', 'b', 'a', NA, 'a', NA))
#' stri_duplicated(c('a', 'b', 'a', NA, 'a', NA), from_last=TRUE)
#' stri_duplicated_any(c('a', 'b', 'a', NA, 'a', NA))
#'
#' # compare the results:
#' stri_duplicated(c('\u0105', stri_trans_nfkd('\u0105')))
#' duplicated(c('\u0105', stri_trans_nfkd('\u0105')))
#'
#' stri_duplicated(c('gro\u00df', 'GROSS', 'Gro\u00df', 'Gross'), strength=1)
#' duplicated(c('gro\u00df', 'GROSS', 'Gro\u00df', 'Gross'))
#'
#' @rdname stri_duplicated
#' @family locale_sensitive
#' @export
stri_duplicated <- function(
    str, from_last = FALSE, fromLast = from_last, ..., opts_collator = NULL
) {
    # DEPRECATED spelling: still accepted, but the caller is warned
    if (!missing(fromLast)) {
        warning("The 'fromLast' argument in stri_duplicated is a deprecated alias of 'from_last' and will be removed in a future release of 'stringi'.")
        from_last <- fromLast
    }

    # settings passed via `...` are merged with `opts_collator`
    if (!missing(...)) {
        collator_args <- as.list(c(opts_collator, ...))
        opts_collator <- do.call(stri_opts_collator, collator_args)
    }

    .Call(C_stri_duplicated, str, from_last, opts_collator)
}


#' @rdname stri_duplicated
#' @export
stri_duplicated_any <- function(
    str, from_last = FALSE, fromLast = from_last, ..., opts_collator = NULL
) {
    # DEPRECATED spelling: still accepted, but the caller is warned
    if (!missing(fromLast)) {
        warning("The 'fromLast' argument in stri_duplicated_any is a deprecated alias of 'from_last' and will be removed in a future release of 'stringi'.")
        from_last <- fromLast
    }

    # settings passed via `...` are merged with `opts_collator`
    if (!missing(...)) {
        collator_args <- as.list(c(opts_collator, ...))
        opts_collator <- do.call(stri_opts_collator, collator_args)
    }

    .Call(C_stri_duplicated_any, str, from_last, opts_collator)
}


#' @title
#' Sort Keys
#'
#' @description
#' This function computes a locale-dependent sort key, which is an alternative
#' character representation of the string that, when ordered in the C locale
#' (which orders using the underlying bytes directly), will give an equivalent
#' ordering to the original string. It is useful for enhancing algorithms
#' that sort only in the C locale (e.g., the \code{strcmp} function in libc)
#' with the ability to be locale-aware.
#'
#' @details
#' For more information on \pkg{ICU}'s Collator and how to tune it up
#' in \pkg{stringi}, refer to \code{\link{stri_opts_collator}}.
#'
#' See also \code{\link{stri_rank}} for ranking strings with a single character
#' vector, i.e., generating relative sort keys.
#'
#' @param str a character vector
#' @param opts_collator a named list with \pkg{ICU} Collator's options,
#' see \code{\link{stri_opts_collator}}, \code{NULL}
#' for default collation options
#' @param ... additional settings for \code{opts_collator}
#'
#' @return
#' The result is a character vector with the same length as \code{str} that
#' contains the sort keys. The output is marked as \code{bytes}-encoded.
#'
#' @references
#' \emph{Collation} - ICU User Guide,
#' \url{https://unicode-org.github.io/icu/userguide/collation/}
#'
#' @examples
#' stri_sort_key(c('hladny', 'chladny'), locale='pl_PL')
#' stri_sort_key(c('hladny', 'chladny'), locale='sk_SK')
#'
#' @family locale_sensitive
#' @export
#' @rdname stri_sort_key
stri_sort_key <- function(str, ..., opts_collator = NULL)
{
    # settings passed via `...` are merged with `opts_collator`
    if (!missing(...)) {
        collator_args <- as.list(c(opts_collator, ...))
        opts_collator <- do.call(stri_opts_collator, collator_args)
    }

    .Call(C_stri_sort_key, str, opts_collator)
}


#' @title
#' Ranking
#'
#' @description
#' This function ranks each string in a character vector according to a
#' locale-dependent lexicographic order.
#' It is a portable replacement for the base \code{xtfrm} function.
#'
#' @details
#' Missing values result in missing ranks and tied observations receive
#' the same ranks (based on min).
#'
#' For more information on \pkg{ICU}'s Collator and how to tune it up
#' in \pkg{stringi}, refer to \code{\link{stri_opts_collator}}.
#'
#' @param str a character vector
#' @param opts_collator a named list with \pkg{ICU} Collator's options,
#' see \code{\link{stri_opts_collator}}, \code{NULL}
#' for default collation options
#' @param ... additional settings for \code{opts_collator}
#'
#' @return
#' The result is a vector of ranks corresponding to each
#' string in \code{str}.
#'
#' @references
#' \emph{Collation} -- ICU User Guide,
#' \url{https://unicode-org.github.io/icu/userguide/collation/}
#'
#' @examples
#' stri_rank(c('hladny', 'chladny'), locale='pl_PL')
#' stri_rank(c('hladny', 'chladny'), locale='sk_SK')
#'
#' stri_rank("a" %s+% c(1, 100, 2, 101, 11, 10)) # lexicographic order
#' stri_rank("a" %s+% c(1, 100, 2, 101, 11, 10), numeric=TRUE) # OK
#' stri_rank("a" %s+% c(0.25, 0.5, 1, -1, -2, -3), numeric=TRUE) # incorrect
#'
#' # Ordering a data frame with respect to two criteria:
#' X <- data.frame(a=c("b", NA, "b", "b", NA, "a", "a", "c"), b=runif(8))
#' X[order(stri_rank(X$a), X$b), ]
#'
#' @family locale_sensitive
#' @export
#' @rdname stri_rank
stri_rank <- function(str, ..., opts_collator=NULL)
{
    # settings passed via `...` are merged with `opts_collator`
    if (!missing(...)) {
        collator_args <- as.list(c(opts_collator, ...))
        opts_collator <- do.call(stri_opts_collator, collator_args)
    }

    .Call(C_stri_rank, str, opts_collator)
}
stringi/R/search_detect_4.R0000644000176200001440000001544014750110641015300 0ustar liggesusers# kate: default-dictionary en_US
## This file is part of the 'stringi' package for R.
## Copyright (c) 2013-2025, Marek Gagolewski
## All rights reserved.
##
## Redistribution and use in source and binary forms, with or without
## modification, are permitted provided that the following conditions are met:
##
## 1. Redistributions of source code must retain the above copyright notice,
## this list of conditions and the following disclaimer.
##
## 2. Redistributions in binary form must reproduce the above copyright notice,
## this list of conditions and the following disclaimer in the documentation
## and/or other materials provided with the distribution.
##
## 3. Neither the name of the copyright holder nor the names of its
## contributors may be used to endorse or promote products derived from
## this software without specific prior written permission.
## ## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ## 'AS IS' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, ## BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS ## FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT ## HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, ## SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ## PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; ## OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, ## WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE ## OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, ## EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #' @title #' Detect Pattern Occurrences #' #' @description #' These functions determine, for each string in \code{str}, #' if there is at least one match to a corresponding \code{pattern}. #' #' @details #' Vectorized over \code{str} and \code{pattern} (with recycling #' of the elements in the shorter vector if necessary). This allows to, #' for instance, search for one pattern in each given string, #' search for each pattern in one given string, #' and search for the i-th pattern within the i-th string. #' #' If \code{pattern} is empty, then the result is \code{NA} #' and a warning is generated. #' #' \code{stri_detect} is a convenience function. #' It calls either \code{stri_detect_regex}, #' \code{stri_detect_fixed}, \code{stri_detect_coll}, #' or \code{stri_detect_charclass}, depending on the argument used. #' #' See also \code{\link{stri_startswith}} and \code{\link{stri_endswith}} #' for testing whether a string starts or ends with a match to a given pattern. #' Moreover, see \code{\link{stri_subset}} for a character vector subsetting. #' #' If \code{max_count} is negative, then all strings are examined.
#' Otherwise, searching terminates #' once \code{max_count} matches (or, if \code{negate} is \code{TRUE}, #' no-matches) are detected. The uninspected cases are marked #' as missing in the return vector. Be aware that, unless \code{pattern} is a #' singleton, the elements in \code{str} might be inspected in a #' non-consecutive order. #' #' #' @param str character vector; strings to search in #' @param pattern,regex,fixed,coll,charclass character vector; #' search patterns; for more details refer to \link{stringi-search} #' @param negate single logical value; whether a no-match to a pattern #' is rather of interest #' @param max_count single integer; allows to stop searching once a given #' number of occurrences is detected; \code{-1} (the default) inspects all #' elements #' @param opts_collator,opts_fixed,opts_regex a named list used to tune up #' the search engine's settings; see #' \code{\link{stri_opts_collator}}, \code{\link{stri_opts_fixed}}, #' and \code{\link{stri_opts_regex}}, respectively; \code{NULL} #' for the defaults #' @param ... supplementary arguments passed to the underlying functions, #' including additional settings for \code{opts_collator}, \code{opts_regex}, #' \code{opts_fixed}, and so on #' #' @return Each function returns a logical vector. 
#'
#' @examples
#' stri_detect_fixed(c('stringi R', 'R STRINGI', '123'), c('i', 'R', '0'))
#' stri_detect_fixed(c('stringi R', 'R STRINGI', '123'), 'R')
#'
#' stri_detect_charclass(c('stRRRingi','R STRINGI', '123'),
#' c('\\p{Ll}', '\\p{Lu}', '\\p{Zs}'))
#'
#' stri_detect_regex(c('stringi R', 'R STRINGI', '123'), 'R.')
#' stri_detect_regex(c('stringi R', 'R STRINGI', '123'), '[[:alpha:]]*?')
#' stri_detect_regex(c('stringi R', 'R STRINGI', '123'), '[a-zC1]')
#' stri_detect_regex(c('stringi R', 'R STRINGI', '123'), '( R|RE)')
#' stri_detect_regex('stringi', 'STRING.', case_insensitive=TRUE)
#'
#' stri_detect_regex(c('abc', 'def', '123', 'ghi', '456', '789', 'jkl'),
#' '^[0-9]+$', max_count=1)
#' stri_detect_regex(c('abc', 'def', '123', 'ghi', '456', '789', 'jkl'),
#' '^[0-9]+$', max_count=2)
#' stri_detect_regex(c('abc', 'def', '123', 'ghi', '456', '789', 'jkl'),
#' '^[0-9]+$', negate=TRUE, max_count=3)
#'
#' @family search_detect
#' @export
#' @rdname stri_detect
stri_detect <- function(str, ..., regex, fixed, coll, charclass)
{
    # which of the four pattern arguments did the caller supply?
    has_regex     <- !missing(regex)
    has_fixed     <- !missing(fixed)
    has_coll      <- !missing(coll)
    has_charclass <- !missing(charclass)

    if (has_regex + has_fixed + has_coll + has_charclass != 1)
        stop("you have to specify one of: `regex`, `fixed`, `coll`, or `charclass`")

    # exactly one flag is TRUE from here on; dispatch to the matching engine
    if (has_regex)
        return(stri_detect_regex(str, regex, ...))
    if (has_fixed)
        return(stri_detect_fixed(str, fixed, ...))
    if (has_coll)
        return(stri_detect_coll(str, coll, ...))
    stri_detect_charclass(str, charclass, ...)
}


#' @export
#' @rdname stri_detect
stri_detect_fixed <- function(
    str, pattern, negate=FALSE, max_count=-1, ..., opts_fixed=NULL
) {
    # settings passed via `...` are merged with `opts_fixed`
    if (!missing(...)) {
        fixed_args <- as.list(c(opts_fixed, ...))
        opts_fixed <- do.call(stri_opts_fixed, fixed_args)
    }

    .Call(C_stri_detect_fixed, str, pattern, negate, max_count, opts_fixed)
}


#' @export
#' @rdname stri_detect
stri_detect_charclass <- function(str, pattern, negate = FALSE, max_count = -1)
    .Call(C_stri_detect_charclass, str, pattern, negate, max_count)


#' @export
#' @rdname stri_detect
stri_detect_coll <- function(
    str, pattern, negate = FALSE, max_count = -1, ..., opts_collator = NULL
) {
    # settings passed via `...` are merged with `opts_collator`
    if (!missing(...)) {
        collator_args <- as.list(c(opts_collator, ...))
        opts_collator <- do.call(stri_opts_collator, collator_args)
    }

    .Call(C_stri_detect_coll, str, pattern, negate, max_count, opts_collator)
}


#' @export
#' @rdname stri_detect
stri_detect_regex <- function(
    str, pattern, negate = FALSE, max_count = -1, ..., opts_regex = NULL
) {
    # settings passed via `...` are merged with `opts_regex`
    if (!missing(...)) {
        regex_args <- as.list(c(opts_regex, ...))
        opts_regex <- do.call(stri_opts_regex, regex_args)
    }

    .Call(C_stri_detect_regex, str, pattern, negate, max_count, opts_regex)
}
stringi/R/encoding_detection.R0000644000176200001440000002724514750110641016112 0ustar liggesusers# kate: default-dictionary en_US
## This file is part of the 'stringi' package for R.
## Copyright (c) 2013-2025, Marek Gagolewski
## All rights reserved.
##
## Redistribution and use in source and binary forms, with or without
## modification, are permitted provided that the following conditions are met:
##
## 1. Redistributions of source code must retain the above copyright notice,
## this list of conditions and the following disclaimer.
##
## 2. Redistributions in binary form must reproduce the above copyright notice,
## this list of conditions and the following disclaimer in the documentation
## and/or other materials provided with the distribution.
##
## 3.
Neither the name of the copyright holder nor the names of its ## contributors may be used to endorse or promote products derived from ## this software without specific prior written permission. ## ## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ## 'AS IS' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, ## BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS ## FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT ## HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, ## SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ## PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; ## OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, ## WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE ## OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, ## EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #' @title #' Check If a Data Stream Is Possibly in UTF-16 or UTF-32 #' #' @description #' These functions detect whether a given byte stream is #' valid UTF-16LE, UTF-16BE, UTF-32LE, or UTF-32BE. #' #' @details #' These functions are independent of the way \R marks encodings in #' character strings (see \link{Encoding} and \link{stringi-encoding}). #' Most often, these functions act on raw vectors. #' #' A result of \code{FALSE} means that a string is surely not valid UTF-16 #' or UTF-32. However, false positives are possible. #' #' Also note that a data stream may be sometimes classified #' as both valid UTF-16LE and UTF-16BE. #' #' @param str character vector, a raw vector, or #' a list of \code{raw} vectors #' #' @return Returns a logical vector. 
#'
#' @rdname stri_enc_isutf16
#' @family encoding_detection
#' @export
stri_enc_isutf16be <- function(str)
    .Call(C_stri_enc_isutf16be, str)


#' @rdname stri_enc_isutf16
#' @export
stri_enc_isutf16le <- function(str)
    .Call(C_stri_enc_isutf16le, str)


#' @rdname stri_enc_isutf16
#' @export
stri_enc_isutf32be <- function(str)
    .Call(C_stri_enc_isutf32be, str)


#' @rdname stri_enc_isutf16
#' @export
stri_enc_isutf32le <- function(str)
    .Call(C_stri_enc_isutf32le, str)


#' @title
#' Check If a Data Stream Is Possibly in ASCII
#'
#' @description
#' The function checks whether all bytes in a string are <= 127.
#'
#' @details
#' This function is independent of the way \R marks encodings in
#' character strings (see \link{Encoding} and \link{stringi-encoding}).
#'
#' @param str character vector, a raw vector, or
#' a list of \code{raw} vectors
#'
#' @return Returns a logical vector.
#' The i-th element indicates whether the i-th string
#' corresponds to a valid ASCII byte sequence.
#'
#' @examples
#' stri_enc_isascii(letters[1:3])
#' stri_enc_isascii('\u0105\u0104')
#'
#' @family encoding_detection
#' @export
stri_enc_isascii <- function(str)
    .Call(C_stri_enc_isascii, str)


#' @title
#' Check If a Data Stream Is Possibly in UTF-8
#'
#' @description
#' The function checks whether given sequences of bytes form
#' a proper UTF-8 string.
#'
#' @details
#' \code{FALSE} means that a string is certainly not valid UTF-8.
#' However, false positives are possible. For instance,
#' \code{(c4,85)} represents ('a with ogonek') in UTF-8
#' as well as ('A umlaut', 'Ellipsis') in WINDOWS-1250.
#' Also note that UTF-8, as well as most 8-bit encodings, extend ASCII
#' (note that \code{\link{stri_enc_isascii}} implies
#' \code{\link{stri_enc_isutf8}}).
#'
#' However, the longer the sequence,
#' the greater the possibility that the result
#' is indeed in UTF-8 -- this is because not all sequences of bytes
#' are valid UTF-8.
#'
#' This function is independent of the way \R marks encodings in
#' character strings (see \link{Encoding} and \link{stringi-encoding}).
#'
#' @param str character vector, a raw vector, or
#' a list of \code{raw} vectors
#'
#' @return Returns a logical vector.
#' Its i-th element indicates whether the i-th string
#' corresponds to a valid UTF-8 byte sequence.
#'
#' @examples
#' stri_enc_isutf8(letters[1:3])
#' stri_enc_isutf8('\u0105\u0104')
#' stri_enc_isutf8('\u1234\u0222')
#'
#' @family encoding_detection
#' @export
stri_enc_isutf8 <- function(str)
    .Call(C_stri_enc_isutf8, str)


#' @title
#' Detect Character Set and Language
#'
#' @description
#' This function uses the \pkg{ICU} engine to determine the character set,
#' or encoding, of character data in an unknown format.
#'
#' @details
#' Vectorized over \code{str} and \code{filter_angle_brackets}.
#'
#' For a character vector input, merging all text lines
#' via \code{\link{stri_flatten}(str, collapse='\n')}
#' might be needed if \code{str} has been obtained via a call to
#' \code{readLines} and in fact represents an image of a single text file.
#'
#' This is, at best, an imprecise operation using statistics and heuristics.
#' Because of this, detection works best if you supply at least a few hundred
#' bytes of character data that is mostly in a single language.
#' However, because the detection only looks at a limited amount of the input
#' data, some of the returned character sets may fail to handle all of the
#' input data. Note that in some cases,
#' the language can be determined along with the encoding.
#'
#' Several different techniques are used for character set detection.
#' For multi-byte encodings, the sequence of bytes is checked for legible
#' patterns. The detected characters are also checked against a list of
#' frequently used characters in that encoding.
For single byte encodings, #' the data is checked against a list of the most commonly occurring three #' letter groups for each language that can be written using that encoding. #' #' The detection process can be configured to optionally ignore #' HTML or XML style markup (using \pkg{ICU}'s internal facilities), #' which can interfere with the detection #' process by changing the statistics. #' #' This function should most often be used for byte-marked input strings, #' especially after loading them from text files and before the main #' conversion with \code{\link{stri_encode}}. #' The input encoding is of course not taken into account here, even #' if marked. #' #' The following table shows all the encodings that can be detected: #' #' \tabular{ll}{ #' \strong{Character_Set} \tab \strong{Languages}\cr #' UTF-8 \tab -- \cr #' UTF-16BE \tab -- \cr #' UTF-16LE \tab -- \cr #' UTF-32BE \tab -- \cr #' UTF-32LE \tab -- \cr #' Shift_JIS \tab Japanese \cr #' ISO-2022-JP \tab Japanese \cr #' ISO-2022-CN \tab Simplified Chinese \cr #' ISO-2022-KR \tab Korean \cr #' GB18030 \tab Chinese \cr #' Big5 \tab Traditional Chinese \cr #' EUC-JP \tab Japanese \cr #' EUC-KR \tab Korean \cr #' ISO-8859-1 \tab Danish, Dutch, English, French, German, Italian, Norwegian, Portuguese, Swedish \cr #' ISO-8859-2 \tab Czech, Hungarian, Polish, Romanian \cr #' ISO-8859-5 \tab Russian \cr #' ISO-8859-6 \tab Arabic \cr #' ISO-8859-7 \tab Greek \cr #' ISO-8859-8 \tab Hebrew \cr #' ISO-8859-9 \tab Turkish \cr #' windows-1250 \tab Czech, Hungarian, Polish, Romanian \cr #' windows-1251 \tab Russian \cr #' windows-1252 \tab Danish, Dutch, English, French, German, Italian, Norwegian, Portuguese, Swedish \cr #' windows-1253 \tab Greek \cr #' windows-1254 \tab Turkish \cr #' windows-1255 \tab Hebrew \cr #' windows-1256 \tab Arabic \cr #' KOI8-R \tab Russian \cr #' IBM420 \tab Arabic \cr #' IBM424 \tab Hebrew \cr #' } #' #' #' @param str character vector, a raw vector, or #' a list of \code{raw} vectors 
#'
#' @param filter_angle_brackets logical; If filtering is enabled,
#' text within angle brackets ('<' and '>') will be removed before detection,
#' which will remove most HTML or XML markup.
#'
#' @return Returns a list of length equal to the length of \code{str}.
#' Each list element is a data frame with the following three named vectors
#' representing all the guesses:
#' \itemize{
#' \item \code{Encoding} -- string; guessed encodings; \code{NA} on failure,
#' \item \code{Language} -- string; guessed languages; \code{NA} if the language could
#' not be determined (e.g., in case of UTF-8),
#' \item \code{Confidence} -- numeric in [0,1]; the higher the value,
#' the more confidence there is in the match; \code{NA} on failure.
#' }
#' The guesses are ordered by decreasing confidence.
#'
#' @examples
#' ## Not run:
#' ## f <- rawToChar(readBin('test.txt', 'raw', 100000))
#' ## stri_enc_detect(f)
#'
#' @references
#' \emph{Character Set Detection} -- ICU User Guide,
#' \url{https://unicode-org.github.io/icu/userguide/conversion/detection.html}
#'
#' @family encoding_detection
#' @export
stri_enc_detect <- function(str, filter_angle_brackets = FALSE)
{
    # every element returned by the C routine is converted to a data frame
    guesses <- .Call(C_stri_enc_detect, str, filter_angle_brackets)
    lapply(guesses, as.data.frame, stringsAsFactors = FALSE)
}


#' @title
#' [DEPRECATED] Detect Locale-Sensitive Character Encoding
#'
#' @description
#' This function tries to detect character encoding
#' in case the language of text is known.
#'
#'
#' @details
#' Vectorized over \code{str}.
#'
#' First, the text is checked whether it is valid
#' UTF-32BE, UTF-32LE, UTF-16BE, UTF-16LE, UTF-8
#' (as in \code{\link{stri_enc_detect}},
#' this is roughly inspired by \pkg{ICU}'s \code{i18n/csrucode.cpp}) or ASCII.
#' #' #' If \code{locale} is not \code{NA} and the above fails, #' the text is checked for the number of occurrences #' of language-specific code points (data provided by the \pkg{ICU} library) #' converted to all possible 8-bit encodings #' that fully cover the indicated language. #' The encoding is selected based on the greatest number of total #' byte hits. #' #' The guess is of course imprecise, #' as it is obtained using statistics and heuristics. #' Because of this, detection works best if you supply at least a few hundred #' bytes of character data that is in a single language. #' #' #' If you have no initial guess on the language and encoding, try with #' \code{\link{stri_enc_detect}} (uses \pkg{ICU} facilities). #' #' @param str character vector, a raw vector, or #' a list of \code{raw} vectors #' @param locale \code{NULL} or \code{''} for the default locale, #' or a single string with locale identifier. #' #' @return #' Just like \code{\link{stri_enc_detect}}, #' this function returns a list of length equal to the length of \code{str}. #' Each list element is a data frame with the following three named components: #' \itemize{ #' \item \code{Encoding} -- string; guessed encodings; \code{NA} on failure #' (if and only if \code{encodings} is empty), #' \item \code{Language} -- always \code{NA}, #' \item \code{Confidence} -- numeric in [0,1]; the higher the value, #' the more confidence there is in the match; \code{NA} on failure. #' } #' The guesses are ordered by decreasing confidence. 
#'
#' @family locale_sensitive
#' @family encoding_detection
#' @export
stri_enc_detect2 <- function(str, locale = NULL)
{
    warning("stri_enc_detect2 is deprecated and will be removed in a future release of 'stringi'.")

    # both the native call and the conversion run with warnings suppressed
    suppressWarnings({
        guesses <- .Call(C_stri_enc_detect2, str, locale)
        lapply(guesses, as.data.frame, stringsAsFactors = FALSE)
    })
}
stringi/R/search_split_bound.R0000644000176200001440000001645514750110641016136 0ustar liggesusers# kate: default-dictionary en_US
## This file is part of the 'stringi' package for R.
## Copyright (c) 2013-2025, Marek Gagolewski
## All rights reserved.
##
## Redistribution and use in source and binary forms, with or without
## modification, are permitted provided that the following conditions are met:
##
## 1. Redistributions of source code must retain the above copyright notice,
## this list of conditions and the following disclaimer.
##
## 2. Redistributions in binary form must reproduce the above copyright notice,
## this list of conditions and the following disclaimer in the documentation
## and/or other materials provided with the distribution.
##
## 3. Neither the name of the copyright holder nor the names of its
## contributors may be used to endorse or promote products derived from
## this software without specific prior written permission.
##
## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
## 'AS IS' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING,
## BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
## FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT ## HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, ## SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ## PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; ## OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, ## WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE ## OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, ## EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #' @title #' Split a String Into Text Lines #' #' @description #' These functions split each character string in a given vector #' into text lines. #' #' @details #' Vectorized over \code{str} and \code{omit_empty}. #' #' \code{omit_empty} is applied when splitting. If set to \code{TRUE}, #' then empty strings will never appear in the resulting vector. #' #' Newlines are represented with the Carriage Return #' (CR, 0x0D), Line Feed (LF, 0x0A), CRLF, or Next Line (NEL, 0x85) characters, #' depending on the platform. #' Moreover, the Unicode Standard defines two unambiguous separator characters, #' the Paragraph Separator (PS, 0x2029) and the Line Separator (LS, 0x2028). #' Sometimes also the Vertical Tab (VT, 0x0B) and the Form Feed (FF, 0x0C) #' are used for this purpose. #' #' These \pkg{stringi} functions follow UTR#18 rules, #' where a newline sequence #' corresponds to the following regular expression: #' \code{(?:\\u\{D A\}|(?!\\u\{D A\})[\\u\{A\}-\\u\{D\}\\u\{85\}\\u\{2028\}\\u\{2029\}])}. #' Each match serves as a text line separator. #' #' #' @param str character vector (\code{stri_split_lines}) #' or a single string (\code{stri_split_lines1}) #' @param omit_empty logical vector; determines whether empty #' strings should be removed from the result #' [\code{stri_split_lines} only] #' #' @return \code{stri_split_lines} returns a list of character vectors. 
#' If any input string is \code{NA}, then the corresponding list element
#' is a single \code{NA} string.
#'
#' \code{stri_split_lines1(str)} is equivalent to
#' \code{stri_split_lines(str[1])[[1]]} (with default parameters),
#' therefore it returns a character vector. Moreover, if the input string
#' ends with a newline sequence, the last empty string is omitted from the
#' result. This function may come in handy if you wish to split a text
#' file's contents into text lines.
#'
#' @references
#' \emph{Unicode Newline Guidelines} -- Unicode Technical Report #13,
#' \url{https://www.unicode.org/standard/reports/tr13/tr13-5.html}
#'
#' \emph{Unicode Regular Expressions} -- Unicode Technical Standard #18,
#' \url{https://www.unicode.org/reports/tr18/}
#'
#' @family search_split
#' @family text_boundaries
#' @export
#' @rdname stri_split_lines
#' @aliases stri_split_lines stri_split_lines1
stri_split_lines <- function(str, omit_empty = FALSE)
{
    # thin wrapper: all the work is done by the compiled routine
    .Call(C_stri_split_lines, str, omit_empty)
}


#' @rdname stri_split_lines
#' @export
stri_split_lines1 <- function(str)
{
    # single-string variant; see the Value section for how it differs
    .Call(C_stri_split_lines1, str)
}


#' @title
#' Split a String at Text Boundaries
#'
#' @description
#' This function locates text boundaries
#' (like character, word, line, or sentence boundaries)
#' and splits strings at the indicated positions.
#'
#' @details
#' Vectorized over \code{str} and \code{n}.
#'
#' If \code{n} is negative (the default), then all text pieces are extracted.
#'
#' Otherwise, if \code{tokens_only} is \code{FALSE} (which is the default),
#' then \code{n-1} tokens are extracted (if possible) and the \code{n}-th string
#' gives the (non-split) remainder (see Examples).
#' On the other hand, if \code{tokens_only} is \code{TRUE},
#' then only full tokens (up to \code{n} pieces) are extracted.
#'
#' For more information on text boundary analysis
#' performed by \pkg{ICU}'s \code{BreakIterator}, see
#' \link{stringi-search-boundaries}.
#' #' @param str character vector or an object coercible to #' @param n integer vector, maximal number of strings to return #' @param tokens_only single logical value; may affect the result if \code{n} #' is positive, see Details #' @param simplify single logical value; if \code{TRUE} or \code{NA}, #' then a character matrix is returned; otherwise (the default), a list of #' character vectors is given, see Value #' @param opts_brkiter a named list with \pkg{ICU} BreakIterator's settings, #' see \code{\link{stri_opts_brkiter}}; \code{NULL} for the #' default break iterator, i.e., \code{line_break} #' @param ... additional settings for \code{opts_brkiter} #' #' @return If \code{simplify=FALSE} (the default), #' then the functions return a list of character vectors. #' #' Otherwise, \code{\link{stri_list2matrix}} with \code{byrow=TRUE} #' and \code{n_min=n} arguments is called on the resulting object. #' In such a case, a character matrix with \code{length(str)} rows #' is returned. Note that \code{\link{stri_list2matrix}}'s \code{fill} #' argument is set to an empty string and \code{NA}, #' for \code{simplify} equal to \code{TRUE} and \code{NA}, respectively. #' #' @examples #' test <- 'The\u00a0above-mentioned features are very useful. ' %s+% #' 'Spam, spam, eggs, bacon, and spam. 123 456 789' #' stri_split_boundaries(test, type='line') #' stri_split_boundaries(test, type='word') #' stri_split_boundaries(test, type='word', skip_word_none=TRUE) #' stri_split_boundaries(test, type='word', skip_word_none=TRUE, skip_word_letter=TRUE) #' stri_split_boundaries(test, type='word', skip_word_none=TRUE, skip_word_number=TRUE) #' stri_split_boundaries(test, type='sentence') #' stri_split_boundaries(test, type='sentence', skip_sentence_sep=TRUE) #' stri_split_boundaries(test, type='character') #' #' # a filtered break iterator with the new ICU: #' stri_split_boundaries('Mr. Jones and Mrs. Brown are very happy. #' So am I, Prof. 
Smith.', type='sentence', locale='en_US@ss=standard') # ICU >= 56 only
#'
#' @export
#' @family search_split
#' @family locale_sensitive
#' @family text_boundaries
stri_split_boundaries <- function(str, n = -1L,
    tokens_only = FALSE, simplify = FALSE, ..., opts_brkiter = NULL)
{
    # settings given via `...` are combined with `opts_brkiter`
    # and passed through stri_opts_brkiter()
    if (!missing(...)) {
        brkiter_settings <- as.list(c(opts_brkiter, ...))
        opts_brkiter <- do.call(stri_opts_brkiter, brkiter_settings)
    }

    .Call(C_stri_split_boundaries, str, n, tokens_only, simplify, opts_brkiter)
}


stringi/R/utils.R0000644000176200001440000001524414750110641013422 0ustar liggesusers# kate: default-dictionary en_US

## This file is part of the 'stringi' package for R.
## Copyright (c) 2013-2025, Marek Gagolewski
## All rights reserved.
##
## Redistribution and use in source and binary forms, with or without
## modification, are permitted provided that the following conditions are met:
##
## 1. Redistributions of source code must retain the above copyright notice,
## this list of conditions and the following disclaimer.
##
## 2. Redistributions in binary form must reproduce the above copyright notice,
## this list of conditions and the following disclaimer in the documentation
## and/or other materials provided with the distribution.
##
## 3. Neither the name of the copyright holder nor the names of its
## contributors may be used to endorse or promote products derived from
## this software without specific prior written permission.
##
## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
## 'AS IS' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING,
## BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
## FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT ## HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, ## SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ## PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; ## OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, ## WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE ## OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, ## EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #' @title #' Convert a List to a Character Matrix #' #' @description #' This function converts a given list of atomic vectors to #' a character matrix. #' #' @details #' This function is similar to the built-in \code{\link{simplify2array}} #' function. However, it always returns a character matrix, #' even if each element in \code{x} is of length 1 #' or if elements in \code{x} are not of the same lengths. #' Moreover, the elements in \code{x} are always coerced to character vectors. #' #' If \code{byrow} is \code{FALSE}, then a matrix with \code{length(x)} #' columns is returned. #' The number of rows is the length of the #' longest vector in \code{x}, but no less than \code{n_min}. Basically, we have #' \code{result[i,j] == x[[j]][i]} if \code{i <= length(x[[j]])} #' and \code{result[i,j] == fill} otherwise, see Examples. #' #' If \code{byrow} is \code{TRUE}, then the resulting matrix is #' a transposition of the above-described one. #' #' This function may be useful, e.g., in connection with \code{\link{stri_split}} #' and \code{\link{stri_extract_all}}. #' #' @param x a list of atomic vectors #' @param byrow a single logical value; should the resulting matrix be #' transposed? #' @param fill a single string, see Details #' @param n_min a single integer value; minimal number of rows (\code{byrow==FALSE}) #' or columns (otherwise) in the resulting matrix #' @param by_row alias of \code{byrow} #' #' @return #' Returns a character matrix. 
#'
#' @examples
#' simplify2array(list(c('a', 'b'), c('c', 'd'), c('e', 'f')))
#' stri_list2matrix(list(c('a', 'b'), c('c', 'd'), c('e', 'f')))
#' stri_list2matrix(list(c('a', 'b'), c('c', 'd'), c('e', 'f')), byrow=TRUE)
#'
#' simplify2array(list('a', c('b', 'c')))
#' stri_list2matrix(list('a', c('b', 'c')))
#' stri_list2matrix(list('a', c('b', 'c')), fill='')
#' stri_list2matrix(list('a', c('b', 'c')), fill='', n_min=5)
#'
#' @family utils
#' @export
stri_list2matrix <- function(x, byrow = FALSE, fill = NA_character_,
    n_min = 0, by_row = byrow)
{
    # `by_row` is an alias of `byrow`; when given explicitly, it wins
    if (!missing(by_row)) {
        byrow <- by_row
    }

    .Call(C_stri_list2matrix, x, byrow, stri_enc_toutf8(fill), n_min)
}


#' @title
#' Replace NAs with Empty Strings
#'
#' @description
#' This function replaces all missing values with empty strings.
#' See \code{\link{stri_replace_na}} for a generalization.
#'
#' @param x a character vector
#'
#' @return
#' Returns a character vector.
#'
#' @examples
#' stri_na2empty(c('a', NA, '', 'b'))
#'
#' @family utils
#' @export
stri_na2empty <- function(x)
{
    utf8 <- stri_enc_toutf8(x)
    utf8[is.na(utf8)] <- ""
    utf8
}


#' @title
#' Remove All Empty Strings from a Character Vector
#'
#' @description
#' \code{stri_remove_empty} (alias \code{stri_omit_empty})
#' removes all empty strings from a character vector,
#' and, if \code{na_empty} is \code{TRUE}, also gets rid of all missing
#' values.
#'
#' \code{stri_remove_empty_na} (alias \code{stri_omit_empty_na})
#' removes both empty strings and missing values.
#'
#' \code{stri_remove_na} (alias \code{stri_omit_na})
#' returns a version of \code{x} with missing values removed.
#'
#' @param x a character vector
#' @param na_empty should missing values be treated as empty strings?
#'
#' @return
#' Returns a character vector.
#'
#' @examples
#' stri_remove_empty(stri_na2empty(c('a', NA, '', 'b')))
#' stri_remove_empty(c('a', NA, '', 'b'))
#' stri_remove_empty(c('a', NA, '', 'b'), TRUE)
#'
#' stri_omit_empty_na(c('a', NA, '', 'b'))
#'
#' @family utils
#' @rdname stri_remove_empty
#' @export
stri_remove_empty <- function(x, na_empty = FALSE)
{
    x <- stri_enc_toutf8(x)
    nonempty <- !stri_isempty(x)

    if (identical(na_empty, TRUE)) {
        # missing values are dropped together with the empty strings
        x[!is.na(x) & nonempty]
    } else {
        x[nonempty]
    }
}


#' @rdname stri_remove_empty
#' @export
stri_omit_empty <- stri_remove_empty


#' @rdname stri_remove_empty
#' @export
stri_remove_empty_na <- function(x)
{
    stri_remove_empty(x, TRUE)
}


#' @rdname stri_remove_empty
#' @export
stri_omit_empty_na <- stri_remove_empty_na


#' @rdname stri_remove_empty
#' @export
stri_remove_na <- function(x)
{
    utf8 <- stri_enc_toutf8(x)
    utf8[!is.na(utf8)]
}


#' @rdname stri_remove_empty
#' @export
stri_omit_na <- stri_remove_na


#' @title
#' Replace Missing Values in a Character Vector
#'
#' @description
#' This function gives a convenient way to replace each missing (\code{NA})
#' value with a given string.
#'
#' @details
#' This function is roughly equivalent to
#' \code{str2 <- stri_enc_toutf8(str);
#' str2[is.na(str2)] <- stri_enc_toutf8(replacement);
#' str2}.
#' It may be used, e.g., wherever the 'plain R' \code{NA} handling is
#' desired, see Examples.
#'
#' @param str character vector or an object coercible to
#' @param replacement single string
#'
#' @return Returns a character vector.
#'
#' @examples
#' x <- c('test', NA)
#' stri_paste(x, 1:2) # 'test1' NA
#' paste(x, 1:2) # 'test 1' 'NA 2'
#' stri_paste(stri_replace_na(x), 1:2, sep=' ') # 'test 1' 'NA 2'
#'
#' @export
#' @family utils
stri_replace_na <- function(str, replacement = "NA")
{
    .Call(C_stri_replace_na, str, replacement)
}


stringi/R/search_locate_4.R0000644000176200001440000003176114750110641015303 0ustar liggesusers# kate: default-dictionary en_US

## This file is part of the 'stringi' package for R.
## Copyright (c) 2013-2025, Marek Gagolewski ## All rights reserved. ## ## Redistribution and use in source and binary forms, with or without ## modification, are permitted provided that the following conditions are met: ## ## 1. Redistributions of source code must retain the above copyright notice, ## this list of conditions and the following disclaimer. ## ## 2. Redistributions in binary form must reproduce the above copyright notice, ## this list of conditions and the following disclaimer in the documentation ## and/or other materials provided with the distribution. ## ## 3. Neither the name of the copyright holder nor the names of its ## contributors may be used to endorse or promote products derived from ## this software without specific prior written permission. ## ## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ## 'AS IS' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, ## BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS ## FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT ## HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, ## SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ## PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; ## OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, ## WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE ## OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, ## EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #' @title Locate Pattern Occurrences #' #' @description #' These functions find the indexes (positions) where #' there is a match to some pattern. #' The functions \code{stri_locate_all_*} locate all the matches. #' \code{stri_locate_first_*} and \code{stri_locate_last_*} #' give the first and the last matches, respectively. 
#' #' @details #' Vectorized over \code{str} and \code{pattern} (with recycling #' of the elements in the shorter vector if necessary). This allows to, #' for instance, search for one pattern in each string, #' search for each pattern in one string, #' and search for the i-th pattern within the i-th string. #' #' The matches may be extracted by calling #' \code{\link{stri_sub}} or \code{\link{stri_sub_all}}. #' Alternatively, you may call \code{\link{stri_extract}} directly. #' #' \code{stri_locate}, \code{stri_locate_all}, \code{stri_locate_first}, #' and \code{stri_locate_last} are convenience functions. #' They just call \code{stri_locate_*_*}, depending on the arguments used. #' #' #' #' @param str character vector; strings to search in #' #' @param pattern,regex,fixed,coll,charclass character vector; #' search patterns; for more details refer to \link{stringi-search} #' #' @param opts_collator,opts_fixed,opts_regex named list used to tune up #' the selected search engine's settings; see #' \code{\link{stri_opts_collator}}, \code{\link{stri_opts_fixed}}, #' and \code{\link{stri_opts_regex}}, respectively; \code{NULL} #' for the defaults #' #' @param merge single logical value; #' indicates whether consecutive sequences of indexes in the resulting #' matrix should be merged; \code{stri_locate_all_charclass} only #' #' @param omit_no_match single logical value; if \code{TRUE}, #' a no-match will be indicated by a matrix with 0 rows #' \code{stri_locate_all_*} only #' #' @param get_length single logical value; if \code{FALSE} (default), #' generate \emph{from-to} matrices; otherwise, output #' \emph{from-length} ones #' #' @param capture_groups single logical value; #' whether positions of matches to parenthesized subexpressions #' should be returned too (as \code{capture_groups} attribute); #' \code{stri_locate_*_regex} only #' #' @param mode single string; #' one of: \code{'first'} (the default), \code{'all'}, \code{'last'} #' #' @param ... 
supplementary arguments passed to the underlying functions, #' including additional settings for \code{opts_collator}, #' \code{opts_regex}, \code{opts_fixed}, and so on #' #' #' @return #' For \code{stri_locate_all_*}, #' a list of integer matrices is returned. Each list element #' represents the results of a separate search scenario. #' The first column gives the start positions #' of the matches, and the second column gives the end positions. #' Moreover, two \code{NA}s in a row denote \code{NA} arguments #' or a no-match (the latter only if \code{omit_no_match} is \code{FALSE}). #' #' \code{stri_locate_first_*} and \code{stri_locate_last_*} #' return an integer matrix with #' two columns, giving the start and end positions of the first #' or the last matches, respectively, and two \code{NA}s if and #' only if they are not found. #' #' For \code{stri_locate_*_regex}, if the match is of zero length, #' \code{end} will be one character less than \code{start}. #' Note that \code{stri_locate_last_regex} searches from start to end, #' but skips overlapping matches, see the example below. #' #' Setting \code{get_length=TRUE} results in the 2nd column representing #' the length of the match instead of the end position. In this case, #' negative length denotes a no-match. #' #' If \code{capture_groups=TRUE}, then the outputs are equipped with the #' \code{capture_groups} attribute, which is a list of matrices #' giving the start-end positions of matches to parenthesized subexpressions. #' Similarly to \code{stri_match_regex}, capture group names are extracted #' unless looking for first/last occurrences of many different patterns. 
#'
#' @examples
#' stri_locate_all('stringi', fixed='i')
#'
#' stri_locate_first_coll('hladn\u00FD', 'HLADNY', strength=1, locale='sk_SK')
#'
#' stri_locate_all_regex(
#'     c('breakfast=eggs;lunch=pizza', 'breakfast=spam', 'no food here'),
#'     '(?<when>\\w+)=(?<what>\\w+)',
#'     capture_groups=TRUE
#' ) # named capture groups
#'
#' stri_locate_all_fixed("abababa", "ABA", case_insensitive=TRUE, overlap=TRUE)
#' stri_locate_first_fixed("ababa", "aba")
#' stri_locate_last_fixed("ababa", "aba") # starts from end
#' stri_locate_last_regex("ababa", "aba") # no overlaps, from left to right
#'
#' x <- c("yes yes", "no", NA)
#' stri_locate_all_fixed(x, "yes")
#' stri_locate_all_fixed(x, "yes", omit_no_match=TRUE)
#' stri_locate_all_fixed(x, "yes", get_length=TRUE)
#' stri_locate_all_fixed(x, "yes", get_length=TRUE, omit_no_match=TRUE)
#' stri_locate_first_fixed(x, "yes")
#' stri_locate_first_fixed(x, "yes", get_length=TRUE)
#'
#' # Use regex positive-lookahead to locate overlapping pattern matches:
#' stri_locate_all_regex('ACAGAGACTTTAGATAGAGAAGA', '(?=AGA)')
#' # note that start > end here (match of length zero)
#'
#'
#' @family search_locate
#' @family indexing
#'
#' @export
#' @rdname stri_locate
stri_locate_all <- function(str, ..., regex, fixed, coll, charclass)
{
    # exactly one of the four pattern arguments selects the search engine
    providedarg <- c(
        regex=!missing(regex),
        fixed=!missing(fixed),
        coll=!missing(coll),
        charclass=!missing(charclass))

    if (sum(providedarg) != 1)
        stop("you have to specify either `regex`, `fixed`, `coll`, or `charclass`")

    if (providedarg["regex"])
        stri_locate_all_regex(str, regex, ...)
    else if (providedarg["fixed"])
        stri_locate_all_fixed(str, fixed, ...)
    else if (providedarg["coll"])
        stri_locate_all_coll(str, coll, ...)
    else if (providedarg["charclass"])
        stri_locate_all_charclass(str, charclass, ...)
}


#' @export
#' @rdname stri_locate
stri_locate_first <- function(str, ..., regex, fixed, coll, charclass)
{
    # exactly one of the four pattern arguments selects the search engine
    providedarg <- c(
        regex=!missing(regex),
        fixed=!missing(fixed),
        coll=!missing(coll),
        charclass=!missing(charclass))

    if (sum(providedarg) != 1)
        stop("you have to specify either `regex`, `fixed`, `coll`, or `charclass`")

    if (providedarg["regex"])
        stri_locate_first_regex(str, regex, ...)
    else if (providedarg["fixed"])
        stri_locate_first_fixed(str, fixed, ...)
    else if (providedarg["coll"])
        stri_locate_first_coll(str, coll, ...)
    else if (providedarg["charclass"])
        stri_locate_first_charclass(str, charclass, ...)
}


#' @export
#' @rdname stri_locate
stri_locate_last <- function(str, ..., regex, fixed, coll, charclass)
{
    # exactly one of the four pattern arguments selects the search engine
    providedarg <- c(
        regex=!missing(regex),
        fixed=!missing(fixed),
        coll=!missing(coll),
        charclass=!missing(charclass))

    if (sum(providedarg) != 1)
        stop("you have to specify either `regex`, `fixed`, `coll`, or `charclass`")

    if (providedarg["regex"])
        stri_locate_last_regex(str, regex, ...)
    else if (providedarg["fixed"])
        stri_locate_last_fixed(str, fixed, ...)
    else if (providedarg["coll"])
        stri_locate_last_coll(str, coll, ...)
    else if (providedarg["charclass"])
        stri_locate_last_charclass(str, charclass, ...)
}


#' @export
#' @rdname stri_locate
stri_locate <- function(
    str, ..., regex, fixed, coll, charclass,
    mode=c("first", "all", "last")
) {
    # `first` is default for compatibility with stringr
    mode <- match.arg(mode) # this is slow

    # the pattern arguments are forwarded as-is; the callee decides which
    # one of them was actually provided
    switch(mode,
        first=stri_locate_first(str, ..., regex=regex, fixed=fixed, coll=coll, charclass=charclass),
        last=stri_locate_last(str, ..., regex=regex, fixed=fixed, coll=coll, charclass=charclass),
        all=stri_locate_all(str, ..., regex=regex, fixed=fixed, coll=coll, charclass=charclass))
}


#' @export
#' @rdname stri_locate
stri_locate_all_charclass <- function(
    str, pattern, merge=TRUE, omit_no_match=FALSE, get_length=FALSE
) {
    .Call(C_stri_locate_all_charclass, str, pattern, merge, omit_no_match, get_length)
}


#' @export
#' @rdname stri_locate
stri_locate_first_charclass <- function(str, pattern, get_length=FALSE)
{
    .Call(C_stri_locate_first_charclass, str, pattern, get_length)
}


#' @export
#' @rdname stri_locate
stri_locate_last_charclass <- function(str, pattern, get_length=FALSE)
{
    .Call(C_stri_locate_last_charclass, str, pattern, get_length)
}


#' @export
#' @rdname stri_locate
stri_locate_all_coll <- function(
    str, pattern, omit_no_match=FALSE, get_length=FALSE,
    ..., opts_collator=NULL
) {
    # settings given via `...` are merged into `opts_collator`
    if (!missing(...))
        opts_collator <- do.call(stri_opts_collator, as.list(c(opts_collator, ...)))
    .Call(C_stri_locate_all_coll, str, pattern, omit_no_match, opts_collator, get_length)
}


#' @export
#' @rdname stri_locate
stri_locate_first_coll <- function(
    str, pattern, get_length=FALSE, ..., opts_collator=NULL
) {
    if (!missing(...))
        opts_collator <- do.call(stri_opts_collator, as.list(c(opts_collator, ...)))
    .Call(C_stri_locate_first_coll, str, pattern, opts_collator, get_length)
}


#' @export
#' @rdname stri_locate
stri_locate_last_coll <- function(
    str, pattern, get_length=FALSE, ..., opts_collator=NULL
) {
    if (!missing(...))
        opts_collator <- do.call(stri_opts_collator, as.list(c(opts_collator, ...)))
    .Call(C_stri_locate_last_coll, str, pattern, opts_collator, get_length)
}


#' @export
#' @rdname stri_locate
stri_locate_all_regex <- function(
    str, pattern, omit_no_match=FALSE, capture_groups=FALSE, get_length=FALSE,
    ..., opts_regex=NULL
) {
    # settings given via `...` are merged into `opts_regex`
    if (!missing(...))
        opts_regex <- do.call(stri_opts_regex, as.list(c(opts_regex, ...)))
    .Call(C_stri_locate_all_regex, str, pattern, omit_no_match, opts_regex, capture_groups, get_length)
}


#' @export
#' @rdname stri_locate
stri_locate_first_regex <- function(
    str, pattern, capture_groups=FALSE, get_length=FALSE,
    ..., opts_regex=NULL
) {
    if (!missing(...))
        opts_regex <- do.call(stri_opts_regex, as.list(c(opts_regex, ...)))
    .Call(C_stri_locate_first_regex, str, pattern, opts_regex, capture_groups, get_length)
}


#' @export
#' @rdname stri_locate
stri_locate_last_regex <- function(
    str, pattern, capture_groups=FALSE, get_length=FALSE,
    ..., opts_regex=NULL
) {
    if (!missing(...))
        opts_regex <- do.call(stri_opts_regex, as.list(c(opts_regex, ...)))
    .Call(C_stri_locate_last_regex, str, pattern, opts_regex, capture_groups, get_length)
}


#' @export
#' @rdname stri_locate
stri_locate_all_fixed <- function(
    str, pattern, omit_no_match=FALSE, get_length=FALSE,
    ..., opts_fixed=NULL
) {
    # settings given via `...` are merged into `opts_fixed`
    if (!missing(...))
        opts_fixed <- do.call(stri_opts_fixed, as.list(c(opts_fixed, ...)))
    .Call(C_stri_locate_all_fixed, str, pattern, omit_no_match, opts_fixed, get_length)
}


#' @export
#' @rdname stri_locate
stri_locate_first_fixed <- function(
    str, pattern, get_length=FALSE, ..., opts_fixed=NULL
) {
    if (!missing(...))
        opts_fixed <- do.call(stri_opts_fixed, as.list(c(opts_fixed, ...)))
    .Call(C_stri_locate_first_fixed, str, pattern, opts_fixed, get_length)
}


#' @export
#' @rdname stri_locate
stri_locate_last_fixed <- function(
    str, pattern, get_length=FALSE, ..., opts_fixed=NULL
) {
    if (!missing(...))
        opts_fixed <- do.call(stri_opts_fixed, as.list(c(opts_fixed, ...)))
    .Call(C_stri_locate_last_fixed, str, pattern, opts_fixed, get_length)
}


stringi/R/length.R0000644000176200001440000001631314750110641013541 
0ustar liggesusers# kate: default-dictionary en_US ## This file is part of the 'stringi' package for R. ## Copyright (c) 2013-2025, Marek Gagolewski ## All rights reserved. ## ## Redistribution and use in source and binary forms, with or without ## modification, are permitted provided that the following conditions are met: ## ## 1. Redistributions of source code must retain the above copyright notice, ## this list of conditions and the following disclaimer. ## ## 2. Redistributions in binary form must reproduce the above copyright notice, ## this list of conditions and the following disclaimer in the documentation ## and/or other materials provided with the distribution. ## ## 3. Neither the name of the copyright holder nor the names of its ## contributors may be used to endorse or promote products derived from ## this software without specific prior written permission. ## ## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ## 'AS IS' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, ## BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS ## FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT ## HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, ## SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ## PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; ## OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, ## WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE ## OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, ## EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #' @title #' Count the Number of Bytes #' #' @description #' Counts the number of bytes needed to store #' each string in the computer's memory. #' #' @details #' Often, this is not the function you would normally use #' in your string processing activities. See \code{\link{stri_length}} instead. 
#'
#' For 8-bit encoded strings, this is the same as \code{\link{stri_length}}.
#' For UTF-8 strings, the returned values may be greater
#' than the number of code points, as UTF-8 is not a fixed-byte encoding:
#' one code point may be encoded by 1-4 bytes
#' (according to the current Unicode standard).
#'
#' Missing values are handled properly.
#'
#' The strings do not need to be re-encoded to perform this operation.
#'
#' The returned values do not include the trailing NUL bytes,
#' which are used internally to mark the end of string data (in C).
#'
#' @param str character vector or an object coercible to
#'
#' @return Returns an integer vector of the same length as \code{str}.
#'
#' @examples
#' stri_numbytes(letters)
#' stri_numbytes(c('abc', '123', '\u0105\u0104'))
#'
#' \dontrun{
#' # this used to fail on Windows, where there was no native support
#' # for 4-byte Unicode characters; see, however, stri_unescape_unicode():
#' stri_numbytes('\U001F600') # compare stri_length('\U001F600')
#' }
#'
#' @export
#' @family length
stri_numbytes <- function(str)
{
    # thin wrapper around the compiled routine
    .Call(C_stri_numbytes, str)
}


#' @title
#' Count the Number of Code Points
#'
#' @description
#' This function returns the number of code points
#' in each string.
#'
#' @details
#' Note that the number of code points is
#' not the same as the `width` of the string when
#' printed on the console.
#'
#' If a given string is in UTF-8 and has not been properly normalized
#' (e.g., by \code{\link{stri_trans_nfc}}), the returned counts may sometimes be
#' misleading. See \code{\link{stri_count_boundaries}} for a method to count
#' \emph{Unicode characters}. Moreover, if an incorrect UTF-8 byte sequence
#' is detected, then a warning is generated and the corresponding output element
#' is set to \code{NA}, see also \code{\link{stri_enc_toutf8}} for a method
#' to deal with such cases.
#'
#' Missing values are handled properly.
#' For `byte` encodings we get, as usual, an error.
#'
#' @param str character vector or an object coercible to
#' @return Returns an integer vector of the same length as \code{str}.
#'
#' @examples
#' stri_length(LETTERS)
#' stri_length(c('abc', '123', '\u0105\u0104'))
#' stri_length('\u0105') # length is one, but...
#' stri_numbytes('\u0105') # 2 bytes are used
#' stri_numbytes(stri_trans_nfkd('\u0105')) # 3 bytes here but...
#' stri_length(stri_trans_nfkd('\u0105')) # ...two code points (!)
#' stri_count_boundaries(stri_trans_nfkd('\u0105'), type='character') # ...and one Unicode character
#'
#' @export
#' @family length
stri_length <- function(str)
{
    # thin wrapper around the compiled routine
    .Call(C_stri_length, str)
}


#' @title
#' Determine if a String is of Length Zero
#'
#' @description
#' This is the fastest way to find out
#' whether the elements of a character vector are empty strings.
#'
#' @details
#' Missing values are handled properly.
#'
#' @param str character vector or an object coercible to
#' @return Returns a logical vector of the same length as \code{str}.
#'
#' @examples
#' stri_isempty(letters[1:3])
#' stri_isempty(c(',', '', 'abc', '123', '\u0105\u0104'))
#' stri_isempty(character(1))
#'
#' @export
#' @family length
stri_isempty <- function(str)
{
    # thin wrapper around the compiled routine
    .Call(C_stri_isempty, str)
}


#' @title
#' Determine the Width of Code Points
#'
#' @description
#' Approximates the number of text columns the `cat()` function
#' might use to print a string using a mono-spaced font.
#'
#' @details
#' The Unicode standard does not formalize the notion of a character
#' width. Roughly based on \url{http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c},
#' \url{https://github.com/nodejs/node/blob/master/src/node_i18n.cc},
#' and UAX #11 we proceed as follows.
#' The following code points are of width 0:
#' \itemize{
#' \item code points with general category (see \link{stringi-search-charclass})
#' \code{Me}, \code{Mn}, and \code{Cf},
#' \item \code{C0} and \code{C1} control codes (general category \code{Cc})
#' - for compatibility with the \code{\link{nchar}} function,
#' \item Hangul Jamo medial vowels and final consonants
#' (code points with enumerable property \code{UCHAR_HANGUL_SYLLABLE_TYPE}
#' equal to \code{U_HST_VOWEL_JAMO} or \code{U_HST_TRAILING_JAMO};
#' note that applying the NFC normalization with \code{\link{stri_trans_nfc}}
#' is encouraged),
#' \item ZERO WIDTH SPACE (U+200B).
#' }
#'
#' Characters with the \code{UCHAR_EAST_ASIAN_WIDTH} enumerable property
#' equal to \code{U_EA_FULLWIDTH} or \code{U_EA_WIDE} are
#' of width 2.
#'
#' Most emojis and characters with general category So (other symbols)
#' are of width 2.
#'
#' SOFT HYPHEN (U+00AD) (for compatibility with \code{\link{nchar}})
#' as well as any other characters have width 1.
#'
#' @param str character vector or an object coercible to
#' @return Returns an integer vector of the same length as \code{str}.
#'
#' @examples
#' stri_width(LETTERS[1:5])
#' stri_width(stri_trans_nfkd('\u0105'))
#' stri_width(stri_trans_nfkd('\U0001F606'))
#' stri_width( # Full-width equivalents of ASCII characters:
#' stri_enc_fromutf32(as.list(c(0x3000, 0xFF01:0xFF5E)))
#' )
#' stri_width(stri_trans_nfkd('\ubc1f')) # includes Hangul Jamo medial vowels and final consonants
#' @export
#' @family length
#'
#' @references
#' \emph{East Asian Width} -- Unicode Standard Annex #11,
#' \url{https://www.unicode.org/reports/tr11/}
stri_width <- function(str)
{
    # thin wrapper around the compiled routine
    .Call(C_stri_width, str)
}


stringi/R/ICU_settings.R0000644000176200001440000000775314750110641014630 0ustar liggesusers# kate: default-dictionary en_US

## This file is part of the 'stringi' package for R.
## Copyright (c) 2013-2025, Marek Gagolewski
## All rights reserved.
## ## Redistribution and use in source and binary forms, with or without ## modification, are permitted provided that the following conditions are met: ## ## 1. Redistributions of source code must retain the above copyright notice, ## this list of conditions and the following disclaimer. ## ## 2. Redistributions in binary form must reproduce the above copyright notice, ## this list of conditions and the following disclaimer in the documentation ## and/or other materials provided with the distribution. ## ## 3. Neither the name of the copyright holder nor the names of its ## contributors may be used to endorse or promote products derived from ## this software without specific prior written permission. ## ## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ## 'AS IS' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, ## BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS ## FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT ## HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, ## SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ## PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; ## OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, ## WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE ## OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, ## EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #' @title #' Query Default Settings for \pkg{stringi} #' #' @description #' Gives the current default settings used by the \pkg{ICU} library. #' #' @param short logical; whether or not the results should be given #' in a concise form; defaults to \code{FALSE} #' #' @return If \code{short} is \code{TRUE}, then a single string providing #' information on the default character encoding, locale, and Unicode #' as well as \pkg{ICU} version is returned.
#'
#' Otherwise, a list with the following components is returned:
#' \itemize{
#' \item \code{Unicode.version} -- version of Unicode supported
#' by the \pkg{ICU} library;
#' \item \code{ICU.version} -- \pkg{ICU} library version used;
#' \item \code{Locale} -- contains information on default locale,
#' as returned by \code{\link{stri_locale_info}};
#' \item \code{Charset.internal} -- fixed at \code{c('UTF-8', 'UTF-16')};
#' \item \code{Charset.native} -- information on the default encoding,
#' as returned by \code{\link{stri_enc_info}};
#' \item \code{ICU.system} -- logical; \code{TRUE} indicates that
#' the system \pkg{ICU} libs are used, otherwise \pkg{ICU} was built together
#' with \pkg{stringi};
#' \item \code{ICU.UTF8} -- logical; \code{TRUE} if the internal
#' \code{U_CHARSET_IS_UTF8} flag is defined and set.
#' }
#'
#' @export
#' @family locale
#' @family encoding
stri_info <- function(short = FALSE)
{
    # `short` must be a single TRUE or FALSE; NA passes is.logical() but would
    # only fail later at `if (!short)`, after the native call and any warnings,
    # so it is rejected up front
    stopifnot(is.logical(short), length(short) == 1, !is.na(short))

    info <- .Call(C_stri_info)
    locale <- info$Locale$Name
    charset <- info$Charset.native$Name.friendly

    # warn about native encodings that are likely to cause conversion trouble
    if (charset != "UTF-8") {
        if (!identical(info$Charset.native$ASCII.subset, TRUE))
            warning(stri_paste("Your native character encoding is not a superset of US-ASCII. ",
                "Consider switching to UTF-8."))
        else if (!identical(info$Charset.native$Unicode.1to1, TRUE))
            warning(stri_paste("Your native character encoding does not map to Unicode properly. ",
                "Consider switching to UTF-8."))
    }

    # long form: hand back the complete list obtained from the native routine
    if (!short)
        return(info)

    # short form: a one-line summary string
    sprintf("stringi_%s (%s.%s; ICU4C %s [%s%s]; Unicode %s)",
        as.character(packageVersion("stringi")),
        locale,
        charset,
        info$ICU.version,
        if (info$ICU.system) "system" else "bundle",
        if (info$ICU.UTF8) "#U_CHARSET_IS_UTF8" else "",
        info$Unicode.version)
}

stringi/R/search_subset_4.R0000644000176200001440000002051014750110641015327 0ustar liggesusers# kate: default-dictionary en_US ## This file is part of the 'stringi' package for R.
## Copyright (c) 2013-2025, Marek Gagolewski ## All rights reserved. ## ## Redistribution and use in source and binary forms, with or without ## modification, are permitted provided that the following conditions are met: ## ## 1. Redistributions of source code must retain the above copyright notice, ## this list of conditions and the following disclaimer. ## ## 2. Redistributions in binary form must reproduce the above copyright notice, ## this list of conditions and the following disclaimer in the documentation ## and/or other materials provided with the distribution. ## ## 3. Neither the name of the copyright holder nor the names of its ## contributors may be used to endorse or promote products derived from ## this software without specific prior written permission. ## ## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ## 'AS IS' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, ## BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS ## FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT ## HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, ## SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ## PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; ## OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, ## WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE ## OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, ## EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #' @title #' Select Elements that Match a Given Pattern #' #' @description #' These functions return or modify a sub-vector where there is a match to #' a given pattern. In other words, they #' are roughly equivalent (but faster and easier to use) to a call to #' \code{str[\link{stri_detect}(str, ...)]} or #' \code{str[\link{stri_detect}(str, ...)] <- value}. 
#' #' @details #' Vectorized over \code{str} as well as partially over \code{pattern} #' and \code{value}, #' with recycling of the elements in the shorter vector if necessary. #' As the aim here is to subset \code{str}, \code{pattern} #' cannot be longer than the former. Moreover, if the number of #' items to replace is not a multiple of length of \code{value}, #' a warning is emitted and the unused elements are ignored. #' Hence, the length of the output will be the same as length of \code{str}. #' #' \code{stri_subset} and \code{stri_subset<-} are convenience functions. #' They call either \code{stri_subset_regex}, #' \code{stri_subset_fixed}, \code{stri_subset_coll}, #' or \code{stri_subset_charclass}, #' depending on the argument used. #' #' @param str character vector; strings to search within #' #' @param pattern,regex,fixed,coll,charclass character vector; #' search patterns (no more than the length of \code{str}); #' for more details refer to \link{stringi-search} #' #' @param negate single logical value; whether a no-match is rather of interest #' #' @param omit_na single logical value; should missing values be excluded #' from the result? #' #' @param opts_collator,opts_fixed,opts_regex a named list used to tune up #' the search engine's settings; see #' \code{\link{stri_opts_collator}}, \code{\link{stri_opts_fixed}}, #' and \code{\link{stri_opts_regex}}, respectively; \code{NULL} #' for the defaults #' #' @param ... supplementary arguments passed to the underlying functions, #' including additional settings for \code{opts_collator}, \code{opts_regex}, #' \code{opts_fixed}, and so on #' #' @param value non-empty character vector of replacement strings; #' replacement function only #' #' #' @return The \code{stri_subset_*} functions return a character vector. #' As usual, the output encoding is UTF-8. #' #' The \code{stri_subset_*<-} functions modify \code{str} 'in-place'.
#'
#'
#' @examples
#' stri_subset_regex(c('stringi R', '123', 'ID456', ''), '^[0-9]+$')
#'
#' x <- c('stringi R', '123', 'ID456', '')
#' `stri_subset_regex<-`(x, '[0-9]+$', negate=TRUE, value=NA) # returns a copy
#' stri_subset_regex(x, '[0-9]+$') <- NA # modifies `x` in-place
#' print(x)
#'
#' @family search_subset
#' @export
#' @rdname stri_subset
stri_subset <- function(str, ..., regex, fixed, coll, charclass)
{
    # exactly one of the four pattern arguments selects the search engine
    engine_given <- c(
        regex = !missing(regex),
        fixed = !missing(fixed),
        coll = !missing(coll),
        charclass = !missing(charclass)
    )

    if (sum(engine_given) != 1)
        stop("you have to specify either `regex`, `fixed`, `coll`, or `charclass`")

    # only the selected branch is evaluated
    switch(names(engine_given)[engine_given],
        regex = stri_subset_regex(str, regex, ...),
        fixed = stri_subset_fixed(str, fixed, ...),
        coll = stri_subset_coll(str, coll, ...),
        charclass = stri_subset_charclass(str, charclass, ...)
    )
}


#' @export
#' @rdname stri_subset
#' @usage stri_subset(str, ..., regex, fixed, coll, charclass) <- value
`stri_subset<-` <- function(str, ..., regex, fixed, coll, charclass, value)
{
    # exactly one of the four pattern arguments selects the search engine
    engine_given <- c(
        regex = !missing(regex),
        fixed = !missing(fixed),
        coll = !missing(coll),
        charclass = !missing(charclass)
    )

    if (sum(engine_given) != 1)
        stop("you have to specify either `regex`, `fixed`, `coll`, or `charclass`")

    # dispatch to the engine-specific replacement function
    switch(names(engine_given)[engine_given],
        regex = `stri_subset_regex<-`(str, regex, ..., value = value),
        fixed = `stri_subset_fixed<-`(str, fixed, ..., value = value),
        coll = `stri_subset_coll<-`(str, coll, ..., value = value),
        charclass = `stri_subset_charclass<-`(str, charclass, ..., value = value)
    )
}


#' @export
#' @rdname stri_subset
stri_subset_fixed <- function(str, pattern, omit_na = FALSE, negate = FALSE, ..., opts_fixed = NULL)
{
    # extra arguments given via `...` are merged into the options list
    if (!missing(...)) {
        opts_fixed <- do.call(stri_opts_fixed, as.list(c(opts_fixed, ...)))
    }

    .Call(C_stri_subset_fixed, str, pattern, omit_na, negate, opts_fixed)
}


#' @export
#' @rdname stri_subset
#' @usage stri_subset_fixed(str, pattern, negate=FALSE, ..., opts_fixed=NULL) <- value
`stri_subset_fixed<-` <- function(str, pattern, negate = FALSE, ..., opts_fixed = NULL, value)
{
    # extra arguments given via `...` are merged into the options list
    if (!missing(...)) {
        opts_fixed <- do.call(stri_opts_fixed, as.list(c(opts_fixed, ...)))
    }

    .Call(C_stri_subset_fixed_replacement, str, pattern, negate, opts_fixed, value)
}


#' @export
#' @rdname stri_subset
stri_subset_charclass <- function(str, pattern, omit_na = FALSE, negate = FALSE)
    .Call(C_stri_subset_charclass, str, pattern, omit_na, negate)


#' @export
#' @rdname stri_subset
#' @usage stri_subset_charclass(str, pattern, negate=FALSE) <- value
`stri_subset_charclass<-` <- function(str, pattern, negate = FALSE, value)
    .Call(C_stri_subset_charclass_replacement, str, pattern, negate, value)


#' @export
#' @rdname stri_subset
stri_subset_coll <- function(str, pattern, omit_na = FALSE, negate = FALSE, ..., opts_collator = NULL)
{
    # extra arguments given via `...` are merged into the options list
    if (!missing(...)) {
        opts_collator <- do.call(stri_opts_collator, as.list(c(opts_collator, ...)))
    }

    .Call(C_stri_subset_coll, str, pattern, omit_na, negate, opts_collator)
}


#' @export
#' @rdname stri_subset
#' @usage stri_subset_coll(str, pattern, negate=FALSE, ..., opts_collator=NULL) <- value
`stri_subset_coll<-` <- function(str, pattern, negate = FALSE, ..., opts_collator = NULL, value)
{
    # extra arguments given via `...` are merged into the options list
    if (!missing(...)) {
        opts_collator <- do.call(stri_opts_collator, as.list(c(opts_collator, ...)))
    }

    .Call(C_stri_subset_coll_replacement, str, pattern, negate, opts_collator, value)
}


#' @export
#' @rdname stri_subset
stri_subset_regex <- function(str, pattern, omit_na = FALSE, negate = FALSE, ..., opts_regex = NULL)
{
    # extra arguments given via `...` are merged into the options list
    if (!missing(...)) {
        opts_regex <- do.call(stri_opts_regex, as.list(c(opts_regex, ...)))
    }

    .Call(C_stri_subset_regex, str, pattern, omit_na, negate, opts_regex)
}


#' @export
#' @rdname stri_subset
#' @usage stri_subset_regex(str, pattern, negate=FALSE, ..., opts_regex=NULL) <- value
`stri_subset_regex<-` <- function(str, pattern, negate = FALSE, ..., opts_regex = NULL, value)
{
    # extra arguments given via `...` are merged into the options list
    if (!missing(...)) {
        opts_regex <- do.call(stri_opts_regex, as.list(c(opts_regex, ...)))
    }

    .Call(C_stri_subset_regex_replacement, str, pattern, negate, opts_regex, value)
}

stringi/R/stats.R0000644000176200001440000001106614750110641013416 0ustar liggesusers# kate: default-dictionary en_US ## This file is part of the 'stringi' package for R. ## Copyright (c) 2013-2025, Marek Gagolewski ## All rights reserved. ## ## Redistribution and use in source and binary forms, with or without ## modification, are permitted provided that the following conditions are met: ## ## 1. Redistributions of source code must retain the above copyright notice, ## this list of conditions and the following disclaimer. ## ## 2. Redistributions in binary form must reproduce the above copyright notice, ## this list of conditions and the following disclaimer in the documentation ## and/or other materials provided with the distribution. ## ## 3. Neither the name of the copyright holder nor the names of its ## contributors may be used to endorse or promote products derived from ## this software without specific prior written permission. ## ## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ## 'AS IS' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, ## BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS ## FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT ## HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, ## SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ## PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; ## OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, ## WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE ## OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, ## EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#' @title
#' General Statistics for a Character Vector
#'
#' @description
#' This function gives general statistics for a character vector,
#' e.g., obtained by loading a text file with the
#' \code{\link{readLines}} or \code{\link{stri_read_lines}} function,
#' where each text line is represented by a separate string.
#'
#' @details
#' None of the strings may contain \code{\\r} or \code{\\n} characters,
#' otherwise you will get an error.
#'
#' Below by `white space` we mean the Unicode binary property
#' \code{WHITE_SPACE}, see \code{stringi-search-charclass}.
#'
#' @param str character vector to be aggregated
#' @return Returns an integer vector with the following named elements:
#' \enumerate{
#' \item \code{Lines} - number of lines (number of
#' non-missing strings in the vector);
#' \item \code{LinesNEmpty} - number of lines with at least
#' one non-\code{WHITE_SPACE} character;
#' \item \code{Chars} - total number of Unicode code points detected;
#' \item \code{CharsNWhite} - number of Unicode code points
#' that are not \code{WHITE_SPACE}s;
#' \item ... (Other stuff that may appear in future releases of \pkg{stringi}).
#' }
#' @examples
#' s <- c('Lorem ipsum dolor sit amet, consectetur adipisicing elit.',
#' 'nibh augue, suscipit a, scelerisque sed, lacinia in, mi.',
#' 'Cras vel lorem. Etiam pellentesque aliquet tellus.',
#' '')
#' stri_stats_general(s)
#'
#' @family stats
#' @export
stri_stats_general <- function(str)
    .Call(C_stri_stats_general, str)  # the body only forwards `str` to the native routine


#' @title
#' Statistics for a Character Vector Containing LaTeX Commands
#'
#' @description
#' This function gives LaTeX-oriented statistics for a character vector,
#' e.g., obtained by loading a text file with the
#' \code{\link{readLines}} function, where each text line
#' is represented by a separate string.
#'
#' @details
#' We use a slightly modified LaTeX Word Count algorithm implemented in
#' Kile 2.1.3, see
#' \url{https://kile.sourceforge.io/team.php} for the original contributors.
#'
#'
#'
#' @param str character vector to be aggregated
#' @return Returns an integer vector with the following named elements:
#' \enumerate{
#' \item \code{CharsWord} - number of word characters;
#' \item \code{CharsCmdEnvir} - command and words characters;
#' \item \code{CharsWhite} - LaTeX white spaces, including \{ and \} in some contexts;
#' \item \code{Words} - number of words;
#' \item \code{Cmds} - number of commands;
#' \item \code{Envirs} - number of environments;
#' \item ... (Other stuff that may appear in future releases of \pkg{stringi}).
#' }
#' @examples
#' s <- c('Lorem \\textbf{ipsum} dolor sit \\textit{amet}, consectetur adipisicing elit.',
#' '\\begin{small}Proin nibh augue,\\end{small} suscipit a, scelerisque sed, lacinia in, mi.',
#' '')
#' stri_stats_latex(s)
#'
#' @family stats
#' @export
stri_stats_latex <- function(str)
    .Call(C_stri_stats_latex, str)  # the body only forwards `str` to the native routine

stringi/R/search.R0000644000176200001440000010455014750110641013526 0ustar liggesusers# kate: default-dictionary en_US ## This file is part of the 'stringi' package for R. ## Copyright (c) 2013-2025, Marek Gagolewski ## All rights reserved. ## ## Redistribution and use in source and binary forms, with or without ## modification, are permitted provided that the following conditions are met: ## ## 1. Redistributions of source code must retain the above copyright notice, ## this list of conditions and the following disclaimer. ## ## 2. Redistributions in binary form must reproduce the above copyright notice, ## this list of conditions and the following disclaimer in the documentation ## and/or other materials provided with the distribution. ## ## 3. Neither the name of the copyright holder nor the names of its ## contributors may be used to endorse or promote products derived from ## this software without specific prior written permission.
## ## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ## 'AS IS' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, ## BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS ## FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT ## HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, ## SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ## PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; ## OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, ## WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE ## OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, ## EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #' @title String Searching #' #' @description #' This man page explains how to perform string search-based #' operations in \pkg{stringi}. #' #' @details #' The following independent string searching engines are available #' in \pkg{stringi}. #' \itemize{ #' \item \code{stri_*_regex} -- \pkg{ICU}'s regular expressions (regexes), #' see \link{about_search_regex}, #' \item \code{stri_*_fixed} -- locale-independent byte-wise pattern matching, #' see \link{about_search_fixed}, #' \item \code{stri_*_coll} -- \pkg{ICU}'s \code{StringSearch}, #' locale-sensitive, Collator-based pattern search, #' useful for natural language processing tasks, #' see \link{about_search_coll}, #' \item \code{stri_*_charclass} -- character classes search, #' e.g., Unicode General Categories or Binary Properties, #' see \link{about_search_charclass}, #' \item \code{stri_*_boundaries} -- text boundary analysis, #' see \link{about_search_boundaries} #' } #' #' Each search engine is able to perform many search-based operations. 
#' These may include: #' \itemize{ #' \item \code{stri_detect_*} - detect if a pattern occurs in a string, #' see, e.g., \code{\link{stri_detect}}, #' \item \code{stri_count_*} - count the number of pattern occurrences, #' see, e.g., \code{\link{stri_count}}, #' \item \code{stri_locate_*} - locate all, first, or last occurrences #' of a pattern, see, e.g., \code{\link{stri_locate}}, #' \item \code{stri_extract_*} - extract all, first, or last occurrences #' of a pattern, see, e.g., \code{\link{stri_extract}} #' and, in case of regexes, \code{\link{stri_match}}, #' \item \code{stri_replace_*} - replace all, first, or last occurrences #' of a pattern, see, e.g., \code{\link{stri_replace}} #' and also \code{\link{stri_trim}}, #' \item \code{stri_split_*} - split a string into chunks indicated #' by occurrences of a pattern, #' see, e.g., \code{\link{stri_split}}, #' \item \code{stri_startswith_*} and \code{stri_endswith_*} detect #' if a string starts or ends with a pattern match, see, #' e.g., \code{\link{stri_startswith}}, #' \item \code{stri_subset_*} - return a subset of a character vector #' with strings that match a given pattern, see, e.g., \code{\link{stri_subset}}. #' } #' #' @name about_search #' @rdname about_search #' @aliases about_search search stringi-search #' @family text_boundaries #' @family search_regex #' @family search_fixed #' @family search_coll #' @family search_charclass #' @family search_detect #' @family search_count #' @family search_locate #' @family search_replace #' @family search_split #' @family search_subset #' @family search_extract #' @family search_in #' @family stringi_general_topics invisible(NULL) #' @title #' Regular Expressions in \pkg{stringi} #' #' @description #' A regular expression is a pattern describing, possibly in a very #' abstract way, a text fragment. 
#' With so many regex functions in \pkg{stringi}, #' regular expressions may be a very powerful tool #' to perform string searching, substring extraction, string splitting, etc., #' tasks. #' #' #' @details #' All \code{stri_*_regex} functions in \pkg{stringi} use #' the \pkg{ICU} regex engine. Its settings may be tuned up (for example #' to perform case-insensitive search) via the #' \code{\link{stri_opts_regex}} function. #' #' #' Regular expression patterns in \pkg{ICU} are quite similar in form and #' behavior to Perl's regexes. Their implementation is loosely inspired #' by JDK 1.4 \code{java.util.regex}. #' \pkg{ICU} Regular Expressions conform to the Unicode Technical Standard #18 #' (see References section) and its features are summarized in #' the ICU User Guide (see below). A good general introduction #' to regexes is (Friedl, 2002). #' Some general topics are also covered in the \R manual, see \link{regex}. #' #' @section \pkg{ICU} Regex Operators at a Glance: #' #' Here is a list of operators provided by the #' ICU User Guide on regexes. #' #' \describe{ #' \item{\code{|}}{Alternation. \code{A|B} matches either A or B.} #' \item{\code{*}}{Match 0 or more times. Match as many times as possible.} #' \item{\code{+}}{Match 1 or more times. Match as many times as possible.} #' \item{\code{?}}{Match zero or one times. Prefer one.} #' \item{\code{{n}} }{Match exactly n times.} #' \item{\code{{n,}} }{Match at least n times. Match as many times as possible.} #' \item{\code{{n,m}} }{Match between n and m times. #' Match as many times as possible, but not more than m.} #' \item{\code{*?}}{Match 0 or more times. Match as few times as possible.} #' \item{\code{+?}}{Match 1 or more times. Match as few times as possible.} #' \item{\code{??}}{Match zero or one times. 
Prefer zero.} #' \item{\code{{n}?}}{Match exactly n times.} #' \item{\code{{n,}?}}{Match at least n times, but no more than required #' for an overall pattern match.} #' \item{\code{{n,m}?}}{Match between n and m times. Match as few times #' as possible, but not less than n.} #' \item{\code{*+}}{Match 0 or more times. Match as many times as possible #' when first encountered, do not retry with fewer even if overall match fails #' (Possessive Match).} #' \item{\code{++}}{Match 1 or more times. Possessive match.} #' \item{\code{?+}}{Match zero or one times. Possessive match.} #' \item{\code{{n}+}}{Match exactly n times.} #' \item{\code{{n,}+}}{Match at least n times. Possessive Match.} #' \item{\code{{n,m}+}}{Match between n and m times. Possessive Match.} #' \item{\code{(...)}}{Capturing parentheses. Range of input that matched #' the parenthesized sub-expression is available after the match, #' see \code{\link{stri_match}}.} #' \item{\code{(?:...)}}{Non-capturing parentheses. Groups the included pattern, #' but does not provide capturing of matching text. Somewhat more efficient #' than capturing parentheses.} #' \item{\code{(?>...)}}{Atomic-match parentheses. The first match of the #' parenthesized sub-expression is the only one tried; if it does not lead to #' an overall pattern match, back up the search for a match to a position #' before the \code{(?>}.} #' \item{\code{(?#...)}}{Free-format comment \code{(?# comment )}.} #' \item{\code{(?=...)}}{Look-ahead assertion. True if the parenthesized #' pattern matches at the current input position, but does not advance #' the input position.} #' \item{\code{(?!...)}}{Negative look-ahead assertion. True if the #' parenthesized pattern does not match at the current input position. #' Does not advance the input position.} #' \item{\code{(?<=...)}}{Look-behind assertion. 
True if the parenthesized #' pattern matches text preceding the current input position, with the last #' character of the match being the input character just before the current #' position. Does not alter the input position. The length of possible strings #' matched by the look-behind pattern must not be unbounded (no \code{*} #' or \code{+} operators.)} #' \item{\code{(?<!...)}}{Negative look-behind assertion. True if the parenthesized #' pattern does not match text preceding the current input position, with the last #' character of the match being the input character just before the current #' position. Does not alter the input position. The length of possible strings #' matched by the look-behind pattern must not be unbounded (no \code{*} #' or \code{+} operators.)} #' \item{\code{(?<name>...)}}{Named capture group, where \code{name} #' (enclosed within the angle brackets) #' is a sequence like \code{[A-Za-z][A-Za-z0-9]*}} #' \item{\code{(?ismwx-ismwx:...)}}{Flag settings. Evaluate the parenthesized #' expression with the specified flags enabled or \code{-}disabled, #' see also \code{\link{stri_opts_regex}}.} #' \item{\code{(?ismwx-ismwx)}}{Flag settings. Change the flag settings. #' Changes apply to the portion of the pattern following the setting. #' For example, \code{(?i)} changes to a case insensitive match, #' see also \code{\link{stri_opts_regex}}.} #' } #' #' #' @section \pkg{ICU} Regex Meta-characters at a Glance: #' #' Here is a list of meta-characters provided by the #' ICU User Guide on regexes. #' #' \describe{ #' \item{\code{\\a}}{Match a BELL, \code{\\u0007}.} #' \item{\code{\\A}}{Match at the beginning of the input. Differs from \code{^} #' in that \code{\\A} will not match after a new line within the input.} #' \item{\code{\\b}}{Match if the current position is a word boundary. #' Boundaries occur at the transitions between word (\code{\\w}) and non-word #' (\code{\\W}) characters, with combining marks ignored.
For better word #' boundaries, see \pkg{ICU} Boundary Analysis, e.g., \code{\link{stri_extract_all_words}}.} #' \item{\code{\\B}}{Match if the current position is not a word boundary.} #' \item{\code{\\cX}}{Match a control-\code{X} character.} #' \item{\code{\\d}}{Match any character with the Unicode General Category of #' \code{Nd} (Number, Decimal Digit.).} #' \item{\code{\\D}}{Match any character that is not a decimal digit.} #' \item{\code{\\e}}{Match an ESCAPE, \code{\\u001B}.} #' \item{\code{\\E}}{Terminates a \code{\\Q} ... \code{\\E} quoted sequence.} #' \item{\code{\\f}}{Match a FORM FEED, \code{\\u000C}.} #' \item{\code{\\G}}{Match if the current position is at the end of the #' previous match.} #' \item{\code{\\h}}{Match a Horizontal White Space character. #' They are characters with Unicode General Category of Space_Separator plus #' the ASCII tab, \code{\\u0009}. [Since ICU 55]} #' \item{\code{\\H}}{Match a non-Horizontal White Space character. #' [Since ICU 55]} #' \item{\code{\\k<name>}}{Named Capture Back Reference. [Since ICU 55]} #' \item{\code{\\n}}{Match a LINE FEED, \code{\\u000A}.} #' \item{\code{\\N{UNICODE CHARACTER NAME}} }{Match the named character.} #' \item{\code{\\p{UNICODE PROPERTY NAME}} }{Match any character with the #' specified Unicode Property.} #' \item{\code{\\P{UNICODE PROPERTY NAME}} }{Match any character not having #' the specified Unicode Property.} #' \item{\code{\\Q}}{Quotes all following characters until \code{\\E}.} #' \item{\code{\\r}}{Match a CARRIAGE RETURN, \code{\\u000D}.} #' \item{\code{\\s}}{Match a white space character. White space is defined #' as \code{[\\t\\n\\f\\r\\p{Z}]}.} #' \item{\code{\\S}}{Match a non-white space character.} #' \item{\code{\\t}}{Match a HORIZONTAL TABULATION, \code{\\u0009}.} #' \item{\code{\\uhhhh}}{Match the character with the hex value \code{hhhh}.} #' \item{\code{\\Uhhhhhhhh}}{Match the character with the hex value \code{hhhhhhhh}.
#' Exactly eight hex digits must be provided, even though the largest #' Unicode code point is \code{\\U0010ffff}.} #' \item{\code{\\w}}{Match a word character. Word characters are #' \code{[\\p{Alphabetic}\\p{Mark}\\p{Decimal_Number}\\p{Connector_Punctuation}\\u200c\\u200d]}.} #' \item{\code{\\W}}{Match a non-word character.} #' \item{\code{\\x{hhhh}} }{Match the character with hex value hhhh. #' From one to six hex digits may be supplied.} #' \item{\code{\\xhh}}{Match the character with two digit hex value hh } #' \item{\code{\\X}}{Match a Grapheme Cluster.} #' \item{\code{\\Z}}{Match if the current position is at the end of input, #' but before the final line terminator, if one exists.} #' \item{\code{\\z}}{Match if the current position is at the end of input.} #' \item{\code{\\n}}{Back Reference. Match whatever the nth capturing #' group matched. n must be a number >= 1 and <= total number of capture #' groups in the pattern.} #' \item{\code{\\0ooo}}{Match an Octal character. \code{'ooo'} is from one to three #' octal digits. 0377 is the largest allowed Octal character. The leading #' zero is required; it distinguishes Octal constants from back references.} #' \item{\code{[pattern]}}{Match any one character from the set.} #' \item{\code{.}}{Match any character except for - by default - newline, compare \code{\link{stri_opts_regex}}.} #' \item{\code{^}}{Match at the beginning of a line.} #' \item{\code{$}}{Match at the end of a line.} #' \item{\code{\\}}{[outside of sets] Quotes the following character. #' Characters that must be quoted to be treated as literals are #' \code{* ? + [ ( ) { } ^ $ | \\ .}.} #' \item{\code{\\}}{[inside sets] Quotes the following character.
#' Characters that must be quoted to be treated as literals are #' \code{[ ] \\}; Characters that may need to be quoted, depending #' on the context are \code{- &}.} #' } #' #' @section Character Classes: #' #' The syntax is similar, but not 100\% compatible with the one #' described in \link{about_search_charclass}. In particular, #' whitespaces are not ignored and set-theoretic operations are #' denoted slightly differently. However, other than this #' \link{about_search_charclass} is a good reference #' on the capabilities offered. #' #' The ICU User Guide on regexes lists what follows. #' #' \describe{ #' \item{\code{[abc]}}{Match any of the characters a, b, or c} #' \item{\code{[^abc]}}{Negation -- match any character except a, b, or c} #' \item{\code{[A-M]}}{Range -- match any character from A to M (based on Unicode code point ordering)} #' \item{\code{[\\p{L}]}, \code{[\\p{Letter}]}, \code{[\\p{General_Category=Letter}]}, \code{[:letter:]}}{Characters with Unicode Category = Letter (4 equivalent forms)} #' \item{\code{[\\P{Letter}]}}{Negated property -- match everything except Letters} #' \item{\code{[\\p{numeric_value=9}]}}{Match all numbers with a numeric value of 9} #' \item{\code{[\\p{Letter}&&\\p{script=cyrillic}]}}{Intersection; match the set of all Cyrillic letters} #' \item{\code{[\\p{Letter}--\\p{script=latin}]}}{Set difference; match all non-Latin letters} #' \item{\code{[[a-z][A-Z][0-9]]}, \code{[a-zA-Z0-9]}}{Union; match ASCII letters and digits (2 equivalent forms)} #' } #' #' #' @section Regex Functions in \pkg{stringi}: #' #' Note that if a given regex \code{pattern} is empty, #' then all the functions in \pkg{stringi} give \code{NA} in result #' and generate a warning. #' On a syntax error, a quite informative failure message is shown. #' #' If you wish to search for a fixed pattern, #' refer to \link{about_search_coll} or \link{about_search_fixed}.
#' They allow to perform a locale-aware text lookup, #' or a very fast exact-byte search, respectively. #' #' #' #' @references #' \emph{Regular expressions} -- ICU User Guide, #' \url{https://unicode-org.github.io/icu/userguide/strings/regexp.html} #' #' J.E.F. Friedl, \emph{Mastering Regular Expressions}, O'Reilly, 2002 #' #' \emph{Unicode Regular Expressions} -- Unicode Technical Standard #18, #' \url{https://www.unicode.org/reports/tr18/} #' #' \emph{Unicode Regular Expressions} -- Regex tutorial, #' \url{https://www.regular-expressions.info/unicode.html} #' #' @name about_search_regex #' @rdname about_search_regex #' @aliases about_search_regex search_regex stringi-search-regex #' @family search_regex #' @family stringi_general_topics invisible(NULL) #' @title #' Locale-Insensitive Fixed Pattern Matching in \pkg{stringi} #' #' @description #' String searching facilities described here #' provide a way to locate a specific sequence of bytes in a string. #' The search engine's settings may be tuned up (for example #' to perform case-insensitive search) via a call to the #' \code{\link{stri_opts_fixed}} function. #' #' #' @section Byte Compare: #' #' The fast Knuth-Morris-Pratt search algorithm, with worst time complexity of #' O(n+p) (\code{n == length(str)}, \code{p == length(pattern)}) #' is implemented (with some tweaks for very short search patterns). #' #' Be aware that, for natural language processing, #' fixed pattern searching might not be what #' you actually require. It is because a bitwise match will #' not give correct results in cases of: #' \enumerate{ #' \item accented letters; #' \item conjoined letters; #' \item ignorable punctuation; #' \item ignorable case, #' } #' see also \link{about_search_coll}. #' #' Note that the conversion of input data #' to Unicode is done as usual. 
#' #' @name about_search_fixed #' @rdname about_search_fixed #' @aliases about_search_fixed search_fixed stringi-search-fixed #' @family search_fixed #' @family stringi_general_topics invisible(NULL) #' @title #' Locale-Sensitive Text Searching in \pkg{stringi} #' #' @description #' String searching facilities described here #' provide a way to locate a specific piece of #' text. Interestingly, locale-sensitive searching, especially #' on a non-English text, is a much more complex process #' than it seems at first glance. #' #' #' #' @section Locale-Aware String Search Engine: #' #' All \code{stri_*_coll} functions in \pkg{stringi} use #' \pkg{ICU}'s \code{StringSearch} engine, #' which implements a locale-sensitive string search algorithm. #' The matches are defined by using the notion of ``canonical equivalence'' #' between strings. #' #' Tuning the Collator's parameters allows you to perform correct matching #' that properly takes into account accented letters, conjoined letters, #' ignorable punctuation and letter case. #' #' For more information on \pkg{ICU}'s Collator and the search engine #' and how to tune it up #' in \pkg{stringi}, refer to \code{\link{stri_opts_collator}}. #' #' Please note that \pkg{ICU}'s \code{StringSearch}-based functions #' are often much slower than those that perform fixed pattern searches. #' #' #' @references #' \emph{ICU String Search Service} -- ICU User Guide, #' \url{https://unicode-org.github.io/icu/userguide/collation/string-search.html} #' #' L.
Werner, \emph{Efficient Text Searching in Java}, 1999, #' \url{https://icu-project.org/docs/papers/efficient_text_searching_in_java.html} #' #' @name about_search_coll #' @rdname about_search_coll #' @aliases about_search_coll search_coll stringi-search-coll #' @family search_coll #' @family locale_sensitive #' @family stringi_general_topics invisible(NULL) #' @title Character Classes in \pkg{stringi} #' #' @description #' Here we describe how character classes (sets) can be specified #' in the \pkg{stringi} package. These are useful for defining #' search patterns (note that the \pkg{ICU} regex engine uses the same #' scheme for denoting character classes) or, e.g., #' generating random code points with \code{\link{stri_rand_strings}}. #' #' #' @details #' All \code{stri_*_charclass} functions in \pkg{stringi} perform #' a single character (i.e., Unicode code point) search-based operations. #' You may obtain the same results using \link{about_search_regex}. #' However, these very functions aim to be faster. #' #' Character classes are defined using \pkg{ICU}'s \code{UnicodeSet} #' patterns. Below we briefly summarize their syntax. #' For more details refer to the bibliographic References below. #' #' #' @section \code{UnicodeSet} patterns: #' #' A \code{UnicodeSet} represents a subset of Unicode code points #' (recall that \pkg{stringi} converts strings in your native encoding #' to Unicode automatically). Legal code points are U+0000 to U+10FFFF, #' inclusive. #' #' Patterns either consist of series of characters bounded by #' square brackets #' (such patterns follow a syntax similar to that employed #' by regular expression character classes) #' or of Perl-like Unicode property set specifiers. #' #' \code{[]} denotes an empty set, \code{[a]} -- #' a set consisting of character ``a'', #' \code{[\\u0105]} -- a set with character U+0105, #' and \code{[abc]} -- a set with ``a'', ``b'', and ``c''. 
#' #' \code{[a-z]} denotes a set consisting of characters #' ``a'' through ``z'' inclusively, in Unicode code point order. #' #' Some set-theoretic operations are available. #' \code{^} denotes the complement, e.g., \code{[^a-z]} contains #' all characters but ``a'' through ``z''. #' Moreover, \code{[[pat1][pat2]]}, #' \code{[[pat1]\&[pat2]]}, and \code{[[pat1]-[pat2]]} #' denote union, intersection, and asymmetric difference of sets #' specified by \code{pat1} and \code{pat2}, respectively. #' #' Note that all white-spaces are ignored unless they are quoted or back-slashed #' (white spaces can be freely used for clarity, as \code{[a c d-f m]} #' means the same as \code{[acd-fm]}). #' \pkg{stringi} does not allow including multi-character strings #' (see \code{UnicodeSet} API documentation). #' Also, empty string patterns are disallowed. #' #' Any character may be preceded by #' a backslash in order to remove its special meaning. #' #' A malformed pattern always results in an error. #' #' Set expressions at a glance #' (according to \url{https://unicode-org.github.io/icu/userguide/strings/regexp.html}): #' #' #' Some examples: #' #' \describe{ #' \item{\code{[abc]}}{Match any of the characters a, b or c.} #' \item{\code{[^abc]}}{Negation -- match any character except a, b or c.} #' \item{\code{[A-M]}}{Range -- match any character from A to M. The characters #' to include are determined by Unicode code point ordering.} #' \item{\code{[\\u0000-\\U0010ffff]}}{Range -- match all characters.} #' \item{\code{[\\p{Letter}]} or \code{[\\p{General_Category=Letter}]} or \code{[\\p{L}]}}{ #' Characters with Unicode Category = Letter. All forms shown are equivalent.} #' \item{\code{[\\P{Letter}]}}{Negated property #' (Note the upper case \code{\\P}) -- match everything except Letters.} #' \item{\code{[\\p{numeric_value=9}]}}{Match all numbers with a numeric value of 9. 
#' Any Unicode Property may be used in set expressions.} #' \item{\code{[\\p{Letter}&\\p{script=cyrillic}]}}{Set #' intersection -- match the set of all Cyrillic letters.} #' \item{\code{[\\p{Letter}-\\p{script=latin}]}}{Set difference -- #' match all non-Latin letters.} #' \item{\code{[[a-z][A-Z][0-9]]} or \code{[a-zA-Z0-9]}}{Implicit union of #' sets -- match ASCII letters and digits (the two forms are equivalent).} #' \item{\code{[:script=Greek:]}}{Alternative POSIX-like syntax for properties -- #' equivalent to \code{\\p{script=Greek}}.} #' } #' #' @section Unicode properties: #' #' Unicode property sets are specified with a POSIX-like syntax, #' e.g., \code{[:Letter:]}, #' or with an (extended) Perl-style syntax, e.g., \code{\\p{L}}. #' The complements of the above sets are #' \code{[:^Letter:]} and \code{\\P{L}}, respectively. #' #' The names are normalized before matching #' (for example, the match is case-insensitive). #' Moreover, many names have short aliases. #' #' Among predefined Unicode properties we find, e.g.: #' \itemize{ #' \item Unicode General Categories, e.g., \code{Lu} for uppercase letters, #' \item Unicode Binary Properties, e.g., \code{WHITE_SPACE}, #' } #' and many more (including Unicode scripts). #' #' Each property provides access to the large and comprehensive #' Unicode Character Database. #' Generally, the list of properties available in \pkg{ICU} #' is not well-documented. Please refer to the References section #' for some links. #' #' Please note that some classes might overlap. #' However, e.g., General Category \code{Z} (some space) and Binary Property #' \code{WHITE_SPACE} match different character sets. #' #' #' @section Unicode General Categories: #' #' The Unicode General Category property of a code point provides the most #' general classification of that code point. #' Each code point falls into one and only one Category.
#' #' \describe{ #' \item{\code{Cc}}{a C0 or C1 control code.} #' \item{\code{Cf}}{a format control character.} #' \item{\code{Cn}}{a reserved unassigned code point or a non-character.} #' \item{\code{Co}}{a private-use character.} #' \item{\code{Cs}}{a surrogate code point.} #' \item{\code{Lc}}{the union of Lu, Ll, Lt.} #' \item{\code{Ll}}{a lowercase letter.} #' \item{\code{Lm}}{a modifier letter.} #' \item{\code{Lo}}{other letters, including syllables and ideographs.} #' \item{\code{Lt}}{a digraphic character, with the first part uppercase.} #' \item{\code{Lu}}{an uppercase letter.} #' \item{\code{Mc}}{a spacing combining mark (positive advance width).} #' \item{\code{Me}}{an enclosing combining mark.} #' \item{\code{Mn}}{a non-spacing combining mark (zero advance width).} #' \item{\code{Nd}}{a decimal digit.} #' \item{\code{Nl}}{a letter-like numeric character.} #' \item{\code{No}}{a numeric character of other type.} #' \item{\code{Pd}}{a dash or hyphen punctuation mark.} #' \item{\code{Ps}}{an opening punctuation mark (of a pair).} #' \item{\code{Pe}}{a closing punctuation mark (of a pair).} #' \item{\code{Pc}}{a connecting punctuation mark, like a tie.} #' \item{\code{Po}}{a punctuation mark of other type.} #' \item{\code{Pi}}{an initial quotation mark.} #' \item{\code{Pf}}{a final quotation mark.} #' \item{\code{Sm}}{a symbol of mathematical use.} #' \item{\code{Sc}}{a currency sign.} #' \item{\code{Sk}}{a non-letter-like modifier symbol.} #' \item{\code{So}}{a symbol of other type.} #' \item{\code{Zs}}{a space character (of non-zero width).} #' \item{\code{Zl}}{U+2028 LINE SEPARATOR only.} #' \item{\code{Zp}}{U+2029 PARAGRAPH SEPARATOR only.} #' \item{\code{C} }{the union of Cc, Cf, Cs, Co, Cn.} #' \item{\code{L} }{the union of Lu, Ll, Lt, Lm, Lo.} #' \item{\code{M} }{the union of Mn, Mc, Me.} #' \item{\code{N} }{the union of Nd, Nl, No.} #' \item{\code{P} }{the union of Pc, Pd, Ps, Pe, Pi, Pf, Po.} #' \item{\code{S} }{the union of Sm, Sc, Sk, So.} #' 
\item{\code{Z} }{the union of Zs, Zl, Zp } #' } #' #' @section Unicode Binary Properties: #' #' Each character may follow many Binary Properties at a time. #' #' Here is a comprehensive list of supported Binary Properties: #' #' \describe{ #' \item{\code{ALPHABETIC} }{alphabetic character.} #' \item{\code{ASCII_HEX_DIGIT}}{a character matching the \code{[0-9A-Fa-f]} charclass.} #' \item{\code{BIDI_CONTROL} }{a format control character which has specific functions #' in the Bidi (bidirectional text) Algorithm.} #' \item{\code{BIDI_MIRRORED} }{a character that may change display in right-to-left text.} #' \item{\code{DASH} }{a kind of a dash character.} #' \item{\code{DEFAULT_IGNORABLE_CODE_POINT}}{characters that are ignorable in most #' text processing activities, #' e.g., <2060..206F, FFF0..FFFB, E0000..E0FFF>.} #' \item{\code{DEPRECATED} }{a deprecated character according #' to the current Unicode standard (the usage of deprecated characters #' is strongly discouraged).} #' \item{\code{DIACRITIC} }{a character that linguistically modifies #' the meaning of another character to which it applies.} #' \item{\code{EXTENDER} }{a character that extends the value #' or shape of a preceding alphabetic character, #' e.g., a length and iteration mark.} #' \item{\code{HEX_DIGIT} }{a character commonly #' used for hexadecimal numbers, #' see also \code{ASCII_HEX_DIGIT}.} #' \item{\code{HYPHEN}}{a dash used to mark connections between #' pieces of words, plus the Katakana middle dot.} #' \item{\code{ID_CONTINUE}}{a character that can continue an identifier, #' \code{ID_START}+\code{Mn}+\code{Mc}+\code{Nd}+\code{Pc}.} #' \item{\code{ID_START}}{a character that can start an identifier, #' \code{Lu}+\code{Ll}+\code{Lt}+\code{Lm}+\code{Lo}+\code{Nl}.} #' \item{\code{IDEOGRAPHIC}}{a CJKV (Chinese-Japanese-Korean-Vietnamese) #' ideograph.} #' \item{\code{LOWERCASE}}{...} #' \item{\code{MATH}}{...} #' \item{\code{NONCHARACTER_CODE_POINT}}{...} #' \item{\code{QUOTATION_MARK}}{...} #'
\item{\code{SOFT_DOTTED}}{a character with a ``soft dot'', like i or j, #' such that an accent placed on this character causes the dot to disappear.} #' \item{\code{TERMINAL_PUNCTUATION}}{a punctuation character that generally #' marks the end of textual units.} #' \item{\code{UPPERCASE}}{...} #' \item{\code{WHITE_SPACE}}{a space character or TAB or CR or LF or ZWSP or ZWNBSP.} #' \item{\code{CASE_SENSITIVE}}{...} #' \item{\code{POSIX_ALNUM}}{...} #' \item{\code{POSIX_BLANK}}{...} #' \item{\code{POSIX_GRAPH}}{...} #' \item{\code{POSIX_PRINT}}{...} #' \item{\code{POSIX_XDIGIT}}{...} #' \item{\code{CASED}}{...} #' \item{\code{CASE_IGNORABLE}}{...} #' \item{\code{CHANGES_WHEN_LOWERCASED}}{...} #' \item{\code{CHANGES_WHEN_UPPERCASED}}{...} #' \item{\code{CHANGES_WHEN_TITLECASED}}{...} #' \item{\code{CHANGES_WHEN_CASEFOLDED}}{...} #' \item{\code{CHANGES_WHEN_CASEMAPPED}}{...} #' \item{\code{CHANGES_WHEN_NFKC_CASEFOLDED}}{...} #' \item{\code{EMOJI}}{Since ICU 57} #' \item{\code{EMOJI_PRESENTATION}}{Since ICU 57} #' \item{\code{EMOJI_MODIFIER}}{Since ICU 57} #' \item{\code{EMOJI_MODIFIER_BASE}}{Since ICU 57} #' } #' #' #' @section POSIX Character Classes: #' #' Avoid using POSIX character classes, #' e.g., \code{[:punct:]}. The ICU User Guide (see below) #' states that in general they are not well-defined, so you may end up #' with something different than you expect. #' #' In particular, in POSIX-like regex engines, \code{[:punct:]} stands for #' the character class corresponding to the \code{ispunct()} classification #' function (check out \code{man 3 ispunct} on UNIX-like systems). #' According to ISO/IEC 9899:1990 (ISO C90), the \code{ispunct()} function #' tests for any printing character except for space or a character #' for which \code{isalnum()} is true. However, in a POSIX setting, #' the details of what characters belong into which class depend #' on the current locale. 
So the \code{[:punct:]} class does not lead #' to portable code (again, in POSIX-like regex engines). #' #' Therefore, a POSIX flavor of \code{[:punct:]} is more like #' \code{[\\p{P}\\p{S}]} in \pkg{ICU}. You have been warned. #' #' #' @references #' \emph{The Unicode Character Database} -- Unicode Standard Annex #44, #' \url{https://www.unicode.org/reports/tr44/} #' #' \emph{UnicodeSet} -- ICU User Guide, #' \url{https://unicode-org.github.io/icu/userguide/strings/unicodeset.html} #' #' \emph{Properties} -- ICU User Guide, #' \url{https://unicode-org.github.io/icu/userguide/strings/properties.html} #' #' \emph{C/POSIX Migration} -- ICU User Guide, #' \url{https://unicode-org.github.io/icu/userguide/icu/posix.html} #' #' \emph{Unicode Script Data}, \url{https://www.unicode.org/Public/UNIDATA/Scripts.txt} #' #' \emph{icu::UnicodeSet Class Reference} -- ICU4C API Documentation, #' \url{https://unicode-org.github.io/icu-docs/apidoc/dev/icu4c/classicu_1_1UnicodeSet.html} #' #' @name about_search_charclass #' @rdname about_search_charclass #' @aliases about_search_charclass search_charclass stringi-search-charclass #' @family search_charclass #' @family stringi_general_topics invisible(NULL) #' @title #' Text Boundary Analysis in \pkg{stringi} #' #' @description #' Text boundary analysis is the process of locating linguistic boundaries #' while formatting and handling text. #' #' @details #' Examples of the boundary analysis process include: #' #' \itemize{ #' \item Locating positions to word-wrap text to fit #' within specific margins while displaying or printing, #' see \code{\link{stri_wrap}} and \code{\link{stri_split_boundaries}}. #' \item Counting characters, words, sentences, or paragraphs, #' see \code{\link{stri_count_boundaries}}. #' \item Making a list of the unique words in a document, #' see \code{\link{stri_extract_all_words}} and then \code{\link{stri_unique}}.
#' \item Capitalizing the first letter of each word #' or sentence, see also \code{\link{stri_trans_totitle}}. #' \item Locating a particular unit of the text (for example, #' finding the third word in the document), #' see \code{\link{stri_locate_all_boundaries}}. #' } #' #' Generally, text boundary analysis is a locale-dependent operation. #' For example, in Japanese and Chinese one does not separate words with spaces #' - a line break can occur even in the middle of a word. #' These languages have punctuation and diacritical #' marks that cannot start or end a line, so this must also be taken into account. #' #' \pkg{stringi} uses \pkg{ICU}'s \code{BreakIterator} to locate specific #' text boundaries. Note that the \code{BreakIterator}'s behavior #' may be controlled in some cases, see \code{\link{stri_opts_brkiter}}. #' \itemize{ #' \item The \code{character} boundary iterator tries to match what a user #' would think of as a ``character'' -- a basic unit of a writing system #' for a language -- which may be more than just a single Unicode code point. #' \item The \code{word} boundary iterator locates the boundaries #' of words, for purposes such as ``Find whole words'' operations. #' \item The \code{line_break} iterator locates positions that would #' be appropriate to wrap lines when displaying the text. #' \item The break iterator of type \code{sentence} #' locates sentence boundaries. #' } #' #' For technical details on different classes of text boundaries refer #' to the \pkg{ICU} User Guide, see below.
#' #' @references #' \emph{Boundary Analysis} -- ICU User Guide, #' \url{https://unicode-org.github.io/icu/userguide/boundaryanalysis/} #' #' @name about_search_boundaries #' @rdname about_search_boundaries #' @aliases about_search_boundaries search_boundaries stringi-search-boundaries #' @family locale_sensitive #' @family text_boundaries #' @family stringi_general_topics invisible(NULL) stringi/R/search_count_bound.R0000644000176200001440000001070614750110641016124 0ustar liggesusers# kate: default-dictionary en_US ## This file is part of the 'stringi' package for R. ## Copyright (c) 2013-2025, Marek Gagolewski ## All rights reserved. ## ## Redistribution and use in source and binary forms, with or without ## modification, are permitted provided that the following conditions are met: ## ## 1. Redistributions of source code must retain the above copyright notice, ## this list of conditions and the following disclaimer. ## ## 2. Redistributions in binary form must reproduce the above copyright notice, ## this list of conditions and the following disclaimer in the documentation ## and/or other materials provided with the distribution. ## ## 3. Neither the name of the copyright holder nor the names of its ## contributors may be used to endorse or promote products derived from ## this software without specific prior written permission. ## ## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ## 'AS IS' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, ## BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS ## FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT ## HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, ## SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ## PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; ## OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, ## WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE ## OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, ## EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #' @title #' Count the Number of Text Boundaries #' #' @description #' These functions determine the number of text boundaries #' (like character, word, line, or sentence boundaries) in a string. #' #' @details #' Vectorized over \code{str}. #' #' For more information on text boundary analysis #' performed by \pkg{ICU}'s \code{BreakIterator}, see #' \link{stringi-search-boundaries}. #' #' In case of \code{stri_count_words}, #' just like in \code{\link{stri_extract_all_words}} and #' \code{\link{stri_locate_all_words}}, #' \pkg{ICU}'s word \code{BreakIterator} iterator is used #' to locate the word boundaries, and all non-word characters #' (\code{UBRK_WORD_NONE} rule status) are ignored. #' This function is equivalent to a call to #' \code{\link{stri_count_boundaries}(str, type='word', skip_word_none=TRUE, locale=locale)}. #' #' Note that a \code{BreakIterator} of type \code{character} #' may be used to count the number of \emph{Unicode characters} in a string. #' The \code{\link{stri_length}} function, #' which aims to count the number of \emph{Unicode code points}, #' might report different results. #' #' Moreover, a \code{BreakIterator} of type \code{sentence} #' may be used to count the number of sentences in a text piece. 
#'
#'
#' @param str character vector or an object coercible to one
#' @param opts_brkiter a named list with \pkg{ICU} BreakIterator's settings,
#' see \code{\link{stri_opts_brkiter}};
#' \code{NULL} for the default break iterator, i.e., \code{line_break}
#' @param ... additional settings for \code{opts_brkiter}
#' @param locale \code{NULL} or \code{''} for text boundary analysis following
#' the conventions of the default locale, or a single string with
#' locale identifier, see \link{stringi-locale}
#'
#' @return
#' Both functions return an integer vector.
#'
#' @examples
#' test <- 'The\u00a0above-mentioned features are very useful. Spam, spam, eggs, bacon, and spam.'
#' stri_count_boundaries(test, type='word')
#' stri_count_boundaries(test, type='sentence')
#' stri_count_boundaries(test, type='character')
#' stri_count_words(test)
#'
#' test2 <- stri_trans_nfkd('\u03c0\u0153\u0119\u00a9\u00df\u2190\u2193\u2192')
#' stri_count_boundaries(test2, type='character')
#' stri_length(test2)
#' stri_numbytes(test2)
#'
#' @export
#' @family search_count
#' @family locale_sensitive
#' @family text_boundaries
#' @rdname stri_count_boundaries
stri_count_boundaries <- function(str, ..., opts_brkiter = NULL)
{
    # Settings supplied through `...` are appended to `opts_brkiter`
    # and the combined list is handed to stri_opts_brkiter().
    if (!missing(...)) {
        brkiter_settings <- as.list(c(opts_brkiter, ...))
        opts_brkiter <- do.call(stri_opts_brkiter, brkiter_settings)
    }

    .Call(C_stri_count_boundaries, str, opts_brkiter)
}


#' @export
#' @rdname stri_count_boundaries
stri_count_words <- function(str, locale = NULL)
{
    # A word break iterator that skips non-word segments.
    stri_count_boundaries(
        str,
        opts_brkiter = stri_opts_brkiter(
            type = "word",
            skip_word_none = TRUE,
            locale = locale
        )
    )
}
stringi/R/join.R0000644000176200001440000002217514750110641013222 0ustar liggesusers# kate: default-dictionary en_US ## This file is part of the 'stringi' package for R. ## Copyright (c) 2013-2025, Marek Gagolewski ## All rights reserved. ## ## Redistribution and use in source and binary forms, with or without ## modification, are permitted provided that the following conditions are met: ## ## 1.
Redistributions of source code must retain the above copyright notice, ## this list of conditions and the following disclaimer. ## ## 2. Redistributions in binary form must reproduce the above copyright notice, ## this list of conditions and the following disclaimer in the documentation ## and/or other materials provided with the distribution. ## ## 3. Neither the name of the copyright holder nor the names of its ## contributors may be used to endorse or promote products derived from ## this software without specific prior written permission. ## ## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ## 'AS IS' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, ## BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS ## FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT ## HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, ## SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ## PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; ## OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, ## WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE ## OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, ## EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #' @title #' Duplicate Strings #' #' @description #' Duplicates each \code{str}(\code{e1}) string \code{times}(\code{e2}) times #' and concatenates the results. #' #' @details #' Vectorized over all arguments. #' #' \code{e1 \%s*\% e2} and \code{e1 \%stri*\% e2} are synonyms #' for \code{stri_dup(e1, e2)} #' #' @param str,e1 a character vector of strings to be duplicated #' @param times,e2 an integer vector with the numbers of times to duplicate each string #' #' @return Returns a character vector. 
#'
#' @export
#' @family join
#' @rdname stri_dup
#' @aliases stri_dup operator_multiply oper_multiply
#' @examples
#' stri_dup('a', 1:5)
#' stri_dup(c('a', NA, 'ba'), 4)
#' stri_dup(c('abc', 'pqrst'), c(4, 2))
#' "a" %s*% 5
stri_dup <- function(str, times)
{
    # The whole operation is delegated to the compiled routine.
    .Call(C_stri_dup, str, times)
}


#' @usage
#' e1 \%s*\% e2
#' @rdname stri_dup
#' @export
`%s*%` <- function(e1, e2)
{
    # Operator form: calls the same compiled routine as stri_dup().
    .Call(C_stri_dup, e1, e2)
}


#' @usage
#' e1 \%stri*\% e2
#' @rdname stri_dup
#' @export
`%stri*%` <- `%s*%`


#' @title
#' Concatenate Two Character Vectors
#'
#' @description
#' Binary operators for joining (concatenating) two character vectors,
#' with a typical R look-and-feel.
#'
#' @details
#' Vectorized over \code{e1} and \code{e2}.
#'
#' These operators act like a call to \code{\link{stri_join}(e1, e2, sep='')}.
#' However, note that joining 3 vectors, e.g., \code{e1 \%s+\% e2 \%s+\% e3}
#' is slower than \code{\link{stri_join}(e1, e2, e3, sep='')},
#' because it creates a new (temporary) result vector each time
#' the operator is applied.
#'
#'
#' @param e1 a character vector or an object coercible to a character vector
#' @param e2 a character vector or an object coercible to a character vector
#'
#' @return Returns a character vector.
#'
#'
#' @examples
#' c('abc', '123', 'xy') %s+% letters[1:6]
#' 'ID_' %s+% 1:5
#'
#' @rdname operator_add
#' @aliases oper_plus operator_add operator_plus
#' @family join
#'
#' @usage
#' e1 \%s+\% e2
#'
#' @export
`%s+%` <- function(e1, e2)
{
    # Two-argument join performed by the compiled routine.
    .Call(C_stri_join2, e1, e2)
}


#' @usage
#' e1 \%stri+\% e2
#' @rdname operator_add
#' @export
`%stri+%` <- `%s+%`


#' @title
#' Concatenate Character Vectors
#'
#' @description
#' These are the \pkg{stringi}'s equivalents of the built-in
#' \code{\link{paste}} function.
#' \code{stri_c} and \code{stri_paste} are aliases for \code{stri_join}.
#'
#' @details
#' Vectorized over each atomic vector in `\code{...}`.
#'
#' Unless \code{collapse} is \code{NULL}, the result will be a single string.
#' Otherwise, you get a character vector of length equal
#' to the length of the longest argument.
#'
#' If any of the arguments in `\code{...}` is a vector of length 0
#' (not to be confused with vectors of empty strings)
#' and \code{ignore_null} is \code{FALSE}, then
#' you will get a 0-length character vector in result.
#'
#' If \code{collapse} or \code{sep} has length greater than 1,
#' then only the first string will be used.
#'
#' If there are missing values in any of the input vectors,
#' \code{NA} is set to the corresponding element.
#' Note that this behavior is different from \code{\link{paste}},
#' which treats missing values as ordinary strings like \code{'NA'}.
#' Moreover, as usual in \pkg{stringi}, the resulting strings are
#' always in UTF-8.
#'
#' @param ... character vectors (or objects coercible to character vectors)
#' whose corresponding elements are to be concatenated
#' @param sep a single string; separates terms
#' @param collapse a single string or \code{NULL}; an optional
#' results separator
#' @param ignore_null a single logical value; if \code{TRUE}, then empty
#' vectors provided via \code{...} are silently ignored
#'
#' @return Returns a character vector.
#'
#' @export
#' @examples
#' stri_join(1:13, letters)
#' stri_join(1:13, letters, sep=',')
#' stri_join(1:13, letters, collapse='; ')
#' stri_join(1:13, letters, sep=',', collapse='; ')
#' stri_join(c('abc', '123', 'xyz'),'###', 1:6, sep=',')
#' stri_join(c('abc', '123', 'xyz'),'###', 1:6, sep=',', collapse='; ')
#'
#' @family join
#' @rdname stri_join
stri_join <- function(..., sep = "", collapse = NULL, ignore_null = FALSE)
{
    # The vectors given via `...` are passed on as one list.
    pieces <- list(...)
    .Call(C_stri_join, pieces, sep, collapse, ignore_null)
}


#' @rdname stri_join
#' @export
stri_c <- stri_join


#' @rdname stri_join
#' @export
stri_paste <- stri_join


#' @title
#' Flatten a String
#'
#' @description
#' Joins the elements of a character vector into one string.
#'
#' @details
#' The \code{stri_flatten(str, collapse='XXX')} call
#' is equivalent to \code{\link{paste}(str, collapse='XXX', sep='')}.
#'
#' If you wish to use some more fancy (e.g., differing)
#' separators between flattened strings,
#' call \code{\link{stri_join}(str, separators, collapse='')}.
#'
#' If \code{str} is not empty, then a single string is returned.
#' If \code{collapse} has length > 1, then only the first string
#' will be used.
#'
#' @param str a vector of strings to be coerced to character
#' @param collapse a single string denoting the separator
#' @param na_empty single logical value; should missing values
#' in \code{str} be treated as empty strings (\code{TRUE})
#' or be omitted altogether (\code{NA})?
#' @param omit_empty single logical value; should empty strings
#' in \code{str} be omitted?
#'
#' @return
#' Returns a single string, i.e., a character
#' vector of length 1.
#'
#' @examples
#' stri_flatten(LETTERS)
#' stri_flatten(LETTERS, collapse=',')
#' stri_flatten(stri_dup(letters[1:6], 1:3))
#' stri_flatten(c(NA, '', 'A', '', 'B', NA, 'C'), collapse=',', na_empty=TRUE, omit_empty=TRUE)
#' stri_flatten(c(NA, '', 'A', '', 'B', NA, 'C'), collapse=',', na_empty=NA)
#'
#' @export
#' @family join
stri_flatten <- function(str, collapse = "", na_empty = FALSE, omit_empty = FALSE)
{
    # All four arguments are forwarded unchanged to the compiled routine.
    .Call(C_stri_flatten, str, collapse, na_empty, omit_empty)
}


#' @title
#' Concatenate Strings in a List
#'
#' @description
#' These functions concatenate all the strings in each character vector
#' in a given list.
#' \code{stri_c_list} and \code{stri_paste_list} are aliases for
#' \code{stri_join_list}.
#'
#' @details
#' Unless \code{collapse} is \code{NULL}, the result will be a single string.
#' Otherwise, you get a character vector of length equal
#' to the length of \code{x}.
#'
#' Vectors in \code{x} of length 0 are silently ignored.
#'
#' If \code{collapse} or \code{sep} has length greater than 1,
#' then only the first string will be used.
#'
#' @param x a list consisting of character vectors
#' @param sep a single string; separates strings in each of the character
#' vectors in \code{x}
#' @param collapse a single string or \code{NULL}; an optional
#' results separator
#'
#' @return Returns a character vector.
#'
#' @export
#' @examples
#' stri_join_list(
#'    stri_extract_all_words(c('Lorem ipsum dolor sit amet.',
#'       'Spam spam bacon sausage and spam.')),
#'    sep=', ')
#'
#' stri_join_list(
#'    stri_extract_all_words(c('Lorem ipsum dolor sit amet.',
#'       'Spam spam bacon sausage and spam.')),
#'    sep=', ', collapse='. ')
#'
#' stri_join_list(
#'    stri_extract_all_regex(
#'       c('spam spam bacon', '123 456', 'spam 789 sausage'), '\\p{L}+'
#'    ),
#'    sep=',')
#'
#' stri_join_list(
#'    stri_extract_all_regex(
#'       c('spam spam bacon', '123 456', 'spam 789 sausage'), '\\p{L}+',
#'       omit_no_match=TRUE
#'    ),
#'    sep=',', collapse='; ')
#'
#' @family join
#' @rdname stri_join_list
stri_join_list <- function(x, sep = "", collapse = NULL)
{
    # The list and both separators are forwarded to the compiled routine.
    .Call(C_stri_join_list, x, sep, collapse)
}


#' @rdname stri_join_list
#' @export
stri_c_list <- stri_join_list


#' @rdname stri_join_list
#' @export
stri_paste_list <- stri_join_list
stringi/R/time_calendar.R0000644000176200001440000002457714770534711015074 0ustar liggesusers# kate: default-dictionary en_US ## This file is part of the 'stringi' package for R. ## Copyright (c) 2013-2025, Marek Gagolewski ## All rights reserved. ## ## Redistribution and use in source and binary forms, with or without ## modification, are permitted provided that the following conditions are met: ## ## 1. Redistributions of source code must retain the above copyright notice, ## this list of conditions and the following disclaimer. ## ## 2. Redistributions in binary form must reproduce the above copyright notice, ## this list of conditions and the following disclaimer in the documentation ## and/or other materials provided with the distribution. ## ## 3.
Neither the name of the copyright holder nor the names of its ## contributors may be used to endorse or promote products derived from ## this software without specific prior written permission. ## ## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ## 'AS IS' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, ## BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS ## FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT ## HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, ## SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ## PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; ## OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, ## WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE ## OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, ## EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #' @title #' Create a Date-Time Object #' #' @description #' Constructs date-time objects from numeric representations. #' #' @details #' Vectorized over \code{year}, \code{month}, \code{day}, \code{hour}, #' \code{minute}, and \code{second}. #' #' @param year integer vector; 0 is 1BCE, -1 is 2BCE, etc.; #' \code{NULL} for the current year #' @param month integer vector; months are 1-based; #' \code{NULL} for the current month #' @param day integer vector; #' \code{NULL} for the current day #' @param hour integer vector; #' \code{NULL} for the current hour #' @param minute integer vector; #' \code{NULL} for the current minute #' @param second numeric vector; fractional seconds are allowed; #' \code{NULL} for the current seconds (without milliseconds) #' @param tz \code{NULL} or \code{''} for the default time zone or #' a single string with time zone identifier, see \code{\link{stri_timezone_list}} #' @param lenient single logical value; should the operation be lenient?
#' @param locale \code{NULL} or \code{''} for default locale, #' or a single string with locale identifier; a non-Gregorian calendar #' may be specified by setting \code{@@calendar=name} keyword #' #' @return #' Returns an object of class \code{\link{POSIXct}}. #' #' @examples #' stri_datetime_create(2015, 12, 31, 23, 59, 59.999) #' stri_datetime_create(5775, 8, 1, locale='@@calendar=hebrew') # 1 Nisan 5775 -> 2015-03-21 #' stri_datetime_create(2015, 02, 29) #' stri_datetime_create(2015, 02, 29, lenient=TRUE) #' stri_datetime_create(hour=15, minute=59) #' #' @family datetime #' @export stri_datetime_create <- function( year = NULL, month = NULL, day = NULL, hour = 0L, minute = 0L, second = 0, lenient = FALSE, tz = NULL, locale = NULL) { if (any(sapply(list(year, month, day, hour, minute, second), is.null))) { now <- stri_datetime_fields(stri_datetime_now(), tz=tz, locale=locale) if (is.null(year)) year <- now[["Year"]] if (is.null(month)) month <- now[["Month"]] if (is.null(day)) day <- now[["Day"]] if (is.null(hour)) hour <- now[["Hour"]] if (is.null(minute)) minute <- now[["Minute"]] if (is.null(second)) second <- now[["Second"]] } .Call(C_stri_datetime_create, year, month, day, hour, minute, second, lenient, tz, locale) } #' @title #' Get Current Date and Time #' #' @description #' Returns the current date and time. #' #' @details #' The current date and time in \pkg{stringi} is represented as the (signed) #' number of seconds since 1970-01-01 00:00:00 UTC. #' UTC leap seconds are ignored. #' #' @return #' Returns an object of class \code{\link{POSIXct}}. #' #' @family datetime #' @export stri_datetime_now <- function() { .Call(C_stri_datetime_now) } #' @title #' Get Values for Date and Time Fields #' #' @description #' Computes and returns values for all date and time fields. #' #' @details #' Vectorized over \code{time}. 
#' #' #' @param time an object of class \code{\link{POSIXct}} #' (\code{as.POSIXct} will be called on character vectors #' and objects of class \code{POSIXlt}, \code{Date}, and \code{factor}) #' @param tz \code{NULL} or \code{''} for the default time zone or #' a single string with time zone identifier, see \code{\link{stri_timezone_list}} #' @param locale \code{NULL} or \code{''} for the current default locale, #' or a single string with a locale identifier; a non-Gregorian calendar #' may be specified by setting \code{@@calendar=name} keyword #' #' @return #' Returns a data frame with the following columns: #' \enumerate{ #' \item Year (0 is 1BC, -1 is 2BC, etc.) #' \item Month (1-based, i.e., 1 stands for the first month, e.g., January; #' note that the number of months depends on the selected calendar, #' see \code{\link{stri_datetime_symbols}}) #' \item Day #' \item Hour (24-h clock) #' \item Minute #' \item Second #' \item Millisecond #' \item WeekOfYear (this is locale-dependent) #' \item WeekOfMonth (this is locale-dependent) #' \item DayOfYear #' \item DayOfWeek (1-based, 1 denotes Sunday; see \code{\link{stri_datetime_symbols}}) #' \item Hour12 (12-h clock) #' \item AmPm (see \code{\link{stri_datetime_symbols}}) #' \item Era (see \code{\link{stri_datetime_symbols}}) #' } #' #' @examples #' stri_datetime_fields(stri_datetime_now()) #' stri_datetime_fields(stri_datetime_now(), locale='@@calendar=hebrew') #' stri_datetime_symbols(locale='@@calendar=hebrew')$Month[ #' stri_datetime_fields(stri_datetime_now(), locale='@@calendar=hebrew')$Month #' ] #' #' @family datetime #' @export stri_datetime_fields <- function(time, tz = attr(time, "tzone"), locale = NULL) { # POSSIBLY @TODO: # TimeZone # GMT Offset CAL_ZONE_OFFSET + UCAL_DST_OFFSET # isDST: UBool inDaylightTime (UErrorCode &status) const =0 # isWeekend: virtual UBool isWeekend (void) const as.data.frame(.Call(C_stri_datetime_fields, time, tz, locale)) } #' @title #' Date and Time Arithmetic #' #' 
@description #' Modifies a date-time object by adding a specific amount of time units. #' #' @details #' Vectorized over \code{time} and \code{value}. #' #' #' Note that, e.g., January, 31 + 1 month = February, 28 or 29. #' #' @param time an object of class \code{\link{POSIXct}} #' (\code{as.POSIXct} will be called on character vectors #' and objects of class \code{POSIXlt}, \code{Date}, and \code{factor}) #' @param value integer vector; signed number of units to add to \code{time} #' @param units single string; one of \code{'years'}, \code{'months'}, #' \code{'weeks'}, \code{'days'}, \code{'hours'}, \code{'minutes'}, #' \code{'seconds'}, or \code{'milliseconds'} #' @param tz \code{NULL} or \code{''} for the default time zone #' or a single string with a timezone identifier, #' @param locale \code{NULL} or \code{''} for default locale, #' or a single string with locale identifier; a non-Gregorian calendar #' may be specified by setting the \code{@@calendar=name} keyword #' #' @return #' Both functions return an object of class \code{\link{POSIXct}}. #' #' The replacement version of \code{stri_datetime_add} modifies #' the state of the \code{time} object. 
#' #' @references #' \emph{Calendar Classes} - ICU User Guide, #' \url{https://unicode-org.github.io/icu/userguide/datetime/calendar/} #' #' #' @examples #' x <- stri_datetime_now() #' print(x) #' stri_datetime_add(x, units='months') <- 2 #' print(x) #' #' x <- stri_datetime_create(2025, 4, 20) #' print(x) #' stri_datetime_add(x, -2, units='months') #' stri_datetime_add(x, 1, units='years') #' stri_datetime_add(x, 1, units='years', locale='@@calendar=hebrew') #' #' stri_datetime_add(stri_datetime_create(2024, 1, 31), 1, units='months') #' #' @family datetime #' @rdname stri_datetime_add #' @export stri_datetime_add <- function(time, value = 1L, units = "seconds", tz = NULL, locale = NULL) { .Call(C_stri_datetime_add, time, value, units, tz, locale) } #' @rdname stri_datetime_add #' @export `stri_datetime_add<-` <- function(time, units = "seconds", tz = NULL, locale = NULL, value) { .Call(C_stri_datetime_add, time, value, units, tz, locale) } # #' @title # #' Date-Time Objects in \pkg{stringi} # #' # #' @description # #' Date-time objects' representation in \pkg{stringi} may change # #' in future versions of the package. This is DRAFT API. # #' # #' @details # #' An object of class \code{\link{POSIXst}}, # #' inherits from (for compatibility with other base R functions) # #' \code{POSIXct} and \code{POSIX} classes. # #' In fact, it is a numeric vector representing the (signed) number of seconds # #' since the UNIX Epoch, i.e., 1970-01-01 00:00:00 UTC. # #' UTC leap seconds are ignored. # #' # #' Thanks to this property, standard comparison operators, e.g., \code{<}, \code{==}, # #' etc. or the \code{sort()} function may be used. # #' # #' An object of class \code{\link{POSIXst}} may be equipped with # #' an attribute called \code{tzone}. Its value is used for date/time # #' formatting (e.g., when objects are printed in the console), # #' see \code{\link{format.POSIXst}} and \code{\link{stri_datetime_fields}}. # #' # #' @param x ... 
# #' @param tz \code{NULL} or \code{''} for the default time zone or # #' a single string with time zone identifier, see \code{\link{stri_timezone_list}} # #' @param recursive,... further arguments to be passed to or from other methods. # #' # #' @return # #' \code{as.POSIXst} returns an object of class \code{POSIXst}. # #' # #' @export # #' @rdname as.POSIXst # #' @family datetime # #' @aliases as.POSIXst POSIXst # as.POSIXst <- function(x, tz=attr(time, 'tzone'), ...) { # # UseMethod('as.POSIXct') # stop('TO DO') # } # #' @export # #' @rdname as.POSIXst # c.POSIXst <- function (..., recursive=FALSE) { # stopifnot(identical(recursive, FALSE)) # .Call(C_stri_c_posixst, list(...)) # } # TO DO: field difference stringi/R/trans_normalization.R0000644000176200001440000001275214750110641016360 0ustar liggesusers# kate: default-dictionary en_US ## This file is part of the 'stringi' package for R. ## Copyright (c) 2013-2025, Marek Gagolewski ## All rights reserved. ## ## Redistribution and use in source and binary forms, with or without ## modification, are permitted provided that the following conditions are met: ## ## 1. Redistributions of source code must retain the above copyright notice, ## this list of conditions and the following disclaimer. ## ## 2. Redistributions in binary form must reproduce the above copyright notice, ## this list of conditions and the following disclaimer in the documentation ## and/or other materials provided with the distribution. ## ## 3. Neither the name of the copyright holder nor the names of its ## contributors may be used to endorse or promote products derived from ## this software without specific prior written permission. ## ## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ## 'AS IS' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, ## BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS ## FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT ## HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, ## SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ## PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; ## OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, ## WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE ## OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, ## EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #' @title #' Perform or Check For Unicode Normalization #' #' @description #' These functions convert strings to NFC, NFKC, NFD, NFKD, or NFKC_Casefold #' Unicode Normalization Form or check whether strings are normalized. #' #' @details #' Unicode Normalization Forms are formally defined normalizations of Unicode #' strings which, e.g., make possible to determine whether any two #' strings are equivalent. #' Essentially, the Unicode Normalization Algorithm puts all combining #' marks in a specified order, and uses rules for decomposition #' and composition to transform each string into one of the #' Unicode Normalization Forms. #' #' The following Normalization Forms (NFs) are supported: #' \itemize{ #' \item NFC (Canonical Decomposition, followed by Canonical Composition), #' \item NFD (Canonical Decomposition), #' \item NFKC (Compatibility Decomposition, followed by Canonical Composition), #' \item NFKD (Compatibility Decomposition), #' \item NFKC_Casefold (combination of NFKC, case folding, and removing ignorable #' characters which was introduced with Unicode 5.2). #' } #' #' Note that many W3C Specifications recommend using NFC for all content, #' because this form avoids potential interoperability problems arising #' from the use of canonically equivalent, yet different, #' character sequences in document formats on the Web. #' Thus, you will rather not use these functions in typical #' string processing activities. 
Most often you may assume #' that a string is in NFC, see RFC5198. #' #' As usual in \pkg{stringi}, #' if the input character vector is in the native encoding, #' it will be automatically converted to UTF-8. #' #' For more general text transforms refer to \code{\link{stri_trans_general}}. #' #' #' @param str character vector to be encoded #' #' @return The \code{stri_trans_nf*} functions return a character vector #' of the same length as input (the output is always in UTF-8). #' #' \code{stri_trans_isnf*} return a logical vector. #' #' @references #' \emph{Unicode Normalization Forms} -- Unicode Standard Annex #15, #' \url{https://unicode.org/reports/tr15/} #' #' \emph{Unicode Format for Network Interchange} #' -- RFC5198, \url{https://www.rfc-editor.org/rfc/rfc5198} #' #' \emph{Character Model for the World Wide Web 1.0: Normalization} #' -- W3C Working Draft, \url{https://www.w3.org/TR/charmod-norm/} #' #' \emph{Normalization} -- ICU User Guide, #' \url{https://unicode-org.github.io/icu/userguide/transforms/normalization/} #' (technical details) #' #' \emph{Unicode Equivalence} -- Wikipedia, #' \url{https://en.wikipedia.org/wiki/Unicode_equivalence} #' #' @examples #' stri_trans_nfd('\u0105') # a with ogonek -> a, ogonek #' stri_trans_nfkc('\ufdfa') # 1 codepoint -> 18 codepoints #' #' @export #' @rdname stri_trans_nf #' @family transform stri_trans_nfc <- function(str) { .Call(C_stri_trans_nfc, str) } #' @rdname stri_trans_nf #' @export stri_trans_nfd <- function(str) { .Call(C_stri_trans_nfd, str) } #' @rdname stri_trans_nf #' @export stri_trans_nfkd <- function(str) { .Call(C_stri_trans_nfkd, str) } #' @rdname stri_trans_nf #' @export stri_trans_nfkc <- function(str) { .Call(C_stri_trans_nfkc, str) } #' @rdname stri_trans_nf #' @export stri_trans_nfkc_casefold <- function(str) { .Call(C_stri_trans_nfkc_casefold, str) } #' @rdname stri_trans_nf #' @export stri_trans_isnfc <- function(str) { .Call(C_stri_trans_isnfc, str) } #' @rdname stri_trans_nf #' @export 
stri_trans_isnfd <- function(str) { .Call(C_stri_trans_isnfd, str) } #' @rdname stri_trans_nf #' @export stri_trans_isnfkd <- function(str) { .Call(C_stri_trans_isnfkd, str) } #' @rdname stri_trans_nf #' @export stri_trans_isnfkc <- function(str) { .Call(C_stri_trans_isnfkc, str) } #' @rdname stri_trans_nf #' @export stri_trans_isnfkc_casefold <- function(str) { .Call(C_stri_trans_isnfkc_casefold, str) } stringi/R/search_count_4.R0000644000176200001440000001304014750110641015152 0ustar liggesusers# kate: default-dictionary en_US ## This file is part of the 'stringi' package for R. ## Copyright (c) 2013-2025, Marek Gagolewski ## All rights reserved. ## ## Redistribution and use in source and binary forms, with or without ## modification, are permitted provided that the following conditions are met: ## ## 1. Redistributions of source code must retain the above copyright notice, ## this list of conditions and the following disclaimer. ## ## 2. Redistributions in binary form must reproduce the above copyright notice, ## this list of conditions and the following disclaimer in the documentation ## and/or other materials provided with the distribution. ## ## 3. Neither the name of the copyright holder nor the names of its ## contributors may be used to endorse or promote products derived from ## this software without specific prior written permission. ## ## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ## 'AS IS' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, ## BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS ## FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT ## HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, ## SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ## PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; ## OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, ## WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE ## OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, ## EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #' @title #' Count the Number of Pattern Occurrences #' #' @description #' These functions count the number of occurrences #' of a pattern in a string. #' #' @details #' Vectorized over \code{str} and \code{pattern} (with recycling #' of the elements in the shorter vector if necessary). This allows to, #' for instance, search for one pattern in each given string, #' search for each pattern in one given string, #' and search for the i-th pattern within the i-th string. #' #' If \code{pattern} is empty, then the result is \code{NA} #' and a warning is generated. #' #' \code{stri_count} is a convenience function. #' It calls either \code{stri_count_regex}, #' \code{stri_count_fixed}, \code{stri_count_coll}, #' or \code{stri_count_charclass}, depending on the argument used. #' #' @param str character vector; strings to search in #' @param pattern,regex,fixed,coll,charclass character vector; #' search patterns; for more details refer to \link{stringi-search} #' @param opts_collator,opts_fixed,opts_regex a named list used to tune up #' the search engine's settings; see #' \code{\link{stri_opts_collator}}, \code{\link{stri_opts_fixed}}, #' and \code{\link{stri_opts_regex}}, respectively; \code{NULL} #' for the defaults #' @param ... 
supplementary arguments passed to the underlying functions, #' including additional settings for \code{opts_collator}, \code{opts_regex}, #' \code{opts_fixed}, and so on #' #' @return All the functions return an integer vector. #' #' @examples #' s <- 'Lorem ipsum dolor sit amet, consectetur adipisicing elit.' #' stri_count(s, fixed='dolor') #' stri_count(s, regex='\\p{L}+') #' #' stri_count_fixed(s, ' ') #' stri_count_fixed(s, 'o') #' stri_count_fixed(s, 'it') #' stri_count_fixed(s, letters) #' stri_count_fixed('babab', 'b') #' stri_count_fixed(c('stringi', '123'), 'string') #' #' stri_count_charclass(c('stRRRingi', 'STrrrINGI', '123'), #' c('\\p{Ll}', '\\p{Lu}', '\\p{Zs}')) #' stri_count_charclass(' \t\n', '\\p{WHITE_SPACE}') # white space - binary property #' stri_count_charclass(' \t\n', '\\p{Z}') # white-space - general category (note the difference) #' #' stri_count_regex(s, '(s|el)it') #' stri_count_regex(s, 'i.i') #' stri_count_regex(s, '.it') #' stri_count_regex('bab baab baaab', c('b.*?b', 'b.b')) #' stri_count_regex(c('stringi', '123'), '^(s|1)') #' #' @family search_count #' @export #' @rdname stri_count stri_count <- function(str, ..., regex, fixed, coll, charclass) { providedarg <- c( regex = !missing(regex), fixed = !missing(fixed), coll = !missing(coll), charclass = !missing(charclass)) if (sum(providedarg) != 1) stop("you have to specify either `regex`, `fixed`, `coll`, or `charclass`") if (providedarg["regex"]) stri_count_regex(str, regex, ...) else if (providedarg["fixed"]) stri_count_fixed(str, fixed, ...) else if (providedarg["coll"]) stri_count_coll(str, coll, ...) else if (providedarg["charclass"]) stri_count_charclass(str, charclass, ...) 
} #' @export #' @rdname stri_count stri_count_charclass <- function(str, pattern) { .Call(C_stri_count_charclass, str, pattern) } #' @export #' @rdname stri_count stri_count_coll <- function(str, pattern, ..., opts_collator = NULL) { if (!missing(...)) opts_collator <- do.call(stri_opts_collator, as.list(c(opts_collator, ...))) .Call(C_stri_count_coll, str, pattern, opts_collator) } #' @export #' @rdname stri_count stri_count_fixed <- function(str, pattern, ..., opts_fixed = NULL) { if (!missing(...)) opts_fixed <- do.call(stri_opts_fixed, as.list(c(opts_fixed, ...))) .Call(C_stri_count_fixed, str, pattern, opts_fixed) } #' @export #' @rdname stri_count stri_count_regex <- function(str, pattern, ..., opts_regex = NULL) { if (!missing(...)) opts_regex <- do.call(stri_opts_regex, as.list(c(opts_regex, ...))) .Call(C_stri_count_regex, str, pattern, opts_regex) } stringi/R/encoding_conversion.R0000644000176200001440000002625514770472035016333 0ustar liggesusers# kate: default-dictionary en_US ## This file is part of the 'stringi' package for R. ## Copyright (c) 2013-2025, Marek Gagolewski ## All rights reserved. ## ## Redistribution and use in source and binary forms, with or without ## modification, are permitted provided that the following conditions are met: ## ## 1. Redistributions of source code must retain the above copyright notice, ## this list of conditions and the following disclaimer. ## ## 2. Redistributions in binary form must reproduce the above copyright notice, ## this list of conditions and the following disclaimer in the documentation ## and/or other materials provided with the distribution. ## ## 3. Neither the name of the copyright holder nor the names of its ## contributors may be used to endorse or promote products derived from ## this software without specific prior written permission. 
## ## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ## 'AS IS' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, ## BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS ## FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT ## HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, ## SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ## PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; ## OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, ## WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE ## OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, ## EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #' @title #' Convert Strings Between Given Encodings #' #' @description #' These functions convert strings between encodings. #' They aim to serve as a more portable and faster replacement #' for \R's own \code{\link{iconv}}. #' #' @details #' \code{stri_conv} is an alias for \code{stri_encode}. #' #' Refer to \code{\link{stri_enc_list}} for the list #' of supported encodings and \link{stringi-encoding} #' for a general discussion. #' #' If \code{from} is either missing, \code{''}, or \code{NULL}, #' and if \code{str} is a character vector #' then the marked encodings are used #' (see \code{\link{stri_enc_mark}}) -- in such a case \code{bytes}-declared #' strings are disallowed. #' Otherwise, i.e., if \code{str} is a \code{raw}-type vector #' or a list of raw vectors, #' we assume that the input encoding is the current default encoding #' as given by \code{\link{stri_enc_get}}. #' #' However, if \code{from} is given explicitly, #' the internal encoding declarations are always ignored. 
#' #' For \code{to_raw=FALSE}, the output #' strings always have the encodings marked according to the target converter #' used (as specified by \code{to}) and the current default Encoding #' (\code{ASCII}, \code{latin1}, \code{UTF-8}, \code{native}, #' or \code{bytes} in all other cases). #' #' #' Note that some issues might occur if \code{to} indicates, e.g., #' UTF-16 or UTF-32, as the output strings may have embedded NULs. #' In such cases, please use \code{to_raw=TRUE} and consider #' specifying a byte order marker (BOM) for portability reasons #' (e.g., set \code{UTF-16} or \code{UTF-32} which automatically #' adds the BOMs). #' #' Note that \code{stri_encode(as.raw(data), 'encodingname')} #' is a clever substitute for \code{\link{rawToChar}}. #' #' In the current version of \pkg{stringi}, if an incorrect code point is found #' on input, it is replaced with the default (for that target encoding) #' 'missing/erroneous' character (with a warning), e.g., #' the SUBSTITUTE character (U+001A) or the REPLACEMENT one (U+FFFD). #' Occurrences thereof can be located in the output string to diagnose #' the problematic sequences, e.g., by calling: #' \code{stri_locate_all_regex(converted_string, '[\\ufffd\\u001a]'}. #' #' Because of the way this function is currently implemented, #' maximal size of a single string to be converted cannot exceed ~0.67 GB. 
#' #' #' @param str a character vector, a raw vector, or #' a list of \code{raw} vectors to be converted #' @param from input encoding: #' \code{NULL} or \code{''} for the default encoding #' or internal encoding marks' usage (see Details); #' otherwise, a single string with encoding name, #' see \code{\link{stri_enc_list}} #' @param to target encoding: #' \code{NULL} or \code{''} for default encoding #' (see \code{\link{stri_enc_get}}), #' or a single string with encoding name #' @param to_raw a single logical value; indicates whether a list of raw vectors #' rather than a character vector should be returned #' #' @return If \code{to_raw} is \code{FALSE}, #' then a character vector with encoded strings (and appropriate #' encoding marks) is returned. #' Otherwise, a list of vectors of type raw is produced. #' #' @references #' \emph{Conversion} -- ICU User Guide, #' \url{https://unicode-org.github.io/icu/userguide/conversion/} #' #' @family encoding_conversion #' @rdname stri_encode #' @export stri_encode <- function(str, from = NULL, to = NULL, to_raw = FALSE) { .Call(C_stri_encode, str, from, to, to_raw) } #' @rdname stri_encode #' @export stri_conv <- stri_encode #' @title #' Convert Strings To UTF-32 #' #' @description #' UTF-32 is a 32-bit encoding where each Unicode code point #' corresponds to exactly one integer value. #' This function converts a character vector to a list #' of integer vectors so that, e.g., #' individual code points may be easily accessed, changed, etc. #' #' @details #' See \code{\link{stri_enc_fromutf32}} for a dual operation. #' #' This function is roughly equivalent to a vectorized call #' to \code{\link{utf8ToInt}(enc2utf8(str))}. #' If you want a list of raw vectors on output, #' use \code{\link{stri_encode}}. #' #' Unlike \code{utf8ToInt}, if ill-formed UTF-8 byte sequences are detected, #' a corresponding element is set to NULL and a warning is generated. #' To deal with such issues, use, e.g., \code{\link{stri_enc_toutf8}}. 
#' #' @param str a character vector (or an object coercible to) #' to be converted #' @return Returns a list of integer vectors. #' Missing values are converted to \code{NULL}s. #' #' @family encoding_conversion #' @export stri_enc_toutf32 <- function(str) { .Call(C_stri_enc_toutf32, str) } #' @title #' Convert From UTF-32 #' #' @description #' This function converts integer vectors, #' representing sequences of UTF-32 code points, to UTF-8 strings. #' #' @details #' UTF-32 is a 32-bit encoding where each Unicode code point #' corresponds to exactly one integer value. #' #' This function is a vectorized version of #' \code{\link{intToUtf8}}. As usual in \pkg{stringi}, #' it returns character strings in UTF-8. #' See \code{\link{stri_enc_toutf32}} for a dual operation. #' #' If an ill-defined code point is given, a warning is generated #' and the corresponding string is set to \code{NA}. #' Note that \code{0}s are not allowed in \code{vec}, as they are used #' internally to mark the end of a string (in the C API). #' #' #' See also \code{\link{stri_encode}} for decoding arbitrary byte sequences #' from any given encoding. #' #' #' @param vec a list of integer vectors (or objects coercible to such vectors) #' or \code{NULL}s. For convenience, a single integer vector can also #' be given. #' @return Returns a character vector (in UTF-8). #' \code{NULL}s in the input list are converted to \code{NA_character_}. #' #' @family encoding_conversion #' @export stri_enc_fromutf32 <- function(vec) { .Call(C_stri_enc_fromutf32, vec) } #' @title #' Convert Strings To UTF-8 #' #' @description #' Converts character strings with declared marked encodings #' to UTF-8 strings. #' #' @details #' If \code{is_unknown_8bit} is set to \code{FALSE} (the default), #' then R encoding marks are used, see \code{\link{stri_enc_mark}}. #' Bytes-marked strings will cause the function to fail. 
#' #' If a string is in UTF-8 and has a byte order mark (BOM), #' then the BOM will be silently removed from the output string. #' #' If the default encoding is UTF-8, see \code{\link{stri_enc_get}}, #' then strings marked with \code{native} are -- for efficiency reasons -- #' returned as-is, i.e., with unchanged markings. #' A similar behavior is observed when calling \code{\link{enc2utf8}}. #' #' For \code{is_unknown_8bit=TRUE}, if a string is declared to be neither #' in ASCII nor in UTF-8, then all byte codes > 127 are replaced with #' the Unicode REPLACEMENT CHARACTER (\\Ufffd). #' Note that the REPLACEMENT CHARACTER may be interpreted as Unicode #' missing value for single characters. #' Here a \code{bytes}-marked string is assumed to use an 8-bit encoding #' that extends the ASCII map. #' #' What is more, setting \code{validate} to \code{TRUE} #' or \code{NA} in both cases validates the resulting UTF-8 byte stream. #' If \code{validate=TRUE}, then #' in case of any incorrect byte sequences, they will be #' replaced with the REPLACEMENT CHARACTER. #' This option may be used in a case #' where you want to fix an invalid UTF-8 byte sequence. #' For \code{NA}, a bogus string will be replaced with a missing value. #' #' @param str a character vector to be converted #' @param is_unknown_8bit a single logical value, see Details #' @param validate a single logical value (can be \code{NA}), see Details #' @return Returns a character vector. #' #' @family encoding_conversion #' @export stri_enc_toutf8 <- function(str, is_unknown_8bit = FALSE, validate = FALSE) { .Call(C_stri_enc_toutf8, str, is_unknown_8bit, validate) } #' @title #' Convert Strings To Native Encoding #' #' @description #' Converts character strings with declared encodings #' to the current native encoding. #' #' @details #' This function just calls \code{\link{stri_encode}(str, NULL, NULL)}. #' The current native encoding can be read with \code{\link{stri_enc_get}}. 
#' Character strings declared to be in \code{bytes} encoding will fail here. #' #' Note that if working in a UTF-8 environment, #' resulting strings will be marked with \code{UTF-8} #' and not \code{native}, see \code{\link{stri_enc_mark}}. #' #' @param str a character vector to be converted #' @return Returns a character vector. #' #' @family encoding_conversion #' @export stri_enc_tonative <- function(str) { stri_encode(str, NULL, NULL) } #' @title #' Convert To ASCII #' #' @description #' This function converts input strings to ASCII, #' i.e., to character strings consisting of bytes not greater than 127. #' #' @details #' All code points greater than 127 are replaced with the ASCII SUBSTITUTE #' CHARACTER (0x1A). #' \R encoding declarations are always used to determine #' which encoding is assumed for each input, see \code{\link{stri_enc_mark}}. #' If ill-formed byte sequences are found in UTF-8 byte #' streams, a warning is generated. #' #' A \code{bytes}-marked string is assumed to be in an 8-bit encoding #' extending the ASCII map (a common assumption in \R itself). #' #' Note that the SUBSTITUTE CHARACTER (\code{\\x1a == \\032}) may be interpreted #' as the ASCII missing value for single characters. #' #' @param str a character vector to be converted #' @return Returns a character vector. #' #' @family encoding_conversion #' @export stri_enc_toascii <- function(str) { .Call(C_stri_enc_toascii, str) } stringi/R/trans_casemap.R0000644000176200001440000001200414750110641015071 0ustar liggesusers# kate: default-dictionary en_US ## This file is part of the 'stringi' package for R. ## Copyright (c) 2013-2025, Marek Gagolewski ## All rights reserved. ## ## Redistribution and use in source and binary forms, with or without ## modification, are permitted provided that the following conditions are met: ## ## 1. Redistributions of source code must retain the above copyright notice, ## this list of conditions and the following disclaimer. ## ## 2. 
Redistributions in binary form must reproduce the above copyright notice, ## this list of conditions and the following disclaimer in the documentation ## and/or other materials provided with the distribution. ## ## 3. Neither the name of the copyright holder nor the names of its ## contributors may be used to endorse or promote products derived from ## this software without specific prior written permission. ## ## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ## 'AS IS' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, ## BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS ## FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT ## HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, ## SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ## PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; ## OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, ## WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE ## OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, ## EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #' @title #' Transform Strings with Case Mapping or Folding #' #' @description #' These functions transform strings either to lower case, #' UPPER CASE, or Title Case or perform case folding. #' #' @details #' Vectorized over \code{str}. #' #' \pkg{ICU} implements full Unicode string case mappings. It is #' worth noting that, generally, case mapping: #' \itemize{ #' \item can change the number of code points and/or code units #' of a string, #' \item is language-sensitive (results may differ depending on the locale), and #' \item is context-sensitive (a character in the input string may map #' differently depending on surrounding characters). 
#' } #' #' With \code{stri_trans_totitle}, if \code{word} \code{BreakIterator} #' is used (the default), then the first letter of each word will be capitalized #' and the rest will be transformed to lower case. #' With the break iterator of type \code{sentence}, the first letter #' of each sentence will be capitalized only. #' Note that according to the \pkg{ICU} User Guide, #' the string \code{'one. two. three.'} consists of one sentence. #' #' Case folding, on the other hand, is locale-independent. #' Its purpose is to make two pieces of text that differ only in case identical. #' This may come in handy when comparing strings. #' #' For more general (but not locale dependent) #' text transforms refer to \code{\link{stri_trans_general}}. #' #' @param str character vector #' @param locale \code{NULL} or \code{''} for case mapping following #' the conventions of the default locale, or a single string with #' locale identifier, see \link{stringi-locale}. #' @param opts_brkiter a named list with \pkg{ICU} BreakIterator's settings, #' see \code{\link{stri_opts_brkiter}}; #' \code{NULL} for default break iterator, i.e., \code{word}; #' \code{stri_trans_totitle} only #' @param ... additional settings for \code{opts_brkiter} #' #' @return #' Each function returns a character vector. 
#' #' @references #' \emph{Case Mappings} -- ICU User Guide, #' \url{https://unicode-org.github.io/icu/userguide/transforms/casemappings.html} #' #' @family locale_sensitive #' @family transform #' @export #' @rdname stri_trans_casemap #' @family text_boundaries #' #' @examples #' stri_trans_toupper('\u00DF', 'de_DE') # small German Eszett / scharfes S #' stri_cmp_eq(stri_trans_toupper('i', 'en_US'), stri_trans_toupper('i', 'tr_TR')) #' stri_trans_toupper(c('abc', '123', '\u0105\u0104')) #' stri_trans_tolower(c('AbC', '123', '\u0105\u0104')) #' stri_trans_totitle(c('AbC', '123', '\u0105\u0104')) #' stri_trans_casefold(c('AbC', '123', '\u0105\u0104')) #' stri_trans_totitle('stringi is a FREE R pAcKaGe. WItH NO StrinGS attached.') # word boundary #' stri_trans_totitle('stringi is a FREE R pAcKaGe. WItH NO StrinGS attached.', type='sentence') stri_trans_tolower <- function(str, locale = NULL) { .Call(C_stri_trans_tolower, str, locale) } #' @export #' @rdname stri_trans_casemap stri_trans_toupper <- function(str, locale = NULL) { .Call(C_stri_trans_toupper, str, locale) } #' @export #' @rdname stri_trans_casemap stri_trans_casefold <- function(str) { .Call(C_stri_trans_casefold, str) } #' @export #' @rdname stri_trans_casemap stri_trans_totitle <- function(str, ..., opts_brkiter = NULL) { if (!missing(...)) opts_brkiter <- do.call(stri_opts_brkiter, as.list(c(opts_brkiter, ...))) .Call(C_stri_trans_totitle, str, opts_brkiter) } stringi/R/trans_transliterate.R0000644000176200001440000001375314750110641016355 0ustar liggesusers# kate: default-dictionary en_US ## This file is part of the 'stringi' package for R. ## Copyright (c) 2013-2025, Marek Gagolewski ## All rights reserved. ## ## Redistribution and use in source and binary forms, with or without ## modification, are permitted provided that the following conditions are met: ## ## 1. Redistributions of source code must retain the above copyright notice, ## this list of conditions and the following disclaimer. ## ## 2. 
Redistributions in binary form must reproduce the above copyright notice, ## this list of conditions and the following disclaimer in the documentation ## and/or other materials provided with the distribution. ## ## 3. Neither the name of the copyright holder nor the names of its ## contributors may be used to endorse or promote products derived from ## this software without specific prior written permission. ## ## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ## 'AS IS' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, ## BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS ## FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT ## HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, ## SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ## PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; ## OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, ## WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE ## OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, ## EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #' @title #' General Text Transforms, Including Transliteration #' #' @description #' \pkg{ICU} General transforms provide different ways #' for processing Unicode text. They are useful in handling a variety #' of different tasks, including: #' \itemize{ #' \item locale-independent upper case, lower case, title case, #' full/halfwidth conversions, #' \item normalization, #' \item hex and character name conversions, #' \item script to script conversion/transliteration. #' } #' #' #' @details #' \pkg{ICU} Transforms were mainly designed to transliterate characters #' from one script to another (for example, from Greek to Latin, #' or Japanese Katakana to Latin). #' However, these services are also capable of handling a much #' broader range of tasks. 
#' In particular, the Transforms include prebuilt transformations #' for case conversions, for normalization conversions, for the removal #' of given characters, and also for a variety of language and script #' transliterations. Transforms can be chained together to perform #' a series of operations and each step of the process can use a #' UnicodeSet to restrict the characters that are affected. #' #' To get the list of available transforms, #' call \code{\link{stri_trans_list}}. #' #' Note that transliterators are often combined in sequence #' to achieve a desired transformation. #' This is analogous to the composition of mathematical functions. #' For example, given a script that converts lowercase ASCII characters #' from Latin script to Katakana script, it is convenient to first #' (1) separate input base characters and accents, and then (2) #' convert uppercase to lowercase. #' To achieve this, a compound transform can be specified as follows: #' \code{NFKD; Lower; Latin-Katakana;} (with the default \code{rules=FALSE}). #' #' Custom rule-based transliteration is also supported, see the \pkg{ICU} #' manual and below for some examples. #' #' Transliteration is not dependent on the current locale. #' #' @param str character vector #' @param id a single string with transform identifier, #' see \code{\link{stri_trans_list}}, or custom transliteration rules #' @param rules if \code{TRUE}, treat \code{id} as a string with #' semicolon-separated transliteration rules (see the \pkg{ICU} manual); #' @param forward transliteration direction (\code{TRUE} for forward, #' \code{FALSE} for reverse) #' #' @return #' Returns a character vector. 
#' #' @examples #' stri_trans_general('gro\u00df', 'latin-ascii') #' stri_trans_general('stringi', 'latin-greek') #' stri_trans_general('stringi', 'latin-cyrillic') #' stri_trans_general('stringi', 'upper') # see stri_trans_toupper #' stri_trans_general('\u0104', 'nfd; lower') # compound id; see stri_trans_nfd #' stri_trans_general('Marek G\u0105golewski', 'pl-pl_FONIPA') #' stri_trans_general('\u2620', 'any-name') # character name #' stri_trans_general('\\N{latin small letter a}', 'name-any') # decode name #' stri_trans_general('\u2620', 'hex/c') # to hex #' stri_trans_general("\u201C\u2026\u201D \u0105\u015B\u0107\u017C", #' "NFKD; NFC; [^\\p{L}] latin-ascii") #' #' x <- "\uC885\uB85C\uAD6C \uC0AC\uC9C1\uB3D9" #' stringi::stri_trans_general(x, "Hangul-Latin") #' # Deviate from the ICU rules of romanisation of Korean, #' # see https://en.wikipedia.org/wiki/Romanization_of_Korean #' id <- " #' :: NFD; #' \u11A8 > k; #' \u11AE > t; #' \u11B8 > p; #' \u1105 > r; #' :: Hangul-Latin; #' " #' stringi::stri_trans_general(x, id, rules=TRUE) #' #' #' @references #' \emph{General Transforms} -- ICU User Guide, #' \url{https://unicode-org.github.io/icu/userguide/transforms/general/} #' #' @family transform #' @export stri_trans_general <- function(str, id, rules=FALSE, forward=TRUE) { .Call(C_stri_trans_general, str, id, rules, forward) } #' @title #' List Available Text Transforms and Transliterators #' #' @description #' Returns a list of available text transform identifiers. #' Each of them may be used in \code{\link{stri_trans_general}} #' tasks. #' #' @return Returns a character vector. 
#' #' @references #' \emph{General Transforms} -- ICU User Guide, #' \url{https://unicode-org.github.io/icu/userguide/transforms/general/} #' #' @examples #' stri_trans_list() #' #' @family transform #' @export stri_trans_list <- function() { stri_sort( .Call(C_stri_trans_list), locale="en_US", numeric=TRUE, strength=1 ) } stringi/R/locale.R0000644000176200001440000001426514750110641013523 0ustar liggesusers# kate: default-dictionary en_US ## This file is part of the 'stringi' package for R. ## Copyright (c) 2013-2025, Marek Gagolewski ## All rights reserved. ## ## Redistribution and use in source and binary forms, with or without ## modification, are permitted provided that the following conditions are met: ## ## 1. Redistributions of source code must retain the above copyright notice, ## this list of conditions and the following disclaimer. ## ## 2. Redistributions in binary form must reproduce the above copyright notice, ## this list of conditions and the following disclaimer in the documentation ## and/or other materials provided with the distribution. ## ## 3. Neither the name of the copyright holder nor the names of its ## contributors may be used to endorse or promote products derived from ## this software without specific prior written permission. ## ## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ## 'AS IS' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, ## BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS ## FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT ## HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, ## SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ## PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; ## OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, ## WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE ## OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, ## EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #' @title #' Locales and \pkg{stringi} #' #' @description #' In this section we explain how we specify locales in \pkg{stringi}. #' Locale is a fundamental concept in \pkg{ICU}. #' It identifies a specific user community, i.e., a group of users #' who have similar culture and language expectations #' for human-computer interaction. #' #' #' @details #' Because a locale is just an identifier of a region, #' no validity check is performed when you specify a Locale. #' \pkg{ICU} is implemented as a set of services. #' If you want to verify whether particular resources are available #' in the locale you asked for, you must query those resources. #' Note: when you ask for a resource for a particular locale, you get back #' the best available match, not necessarily precisely the one you requested. #' #' @section Locale Identifiers: #' #' \pkg{ICU} services are parametrized by locale, #' to deliver culturally correct results. #' Locales are identified by character strings #' of the form \code{Language} code, #' \code{Language_Country} code, or \code{Language_Country_Variant} #' code, e.g., 'en_US'. #' #' The two-letter \code{Language} code uses the ISO-639-1 standard, #' e.g., 'en' stands for English, 'pl' -- Polish, 'fr' -- French, #' and 'de' for German. #' #' \code{Country} is a two-letter code following the ISO-3166 standard. 
#' This is to reflect different language conventions within the same language, #' for example in US-English ('en_US') and Australian-English ('en_AU'). #' #' Differences may also appear in language conventions used within #' the same country. For example, the Euro currency may be used in several European #' countries while the individual country's currency is still in circulation. #' In such a case, \pkg{ICU} \code{Variant} '_EURO' could be used for selecting #' locales that support the Euro currency. #' #' The final (optional) element of a locale is a list of #' keywords together with their values. Keywords must be unique. #' Their order is not significant. Unknown keywords are ignored. #' The handling of keywords depends on the specific services that #' utilize them. Currently, the following keywords are recognized: #' \code{calendar}, \code{collation}, \code{currency}, and \code{numbers}, #' e.g., \code{fr@@collation=phonebook;}\code{calendar=islamic-civil} is a valid #' French locale specifier together with keyword arguments. For #' more information, refer to the ICU user guide. #' #' For a list of locales that are recognized by \pkg{ICU}, #' call \code{\link{stri_locale_list}}. #' #' Note that in \pkg{stringi}, 'C' is a synonym of `en_US_POSIX`. #' #' #' @section A Note on Default Locales: #' #' Each locale-sensitive function in \pkg{stringi} #' selects the current default locale if an empty string or \code{NULL} #' is provided as its \code{locale} argument. Default locales are available #' to all the functions; initially, the system locale on that platform is used, #' but it may be changed by calling \code{\link{stri_locale_set}}. #' #' Your program should avoid changing the default locale. #' All locale-sensitive functions may request #' any desired locale per-call (by specifying the \code{locale} argument), #' i.e., without referring to the default locale. 
#' During many tests, however, we did not observe any improper #' behavior of \pkg{stringi} while using a modified default locale. #' #' #' #' #' @section Locale-Sensitive Functions in \pkg{stringi}: #' #' One of many examples of locale-dependent services is the Collator, which #' performs a locale-aware string comparison. It is used for string comparing, #' ordering, sorting, and searching. See \code{\link{stri_opts_collator}} #' for the description on how to tune its settings, and its \code{locale} #' argument in particular. #' #' When choosing a resource bundle that is not available in the explicitly #' requested locale (but not when using the default locale) #' nor in its more general variants (e.g., `es_ES` vs `es`), #' a warning is emitted. #' #' Other locale-sensitive functions include, e.g., #' \code{\link{stri_trans_tolower}} (that does character case mapping). #' #' @references #' \emph{Locale} -- ICU User Guide, #' \url{https://unicode-org.github.io/icu/userguide/locale/} #' #' \emph{ISO 639: Language Codes}, #' \url{https://www.iso.org/iso-639-language-codes.html} #' #' \emph{ISO 3166: Country Codes}, #' \url{https://www.iso.org/iso-3166-country-codes.html} #' #' @name about_locale #' @rdname about_locale #' @aliases about_locale locale stringi-locale #' @family locale_management #' @family locale_sensitive #' @family stringi_general_topics invisible(NULL) stringi/R/compare.R0000644000176200001440000002536714750110641013717 0ustar liggesusers# kate: default-dictionary en_US ## This file is part of the 'stringi' package for R. ## Copyright (c) 2013-2025, Marek Gagolewski ## All rights reserved. ## ## Redistribution and use in source and binary forms, with or without ## modification, are permitted provided that the following conditions are met: ## ## 1. Redistributions of source code must retain the above copyright notice, ## this list of conditions and the following disclaimer. ## ## 2. 
Redistributions in binary form must reproduce the above copyright notice, ## this list of conditions and the following disclaimer in the documentation ## and/or other materials provided with the distribution. ## ## 3. Neither the name of the copyright holder nor the names of its ## contributors may be used to endorse or promote products derived from ## this software without specific prior written permission. ## ## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ## 'AS IS' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, ## BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS ## FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT ## HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, ## SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ## PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; ## OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, ## WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE ## OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, ## EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #' @title #' Compare Strings with or without Collation #' #' @description #' These functions may be used to determine if two strings #' are equal, canonically equivalent (this is performed in a much more clever #' fashion than when testing for equality), or to check whether they are in #' a specific lexicographic order. #' #' #' @details #' All the functions listed here are vectorized over \code{e1} and \code{e2}. #' #' \code{stri_cmp_eq} tests whether two corresponding strings #' consist of exactly the same code points, while \code{stri_cmp_neq} allows #' to check whether there is any difference between them. 
These are #' locale-independent operations: for natural language processing, #' where the notion of canonical equivalence is more valid, this might #' not be exactly what you are looking for, see Examples. #' Please note that \pkg{stringi} always silently removes UTF-8 #' BOMs from input strings, therefore, e.g., \code{stri_cmp_eq} does not take #' BOMs into account while comparing strings. #' #' \code{stri_cmp_equiv} tests for canonical equivalence of two strings #' and is locale-dependent. Additionally, the \pkg{ICU}'s Collator may be #' tuned up so that, e.g., the comparison is case-insensitive. #' To test whether two strings are not canonically equivalent, #' call \code{stri_cmp_nequiv}. #' #' \code{stri_cmp_le} tests whether #' the elements in the first vector are less than or equal to #' the corresponding elements in the second vector, #' \code{stri_cmp_ge} tests whether they are greater or equal, #' \code{stri_cmp_lt} if less, and \code{stri_cmp_gt} if greater, #' see also, e.g., \code{\link{\%s<\%}}. #' #' \code{stri_compare} is an alias to \code{stri_cmp}. They both #' perform exactly the same locale-dependent operation. #' Both functions provide a C library's \code{strcmp()} look-and-feel, #' see Value for details. #' #' #' For more information on \pkg{ICU}'s Collator and how to tune its settings #' refer to \code{\link{stri_opts_collator}}. #' Note that different locale settings may lead to different results #' (see the examples below). #' #' #' @param e1,e2 character vectors or objects coercible to character vectors #' @param opts_collator a named list with \pkg{ICU} Collator's options, #' see \code{\link{stri_opts_collator}}, \code{NULL} #' for the default collation options. #' @param ... additional settings for \code{opts_collator} #' #' @return The \code{stri_cmp} and \code{stri_compare} functions #' return an integer vector representing the comparison results: #' \code{-1} if \code{e1[...] 
< e2[...]}, #' \code{0} if they are canonically equivalent, and \code{1} if greater. #' #' All the other functions return a logical vector that indicates #' whether a given relation holds between two corresponding elements #' in \code{e1} and \code{e2}. #' #' @references #' \emph{Collation} -- ICU User Guide, #' \url{https://unicode-org.github.io/icu/userguide/collation/} #' #' @examples #' # in Polish, ch < h: #' stri_cmp_lt('hladny', 'chladny', locale='pl_PL') #' #' # in Slovak, ch > h: #' stri_cmp_lt('hladny', 'chladny', locale='sk_SK') #' #' # < or > (depends on locale): #' stri_cmp('hladny', 'chladny') #' #' # ignore case differences: #' stri_cmp_equiv('hladny', 'HLADNY', strength=2) #' #' # also ignore diacritical differences: #' stri_cmp_equiv('hladn\u00FD', 'hladny', strength=1, locale='sk_SK') #' #' marios <- c('Mario', 'mario', 'M\\u00e1rio', 'm\\u00e1rio') #' stri_cmp_equiv(marios, 'mario', case_level=TRUE, strength=2L) #' stri_cmp_equiv(marios, 'mario', case_level=TRUE, strength=1L) #' stri_cmp_equiv(marios, 'mario', strength=1L) #' stri_cmp_equiv(marios, 'mario', strength=2L) #' #' # non-Unicode-normalized vs normalized string: #' stri_cmp_equiv(stri_trans_nfkd('\u0105'), '\u105') #' #' # note the difference: #' stri_cmp_eq(stri_trans_nfkd('\u0105'), '\u105') #' #' # ligatures: #' stri_cmp_equiv('\ufb00', 'ff', strength=2) #' #' # phonebook collation #' stri_cmp_equiv('G\u00e4rtner', 'Gaertner', locale='de_DE@@collation=phonebook', strength=1L) #' stri_cmp_equiv('G\u00e4rtner', 'Gaertner', locale='de_DE', strength=1L) #' #' @family locale_sensitive #' @export #' @rdname stri_compare stri_compare <- function(e1, e2, ..., opts_collator = NULL) { if (!missing(...)) opts_collator <- do.call(stri_opts_collator, as.list(c(opts_collator, ...))) .Call(C_stri_cmp, e1, e2, opts_collator) } #' @export #' @rdname stri_compare stri_cmp <- stri_compare #' @export #' @rdname stri_compare stri_cmp_eq <- function(e1, e2) { .Call(C_stri_cmp_eq, e1, e2) } #' @export #' 
@rdname stri_compare stri_cmp_neq <- function(e1, e2) { .Call(C_stri_cmp_neq, e1, e2) } #' @export #' @rdname stri_compare stri_cmp_equiv <- function(e1, e2, ..., opts_collator = NULL) { if (!missing(...)) opts_collator <- do.call(stri_opts_collator, as.list(c(opts_collator, ...))) .Call(C_stri_cmp_equiv, e1, e2, opts_collator) } #' @export #' @rdname stri_compare stri_cmp_nequiv <- function(e1, e2, ..., opts_collator = NULL) { if (!missing(...)) opts_collator <- do.call(stri_opts_collator, as.list(c(opts_collator, ...))) .Call(C_stri_cmp_nequiv, e1, e2, opts_collator) } #' @export #' @rdname stri_compare stri_cmp_lt <- function(e1, e2, ..., opts_collator = NULL) { if (!missing(...)) opts_collator <- do.call(stri_opts_collator, as.list(c(opts_collator, ...))) .Call(C_stri_cmp_lt, e1, e2, opts_collator) } #' @export #' @rdname stri_compare stri_cmp_gt <- function(e1, e2, ..., opts_collator = NULL) { if (!missing(...)) opts_collator <- do.call(stri_opts_collator, as.list(c(opts_collator, ...))) .Call(C_stri_cmp_gt, e1, e2, opts_collator) } #' @export #' @rdname stri_compare stri_cmp_le <- function(e1, e2, ..., opts_collator = NULL) { if (!missing(...)) opts_collator <- do.call(stri_opts_collator, as.list(c(opts_collator, ...))) .Call(C_stri_cmp_le, e1, e2, opts_collator) } #' @export #' @rdname stri_compare stri_cmp_ge <- function(e1, e2, ..., opts_collator = NULL) { if (!missing(...)) opts_collator <- do.call(stri_opts_collator, as.list(c(opts_collator, ...))) .Call(C_stri_cmp_ge, e1, e2, opts_collator) } #' @title #' Compare Strings with or without Collation #' #' @description #' Relational operators for comparing corresponding strings in #' two character vectors, with a typical R look-and-feel. #' #' @details #' These functions call \code{\link{stri_cmp_le}} or its #' friends, using the default collator options. #' As a consequence, they are vectorized over \code{e1} and \code{e2}. 
#' #' \code{\%stri==\%} tests for canonical equivalence of strings #' (see \code{\link{stri_cmp_equiv}}) and is a locale-dependent operation. #' #' \code{\%stri===\%} performs a locale-independent, #' code point-based comparison. #' #' #' @param e1,e2 character vectors or objects coercible to character vectors #' #' @return All the functions return a logical vector #' indicating the result of a pairwise comparison. #' As usual, the elements of shorter vectors are recycled if necessary. #' #' #' @examples #' 'a' %stri<% 'b' #' c('a', 'b', 'c') %stri>=% 'b' #' #' @usage #' e1 \%s<\% e2 #' #' @family locale_sensitive #' @rdname operator_compare #' @aliases operator_compare oper_comparison oper_compare #' @export "%s<%" <- function(e1, e2) { stri_cmp_lt(e1, e2) } #' @usage #' e1 \%s<=\% e2 #' @rdname operator_compare #' @export "%s<=%" <- function(e1, e2) { stri_cmp_le(e1, e2) } #' @usage #' e1 \%s>\% e2 #' @rdname operator_compare #' @export "%s>%" <- function(e1, e2) { stri_cmp_gt(e1, e2) } #' @usage #' e1 \%s>=\% e2 #' @rdname operator_compare #' @export "%s>=%" <- function(e1, e2) { stri_cmp_ge(e1, e2) } #' @usage #' e1 \%s==\% e2 #' @rdname operator_compare #' @export "%s==%" <- function(e1, e2) { stri_cmp_equiv(e1, e2) } #' @usage #' e1 \%s!=\% e2 #' @rdname operator_compare #' @export "%s!=%" <- function(e1, e2) { stri_cmp_nequiv(e1, e2) } #' @usage #' e1 \%s===\% e2 #' @rdname operator_compare #' @export "%s===%" <- function(e1, e2) { stri_cmp_eq(e1, e2) } #' @usage #' e1 \%s!==\% e2 #' @rdname operator_compare #' @export "%s!==%" <- function(e1, e2) { stri_cmp_neq(e1, e2) } #' @usage #' e1 \%stri<\% e2 #' @rdname operator_compare #' @export "%stri<%" <- function(e1, e2) { stri_cmp_lt(e1, e2) } #' @usage #' e1 \%stri<=\% e2 #' @rdname operator_compare #' @export "%stri<=%" <- function(e1, e2) { stri_cmp_le(e1, e2) } #' @usage #' e1 \%stri>\% e2 #' @rdname operator_compare #' @export "%stri>%" <- function(e1, e2) { stri_cmp_gt(e1, e2) } #' @usage #' e1 
\%stri>=\% e2 #' @rdname operator_compare #' @export "%stri>=%" <- function(e1, e2) { stri_cmp_ge(e1, e2) } #' @usage #' e1 \%stri==\% e2 #' @rdname operator_compare #' @export "%stri==%" <- function(e1, e2) { stri_cmp_equiv(e1, e2) } #' @usage #' e1 \%stri!=\% e2 #' @rdname operator_compare #' @export "%stri!=%" <- function(e1, e2) { stri_cmp_nequiv(e1, e2) } #' @usage #' e1 \%stri===\% e2 #' @rdname operator_compare #' @export "%stri===%" <- function(e1, e2) { stri_cmp_eq(e1, e2) } #' @usage #' e1 \%stri!==\% e2 #' @rdname operator_compare #' @export "%stri!==%" <- function(e1, e2) { stri_cmp_neq(e1, e2) } stringi/R/stringi_package.R0000644000176200001440000001667414770530442015433 0ustar liggesusers# kate: default-dictionary en_US ## This file is part of the R package 'stringi'. ## Copyright (c) 2013-2025, Marek Gagolewski ## All rights reserved. ## ## Redistribution and use in source and binary forms, with or without ## modification, are permitted provided that the following conditions are met: ## ## 1. Redistributions of source code must retain the above copyright notice, ## this list of conditions and the following disclaimer. ## ## 2. Redistributions in binary form must reproduce the above copyright notice, ## this list of conditions and the following disclaimer in the documentation ## and/or other materials provided with the distribution. ## ## 3. Neither the name of the copyright holder nor the names of its ## contributors may be used to endorse or promote products derived from ## this software without specific prior written permission. ## ## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ## 'AS IS' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, ## BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS ## FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT ## HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, ## SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ## PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; ## OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, ## WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE ## OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, ## EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #' @title Fast and Portable Character String Processing in R #' #' @description #' \pkg{stringi} is THE R package for fast, correct, consistent, #' and convenient string/text manipulation. #' It gives predictable results on every platform, in each locale, #' and under any native character encoding. #' #' \bold{Keywords}: R, text processing, character strings, #' internationalization, localization, ICU, ICU4C, i18n, l10n, Unicode. #' #' \bold{Homepage}: \url{https://stringi.gagolewski.com/} #' #' \bold{License}: The BSD-3-clause license for the package code, #' the ICU license for the accompanying ICU4C distribution, #' and the UCD license for the Unicode Character Database. #' See the COPYRIGHTS and LICENSE file for more details. #' #' @details #' Manual pages on general topics: #' \itemize{ #' \item \link{about_encoding} -- character encoding issues, including #' information on encoding management in \pkg{stringi}, as well as #' on encoding detection and conversion. #' #' \item \link{about_locale} -- locale issues, including locale #' management and specification in \pkg{stringi}, and the list of #' locale-sensitive operations. In particular, see #' \code{\link{stri_opts_collator}} for a description of the string #' collation algorithm, which is used for string comparing, ordering, #' ranking, sorting, case-folding, and searching. 
#' #' \item \link{about_arguments} -- information on how \pkg{stringi} #' handles the arguments passed to its functions. #' } #' #' #' @section Facilities available: #' #' Refer to the following: #' \itemize{ #' \item \link{about_search} for string searching facilities; #' these include pattern searching, matching, string splitting, and so on. #' The following independent search engines are provided: #' \itemize{ #' \item \link{about_search_regex} -- with ICU (Java-like) regular expressions, #' \item \link{about_search_fixed} -- fast, locale-independent, byte-wise pattern #' matching, #' \item \link{about_search_coll} -- locale-aware pattern matching #' for natural language processing tasks, #' \item \link{about_search_charclass} -- seeking elements of #' particular character classes, like ``all white-spaces'' or ``all digits'', #' \item \link{about_search_boundaries} -- text boundary analysis. #' } #' #' \item \code{\link{stri_datetime_format}} for date/time formatting #' and parsing. Also refer to the links therein for other date/time/time zone- #' related operations. #' #' \item \code{\link{stri_stats_general}} and \code{\link{stri_stats_latex}} #' for gathering some fancy statistics on a character vector's contents. #' #' \item \code{\link{stri_join}}, \code{\link{stri_dup}}, \code{\link{\%s+\%}}, #' and \code{\link{stri_flatten}} for concatenation-based operations. #' #' \item \code{\link{stri_sub}} for extracting and replacing substrings, #' and \code{\link{stri_reverse}} for a joyful function #' to reverse all code points in a string. #' #' \item \code{\link{stri_length}} (among others) for determining the number #' of code points in a string. See also \code{\link{stri_count_boundaries}} #' for counting the number of Unicode characters #' and \code{\link{stri_width}} for approximating the width of a string. 
#' #' \item \code{\link{stri_trim}} (among others) for #' trimming characters from the beginning or/and end of a string, #' see also \link{about_search_charclass}, and \code{\link{stri_pad}} #' for padding strings so that they are of the same width. #' Additionally, \code{\link{stri_wrap}} wraps text into lines. #' #' \item \code{\link{stri_trans_tolower}} (among others) for case mapping, #' i.e., conversion to lower, UPPER, or Title Case, #' \code{\link{stri_trans_nfc}} (among others) for Unicode normalization, #' \code{\link{stri_trans_char}} for translating individual code points, #' and \code{\link{stri_trans_general}} for other universal #' text transforms, including transliteration. #' #' \item \code{\link{stri_cmp}}, \code{\link{\%s<\%}}, \code{\link{stri_order}}, #' \code{\link{stri_sort}}, \code{\link{stri_rank}}, \code{\link{stri_unique}}, #' and \code{\link{stri_duplicated}} for collation-based, #' locale-aware operations, see also \link{about_locale}. #' #' \item \code{\link{stri_split_lines}} (among others) #' to split a string into text lines. #' #' \item \code{\link{stri_escape_unicode}} (among others) for escaping #' some code points. #' #' \item \code{\link{stri_rand_strings}}, \code{\link{stri_rand_shuffle}}, #' and \code{\link{stri_rand_lipsum}} for generating (pseudo)random strings. #' #' \item \code{\link{stri_read_raw}}, #' \code{\link{stri_read_lines}}, and \code{\link{stri_write_lines}} #' for reading and writing text files. #' } #' #' Note that each man page provides many further links to other #' interesting facilities and topics. #' #' @docType package #' @author Marek Gagolewski, #' with contributions from Bartek Tartanus and many others. #' ICU4C was developed by IBM, Unicode, Inc., and others. 
#' #' @references #' \emph{\pkg{stringi} Package Homepage}, #' \url{https://stringi.gagolewski.com/} #' #' Gagolewski M., \pkg{stringi}: Fast and portable character string #' processing in R, \emph{Journal of Statistical Software} 103(2), 2022, 1-59, #' \doi{10.18637/jss.v103.i02} #' #' \emph{ICU -- International Components for Unicode}, #' \url{https://icu.unicode.org/} #' #' \emph{ICU4C API Documentation}, #' \url{https://unicode-org.github.io/icu-docs/apidoc/dev/icu4c/} #' #' \emph{The Unicode Consortium}, #' \url{https://home.unicode.org/} #' #' \emph{UTF-8, A Transformation Format of ISO 10646} -- RFC 3629, #' \url{https://www.rfc-editor.org/rfc/rfc3629} #' #' @family stringi_general_topics #' @useDynLib stringi, .registration = TRUE #' @importFrom tools md5sum #' @importFrom utils packageVersion #' @importFrom utils download.file #' @importFrom utils unzip #' @importFrom stats runif #' @importFrom stats rnorm "_PACKAGE" stringi/R/trim.R0000644000176200001440000001057414750110641013236 0ustar liggesusers# kate: default-dictionary en_US ## This file is part of the 'stringi' package for R. ## Copyright (c) 2013-2025, Marek Gagolewski ## All rights reserved. ## ## Redistribution and use in source and binary forms, with or without ## modification, are permitted provided that the following conditions are met: ## ## 1. Redistributions of source code must retain the above copyright notice, ## this list of conditions and the following disclaimer. ## ## 2. Redistributions in binary form must reproduce the above copyright notice, ## this list of conditions and the following disclaimer in the documentation ## and/or other materials provided with the distribution. ## ## 3. Neither the name of the copyright holder nor the names of its ## contributors may be used to endorse or promote products derived from ## this software without specific prior written permission. 
## ## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ## 'AS IS' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, ## BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS ## FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT ## HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, ## SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ## PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; ## OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, ## WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE ## OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, ## EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #' @title #' Trim Characters from the Left and/or Right Side of a String #' #' @description #' These functions may be used, e.g., to remove unnecessary #' white-spaces from strings. Trimming ends at the first or #' starts at the last \code{pattern} match. #' #' @details #' Vectorized over \code{str} and \code{pattern}. #' #' \code{stri_trim} is a convenience wrapper over \code{stri_trim_left} #' and \code{stri_trim_right}. #' #' Contrary to many other string processing libraries, #' our trimming functions are universal. The class of characters #' to be retained or trimmed can be adjusted. #' #' For replacing pattern matches with #' an arbitrary replacement string, see \code{\link{stri_replace}}. #' #' Trimming can also be used where you would normally rely on #' regular expressions. For instance, you may get #' \code{'23.5'} out of \code{'total of 23.5 bitcoins'}. #' #' For trimming white-spaces, please note the difference #' between Unicode binary property `\code{\\p\{Wspace\}}` (more universal) #' and general character category `\code{\\p\{Z\}}`, #' see \link{stringi-search-charclass}. 
#' #' @param str a character vector of strings to be trimmed #' @param pattern a single pattern, specifying the class of characters #' (see \link{stringi-search-charclass}) #' to be preserved (if \code{negate} is \code{FALSE}; default) #' or trimmed (otherwise) #' @param side character [\code{stri_trim} only]; defaults to \code{'both'} #' @param negate either \code{TRUE} or \code{FALSE}; see \code{pattern} #' #' #' @return #' All functions return a character vector. #' #' #' @examples #' stri_trim_left(' aaa') #' stri_trim_right('r-project.org/', '\\P{P}') #' stri_trim_both(' Total of 23.5 bitcoins. ', '\\p{N}') #' stri_trim_both(' Total of 23.5 bitcoins. ', '\\P{N}', negate=TRUE) #' #' @aliases stri_trim #' @family search_replace #' @family search_charclass #' @rdname stri_trim #' @export stri_trim_both <- function(str, pattern="\\P{Wspace}", negate=FALSE) { .Call(C_stri_trim_both, str, pattern, negate) } #' @rdname stri_trim #' @export stri_trim_left <- function(str, pattern="\\P{Wspace}", negate=FALSE) { .Call(C_stri_trim_left, str, pattern, negate) } #' @rdname stri_trim #' @export stri_trim_right <- function(str, pattern="\\P{Wspace}", negate=FALSE) { .Call(C_stri_trim_right, str, pattern, negate) } #' @rdname stri_trim #' @export stri_trim <- function(str, side=c("both", "left", "right"), pattern="\\P{Wspace}", negate=FALSE) { # `both` is default for compatibility with stringr side <- match.arg(side) # this is slow switch(side, both=stri_trim_both(str, pattern, negate), left=stri_trim_left(str, pattern, negate), right=stri_trim_right(str, pattern, negate) ) } stringi/R/sprintf.R0000644000176200001440000002220614750110641013743 0ustar liggesusers# kate: default-dictionary en_US ## This file is part of the 'stringi' package for R. ## Copyright (c) 2013-2025, Marek Gagolewski ## All rights reserved. ## ## Redistribution and use in source and binary forms, with or without ## modification, are permitted provided that the following conditions are met: ## ## 1.
Redistributions of source code must retain the above copyright notice, ## this list of conditions and the following disclaimer. ## ## 2. Redistributions in binary form must reproduce the above copyright notice, ## this list of conditions and the following disclaimer in the documentation ## and/or other materials provided with the distribution. ## ## 3. Neither the name of the copyright holder nor the names of its ## contributors may be used to endorse or promote products derived from ## this software without specific prior written permission. ## ## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ## 'AS IS' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, ## BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS ## FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT ## HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, ## SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ## PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; ## OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, ## WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE ## OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, ## EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #' @title #' Format Strings #' #' @description #' \code{stri_sprintf} (synonym: \code{stri_string_format}) #' is a Unicode-aware replacement for and enhancement of #' the built-in \code{\link[base]{sprintf}} function. #' Moreover, \code{stri_printf} prints formatted strings. #' #' @details #' Vectorized over \code{format} and all vectors passed via \code{...}. #' #' Unicode code points may have various widths when #' printed on the console (compare \code{\link{stri_width}}). #' These functions, by default (see the \code{use_length} argument), take this #' into account. #' #' These functions are not locale sensitive. 
For instance, numbers are #' always formatted in the "POSIX" style, e.g., \code{-123456.789} #' (no thousands separator, dot as a fractional separator). #' Such a feature might be added at a later date, though. #' #' All arguments passed via \code{...} are evaluated. If some of them #' are unused, a warning is generated. Too few arguments result in an error. #' #' Note that \code{stri_printf} treats missing values in \code{...} #' as \code{"NA"} strings by default. #' #' All format specifiers supported by \code{\link[base]{sprintf}} are #' also available here. For the formatting of integers and floating-point #' values, currently the system \code{std::snprintf()} is called, but #' this may change in the future. Format specifiers are normalized #' and necessary sanity checks are performed. #' #' Supported conversion specifiers: \code{dioxX} (integers), #' \code{feEgGaA} (floats), and \code{s} (character strings). #' Supported flags: \code{-} (left-align), #' \code{+} (force output sign or blank when \code{NaN} or \code{NA}; numeric only), #' \code{<space>} (output minus or space for a sign; numeric only), #' \code{0} (pad with 0s; numeric only), #' \code{#} (alternative output of some numerics). #' #' #' @param format character vector of format strings #' @param ... vectors (coercible to integer, real, or character) #' @param na_string single string to represent missing values; #' if \code{NA}, missing values in \code{...} #' result in the corresponding outputs being missing too; #' use \code{"NA"} for compatibility with base R #' @param inf_string single string to represent the (unsigned) infinity (\code{NA} allowed) #' @param nan_string single string to represent the not-a-number (\code{NA} allowed) #' @param use_length single logical value; should the number of code #' points be used when applying modifiers such as \code{\%20s} #' instead of the total code point width?
#' @param file see \code{\link[base]{cat}} #' @param sep see \code{\link[base]{cat}} #' @param append see \code{\link[base]{cat}} #' #' @return #' \code{stri_printf} is used for its side effect, which is printing #' text on the standard output or other connection/file. Hence, it returns #' \code{invisible(NULL)}. #' #' The other functions return a character vector. #' #' #' @references #' \code{printf} in \code{glibc}, #' \url{https://man.archlinux.org/man/printf.3} #' #' \code{printf} format strings -- Wikipedia, #' \url{https://en.wikipedia.org/wiki/Printf_format_string} #' #' @examples #' stri_printf("%4s=%.3f", c("e", "e\u00b2", "\u03c0", "\u03c0\u00b2"), #' c(exp(1), exp(2), pi, pi^2)) #' #' x <- c( #' "xxabcd", #' "xx\u0105\u0106\u0107\u0108", #' stri_paste( #' "\u200b\u200b\u200b\u200b", #' "\U0001F3F4\U000E0067\U000E0062\U000E0073\U000E0063\U000E0074\U000E007F", #' "abcd" #' )) #' stri_printf("[%10s]", x) # minimum width = 10 #' stri_printf("[%-10.3s]", x) # output of max width = 3, but pad to width of 10 #' stri_printf("[%10s]", x, use_length=TRUE) # minimum number of Unicode code points = 10 #' #' # vectorization wrt all arguments: #' p <- runif(10) #' stri_sprintf(ifelse(p > 0.5, "P(Y=1)=%1$.2f", "P(Y=0)=%2$.2f"), p, 1-p) #' #' # using a "preformatted" logical vector: #' x <- c(TRUE, FALSE, FALSE, NA, TRUE, FALSE) #' stri_sprintf("%s) %s", letters[seq_along(x)], c("\u2718", "\u2713")[x+1]) #' #' # custom NA/Inf/NaN strings: #' stri_printf("%+10.3f", c(-Inf, -0, 0, Inf, NaN, NA_real_), #' na_string="", nan_string="\U0001F4A9", inf_string="\u221E") #' #' stri_sprintf("UNIX time %1$f is %1$s.", Sys.time()) #' #' # the following do not work in sprintf() #' stri_sprintf("%1$#- *2$.*3$f", 1.23456, 10, 3) # two asterisks #' stri_sprintf(c("%s", "%f"), pi) # re-coercion needed #' stri_sprintf("%1$s is %1$f UNIX time.", Sys.time()) # re-coercion needed #' stri_sprintf(c("%d", "%s"), factor(11:12)) # re-coercion needed #' stri_sprintf(c("%s", "%d"), factor(11:12)) 
# re-coercion needed #' #' @rdname stri_sprintf #' @family length #' @export stri_sprintf <- function( format, ..., na_string=NA_character_, inf_string="Inf", nan_string="NaN", use_length=FALSE ) { # force eval of ... here .Call(C_stri_sprintf, format, list(...), na_string, inf_string, nan_string, use_length) } #' @rdname stri_sprintf #' @export stri_string_format <- stri_sprintf #' @rdname stri_sprintf #' @export stri_printf <- function( format, ..., file="", sep="\n", append=FALSE, na_string="NA", inf_string="Inf", nan_string="NaN", use_length=FALSE ) { # force eval of ... here str <- .Call(C_stri_sprintf, format, list(...), na_string, inf_string, nan_string, use_length) cat(str, file=file, sep=sep, append=append) } #' @title #' C-Style Formatting with \code{\link{stri_sprintf}} as a Binary Operator #' #' @description #' Provides access to \code{\link{stri_sprintf}} in the form of a binary #' operator in a way similar to Python's \code{\%} overloaded for strings. #' #' Missing values and empty vectors are propagated as usual. #' #' @details #' Vectorized over \code{e1} and \code{e2}. #' #' \code{e1 \%s$\% atomic_vector} is equivalent to #' \code{e1 \%s$\% list(atomic_vector)}. #' #' #' @param e1 format strings, see \code{\link{stri_sprintf}} for syntax #' @param e2 a list of atomic vectors to be passed to \code{\link{stri_sprintf}} #' or a single atomic vector #' #' @return #' Returns a character vector.
#' #' #' @examples #' "value='%d'" %s$% 3 #' "value='%d'" %s$% 1:3 #' "%s='%d'" %s$% list("value", 3) #' "%s='%d'" %s$% list("value", 1:3) #' "%s='%d'" %s$% list(c("a", "b", "c"), 1) #' "%s='%d'" %s$% list(c("a", "b", "c"), 1:3) #' #' x <- c("abcd", "\u00DF\u00B5\U0001F970", "abcdef") #' cat("[%6s]" %s$% x, sep="\n") # code point width is used, not the number of bytes #' #' @rdname operator_dollar #' @aliases operator_dollar oper_dollar #' @family length #' #' @usage #' e1 \%s$\% e2 #' #' @export `%s$%` <- function(e1, e2) { if (!is.list(e2)) e2 <- list(e2) na_string <- NA_character_ .Call(C_stri_sprintf, e1, e2, na_string, "Inf", "NaN", FALSE) # old version: based on base::sprintf # # this is stringi, ensure UTF-8 output and proper NA handling! # e1 <- stri_enc_toutf8(as.character(e1)) # if (length(e1) == 0) return(character(0)) # # for (i in seq_along(e2)) { # stopifnot(is.atomic(e2[[i]])) # factor is atomic # if (length(e2[[i]]) == 0) return(character(0)) # if (is.character(e2[[i]]) || is.factor(e2[[i]])) { # e2[[i]] <- stri_enc_toutf8(e2[[i]]) # } # } # # ret <- stri_enc_toutf8(do.call(sprintf, as.list(c(list(e1), e2)))) # # for the time being, let stri_paste determine NAs # # (it might be too greedy if there are unused strings) # which_na <- do.call(stri_paste, e2) # ret[is.na(which_na)] <- NA_character_ # # ret[is.na(e1)] <- NA_character_ # # ret } #' @usage #' e1 \%stri$\% e2 #' @rdname operator_dollar #' @export `%stri$%` <- `%s$%` stringi/R/escape.R0000644000176200001440000000731314750110641013520 0ustar liggesusers# kate: default-dictionary en_US ## This file is part of the 'stringi' package for R. ## Copyright (c) 2013-2025, Marek Gagolewski ## All rights reserved. ## ## Redistribution and use in source and binary forms, with or without ## modification, are permitted provided that the following conditions are met: ## ## 1. Redistributions of source code must retain the above copyright notice, ## this list of conditions and the following disclaimer. ## ## 2.
Redistributions in binary form must reproduce the above copyright notice, ## this list of conditions and the following disclaimer in the documentation ## and/or other materials provided with the distribution. ## ## 3. Neither the name of the copyright holder nor the names of its ## contributors may be used to endorse or promote products derived from ## this software without specific prior written permission. ## ## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ## 'AS IS' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, ## BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS ## FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT ## HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, ## SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ## PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; ## OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, ## WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE ## OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, ## EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #' @title #' Escape Unicode Code Points #' #' @description #' Generates an ASCII string where all non-printable characters #' and non-ASCII characters are converted to escape sequences. #' #' @details #' #' For non-printable and certain special (well-known, #' see also the R man page \link{Quotes}) #' ASCII characters, the following #' (also recognized in R) convention is used. #' We get \code{\\a}, \code{\\b}, \code{\\t}, \code{\\n}, \code{\\v}, #' \code{\\f}, \code{\\r}, \code{\"}, \code{\'}, \code{\\\\} #' or either \code{\\uXXXX} (4 hex digits) or \code{\\UXXXXXXXX} (8 hex digits) #' otherwise. #' #' #' As usual in stringi, any input string is converted to Unicode #' before executing the escape process. 
#' #' #' @param str character vector of strings to be escaped #' #' @return #' Returns a character vector. #' #' @examples #' stri_escape_unicode('a\u0105!') #' #' @family escape #' @export stri_escape_unicode <- function(str) { .Call(C_stri_escape_unicode, str) } #' @title #' Un-escape All Escape Sequences #' #' @description #' Un-escapes all known escape sequences. #' #' @details #' Uses \pkg{ICU}'s facilities to un-escape Unicode character sequences. #' #' The following escape sequences are recognized: #' \code{\\a}, \code{\\b}, \code{\\t}, \code{\\n}, \code{\\v}, \code{\\?}, #' \code{\\e}, \code{\\f}, \code{\\r}, \code{\"}, \code{\'}, \code{\\\\}, #' \code{\\uXXXX} (4 hex digits), #' \code{\\UXXXXXXXX} (8 hex digits), #' \code{\\xXX} (1-2 hex digits), #' \code{\\ooo} (1-3 octal digits), #' \code{\\cX} (control-X; X is masked with 0x1F). #' For \code{\\xXX} and \code{\\ooo}, beware of invalid UTF-8 byte sequences. #' #' Note that some versions of R on Windows cannot handle #' characters defined with \code{\\UXXXXXXXX}. #' #' @param str character vector of strings to be un-escaped #' #' @return #' Returns a character vector. #' If an escape sequence is ill-formed, #' the result will be \code{NA} and a warning will be given. #' #' @examples #' stri_unescape_unicode('a\\u0105!\\u0032\\n') #' #' @family escape #' @export stri_unescape_unicode <- function(str) { .Call(C_stri_unescape_unicode, str) } stringi/R/encoding.R0000644000176200001440000002776014750110641014056 0ustar liggesusers# kate: default-dictionary en_US ## This file is part of the 'stringi' package for R. ## Copyright (c) 2013-2025, Marek Gagolewski ## All rights reserved. ## ## Redistribution and use in source and binary forms, with or without ## modification, are permitted provided that the following conditions are met: ## ## 1. Redistributions of source code must retain the above copyright notice, ## this list of conditions and the following disclaimer. ## ## 2.
Redistributions in binary form must reproduce the above copyright notice, ## this list of conditions and the following disclaimer in the documentation ## and/or other materials provided with the distribution. ## ## 3. Neither the name of the copyright holder nor the names of its ## contributors may be used to endorse or promote products derived from ## this software without specific prior written permission. ## ## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ## 'AS IS' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, ## BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS ## FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT ## HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, ## SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ## PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; ## OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, ## WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE ## OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, ## EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #' @title #' Character Encodings and \pkg{stringi} #' #' @description #' This manual page explains how \pkg{stringi} deals with character #' strings in various encodings. #' #' In particular we should note that: #' \itemize{ #' \item \R lets strings in ASCII, UTF-8, and your platform's #' native encoding coexist. A character vector printed on the console #' by calling \code{\link{print}} or \code{\link{cat}} is #' silently re-encoded to the native encoding. #' \item Functions in \pkg{stringi} process each string internally in #' Unicode, the most universal character encoding ever. #' Even if a string is given in the native encoding, i.e., your platform's #' default one, it will be converted to Unicode (precisely: UTF-8 or UTF-16). 
#' \item Most \pkg{stringi} functions always return UTF-8 encoded strings, #' regardless of the input encoding. What is more, the functions have been #' optimized for UTF-8/ASCII input (they have competitive, if not better #' performance, especially when performing more complex operations like #' string comparison, sorting, and even concatenation). Thus, it is #' best to rely on cascading calls to \pkg{stringi} operations solely. #' } #' #' @details #' Quoting the ICU User Guide, #' 'Hundreds of encodings have been developed over the years, each for small #' groups of languages and for special purposes. As a result, #' the interpretation of text, input, sorting, display, and storage #' depends on the knowledge of all the different types of character sets #' and their encodings. Programs have been written to handle either #' one single encoding at a time and switch between them, or to convert #' between external and internal encodings.' #' #' 'Unicode provides a single character set that covers the major #' languages of the world, and a small number of machine-friendly encoding #' forms and schemes to fit the needs of existing applications and protocols. #' It is designed for best interoperability with both ASCII and ISO-8859-1 #' (the most widely used character sets) to make it easier for Unicode to be #' used in almost all applications and protocols' (see the ICU User Guide). #' #' The Unicode Standard determines the way to map any possible character #' to a numeric value -- a so-called code point. Such code points, however, #' have to be stored somehow in computer's memory. #' The Unicode Standard encodes characters in the range U+0000..U+10FFFF, #' which amounts to a 21-bit code space. Depending on the encoding #' form (UTF-8, UTF-16, or UTF-32), each character will #' then be represented either as a sequence of one to four 8-bit bytes, #' one or two 16-bit code units, or a single 32-bit integer #' (compare the ICU FAQ). 
#' #' Unicode can be thought of as a superset of the spectrum of characters #' supported by any given code page. #' #' @section UTF-8 and UTF-16: #' #' For portability reasons, the UTF-8 encoding is the most natural choice #' for representing Unicode character strings in \R. UTF-8 has ASCII as its #' subset (code points 1--127 represent the same characters in both of them). #' Code points larger than 127 are represented by multi-byte sequences #' (from 2 to 4 bytes: Please note that not all sequences of bytes #' are valid UTF-8, compare \code{\link{stri_enc_isutf8}}). #' #' Most of the computations in \pkg{stringi} are performed internally #' using either UTF-8 or UTF-16 encodings (this depends on the type of service #' you request: some \pkg{ICU} services are designed only to work with UTF-16). #' Due to such a choice, with \pkg{stringi} you get the same result on #' each platform, which is -- unfortunately -- not the case of base \R's #' functions (for instance, it is known that performing a regular expression #' search under Linux on some texts may give you a different result #' from that obtained under Windows). We really had portability in our minds #' while developing our package! #' #' We have observed that \R correctly handles UTF-8 strings regardless of your #' platform's native encoding (see below). Therefore, we decided that most #' functions in \pkg{stringi} will output their results in UTF-8 #' -- this speeds up computations on cascading calls to our functions: #' the strings do not have to be re-encoded each time. #' #' Note that some Unicode characters may have an ambiguous representation. #' For example, ``a with ogonek'' (one character) and ``a''+``ogonek'' #' (two code points) are semantically the same. \pkg{stringi} provides functions #' to normalize character sequences, see \code{\link{stri_trans_nfc}} #' for discussion. However, it is observed that denormalized strings #' do appear very rarely in typical string processing activities.
#' #' Additionally, do note that \pkg{stringi} silently removes byte order marks #' (BOMs - they may incidentally appear in a string read from a text file) #' from UTF8-encoded strings, see \code{\link{stri_enc_toutf8}}. #' #' #' @section Character Encodings in \R: #' #' Data in memory are just bytes (small integer #' values) -- an en\emph{coding} is a way to represent characters with such #' numbers, it is a semantic 'key' to understand a given byte sequence. #' For example, in ISO-8859-2 (Central European), the value 177 represents #' Polish ``a with ogonek'', and in ISO-8859-1 (Western European), #' the same value denotes the ``plus-minus'' sign. Thus, a character encoding #' is a translation scheme: we need to communicate with \R somehow, #' relying on how it represents strings. #' #' Overall, \R has a very simple encoding marking mechanism, #' see \code{\link{stri_enc_mark}}. There is an implicit assumption #' that your platform's default (native) encoding always extends #' ASCII -- \pkg{stringi} checks that whenever your native encoding #' is being detected automatically on \pkg{ICU}'s initialization and each time #' when you change it manually by calling \code{\link{stri_enc_set}}. #' #' Character strings in \R (internally) can be declared to be in: #' \itemize{ #' \item \code{UTF-8}; #' \item \code{latin1}, i.e., either ISO-8859-1 (Western European on #' Linux, OS X, and other Unixes) or WINDOWS-1252 (Windows); #' \item \code{bytes} -- for strings that #' should be manipulated as sequences of bytes. #' } #' Moreover, there are two other cases: #' \itemize{ #' \item ASCII -- for strings consisting only of byte codes #' not greater than 127; #' \item \code{native} (a.k.a. \code{unknown} in \code{\link{Encoding}}; #' quite a misleading name: no explicit encoding mark) -- for #' strings that are assumed to be in your platform's native (default) encoding. #' This can represent UTF-8 if you are an OS X user, #' or some 8-bit Windows code page, for example. 
#' The native encoding used by \R may be determined by examining #' the LC_CTYPE category, see \code{\link{Sys.getlocale}}. #' } #' #' Intuitively, ``native'' strings result from reading #' a string from stdin (e.g., keyboard input). This makes sense: your operating #' system works in some encoding and provides \R with some data. #' #' Each time when a \pkg{stringi} function encounters a string declared #' in native encoding, it assumes that the input data should be translated #' from the default encoding, i.e., the one returned by \code{\link{stri_enc_get}} #' (unless you know what you are doing, the default encoding should only be #' changed if the automatic encoding detection process fails on \pkg{stringi} #' load). #' #' Functions which allow \code{'bytes'} encoding markings are very rare in #' \pkg{stringi}, and were carefully selected. These are: #' \code{\link{stri_enc_toutf8}} (with argument \code{is_unknown_8bit=TRUE}), #' \code{\link{stri_enc_toascii}}, and \code{\link{stri_encode}}. #' #' Finally, note that \R lets strings in ASCII, UTF-8, and your platform's #' native encoding coexist. A character vector printed with #' \code{\link{print}}, \code{\link{cat}}, etc., is silently re-encoded #' so that it can be properly shown, e.g., on the console. #' #' #' @section Encoding Conversion: #' #' Apart from automatic conversion from the native encoding, #' you may re-encode a string manually, for example #' when you read it from a file created on a different platform. #' Call \code{\link{stri_enc_list}} for the list of #' encodings supported by \pkg{ICU}. #' Note that converter names are case-insensitive #' and \pkg{ICU} tries to normalize the encoding specifiers. #' Leading zeroes are ignored in sequences of digits (if further digits follow), #' and all non-alphanumeric characters are ignored. Thus the strings #' 'UTF-8', 'utf_8', 'u*Tf08' and 'Utf 8' are equivalent. 
#' #' The \code{\link{stri_encode}} function #' allows you to convert between any given encodings #' (in some cases you will obtain \code{bytes}-marked #' strings, or even lists of raw vectors (e.g., for UTF-16)). #' There are also some useful more specialized functions, #' like \code{\link{stri_enc_toutf32}} (converts a character vector to a list #' of integers, where one code point is exactly one numeric value) #' or \code{\link{stri_enc_toascii}} (substitutes all non-ASCII #' bytes with the SUBSTITUTE CHARACTER, #' which plays a role similar to \R's \code{NA} value). #' #' There are also some routines for automated encoding detection, #' see, e.g., \code{\link{stri_enc_detect}}. #' #' #' @section Encoding Detection: #' #' Given a text file, one has to know how to interpret (decode) #' raw data in order to obtain meaningful information. #' #' Encoding detection is always an imprecise operation and #' needs a considerable amount of data. However, in the case of some #' encodings (like UTF-8, ASCII, or UTF-32) a ``false positive'' byte #' sequence is quite rare (statistically speaking). #' #' Check out \code{\link{stri_enc_detect}} (among others) for a useful #' function in this category.
#' #' @name about_encoding #' @rdname about_encoding #' @aliases about_encoding stringi-encoding encoding #' @family stringi_general_topics #' @family encoding_management #' @family encoding_detection #' @family encoding_conversion #' #' @references #' \emph{Unicode Basics} -- ICU User Guide, #' \url{https://unicode-org.github.io/icu/userguide/icu/unicode.html} #' #' \emph{Conversion} -- ICU User Guide, #' \url{https://unicode-org.github.io/icu/userguide/conversion/} #' #' \emph{Converters} -- ICU User Guide, #' \url{https://unicode-org.github.io/icu/userguide/conversion/converters.html} #' (technical details) #' #' \emph{UTF-8, UTF-16, UTF-32 & BOM} -- ICU FAQ, #' \url{https://www.unicode.org/faq/utf_bom.html} invisible(NULL) stringi/cleanup0000755000176200001440000000012614771224007013311 0ustar liggesusers#!/bin/sh rm -f config.* src/Makevars src/*.o src/uconfig_local.h src/install.libs.R stringi/NEWS0000644000176200001440000011500714771217072012444 0ustar liggesusers# Changelog ## 1.8.7 (2025-03-27) * [BUGFIX] Fixed build warnings. * [BUGFIX] #512: Fixed PROTECT stack imbalance in `stri_encode_from_marked`. ## 1.8.4 (2024-05-06) * [BUILD TIME] [BUGFIX] #508: Fixed build errors on Windows (thanks to @jeroen and @kalibera). ## 1.8.3 (2023-12-10) * [BUILD TIME] [BUGFIX] Fixed the *format string is not a string literal (potentially insecure)* warnings. ## 1.8.2 (2023-11-22) * [BUILD TIME] [BUGFIX] #501: Fixed failing build on 32-bit Windows (Windows API `ResolveLocaleName` function not available). * [BUILD TIME] [BUGFIX] #502: `PKG_CPPFLAGS` are now considered before other `CPPFLAGS` (the same with other flag types) in the `configure` script to make it compatible with what happens in `Makevars`. * [BUILD TIME] [BUGFIX] Support for ICU's `double` conversion on Loongarch has been restored (see #463). ## 1.8.1 (2023-11-09) * [GENERAL] ICU bundle updated to version 74.1 (Unicode 15.1, CLDR 44). 
* [BACKWARD INCOMPATIBILITY] [BUILD TIME] Support for Solaris has now been dropped. The package is no longer shipped with the very outdated ICU55 bundle. A compiler supporting at least C++11 as well as ICU >= 61 are now required. * [BACKWARD INCOMPATIBILITY] #469: Missing date-time fields in `stri_datetime_parse` and `stri_datetime_create` now default to today's midnight local time. * [BACKWARD INCOMPATIBILITY] Removed the long-deprecated and defunct `fallback_encoding` parameter of `stri_read_lines` and the ellipsis parameter of `stri_opts_collator`, `stri_opts_regex`, `stri_opts_fixed`, and `stri_opts_brkiter`. * [BUILD TIME] As per the suggestion of Prof. Brian Ripley, `icudt74l` (ICU data - little endian) is now included in the source tarball (compressed with xz to save space). This allows for building **`stringi`** on systems with no internet access. * [NEW FEATURE] #476: In break iterator-, date-time-, and collator-based operations (e.g., `stri_sort`), a warning is emitted when the *root* ICU resource bundle is returned when using an *explicitly* requested locale. This might happen when we pass an 'unknown' `locale` argument to these functions. Note that when relying on the default `locale=NULL` argument, no warning is emitted. In such a case, checking if the default locale as returned by `stri_locale_get` is amongst those listed in `stri_locale_list` is recommended. * [NEW FEATURE] The `C` locale identifier now resolves to `en_US_POSIX`. * [BUGFIX] #469: `stri_datetime_parse` did not reset the `Calendar` object when parsing multiple dates. * [BUGFIX] #487: Some functions did not accept ASCII strings longer than 858993457 characters on input. ## 1.7.12 (2023-01-09) * [BUGFIX] Fixed a few issues reported by `rchk`. * [NOTE] [BACKWARD INCOMPATIBLE CHANGE IF ICU >= 72] If building against ICU >= 72, note a backward incompatible change: `@` is no longer considered a word break; for more details, see .
## 1.7.8 (2022-07-11) * [DOCUMENTATION] Paper on **`stringi`** has been published in the *Journal of Statistical Software*; see https://doi.org/10.18637/jss.v103.i02. * [BUGFIX] #473, #397: Fixed buffer overflow in `stri_dup`; also, `stri_dup`, `stri_paste`, ... fail more gracefully on attempts to generate strings of length >= 2^31 each. * [BUILD TIME] #480: Using `Rf_isNull` instead of `isNull`. * [DOCUMENTATION] #462: That the `numeric=TRUE` collator does not handle negative numbers correctly is now mentioned in the manual. ## 1.7.6 (2021-11-29) * [BUILD TIME] #463: Added Loongarch support in ICU's double conversion (@liuxiang88). * [BUGFIX] #467: The UCRT build on Windows was not marking strings as `latin1`. ## 1.7.5 (2021-10-04) * [DOCUMENTATION] Paper on **`stringi`** has been accepted for publication in the *Journal of Statistical Software*, see for a draft version. * [DOCUMENTATION] The **`stringi`** website at https://stringi.gagolewski.com/ now features a comprehensive tutorial based on the aforementioned paper. * [DOCUMENTATION] The *ICU* Project site has been moved to https://icu.unicode.org/. * [BUILD TIME] #457: The `autoconf` macros `AC_LANG_CPLUSPLUS` and `AC_TRY_COMPILE` were obsolete. * [BUGFIX] #458: Passing ALTREP objects no longer yields 'embedded nul in string' errors. ## 1.7.4 (2021-08-12) * [BUGFIX] #449: Fixed segfaults generated by `stri_sprintf`. * [BUILD TIME] No longer defining `USE_RINTERNALS` and `R_NO_REMAP`. ## 1.7.3 (2021-07-15) * [BUGFIX] Fixed the previous patch of ICU55 causing a build failure on, amongst others, CRAN's Solaris-based target. ## 1.7.2 (2021-07-14) * [BUGFIX] Workaround for a bug in `tools::checkFF` failing when `NA_character_` is passed to `.Call`. ## 1.7.1 (2021-07-14) * [BACKWARD INCOMPATIBILITY] `%s$%` and `%stri$%` now use the new `stri_sprintf` (see below) function instead of `base::sprintf`. * [BACKWARD INCOMPATIBILITY, NEW FEATURE] In `stri_sub<-` and `stri_sub_all<-`, providing a negative `length` from now on does not result in the corresponding input string being altered.
* [BACKWARD INCOMPATIBILITY, NEW FEATURE] In `stri_sub` and `stri_sub_all`, negative `length` results in the corresponding output being `NA` or not extracted at all, depending on the setting of the new argument `ignore_negative_length`. * [BACKWARD INCOMPATIBILITY, BUGFIX, NEW FEATURE] In `stri_subset*` and their replacement versions, `pattern` and `value` cannot be longer than `str` (but now they are recycled if necessary). * [BACKWARD INCOMPATIBILITY, NEW FEATURE] `stri_sub*` now accept the `from` argument being a matrix like `cbind(from, length=length)`. Unnamed columns or any other names are still interpreted as `cbind(from, to)`. Also, the new argument `use_matrix` can be used to disable the special treatment of such matrices. * [DOCUMENTATION] It has been clarified that the syntax of `*_charclass` (e.g., used in `stri_trim*`) differs slightly from regex character classes. * [NEW FEATURE] #420: `stri_sprintf` (alias: `stri_string_format`) is a Unicode-aware replacement for and enhancement of the base `sprintf`: it adds a customised handling of `NA`s (on demand), computing field size based on code point width, outputting substrings of at most given width, variable width and precision (both at the same time), etc. Moreover, `stri_printf` can be used to display formatted strings conveniently. * [NEW FEATURE] #153: `stri_match_*_regex` now extract capture group names. * [NEW FEATURE] #25: `stri_locate_*_regex` now have a new argument, `capture_groups`, which allows for extracting positions of matches to parenthesised subexpressions. * [NEW FEATURE] `stri_locate_*` now have a new argument, `get_length`, whose setting may result in generating *from-length* matrices (instead of *from-to* ones). * [NEW FEATURE] #438: `stri_trans_general` now supports rule-based as well as reverse-direction transliteration. * [NEW FEATURE] #434: `stri_datetime_format` and `stri_datetime_parse` are now vectorised also with respect to the `format` argument. 
* [NEW FEATURE] `stri_datetime_fstr` has a new argument, `ignore_special`, which defaults to `TRUE` for backward compatibility. * [NEW FEATURE] `stri_datetime_format`, `stri_datetime_add`, and `stri_datetime_fields` now call `as.POSIXct` more eagerly. * [NEW FEATURE] `stri_trim*` now have a new argument, `negate`. * [NEW FEATURE] `stri_replace_rstr` converts `gsub`-style replacement strings to `stri_replace`-style. * [INTERNAL] `stri_prepare_arg*` have been refactored, buffer overruns in the exception handling subsystem are now avoided. * [BUGFIX] Few functions (`stri_length`, `stri_enc_toutf32`, etc.) did not throw an exception on an invalid UTF-8 byte sequence (and merely issued a warning instead). * [BUGFIX] `stri_datetime_fstr` did not honour `NA_character_` and did not parse format strings such as `"%Y%m%d"` correctly. It has now been completely rewritten (in C). * [BUGFIX] `stri_wrap` did not recognise the width of certain Unicode sequences correctly. ## 1.6.2 (2021-05-14) * [BACKWARD INCOMPATIBILITY] In `stri_enc_list()`, `simplify` now defaults to `TRUE`. * [NEW FEATURE] #425: The outputs of `stri_enc_list()`, `stri_locale_list()`, `stri_timezone_list()`, and `stri_trans_list()` are now sorted. * [NEW FEATURE] #428: In `stri_flatten`, `na_empty=NA` now omits missing values. * [BUILD TIME] #431: Pre-4.9.0 GCC has `::max_align_t`, but not `std::max_align_t`, added a (possible) workaround, see the `INSTALL` file. * [BUGFIX] #429: `stri_width()` misclassified the width of certain code points (including grave accent, Eszett, etc.); General category *Sk* (Symbol, modifier) is no longer of width 0, `UCHAR_EAST_ASIAN_WIDTH` of `U_EA_AMBIGUOUS` is no longer of width 2. * [BUGFIX] #354: `ALTREP` `CHARSXP`s were not copied, and thus could have been garbage collected in the so-called meanwhile (with thanks to @jimhester). 
## 1.6.1 (2021-05-05) * [GENERAL] #401: stringi is now bundled with ICU4C 69.1 (upgraded from 61.1), which is used on most Windows and OS X builds as well as on *nix systems not equipped with system ICU. However, if the C++11 support is disabled, stringi will be built against the battle-tested ICU4C 55.1. The update to ICU brings Unicode 13.0 and CLDR 39 support. * [DOCUMENTATION] A draft version of a paper on **`stringi`** is now available at . * [GENERAL] stringi now requires R >= 3.1 (`CXX_STD` of `CXX11` or `CXX1X`). * [NEW FEATURE] #408: `stri_trans_casefold()` performs case folding; this is different from case mapping, which is locale-dependent. Folding makes two pieces of text that differ only in case identical. This can come in handy when comparing strings. * [NEW FEATURE] #421: `stri_rank()` ranks strings in a character vector (e.g., for ordering data frames with regards to multiple criteria, the ranks can be passed to `order()`, see #219). * [NEW FEATURE] #266: `stri_width()` now supports emojis. * [NEW FEATURE] `%s$%` and `%stri$%` are now vectorised with respect to both arguments. * [BUGFIX] `stri_sort_key()` now outputs `bytes`-encoded strings. * [BUGFIX] #415: `locale=''` was not equivalent to `locale=NULL` in `stri_opts_collator()`. * [INTERNAL] #414: Use `LEVELS(x)` macro instead of accessing `(x)->sxpinfo.gp` directly (@lukaszdaniel). ## 1.5.3 (2020-09-04) * [DOCUMENTATION] stringi home page has moved to and now includes a comprehensive reference manual. * [NEW FEATURE] #400: `%s$%` and `%stri$%` are now binary operators that call base R's `sprintf()`. * [NEW FEATURE] #399: The `%s*%` and `%stri*%` operators can be used in addition to `stri_dup()`, for the very same purpose. * [NEW FEATURE] #355: `stri_opts_regex()` now accepts the `time_limit` and `stack_limit` options so as to prevent malformed or malicious regexes from running for too long. 
* [NEW FEATURE] #345: `stri_startswith()` and `stri_endswith()` are now equipped with the `negate` parameter. * [NEW FEATURE] #382: Incorrect regexes are now reported to ease debugging. * [DEPRECATION WARNING] #347: Any unknown option passed to `stri_opts_fixed()`, `stri_opts_regex()`, `stri_opts_coll()`, and `stri_opts_brkiter()` now generates a warning. In the future, the `...` parameter will be removed, so that will be an error. * [DEPRECATION WARNING] `stri_duplicated()`'s `fromLast` argument has been renamed `from_last`. `fromLast` is now its alias scheduled for removal in a future version of the package. * [DEPRECATION WARNING] `stri_enc_detect2()` is scheduled for removal in a future version of the package. Use `stri_enc_detect()` or the more targeted `stri_enc_isutf8()`, `stri_enc_isascii()`, etc., instead. * [DEPRECATION WARNING] `stri_read_lines()`, `stri_write_lines()`, `stri_read_raw()`: use `con` argument instead of `fname` now. The argument `fallback_encoding` is scheduled for removal and is no longer used. `stri_read_lines()` does not support `encoding="auto"` anymore. * [DEPRECATION WARNING] `nparagraphs` in `stri_rand_lipsum()` has been renamed `n_paragraphs`. * [NEW FEATURE] #398: Alternative, British spelling of function parameters has been introduced, e.g., `stri_opts_coll()` now supports both `normalization` and `normalisation`. * [NEW FEATURE] #393: `stri_read_bin()`, `stri_read_lines()`, and `stri_write_lines()` are no longer marked as draft API. * [NEW FEATURE] #187: `stri_read_bin()`, `stri_read_lines()`, and `stri_write_lines()` now support connection objects as well. * [NEW FEATURE] #386: New function `stri_sort_key()` for generating locale-dependent sort keys which can be ordered at the byte level and return an equivalent ordering to the original string (@DavisVaughan). * [BUGFIX] #138: `stri_encode()` and `stri_rand_strings()` now can generate strings of much larger lengths. 
* [BUGFIX] `stri_wrap()` did not honour `indent` correctly when `use_width` was `TRUE`. ## 1.4.6 (2020-02-17) * [BACKWARD INCOMPATIBILITY] #369: `stri_c()` now returns an empty string when input is empty and `collapse` is set. * [BUGFIX] #370: fixed an issue in `stri_prepare_arg_POSIXct()` reported by rchk. * [DOCUMENTATION] #372: documented arguments not in `\usage` in documentation object `stri_datetime_format`: `...` ## 1.4.5 (2020-01-11) * [BUGFIX] #366: fix for #363 required ICU >= 55 . ## 1.4.4 (2020-01-06) * [BUGFIX] #348: Avoid copying 0 bytes to a nil-buffer in `stri_sub_all()`. * [BUGFIX] #362: Removed `configure` variable `CXXCPP` as it is now deprecated. * [BUGFIX] #318: PROTECTing objects from gcing as reported by `rchk`. * [BUGFIX] #344, #364: Removed compiler warnings in icu61/common/cstring.h. * [BUGFIX] #363: Status of `RegexMatcher` is now checked after its use. ## 1.4.3 (2019-03-12) * [NEW FEATURE] #30: New function `stri_sub_all()` - a version of `stri_sub()` accepting list `from`/`to`/`length` arguments for extracting multiple substrings from each string in a character vector. * [NEW FEATURE] #30: New function `stri_sub_all<-()` (and its `%<%`-friendly version, `stri_sub_replace_all()`) - for replacing multiple substrings with corresponding replacement strings. * [NEW FEATURE] In `stri_sub_replace()`, `value` parameter has a new alias, `replacement`. * [NEW FEATURE] New convenience functions based on `stri_remove_empty()`: `stri_omit_empty_na()`, `stri_remove_empty_na()`, `stri_omit_empty()`, and also `stri_remove_na()`, `stri_omit_na()`. * [BUGFIX] #343: `stri_trans_char()` did not yield correct results for overlapping pattern and replacement strings. * [WARNFIX] #205: `configure.ac` is now included in the source bundle. ## 1.3.1 (2019-02-10) * [BACKWARD INCOMPATIBILITY] #335: A fix to #314 prevented (by design) the use of the system ICU if the library had been compiled with `U_CHARSET_IS_UTF8=1`. 
However, this is the default setting in `libicu`>=61. From now on, in such cases the system ICU is used more eagerly, but `stri_enc_set()` issues a warning stating that the default (UTF-8) encoding cannot be changed. * [NEW FEATURE] #232: All `stri_detect_*` functions now have the `max_count` argument that allows for, e.g., stopping at the first pattern occurrence. * [NEW FEATURE] #338: `stri_sub_replace()` is now an alias for `stri_sub<-()` which makes it much more easily pipable (@yutannihilation, @BastienFR). * [NEW FEATURE] #334: Added missing `icudt61b.dat` to support big-endian platforms (thanks to Dimitri John Ledkov @xnox). * [BUGFIX] #296: Out-of-the-box build used to fail on CentOS 6; upgraded `configure` to `--disable-cxx11` more eagerly at an early stage. * [BUGFIX] #341: Fixed possible buffer overflows when calling `strncpy()` from within ICU 61. * [BUGFIX] #325: Made `configure` more portable so that it works under `/bin/dash` now. * [BUGFIX] #319: Fixed overflow in `stri_rand_shuffle()`. * [BUGFIX] #337: Search functions (e.g., `stri_split_regex()` and `stri_count_fixed()`) used to raise too many warnings on empty search patterns. ## 1.2.4 (2018-07-20) * [BUGFIX] #314: Testing `U_CHARSET_IS_UTF8` in `configure` when using `pkg-build`. * [BUILD TIME] #317: Included `icudt61l.zip` in the source bundle to solve the frequent `icudt download failed` error (also on CRAN's `windows-release` and `windows-oldrel`). (reverted in version 1.3.1, the `winbuilder` errors were caused by a build chain bug). ## 1.2.3 (2018-05-16) * [BUGFIX] #296: Fixed the behaviour of the `configure` script on CentOS 6. * [BUGFIX] Fixed broken Windows build by updating the `icudt` mirror list. ## 1.2.2 (2018-05-01) * [GENERAL] #193: stringi is now bundled with ICU4C 61.1, which is used on most Windows and OS X builds as well as on *nix systems not equipped with ICU. However, if the C++11 support is disabled, stringi will be built against ICU4C 55.1.
The update to ICU brings Unicode 10.0 support, including new emoji characters. * [BUGFIX] #288: `stri_match()` did not return the correct number of columns when input was empty. * [NEW FEATURE] #188: `stri_enc_detect()` now returns a list of data frames. * [NEW FEATURE] #289: `stri_flatten()` now has `na_empty` and `omit_empty` arguments. * [NEW FEATURE] New functions: `stri_remove_empty()`, `stri_na2empty()`. * [NEW FEATURE] #285: Coercion from a non-trivial list (one that consists of atomic vectors, each of length 1) to an atomic vector now issues a warning. * [WARN] Removed `-Wparentheses` warnings in `icu55/common/cstring.h:38:63` and `icu55/i18n/windtfmt.cpp` in the ICU4C 55.1 bundle. ## 1.1.7 (2018-03-06) * [BUGFIX] Fixed ICU4C 55.1 generating some *significant warnings* (`icu55/i18n/winnmfmt.cpp`) and *suppressing important diagnostics* (`src/icu55/i18n/decNumber.c`). ## 1.1.6 (2017-11-10) * [WINDOWS SPECIFIC] #270: Strings marked with `latin1` encoding are now converted internally to UTF-8 using the WINDOWS-1252 codec. This fixes problems with - among others - displaying the Euro sign. * [NEW FEATURE] #263: Added support for custom rule-based break iteration, see `?stri_opts_brkiter`. * [NEW FEATURE] #267: `omit_na=TRUE` in `stri_sub<-()` now ignores missing values in any of the arguments provided. * [BUGFIX] Fixed unPROTECTed variable names and stack imbalances as reported by `rchk`. ## 1.1.5 (2017-04-07) * [GENERAL] stringi now requires ICU4C >= 52. * [BUGFIX] Fixed errors pointed out by `clang-UBSAN` in `stri_brkiter.h`. * [GENERAL] stringi now requires R >= 2.14. * [BUILD TIME] #238, #220: Now trying *standard* ICU4C build flags if a call to `pkg-config` fails. * [BUILD TIME] #258: Use `CXX11` instead of `CXX1X` on R >= 3.4. * [BUILD TIME, BUGFIX] #254: `dir.exists()` is R >= 3.2. ## 1.1.3 (2017-03-21) * [REMOVE DEPRECATED] `stri_install_check()` and `stri_install_icudt()` marked as deprecated in stringi 0.5-5 are no longer being exported.
* [BUGFIX] #227: Incorrect behaviour of `stri_sub()` and `stri_sub<-()` if the empty string was the result. * [BUILD TIME] #231: The `configure` (Linux/Unix only) script now reads the following environment variables: `STRINGI_CFLAGS`, `STRINGI_CPPFLAGS`, `STRINGI_CXXFLAGS`, `STRINGI_LDFLAGS`, `STRINGI_LIBS`, `STRINGI_DISABLE_CXX11`, `STRINGI_DISABLE_ICU_BUNDLE`, `STRINGI_DISABLE_PKG_CONFIG`, `PKG_CONFIG`, see `INSTALL` for more information. * [BUILD TIME] #253: Call to `R_useDynamicSymbols()` added. * [BUILD TIME] #230: `icudt` is now being downloaded by `configure` (*NIX only) *before* building. * [BUILD TIME] #242: `_COUNT/_LIMIT` enum constants have been deprecated as of ICU 58.2, stringi code has been upgraded accordingly. ## 1.1.2 (2016-09-30) * [BUGFIX] `round()`, `snprintf()` is not C++98. ## 1.1.1 (2016-05-25) * [BUGFIX] #214: Allow a regex pattern like `.*` to match an empty string. * [BUGFIX] #210: `stri_replace_all_fixed(c("1", "NULL"), "NULL", NA)` now results in `c("1", NA)`. * [NEW FEATURE] #199: `stri_sub<-()` now allows for ignoring `NA` locations (a new `omit_na` argument added). * [NEW FEATURE] #207: `stri_sub<-()` now allows for substring insertions (via `length=0`). * [NEW FUNCTION] #124: `stri_subset<-()` functions added. * [NEW FEATURE] #216: `stri_detect()`, `stri_subset()`, `stri_subset<-()` now all have the `negate` argument. * [NEW FUNCTION] #175: `stri_join_list()` concatenates all strings in a list of character vectors. Useful in conjunction with, e.g., `stri_extract_all_regex()`, `stri_extract_all_words()`, etc. ## 1.0-1 (2015-10-22) * [GENERAL] #88: C API is now available for use in, e.g., Rcpp packages, see for an example. * [BUGFIX] #183: Floating point exception raised in `stri_sub()` and `stri_sub<-()` when `to` or `length` was a zero-length numeric vector. * [BUGFIX] #180: `stri_c()` warned incorrectly (recycling rule) when using more than two elements. 
## 0.5-5 (2015-06-28) * [BACKWARD INCOMPATIBILITY] `stri_install_check()` and `stri_install_icudt()` are now deprecated. From now on they are supposed to be used only by the stringi installer. * [BUGFIX] #176: A patch for `sys/feature_tests.h` is no longer included (the original file was copyrighted by Sun Microsystems); fixed the *Compiler or options invalid for pre-Unix 03 X/Open applications and pre-2001 POSIX applications* error by forcing (conditionally) `_XPG6` conformance. * [BUGFIX] #174: `stri_paste()` did not generate any warning when the recycling rule was violated and `sep==""`. * [BUGFIX] #170: `icu::setDataDirectory` is no longer called if our ICU source bundle is not used (this used to cause build problems on openSUSE). * [BUILD TIME] #169: `configure` now tries to switch to the *standard* C++ compiler if a C++11 one is not configured correctly. * [BUILD TIME] `configure.win` (`Biarch: TRUE`) now mimics `autoconf`'s `AC_SUBST` and `AC_CONFIG_FILES` so that the build process is now more similar across different platforms. * [NEW FEATURE] `stri_info()` now also gives information about which version of ICU4C is in use (system or bundle). ## 0.5-2 (2015-06-21) * [BACKWARD INCOMPATIBILITY] The second argument to `stri_pad_*()` has been renamed `width`. * [GENERAL] #69: stringi is now bundled with ICU4C 55.1. * [NEW FUNCTIONS] `stri_extract_*_boundaries()` extract text between text boundaries. * [NEW FUNCTION] #46: `stri_trans_char()` is a stringi-flavoured `chartr()` equivalent. * [NEW FUNCTION] #8: `stri_width()` approximates the *width* of a string in a more Unicode-ish fashion than `nchar(..., "width")`. * [NEW FEATURE] #149: `stri_pad()` and `stri_wrap()` are now (by default) based on code point widths instead of the number of code points. Moreover, the default behaviour of `stri_wrap()` is now such that it does not get rid of non-breaking, zero width, etc., spaces.
* [NEW FEATURE] #133: `stri_wrap()` silently allows for `width <= 0` (for compatibility with `strwrap()`). * [NEW FEATURE] #139: `stri_wrap()` gained a new argument: `whitespace_only`. * [NEW FUNCTIONS] #137: Date-time formatting/parsing: * `stri_timezone_list()` - lists all known time zone identifiers; * `stri_timezone_set()`, `stri_timezone_get()` - manage the current default time zone; * `stri_timezone_info()` - basic information on a given time zone; * `stri_datetime_symbols()` - gives localizable date-time formatting data; * `stri_datetime_fstr()` - converts a `strptime`-like format string to an ICU date/time format string; * `stri_datetime_format()` - converts date/time to string; * `stri_datetime_parse()` - converts string to date/time object; * `stri_datetime_create()` - constructs date-time objects from numeric representations; * `stri_datetime_now()` - returns current date-time; * `stri_datetime_fields()` - returns date-time fields' values; * `stri_datetime_add()` - adds specific number of date-time units to a date-time object. * [GENERAL] #144: Performance improvements in handling ASCII strings (these affect `stri_sub()`, `stri_locate()` and other string index-based operations) * [GENERAL] #143: Searching for short fixed patterns (`stri_*_fixed()`) now relies on the current `libC`'s implementation of `strchr()` and `strstr()`. This is very fast, e.g., on `glibc` using the `SSE2/3/4` instruction set. * [BUILD TIME] #141: A local copy of `icudt*.zip` may be used on package install; see the `INSTALL` file for more information. * [BUILD TIME] #165: The `configure` option `--disable-icu-bundle` forces the use of system ICU when building the package. * [BUGFIX] Locale specifiers are now normalized in a more intelligent way: e.g., `@calendar=gregorian` expands to `DEFAULT_LOCALE@calendar=gregorian`. * [BUGFIX] #134: `stri_extract_all_words()` did not accept `simplify=NA`. * [BUGFIX] #132: Incorrect behaviour in `stri_locate_regex()` for matches of zero lengths. 
* [BUGFIX] stringr/#73: `stri_wrap()` returned `CHARSXP` instead of `STRSXP` on empty string input with `simplify=FALSE` argument. * [BUGFIX] #164: Using `libicu-dev` failed on Ubuntu (`LIBS` shall be passed after `LDFLAGS` and the list of `.o` files). * [BUGFIX] #168: Build now fails if `icudt` is not available. * [BUGFIX] #135: C++11 is now used by default (see the `INSTALL` file, however) to build stringi from sources. This is because ICU4C uses the `long long` type which is not part of the C++98 standard. * [BUGFIX] #154: Dates and other objects with a custom class attribute were not coerced to the character type correctly. * [BUGFIX] Force ICU `u_init()` call on the stringi dynlib load. * [BUGFIX] #157: Many overfull `hbox`es in the package PDF manual have been corrected. ## 0.4-1 (2014-12-11) * [IMPORTANT CHANGE] `n_max` argument in `stri_split_*()` has been renamed `n`. * [IMPORTANT CHANGE] `simplify=FALSE` in `stri_extract_all_*()` and `stri_split_*()` now calls `stri_list2matrix()` with `fill=""`. `fill=NA_character_` may be obtained by using `simplify=NA`. * [IMPORTANT CHANGE, NEW FUNCTIONS] #120: `stri_extract_words()` has been renamed `stri_extract_all_words()` and `stri_locate_boundaries()` - `stri_locate_all_boundaries()` as well as `stri_locate_words()` - `stri_locate_all_words()`. New functions are now available: `stri_locate_first_boundaries()`, `stri_locate_last_boundaries()`, `stri_locate_first_words()`, `stri_locate_last_words()`, `stri_extract_first_words()`, `stri_extract_last_words()`. * [IMPORTANT CHANGE] #111: `opts_regex`, `opts_collator`, `opts_fixed`, and `opts_brkiter` can now be supplied individually via `...`. In other words, you may now simply call, e.g., `stri_detect_regex(str, pattern, case_insensitive=TRUE)` instead of `stri_detect_regex(str, pattern, opts_regex=stri_opts_regex(case_insensitive=TRUE))`. 
* [NEW FEATURE] #110: Fixed pattern search engine's settings can now be supplied via `opts_fixed` argument in `stri_*_fixed()`, see `stri_opts_fixed()`. A simple (not suitable for natural language processing) yet very fast `case_insensitive` pattern matching can be performed now. `stri_extract_*_fixed()` is again available. * [NEW FEATURE] #23: `stri_extract_all_fixed()`, `stri_count()`, and `stri_locate_all_fixed()` may now also look for overlapping pattern matches, see `?stri_opts_fixed`. * [NEW FEATURE] #129: `stri_match_*_regex()` gained a `cg_missing` argument. * [NEW FEATURE] #117: `stri_extract_all_*()`, `stri_locate_all_*()`, `stri_match_all_*()` gained a new argument: `omit_no_match`. Setting it to `TRUE` makes these functions compatible with their **`stringr`** equivalents. * [NEW FEATURE] #118: `stri_wrap()` gained `indent`, `exdent`, `initial`, and `prefix` arguments. Moreover, Knuth's dynamic word wrapping algorithm now assumes that the cost of printing the last line is zero, see #128. * [NEW FEATURE] #122: `stri_subset()` gained an `omit_na` argument. * [NEW FEATURE] `stri_list2matrix()` gained an `n_min` argument. * [NEW FEATURE] #126: `stri_split()` is now also able to act just like `stringr::str_split_fixed()`. * [NEW FEATURE] #119: `stri_split_boundaries()` now has `n`, `tokens_only`, and `simplify` arguments. Additionally, `stri_extract_all_words()` is now equipped with `simplify` arg. * [NEW FEATURE] #116: `stri_paste()` gained a new argument: `ignore_null`. Setting it to `TRUE` makes this function more compatible with `paste()`. * [OTHER] #123: `useDynLib` is used to speed up symbol look-up in the compiled dynamic library. * [BUGFIX] #114: `stri_paste()`: could return result in an incorrect order. * [BUGFIX] #94: Run-time errors on Solaris caused by setting `-DU_DISABLE_RENAMING=1` - memory allocation errors in, among others, the ICU `UnicodeString`. This setting also caused some `ASAN` sanity check failures within ICU code. 
## 0.3-1 (2014-11-06) * [IMPORTANT CHANGE] #87: `%>%` overlapped with the pipe operator from the `magrittr` package; now each operator like `%>%` has been renamed `%s>%`. * [IMPORTANT CHANGE] #108: Now the `BreakIterator` (for text boundary analysis) may be more easily controlled via `stri_opts_brkiter()` (see options `type` and `locale` which aim to replace now-removed `boundary` and `locale` parameters to `stri_locate_boundaries()`, `stri_split_boundaries()`, `stri_trans_totitle()`, `stri_extract_words()`, and `stri_locate_words()`). * [NEW FUNCTIONS] #109: `stri_count_boundaries()` and `stri_count_words()` count the number of text boundaries in a string. * [NEW FUNCTIONS] #41: `stri_startswith_*()` and `stri_endswith_*()` determine whether a string starts or ends with a given pattern. * [NEW FEATURE] #102: `stri_replace_all_*()` now all have the `vectorize_all` parameter, which defaults to `TRUE` for backward compatibility. * [NEW FUNCTION] #91: Added `stri_subset_*()` - a convenient and more efficient substitute for `str[stri_detect_*(str, ...)]`. * [NEW FEATURE] #100: `stri_split_fixed()`, `stri_split_charclass()`, `stri_split_regex()`, `stri_split_coll()` gained a `tokens_only` parameter, which defaults to `FALSE` for backward compatibility. * [NEW FUNCTION] #105: `stri_list2matrix()` converts lists of atomic vectors to character matrices, useful in conjunction with `stri_split()` and `stri_extract()`. * [NEW FEATURE] #107: `stri_split_*()` now allow setting an `omit_empty=NA` argument. * [NEW FEATURE] #106: `stri_split()` and `stri_extract_all()` gained a `simplify` argument (if `TRUE`, then `stri_list2matrix(..., byrow=TRUE)` is called on the resulting list). * [NEW FUNCTION] #77: `stri_rand_lipsum()` generates a (pseudo)random dummy *lorem ipsum* text. * [NEW FEATURE] #98: `stri_trans_totitle()` gained a `opts_brkiter` parameter; it indicates which ICU `BreakIterator` should be used when case mapping. 
* [NEW FEATURE] `stri_wrap()` gained a new parameter: `normalize`. * [BUGFIX] #86: `stri_*_fixed()`, `stri_*_coll()`, and `stri_*_regex()` could give incorrect results if one of the search strings was of length 0. * [BUGFIX] #99: `stri_replace_all()` did not use the `replacement` arg. * [BUGFIX] #112: Some of the objects were not PROTECTed from garbage collection - this could have led to spontaneous SEGFAULTS. * [BUGFIX] Some collator options were not passed correctly to ICU services. * [BUGFIX] Memory leaks as detected by `valgrind --tool=memcheck --leak-check=full` have been removed. * [DOCUMENTATION] Significant extensions/clean ups in the stringi manual. ## 0.2-5 (2014-05-16) * Some examples are no longer run if `icudt` is not available (this was reverted in a later version though). ## 0.2-4 (2014-05-15) * [BUGFIX] Fixed issues with loading of misaligned addresses in `stri_*_fixed()`. ## 0.2-3 (2014-05-14) * [IMPORTANT CHANGE] `stri_cmp*()` now do not allow for passing `opts_collator=NA`. From now on, `stri_cmp_eq()`, `stri_cmp_neq()`, and the new operators `%===%`, `%!==%`, `%stri===%`, and `%stri!==%` are locale-independent operations, which are based on code point comparisons. New functions `stri_cmp_equiv()` and `stri_cmp_nequiv()` (and from now on also `%==%`, `%!=%`, `%stri==%`, and `%stri!=%`) test for canonical equivalence. * [IMPORTANT CHANGE] `stri_*_fixed()` search functions now perform a locale-independent exact (byte-wise, of course after conversion to UTF-8) pattern search. All the `Collator`-based, locale-dependent search routines are now available via `stri_*_coll()`. The reason behind this is that ICU's `USearch` currently has very poor performance. What is more, in many search tasks exact pattern matching is sufficient anyway. * [GENERAL] `stri_*_fixed` now use a tweaked Knuth-Morris-Pratt search algorithm which improves the search performance drastically.
* [IMPORTANT CHANGE] `stri_enc_nf*()` and `stri_enc_isnf*()` function families have been renamed `stri_trans_nf*()` and `stri_trans_isnf*()`, respectively -- they deal with text transforming, and not with character encoding. Note that all of these may be performed by ICU's `Transliterator` too (see below). * [NEW FUNCTION] `stri_trans_general()` and `stri_trans_list()` give access to ICU's `Transliterator`: they may be used to perform some generic text transforms, like Unicode normalisation, case folding, etc. * [NEW FUNCTION] `stri_split_boundaries()` uses ICU's `BreakIterator` to split strings at specific text boundaries. Moreover, `stri_locate_boundaries()` indicates positions of these boundaries. * [NEW FUNCTION] `stri_extract_words()` uses ICU's `BreakIterator` to extract all words from a text. Additionally, `stri_locate_words()` locates start and end positions of words in a text. * [NEW FUNCTION] `stri_pad()`, `stri_pad_left()`, `stri_pad_right()`, and `stri_pad_both()` pad a string with a specific code point. * [NEW FUNCTION] `stri_wrap()` breaks paragraphs of text into lines. Two algorithms (greedy and minimal raggedness) are available. * [IMPORTANT CHANGE] `stri_*_charclass()` search functions now rely solely on ICU's `UnicodeSet` patterns. All the previously accepted charclass identifiers became invalid. However, new patterns should now be more familiar to the users (they are regex-like). Moreover, we observe a very nice performance gain. * [IMPORTANT CHANGE] `stri_sort()` now does not include `NA`s in output vectors by default, for compatibility with `sort()`. Moreover, currently none of the input vector's attributes are preserved. * [NEW FUNCTION] `stri_unique()` extracts unique elements from a character vector. * [NEW FUNCTIONS] `stri_duplicated()` and `stri_duplicated_any()` determine duplicate elements in a character vector.
* [NEW FUNCTION] `stri_replace_na()` replaces `NA`s in a character vector with a given string, useful for emulating, e.g., R's `paste()` behaviour. * [NEW FUNCTION] `stri_rand_shuffle()` generates a random permutation of code points in a string. * [NEW FUNCTION] `stri_rand_strings()` generates random strings. * [NEW FUNCTIONS] New functions and binary operators for string comparison: `stri_cmp_eq()`, `stri_cmp_neq()`, `stri_cmp_lt()`, `stri_cmp_le()`, `stri_cmp_gt()`, `stri_cmp_ge()`, `%==%`, `%!=%`, `%<%`, `%<=%`, `%>%`, `%>=%`. * [NEW FUNCTION] `stri_enc_mark()` reads declared encodings of character strings as seen by stringi. * [NEW FUNCTION] `stri_enc_tonative(str)` is an alias to `stri_encode(str, NULL, NULL)`. * [NEW FEATURE] `stri_order()` and `stri_sort()` now have an additional argument `na_last` (defaults to `TRUE` and `NA`, respectively). * [NEW FEATURE] `stri_replace_all_charclass()`, `stri_extract_all_charclass()`, and `stri_locate_all_charclass()` now have a new argument, `merge` (defaults to `FALSE` for backward-compatibility). It may be used to, e.g., replace sequences of white spaces with a single space. * [NEW FEATURE] `stri_enc_toutf8()` now has a new `validate` argument (which defaults to `FALSE` for backward-compatibility). It may be used in a (rare) case where a user wants to fix an invalid UTF-8 byte sequence. `stri_length()` (among others) now detects invalid UTF-8 byte sequences. * [NEW FEATURE] All binary operators `%???%` now also have aliases `%stri???%`. * [GENERAL] Performance improvements in `StriContainerUTF8` and `StriContainerUTF16` (they affect most other functions). * [GENERAL] Significant performance improvements in `stri_join()`, `stri_flatten()`, `stri_cmp()`, `stri_trans_to*()`, and others. * [GENERAL] Added 3rd mirror site for our `icudt` binary distribution. * `U_MISSING_RESOURCE_ERROR` message in `StriException` now suggests calling `stri_install_check()`. * [BUGFIX] UTF-8 BOMs are now silently removed from input strings. 
* [BUGFIX] No more attempts to re-encode UTF-8 encoded strings if native encoding is UTF-8 in `StriContainerUTF8`. * [BUGFIX] Possible memory leaks when throwing errors via `Rf_error()`. * [BUGFIX] `stri_order()` and `stri_cmp()` could return incorrect results for `opts_collator=NA`. * [BUGFIX] `stri_sort()` did not guarantee to return strings in UTF-8. ## 0.1-25 (2014-03-12) * LICENSE tweaks. * First CRAN release. ## 0.1-24 (2014-03-11) * Fixed bugs detected with `ASAN` and `UBSAN`, e.g., fixed `CharClass::gcmask` type (`enum` -> `uint32_t`) (reported by `UBSAN`). * Fixed array over-runs detected with `valgrind` in `string8.h`. * Fixed uninitialised class fields in `StriContainerUTF8` (reported by `valgrind`). ## 0.1-23 (2014-03-11) * License changed to BSD-3-clause, COPYRIGHTS updated. * `icudt` is not shipped with stringi anymore; it is now downloaded in `install.libs.R` from one of our servers. * New functions: `stri_install_check()`, `stri_install_icudt()`. ## 0.1-22 (2014-02-20) * System ICU is used on systems which do have one (version >= 50 needed). ICU is auto-detected with `pkg-config` in `configure`. Pass `'--disable-pkg-config'` to `configure` to force building ICU from sources. * `icudt52b` (custom subset) is now shipped with stringi (for big-endian, ASCII systems). ## 0.1-21 (2014-02-19) * Fixed some issues on Solaris while preparing stringi for CRAN submission. ## 0.1-20 (2014-02-17) * ICU4C 52.1 sources included (common, i18n, stubdata + `icu52dt.dat` loaded dynamically). Compilation via Makevars. * stringi does not depend on any external libraries anymore. ## 0.1-11 (2013-11-16) * ICU4C is now statically linked on Windows. * First OS X binary build. * The package is being intensively tested by our students at Warsaw University of Technology. ## 0.1-10 (2013-11-13) * Using `pkg-config` via `configure` to look for ICU4C libs. ## 0.1-6 (2013-07-05) * First Windows binary build. * Compilation passed on Oracle Sun Studio compiler collection. 
* By now we have implemented most of the functionality scheduled for milestone 0.1. ## 0.1-1 (2013-01-05) * The stringi project has been started. stringi/src/0000755000176200001440000000000014771224007012524 5ustar liggesusersstringi/src/stri_callables.cpp0000644000176200001440000000372714750143456016231 0ustar liggesusers/* This file is part of the 'stringi' project. * Copyright (c) 2013-2025, Marek Gagolewski * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "stri_stringi.h" #include "stri_callables.h" const extern R_CallMethodDef stri_callables[] = { {"stric_u_hasBinaryProperty", (DL_FUNC)(void (*) (void))(&stric_u_hasBinaryProperty), 0/*unused*/}, {NULL, NULL, 0} }; int stric_u_hasBinaryProperty(int c, int which) { return (int)u_hasBinaryProperty((UChar32)c, (UProperty)which); } stringi/src/stri_ICU_settings.cpp0000644000176200001440000000735514770540074016647 0ustar liggesusers/* This file is part of the 'stringi' project. * Copyright (c) 2013-2025, Marek Gagolewski * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "stri_stringi.h" #ifndef STRI_ICU_FOUND #include "uconfig_local.h" #endif /** Get current-default ICU locale and charset information * * @return an R named list with 7 components: * \code{Unicode.version} == ICU Unicode version, * \code{ICU.version} == U_ICU_VERSION * \code{Locale} == \code{stri_locale_info()}, * \code{Charset.internal} == \code{"UTF-8", "UTF-16"}, * \code{Charset.native} == \code{stri_enc_info()}) * \code{ICU.system} == is system ICU used? * \code{ICU.UTF8} == is U_CHARSET_IS_UTF8 set? * * @version 0.1-?? (Marek Gagolewski) * * @version 0.1-?? (Marek Gagolewski, 2013-06-16) * make StriException friendly * * @version 0.1-?? 
(Marek Gagolewski, 2013-11-17) * added U_ICU_VERSION * * @version 0.3-1 (Marek Gagolewski, 2014-11-04) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc * * @version 0.5-3 (Marek Gagolewski, 2015-06-24) * new retval field: ICU.system * * @version 1.3.1 (Marek Gagolewski, 2019-02-06) * new retval field: ICU.UTF8 * */ SEXP stri_info() { STRI__ERROR_HANDLER_BEGIN(0) const R_len_t infosize = 7; SEXP vals; STRI__PROTECT(vals = Rf_allocVector(VECSXP, infosize)); SET_VECTOR_ELT(vals, 0, Rf_mkString(U_UNICODE_VERSION)); SET_VECTOR_ELT(vals, 1, Rf_mkString(U_ICU_VERSION)); SET_VECTOR_ELT(vals, 2, stri_locale_info(R_NilValue)); // may call Rf_error SET_VECTOR_ELT(vals, 3, stri__make_character_vector_char_ptr(2, "UTF-8", "UTF-16")); // fixed strings SET_VECTOR_ELT(vals, 4, stri_enc_info(R_NilValue)); // may call Rf_error SET_VECTOR_ELT(vals, 5, Rf_ScalarLogical(STRI_ICU_FOUND)); SET_VECTOR_ELT(vals, 6, Rf_ScalarLogical(0)); #ifdef U_CHARSET_IS_UTF8 #if U_CHARSET_IS_UTF8 SET_VECTOR_ELT(vals, 6, Rf_ScalarLogical(1)); #endif #endif stri__set_names(vals, infosize, "Unicode.version", "ICU.version", "Locale", "Charset.internal", "Charset.native", "ICU.system", "ICU.UTF8"); STRI__UNPROTECT_ALL return vals; STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */) } stringi/src/stri_brkiter.h0000644000176200001440000002225614770541312015407 0ustar liggesusers/* This file is part of the 'stringi' project. * Copyright (c) 2013-2025, Marek Gagolewski * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. 
* * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef __stri_brkiter_h #define __stri_brkiter_h #include "stri_stringi.h" #include #include #include #include #include #include /** * A class to manage a break iterator's options * * @version 0.4-1 (Marek Gagolewski, 2014-12-02) * * @version 1.1.3 (Marek Gagolewski, 2017-01-07) UBRK_COUNT deprecated * * @version 1.1.6 (Marek Gagolewski, 2017-04-22) Add support for RBBI */ class StriBrkIterOptions { protected: const char* locale; // R_alloc'd UnicodeString rules; UBreakIteratorType type; int32_t* skip_rules; // R_alloc'd R_len_t skip_size; // number of elements in skip_rules private: void setEmptyOpts() { locale = NULL; type = UBRK_CHARACTER; skip_rules = NULL; skip_size = 0; } void setType(SEXP opts_brkiter, const char* default_type); void setLocale(SEXP opts_brkiter); void setSkipRuleStatus(SEXP opts_brkiter); public: StriBrkIterOptions() { setEmptyOpts(); } StriBrkIterOptions(SEXP opts_brkiter, const char* default_type) { setEmptyOpts(); setLocale(opts_brkiter); setSkipRuleStatus(opts_brkiter); 
setType(opts_brkiter, default_type); } }; /** * A class to manage a break iterator * * @version 0.3-1 (Marek Gagolewski, 2014-10-30) * * @version 0.4-1 (Marek Gagolewski, 2014-12-02) separate class * * @version 1.1.3 (Marek Gagolewski, 2017-01-07) UBRK_COUNT deprecated * * @version 1.1.6 (Marek Gagolewski, 2017-04-22) Add support for RBBI * * @version 1.8.1 (Marek Gagolewski, 2023-11-09) * warn if resource bundle for an explicitly set locale is unavailable */ class StriUBreakIterator : public StriBrkIterOptions { private: UBreakIterator* uiterator; void open() { #ifndef NDEBUG if (uiterator) throw StriException("!NDEBUG: StriUBreakIterator::open()"); #endif UErrorCode status = U_ZERO_ERROR; if (!rules.isEmpty()) { UParseError parseErr; uiterator = ubrk_openRules(rules.getTerminatedBuffer(), -1/*null-terminated*/, NULL, 0, &parseErr, &status); } else { switch (type) { case UBRK_CHARACTER: // character uiterator = ubrk_open(UBRK_CHARACTER, locale, NULL, 0, &status); break; case UBRK_LINE: // line_break uiterator = ubrk_open(UBRK_LINE, locale, NULL, 0, &status); break; case UBRK_SENTENCE: // sentence uiterator = ubrk_open(UBRK_SENTENCE, locale, NULL, 0, &status); break; case UBRK_WORD: // word uiterator = ubrk_open(UBRK_WORD, locale, NULL, 0, &status); break; default: throw StriException(MSG__INTERNAL_ERROR); } } STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) if (status == U_USING_DEFAULT_WARNING && uiterator && locale) { UErrorCode status2 = U_ZERO_ERROR; const char* valid_locale = ubrk_getLocaleByType(uiterator, ULOC_VALID_LOCALE, &status2); if (valid_locale && !strcmp(valid_locale, "root")) Rf_warning("%s", ICUError::getICUerrorName(status)); } } public: StriUBreakIterator() : StriBrkIterOptions() { uiterator = NULL; } StriUBreakIterator(const StriBrkIterOptions& bropt) : StriBrkIterOptions(bropt) { uiterator = NULL; } StriUBreakIterator& operator=(const StriBrkIterOptions& bropt) { this->~StriUBreakIterator(); (StriBrkIterOptions&) (*this) = 
(StriBrkIterOptions&)bropt; uiterator = NULL; return *this; } ~StriUBreakIterator() { if (uiterator) { ubrk_close(uiterator); uiterator = NULL; } } void free(bool dealloc=true) { if (uiterator && dealloc) { ubrk_close(uiterator); } uiterator = NULL; } UBreakIterator* getIterator() { if (!uiterator) open(); return uiterator; } const char* getLocale() { return locale; } }; /** * A class to manage a break iterator * * @version 0.3-1 (Marek Gagolewski, 2014-10-30) * * @version 0.4-1 (Marek Gagolewski, 2014-12-02) * separate class * * @version 1.1.6 (Marek Gagolewski, 2017-04-22) Add support for RBBI * * @version 1.8.1 (Marek Gagolewski, 2023-11-09) * warn if resource bundle for an explicitly set locale is unavailable */ class StriRuleBasedBreakIterator : public StriBrkIterOptions { private: BreakIterator* rbiterator; UText* searchText; R_len_t searchPos; // may be BreakIterator::DONE const char* searchStr; // owned by caller R_len_t searchLen; // in bytes void setEmptyOpts() { rbiterator = NULL; searchText = NULL; searchPos = BreakIterator::DONE; searchStr = NULL; searchLen = 0; } void open() { UErrorCode status = U_ZERO_ERROR; Locale loc = Locale::createFromName(locale); if (!rules.isEmpty()) { UParseError parseErr; rbiterator = (BreakIterator*) new RuleBasedBreakIterator( UnicodeString(rules), parseErr, status ); } else { switch (type) { case UBRK_CHARACTER: // character rbiterator = (BreakIterator*)BreakIterator::createCharacterInstance(loc, status); break; case UBRK_LINE: // line_break rbiterator = (BreakIterator*)BreakIterator::createLineInstance(loc, status); break; case UBRK_SENTENCE: // sentence rbiterator = (BreakIterator*)BreakIterator::createSentenceInstance(loc, status); break; case UBRK_WORD: // word rbiterator = (BreakIterator*)BreakIterator::createWordInstance(loc, status); break; default: throw StriException(MSG__INTERNAL_ERROR); } } STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) if (status == U_USING_DEFAULT_WARNING && rbiterator 
&& locale) { UErrorCode status2 = U_ZERO_ERROR; const char* valid_locale = rbiterator->getLocaleID(ULOC_VALID_LOCALE, status2); if (valid_locale && !strcmp(valid_locale, "root")) Rf_warning("%s", ICUError::getICUerrorName(status)); } } bool ignoreBoundary(); public: StriRuleBasedBreakIterator() : StriBrkIterOptions() { setEmptyOpts(); } StriRuleBasedBreakIterator(const StriBrkIterOptions& bropt) : StriBrkIterOptions(bropt) { setEmptyOpts(); } StriRuleBasedBreakIterator& operator=(const StriBrkIterOptions& bropt) { this->~StriRuleBasedBreakIterator(); (StriBrkIterOptions&) (*this) = (StriBrkIterOptions&)bropt; setEmptyOpts(); return *this; } ~StriRuleBasedBreakIterator() { if (rbiterator) { delete rbiterator; rbiterator = NULL; } if (searchText) { utext_close(searchText); searchText = NULL; } } void setupMatcher(const char* searchStr, R_len_t searchLen); void first(); bool next(); bool next(std::pair& bdr); void last(); bool previous(std::pair& bdr); }; #endif stringi/src/stri_search_regex_split.cpp0000644000176200001440000002222514770541312020146 0ustar liggesusers/* This file is part of the 'stringi' project. * Copyright (c) 2013-2025, Marek Gagolewski * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "stri_stringi.h" #include "stri_container_utf8.h" #include "stri_container_integer.h" #include "stri_container_logical.h" #include "stri_container_regex.h" #include #include using namespace std; /** * Split a string into parts. * * The pattern matches identify delimiters that separate the input into fields. * The input data between the matches becomes the fields themselves. * * @param str character vector * @param pattern character vector * @param n integer vector * @param opts_regex * @param tokens_only single logical value * @param simplify single logical value * * @return list of character vectors or character matrix * * @version 0.1-?? (Marek Gagolewski, 2013-06-21) * * @version 0.1-?? 
(Marek Gagolewski, 2013-07-10) * BUGFIX: wrong behavior on empty str * * @version 0.1-24 (Marek Gagolewski, 2014-03-11) * Added missing utext_close call to avoid memleaks * * @version 0.3-1 (Marek Gagolewski, 2014-10-19) * added tokens_only param * * @version 0.3-1 (Marek Gagolewski, 2014-10-23) * added split param * * @version 0.3-1 (Marek Gagolewski, 2014-10-24) * allow omit_empty=NA * * @version 0.3-1 (Marek Gagolewski, 2014-11-05) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc * * @version 0.4-1 (Marek Gagolewski, 2014-12-04) * allow `simplify=NA`; FR #126: pass n to stri_list2matrix * * @version 1.4.7 (Marek Gagolewski, 2020-08-24) * Use StriContainerRegexPattern::getRegexOptions */ SEXP stri_split_regex(SEXP str, SEXP pattern, SEXP n, SEXP omit_empty, SEXP tokens_only, SEXP simplify, SEXP opts_regex) { bool tokens_only1 = stri__prepare_arg_logical_1_notNA(tokens_only, "tokens_only"); PROTECT(str = stri__prepare_arg_string(str, "str")); PROTECT(pattern = stri__prepare_arg_string(pattern, "pattern")); PROTECT(n = stri__prepare_arg_integer(n, "n")); PROTECT(omit_empty = stri__prepare_arg_logical(omit_empty, "omit_empty")); PROTECT(simplify = stri__prepare_arg_logical_1(simplify, "simplify")); R_len_t vectorize_length = stri__recycling_rule(true, 4, LENGTH(str), LENGTH(pattern), LENGTH(n), LENGTH(omit_empty)); StriRegexMatcherOptions pattern_opts = StriContainerRegexPattern::getRegexOptions(opts_regex); UText* str_text = NULL; // may potentially be slower, but definitely is more convenient! 
STRI__ERROR_HANDLER_BEGIN(5) StriContainerUTF8 str_cont(str, vectorize_length); StriContainerInteger n_cont(n, vectorize_length); StriContainerLogical omit_empty_cont(omit_empty, vectorize_length); StriContainerRegexPattern pattern_cont(pattern, vectorize_length, pattern_opts); SEXP ret; STRI__PROTECT(ret = Rf_allocVector(VECSXP, vectorize_length)); for (R_len_t i = pattern_cont.vectorize_init(); i != pattern_cont.vectorize_end(); i = pattern_cont.vectorize_next(i)) { if (n_cont.isNA(i)) { SET_VECTOR_ELT(ret, i, stri__vector_NA_strings(1)); continue; } int n_cur = n_cont.get(i); int omit_empty_cur = !omit_empty_cont.isNA(i) && omit_empty_cont.get(i); STRI__CONTINUE_ON_EMPTY_OR_NA_STR_PATTERN(str_cont, pattern_cont, SET_VECTOR_ELT(ret, i, stri__vector_NA_strings(1));, SET_VECTOR_ELT(ret, i, (omit_empty_cont.isNA(i))?stri__vector_NA_strings(1): stri__vector_empty_strings((omit_empty_cur || n_cur == 0)?0:1));) R_len_t str_cur_n = str_cont.get(i).length(); const char* str_cur_s = str_cont.get(i).c_str(); if (n_cur >= INT_MAX-1) throw StriException(MSG__INCORRECT_NAMED_ARG "; " MSG__EXPECTED_SMALLER, "n"); else if (n_cur < 0) n_cur = INT_MAX; else if (n_cur == 0) { SET_VECTOR_ELT(ret, i, Rf_allocVector(STRSXP, 0)); continue; } else if (tokens_only1) n_cur++; // we need to do one split ahead here UErrorCode status = U_ZERO_ERROR; RegexMatcher *matcher = pattern_cont.getMatcher(i); // will be deleted automatically str_text = utext_openUTF8(str_text, str_cont.get(i).c_str(), str_cont.get(i).length(), &status); STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) matcher->reset(str_text); R_len_t k; deque< pair > fields; // byte based-indices fields.push_back(pair(0,0)); for (k=1; k < n_cur; ) { int m_res = (int)matcher->find(status); STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) if (!m_res) break; R_len_t s1 = (R_len_t)matcher->start(status); STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) R_len_t s2 = 
(R_len_t)matcher->end(status); STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) if (omit_empty_cur && fields.back().first == s1) fields.back().first = s2; // don't start any new field else { fields.back().second = s1; fields.push_back(pair(s2, s2)); // start a new field here ++k; // another field } } fields.back().second = str_cur_n; if (omit_empty_cur && fields.back().first == fields.back().second) fields.pop_back(); if (tokens_only1 && n_cur < INT_MAX) { n_cur--; // one split ahead could have been made, see above while (fields.size() > (size_t)n_cur) fields.pop_back(); // get rid of the remainder } SEXP ans; STRI__PROTECT(ans = Rf_allocVector(STRSXP, fields.size())); deque< pair >::iterator iter = fields.begin(); for (k = 0; iter != fields.end(); ++iter, ++k) { pair curoccur = *iter; if (curoccur.second == curoccur.first && omit_empty_cont.isNA(i)) SET_STRING_ELT(ans, k, NA_STRING); else SET_STRING_ELT(ans, k, Rf_mkCharLenCE(str_cur_s+curoccur.first, curoccur.second-curoccur.first, CE_UTF8)); } SET_VECTOR_ELT(ret, i, ans); STRI__UNPROTECT(1); } if (str_text) { utext_close(str_text); str_text = NULL; } if (LOGICAL(simplify)[0] == NA_LOGICAL || LOGICAL(simplify)[0]) { R_len_t n_min = 0; R_len_t n_length = LENGTH(n); int* n_tab = INTEGER(n); for (R_len_t i=0; i * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. 
Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

#ifndef __stri_container_usearch_h
#define __stri_container_usearch_h

#include "stri_container_utf16.h"
// NOTE(review): the header names of the three includes below are missing
// from this copy of the file -- restore them from the upstream sources.
#include
#include
#include


/**
 * A class to handle UStringSearch searches
 *
 * Extends StriContainerUTF16 with a collator and a slot for the most
 * recently used UStringSearch object, which getMatcher() hands out.
 *
 * @version 0.1-?? (Marek Gagolewski, 2013-06-23)
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-04-18)
 *          BUGFIX: memleaks on StriException
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-05-27)
 *          BUGFIX: invalid matcher reuse on empty search string
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-01)
 *          getMatcher() now also accepts UChar*
 *
 * @version 1.3.1 (Marek Gagolewski, 2019-02-06)
 *          #337: warn on empty search pattern here
 */
class StriContainerUStringSearch : public StriContainerUTF16 {

private:

   UCollator* col; ///< collator, owned by creator
   UStringSearch* lastMatcher; ///< recently used UStringSearch
   R_len_t lastMatcherIndex; ///< used by vectorize_getMatcher


public:

   // construction, destruction, and copying
   // (implemented elsewhere; bodies are not visible in this header)
   StriContainerUStringSearch();
   // rstr: R object holding the patterns; nrecycle: presumably the length
   // the container is recycled to -- TODO confirm in the implementation;
   // col: collator to be used (not owned by this class, see `col` above)
   StriContainerUStringSearch(SEXP rstr, R_len_t nrecycle, UCollator* col);
   StriContainerUStringSearch(StriContainerUStringSearch& container);
   ~StriContainerUStringSearch();
   StriContainerUStringSearch& operator=(StriContainerUStringSearch& container);


   // Get a matcher for the i-th pattern, to search within `searchStr`;
   // the text to search in is given either as a UnicodeString ...
   UStringSearch* getMatcher(R_len_t i, const UnicodeString& searchStr);
   // ... or as a UChar buffer together with its length
   UStringSearch* getMatcher(R_len_t i, const UChar* searchStr, int32_t searchStr_len);
};

#endif
stringi/src/stri_search_fixed_detect.cpp0000644000176200001440000001530514770541312020251 0ustar liggesusers/* This file is part of the 'stringi' project. * Copyright (c) 2013-2025, Marek Gagolewski * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3.
Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "stri_stringi.h" #include "stri_container_utf8.h" #include "stri_container_bytesearch.h" /** * Detect if a pattern occurs in a string [fast but dummy bitewise compare] * * @param str character vector * @param pattern character vector * @param negate single bool * @param max_count single int * @return logical vector * * @version 0.1-?? (Bartek Tartanus) * * @version 0.1-?? (Marek Gagolewski) * use StriContainerUTF8, BUGFIX: the loop could go to far * * @version 0.1-?? (Marek Gagolewski) * corrected behavior on empty str/pattern * * @version 0.1-?? 
(Marek Gagolewski, 2013-06-23) * make StriException-friendly, use StriContainerByteSearch * * @version 0.2-3 (Marek Gagolewski, 2014-05-08) * stri_detect_fixed now uses byte search only * * @version 0.3-1 (Marek Gagolewski, 2014-11-05) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc * * @version 0.4-1 (Marek Gagolewski, 2014-12-07) * FR #110, #23: opts_fixed arg added * * @version 0.5-1 (Marek Gagolewski, 2015-02-14) * use StriByteSearchMatcher * * @version 1.0-3 (Marek Gagolewski, 2016-02-03) * #216: `negate` arg added * * @version 1.3.1 (Marek Gagolewski, 2019-02-08) * #232: `max_count` arg added */ SEXP stri_detect_fixed(SEXP str, SEXP pattern, SEXP negate, SEXP max_count, SEXP opts_fixed) { bool negate_1 = stri__prepare_arg_logical_1_notNA(negate, "negate"); int max_count_1 = stri__prepare_arg_integer_1_notNA(max_count, "max_count"); uint32_t pattern_flags = StriContainerByteSearch::getByteSearchFlags(opts_fixed); PROTECT(str = stri__prepare_arg_string(str, "str")); PROTECT(pattern = stri__prepare_arg_string(pattern, "pattern")); STRI__ERROR_HANDLER_BEGIN(2) int vectorize_length = stri__recycling_rule(true, 2, LENGTH(str), LENGTH(pattern)); StriContainerUTF8 str_cont(str, vectorize_length); StriContainerByteSearch pattern_cont(pattern, vectorize_length, pattern_flags); SEXP ret; STRI__PROTECT(ret = Rf_allocVector(LGLSXP, vectorize_length)); int* ret_tab = LOGICAL(ret); for (R_len_t i = pattern_cont.vectorize_init(); i != pattern_cont.vectorize_end(); i = pattern_cont.vectorize_next(i)) { if (max_count_1 == 0) { ret_tab[i] = NA_LOGICAL; continue; } STRI__CONTINUE_ON_EMPTY_OR_NA_STR_PATTERN(str_cont, pattern_cont, ret_tab[i] = NA_LOGICAL, { ret_tab[i] = negate_1; if (max_count_1 > 0 && ret_tab[i]) --max_count_1; }) StriByteSearchMatcher* matcher = pattern_cont.getMatcher(i); matcher->reset(str_cont.get(i).c_str(), str_cont.get(i).length()); ret_tab[i] = (int)(matcher->findFirst() != USEARCH_DONE); if (negate_1) ret_tab[i] = !ret_tab[i]; if 
(max_count_1 > 0 && ret_tab[i]) --max_count_1; } STRI__UNPROTECT_ALL return ret; STRI__ERROR_HANDLER_END( ;/* do nothing special on error */ ) // Version 2 -- slower for long strings // UText *uts = NULL; // UText *utp = NULL; // URegularExpression* matcher = NULL; // // STRI__ERROR_HANDLER_BEGIN // int vectorize_length = stri__recycling_rule(true, 2, LENGTH(str), LENGTH(pattern)); // StriContainerUTF8 str_cont(str, vectorize_length); // StriContainerByteSearch pattern_cont(pattern, vectorize_length); // // SEXP ret; // PROTECT(ret = Rf_allocVector(LGLSXP, vectorize_length)); // int* ret_tab = LOGICAL(ret); // // // const String8* last_s = NULL; // const String8* last_p = NULL; // UErrorCode err = U_ZERO_ERROR; // // for (R_len_t i = pattern_cont.vectorize_init(); // i != pattern_cont.vectorize_end(); // i = pattern_cont.vectorize_next(i)) // { // STRI__CONTINUE_ON_EMPTY_OR_NA_STR_PATTERN(str_cont, pattern_cont, // ret_tab[i] = NA_LOGICAL, // ret_tab[i] = FALSE) // // const String8* cur_s = &(str_cont.get(i)); // const String8* cur_p = &(pattern_cont.get(i)); // // if (last_p != cur_p) { // last_p = cur_p; // if (matcher) uregex_close(matcher); // utp = utext_openUTF8(utp, last_p->c_str(), last_p->length(), &err); // matcher = uregex_openUText(utp, UREGEX_LITERAL, NULL, &err); // STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) // } // // if (last_s != cur_s) { // last_s = cur_s; // uts = utext_openUTF8(uts, last_s->c_str(), last_s->length(), &err); // } // // uregex_setUText(matcher, uts, &err); // uregex_reset(matcher, 0, &err); // int found = (int)uregex_find(matcher, -1, &err); // if (U_FAILURE(err)) // throw StriException(MSG__REGEX_FAILED); // LOGICAL(ret)[i] = found; // } // // if (matcher) { uregex_close(matcher); matcher=NULL; } // if (uts) { utext_close(uts); uts=NULL; } // if (utp) { utext_close(utp); utp=NULL; } // UNPROTECT(1); // return ret; // STRI__ERROR_HANDLER_END({ // if (matcher) { uregex_close(matcher); matcher=NULL; } // 
if (uts) { utext_close(uts); uts=NULL; } // if (utp) { utext_close(utp); utp=NULL; } // }) } stringi/src/stri_pad.cpp0000644000176200001440000002000614770541312015033 0ustar liggesusers/* This file is part of the 'stringi' project. * Copyright (c) 2013-2025, Marek Gagolewski * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "stri_stringi.h" #include "stri_container_utf8.h" #include "stri_container_integer.h" #include "stri_string8buf.h" #include #include /** * Pad a string * * vectorized over str, length and pad * if str or pad or length is NA the result will be NA * * @param str character vector * @param min_length integer vector * @param side [internal int] * @param pad character vector * @param use_length single logical value * @return character vector * * @version 0.1-?? (Bartlomiej Tartanus) * * @version 0.2-2 (Marek Gagolewski, 2014-04-20) * use stri_error_handler, pad should be a single code point, not byte * * @version 0.3-1 (Marek Gagolewski, 2014-11-04) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc * * @version 0.5-1 (Marek Gagolewski, 2015-04-22) * `use_length` arg added, * second argument renamed `width` */ SEXP stri_pad(SEXP str, SEXP width, SEXP side, SEXP pad, SEXP use_length) { // this is an internal arg, check manually, error() allowed here if (!Rf_isInteger(side) || LENGTH(side) != 1) Rf_error(MSG__INCORRECT_INTERNAL_ARG); int _side = INTEGER(side)[0]; if (_side < 0 || _side > 2) Rf_error(MSG__INCORRECT_INTERNAL_ARG); bool use_length_val = stri__prepare_arg_logical_1_notNA(use_length, "use_length"); PROTECT(str = stri__prepare_arg_string(str, "str")); PROTECT(width = stri__prepare_arg_integer(width, "width")); PROTECT(pad = stri__prepare_arg_string(pad, "pad")); // side = stri__prepare_arg_string(side, "side"); // const char* side_opts[] = {"left", "right", "both", NULL}; R_len_t str_length = LENGTH(str); R_len_t width_length = LENGTH(width); // R_len_t side_length = LENGTH(side); R_len_t pad_length = LENGTH(pad); R_len_t vectorize_length = stri__recycling_rule(true, 3, str_length, width_length, /*side_length, */ pad_length); STRI__ERROR_HANDLER_BEGIN(3) StriContainerUTF8 str_cont(str, vectorize_length); StriContainerInteger width_cont(width, vectorize_length); // StriContainerUTF8 side_cont(side, vectorize_length); StriContainerUTF8 
pad_cont(pad, vectorize_length); SEXP ret; STRI__PROTECT(ret = Rf_allocVector(STRSXP, vectorize_length)); String8buf buf(0); // TODO: prealloc for (R_len_t i=0; i= width_cur) { // no padding at all SET_STRING_ELT(ret, i, str_cont.toR(i)); continue; } R_len_t padnum = width_cur-str_cur_width; buf.resize(str_cur_n+padnum*pad_cur_n, false); char* buftmp = buf.data(); R_len_t k = 0; switch(_side) { case 0: // left for (k=0; k 0) { // UChar cur_pad = (pad_cont.get(i))[0]; // This is Uchar - 16 bit..... // str_cont.getWritable(i).padLeading(length_cont.get(i), cur_pad); // } // // SET_STRING_ELT(ret, i, str_cont.toR(i)); // } // // UNPROTECT(1); // return ret; // STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */) //} stringi/src/stri_search_in.cpp0000644000176200001440000001630214770541312016226 0ustar liggesusers/* This file is part of the 'stringi' project. * Copyright (c) 2013-2025, Marek Gagolewski * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ //// VERSION III: naive O(n*m) - damn slower //#include "stri_stringi.h" //#include "stri_container_utf8.h" // // ///** Value Matching // * // * @param str character vector // * @param table character vector // * @nomatch single integer value // * // * @return integer vector // * // * @version 0.3-1 (Marek Gagolewski, 2014-06-06) // */ //SEXP stri_in_fixed(SEXP str, SEXP table, SEXP nomatch) //{ // str = stri__prepare_arg_string(str, "str"); // table = stri__prepare_arg_string(table, "table"); // nomatch = stri__prepare_arg_integer_1(nomatch, "nomatch"); // R_len_t str_length = LENGTH(str); // R_len_t table_length = LENGTH(table); // // R_len_t nomatch_cur = INTEGER(nomatch)[0]; // // STRI__ERROR_HANDLER_BEGIN // StriContainerUTF8 str_cont(str, str_length); // StriContainerUTF8 table_cont(table, table_length); // // SEXP ret; // STRI__PROTECT(ret = Rf_allocVector(INTSXP, str_length)); // int* ret_tab = INTEGER(ret); // // for (R_len_t i = 0; i //#include // ///** helper struct (comparer) for stri_in_fixed // * // */ //struct cmp_str //{ // StriContainerUTF8* cont; // // cmp_str(StriContainerUTF8* _cont) { // cont = _cont; // } // // bool operator()(int i, int j) // { // if (cont->isNA(i)) return false; // else if (cont->isNA(j)) return true; // else return strcmp(cont->get(i).c_str(), cont->get(j).c_str()) < 0; // } //}; // // ///** Value Matching // * // * @param str character vector // * @param table character vector // * @nomatch single 
integer value // * // * @return integer vector // * // * @version 0.3-1 (Marek Gagolewski, 2014-06-06) // */ //SEXP stri_in_fixed(SEXP str, SEXP table, SEXP nomatch) //{ // str = stri__prepare_arg_string(str, "str"); // table = stri__prepare_arg_string(table, "table"); // nomatch = stri__prepare_arg_integer_1(nomatch, "nomatch"); // R_len_t str_length = LENGTH(str); // R_len_t table_length = LENGTH(table); // // R_len_t nomatch_cur = INTEGER(nomatch)[0]; // // STRI__ERROR_HANDLER_BEGIN // StriContainerUTF8 str_cont(str, str_length); // StriContainerUTF8 table_cont(table, table_length); // // cmp_str comparer(&str_cont); // std::vector idx(table_length); // for (int i=0; i //#include // // ///** Value Matching // * // * @param str character vector // * @param table character vector // * @nomatch single integer value // * // * @return integer vector // * // * @version 0.3-1 (Marek Gagolewski, 2014-06-06) // */ //SEXP stri_in_fixed(SEXP str, SEXP table, SEXP nomatch) //{ // str = stri__prepare_arg_string(str, "str"); // table = stri__prepare_arg_string(table, "table"); // nomatch = stri__prepare_arg_integer_1(nomatch, "nomatch"); // R_len_t str_length = LENGTH(str); // R_len_t table_length = LENGTH(table); // // R_len_t nomatch_cur = INTEGER(nomatch)[0]; // // STRI__ERROR_HANDLER_BEGIN // StriContainerUTF8 str_cont(str, str_length); // StriContainerUTF8 table_cont(table, table_length); // // boost::unordered_map dict(table_length); // for (R_len_t i=table_length-1; i>=0; --i) { // if (table_cont.isNA(i)) continue; // dict[std::string(table_cont.get(i).c_str())] = i+1; // 0-based index -> 1-based // } // // SEXP ret; // STRI__PROTECT(ret = Rf_allocVector(INTSXP, str_length)); // int* ret_tab = INTEGER(ret); // // for (R_len_t i = 0; i::iterator it = dict.find(str_cur); // if (it != dict.end()) // ret_tab[i] = (*it).second; // else // ret_tab[i] = nomatch_cur; // } // // STRI__UNPROTECT_ALL // return ret; // STRI__ERROR_HANDLER_END(;/* nothing special to be done on 
error */) //} stringi/src/stri_search_fixed_extract.cpp0000644000176200001440000002142614770541312020454 0ustar liggesusers/* This file is part of the 'stringi' project. * Copyright (c) 2013-2025, Marek Gagolewski * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "stri_stringi.h" #include "stri_container_utf8.h" #include "stri_container_bytesearch.h" #include #include using namespace std; /** * Extract first or last occurrences of pattern in a string [exact byte search] * * @param str character vector * @param pattern character vector * @param first looking for first or last match? * @return character vector * * @version 0.1-?? (Marek Gagolewski, 2013-06-24) * * @version 0.2-3 (Marek Gagolewski, 2014-05-08) * stri_extract_fixed now uses byte search only * * @version 0.4-1 (Marek Gagolewski, 2014-12-08) * new args: opts_fixed * * @version 0.5-1 (Marek Gagolewski, 2015-02-14) * use StriByteSearchMatcher */ SEXP stri__extract_firstlast_fixed(SEXP str, SEXP pattern, SEXP opts_fixed, bool first) { uint32_t pattern_flags = StriContainerByteSearch::getByteSearchFlags(opts_fixed); PROTECT(str = stri__prepare_arg_string(str, "str")); // prepare string argument PROTECT(pattern = stri__prepare_arg_string(pattern, "pattern")); // prepare string argument STRI__ERROR_HANDLER_BEGIN(2) int vectorize_length = stri__recycling_rule(true, 2, LENGTH(str), LENGTH(pattern)); StriContainerUTF8 str_cont(str, vectorize_length); StriContainerByteSearch pattern_cont(pattern, vectorize_length, pattern_flags); SEXP ret; STRI__PROTECT(ret = Rf_allocVector(STRSXP, vectorize_length)); for (R_len_t i = pattern_cont.vectorize_init(); i != pattern_cont.vectorize_end(); i = pattern_cont.vectorize_next(i)) { STRI__CONTINUE_ON_EMPTY_OR_NA_STR_PATTERN(str_cont, pattern_cont, SET_STRING_ELT(ret, i, NA_STRING);, SET_STRING_ELT(ret, i, NA_STRING);) StriByteSearchMatcher* matcher = pattern_cont.getMatcher(i); matcher->reset(str_cont.get(i).c_str(), str_cont.get(i).length()); int start, len; if (first) { start = matcher->findFirst(); } else { start = matcher->findLast(); } if (start == USEARCH_DONE) { SET_STRING_ELT(ret, i, NA_STRING); continue; } len = matcher->getMatchedLength(); SET_STRING_ELT(ret, i, Rf_mkCharLenCE(str_cont.get(i).c_str()+start, len, 
CE_UTF8)); } STRI__UNPROTECT_ALL return ret; STRI__ERROR_HANDLER_END({ /* no-op */ }) } /** * Extract first occurrence of a fixed pattern in each string * * @param str character vector * @param pattern character vector * @return character vector * * @version 0.1-?? (Marek Gagolewski, 2013-06-24) * * @version 0.2-3 (Marek Gagolewski, 2014-05-08) * stri_extract_fixed now uses byte search only * * @version 0.4-1 (Marek Gagolewski, 2014-12-08) * new args: opts_fixed */ SEXP stri_extract_first_fixed(SEXP str, SEXP pattern, SEXP opts_fixed) { return stri__extract_firstlast_fixed(str, pattern, opts_fixed, true); } /** * Extract last occurrence of a fixed pattern in each string * * @param str character vector * @param pattern character vector * @return character vector * * @version 0.1-?? (Marek Gagolewski, 2013-06-24) * * @version 0.2-3 (Marek Gagolewski, 2014-05-08) * stri_extract_fixed now uses byte search only * * @version 0.4-1 (Marek Gagolewski, 2014-12-08) * new args: opts_fixed */ SEXP stri_extract_last_fixed(SEXP str, SEXP pattern, SEXP opts_fixed) { return stri__extract_firstlast_fixed(str, pattern, opts_fixed, false); } /** * Extract all occurrences of pattern in a string [exact byte search] * * @param str character vector * @param pattern character vector * @return character vector * * @version 0.1-?? 
(Marek Gagolewski, 2013-06-24) * * @version 0.2-3 (Marek Gagolewski, 2014-05-08) * stri_extract_fixed now uses byte search only * * @version 0.4-1 (Marek Gagolewski, 2014-12-08) * new args: opts_fixed, omit_no_match, simplify * * @version 0.5-1 (Marek Gagolewski, 2015-02-14) * use StriByteSearchMatcher */ SEXP stri_extract_all_fixed(SEXP str, SEXP pattern, SEXP simplify, SEXP omit_no_match, SEXP opts_fixed) { uint32_t pattern_flags = StriContainerByteSearch::getByteSearchFlags(opts_fixed, /*allow_overlap*/true); bool omit_no_match1 = stri__prepare_arg_logical_1_notNA(omit_no_match, "omit_no_match"); PROTECT(simplify = stri__prepare_arg_logical_1(simplify, "simplify")); PROTECT(str = stri__prepare_arg_string(str, "str")); // prepare string argument PROTECT(pattern = stri__prepare_arg_string(pattern, "pattern")); // prepare string argument STRI__ERROR_HANDLER_BEGIN(3) R_len_t vectorize_length = stri__recycling_rule(true, 2, LENGTH(str), LENGTH(pattern)); StriContainerUTF8 str_cont(str, vectorize_length); StriContainerByteSearch pattern_cont(pattern, vectorize_length, pattern_flags); SEXP ret; STRI__PROTECT(ret = Rf_allocVector(VECSXP, vectorize_length)); for (R_len_t i = pattern_cont.vectorize_init(); i != pattern_cont.vectorize_end(); i = pattern_cont.vectorize_next(i)) { STRI__CONTINUE_ON_EMPTY_OR_NA_STR_PATTERN(str_cont, pattern_cont, SET_VECTOR_ELT(ret, i, stri__vector_NA_strings(1));, SET_VECTOR_ELT(ret, i, stri__vector_NA_strings(omit_no_match1?0:1));) StriByteSearchMatcher* matcher = pattern_cont.getMatcher(i); matcher->reset(str_cont.get(i).c_str(), str_cont.get(i).length()); int start = matcher->findFirst(); deque< pair > occurrences; while (start != USEARCH_DONE) { occurrences.push_back(pair(start, start+matcher->getMatchedLength())); start = matcher->findNext(); } R_len_t noccurrences = (R_len_t)occurrences.size(); if (noccurrences <= 0) { SET_VECTOR_ELT(ret, i, stri__vector_NA_strings(omit_no_match1?0:1)); continue; } const char* str_cur_s = 
str_cont.get(i).c_str(); SEXP cur_res; STRI__PROTECT(cur_res = Rf_allocVector(STRSXP, noccurrences)); deque< pair >::iterator iter = occurrences.begin(); for (R_len_t j = 0; iter != occurrences.end(); ++iter, ++j) { pair curo = *iter; SET_STRING_ELT(cur_res, j, Rf_mkCharLenCE(str_cur_s+curo.first, curo.second-curo.first, CE_UTF8)); } SET_VECTOR_ELT(ret, i, cur_res); STRI__UNPROTECT(1); } if (LOGICAL(simplify)[0] == NA_LOGICAL || LOGICAL(simplify)[0]) { SEXP robj_TRUE, robj_zero, robj_na_strings, robj_empty_strings; STRI__PROTECT(robj_TRUE = Rf_ScalarLogical(TRUE)); STRI__PROTECT(robj_zero = Rf_ScalarInteger(0)); STRI__PROTECT(robj_na_strings = stri__vector_NA_strings(1)); STRI__PROTECT(robj_empty_strings = stri__vector_empty_strings(1)); STRI__PROTECT(ret = stri_list2matrix(ret, robj_TRUE, (LOGICAL(simplify)[0] == NA_LOGICAL)?robj_na_strings :robj_empty_strings, robj_zero)); } STRI__UNPROTECT_ALL return ret; STRI__ERROR_HANDLER_END({/* no-op */}) } stringi/src/stri_search_regex_subset.cpp0000644000176200001440000002107014770541312020315 0ustar liggesusers/* This file is part of the 'stringi' project. * Copyright (c) 2013-2025, Marek Gagolewski * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "stri_stringi.h" #include "stri_container_utf16.h" #include "stri_container_utf8.h" #include "stri_container_regex.h" /** * Detect if a pattern occurs in a string * * @param str R character vector * @param pattern R character vector containing regular expressions * @param omit_na single logical value * @param opts_regex list * @return character vector * * @version 0.3-1 (Bartek Tartanus, 2014-07-25) * * @version 0.3-1 (Marek Gagolewski, 2014-10-17) * using std::vector to avoid mem-leaks * * @version 0.3-1 (Marek Gagolewski, 2014-11-04) * #112: str_prepare_arg* retvals were not PROTECTed from gc * * @version 0.4-1 (Marek Gagolewski, 2014-12-04) * #122: omit_na arg added * * @version 1.0-2 (Marek Gagolewski, 2016-01-29) * #214: allow a regex pattern like `.*` to match an empty string * * @version 1.0-3 (Marek Gagolewski, 2016-02-03) * #216: `negate` arg added * * @version 1.4.7 (Marek Gagolewski, 2020-08-24) * Use StriContainerRegexPattern::getRegexOptions * * @version 1.7.1 (Marek Gagolewski, 2021-06-17) * assure LENGTH(pattern) <= LENGTH(str) */ SEXP stri_subset_regex(SEXP str, SEXP pattern, SEXP omit_na, SEXP negate, SEXP opts_regex) { bool negate_1 = 
stri__prepare_arg_logical_1_notNA(negate, "negate"); bool omit_na1 = stri__prepare_arg_logical_1_notNA(omit_na, "omit_na"); PROTECT(str = stri__prepare_arg_string(str, "str")); PROTECT(pattern = stri__prepare_arg_string(pattern, "pattern")); if (LENGTH(str) > 0 && LENGTH(str) < LENGTH(pattern)) Rf_error(MSG__WARN_RECYCLING_RULE2); R_len_t vectorize_length = stri__recycling_rule(true, 2, LENGTH(str), LENGTH(pattern)); if (vectorize_length == 0) { UNPROTECT(2); return Rf_allocVector(STRSXP, 0); } StriRegexMatcherOptions pattern_opts = StriContainerRegexPattern::getRegexOptions(opts_regex); STRI__ERROR_HANDLER_BEGIN(2) StriContainerUTF16 str_cont(str, vectorize_length); StriContainerRegexPattern pattern_cont(pattern, vectorize_length, pattern_opts); // BT: this cannot be done with deque, because pattern is reused so i does not // go like 0,1,2...n but 0,pat_len,2*pat_len,1,pat_len+1 and so on // MG: agreed std::vector which(vectorize_length); int result_counter = 0; for (R_len_t i = pattern_cont.vectorize_init(); i != pattern_cont.vectorize_end(); i = pattern_cont.vectorize_next(i)) { STRI__CONTINUE_ON_EMPTY_OR_NA_PATTERN(str_cont, pattern_cont, {if (omit_na1) which[i] = FALSE; else { which[i] = NA_LOGICAL; result_counter++; } }) RegexMatcher *matcher = pattern_cont.getMatcher(i); // will be deleted automatically matcher->reset(str_cont.get(i)); UErrorCode status = U_ZERO_ERROR; which[i] = (int)matcher->find(status); STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) if (negate_1) which[i] = !which[i]; if (which[i]) result_counter++; } SEXP ret; STRI__PROTECT(ret = stri__subset_by_logical(str_cont, which, result_counter)); STRI__UNPROTECT_ALL return ret; STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */) } /** * Substitutes vector elements if a pattern occurs in a string * * @param str character vector * @param pattern character vector * @param opts_regex list * @param value character vector * @return character vector * * @version 
1.0-3 (Marek Gagolewski, 2016-02-03) * #124 * * @version 1.0-3 (Marek Gagolewski, 2016-02-03) * #216: `negate` arg added * * @version 1.4.7 (Marek Gagolewski, 2020-08-24) * Use StriContainerRegexPattern::getRegexOptions * * @version 1.7.1 (Marek Gagolewski, 2021-06-17) * assure LENGTH(pattern) and LENGTH(value) <= LENGTH(str) */ SEXP stri_subset_regex_replacement(SEXP str, SEXP pattern, SEXP negate, SEXP opts_regex, SEXP value) { bool negate_1 = stri__prepare_arg_logical_1_notNA(negate, "negate"); PROTECT(str = stri__prepare_arg_string(str, "str")); PROTECT(pattern = stri__prepare_arg_string(pattern, "pattern")); PROTECT(value = stri__prepare_arg_string(value, "value")); // we are subsetting `str`, therefore recycling is slightly different here if (LENGTH(value) == 0) Rf_error(MSG__REPLACEMENT_ZERO); if (LENGTH(pattern) == 0) Rf_error(MSG__WARN_EMPTY_VECTOR); if (LENGTH(str) == 0) { UNPROTECT(3); return Rf_allocVector(STRSXP, 0); } if (LENGTH(str) < LENGTH(pattern)) // for LENGTH(value), we emit warning later on Rf_error(MSG__WARN_RECYCLING_RULE2); if ((LENGTH(str) % LENGTH(pattern)) != 0) Rf_warning(MSG__WARN_RECYCLING_RULE); R_len_t vectorize_length = LENGTH(str); StriRegexMatcherOptions pattern_opts = StriContainerRegexPattern::getRegexOptions(opts_regex); UText* str_text = NULL; // might be slower, but definitely is more convenient! 
STRI__ERROR_HANDLER_BEGIN(3) R_len_t value_length = LENGTH(value); StriContainerUTF8 value_cont(value, value_length); StriContainerUTF8 str_cont(str, vectorize_length); StriContainerRegexPattern pattern_cont(pattern, vectorize_length, pattern_opts); SEXP ret; STRI__PROTECT(ret = Rf_allocVector(STRSXP, vectorize_length)); std::vector detected(vectorize_length, 0); for (R_len_t i = pattern_cont.vectorize_init(); i != pattern_cont.vectorize_end(); i = pattern_cont.vectorize_next(i)) { if (pattern_cont.isNA(i)) { // behave like `[<-` detected[i] = false; continue; } STRI__CONTINUE_ON_EMPTY_OR_NA_PATTERN(str_cont, pattern_cont, {detected[i] = NA_INTEGER;}) UErrorCode status = U_ZERO_ERROR; RegexMatcher *matcher = pattern_cont.getMatcher(i); // will be deleted automatically str_text = utext_openUTF8(str_text, str_cont.get(i).c_str(), str_cont.get(i).length(), &status); STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) matcher->reset(str_text); bool found = matcher->find(status); STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) detected[i] = ((found && !negate_1) || (!found && negate_1)); } R_len_t k = 0; // we must traverse `str_cont` in order now for (R_len_t i = 0; i * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef __stri_container_bytesearch_h #define __stri_container_bytesearch_h #include "stri_container_utf8.h" #include "stri_bytesearch_matcher.h" // #define STRI__BYTESEARCH_DISABLE_SHORTPAT /** * A class to handle StriByteSearch patterns * * @version 0.1-?? (Marek Gagolewski, 2013-06-23) * * @version 0.1-?? 
(Bartek Tartanus, 2013-08-15) * KMP algorithm implemented * * @version 0.2-3 (Marek Gagolewski, 2014-05-11) * KMP used by default; * KMP_from back implemented; * tweeks for short patterns * * @version 0.3-1 (Marek Gagolewski, 2014-05-27) * BUGFIX: invalid matcher reuse on empty search string * * @version 0.4-1 (Marek Gagolewski, 2014-12-07) * getByteSearchFlags static method added, * allow for case-insensitive search * * @version 0.4-1 (Marek Gagolewski, 2014-12-08) * #23: add `overlap` option * * @version 0.5-1 (Marek Gagolewski, 2015-02-14) * use StriByteSearchMatcher * * @version 1.3.1 (Marek Gagolewski, 2019-02-06) * #337: warn on empty search pattern here */ class StriContainerByteSearch : public StriContainerUTF8 { private: typedef enum ByteSearchFlag { BYTESEARCH_CASE_INSENSITIVE = 2, BYTESEARCH_OVERLAP = 4 } ByteSearchFlag; StriByteSearchMatcher* matcher; uint32_t flags; ///< ByteSearch flags public: static uint32_t getByteSearchFlags(SEXP opts_fixed, bool allow_overlap=false); StriContainerByteSearch(); StriContainerByteSearch(SEXP rstr, R_len_t nrecycle, uint32_t flags); StriContainerByteSearch(StriContainerByteSearch& container); ~StriContainerByteSearch(); StriContainerByteSearch& operator=(StriContainerByteSearch& container); StriByteSearchMatcher* getMatcher(R_len_t i); inline bool isCaseInsensitive() { return (bool)(flags&BYTESEARCH_CASE_INSENSITIVE); } inline bool isOverlap() { return (bool)(flags&BYTESEARCH_OVERLAP); } }; #endif stringi/src/stri_search_class_locate.cpp0000644000176200001440000002237714770541312020265 0ustar liggesusers/* This file is part of the 'stringi' project. * Copyright (c) 2013-2025, Marek Gagolewski * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. 
Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "stri_stringi.h" #include "stri_container_utf8.h" #include "stri_container_charclass.h" #include "stri_container_logical.h" #include #include using namespace std; /** * Locate first or last occurrences of a character class in each string * * @param str character vector * @param pattern character vector * @return matrix with 2 columns * * @version 0.1-?? (Marek Gagolewski, 2013-06-04) * * @version 0.1-?? (Marek Gagolewski, 2013-06-15) * Use StrContainerCharClass * * @version 0.1-?? 
(Marek Gagolewski, 2013-06-16) * make StriException-friendly * * @version 0.2-1 (Marek Gagolewski, 2014-04-03) * detects invalid UTF-8 byte stream * * @version 0.2-1 (Marek Gagolewski, 2014-04-05) * StriContainerCharClass now relies on UnicodeSet * * @version 0.3-1 (Marek Gagolewski, 2014-11-04) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc * * @version 1.7.1 (Marek Gagolewski, 2021-06-29) * get_length */ SEXP stri__locate_firstlast_charclass(SEXP str, SEXP pattern, bool first, bool get_length1) { PROTECT(str = stri__prepare_arg_string(str, "str")); PROTECT(pattern = stri__prepare_arg_string(pattern, "pattern")); R_len_t vectorize_length = stri__recycling_rule(true, 2, LENGTH(str), LENGTH(pattern)); STRI__ERROR_HANDLER_BEGIN(2) StriContainerUTF8 str_cont(str, vectorize_length); StriContainerCharClass pattern_cont(pattern, vectorize_length); SEXP ret; STRI__PROTECT(ret = Rf_allocMatrix(INTSXP, vectorize_length, 2)); stri__locate_set_dimnames_matrix(ret, get_length1); int* ret_tab = INTEGER(ret); for (R_len_t i = pattern_cont.vectorize_init(); i != pattern_cont.vectorize_end(); i = pattern_cont.vectorize_next(i)) { ret_tab[i] = NA_INTEGER; ret_tab[i+vectorize_length] = NA_INTEGER; if (str_cont.isNA(i) || pattern_cont.isNA(i)) continue; if (get_length1) { ret_tab[i] = -1; ret_tab[i+vectorize_length] = -1; } const UnicodeSet* pattern_cur = &pattern_cont.get(i); R_len_t str_cur_n = str_cont.get(i).length(); const char* str_cur_s = str_cont.get(i).c_str(); R_len_t j; R_len_t k = 0; UChar32 chr; for (j=0; jcontains(chr)) { ret_tab[i] = k; ret_tab[i+vectorize_length] = get_length1 ? 1 : ret_tab[i]; if (first) break; // that's enough for first // note that for last, we can't go backwards from the end, as we need a proper index! 
} } } STRI__UNPROTECT_ALL return ret; STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */) } /** * Locate first occurrence of a character class in each string * * @param str character vector * @param pattern character vector * @return matrix with 2 columns * * @version 0.1-?? (Marek Gagolewski, 2013-06-04) * * @version 1.7.1 (Marek Gagolewski, 2021-06-29) * get_length */ SEXP stri_locate_first_charclass(SEXP str, SEXP pattern, SEXP get_length) { bool get_length1 = stri__prepare_arg_logical_1_notNA(get_length, "get_length"); return stri__locate_firstlast_charclass(str, pattern, true, get_length1); } /** * Locate last occurrence of a character class in each string * * @param str character vector * @param pattern character vector * @return matrix with 2 columns * * @version 0.1-?? (Marek Gagolewski, 2013-06-04) * * @version 1.7.1 (Marek Gagolewski, 2021-06-29) * get_length */ SEXP stri_locate_last_charclass(SEXP str, SEXP pattern, SEXP get_length) { bool get_length1 = stri__prepare_arg_logical_1_notNA(get_length, "get_length"); return stri__locate_firstlast_charclass(str, pattern, false, get_length1); } /** * Locate first or last occurrences of a character class in each string * * @param str character vector * @param pattern character vector * @return list of matrices with 2 columns * * @version 0.1-?? (Marek Gagolewski, 2013-06-04) * * @version 0.1-?? (Marek Gagolewski, 2013-06-09) * use R_len_t_x2 for merge=TRUE * [R_len_t_x2 changed to pair thereafter] * * @version 0.1-?? (Marek Gagolewski, 2013-06-15) * Use StrContainerCharClass * * @version 0.1-?? 
(Marek Gagolewski, 2013-06-16) * make StriException-friendly * * @version 0.2-1 (Marek Gagolewski, 2014-04-03) * detects invalid UTF-8 byte stream * * @version 0.2-1 (Marek Gagolewski, 2014-04-05) * StriContainerCharClass now relies on UnicodeSet * * @version 0.3-1 (Marek Gagolewski, 2014-11-02) * using StriContainerCharClass::locateAll; * no longer vectorized over `merge` * * @version 0.3-1 (Marek Gagolewski, 2014-11-04) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc * * @version 0.4-1 (Marek Gagolewski, 2014-11-27) * FR #117: omit_no_match arg added * * @version 1.7.1 (Marek Gagolewski, 2021-06-29) * get_length */ SEXP stri_locate_all_charclass(SEXP str, SEXP pattern, SEXP merge, SEXP omit_no_match, SEXP get_length) { bool omit_no_match1 = stri__prepare_arg_logical_1_notNA(omit_no_match, "omit_no_match"); bool get_length1 = stri__prepare_arg_logical_1_notNA(get_length, "get_length"); bool merge_cur = stri__prepare_arg_logical_1_notNA(merge, "merge"); PROTECT(str = stri__prepare_arg_string(str, "str")); PROTECT(pattern = stri__prepare_arg_string(pattern, "pattern")); R_len_t vectorize_length = stri__recycling_rule(true, 2, LENGTH(str), LENGTH(pattern)); STRI__ERROR_HANDLER_BEGIN(2) StriContainerUTF8 str_cont(str, vectorize_length); StriContainerCharClass pattern_cont(pattern, vectorize_length); SEXP ret; STRI__PROTECT(ret = Rf_allocVector(VECSXP, vectorize_length)); for (R_len_t i = pattern_cont.vectorize_init(); i != pattern_cont.vectorize_end(); i = pattern_cont.vectorize_next(i)) { if (pattern_cont.isNA(i) || str_cont.isNA(i)) { SET_VECTOR_ELT(ret, i, stri__matrix_NA_INTEGER(1, 2)); continue; } deque< pair > occurrences; StriContainerCharClass::locateAll( occurrences, &pattern_cont.get(i), str_cont.get(i).c_str(), str_cont.get(i).length(), merge_cur, true /* code point-based indexes */ ); R_len_t noccurrences = (R_len_t)occurrences.size(); if (noccurrences == 0) { SET_VECTOR_ELT(ret, i, stri__matrix_NA_INTEGER(omit_no_match1?0:1, 2, 
get_length1?-1:NA_INTEGER)); continue; } SEXP cur_res; STRI__PROTECT(cur_res = Rf_allocMatrix(INTSXP, noccurrences, 2)); int* cur_res_int = INTEGER(cur_res); deque< pair >::iterator iter = occurrences.begin(); for (R_len_t f = 0; iter != occurrences.end(); ++iter, ++f) { pair curoccur = *iter; cur_res_int[f] = curoccur.first+1; // 0-based => 1-based cur_res_int[f+noccurrences] = get_length1?(curoccur.second-cur_res_int[f]+1):curoccur.second; } SET_VECTOR_ELT(ret, i, cur_res); STRI__UNPROTECT(1) } stri__locate_set_dimnames_list(ret, get_length1); STRI__UNPROTECT_ALL return ret; STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */) } stringi/src/stri_search_common.cpp0000644000176200001440000001200314770541312017102 0ustar liggesusers/* This file is part of the 'stringi' project. * Copyright (c) 2013-2025, Marek Gagolewski * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "stri_stringi.h" #include "stri_stringi.h" #include "stri_container_utf8.h" #include "stri_container_usearch.h" #include #include "stri_string8buf.h" #include using namespace std; /** * Set colnames for matrix returned by stri_locate_first_* or stri_locate_last_* * @param matrix R matrix with two columns * * @version 0.1-?? (Marek Gagolewski) * * @version 1.7.1 (Marek Gagolewski, 2021-06-29) name_col1, name_col2 */ void stri__locate_set_dimnames_matrix( SEXP matrix, bool get_length ) { SEXP dimnames; SEXP colnames; PROTECT(dimnames = Rf_allocVector(VECSXP, 2)); PROTECT(colnames = Rf_allocVector(STRSXP, 2)); SET_STRING_ELT(colnames, 0, Rf_mkChar(MSG__LOCATE_DIM_START)); SET_STRING_ELT(colnames, 1, Rf_mkChar( get_length?MSG__LOCATE_DIM_LENGTH:MSG__LOCATE_DIM_END )); SET_VECTOR_ELT(dimnames, 0, R_NilValue); SET_VECTOR_ELT(dimnames, 1, colnames); Rf_setAttrib(matrix, R_DimNamesSymbol, dimnames); UNPROTECT(2); } /** * Set colnames for matrices stored in a list returned by stri_locate_all_* or stri_locate_all_* * @param matrix R matrix with two columns * * @version 0.1-?? 
(Marek Gagolewski) * * @version 1.7.1 (Marek Gagolewski, 2021-06-29) name_col1, name_col2 */ void stri__locate_set_dimnames_list( SEXP list, bool get_length ) { R_len_t n = LENGTH(list); if (n <= 0) return; SEXP dimnames; SEXP colnames; PROTECT(dimnames = Rf_allocVector(VECSXP, 2)); PROTECT(colnames = Rf_allocVector(STRSXP, 2)); SET_STRING_ELT(colnames, 0, Rf_mkChar(MSG__LOCATE_DIM_START)); SET_STRING_ELT(colnames, 1, Rf_mkChar( get_length?MSG__LOCATE_DIM_LENGTH:MSG__LOCATE_DIM_END )); SET_VECTOR_ELT(dimnames, 1, colnames); for (R_len_t i = 0; i < n; ++i) Rf_setAttrib(VECTOR_ELT(list, i), R_DimNamesSymbol, dimnames); UNPROTECT(2); } // I really love macros /MG/ :) #define stri__subset_by_logical__MACRO \ SEXP ret; \ PROTECT(ret = Rf_allocVector(STRSXP, result_counter)); \ for (R_len_t j=0, i=0; i to avoid mem-leaks, and * const StriContainer& for increased performance */ SEXP stri__subset_by_logical(const StriContainerUTF8& str_cont, const std::vector& which, int result_counter) { stri__subset_by_logical__MACRO } /** * Subset str_cont to SEXP by logical table ret_tab * * @param str_cont * @param which logical * @param result_counter * @return character vector * * @version 0.3-1 (Bartlomiej Tartanus, 2014-07-25) * @version 0.3-1 (Marek Gagolewski, 2014-10-17) * using std::vector to avoid mem-leaks, and * const StriContainer& for increased performance */ SEXP stri__subset_by_logical(const StriContainerUTF16& str_cont, const std::vector& which, int result_counter) { stri__subset_by_logical__MACRO } stringi/src/stri_search_boundaries_split.cpp0000644000176200001440000001456014770541312021172 0ustar liggesusers/* This file is part of the 'stringi' project. * Copyright (c) 2013-2025, Marek Gagolewski * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. 
Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "stri_stringi.h" #include "stri_container_utf8_indexable.h" #include "stri_container_integer.h" #include "stri_brkiter.h" /** Split a string at BreakIterator boundaries * * @param str character vector * @param n integer * @param tokens_only logical * @param simplify logical * @param opts_brkiter named list * @return list * * @version 0.2-2 (Marek Gagolewski, 2014-04-21) * * @version 0.2-2 (Marek Gagolewski, 2014-04-23) * removed "title": For Unicode 4.0 and above title boundary * iteration, please use Word Boundary iterator. 
* * @version 0.2-2 (Marek Gagolewski, 2014-04-25) * use stri__split_or_locate_boundaries * * @version 0.3-1 (Marek Gagolewski, 2014-10-29) * use opts_brkiter * * @version 0.4-1 (Marek Gagolewski, 2014-11-28) * new args: n, tokens_only, simplify * * @version 0.4-1 (Marek Gagolewski, 2014-12-02) * use StriRuleBasedBreakIterator * * @version 0.4-1 (Marek Gagolewski, 2014-12-04) * allow `simplify=NA`; FR #126: pass n to stri_list2matrix */ SEXP stri_split_boundaries(SEXP str, SEXP n, SEXP tokens_only, SEXP simplify, SEXP opts_brkiter) { bool tokens_only1 = stri__prepare_arg_logical_1_notNA(tokens_only, "tokens_only"); PROTECT(simplify = stri__prepare_arg_logical_1(simplify, "simplify")); PROTECT(str = stri__prepare_arg_string(str, "str")); PROTECT(n = stri__prepare_arg_integer(n, "n")); StriBrkIterOptions opts_brkiter2(opts_brkiter, "line_break"); STRI__ERROR_HANDLER_BEGIN(3) R_len_t vectorize_length = stri__recycling_rule(true, 2, LENGTH(str), LENGTH(n)); StriContainerUTF8_indexable str_cont(str, vectorize_length); StriContainerInteger n_cont(n, vectorize_length); StriRuleBasedBreakIterator brkiter(opts_brkiter2); SEXP ret; STRI__PROTECT(ret = Rf_allocVector(VECSXP, vectorize_length)); for (R_len_t i = 0; i < vectorize_length; ++i) { if (n_cont.isNA(i)) { SET_VECTOR_ELT(ret, i, stri__vector_NA_strings(1)); continue; } int n_cur = n_cont.get(i); if (str_cont.isNA(i)) { SET_VECTOR_ELT(ret, i, stri__vector_NA_strings(1)); continue; } if (n_cur >= INT_MAX-1) throw StriException(MSG__INCORRECT_NAMED_ARG "; " MSG__EXPECTED_SMALLER, "n"); else if (n_cur < 0) n_cur = INT_MAX; else if (n_cur == 0) { SET_VECTOR_ELT(ret, i, Rf_allocVector(STRSXP, 0)); continue; } R_len_t str_cur_n = str_cont.get(i).length(); const char* str_cur_s = str_cont.get(i).c_str(); deque< pair > occurrences; brkiter.setupMatcher(str_cur_s, str_cur_n); brkiter.first(); pair curpair; R_len_t k = 0; while (k < n_cur && brkiter.next(curpair)) { occurrences.push_back(curpair); ++k; // another field } R_len_t 
noccurrences = (R_len_t)occurrences.size(); if (noccurrences <= 0) { SET_VECTOR_ELT(ret, i, stri__vector_empty_strings(0)); // @TODO: Should it be a NA? Hard to say... continue; } if (k == n_cur && !tokens_only1) occurrences.back().second = str_cur_n; SEXP ans; STRI__PROTECT(ans = Rf_allocVector(STRSXP, noccurrences)); deque< pair >::iterator iter = occurrences.begin(); for (R_len_t j = 0; iter != occurrences.end(); ++iter, ++j) { SET_STRING_ELT(ans, j, Rf_mkCharLenCE(str_cur_s+(*iter).first, (*iter).second-(*iter).first, CE_UTF8)); } SET_VECTOR_ELT(ret, i, ans); STRI__UNPROTECT(1); } if (LOGICAL(simplify)[0] == NA_LOGICAL || LOGICAL(simplify)[0]) { R_len_t n_min = 0; R_len_t n_length = LENGTH(n); int* n_tab = INTEGER(n); for (R_len_t i=0; i * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef __stri_bytesearch_matcher_h #define __stri_bytesearch_matcher_h #include "stri_stringi.h" #ifndef USEARCH_DONE #define USEARCH_DONE -1 #endif /** * Performs actual pattern matching on behalf of StriContainerByteSearch * * @version 0.5-1 (Marek Gagolewski, 2015-02-14) * code taken from StriContainerByteSearch */ class StriByteSearchMatcher { private: StriByteSearchMatcher(const StriByteSearchMatcher&); /* no copy-able */ StriByteSearchMatcher& operator=(const StriByteSearchMatcher&); protected: bool m_optOverlap; R_len_t m_searchPos; // -1 after reset, searchLen on no further matches R_len_t m_searchEnd; const char* m_searchStr; // owned by caller R_len_t m_searchLen; // in bytes R_len_t m_patternLen; const char* m_patternStr; virtual R_len_t findFromPos(R_len_t pos) = 0; public: StriByteSearchMatcher(const char* patternStr, R_len_t patternLen, bool optOverlap) { this->m_optOverlap = optOverlap; this->m_patternStr = patternStr; this->m_patternLen = patternLen; this->m_searchStr = NULL; } const char* getPatternStr() const { return m_patternStr; } virtual ~StriByteSearchMatcher() { } virtual void reset(const char* searchStr, R_len_t searchLen) { this->m_searchStr = searchStr; this->m_searchLen = searchLen; this->m_searchPos = -1; this->m_searchEnd = -1; } virtual R_len_t findFirst() = 0; virtual R_len_t findLast() = 0; R_len_t findNext() { if (m_searchPos < 0) return findFirst(); if (m_optOverlap) { int pos = m_searchPos; 
U8_FWD_1(m_searchStr, pos, m_searchLen); return findFromPos(pos); } else return findFromPos(m_searchEnd); } /** get start index of pattern match from the last search * * @return byte index in searchStr */ inline R_len_t getMatchedStart() { #ifndef NDEBUG if (!this->m_searchStr || !this->m_patternStr) throw StriException("DEBUG: StriByteSearchMatcher: reset() hasn't been called yet"); if (m_searchPos < 0 || m_searchEnd-m_searchPos <= 0 || m_searchPos >= m_searchLen) throw StriException("StriByteSearchMatcher: no match at current position! This is a BUG."); #endif return m_searchPos; } /** get length of pattern match from the last search * * @return byte index in searchStr */ inline R_len_t getMatchedLength() { #ifndef NDEBUG if (!this->m_searchStr || !this->m_patternStr) throw StriException("DEBUG: StriByteSearchMatcher: reset() hasn't been called yet"); if (m_searchPos < 0 || m_searchEnd-m_searchPos <= 0 || m_searchEnd > m_searchLen) throw StriException("StriByteSearchMatcher: no match at current position! 
This is a BUG."); #endif return m_searchEnd-m_searchPos; } }; class StriByteSearchMatcherKMP : public StriByteSearchMatcher { private: StriByteSearchMatcherKMP(const StriByteSearchMatcherKMP&); /* no copy-able */ StriByteSearchMatcherKMP& operator=(const StriByteSearchMatcherKMP&); protected: int* m_kmpNext; int m_patternPos; virtual R_len_t findFromPos(R_len_t startPos) { #ifndef NDEBUG if (!m_searchStr) throw StriException("!m_searchStr"); #endif int j = startPos; m_patternPos = 0; while (j < m_searchLen) { while (m_patternPos >= 0 && m_patternStr[m_patternPos] != m_searchStr[j]) m_patternPos = m_kmpNext[m_patternPos]; m_patternPos++; j++; if (m_patternPos == m_patternLen) { m_searchEnd = j; m_searchPos = j-m_patternLen; return m_searchPos; } } // else not found m_searchPos = m_searchEnd = m_searchLen; return USEARCH_DONE; } public: virtual ~StriByteSearchMatcherKMP() { delete [] m_kmpNext; } #ifndef NDEBUG #endif StriByteSearchMatcherKMP(const char* patternStr, R_len_t patternLen, bool optOverlap) : StriByteSearchMatcher(patternStr, patternLen, optOverlap) { int kmpMaxSize = patternLen+1; // that's sufficient this->m_kmpNext = new int[kmpMaxSize]; if (!this->m_kmpNext) throw StriException(MSG__MEM_ALLOC_ERROR); this->m_kmpNext[0] = -100; // magic constant for an uninitialized KMP table } virtual void reset(const char* searchStr, R_len_t searchLen) { StriByteSearchMatcher::reset(searchStr, searchLen); m_patternPos = -1; } virtual R_len_t findFirst() { if (this->m_kmpNext[0] <= -100) { // Setup KMP table for FWD search m_kmpNext[0] = -1; for (R_len_t i=0; i 0 && m_patternStr[i] != m_patternStr[m_kmpNext[i+1]-1]) m_kmpNext[i+1] = m_kmpNext[m_kmpNext[i+1]-1]+1; } } return findFromPos(0); } virtual R_len_t findLast() { if (this->m_kmpNext[0] <= -100) { // Setup KMP table for BACK search m_kmpNext[0] = -1; for (R_len_t i=0; i 0 && m_patternStr[m_patternLen-i-1] != m_patternStr[m_patternLen-(m_kmpNext[i+1]-1)-1]) m_kmpNext[i+1] = m_kmpNext[m_kmpNext[i+1]-1]+1; } } int 
j = m_searchLen; m_patternPos = 0; while (j > 0) { j--; while (m_patternPos >= 0 && m_patternStr[m_patternLen-1-m_patternPos] != m_searchStr[j]) m_patternPos = m_kmpNext[m_patternPos]; m_patternPos++; if (m_patternPos == m_patternLen) { m_searchEnd = j+m_patternLen; m_searchPos = j; return m_searchPos; } } m_searchPos = m_searchEnd = m_searchLen; return USEARCH_DONE; } }; class StriByteSearchMatcherKMPci : public StriByteSearchMatcher { private: StriByteSearchMatcherKMPci(const StriByteSearchMatcherKMPci&); /* no copy-able */ StriByteSearchMatcherKMPci& operator=(const StriByteSearchMatcherKMPci&); protected: int* m_kmpNext; int m_patternPos; R_len_t m_patternLenCaseInsensitive; UChar32* m_patternStrCaseInsensitive; virtual R_len_t findFromPos(R_len_t startPos) { int j = startPos; m_patternPos = 0; UChar32 c = 0; while (j < m_searchLen) { U8_NEXT(m_searchStr, j, m_searchLen, c); c = u_toupper(c); while (m_patternPos >= 0 && m_patternStrCaseInsensitive[m_patternPos] != c) m_patternPos = m_kmpNext[m_patternPos]; m_patternPos++; if (m_patternPos == m_patternLenCaseInsensitive) { m_searchEnd = j; // we need to go back by patternLenCaseInsensitive code points R_len_t k = m_patternLenCaseInsensitive; m_searchPos = j; while (k > 0) { U8_BACK_1((const uint8_t*)m_searchStr, 0, m_searchPos); k--; } return m_searchPos; } } // else not found m_searchPos = m_searchEnd = m_searchLen; return USEARCH_DONE; } public: virtual ~StriByteSearchMatcherKMPci() { delete [] m_kmpNext; delete [] m_patternStrCaseInsensitive; } StriByteSearchMatcherKMPci(const char* patternStr, R_len_t patternLen, bool optOverlap) : StriByteSearchMatcher(patternStr, patternLen, optOverlap) { int kmpMaxSize = patternLen+1; // that's sufficient this->m_kmpNext = new int[kmpMaxSize]; if (!this->m_kmpNext) throw StriException(MSG__MEM_ALLOC_ERROR); this->m_kmpNext[0] = -100; // magic constant for an uninitialized KMP table this->m_patternStrCaseInsensitive = new UChar32[kmpMaxSize]; if 
(!this->m_patternStrCaseInsensitive) throw StriException(MSG__MEM_ALLOC_ERROR); UChar32 c = 0; R_len_t j = 0; m_patternLenCaseInsensitive = 0; while (j < patternLen) { U8_NEXT(patternStr, j, patternLen, c); #ifndef NDEBUG if (m_patternLenCaseInsensitive >= kmpMaxSize) throw StriException("!NDEBUG: StriByteSearchMatcherKMPci::StriByteSearchMatcherKMPci()"); #endif m_patternStrCaseInsensitive[m_patternLenCaseInsensitive++] = u_toupper(c); } m_patternStrCaseInsensitive[m_patternLenCaseInsensitive] = 0; } virtual void reset(const char* searchStr, R_len_t searchLen) { StriByteSearchMatcher::reset(searchStr, searchLen); m_patternPos = -1; } virtual R_len_t findFirst() { if (this->m_kmpNext[0] <= -100) { // Setup KMP table for FWD search m_kmpNext[0] = -1; for (R_len_t i=0; i 0 && m_patternStrCaseInsensitive[i] != m_patternStrCaseInsensitive[m_kmpNext[i+1]-1]) m_kmpNext[i+1] = m_kmpNext[m_kmpNext[i+1]-1]+1; } } return findFromPos(0); } virtual R_len_t findLast() { if (this->m_kmpNext[0] <= -100) { // Setup KMP table for BACK search m_kmpNext[0] = -1; for (R_len_t i=0; i 0 && m_patternStrCaseInsensitive[m_patternLen-i-1] != m_patternStrCaseInsensitive[m_patternLenCaseInsensitive-(m_kmpNext[i+1]-1)-1]) m_kmpNext[i+1] = m_kmpNext[m_kmpNext[i+1]-1]+1; } } int j = m_searchLen; m_patternPos = 0; while (j > 0) { UChar32 c; U8_PREV(m_searchStr, 0, j, c); c = u_toupper(c); while (m_patternPos >= 0 && m_patternStrCaseInsensitive[m_patternLenCaseInsensitive-1-m_patternPos] != c) m_patternPos = m_kmpNext[m_patternPos]; m_patternPos++; if (m_patternPos == m_patternLenCaseInsensitive) { m_searchPos = j; // we need to go forward by patternLenCaseInsensitive code points R_len_t k = m_patternLenCaseInsensitive; m_searchEnd = j; while (k > 0) { U8_FWD_1((const uint8_t*)m_searchStr, m_searchEnd, m_searchLen); k--; } return m_searchPos; } } m_searchPos = m_searchEnd = m_searchLen; return USEARCH_DONE; } }; class StriByteSearchMatcher1 : public StriByteSearchMatcher { private: 
StriByteSearchMatcher1(const StriByteSearchMatcher1&); /* no copy-able */ StriByteSearchMatcher1& operator=(const StriByteSearchMatcher1&); protected: virtual R_len_t findFromPos(R_len_t startPos) { #ifndef NDEBUG if (!m_searchStr) throw StriException("!m_searchStr"); #endif if (startPos > m_searchLen-m_patternLen) { // this check is OK, we do a case-sensitive search m_searchPos = m_searchEnd = m_searchLen; return USEARCH_DONE; } const char* res = strchr(m_searchStr+startPos, m_patternStr[0]); if (res) { m_searchPos = (int)(res-m_searchStr); m_searchEnd = m_searchPos+1; return m_searchPos; } else { m_searchPos = m_searchEnd = m_searchLen; return USEARCH_DONE; } /*unsigned char pat = (unsigned char)m_patternStr[0]; for (m_searchPos = startPos; m_searchPos=0; --m_searchPos) { if (pat == (unsigned char)m_searchStr[m_searchPos]) { m_searchEnd = m_searchPos + 1; return m_searchPos; } } // else not found m_searchPos = m_searchEnd = m_searchLen; return USEARCH_DONE; } }; class StriByteSearchMatcherShort : public StriByteSearchMatcher { private: StriByteSearchMatcherShort(const StriByteSearchMatcherShort&); /* no copy-able */ StriByteSearchMatcherShort& operator=(const StriByteSearchMatcherShort&); protected: virtual R_len_t findFromPos(R_len_t startPos) { #ifndef NDEBUG if (!m_searchStr) throw StriException("!m_searchStr"); #endif if (startPos > m_searchLen-m_patternLen) { // this check is OK, we do a case-sensitive search m_searchPos = m_searchEnd = m_searchLen; return USEARCH_DONE; } const char* res = strstr(m_searchStr+startPos, m_patternStr); if (res) { m_searchPos = (int)(res-m_searchStr); m_searchEnd = m_searchPos+m_patternLen; return m_searchPos; } else { m_searchPos = m_searchEnd = m_searchLen; return USEARCH_DONE; } } public: StriByteSearchMatcherShort(const char* patternStr, R_len_t patternLen, bool optOverlap) : StriByteSearchMatcher(patternStr, patternLen, optOverlap) { } virtual R_len_t findFirst() { return findFromPos(0); } virtual R_len_t findLast() { 
R_len_t startPos = m_searchLen; for (m_searchPos = startPos-m_patternLen; m_searchPos>=0; --m_searchPos) { if (0 == strncmp(m_searchStr+m_searchPos, m_patternStr, m_patternLen)) { m_searchEnd = m_searchPos + m_patternLen; return m_searchPos; } } // else not found m_searchPos = m_searchEnd = m_searchLen; return USEARCH_DONE; } }; #endif stringi/src/stri_search_class_replace.cpp0000644000176200001440000003607114770541312020425 0ustar liggesusers/* This file is part of the 'stringi' project. * Copyright (c) 2013-2025, Marek Gagolewski * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "stri_stringi.h" #include "stri_container_utf8.h" #include "stri_container_charclass.h" #include "stri_container_logical.h" #include "stri_string8buf.h" #include #include using namespace std; /** * Replace all occurrences of a character class * * @param str character vector; strings to search in * @param pattern character vector; charclasses to search for * @param replacement character vector; strings to replace with * @param merge merge consecutive matches into a single one? * * @return character vector * * @version 0.1-?? (Marek Gagolewski, 2013-06-07) * * @version 0.1-?? (Marek Gagolewski, 2013-06-15) * Use StrContainerCharClass * * @version 0.1-?? 
(Marek Gagolewski, 2013-06-16) * make StriException-friendly * * @version 0.2-1 (Marek Gagolewski, 2014-04-03) * detects invalid UTF-8 byte stream; * merge arg added (replacement of old stri_trim_both/double by BT) * * @version 0.2-1 (Marek Gagolewski, 2014-04-05) * StriContainerCharClass now relies on UnicodeSet * * @version 0.3-1 (Marek Gagolewski, 2014-11-02) * using String8buf::replaceAllAtPos and StriContainerCharClass::locateAll; * no longer vectorized over merge * * @version 0.3-1 (Marek Gagolewski, 2014-11-04) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc * * @version 1.0-2 (Marek Gagolewski, 2016-01-30) * Issue #210: Allow NA replacement */ SEXP stri__replace_all_charclass_yes_vectorize_all(SEXP str, SEXP pattern, SEXP replacement, SEXP merge) { PROTECT(str = stri__prepare_arg_string(str, "str")); PROTECT(pattern = stri__prepare_arg_string(pattern, "pattern")); PROTECT(replacement = stri__prepare_arg_string(replacement, "replacement")); bool merge_cur = stri__prepare_arg_logical_1_notNA(merge, "merge"); R_len_t vectorize_length = stri__recycling_rule(true, 3, LENGTH(str), LENGTH(pattern), LENGTH(replacement)); STRI__ERROR_HANDLER_BEGIN(3) StriContainerUTF8 str_cont(str, vectorize_length); StriContainerUTF8 replacement_cont(replacement, vectorize_length); StriContainerCharClass pattern_cont(pattern, vectorize_length); SEXP ret; STRI__PROTECT(ret = Rf_allocVector(STRSXP, vectorize_length)); String8buf buf(0); // @TODO: calculate buf len a priori? 
for (R_len_t i = pattern_cont.vectorize_init(); i != pattern_cont.vectorize_end(); i = pattern_cont.vectorize_next(i)) { if (str_cont.isNA(i) || pattern_cont.isNA(i)) { SET_STRING_ELT(ret, i, NA_STRING); continue; } R_len_t str_cur_n = str_cont.get(i).length(); const char* str_cur_s = str_cont.get(i).c_str(); deque< pair > occurrences; R_len_t sumbytes = StriContainerCharClass::locateAll( occurrences, &pattern_cont.get(i), str_cur_s, str_cur_n, merge_cur, false /* byte-based indices */ ); if (occurrences.size() == 0) { SET_STRING_ELT(ret, i, str_cont.toR(i)); // no change continue; } if (replacement_cont.isNA(i)) { SET_STRING_ELT(ret, i, NA_STRING); continue; } R_len_t replacement_cur_n = replacement_cont.get(i).length(); R_len_t buf_need = str_cur_n+(R_len_t)occurrences.size()*replacement_cur_n-sumbytes; buf.resize(buf_need, false/*destroy contents*/); R_len_t buf_used = buf.replaceAllAtPos(str_cur_s, str_cur_n, replacement_cont.get(i).c_str(), replacement_cur_n, occurrences); #ifndef NDEBUG if (buf_need != buf_used) throw StriException("!NDEBUG: stri__replace_allfirstlast_fixed: (buf_need != buf_used)"); #endif SET_STRING_ELT(ret, i, Rf_mkCharLenCE(buf.data(), buf_used, CE_UTF8)); } STRI__UNPROTECT_ALL return ret; STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */) } /** * Replace all occurrences of a character class * * @param str character vector; strings to search in * @param pattern character vector; charclasses to search for * @param replacement character vector; strings to replace with * @param merge merge consecutive matches into a single one? 
* * @return character vector * * @version 0.3-1 (Marek Gagolewski, 2014-11-02) * * @version 0.3-1 (Marek Gagolewski, 2014-11-04) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc * * @version 1.0-2 (Marek Gagolewski, 2016-01-30) * Issue #210: Allow NA replacement */ SEXP stri__replace_all_charclass_no_vectorize_all(SEXP str, SEXP pattern, SEXP replacement, SEXP merge) { PROTECT(str = stri__prepare_arg_string(str, "str")); // if str_n is 0, then return an empty vector R_len_t str_n = LENGTH(str); if (str_n <= 0) { UNPROTECT(1); return stri__vector_empty_strings(0); } PROTECT(pattern = stri__prepare_arg_string(pattern, "pattern")); PROTECT(replacement = stri__prepare_arg_string(replacement, "replacement")); R_len_t pattern_n = LENGTH(pattern); R_len_t replacement_n = LENGTH(replacement); if (pattern_n < replacement_n || pattern_n <= 0 || replacement_n <= 0) { UNPROTECT(3); Rf_error(MSG__WARN_RECYCLING_RULE2); } if (pattern_n % replacement_n != 0) Rf_warning(MSG__WARN_RECYCLING_RULE); if (pattern_n == 1) {// this will be much faster: SEXP ret; PROTECT(ret = stri__replace_all_charclass_yes_vectorize_all(str, pattern, replacement, merge)); UNPROTECT(4); return ret; } bool merge_cur = stri__prepare_arg_logical_1_notNA(merge, "merge"); STRI__ERROR_HANDLER_BEGIN(3) StriContainerUTF8 str_cont(str, str_n, false); // writable); StriContainerUTF8 replacement_cont(replacement, pattern_n); StriContainerCharClass pattern_cont(pattern, pattern_n); String8buf buf(0); // @TODO: calculate buf len a priori? 
for (R_len_t i = 0; i > occurrences; R_len_t sumbytes = StriContainerCharClass::locateAll( occurrences, &pattern_cont.get(i), str_cur_s, str_cur_n, merge_cur, false /* byte-based indices */ ); if (occurrences.size() == 0) continue; if (replacement_cont.isNA(i)) { str_cont.setNA(j); continue; } R_len_t replacement_cur_n = replacement_cont.get(i).length(); R_len_t buf_need = str_cur_n+(R_len_t)occurrences.size()*replacement_cur_n-sumbytes; buf.resize(buf_need, false/*destroy contents*/); str_cont.getWritable(j).replaceAllAtPos(buf_need, replacement_cont.get(i).c_str(), replacement_cur_n, occurrences); } } STRI__UNPROTECT_ALL return str_cont.toR(); STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */) } /** * Replace all occurrences of a character class * * @param str character vector; strings to search in * @param pattern character vector; charclasses to search for * @param replacement character vector; strings to replace with * @param merge merge consecutive matches into a single one? * @param vectorize_all single logical value * * @return character vector * * @version 0.3-1 (Marek Gagolewski, 2014-11-02) * added `vectorize_all` arg */ SEXP stri_replace_all_charclass(SEXP str, SEXP pattern, SEXP replacement, SEXP merge, SEXP vectorize_all) { if (stri__prepare_arg_logical_1_notNA(vectorize_all, "vectorize_all")) return stri__replace_all_charclass_yes_vectorize_all(str, pattern, replacement, merge); else return stri__replace_all_charclass_no_vectorize_all(str, pattern, replacement, merge); } /** * Replace first or last occurrence of a character class [internal] * * @param str character vector; strings to search in * @param pattern character vector; charclasses to search for * @param replacement character vector; strings to replace with * @param first replace first (TRUE) or last (FALSE)? * @return character vector * * @version 0.1-?? (Marek Gagolewski, 2013-06-06) * * @version 0.1-?? 
(Marek Gagolewski, 2013-06-15) * Use StrContainerCharClass * * @version 0.1-?? (Marek Gagolewski, 2013-06-16) * make StriException-friendly * * @version 0.2-1 (Marek Gagolewski, 2014-04-03) * detects invalid UTF-8 byte stream * * @version 0.2-1 (Marek Gagolewski, 2014-04-05) * StriContainerCharClass now relies on UnicodeSet * * @version 0.3-1 (Marek Gagolewski, 2014-11-04) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc */ SEXP stri__replace_firstlast_charclass(SEXP str, SEXP pattern, SEXP replacement, bool first) { PROTECT(str = stri__prepare_arg_string(str, "str")); PROTECT(pattern = stri__prepare_arg_string(pattern, "pattern")); PROTECT(replacement = stri__prepare_arg_string(replacement, "replacement")); R_len_t vectorize_length = stri__recycling_rule(true, 3, LENGTH(str), LENGTH(pattern), LENGTH(replacement)); STRI__ERROR_HANDLER_BEGIN(3) StriContainerUTF8 str_cont(str, vectorize_length); StriContainerUTF8 replacement_cont(replacement, vectorize_length); StriContainerCharClass pattern_cont(pattern, vectorize_length); SEXP ret; STRI__PROTECT(ret = Rf_allocVector(STRSXP, vectorize_length)); String8buf buf(0); // @TODO: consider calculating buflen a priori for (R_len_t i = pattern_cont.vectorize_init(); i != pattern_cont.vectorize_end(); i = pattern_cont.vectorize_next(i)) { if (str_cont.isNA(i) || pattern_cont.isNA(i)) { SET_STRING_ELT(ret, i, NA_STRING); continue; } const UnicodeSet* pattern_cur = &pattern_cont.get(i); R_len_t str_cur_n = str_cont.get(i).length(); const char* str_cur_s = str_cont.get(i).c_str(); R_len_t j, jlast; UChar32 chr; if (first) { // search for first for (jlast=j=0; jcontains(chr)) { break; // break at first occurrence } jlast = j; } } else { // search for last for (jlast=j=str_cur_n; jlast>0; ) { U8_PREV(str_cur_s, 0, jlast, chr); // "look behind" if (chr < 0) // invalid utf-8 sequence throw StriException(MSG__INVALID_UTF8); if (pattern_cur->contains(chr)) { break; // break at first occurrence } j = jlast; } } // match 
is at jlast, and ends right before j if (j == jlast) { // iff not found SET_STRING_ELT(ret, i, str_cont.toR(i)); // no change continue; } if (replacement_cont.isNA(i)) { SET_STRING_ELT(ret, i, NA_STRING); continue; } R_len_t replacement_cur_n = replacement_cont.get(i).length(); const char* replacement_cur_s = replacement_cont.get(i).c_str(); R_len_t buf_need = str_cur_n+replacement_cur_n-(j-jlast); buf.resize(buf_need, false/*destroy contents*/); memcpy(buf.data(), str_cur_s, (size_t)jlast); memcpy(buf.data()+jlast, replacement_cur_s, (size_t)replacement_cur_n); memcpy(buf.data()+jlast+replacement_cur_n, str_cur_s+j, (size_t)str_cur_n-j); SET_STRING_ELT(ret, i, Rf_mkCharLenCE(buf.data(), buf_need, CE_UTF8)); } STRI__UNPROTECT_ALL return ret; STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */) } /** * Replace first occurrence of a character class * * @param str character vector; strings to search in * @param pattern character vector; charclasses to search for * @param replacement character vector; strings to replace with * * @return character vector * * @version 0.1-?? (Marek Gagolewski, 2013-06-06) */ SEXP stri_replace_first_charclass(SEXP str, SEXP pattern, SEXP replacement) { return stri__replace_firstlast_charclass(str, pattern, replacement, true); } /** * Replace last occurrence of a character class * * @param str character vector; strings to search in * @param pattern character vector; charclasses to search for * @param replacement character vector; strings to replace with * * @return character vector * * @version 0.1-?? (Marek Gagolewski, 2013-06-06) */ SEXP stri_replace_last_charclass(SEXP str, SEXP pattern, SEXP replacement) { return stri__replace_firstlast_charclass(str, pattern, replacement, false); } stringi/src/Makevars.win0000644000176200001440000000334614750110642015016 0ustar liggesusers# kate: hl Makefile ## `stringi` Makevars.win ## Copyright (c) 2013-2025, Marek Gagolewski PKG_CPPFLAGS=-I. 
-Iicu74/ -Iicu74/unicode -Iicu74/common -Iicu74/i18n \ -DUCONFIG_USE_LOCAL \ -DU_STATIC_IMPLEMENTATION -DU_COMMON_IMPLEMENTATION \ -DU_I18N_IMPLEMENTATION -DU_TOOLUTIL_IMPLEMENTATION \ -UDEBUG -DNDEBUG -DWINVER=0x0601 -D_WIN32_WINNT=0x0601 \ -DU_USE_STRTOD_L=0 # 0x0600 == Windows Vista/Server 2008 # 0x0601 == Windows 7 # 0x0602 == Windows 8 # 0x0603 == Windows 8.1 # 0x0A00 == Windows 10 # ICU 69 uses LOCALE_ALLOW_NEUTRAL_NAMES which is Windows 7 and later SOURCES_CPP=$(wildcard stri_*.cpp) OBJECTS=$(SOURCES_CPP:.cpp=.o) ICU_STUBDATA_SOURCES_CPP=$(wildcard icu74/stubdata/*.cpp) ICU_STUBDATA_OBJECTS=$(ICU_STUBDATA_SOURCES_CPP:.cpp=.o) ICU_COMMON_SOURCES_CPP=$(wildcard icu74/common/*.cpp) ICU_COMMON_OBJECTS=$(ICU_COMMON_SOURCES_CPP:.cpp=.o) ICU_I18N_SOURCES_CPP=$(wildcard icu74/i18n/*.cpp) ICU_I18N_OBJECTS=$(ICU_I18N_SOURCES_CPP:.cpp=.o) ## OBJECTS=$(OBJECTS) $(ICU_COMMON_OBJECTS) $(ICU_I18N_OBJECTS) $(ICU_STUBDATA_OBJECTS) ## There was a Cygwin bug which reported "mem alloc error" while linking ## too many .o files at once. At other times, we can get a "make: execvp: sh: ## Argument list too long" error. Thus, below we split the build process into ## a few parts using static libs. .PHONY: all all: $(SHLIB) $(SHLIB): $(OBJECTS) libicu_common.a libicu_i18n.a libicu_stubdata.a PKG_LIBS=-L. -licu_i18n -licu_common -licu_stubdata libicu_common.a: $(ICU_COMMON_OBJECTS) libicu_i18n.a: $(ICU_I18N_OBJECTS) libicu_stubdata.a: $(ICU_STUBDATA_OBJECTS) clean: rm -f $(OBJECTS) $(ICU_COMMON_OBJECTS) $(ICU_I18N_OBJECTS) \ $(ICU_STUBDATA_OBJECTS) libicu_common.a libicu_i18n.a \ libicu_stubdata.a stringi/src/stri_encoding_management.cpp0000644000176200001440000002542014770541312020256 0ustar liggesusers/* This file is part of the 'stringi' project. * Copyright (c) 2013-2025, Marek Gagolewski * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. 
Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "stri_stringi.h" #include "stri_ucnv.h" /** * Sets current (default) ICU charset * * If given charset is unavailable, an error is raised * * @param enc new charset (single string) * @return nothing (\code{R_NilValue}) * * @version 0.1-?? 
(Marek Gagolewski) * * @version 0.2-1 (Marek Gagolewski) * use StriUcnv; make StriException-friendly * * @version 0.3-1 (Marek Gagolewski, 2014-11-04) * #112: str_prepare_arg* retvals were not PROTECTed from gc * * @version 1.3.1 (Marek Gagolewski, 2019-02-06) * #335: if system ICU uses U_CHARSET_IS_UTF8=1, the function has no effect */ SEXP stri_enc_set(SEXP enc) { // here, the default encoding may not be requested: const char* selected_enc = stri__prepare_arg_enc(enc, "enc", false/*no default*/); /* this is R_alloc'ed */ #ifdef U_CHARSET_IS_UTF8 #if U_CHARSET_IS_UTF8 // #335: if system ICU uses U_CHARSET_IS_UTF8=1, the function has no effect Rf_warning(MSG__U_CHARSET_IS_UTF8); return R_NilValue; #endif #endif STRI__ERROR_HANDLER_BEGIN(0) StriUcnv uconv_obj(selected_enc); // this will generate an error if selected_enc is not supported: UConverter* uconv = uconv_obj.getConverter(); UErrorCode status = U_ZERO_ERROR; // get "official" encoding name: const char* name = ucnv_getName(uconv, &status); STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) /* DO NOT call this function when ANY ICU function is being used from more than one thread! This function sets the current default converter name. If this function needs to be called, it should be called during application initialization. Do not use unless you know what you are doing. */ ucnv_setDefaultName(name); // set as default return R_NilValue; STRI__ERROR_HANDLER_END({/* no special action on error */}) } /** * Get all available ICU charsets and their aliases (elems 2,3,...) * * @return R list object; element name == ICU charset canonical name; * elements are character vectors (aliases) * * @version 0.1-?? 
(Marek Gagolewski) * * @version 0.2-1 (Marek Gagolewski) * use StriUcnv; make StriException-friendly * * @version 0.3-1 (Marek Gagolewski, 2014-11-04) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc */ SEXP stri_enc_list() { R_len_t c = (R_len_t)ucnv_countAvailable(); STRI__ERROR_HANDLER_BEGIN(0) SEXP ret; SEXP names; STRI__PROTECT(ret = Rf_allocVector(VECSXP, c)); STRI__PROTECT(names = Rf_allocVector(STRSXP, c)); for (R_len_t i=0; i standards = StriUcnv::getStandards(); R_len_t standards_n = (R_len_t)standards.size(); // alloc output list SEXP vals; SEXP names; const int nval = standards_n+2+5; STRI__PROTECT(names = Rf_allocVector(STRSXP, nval)); SET_STRING_ELT(names, 0, Rf_mkChar("Name.friendly")); SET_STRING_ELT(names, 1, Rf_mkChar("Name.ICU")); for (R_len_t i=0; i * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "stri_stringi.h" #include "stri_container_utf16.h" #include "stri_container_usearch.h" #include "stri_string8buf.h" #include using namespace std; /** * Replace all/first/last occurrences of a fixed pattern [with collation] * * @param str character vector * @param pattern character vector * @param replacement character vector * @param opts_collator list * @return character vector * * @version 0.1-?? (Bartek Tartanus) * * @version 0.1-?? (Marek Gagolewski, 2013-06-26) * StriException friendly & Use StriContainers * * @version 0.1-?? 
(Marek Gagolewski, 2013-07-10) * BUGFIX: wrong behavior on empty str * * @version 0.2-3 (Marek Gagolewski, 2014-05-08) * new fun: stri__replace_allfirstlast_coll (opts_collator == NA not allowed) * * @version 0.3-1 (Marek Gagolewski, 2014-11-04) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc * * @version 1.0-2 (Marek Gagolewski, 2016-01-30) * Issue #210: Allow NA replacement */ SEXP stri__replace_allfirstlast_coll(SEXP str, SEXP pattern, SEXP replacement, SEXP opts_collator, int type) { PROTECT(str = stri__prepare_arg_string(str, "str")); PROTECT(replacement = stri__prepare_arg_string(replacement, "replacement")); PROTECT(pattern = stri__prepare_arg_string(pattern, "pattern")); UCollator* collator = NULL; collator = stri__ucol_open(opts_collator); STRI__ERROR_HANDLER_BEGIN(3) R_len_t vectorize_length = stri__recycling_rule(true, 3, LENGTH(str), LENGTH(pattern), LENGTH(replacement)); StriContainerUTF16 str_cont(str, vectorize_length, false); // writable StriContainerUStringSearch pattern_cont(pattern, vectorize_length, collator); // collator is not owned by pattern_cont StriContainerUTF16 replacement_cont(replacement, vectorize_length); for (R_len_t i = pattern_cont.vectorize_init(); i != pattern_cont.vectorize_end(); i = pattern_cont.vectorize_next(i)) { STRI__CONTINUE_ON_EMPTY_OR_NA_STR_PATTERN(str_cont, pattern_cont, str_cont.setNA(i);, /*just skip on empty str*/;) UStringSearch *matcher = pattern_cont.getMatcher(i, str_cont.get(i)); usearch_reset(matcher); UErrorCode status = U_ZERO_ERROR; R_len_t remUChars = 0; deque< pair > occurrences; if (type >= 0) { // first or all int start = (int)usearch_first(matcher, &status); STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) if (start == USEARCH_DONE) // no match continue; // no change in str_cont[i] at all if (replacement_cont.isNA(i)) { str_cont.setNA(i); continue; } while (start != USEARCH_DONE) { R_len_t mlen = usearch_getMatchedLength(matcher); remUChars += mlen; 
occurrences.push_back(pair(start, start+mlen)); if (type > 0) break; // break if first and not all start = usearch_next(matcher, &status); STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) } } else { // if last int start = (int)usearch_last(matcher, &status); STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) if (start == USEARCH_DONE) // no match continue; // no change in str_cont[i] at all if (replacement_cont.isNA(i)) { str_cont.setNA(i); continue; } R_len_t mlen = usearch_getMatchedLength(matcher); remUChars += mlen; occurrences.push_back(pair(start, start+mlen)); } R_len_t replacement_cur_n = replacement_cont.get(i).length(); R_len_t noccurrences = (R_len_t)occurrences.size(); UnicodeString ans(str_cont.get(i).length()-remUChars+noccurrences*replacement_cur_n, (UChar)0xfffd, 0); R_len_t jlast = 0; R_len_t anslast = 0; deque< pair >::iterator iter = occurrences.begin(); for (; iter != occurrences.end(); ++iter) { pair match = *iter; ans.replace(anslast, match.first-jlast, str_cont.get(i), jlast, match.first-jlast); anslast += match.first-jlast; jlast = match.second; ans.replace(anslast, replacement_cur_n, replacement_cont.get(i)); anslast += replacement_cur_n; } ans.replace(anslast, str_cont.get(i).length()-jlast, str_cont.get(i), jlast, str_cont.get(i).length()-jlast); str_cont.getWritable(i) = ans; } if (collator) { ucol_close(collator); collator=NULL; } STRI__UNPROTECT_ALL return str_cont.toR(); STRI__ERROR_HANDLER_END( if (collator) ucol_close(collator); ) } /** * Replace all occurrences of a coll pattern; vectorize_all=FALSE * * @param str character vector * @param pattern character vector * @param replacement character vector * @param opts_collator a named list * @return character vector * * @version 0.3-1 (Marek Gagolewski, 2014-11-04) * * @version 0.3-1 (Marek Gagolewski, 2014-11-04) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc * * @version 0.3-1 (Marek Gagolewski, 2014-11-06) * Added missing 
ucol_close * * @version 1.0-2 (Marek Gagolewski, 2016-01-30) * Issue #210: Allow NA replacement */ SEXP stri__replace_all_coll_no_vectorize_all(SEXP str, SEXP pattern, SEXP replacement, SEXP opts_collator) { // version beta PROTECT(str = stri__prepare_arg_string(str, "str")); // if str_n is 0, then return an empty vector R_len_t str_n = LENGTH(str); if (str_n <= 0) { UNPROTECT(1); return stri__vector_empty_strings(0); } PROTECT(pattern = stri__prepare_arg_string(pattern, "pattern")); PROTECT(replacement = stri__prepare_arg_string(replacement, "replacement")); R_len_t pattern_n = LENGTH(pattern); R_len_t replacement_n = LENGTH(replacement); if (pattern_n < replacement_n || pattern_n <= 0 || replacement_n <= 0) { UNPROTECT(3); Rf_error(MSG__WARN_RECYCLING_RULE2); } if (pattern_n % replacement_n != 0) { Rf_warning(MSG__WARN_RECYCLING_RULE); } if (pattern_n == 1) {// this will be much faster: SEXP ret; PROTECT(ret = stri__replace_allfirstlast_coll(str, pattern, replacement, opts_collator, 0)); UNPROTECT(4); return ret; } UCollator* collator = NULL; collator = stri__ucol_open(opts_collator); STRI__ERROR_HANDLER_BEGIN(3) StriContainerUTF16 str_cont(str, str_n, false); // writable StriContainerUStringSearch pattern_cont(pattern, pattern_n, collator); // collator is not owned by pattern_cont StriContainerUTF16 replacement_cont(replacement, pattern_n); for (R_len_t i = 0; i > occurrences; int start = (int)usearch_first(matcher, &status); STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) if (start == USEARCH_DONE) // no match continue; // no change in str_cont[j] at all if (replacement_cont.isNA(i)) { str_cont.setNA(j); continue; } while (start != USEARCH_DONE) { R_len_t mlen = usearch_getMatchedLength(matcher); remUChars += mlen; occurrences.push_back(pair(start, start+mlen)); start = usearch_next(matcher, &status); STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) } R_len_t replacement_cur_n = replacement_cont.get(i).length(); 
R_len_t noccurrences = (R_len_t)occurrences.size(); UnicodeString ans(str_cont.get(j).length()-remUChars+noccurrences*replacement_cur_n, (UChar)0xfffd, 0); R_len_t jlast = 0; R_len_t anslast = 0; deque< pair >::iterator iter = occurrences.begin(); for (; iter != occurrences.end(); ++iter) { pair match = *iter; ans.replace(anslast, match.first-jlast, str_cont.get(j), jlast, match.first-jlast); anslast += match.first-jlast; jlast = match.second; ans.replace(anslast, replacement_cur_n, replacement_cont.get(i)); anslast += replacement_cur_n; } ans.replace(anslast, str_cont.get(j).length()-jlast, str_cont.get(j), jlast, str_cont.get(j).length()-jlast); str_cont.getWritable(j) = ans; } } if (collator) { ucol_close(collator); collator=NULL; } STRI__UNPROTECT_ALL return str_cont.toR(); STRI__ERROR_HANDLER_END( if (collator) ucol_close(collator); ) } /** * Replace all occurrences of a fixed pattern [with collation] * * @param str character vector * @param pattern character vector * @param replacement character vector * @param opts_collator list * @return character vector * * @version 0.1-?? (Bartek Tartanus) * * @version 0.1-?? 
(Marek Gagolewski, 2013-06-26) * use stri__replace_allfirstlast_fixed * * @version 0.2-3 (Marek Gagolewski, 2014-05-08) * new fun: stri_replace_all_coll (opts_collator == NA not allowed) * * @version 0.3-1 (Marek Gagolewski, 2014-11-04) * vectorize_all arg added */ SEXP stri_replace_all_coll(SEXP str, SEXP pattern, SEXP replacement, SEXP vectorize_all, SEXP opts_collator) { if (stri__prepare_arg_logical_1_notNA(vectorize_all, "vectorize_all")) return stri__replace_allfirstlast_coll(str, pattern, replacement, opts_collator, 0); else return stri__replace_all_coll_no_vectorize_all(str, pattern, replacement, opts_collator); } /** * Replace last occurrence of a fixed pattern [with collation] * * @param str character vector * @param pattern character vector * @param replacement character vector * @param opts_collator list * @return character vector * * @version 0.1-?? (Bartek Tartanus) * * @version 0.1-?? (Marek Gagolewski, 2013-06-26) * use stri__replace_allfirstlast_fixed * * @version 0.2-3 (Marek Gagolewski, 2014-05-08) * new fun: stri_replace_last_coll (opts_collator == NA not allowed) */ SEXP stri_replace_last_coll(SEXP str, SEXP pattern, SEXP replacement, SEXP opts_collator) { return stri__replace_allfirstlast_coll(str, pattern, replacement, opts_collator, -1); } /** * Replace first occurrence of a fixed pattern [with collation] * * @param str character vector * @param pattern character vector * @param replacement character vector * @param opts_collator list * @return character vector * * @version 0.1-?? (Bartek Tartanus) * * @version 0.1-?? 
(Marek Gagolewski, 2013-06-26) * use stri__replace_allfirstlast_fixed * * @version 0.2-3 (Marek Gagolewski, 2014-05-08) * new fun: stri_replace_first_coll (opts_collator == NA not allowed) */ SEXP stri_replace_first_coll(SEXP str, SEXP pattern, SEXP replacement, SEXP opts_collator) { return stri__replace_allfirstlast_coll(str, pattern, replacement, opts_collator, 1); } stringi/src/stri_search_class_count.cpp0000644000176200001440000000760414770541312020142 0ustar liggesusers/* This file is part of the 'stringi' project. * Copyright (c) 2013-2025, Marek Gagolewski * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "stri_stringi.h" #include "stri_container_utf8.h" #include "stri_container_charclass.h" /** * Count the number of occurrences of a character class * * @param str character vector * @param pattern character vector * @return integer vector * * @version 0.1-?? (Marek Gagolewski, 2013-06-02) * * @version 0.1-?? (Marek Gagolewski, 2013-06-15) * Use StrContainerCharClass * * @version 0.1-?? (Marek Gagolewski, 2013-06-16) * make StriException-friendly * * @version 0.2-1 (Marek Gagolewski, 2014-04-03) * detects invalid UTF-8 byte stream * * @version 0.2-1 (Marek Gagolewski, 2014-04-05) * StriContainerCharClass now relies on UnicodeSet * * @version 0.3-1 (Marek Gagolewski, 2014-11-04) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc */ SEXP stri_count_charclass(SEXP str, SEXP pattern) { PROTECT(str = stri__prepare_arg_string(str, "str")); PROTECT(pattern = stri__prepare_arg_string(pattern, "pattern")); R_len_t vectorize_length = stri__recycling_rule(true, 2, LENGTH(str), LENGTH(pattern)); STRI__ERROR_HANDLER_BEGIN(2) StriContainerUTF8 str_cont(str, vectorize_length); StriContainerCharClass pattern_cont(pattern, vectorize_length); SEXP ret; STRI__PROTECT(ret = Rf_allocVector(INTSXP, vectorize_length)); int* ret_tab = INTEGER(ret); for (R_len_t i = pattern_cont.vectorize_init(); i != pattern_cont.vectorize_end(); i = pattern_cont.vectorize_next(i)) { if (str_cont.isNA(i) || pattern_cont.isNA(i)) { ret_tab[i] = NA_INTEGER; 
continue; } const UnicodeSet* pattern_cur = &pattern_cont.get(i); R_len_t str_cur_n = str_cont.get(i).length(); const char* str_cur_s = str_cont.get(i).c_str(); UChar32 chr = 0; R_len_t count = 0; for (R_len_t j=0; jcontains(chr)) ++count; } ret_tab[i] = count; } STRI__UNPROTECT_ALL return ret; STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */) } stringi/src/stri_search_fixed_locate.cpp0000644000176200001440000002624114770541312020251 0ustar liggesusers/* This file is part of the 'stringi' project. * Copyright (c) 2013-2025, Marek Gagolewski * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "stri_stringi.h" #include "stri_container_utf8_indexable.h" #include "stri_container_bytesearch.h" #include #include using namespace std; /** * Locate first or last occurrences of a pattern in a string * * @param str character vector * @param pattern character vector * @param first looking for first or last match? * @return integer matrix (2 columns) * * @version 0.1-?? (Bartlomiej Tartanus) * * @version 0.1-?? (Marek Gagolewski, 2013-06-23) * StriException friendly, use StriContainerByteSearch * * @version 0.2-1 (Marek Gagolewski, 2014-03-20) * Use StriContainerUTF8_indexable * * @version 0.2-3 (Marek Gagolewski, 2014-05-08) * stri_locate_fixed now uses byte search only * * @version 0.3-1 (Marek Gagolewski, 2014-11-05) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc * * @version 0.4-1 (Marek Gagolewski, 2014-12-07) * FR #110, #23: opts_fixed arg added * * @version 0.5-1 (Marek Gagolewski, 2015-02-14) * use StriByteSearchMatcher * * @version 1.7.1 (Marek Gagolewski, 2021-06-29) * get_length */ SEXP stri__locate_firstlast_fixed(SEXP str, SEXP pattern, SEXP opts_fixed, bool first, bool get_length1) { uint32_t pattern_flags = StriContainerByteSearch::getByteSearchFlags(opts_fixed); PROTECT(str = stri__prepare_arg_string(str, "str")); PROTECT(pattern = stri__prepare_arg_string(pattern, "pattern")); STRI__ERROR_HANDLER_BEGIN(2) int vectorize_length = stri__recycling_rule(true, 2, LENGTH(str), LENGTH(pattern)); 
StriContainerUTF8_indexable str_cont(str, vectorize_length); StriContainerByteSearch pattern_cont(pattern, vectorize_length, pattern_flags); SEXP ret; STRI__PROTECT(ret = Rf_allocMatrix(INTSXP, vectorize_length, 2)); stri__locate_set_dimnames_matrix(ret, get_length1); int* ret_tab = INTEGER(ret); for (R_len_t i = pattern_cont.vectorize_init(); i != pattern_cont.vectorize_end(); i = pattern_cont.vectorize_next(i)) { ret_tab[i] = NA_INTEGER; ret_tab[i+vectorize_length] = NA_INTEGER; STRI__CONTINUE_ON_EMPTY_OR_NA_STR_PATTERN( str_cont, pattern_cont, ;/*nothing on NA - keep NA_INTEGER*/, { if (get_length1) ret_tab[i] = ret_tab[i+vectorize_length] = -1; } ) StriByteSearchMatcher* matcher = pattern_cont.getMatcher(i); matcher->reset(str_cont.get(i).c_str(), str_cont.get(i).length()); int start; if (first) { start = matcher->findFirst(); } else { start = matcher->findLast(); } if (start != USEARCH_DONE) { // there is a match ret_tab[i] = start; ret_tab[i+vectorize_length] = start+matcher->getMatchedLength(); // Adjust UTF8 byte index -> UChar32 index str_cont.UTF8_to_UChar32_index(i, ret_tab+i, ret_tab+i+vectorize_length, 1, 1, // 0-based index -> 1-based 0 // end returns position of next character after match ); if (get_length1) ret_tab[i+vectorize_length] -= ret_tab[i] - 1; // to->length } else if (get_length1) { // not found ret_tab[i+vectorize_length] = ret_tab[i] = -1; } // else NA_INTEGER already } STRI__UNPROTECT_ALL return ret; STRI__ERROR_HANDLER_END( ;/* do nothing special on error */ ) } /** * Locate first occurrences of pattern in a string [fixed pattern] * * @param str character vector * @param pattern character vector * @return integer matrix (2 columns) * * @version 0.1-?? (Bartlomiej Tartanus) * * @version 0.1-?? (Bartlomiej Tartanus, 2013-06-09) * StriContainerUTF16 & collator * * @version 0.1-?? 
(Marek Gagolewski, 2013-06-23) * use stri_locate_firstlast_fixed * * @version 0.2-3 (Marek Gagolewski, 2014-05-08) * stri_locate_fixed now uses byte search only * * @version 0.4-1 (Marek Gagolewski, 2014-12-07) * FR #110, #23: opts_fixed arg added * * @version 1.7.1 (Marek Gagolewski, 2021-06-29) * get_length */ SEXP stri_locate_first_fixed(SEXP str, SEXP pattern, SEXP opts_fixed, SEXP get_length) { bool get_length1 = stri__prepare_arg_logical_1_notNA(get_length, "get_length"); return stri__locate_firstlast_fixed(str, pattern, opts_fixed, true, get_length1); } /** * Locate last occurrences of pattern in a string [fixed pattern] * * @param str character vector * @param pattern character vector * @return integer matrix (2 columns) * * @version 0.1-?? (Bartlomiej Tartanus) * * @version 0.1-?? (Bartlomiej Tartanus, 2013-06-09) * StriContainerUTF16 & collator * * @version 0.1-?? (Marek Gagolewski, 2013-06-23) * use stri_locate_firstlast_fixed * * @version 0.2-3 (Marek Gagolewski, 2014-05-08) * stri_locate_fixed now uses byte search only * * @version 0.4-1 (Marek Gagolewski, 2014-12-07) * FR #110, #23: opts_fixed arg added * * @version 1.7.1 (Marek Gagolewski, 2021-06-29) * get_length */ SEXP stri_locate_last_fixed(SEXP str, SEXP pattern, SEXP opts_fixed, SEXP get_length) { bool get_length1 = stri__prepare_arg_logical_1_notNA(get_length, "get_length"); return stri__locate_firstlast_fixed(str, pattern, opts_fixed, false, get_length1); } /** Locate all occurrences of fixed-byte pattern * * @param str character vector * @param pattern character vector * @return list of integer matrices (2 columns) * * @version 0.1-?? (Bartek Tartanus) * * @version 0.1-?? 
(Marek Gagolewski, 2013-06-23) * StriException friendly, use StriContainerByteSearch * * @version 0.2-1 (Marek Gagolewski, 2014-03-20) * Use StriContainerUTF8_indexable * * @version 0.2-3 (Marek Gagolewski, 2014-05-08) * stri_locate_fixed now uses byte search only * * @version 0.3-1 (Marek Gagolewski, 2014-11-05) * #112: str_prepare_arg* retvals were not PROTECTed from gc * * @version 0.4-1 (Marek Gagolewski, 2014-11-27) * #117: omit_no_match arg added * * @version 0.4-1 (Marek Gagolewski, 2014-12-07) * #110, #23: opts_fixed arg added * * @version 0.5-1 (Marek Gagolewski, 2015-02-14) * use StriByteSearchMatcher * * @version 1.7.1 (Marek Gagolewski, 2021-06-29) * get_length */ SEXP stri_locate_all_fixed(SEXP str, SEXP pattern, SEXP omit_no_match, SEXP opts_fixed, SEXP get_length) { uint32_t pattern_flags = StriContainerByteSearch::getByteSearchFlags(opts_fixed, /*allow_overlap*/true); bool omit_no_match1 = stri__prepare_arg_logical_1_notNA(omit_no_match, "omit_no_match"); bool get_length1 = stri__prepare_arg_logical_1_notNA(get_length, "get_length"); PROTECT(str = stri__prepare_arg_string(str, "str")); PROTECT(pattern = stri__prepare_arg_string(pattern, "pattern")); STRI__ERROR_HANDLER_BEGIN(2) int vectorize_length = stri__recycling_rule(true, 2, LENGTH(str), LENGTH(pattern)); StriContainerUTF8_indexable str_cont(str, vectorize_length); StriContainerByteSearch pattern_cont(pattern, vectorize_length, pattern_flags); SEXP ret; STRI__PROTECT(ret = Rf_allocVector(VECSXP, vectorize_length)); for (R_len_t i = pattern_cont.vectorize_init(); i != pattern_cont.vectorize_end(); i = pattern_cont.vectorize_next(i)) { STRI__CONTINUE_ON_EMPTY_OR_NA_STR_PATTERN(str_cont, pattern_cont, SET_VECTOR_ELT(ret, i, stri__matrix_NA_INTEGER(1, 2));, SET_VECTOR_ELT(ret, i, stri__matrix_NA_INTEGER(omit_no_match1?0:1, 2, get_length1?-1:NA_INTEGER));) StriByteSearchMatcher* matcher = pattern_cont.getMatcher(i); matcher->reset(str_cont.get(i).c_str(), str_cont.get(i).length()); int start = 
matcher->findFirst(); if (start == USEARCH_DONE) { // no matches at all SET_VECTOR_ELT(ret, i, stri__matrix_NA_INTEGER(omit_no_match1?0:1, 2, get_length1?-1:NA_INTEGER)); continue; } deque< pair > occurrences; while (start != USEARCH_DONE) { occurrences.push_back(pair(start, start+matcher->getMatchedLength())); start = matcher->findNext(); } R_len_t noccurrences = (R_len_t)occurrences.size(); SEXP ans; STRI__PROTECT(ans = Rf_allocMatrix(INTSXP, noccurrences, 2)); int* ans_tab = INTEGER(ans); deque< pair >::iterator iter = occurrences.begin(); for (R_len_t j = 0; iter != occurrences.end(); ++iter, ++j) { pair match = *iter; ans_tab[j] = match.first; ans_tab[j+noccurrences] = match.second; } // Adjust UChar index -> UChar32 index (1-2 byte UTF16 to 1 byte UTF32-code points) str_cont.UTF8_to_UChar32_index(i, ans_tab, ans_tab+noccurrences, noccurrences, 1, // 0-based index -> 1-based 0 // end returns position of next character after match ); if (get_length1) { for (R_len_t j=0; j < noccurrences; ++j) ans_tab[j+noccurrences] -= ans_tab[j] - 1; // to->length } SET_VECTOR_ELT(ret, i, ans); STRI__UNPROTECT(1); } stri__locate_set_dimnames_list(ret, get_length1); STRI__UNPROTECT_ALL return ret; STRI__ERROR_HANDLER_END( ;/* do nothing special on error */ ) } stringi/src/stri_search_fixed_subset.cpp0000644000176200001440000001771614770541312020316 0ustar liggesusers/* This file is part of the 'stringi' project. * Copyright (c) 2013-2025, Marek Gagolewski * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. 
Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "stri_stringi.h" #include "stri_container_utf8.h" #include "stri_container_utf16.h" #include "stri_container_usearch.h" #include "stri_container_bytesearch.h" #include /** * Returns vector elements if a pattern occurs in a string * * @param str character vector * @param pattern character vector * @param omit_na single logical value * @param opts_fixed list * @return character vector * * @version 0.3-1 (Bartek Tartanus, 2014-07-25) * * @version 0.3-1 (Marek Gagolewski, 2014-10-17) * using std::vector to avoid mem-leaks * * @version 0.3-1 (Marek Gagolewski, 2014-11-04) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc * * @version 0.4-1 (Marek Gagolewski, 2014-12-04) * #122: omit_na arg added * * @version 0.4-1 (Marek Gagolewski, 2014-12-07) * #110, #23: opts_fixed arg added * * @version 0.5-1 (Marek Gagolewski, 2015-02-14) * use StriByteSearchMatcher * * @version 1.0-3 (Marek Gagolewski, 2016-02-03) * #216: `negate` arg added * * @version 1.7.1 (Marek Gagolewski, 2021-06-17) * assure 
LENGTH(pattern) <= LENGTH(str) */ SEXP stri_subset_fixed(SEXP str, SEXP pattern, SEXP omit_na, SEXP negate, SEXP opts_fixed) { bool negate_1 = stri__prepare_arg_logical_1_notNA(negate, "negate"); uint32_t pattern_flags = StriContainerByteSearch::getByteSearchFlags(opts_fixed); bool omit_na1 = stri__prepare_arg_logical_1_notNA(omit_na, "omit_na"); PROTECT(str = stri__prepare_arg_string(str, "str")); PROTECT(pattern = stri__prepare_arg_string(pattern, "pattern")); if (LENGTH(str) > 0 && LENGTH(str) < LENGTH(pattern)) Rf_error(MSG__WARN_RECYCLING_RULE2); int vectorize_length = stri__recycling_rule(true, 2, LENGTH(str), LENGTH(pattern)); if (vectorize_length == 0) { UNPROTECT(2); return Rf_allocVector(STRSXP, 0); } STRI__ERROR_HANDLER_BEGIN(2) StriContainerUTF8 str_cont(str, vectorize_length); StriContainerByteSearch pattern_cont(pattern, vectorize_length, pattern_flags); // BT: this cannot be done with deque, because pattern is reused so i does not // go like 0,1,2...n but 0,pat_len,2*pat_len,1,pat_len+1 and so on // MG: agreed std::vector which(vectorize_length); int result_counter = 0; for (R_len_t i = pattern_cont.vectorize_init(); i != pattern_cont.vectorize_end(); i = pattern_cont.vectorize_next(i)) { STRI__CONTINUE_ON_EMPTY_OR_NA_STR_PATTERN(str_cont, pattern_cont, {if (omit_na1) which[i] = FALSE; else { which[i] = NA_LOGICAL; result_counter++; } }, {which[i] = negate_1; if (which[i]) result_counter++;}) StriByteSearchMatcher* matcher = pattern_cont.getMatcher(i); matcher->reset(str_cont.get(i).c_str(), str_cont.get(i).length()); which[i] = (int)(matcher->findFirst() != USEARCH_DONE); if (negate_1) which[i] = !which[i]; if (which[i]) result_counter++; } SEXP ret; STRI__PROTECT(ret = stri__subset_by_logical(str_cont, which, result_counter)); STRI__UNPROTECT_ALL return ret; STRI__ERROR_HANDLER_END( ;/* do nothing special on error */ ) } /** * Substitutes vector elements if a pattern occurs in a string * * @param str character vector * @param pattern character 
vector * @param opts_fixed list * @param value character vector * @return character vector * * @version 1.0-3 (Marek Gagolewski, 2016-02-02) * #124 * * @version 1.0-3 (Marek Gagolewski, 2016-02-03) * #216: `negate` arg added * * @version 1.7.1 (Marek Gagolewski, 2021-06-17) * assure LENGTH(pattern) and LENGTH(value) <= LENGTH(str) */ SEXP stri_subset_fixed_replacement(SEXP str, SEXP pattern, SEXP negate, SEXP opts_fixed, SEXP value) { bool negate_1 = stri__prepare_arg_logical_1_notNA(negate, "negate"); uint32_t pattern_flags = StriContainerByteSearch::getByteSearchFlags(opts_fixed); PROTECT(str = stri__prepare_arg_string(str, "str")); PROTECT(pattern = stri__prepare_arg_string(pattern, "pattern")); PROTECT(value = stri__prepare_arg_string(value, "value")); // we are subsetting `str`, therefore recycling is slightly different here if (LENGTH(value) == 0) Rf_error(MSG__REPLACEMENT_ZERO); if (LENGTH(pattern) == 0) Rf_error(MSG__WARN_EMPTY_VECTOR); if (LENGTH(str) == 0) { UNPROTECT(3); return Rf_allocVector(STRSXP, 0); } if (LENGTH(str) < LENGTH(pattern)) // for LENGTH(value), we emit warning later on Rf_error(MSG__WARN_RECYCLING_RULE2); if ((LENGTH(str) % LENGTH(pattern)) != 0) Rf_warning(MSG__WARN_RECYCLING_RULE); R_len_t vectorize_length = LENGTH(str); STRI__ERROR_HANDLER_BEGIN(3) R_len_t value_length = LENGTH(value); StriContainerUTF8 value_cont(value, value_length); StriContainerUTF8 str_cont(str, vectorize_length); StriContainerByteSearch pattern_cont(pattern, vectorize_length, pattern_flags); SEXP ret; STRI__PROTECT(ret = Rf_allocVector(STRSXP, vectorize_length)); std::vector detected(vectorize_length, 0); for (R_len_t i = pattern_cont.vectorize_init(); i != pattern_cont.vectorize_end(); i = pattern_cont.vectorize_next(i)) { if (pattern_cont.isNA(i)) { // behave like `[<-` detected[i] = false; continue; } STRI__CONTINUE_ON_EMPTY_OR_NA_STR_PATTERN(str_cont, pattern_cont, {detected[i] = NA_INTEGER;}, {detected[i] = negate_1;} ) StriByteSearchMatcher* matcher = 
pattern_cont.getMatcher(i); matcher->reset(str_cont.get(i).c_str(), str_cont.get(i).length()); detected[i] = (((int)(matcher->findFirst() != USEARCH_DONE) && !negate_1) || ((int)(matcher->findFirst() == USEARCH_DONE) && negate_1)); } R_len_t k = 0; // we must traverse `str_cont` in order now for (R_len_t i = 0; i * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "stri_stringi.h" #include "stri_container_utf8_indexable.h" #include "stri_container_integer.h" #include "stri_brkiter.h" /** Count the number of BreakIterator boundaries * * @param str character vector * @param opts_brkiter identifier * @return character vector * * @version 0.3-1 (Marek Gagolewski, 2014-10-30) * * @version 0.3-1 (Marek Gagolewski, 2014-11-04) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc * * @version 0.4-1 (Marek Gagolewski, 2014-12-02) * use StriRuleBasedBreakIterator */ SEXP stri_count_boundaries(SEXP str, SEXP opts_brkiter) { PROTECT(str = stri__prepare_arg_string(str, "str")); StriBrkIterOptions opts_brkiter2(opts_brkiter, "line_break"); STRI__ERROR_HANDLER_BEGIN(1) R_len_t str_length = LENGTH(str); StriContainerUTF8_indexable str_cont(str, str_length); SEXP ret; STRI__PROTECT(ret = Rf_allocVector(INTSXP, str_length)); StriRuleBasedBreakIterator brkiter(opts_brkiter2); for (R_len_t i = 0; i < str_length; ++i) { if (str_cont.isNA(i)) { INTEGER(ret)[i] = NA_INTEGER; continue; } brkiter.setupMatcher(str_cont.get(i).c_str(), str_cont.get(i).length()); brkiter.first(); R_len_t cur_count = 0; while (brkiter.next()) ++cur_count; INTEGER(ret)[i] = cur_count; } STRI__UNPROTECT_ALL return ret; STRI__ERROR_HANDLER_END({ /* no action */ }) } stringi/src/stri_stringi.cpp0000644000176200001440000005433714750143163015764 0ustar liggesusers/* This file is part of the 'stringi' project. * Copyright (c) 2013-2025, Marek Gagolewski * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. 
Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */


#include "stri_stringi.h"
#include "stri_callables.h"
// NOTE(review): the header names of the next three #include directives are
// missing in this copy of the file (text between angle brackets appears to
// have been stripped) -- restore them from a pristine source.
#include
#include
#include

#ifndef STRI_ICU_FOUND
#include "uconfig_local.h"
#endif


// Expands to one brace-enclosed table entry: `symb` is the registered name
// string, `name` the C function (its address is converted to DL_FUNC by way
// of a generic void(*)(void) function pointer), `args` the argument count.
#define STRI__MK_CALL(symb, name, args) \
    {symb, (DL_FUNC)(void (*) (void))(&name), args}


/**
 * List of functions available via .Call() in R
 *
 * Form:
 * \code{\{"method_name", (DL_FUNC)pointer, number_of_args\}} -
 * this is generated by the STRI__MK_CALL macro.
*/
// One entry per routine; the last number is the argument count handed to
// the STRI__MK_CALL macro. R_init_stringi() below passes this table to
// R_registerRoutines() and also walks it to register every entry with
// R_RegisterCCallable(), stopping at the first entry whose name is NULL.
const R_CallMethodDef cCallMethods[] = {

//     STRI__MK_CALL("C_stri_c_posixst", stri_c_posixst, 1), // internal
    STRI__MK_CALL("C_stri_cmp_eq", stri_cmp_eq, 2),
    STRI__MK_CALL("C_stri_cmp_neq", stri_cmp_neq, 2),
    STRI__MK_CALL("C_stri_cmp", stri_cmp, 3),
    STRI__MK_CALL("C_stri_cmp_lt", stri_cmp_lt, 3),
    STRI__MK_CALL("C_stri_cmp_le", stri_cmp_le, 3),
    STRI__MK_CALL("C_stri_cmp_gt", stri_cmp_gt, 3),
    STRI__MK_CALL("C_stri_cmp_ge", stri_cmp_ge, 3),
    STRI__MK_CALL("C_stri_cmp_equiv", stri_cmp_equiv, 3),
    STRI__MK_CALL("C_stri_cmp_nequiv", stri_cmp_nequiv, 3),
    STRI__MK_CALL("C_stri_count_boundaries", stri_count_boundaries, 2),
    STRI__MK_CALL("C_stri_count_charclass", stri_count_charclass, 2),
    STRI__MK_CALL("C_stri_count_fixed", stri_count_fixed, 3),
    STRI__MK_CALL("C_stri_count_coll", stri_count_coll, 3),
    STRI__MK_CALL("C_stri_count_regex", stri_count_regex, 3),
    STRI__MK_CALL("C_stri_datetime_symbols", stri_datetime_symbols, 3),
    STRI__MK_CALL("C_stri_datetime_fields", stri_datetime_fields, 3),
    STRI__MK_CALL("C_stri_datetime_fstr", stri_datetime_fstr, 1),
    STRI__MK_CALL("C_stri_datetime_now", stri_datetime_now, 0),
    STRI__MK_CALL("C_stri_datetime_create", stri_datetime_create, 9),
    STRI__MK_CALL("C_stri_datetime_format", stri_datetime_format, 4),
    STRI__MK_CALL("C_stri_datetime_parse", stri_datetime_parse, 5),
    STRI__MK_CALL("C_stri_datetime_add", stri_datetime_add, 5),
    STRI__MK_CALL("C_stri_detect_charclass", stri_detect_charclass, 4),
    STRI__MK_CALL("C_stri_detect_coll", stri_detect_coll, 5),
    STRI__MK_CALL("C_stri_detect_fixed", stri_detect_fixed, 5),
    STRI__MK_CALL("C_stri_detect_regex", stri_detect_regex, 5),
    STRI__MK_CALL("C_stri_dup", stri_dup, 2),
    STRI__MK_CALL("C_stri_duplicated", stri_duplicated, 3),
    STRI__MK_CALL("C_stri_duplicated_any", stri_duplicated_any, 3),
    STRI__MK_CALL("C_stri_enc_detect", stri_enc_detect, 2),
    STRI__MK_CALL("C_stri_enc_detect2", stri_enc_detect2, 2),
    STRI__MK_CALL("C_stri_enc_isutf8", stri_enc_isutf8, 1),
    STRI__MK_CALL("C_stri_enc_isutf16le", stri_enc_isutf16le, 1),
    STRI__MK_CALL("C_stri_enc_isutf16be", stri_enc_isutf16be, 1),
    STRI__MK_CALL("C_stri_enc_isutf32le", stri_enc_isutf32le, 1),
    STRI__MK_CALL("C_stri_enc_isutf32be", stri_enc_isutf32be, 1),
    STRI__MK_CALL("C_stri_enc_isascii", stri_enc_isascii, 1),
    STRI__MK_CALL("C_stri_enc_info", stri_enc_info, 1),
    STRI__MK_CALL("C_stri_enc_list", stri_enc_list, 0),
    STRI__MK_CALL("C_stri_enc_mark", stri_enc_mark, 1),
    STRI__MK_CALL("C_stri_enc_set", stri_enc_set, 1),
    STRI__MK_CALL("C_stri_enc_fromutf32", stri_enc_fromutf32, 1),
    STRI__MK_CALL("C_stri_enc_toascii", stri_enc_toascii, 1),
    STRI__MK_CALL("C_stri_enc_toutf8", stri_enc_toutf8, 3),
    STRI__MK_CALL("C_stri_enc_toutf32", stri_enc_toutf32, 1),
    STRI__MK_CALL("C_stri_encode", stri_encode, 4),
    STRI__MK_CALL("C_stri_endswith_charclass", stri_endswith_charclass, 4),
    STRI__MK_CALL("C_stri_endswith_coll", stri_endswith_coll, 5),
    STRI__MK_CALL("C_stri_endswith_fixed", stri_endswith_fixed, 5),
    STRI__MK_CALL("C_stri_escape_unicode", stri_escape_unicode, 1),
    STRI__MK_CALL("C_stri_extract_first_boundaries", stri_extract_first_boundaries, 2),
    STRI__MK_CALL("C_stri_extract_last_boundaries", stri_extract_last_boundaries, 2),
    STRI__MK_CALL("C_stri_extract_all_boundaries", stri_extract_all_boundaries, 4),
    STRI__MK_CALL("C_stri_extract_first_charclass", stri_extract_first_charclass, 2),
    STRI__MK_CALL("C_stri_extract_last_charclass", stri_extract_last_charclass, 2),
    STRI__MK_CALL("C_stri_extract_all_charclass", stri_extract_all_charclass, 5),
    STRI__MK_CALL("C_stri_extract_first_coll", stri_extract_first_coll, 3),
    STRI__MK_CALL("C_stri_extract_last_coll", stri_extract_last_coll, 3),
    STRI__MK_CALL("C_stri_extract_all_coll", stri_extract_all_coll, 5),
    STRI__MK_CALL("C_stri_extract_first_fixed", stri_extract_first_fixed, 3),
    STRI__MK_CALL("C_stri_extract_last_fixed", stri_extract_last_fixed, 3),
    STRI__MK_CALL("C_stri_extract_all_fixed", stri_extract_all_fixed, 5),
    STRI__MK_CALL("C_stri_extract_first_regex", stri_extract_first_regex, 3),
    STRI__MK_CALL("C_stri_extract_last_regex", stri_extract_last_regex, 3),
    STRI__MK_CALL("C_stri_extract_all_regex", stri_extract_all_regex, 5),
    STRI__MK_CALL("C_stri_flatten", stri_flatten, 4),
    STRI__MK_CALL("C_stri_info", stri_info, 0),
    STRI__MK_CALL("C_stri_isempty", stri_isempty, 1),
    STRI__MK_CALL("C_stri_join", stri_join, 4),
    STRI__MK_CALL("C_stri_join_list", stri_join_list, 3),
    STRI__MK_CALL("C_stri_join2", stri_join2, 2),
    STRI__MK_CALL("C_stri_length", stri_length, 1),
    STRI__MK_CALL("C_stri_list2matrix", stri_list2matrix, 4),
    STRI__MK_CALL("C_stri_locale_info", stri_locale_info, 1),
    STRI__MK_CALL("C_stri_locale_list", stri_locale_list, 0),
    STRI__MK_CALL("C_stri_locale_set", stri_locale_set, 1),
    STRI__MK_CALL("C_stri_locate_all_boundaries", stri_locate_all_boundaries, 4),
    STRI__MK_CALL("C_stri_locate_first_boundaries", stri_locate_first_boundaries, 3),
    STRI__MK_CALL("C_stri_locate_last_boundaries", stri_locate_last_boundaries, 3),
    STRI__MK_CALL("C_stri_locate_first_charclass", stri_locate_first_charclass, 3),
    STRI__MK_CALL("C_stri_locate_last_charclass", stri_locate_last_charclass, 3),
    STRI__MK_CALL("C_stri_locate_all_charclass", stri_locate_all_charclass, 5),
    STRI__MK_CALL("C_stri_locate_last_fixed", stri_locate_last_fixed, 4),
    STRI__MK_CALL("C_stri_locate_first_fixed", stri_locate_first_fixed, 4),
    STRI__MK_CALL("C_stri_locate_all_fixed", stri_locate_all_fixed, 5),
    STRI__MK_CALL("C_stri_locate_last_coll", stri_locate_last_coll, 4),
    STRI__MK_CALL("C_stri_locate_first_coll", stri_locate_first_coll, 4),
    STRI__MK_CALL("C_stri_locate_all_coll", stri_locate_all_coll, 5),
    STRI__MK_CALL("C_stri_locate_all_regex", stri_locate_all_regex, 6),
    STRI__MK_CALL("C_stri_locate_first_regex", stri_locate_first_regex, 5),
    STRI__MK_CALL("C_stri_locate_last_regex", stri_locate_last_regex, 5),
    STRI__MK_CALL("C_stri_match_first_regex", stri_match_first_regex, 4),
    STRI__MK_CALL("C_stri_match_last_regex", stri_match_last_regex, 4),
    STRI__MK_CALL("C_stri_match_all_regex", stri_match_all_regex, 5),
    STRI__MK_CALL("C_stri_numbytes", stri_numbytes, 1),
    STRI__MK_CALL("C_stri_order", stri_order, 4),
    STRI__MK_CALL("C_stri_rank", stri_rank, 2),
    STRI__MK_CALL("C_stri_sort", stri_sort, 4),
    STRI__MK_CALL("C_stri_sort_key", stri_sort_key, 2),
    STRI__MK_CALL("C_stri_pad", stri_pad, 5),
    STRI__MK_CALL("C_stri_prepare_arg_string", stri_prepare_arg_string, 2),
    STRI__MK_CALL("C_stri_prepare_arg_double", stri_prepare_arg_double, 2),
    STRI__MK_CALL("C_stri_prepare_arg_integer", stri_prepare_arg_integer, 2),
    STRI__MK_CALL("C_stri_prepare_arg_logical", stri_prepare_arg_logical, 2),
    STRI__MK_CALL("C_stri_prepare_arg_raw", stri_prepare_arg_raw, 2),
    STRI__MK_CALL("C_stri_prepare_arg_string_1", stri_prepare_arg_string_1, 2),
    STRI__MK_CALL("C_stri_prepare_arg_double_1", stri_prepare_arg_double_1, 2),
    STRI__MK_CALL("C_stri_prepare_arg_integer_1", stri_prepare_arg_integer_1, 2),
    STRI__MK_CALL("C_stri_prepare_arg_logical_1", stri_prepare_arg_logical_1, 2),
    STRI__MK_CALL("C_stri_rand_shuffle", stri_rand_shuffle, 1),
    STRI__MK_CALL("C_stri_rand_strings", stri_rand_strings, 3),
    STRI__MK_CALL("C_stri_replace_na", stri_replace_na, 2),
    STRI__MK_CALL("C_stri_replace_rstr", stri_replace_rstr, 1),
    STRI__MK_CALL("C_stri_replace_all_fixed", stri_replace_all_fixed, 5),
    STRI__MK_CALL("C_stri_replace_first_fixed", stri_replace_first_fixed, 4),
    STRI__MK_CALL("C_stri_replace_last_fixed", stri_replace_last_fixed, 4),
    STRI__MK_CALL("C_stri_replace_all_coll", stri_replace_all_coll, 5),
    STRI__MK_CALL("C_stri_replace_first_coll", stri_replace_first_coll, 4),
    STRI__MK_CALL("C_stri_replace_last_coll", stri_replace_last_coll, 4),
    STRI__MK_CALL("C_stri_replace_all_regex", stri_replace_all_regex, 5),
    STRI__MK_CALL("C_stri_replace_first_regex", stri_replace_first_regex, 4),
    STRI__MK_CALL("C_stri_replace_last_regex", stri_replace_last_regex, 4),
    STRI__MK_CALL("C_stri_replace_all_charclass", stri_replace_all_charclass, 5),
    STRI__MK_CALL("C_stri_replace_first_charclass", stri_replace_first_charclass, 3),
    STRI__MK_CALL("C_stri_replace_last_charclass", stri_replace_last_charclass, 3),
    STRI__MK_CALL("C_stri_reverse", stri_reverse, 1),
    STRI__MK_CALL("C_stri_split_boundaries", stri_split_boundaries, 5),
    STRI__MK_CALL("C_stri_split_charclass", stri_split_charclass, 6),
    STRI__MK_CALL("C_stri_split_coll", stri_split_coll, 7),
    STRI__MK_CALL("C_stri_split_fixed", stri_split_fixed, 7),
    STRI__MK_CALL("C_stri_split_lines", stri_split_lines, 2),
    STRI__MK_CALL("C_stri_split_lines1", stri_split_lines1, 1),
    STRI__MK_CALL("C_stri_split_regex", stri_split_regex, 7),
    STRI__MK_CALL("C_stri_sprintf", stri_sprintf, 6),
    STRI__MK_CALL("C_stri_startswith_charclass", stri_startswith_charclass, 4),
    STRI__MK_CALL("C_stri_startswith_coll", stri_startswith_coll, 5),
    STRI__MK_CALL("C_stri_startswith_fixed", stri_startswith_fixed, 5),
    STRI__MK_CALL("C_stri_stats_general", stri_stats_general, 1),
    STRI__MK_CALL("C_stri_stats_latex", stri_stats_latex, 1),
    STRI__MK_CALL("C_stri_sub", stri_sub, 6),
    STRI__MK_CALL("C_stri_sub_all", stri_sub_all, 6),
    STRI__MK_CALL("C_stri_sub_replacement", stri_sub_replacement, 7),
    STRI__MK_CALL("C_stri_sub_replacement_all", stri_sub_replacement_all, 7),
    STRI__MK_CALL("C_stri_subset_charclass", stri_subset_charclass, 4),
    STRI__MK_CALL("C_stri_subset_coll", stri_subset_coll, 5),
    STRI__MK_CALL("C_stri_subset_fixed", stri_subset_fixed, 5),
    STRI__MK_CALL("C_stri_subset_regex", stri_subset_regex, 5),
    STRI__MK_CALL("C_stri_subset_charclass_replacement", stri_subset_charclass_replacement, 4),
    STRI__MK_CALL("C_stri_subset_coll_replacement", stri_subset_coll_replacement, 5),
    STRI__MK_CALL("C_stri_subset_fixed_replacement", stri_subset_fixed_replacement, 5),
    STRI__MK_CALL("C_stri_subset_regex_replacement", stri_subset_regex_replacement, 5),
    STRI__MK_CALL("C_stri_test_Rmark", stri_test_Rmark, 1),
    STRI__MK_CALL("C_stri_test_returnasis", stri_test_returnasis, 1),
    STRI__MK_CALL("C_stri_test_UnicodeContainer16", stri_test_UnicodeContainer16, 1),
    STRI__MK_CALL("C_stri_test_UnicodeContainer16b", stri_test_UnicodeContainer16b, 1),
    STRI__MK_CALL("C_stri_test_UnicodeContainer8", stri_test_UnicodeContainer8, 1),
    STRI__MK_CALL("C_stri_timezone_list", stri_timezone_list, 2),
    STRI__MK_CALL("C_stri_timezone_set", stri_timezone_set, 1),
    STRI__MK_CALL("C_stri_timezone_info", stri_timezone_info, 3),
    STRI__MK_CALL("C_stri_trans_char", stri_trans_char, 3),
    STRI__MK_CALL("C_stri_trans_isnfc", stri_trans_isnfc, 1),
    STRI__MK_CALL("C_stri_trans_isnfd", stri_trans_isnfd, 1),
    STRI__MK_CALL("C_stri_trans_isnfkc", stri_trans_isnfkc, 1),
    STRI__MK_CALL("C_stri_trans_isnfkd", stri_trans_isnfkd, 1),
    STRI__MK_CALL("C_stri_trans_isnfkc_casefold", stri_trans_isnfkc_casefold, 1),
    STRI__MK_CALL("C_stri_trans_general", stri_trans_general, 4),
    STRI__MK_CALL("C_stri_trans_list", stri_trans_list, 0),
    STRI__MK_CALL("C_stri_trans_nfc", stri_trans_nfc, 1),
    STRI__MK_CALL("C_stri_trans_nfd", stri_trans_nfd, 1),
    STRI__MK_CALL("C_stri_trans_nfkc", stri_trans_nfkc, 1),
    STRI__MK_CALL("C_stri_trans_nfkd", stri_trans_nfkd, 1),
    STRI__MK_CALL("C_stri_trans_nfkc_casefold", stri_trans_nfkc_casefold, 1),
    STRI__MK_CALL("C_stri_trans_totitle", stri_trans_totitle, 2),
    STRI__MK_CALL("C_stri_trans_tolower", stri_trans_tolower, 2),
    STRI__MK_CALL("C_stri_trans_toupper", stri_trans_toupper, 2),
    STRI__MK_CALL("C_stri_trans_casefold", stri_trans_casefold, 1),
    STRI__MK_CALL("C_stri_trim_both", stri_trim_both, 3),
    STRI__MK_CALL("C_stri_trim_left", stri_trim_left, 3),
    STRI__MK_CALL("C_stri_trim_right", stri_trim_right, 3),
    STRI__MK_CALL("C_stri_unescape_unicode", stri_unescape_unicode, 1),
    STRI__MK_CALL("C_stri_unique", stri_unique, 2),
    STRI__MK_CALL("C_stri_width", stri_width, 1),
    STRI__MK_CALL("C_stri_wrap", stri_wrap, 10),

    // the list must be NULL-terminated:
    {NULL, NULL, 0}

};


/** Sets ICU data dir
 *
 * Truncates `libpath` right after the last occurrence of "libs" and passes
 * the result to u_setDataDirectory(); when "libs" does not occur at all,
 * `libpath` is passed unchanged.
 *
 * @param libpath C string, the path to derive the data directory from
 */
void stri_set_icu_data_directory(const char* libpath)
{
    // libpath == "...../libs" -> "...../libs"
    // libpath == "...../libs/i386" -> "...../libs"
    // libpath == "...../libs/x64" -> "...../libs"
    string dir(libpath);
    size_t idx = dir.rfind("libs");
    if (idx == string::npos) { // this shouldn't happen
        u_setDataDirectory(libpath); // just use the libpath
        return;
    }

    // keep everything up to and including "libs" (idx+4 characters);
    // whatever follows (e.g. "/i386", "/x64") is dropped
    dir = dir.substr(0, idx+4); // 4 == strlen("libs")
    u_setDataDirectory(dir.c_str());

// #ifndef NDEBUG
//    fprintf(stderr, "ICU data directory=%s\n", dir.c_str());
// #endif

    // anyway, if .dat file will not be found,
    // ICU will use system data (may be stub)

    // 1. Examine the contents of the default ICU data shared library.
    // If it contains data, use that data.
    // If the data library is empty, a stub library, proceed to the next step.
    // 2. Dynamically load (memory map, typically) a common format (.dat) file
    // containing the default ICU data.
}


/**
 * Library initialization.
 *
 * R calls this automatically on lib load/attach.
 *
 * Steps, in order: (optionally) point ICU at its data directory, run
 * u_init(), replace a "C" default locale with en_US_POSIX, register the
 * .Call table, and expose every entry of cCallMethods and stri_callables
 * through R_RegisterCCallable(). Any ICU failure is reported via Rf_error().
 *
 * @param dll handle supplied by R
 */
extern "C" void R_init_stringi(DllInfo* dll)
{
#if STRI_ICU_FOUND == 0
    // NOTE(review): reads the first pointer-sized field of DllInfo as the
    // library path (see the inline `dll->path` remark) -- this depends on
    // R's internal struct layout; confirm it still holds.
    stri_set_icu_data_directory((char*)*(char**)(dll) /* dll->path */);
#endif

    /* BTW: u_init: It is OK to simply use ICU services and functions without
    first having initialized ICU by calling u_init(). u_init() will attempt to
    load some part of ICU's data, and is useful as a test for configuration or
    installation problems that leave the ICU data inaccessible. A successful
    invocation of u_init() does not, however, guarantee that all ICU data are
    accessible. */
    UErrorCode status = U_ZERO_ERROR;
    u_init(&status);
    if (U_FAILURE(status))
        Rf_error("ICU init failed: %s", u_errorName(status));

    if (stri__is_C_locale(uloc_getDefault())) {
        // C locale -> en_US_POSIX
        status = U_ZERO_ERROR;
        uloc_setDefault("en_US_POSIX", &status);
        if (U_FAILURE(status))
            Rf_error("ICU init failed: %s", u_errorName(status));
    }

    R_registerRoutines(dll, NULL, cCallMethods, NULL, NULL);
    R_useDynamicSymbols(dll, (Rboolean)FALSE);
#if defined(R_VERSION) && R_VERSION >= R_Version(3, 0, 0)
    R_forceSymbols(dll, (Rboolean)TRUE);
#endif

    // make every .Call routine reachable under the "stringi" package name;
    // the walk ends at the NULL-named terminator entry
    const R_CallMethodDef* methods = cCallMethods;
    while (methods->name) {
        R_RegisterCCallable("stringi", methods->name, methods->fun);
        methods++;
    }

    // same for the second table, stri_callables (defined elsewhere)
    methods = stri_callables;
    while (methods->name) {
        R_RegisterCCallable("stringi", methods->name, methods->fun);
        methods++;
    }

    if (!SUPPORT_UTF8) {
        /* Rconfig.h states that all R platforms support that */
        Rf_error("R does not support UTF-8 encoding.");
    }
}


#ifndef NDEBUG

// NOTE(review): the header name of this #include is missing in this copy of
// the file (stripped together with its angle brackets) -- restore it from a
// pristine source.
#include

/**
 * Library cleanup
 *
 * Compiled only when NDEBUG is not defined; calls u_cleanup().
 */
extern "C" void R_unload_stringi(DllInfo*)
{
    // see http://bugs.icu-project.org/trac/ticket/10897
    // and https://github.com/Rexamine/stringi/issues/78
    u_cleanup();
}

#endif
stringi/src/stri_container_double.h0000644000176200001440000000723014770541312017254 0ustar liggesusers/* This file is part of the 'stringi' project. * Copyright (c) 2013-2025, Marek Gagolewski * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3.
Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef __stri_container_double_h #define __stri_container_double_h #include "stri_container_base.h" /** * A wrapper-class for R double vectors * * @version 0.5-1 (Marek Gagolewski, 2014-12-30) */ class StriContainerDouble : public StriContainerBase { private: double* data; public: StriContainerDouble() : StriContainerBase() { data = NULL; } StriContainerDouble(SEXP rvec, R_len_t _nrecycle) { this->data = NULL; #ifndef NDEBUG if (!Rf_isReal(rvec)) throw StriException("DEBUG: !Rf_isReal in StriContainerDouble"); #endif R_len_t ndata = LENGTH(rvec); this->init_Base(ndata, _nrecycle, true); this->data = REAL(rvec); // TODO: ALTREP will be problematic? 
} // StriContainerDouble(StriContainerDouble& container); // default-shallow // ~StriContainerDouble(); // default-shallow // StriContainerDouble& operator=(StriContainerDouble& container); // default-shallow /** check if the vectorized ith element is NA * @param i index * @return true if is NA */ inline bool isNA(R_len_t i) const { #ifndef NDEBUG if (i < 0 || i >= nrecycle) throw StriException("StriContainerDouble::isNA(): INDEX OUT OF BOUNDS"); #endif return (ISNA(data[i%n])); } /** get the vectorized ith element * * @param i index * @return double */ inline double get(R_len_t i) const { #ifndef NDEBUG if (i < 0 || i >= nrecycle) throw StriException("StriContainerDouble::get(): INDEX OUT OF BOUNDS"); if (ISNA(data[i%n])) throw StriException("StriContainerDouble::get(): isNA"); #endif return (data[i%n]); } /** get the vectorized ith element, no NA check here * * @param i index * @return double */ inline double getNAble(R_len_t i) const { #ifndef NDEBUG if (i < 0 || i >= nrecycle) throw StriException("StriContainerDouble::get(): INDEX OUT OF BOUNDS"); #endif return (data[i%n]); } }; #endif stringi/src/stri_search_class_detect.cpp0000644000176200001440000001105214770541312020252 0ustar liggesusers/* This file is part of the 'stringi' project. * Copyright (c) 2013-2025, Marek Gagolewski * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. 
Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "stri_stringi.h" #include "stri_container_utf8.h" #include "stri_container_charclass.h" /** * Detect if a character class occurs in a string * * @param str character vector * @param pattern character vector * @param negate single bool * @param max_count single int * @return logical vector * * @version 0.1-?? (Bartek Tartanus) * * @version 0.1-?? (Marek Gagolewski, 2013-06-02) * Use StrContainerUTF8 and CharClass classes * * @version 0.1-?? (Marek Gagolewski, 2013-06-15) * Use StrContainerCharClass * * @version 0.1-?? 
(Marek Gagolewski, 2013-06-16) * make StriException-friendly * * @version 0.2-1 (Marek Gagolewski, 2014-04-03) * detects invalid UTF-8 byte stream * * @version 0.2-1 (Marek Gagolewski, 2014-04-05) * StriContainerCharClass now relies on UnicodeSet * * @version 0.3-1 (Marek Gagolewski, 2014-11-04) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc * * @version 1.0-3 (Marek Gagolewski, 2016-02-03) * FR #216: `negate` arg added * * @version 1.3.1 (Marek Gagolewski, 2019-02-08) * #232: `max_count` arg added */ SEXP stri_detect_charclass(SEXP str, SEXP pattern, SEXP negate, SEXP max_count) { bool negate_1 = stri__prepare_arg_logical_1_notNA(negate, "negate"); int max_count_1 = stri__prepare_arg_integer_1_notNA(max_count, "max_count"); PROTECT(str = stri__prepare_arg_string(str, "str")); PROTECT(pattern = stri__prepare_arg_string(pattern, "pattern")); R_len_t vectorize_length = stri__recycling_rule(true, 2, LENGTH(str), LENGTH(pattern)); STRI__ERROR_HANDLER_BEGIN(2) StriContainerUTF8 str_cont(str, vectorize_length); StriContainerCharClass pattern_cont(pattern, vectorize_length); SEXP ret; STRI__PROTECT(ret = Rf_allocVector(LGLSXP, vectorize_length)); int* ret_tab = LOGICAL(ret); for (R_len_t i = pattern_cont.vectorize_init(); i != pattern_cont.vectorize_end(); i = pattern_cont.vectorize_next(i)) { if (max_count_1 == 0 || str_cont.isNA(i) || pattern_cont.isNA(i)) { ret_tab[i] = NA_LOGICAL; continue; } const UnicodeSet* pattern_cur = &pattern_cont.get(i); R_len_t str_cur_n = str_cont.get(i).length(); const char* str_cur_s = str_cont.get(i).c_str(); UChar32 chr = 0; ret_tab[i] = FALSE; for (R_len_t j=0; jcontains(chr)) { ret_tab[i] = TRUE; break; } } if (negate_1) ret_tab[i] = !ret_tab[i]; if (max_count_1 > 0 && ret_tab[i]) --max_count_1; } STRI__UNPROTECT_ALL return ret; STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */) } stringi/src/stri_exports.h0000644000176200001440000004650414750110642015447 0ustar liggesusers/* This file is part of the 
'stringi' project. * Copyright (c) 2013-2025, Marek Gagolewski * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #ifndef __stri_exports_h #define __stri_exports_h #include #include // compare.cpp: SEXP stri_cmp(SEXP e1, SEXP e2, SEXP opts_collator=R_NilValue); SEXP stri_cmp_le(SEXP e1, SEXP e2, SEXP opts_collator=R_NilValue); SEXP stri_cmp_lt(SEXP e1, SEXP e2, SEXP opts_collator=R_NilValue); SEXP stri_cmp_ge(SEXP e1, SEXP e2, SEXP opts_collator=R_NilValue); SEXP stri_cmp_gt(SEXP e1, SEXP e2, SEXP opts_collator=R_NilValue); SEXP stri_cmp_equiv(SEXP e1, SEXP e2, SEXP opts_collator=R_NilValue); SEXP stri_cmp_nequiv(SEXP e1, SEXP e2, SEXP opts_collator=R_NilValue); SEXP stri_cmp_eq(SEXP e1, SEXP e2); SEXP stri_cmp_neq(SEXP e1, SEXP e2); // sort.cpp SEXP stri_sort(SEXP str, SEXP decreasing=Rf_ScalarLogical(FALSE), SEXP na_last=Rf_ScalarLogical(NA_LOGICAL), SEXP opts_collator=R_NilValue); SEXP stri_rank(SEXP str, SEXP opts_collator=R_NilValue); SEXP stri_order(SEXP str, SEXP decreasing=Rf_ScalarLogical(FALSE), SEXP na_last=Rf_ScalarLogical(TRUE), SEXP opts_collator=R_NilValue); SEXP stri_sort_key(SEXP str, SEXP opts_collator=R_NilValue); SEXP stri_unique(SEXP str, SEXP opts_collator=R_NilValue); SEXP stri_duplicated(SEXP str, SEXP fromLast=Rf_ScalarLogical(FALSE), SEXP opts_collator=R_NilValue); SEXP stri_duplicated_any(SEXP str, SEXP fromLast=Rf_ScalarLogical(FALSE), SEXP opts_collator=R_NilValue); // ICU_settings.cpp: SEXP stri_info(); // escape.cpp SEXP stri_escape_unicode(SEXP str); SEXP stri_unescape_unicode(SEXP str); // join.cpp: SEXP stri_flatten(SEXP str, SEXP collapse=Rf_mkString(""), SEXP na_empty=Rf_ScalarLogical(FALSE), SEXP omit_empty=Rf_ScalarLogical(FALSE)); SEXP stri_join(SEXP strlist, SEXP sep=Rf_mkString(""), SEXP collapse=R_NilValue, SEXP ignore_null=Rf_ScalarLogical(FALSE)); SEXP stri_join_list(SEXP x, SEXP sep=Rf_mkString(""), SEXP collapse=R_NilValue); SEXP stri_join2(SEXP e1, SEXP e2); SEXP stri_dup(SEXP str, SEXP times); // length.cpp SEXP stri_numbytes(SEXP str); SEXP stri_length(SEXP str); SEXP stri_isempty(SEXP str); SEXP stri_width(SEXP str); // 
reverse.cpp SEXP stri_reverse(SEXP s); // sub.cpp SEXP stri_sub(SEXP str, SEXP from, SEXP to, SEXP length, SEXP use_matrix=Rf_ScalarLogical(TRUE), SEXP ignore_negative_length=Rf_ScalarLogical(FALSE)); SEXP stri_sub_replacement(SEXP str, SEXP from, SEXP to, SEXP length, SEXP omit_na, SEXP value, SEXP use_matrix=Rf_ScalarLogical(TRUE)); SEXP stri_sub_all(SEXP str, SEXP from, SEXP to, SEXP length, SEXP use_matrix=Rf_ScalarLogical(TRUE), SEXP ignore_negative_length=Rf_ScalarLogical(TRUE)); SEXP stri_sub_replacement_all(SEXP str, SEXP from, SEXP to, SEXP length, SEXP omit_na, SEXP value, SEXP use_matrix=Rf_ScalarLogical(TRUE)); // encoding_management.cpp: SEXP stri_enc_list(); SEXP stri_enc_info(SEXP enc=R_NilValue); SEXP stri_enc_set(SEXP enc); SEXP stri_enc_mark(SEXP str); // uloc.cpp: SEXP stri_locale_info(SEXP loc=R_NilValue); SEXP stri_locale_list(); SEXP stri_locale_set(SEXP loc); // trim.cpp: SEXP stri_trim_both(SEXP str, SEXP pattern, SEXP negate=Rf_ScalarLogical(FALSE)); SEXP stri_trim_left(SEXP str, SEXP pattern, SEXP negate=Rf_ScalarLogical(FALSE)); SEXP stri_trim_right(SEXP str, SEXP pattern, SEXP negate=Rf_ScalarLogical(FALSE)); // random.cpp SEXP stri_rand_shuffle(SEXP str); SEXP stri_rand_strings(SEXP n, SEXP length, SEXP pattern=Rf_mkString("[A-Za-z0-9]")); // stats.cpp SEXP stri_stats_general(SEXP str); SEXP stri_stats_latex(SEXP str); // trans_transliterate.cpp: SEXP stri_trans_list(); SEXP stri_trans_general(SEXP str, SEXP id, SEXP rules, SEXP forward); // utils.cpp SEXP stri_list2matrix(SEXP x, SEXP byrow=Rf_ScalarLogical(FALSE), SEXP fill=Rf_ScalarString(NA_STRING), SEXP n_min=Rf_ScalarInteger(0)); // encoding_conversion.cpp: SEXP stri_encode(SEXP str, SEXP from=R_NilValue, SEXP to=R_NilValue, SEXP to_raw=Rf_ScalarLogical(FALSE)); SEXP stri_enc_fromutf32(SEXP str); SEXP stri_enc_toutf32(SEXP str); SEXP stri_enc_toutf8(SEXP str, SEXP is_unknown_8bit=Rf_ScalarLogical(FALSE), SEXP validate=Rf_ScalarLogical(FALSE)); SEXP stri_enc_toascii(SEXP str); // 
encoding_detection.cpp: SEXP stri_enc_detect2(SEXP str, SEXP loc=R_NilValue); SEXP stri_enc_detect(SEXP str, SEXP filter_angle_brackets=Rf_ScalarLogical(FALSE)); SEXP stri_enc_isascii(SEXP str); SEXP stri_enc_isutf8(SEXP str); SEXP stri_enc_isutf16le(SEXP str); SEXP stri_enc_isutf16be(SEXP str); SEXP stri_enc_isutf32le(SEXP str); SEXP stri_enc_isutf32be(SEXP str); // pad.cpp SEXP stri_pad(SEXP str, SEXP width, SEXP side=Rf_mkString("left"), SEXP pad=Rf_mkString(" "), SEXP use_length=Rf_ScalarLogical(FALSE)); // sprintf.cpp SEXP stri_sprintf(SEXP format, SEXP x, SEXP na_string=Rf_ScalarString(NA_STRING), SEXP inf_string=Rf_mkString("Inf"), SEXP nan_string=Rf_mkString("NaN"), SEXP use_length=Rf_ScalarLogical(FALSE)); // wrap.cpp SEXP stri_wrap(SEXP str, SEXP width, SEXP cost_exponent=Rf_ScalarInteger(2), SEXP indent=Rf_ScalarInteger(0), SEXP exdent=Rf_ScalarInteger(0), SEXP prefix=Rf_mkString(""), SEXP initial=Rf_mkString(""), SEXP whitespace_only=Rf_ScalarLogical(FALSE), SEXP use_length=Rf_ScalarLogical(FALSE), SEXP locale=R_NilValue); // trans_other.cpp: SEXP stri_trans_char(SEXP str, SEXP pattern, SEXP replacement); // trans_casemap.cpp: SEXP stri_trans_totitle(SEXP str, SEXP opts_brkiter=R_NilValue); SEXP stri_trans_tolower(SEXP str, SEXP locale=R_NilValue); SEXP stri_trans_toupper(SEXP str, SEXP locale=R_NilValue); SEXP stri_trans_casefold(SEXP str); // trans_normalization.cpp: SEXP stri_trans_nfc(SEXP s); SEXP stri_trans_nfd(SEXP s); SEXP stri_trans_nfkc(SEXP s); SEXP stri_trans_nfkd(SEXP s); SEXP stri_trans_nfkc_casefold(SEXP s); SEXP stri_trans_isnfc(SEXP s); SEXP stri_trans_isnfd(SEXP s); SEXP stri_trans_isnfkc(SEXP s); SEXP stri_trans_isnfkd(SEXP s); SEXP stri_trans_isnfkc_casefold(SEXP s); // search SEXP stri_split_lines(SEXP str, SEXP omit_empty=Rf_ScalarLogical(FALSE)); SEXP stri_split_lines1(SEXP str); SEXP stri_replace_na(SEXP str, SEXP replacement=Rf_mkString("NA")); SEXP stri_replace_rstr(SEXP x); SEXP stri_detect_coll(SEXP str, SEXP pattern, SEXP 
negate=Rf_ScalarLogical(FALSE), SEXP max_count=Rf_ScalarInteger(-1), SEXP opts_collator=R_NilValue); SEXP stri_count_coll(SEXP str, SEXP pattern, SEXP opts_collator=R_NilValue); SEXP stri_locate_all_coll(SEXP str, SEXP pattern, SEXP omit_no_match=Rf_ScalarLogical(FALSE), SEXP opts_collator=R_NilValue, SEXP get_length=Rf_ScalarLogical(FALSE) ); SEXP stri_locate_first_coll( SEXP str, SEXP pattern, SEXP opts_collator=R_NilValue, SEXP get_length=Rf_ScalarLogical(FALSE) ); SEXP stri_locate_last_coll( SEXP str, SEXP pattern, SEXP opts_collator=R_NilValue, SEXP get_length=Rf_ScalarLogical(FALSE) ); SEXP stri_extract_first_coll(SEXP str, SEXP pattern, SEXP opts_collator=R_NilValue); SEXP stri_extract_last_coll(SEXP str, SEXP pattern, SEXP opts_collator=R_NilValue); SEXP stri_extract_all_coll(SEXP str, SEXP pattern, SEXP simplify=Rf_ScalarLogical(FALSE), SEXP omit_no_match=Rf_ScalarLogical(FALSE), SEXP opts_collator=R_NilValue); SEXP stri_replace_all_coll(SEXP str, SEXP pattern, SEXP replacement, SEXP vectorize_all=Rf_ScalarLogical(TRUE), SEXP opts_collator=R_NilValue); SEXP stri_replace_first_coll(SEXP str, SEXP pattern, SEXP replacement, SEXP opts_collator=R_NilValue); SEXP stri_replace_last_coll(SEXP str, SEXP pattern, SEXP replacement, SEXP opts_collator=R_NilValue); SEXP stri_split_coll(SEXP str, SEXP split, SEXP n=Rf_ScalarInteger(-1), SEXP omit_empty=Rf_ScalarLogical(FALSE), SEXP tokens_only=Rf_ScalarLogical(FALSE), SEXP simplify=Rf_ScalarLogical(FALSE), SEXP opts_collator=R_NilValue); SEXP stri_endswith_coll(SEXP str, SEXP pattern, SEXP to=Rf_ScalarInteger(-1), SEXP negate=Rf_ScalarLogical(FALSE), SEXP opts_collator=R_NilValue); SEXP stri_startswith_coll(SEXP str, SEXP pattern, SEXP from=Rf_ScalarInteger(1), SEXP negate=Rf_ScalarLogical(FALSE), SEXP opts_collator=R_NilValue); SEXP stri_subset_coll(SEXP str, SEXP pattern, SEXP omit_na=Rf_ScalarLogical(FALSE), SEXP negate=Rf_ScalarLogical(FALSE), SEXP opts_collator=R_NilValue); SEXP stri_subset_coll_replacement(SEXP 
str, SEXP pattern, SEXP negate, SEXP opts_collator, SEXP value); SEXP stri_detect_fixed(SEXP str, SEXP pattern, SEXP negate=Rf_ScalarLogical(FALSE), SEXP max_count=Rf_ScalarInteger(-1), SEXP opts_fixed=R_NilValue); SEXP stri_count_fixed(SEXP str, SEXP pattern, SEXP opts_fixed=R_NilValue); SEXP stri_locate_all_fixed( SEXP str, SEXP pattern, SEXP omit_no_match=Rf_ScalarLogical(FALSE), SEXP opts_fixed=R_NilValue, SEXP get_length=Rf_ScalarLogical(FALSE) ); SEXP stri_locate_first_fixed( SEXP str, SEXP pattern, SEXP opts_fixed=R_NilValue, SEXP get_length=Rf_ScalarLogical(FALSE) ); SEXP stri_locate_last_fixed( SEXP str, SEXP pattern, SEXP opts_fixed=R_NilValue, SEXP get_length=Rf_ScalarLogical(FALSE) ); SEXP stri_extract_first_fixed( SEXP str, SEXP pattern, SEXP opts_fixed=R_NilValue ); SEXP stri_extract_last_fixed( SEXP str, SEXP pattern, SEXP opts_fixed=R_NilValue ); SEXP stri_extract_all_fixed( SEXP str, SEXP pattern, SEXP simplify=Rf_ScalarLogical(FALSE), SEXP omit_no_match=Rf_ScalarLogical(FALSE), SEXP opts_fixed=R_NilValue ); SEXP stri_replace_all_fixed(SEXP str, SEXP pattern, SEXP replacement, SEXP vectorize_all=Rf_ScalarLogical(TRUE), SEXP opts_fixed=R_NilValue); SEXP stri_replace_first_fixed(SEXP str, SEXP pattern, SEXP replacement, SEXP opts_fixed=R_NilValue); SEXP stri_replace_last_fixed(SEXP str, SEXP pattern, SEXP replacement, SEXP opts_fixed=R_NilValue); SEXP stri_split_fixed(SEXP str, SEXP split, SEXP n=Rf_ScalarInteger(-1), SEXP omit_empty=Rf_ScalarLogical(FALSE), SEXP tokens_only=Rf_ScalarLogical(FALSE), SEXP simplify=Rf_ScalarLogical(FALSE), SEXP opts_fixed=R_NilValue); SEXP stri_subset_fixed(SEXP str, SEXP pattern, SEXP omit_na=Rf_ScalarLogical(FALSE), SEXP negate=Rf_ScalarLogical(FALSE), SEXP opts_fixed=R_NilValue); SEXP stri_endswith_fixed(SEXP str, SEXP pattern, SEXP to=Rf_ScalarInteger(-1), SEXP negate=Rf_ScalarLogical(FALSE), SEXP opts_fixed=R_NilValue); SEXP stri_startswith_fixed(SEXP str, SEXP pattern, SEXP from=Rf_ScalarInteger(1), SEXP 
negate=Rf_ScalarLogical(FALSE), SEXP opts_fixed=R_NilValue); SEXP stri_subset_fixed_replacement(SEXP str, SEXP pattern, SEXP negate, SEXP opts_fixed, SEXP value); SEXP stri_detect_regex( SEXP str, SEXP pattern, SEXP negate=Rf_ScalarLogical(FALSE), SEXP max_count=Rf_ScalarInteger(-1), SEXP opts_regex=R_NilValue ); SEXP stri_count_regex(SEXP str, SEXP pattern, SEXP opts_regex=R_NilValue); SEXP stri_locate_all_regex( SEXP str, SEXP pattern, SEXP omit_no_match=Rf_ScalarLogical(FALSE), SEXP opts_regex=R_NilValue, SEXP capture_groups=Rf_ScalarLogical(FALSE), SEXP get_length=Rf_ScalarLogical(FALSE) ); SEXP stri_locate_first_regex( SEXP str, SEXP pattern, SEXP opts_regex=R_NilValue, SEXP capture_groups=Rf_ScalarLogical(FALSE), SEXP get_length=Rf_ScalarLogical(FALSE) ); SEXP stri_locate_last_regex( SEXP str, SEXP pattern, SEXP opts_regex=R_NilValue, SEXP capture_groups=Rf_ScalarLogical(FALSE), SEXP get_length=Rf_ScalarLogical(FALSE) ); SEXP stri_replace_all_regex( SEXP str, SEXP pattern, SEXP replacement, SEXP vectorize_all=Rf_ScalarLogical(FALSE), SEXP opts_regex=R_NilValue ); SEXP stri_replace_first_regex( SEXP str, SEXP pattern, SEXP replacement, SEXP opts_regex=R_NilValue ); SEXP stri_replace_last_regex( SEXP str, SEXP pattern, SEXP replacement, SEXP opts_regex=R_NilValue ); SEXP stri_split_regex( SEXP str, SEXP pattern, SEXP n=Rf_ScalarInteger(-1), SEXP omit_empty=Rf_ScalarLogical(FALSE), SEXP tokens_only=Rf_ScalarLogical(FALSE), SEXP simplify=Rf_ScalarLogical(FALSE), SEXP opts_regex=R_NilValue ); SEXP stri_subset_regex( SEXP str, SEXP pattern, SEXP omit_na=Rf_ScalarLogical(FALSE), SEXP negate=Rf_ScalarLogical(FALSE), SEXP opts_regex=R_NilValue ); SEXP stri_extract_first_regex(SEXP str, SEXP pattern, SEXP opts_regex=R_NilValue); SEXP stri_extract_last_regex(SEXP str, SEXP pattern, SEXP opts_regex=R_NilValue); SEXP stri_extract_all_regex(SEXP str, SEXP pattern, SEXP simplify=Rf_ScalarLogical(FALSE), SEXP omit_no_match=Rf_ScalarLogical(FALSE), SEXP 
opts_regex=R_NilValue); SEXP stri_match_first_regex(SEXP str, SEXP pattern, SEXP cg_missing=Rf_ScalarString(NA_STRING), SEXP opts_regex=R_NilValue); SEXP stri_match_last_regex(SEXP str, SEXP pattern, SEXP cg_missing=Rf_ScalarString(NA_STRING), SEXP opts_regex=R_NilValue); SEXP stri_match_all_regex(SEXP str, SEXP pattern, SEXP omit_no_match=Rf_ScalarLogical(FALSE), SEXP cg_missing=Rf_ScalarString(NA_STRING), SEXP opts_regex=R_NilValue); SEXP stri_subset_regex_replacement(SEXP str, SEXP pattern, SEXP negate, SEXP opts_regex, SEXP value); SEXP stri_detect_charclass(SEXP str, SEXP pattern, SEXP negate=Rf_ScalarLogical(FALSE), SEXP max_count=Rf_ScalarInteger(-1)); SEXP stri_count_charclass(SEXP str, SEXP pattern); SEXP stri_extract_first_charclass(SEXP str, SEXP pattern); SEXP stri_extract_last_charclass(SEXP str, SEXP pattern); SEXP stri_extract_all_charclass(SEXP str, SEXP pattern, SEXP merge=Rf_ScalarLogical(TRUE), SEXP simplify=Rf_ScalarLogical(FALSE), SEXP omit_no_match=Rf_ScalarLogical(FALSE)); SEXP stri_locate_first_charclass( SEXP str, SEXP pattern, SEXP get_length=Rf_ScalarLogical(FALSE) ); SEXP stri_locate_last_charclass( SEXP str, SEXP pattern, SEXP get_length=Rf_ScalarLogical(FALSE) ); SEXP stri_locate_all_charclass( SEXP str, SEXP pattern, SEXP merge=Rf_ScalarLogical(TRUE), SEXP omit_no_match=Rf_ScalarLogical(FALSE), SEXP get_length=Rf_ScalarLogical(FALSE) ); SEXP stri_replace_last_charclass(SEXP str, SEXP pattern, SEXP replacement); SEXP stri_replace_first_charclass(SEXP str, SEXP pattern, SEXP replacement); SEXP stri_replace_all_charclass(SEXP str, SEXP pattern, SEXP replacement, SEXP merge=Rf_ScalarLogical(FALSE), SEXP vectorize_all=Rf_ScalarLogical(TRUE)); SEXP stri_split_charclass(SEXP str, SEXP pattern, SEXP n=Rf_ScalarInteger(-1), SEXP omit_empty=Rf_ScalarLogical(FALSE), SEXP tokens_only=Rf_ScalarLogical(FALSE), SEXP simplify=Rf_ScalarLogical(FALSE)); SEXP stri_endswith_charclass(SEXP str, SEXP pattern, SEXP to=Rf_ScalarInteger(-1), SEXP 
negate=Rf_ScalarLogical(FALSE)); SEXP stri_startswith_charclass(SEXP str, SEXP pattern, SEXP from=Rf_ScalarInteger(1), SEXP negate=Rf_ScalarLogical(FALSE)); SEXP stri_subset_charclass(SEXP str, SEXP pattern, SEXP omit_na=Rf_ScalarLogical(FALSE), SEXP negate=Rf_ScalarLogical(FALSE)); SEXP stri_subset_charclass_replacement(SEXP str, SEXP pattern, SEXP negate, SEXP value); SEXP stri_extract_all_boundaries(SEXP str, SEXP simplify, SEXP omit_no_match=Rf_ScalarLogical(FALSE), SEXP opts_brkiter=R_NilValue); SEXP stri_extract_first_boundaries(SEXP str, SEXP opts_brkiter=R_NilValue); SEXP stri_extract_last_boundaries(SEXP str, SEXP opts_brkiter=R_NilValue); SEXP stri_locate_all_boundaries( SEXP str, SEXP omit_no_match=Rf_ScalarLogical(FALSE), SEXP opts_brkiter=R_NilValue, SEXP get_length=Rf_ScalarLogical(FALSE) ); SEXP stri_locate_first_boundaries( SEXP str, SEXP opts_brkiter=R_NilValue, SEXP get_length=Rf_ScalarLogical(FALSE) ); SEXP stri_locate_last_boundaries( SEXP str, SEXP opts_brkiter=R_NilValue, SEXP get_length=Rf_ScalarLogical(FALSE) ); SEXP stri_split_boundaries(SEXP str, SEXP n=Rf_ScalarInteger(-1), SEXP tokens_only=Rf_ScalarLogical(FALSE), SEXP simplify=Rf_ScalarLogical(FALSE), SEXP opts_brkiter=R_NilValue); SEXP stri_count_boundaries(SEXP str, SEXP opts_brkiter=R_NilValue); // date/time SEXP stri_timezone_list(SEXP region=Rf_ScalarString(NA_STRING), SEXP offset=Rf_ScalarInteger(NA_INTEGER)); SEXP stri_timezone_set(SEXP tz); SEXP stri_timezone_info(SEXP tz=R_NilValue, SEXP locale=R_NilValue, SEXP display_type=Rf_mkString("long")); SEXP stri_datetime_symbols(SEXP locale=R_NilValue, SEXP context=Rf_mkString("standalone"), SEXP width=Rf_mkString("wide")); SEXP stri_datetime_now(); SEXP stri_datetime_add(SEXP time, SEXP value=Rf_ScalarInteger(1), SEXP units=Rf_mkString("seconds"), SEXP tz=R_NilValue, SEXP locale=R_NilValue); SEXP stri_datetime_fields(SEXP time, SEXP tz=R_NilValue, SEXP locale=R_NilValue); SEXP stri_datetime_create(SEXP year, SEXP month, SEXP day, 
SEXP hour=Rf_ScalarInteger(12), SEXP minute=Rf_ScalarInteger(0), SEXP second=Rf_ScalarInteger(0), SEXP lenient=Rf_ScalarLogical(FALSE), SEXP tz=R_NilValue, SEXP locale=R_NilValue); SEXP stri_datetime_format(SEXP time, SEXP format=Rf_mkString("uuuu-MM-dd HH:mm:ss"), SEXP tz=R_NilValue, SEXP locale=R_NilValue); SEXP stri_datetime_parse(SEXP str, SEXP format=Rf_mkString("uuuu-MM-dd HH:mm:ss"), SEXP lenient=Rf_ScalarLogical(FALSE), SEXP tz=R_NilValue, SEXP locale=R_NilValue); SEXP stri_datetime_fstr(SEXP x); // SEXP stri_c_posixst(SEXP x); // internal // prepare_arg.cpp: SEXP stri_prepare_arg_string_1(SEXP x, SEXP argname); SEXP stri_prepare_arg_double_1(SEXP x, SEXP argname); // TODO: factors_as_strings SEXP stri_prepare_arg_integer_1(SEXP x, SEXP argname); // TODO: factors_as_strings SEXP stri_prepare_arg_logical_1(SEXP x, SEXP argname); SEXP stri_prepare_arg_string(SEXP x, SEXP argname); SEXP stri_prepare_arg_double(SEXP x, SEXP argname); // TODO: factors_as_strings SEXP stri_prepare_arg_integer(SEXP x, SEXP argname); // TODO: factors_as_strings SEXP stri_prepare_arg_logical(SEXP x, SEXP argname); SEXP stri_prepare_arg_raw(SEXP x, SEXP argname); // TODO: factors_as_strings // TODO: other prepare args // encoding_conversion.cpp: // SEXP stri_encode_from_marked(SEXP str, SEXP to, SEXP to_raw); // internal // test.cpp /* internal, but in namespace: for testing */ SEXP stri_test_Rmark(SEXP str); SEXP stri_test_UnicodeContainer16(SEXP str); SEXP stri_test_UnicodeContainer16b(SEXP str); SEXP stri_test_UnicodeContainer8(SEXP str); SEXP stri_test_returnasis(SEXP x); #endif stringi/src/stri_container_utf16.cpp0000644000176200001440000003177314770541312017313 0ustar liggesusers/* This file is part of the 'stringi' project. * Copyright (c) 2013-2025, Marek Gagolewski * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. 
Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "stri_stringi.h" #include "stri_container_utf16.h" #include "stri_string8buf.h" #include "stri_ucnv.h" /** * Default constructor * */ StriContainerUTF16::StriContainerUTF16() : StriContainerBase() { this->str = NULL; } /** container for nrecycle fresh, brand new, writable UnicodeStrings * * Each string is initially empty. 
* * @param nrecycle number of strings */ StriContainerUTF16::StriContainerUTF16(R_len_t _nrecycle) { this->str = NULL; this->init_Base(_nrecycle, _nrecycle, false); if (this->n > 0) { this->str = new UnicodeString[this->n]; STRI_ASSERT(this->str); if (!this->str) throw StriException(MSG__MEM_ALLOC_ERROR_WITH_SIZE, this->n*sizeof(UnicodeString)); } } /** * Construct String Container from an R character vector * * @param rstr R character vector * @param nrecycle extend length [vectorization] * @param shallowrecycle will \code{this->str} be ever modified? * * @version 1.0.6 (Marek Gagolewski, 2017-05-25) * #270 latin-1 is windows-1252 on Windows */ StriContainerUTF16::StriContainerUTF16(SEXP rstr, R_len_t _nrecycle, bool _shallowrecycle) { this->str = NULL; #ifndef NDEBUG if (!Rf_isString(rstr)) throw StriException("DEBUG: !Rf_isString in StriContainerUTF16::StriContainerUTF16(SEXP rstr)"); #endif R_len_t nrstr = LENGTH(rstr); this->init_Base(nrstr, _nrecycle, _shallowrecycle); // calling LENGTH(rstr) fails on constructor call if (this->n == 0) return; /* nothing more to do */ STRI_ASSERT(this->n > 0); this->str = new UnicodeString[this->n]; STRI_ASSERT(this->str); if (!this->str) throw StriException(MSG__MEM_ALLOC_ERROR_WITH_SIZE, this->n*sizeof(UnicodeString)); for (R_len_t i=0; in; ++i) this->str[i].setToBogus(); // in case it fails during conversion (this is NA) /* Important: ICU provides full internationalisation functionality without any conversion table data. 
The common library contains code to handle several important encodings algorithmically: US-ASCII, ISO-8859-1, UTF-7/8/16/32, SCSU, BOCU-1, CESU-8, and IMAP-mailbox-name */ //StriUcnv ucnvASCII("US-ASCII"); #if defined(_WIN32) || defined(_WIN64) // #270: latin-1 is windows-1252 on Windows StriUcnv ucnvLatin1("WINDOWS-1252"); #else StriUcnv ucnvLatin1("ISO-8859-1"); #endif StriUcnv ucnvNative(NULL); for (R_len_t i=0; istr[i].setTo( // UnicodeString((const char*)CHAR(curs), (int32_t)LENGTH(curs), ucnv, status) // ); // STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) // // // Performance improvement attempt #1: // // this->str[i] = new UnicodeString(UnicodeString::fromUTF8(CHAR(curs))); // // if (!this->str) throw StriException(MSG__MEM_ALLOC_ERROR); // // slower than the above // // // Performance improvement attempt #2: // // Create UChar buf with LENGTH(curs) items, fill it with (CHAR(curs)[i], 0x00), i=1,... // // This wasn't faster than the ucnvASCII approach. // // // Performance improvement attempt #3: // // slightly slower than ucnvASCII // // R_len_t curs_n = LENGTH(curs); // // const char* curs_s = CHAR(curs); // // this->str[i].remove(); // unset bogus (NA) // // UChar* buf = this->str[i].getBuffer(curs_n); // // for (R_len_t k=0; kstr[i].releaseBuffer(curs_n); // } // else if (IS_ASCII(curs) || IS_UTF8(curs)) { // using ucnvUTF8 is slower for UTF-8 // the same is done for native encoding && ucnvNative_isUTF8 // this is slower if IS_ASCII than ucnvASCII, but doesn't limit // the input string length to 858993458 characters (#487) this->str[i].setTo(UnicodeString::fromUTF8(CHAR(curs))); } else if (IS_LATIN1(curs)) { UConverter* ucnv = ucnvLatin1.getConverter(); UErrorCode status = U_ZERO_ERROR; this->str[i].setTo( UnicodeString((const char*)CHAR(curs), (int32_t)LENGTH(curs), ucnv, status) ); STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) } else if (IS_BYTES(curs)) { throw StriException(MSG__BYTESENC); } else { // an 
"unknown" (native) encoding may be set to UTF-8 (speedup) if (ucnvNative.isUTF8()) { // UTF-8 this->str[i].setTo(UnicodeString::fromUTF8(CHAR(curs))); } else { UConverter* ucnv = ucnvNative.getConverter(); UErrorCode status = U_ZERO_ERROR; this->str[i].setTo( UnicodeString((const char*)CHAR(curs), (int32_t)LENGTH(curs), ucnv, status) ); STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) } } } if (!_shallowrecycle) { for (R_len_t i=nrstr; in; ++i) { this->str[i].setTo(str[i%nrstr]); } } } /** Copy constructor * * @param container source */ StriContainerUTF16::StriContainerUTF16(StriContainerUTF16& container) : StriContainerBase((StriContainerBase&)container) { if (container.str) { this->str = new UnicodeString[this->n]; STRI_ASSERT(this->str); if (!this->str) throw StriException(MSG__MEM_ALLOC_ERROR_WITH_SIZE, this->n*sizeof(UnicodeString)); for (int i=0; in; ++i) { this->str[i].setTo(container.str[i]); } } else { this->str = NULL; } } /** * @param container source * @return self */ StriContainerUTF16& StriContainerUTF16::operator=(StriContainerUTF16& container) { this->~StriContainerUTF16(); (StriContainerBase&) (*this) = (StriContainerBase&)container; if (container.str) { this->str = new UnicodeString[this->n]; STRI_ASSERT(this->str); if (!this->str) throw StriException(MSG__MEM_ALLOC_ERROR_WITH_SIZE, this->n*sizeof(UnicodeString)); for (int i=0; in; ++i) { this->str[i].setTo(container.str[i]); } } else { this->str = NULL; } return *this; } /** Destructor * */ StriContainerUTF16::~StriContainerUTF16() { if (str) { delete [] str; str = NULL; } } /** Export character vector to R * * THE OUTPUT IS ALWAYS IN UTF-8 * * Recycle rule is applied, so length == nrecycle * * @version 0.1-?? 
(Marek Gagolewski) * * @version 0.2-1 (Marek Gagolewski, 2014-03-23) * using 1 tmpbuf + u_strToUTF8 for slightly better performance * * @return STRSXP */ SEXP StriContainerUTF16::toR() const { R_len_t outbufsize = 0; for (R_len_t i=0; i outbufsize) outbufsize = thissize; } } // One UChar -- <= U+FFFF -> 1-3 bytes UTF8 // Two UChars -- >=U+10000 -> 4 bytes UTF8 outbufsize = UCNV_GET_MAX_BYTES_FOR_STRING(outbufsize, 3); String8buf outbuf(outbufsize); SEXP ret; PROTECT(ret = Rf_allocVector(STRSXP, nrecycle)); UErrorCode status = U_ZERO_ERROR; for (R_len_t i=0; i= nrecycle) throw StriException("StriContainerUTF16::toR(): INDEX OUT OF BOUNDS"); #endif if (str[i%n].isBogus()) return NA_STRING; else { std::string s; str[i%n].toUTF8String(s); return Rf_mkCharLenCE(s.c_str(), (int)s.length(), (cetype_t)CE_UTF8); } } /** Convert Unicode16-Char indexes to Unicode32 (code points) * * \code{i1} and \code{i2} must be sorted increasingly * * @param i element index * @param i1 indexes, 1-based [in/out] * @param i2 indexes, 1-based [in/out] * @param ni size of \code{i1} and \code{i2} * @param adj1 adjust for \code{i1} * @param adj2 adjust for \code{i2} * * @version 0.5-1 (Marek Gagolewski, 2014-12-21) * #132 incorrect behaviour for i2[j2] == i2[j2+1] * * @version 1.7.1 (Marek Gagolewski, 2021-06-29) ignore NA and negative indexes */ void StriContainerUTF16::UChar16_to_UChar32_index( R_len_t i, int* i1, int* i2, const int ni, int adj1, int adj2 ) { const UnicodeString* str_data = &(this->get(i)); const UChar* cstr = str_data->getBuffer(); const int nstr = str_data->length(); int j1 = 0; int j2 = 0; int i16 = 0; int i32 = 0; while (i16 < nstr && (j1 < ni || j2 < ni)) { while (j1 < ni && i1[j1] <= i16) { if (i1[j1] == NA_INTEGER || i1[j1] < 0) { ++j1; continue; } #ifndef NDEBUG if (j1 < ni-1 && i1[j1+1] != NA_INTEGER && i1[j1+1] >= 0 && i1[j1] > i1[j1+1]) throw StriException("DEBUG: stri__UChar16_to_UChar32_index 1"); #endif i1[j1] = i32 + adj1; ++j1; } while (j2 < ni && i2[j2] <= 
i16) { if (i2[j2] == NA_INTEGER || i2[j2] < 0) { ++j2; continue; } #ifndef NDEBUG if (j2 < ni-1 && i2[j2+1] != NA_INTEGER && i2[j2+1] >= 0 && i2[j2] > i2[j2+1]) throw StriException("DEBUG: stri__UChar16_to_UChar32_index 2"); #endif i2[j2] = i32 + adj2; ++j2; } // Next UChar32 U16_FWD_1(cstr, i16, nstr); ++i32; } // CONVERT LAST: while (j1 < ni && i1[j1] <= nstr) { if (i1[j1] == NA_INTEGER || i1[j1] < 0) { ++j1; continue; } //#ifndef NDEBUG // if (j1 < ni-1 && i1[j1] >= i1[j1+1]) // throw StriException("DEBUG: stri__UChar16_to_UChar32_index 3"); //#endif i1[j1] = i32 + adj1; ++j1; } while (j2 < ni && i2[j2] <= nstr) { if (i2[j2] == NA_INTEGER || i2[j2] < 0) { ++j2; continue; } //#ifndef NDEBUG // if (j2 < ni-1 && i2[j2] >= i2[j2+1]) // throw StriException("DEBUG: stri__UChar16_to_UChar32_index 4"); //#endif i2[j2] = i32 + adj2; ++j2; } // CHECK: #ifndef NDEBUG if (i16 >= nstr && (j1 < ni || j2 < ni)) throw StriException("DEBUG: stri__UChar16_to_UChar32_index 5"); #endif } stringi/src/stri_search_fixed_split.cpp0000644000176200001440000002121414770541312020130 0ustar liggesusers/* This file is part of the 'stringi' project. * Copyright (c) 2013-2025, Marek Gagolewski * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "stri_stringi.h" #include "stri_container_utf8.h" #include "stri_container_bytesearch.h" #include "stri_container_integer.h" #include "stri_container_logical.h" #include #include using namespace std; /** * Split a string into parts [byte compare] * * The pattern matches identify delimiters that separate the input into fields. * The input data between the matches becomes the fields themselves. * * @param str character vector * @param pattern character vector * @param n integer vector * @param omit_empty logical vector * @param tokens_only single logical value * @param simplify single logical value * * @return list of character vectors or character matrix * * @version 0.1-?? (Bartek Tartanus) * * @version 0.1-?? (Marek Gagolewski, 2013-06-25) * StriException friendly, use StriContainerUTF8 * * @version 0.1-?? 
(Marek Gagolewski, 2013-07-10) * BUGFIX: wrong behavior on empty str * * @version 0.2-3 (Marek Gagolewski, 2014-05-08) * stri_split_fixed now uses byte search only * * @version 0.3-1 (Marek Gagolewski, 2014-10-19) * added tokens_only param * * @version 0.3-1 (Marek Gagolewski, 2014-10-23) * added split param * * @version 0.3-1 (Marek Gagolewski, 2014-10-24) * allow omit_empty=NA * * @version 0.3-1 (Marek Gagolewski, 2014-11-05) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc * * @version 0.4-1 (Marek Gagolewski, 2014-12-04) * allow `simplify=NA`; FR #126: pass n to stri_list2matrix * * @version 0.4-1 (Marek Gagolewski, 2014-12-07) * FR #110, #23: opts_fixed arg added * * @version 0.5-1 (Marek Gagolewski, 2015-02-14) * use StriByteSearchMatcher */ SEXP stri_split_fixed(SEXP str, SEXP pattern, SEXP n, SEXP omit_empty, SEXP tokens_only, SEXP simplify, SEXP opts_fixed) { uint32_t pattern_flags = StriContainerByteSearch::getByteSearchFlags(opts_fixed); bool tokens_only1 = stri__prepare_arg_logical_1_notNA(tokens_only, "tokens_only"); PROTECT(simplify = stri__prepare_arg_logical_1(simplify, "simplify")); PROTECT(str = stri__prepare_arg_string(str, "str")); PROTECT(pattern = stri__prepare_arg_string(pattern, "pattern")); PROTECT(n = stri__prepare_arg_integer(n, "n")); PROTECT(omit_empty = stri__prepare_arg_logical(omit_empty, "omit_empty")); STRI__ERROR_HANDLER_BEGIN(5) R_len_t vectorize_length = stri__recycling_rule(true, 4, LENGTH(str), LENGTH(pattern), LENGTH(n), LENGTH(omit_empty)); StriContainerUTF8 str_cont(str, vectorize_length); StriContainerByteSearch pattern_cont(pattern, vectorize_length, pattern_flags); StriContainerInteger n_cont(n, vectorize_length); StriContainerLogical omit_empty_cont(omit_empty, vectorize_length); SEXP ret; STRI__PROTECT(ret = Rf_allocVector(VECSXP, vectorize_length)); for (R_len_t i = pattern_cont.vectorize_init(); i != pattern_cont.vectorize_end(); i = pattern_cont.vectorize_next(i)) { if (n_cont.isNA(i)) { 
SET_VECTOR_ELT(ret, i, stri__vector_NA_strings(1)); continue; } int n_cur = n_cont.get(i); int omit_empty_cur = !omit_empty_cont.isNA(i) && omit_empty_cont.get(i); STRI__CONTINUE_ON_EMPTY_OR_NA_STR_PATTERN(str_cont, pattern_cont, SET_VECTOR_ELT(ret, i, stri__vector_NA_strings(1));, SET_VECTOR_ELT(ret, i, (omit_empty_cont.isNA(i))?stri__vector_NA_strings(1): stri__vector_empty_strings((omit_empty_cur || n_cur == 0)?0:1));) R_len_t str_cur_n = str_cont.get(i).length(); const char* str_cur_s = str_cont.get(i).c_str(); if (n_cur >= INT_MAX-1) throw StriException(MSG__INCORRECT_NAMED_ARG "; " MSG__EXPECTED_SMALLER, "n"); else if (n_cur < 0) n_cur = INT_MAX; else if (n_cur == 0) { SET_VECTOR_ELT(ret, i, Rf_allocVector(STRSXP, 0)); continue; } else if (tokens_only1) n_cur++; // we need to do one split ahead here StriByteSearchMatcher* matcher = pattern_cont.getMatcher(i); matcher->reset(str_cont.get(i).c_str(), str_cont.get(i).length()); R_len_t k; deque< pair > fields; // byte based-indices fields.push_back(pair(0,0)); for (k=1; k < n_cur && USEARCH_DONE != matcher->findNext(); ) { R_len_t s1 = (R_len_t)matcher->getMatchedStart(); R_len_t s2 = (R_len_t)matcher->getMatchedLength() + s1; if (omit_empty_cur && fields.back().first == s1) fields.back().first = s2; // don't start any new field else { fields.back().second = s1; fields.push_back(pair(s2, s2)); // start a new field here ++k; // another field } } fields.back().second = str_cur_n; if (omit_empty_cur && fields.back().first == fields.back().second) fields.pop_back(); if (tokens_only1 && n_cur < INT_MAX) { n_cur--; // one split ahead could have been made, see above while (fields.size() > (size_t)n_cur) fields.pop_back(); // get rid of the remainder } SEXP ans; STRI__PROTECT(ans = Rf_allocVector(STRSXP, fields.size())); deque< pair >::iterator iter = fields.begin(); for (k = 0; iter != fields.end(); ++iter, ++k) { pair curoccur = *iter; if (curoccur.second == curoccur.first && omit_empty_cont.isNA(i)) 
SET_STRING_ELT(ans, k, NA_STRING); else SET_STRING_ELT(ans, k, Rf_mkCharLenCE(str_cur_s+curoccur.first, curoccur.second-curoccur.first, CE_UTF8)); } SET_VECTOR_ELT(ret, i, ans); STRI__UNPROTECT(1); } if (LOGICAL(simplify)[0] == NA_LOGICAL || LOGICAL(simplify)[0]) { R_len_t n_min = 0; R_len_t n_length = LENGTH(n); int* n_tab = INTEGER(n); for (R_len_t i=0; i * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "stri_stringi.h" #include "stri_container_utf16.h" #include "stri_container_regex.h" #include #include using namespace std; /* Coverts a deque with (from,to) pairs to a 2-column R matrix * * does not set dimnames * * @param i if < 0, then adjust indexes of all is * * TODO: use also in stri_locate_all_fixed etc. * * @version 1.7.1 (Marek Gagolewski, 2021-06-20) */ SEXP stri__locate_get_fromto_matrix( deque< pair >& occurrences, StriContainerUTF16& str_cont, R_len_t i, bool omit_no_match1, bool get_length1 ) { SEXP ans; R_len_t noccurrences = (R_len_t)occurrences.size(); if (noccurrences <= 0) { return stri__matrix_NA_INTEGER( omit_no_match1?0:1, 2, get_length1?-1:NA_INTEGER ); } PROTECT(ans = Rf_allocMatrix(INTSXP, noccurrences, 2)); int* ans_tab = INTEGER(ans); deque< pair >::iterator iter = occurrences.begin(); for (R_len_t j = 0; iter != occurrences.end(); ++iter, ++j) { pair match = *iter; ans_tab[j] = match.first; ans_tab[j+noccurrences] = match.second; } // Adjust UChar index -> UChar32 index // (1-2 byte UTF16 to 1 byte UTF32-code points) if (i < 0) { STRI_ASSERT(noccurrences == str_cont.get_nrecycle()); for (i=0; i 1-based 0 // end returns position of next character after match ); } } else { str_cont.UChar16_to_UChar32_index( i, ans_tab, ans_tab+noccurrences, noccurrences, 1, // 0-based index -> 1-based 0 // end returns position of next character after match ); } if (get_length1) { for (R_len_t j = 0; j < noccurrences; ++j) { if (ans_tab[j] != NA_INTEGER && ans_tab[j] >= 0) ans_tab[j+noccurrences] -= ans_tab[j] - 1; } } UNPROTECT(1); return ans; } /** Locate all occurrences of a regex pattern * * @param str character vector * @param pattern character vector * @param opts_regex list * @param omit_no_match single logical value * @param capture_groups single logical value * @return list of integer matrices (2 columns) * * @version 0.1-?? (Bartek Tartanus) * * @version 0.1-?? (Marek Gagolewski) * StriContainerUTF16+deque usage * * @version 0.1-?? 
(Marek Gagolewski, 2013-06-19) * use StriContainerRegexPattern + opts_regex * * @version 0.3-1 (Marek Gagolewski, 2014-11-05) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc * * @version 0.4-1 (Marek Gagolewski, 2014-11-27) * FR #117: omit_no_match arg added * * @version 1.0-2 (Marek Gagolewski, 2016-01-29) * Issue #214: allow a regex pattern like `.*` to match an empty string * * @version 1.4.7 (Marek Gagolewski, 2020-08-24) * Use StriContainerRegexPattern::getRegexOptions * * @version 1.7.1 (Marek Gagolewski, 2021-06-20) * #25: capture_groups * * @version 1.7.1 (Marek Gagolewski, 2021-06-29) * get_length */ SEXP stri_locate_all_regex(SEXP str, SEXP pattern, SEXP omit_no_match, SEXP opts_regex, SEXP capture_groups, SEXP get_length) { bool omit_no_match1 = stri__prepare_arg_logical_1_notNA(omit_no_match, "omit_no_match"); bool capture_groups1 = stri__prepare_arg_logical_1_notNA(capture_groups, "capture_groups"); bool get_length1 = stri__prepare_arg_logical_1_notNA(get_length, "get_length"); StriRegexMatcherOptions pattern_opts = StriContainerRegexPattern::getRegexOptions(opts_regex); PROTECT(str = stri__prepare_arg_string(str, "str")); // prepare string argument PROTECT(pattern = stri__prepare_arg_string(pattern, "pattern")); // prepare string argument R_len_t vectorize_length = stri__recycling_rule(true, 2, LENGTH(str), LENGTH(pattern)); STRI__ERROR_HANDLER_BEGIN(2) StriContainerUTF16 str_cont(str, vectorize_length); StriContainerRegexPattern pattern_cont(pattern, vectorize_length, pattern_opts); SEXP ret; STRI__PROTECT(ret = Rf_allocVector(VECSXP, vectorize_length)); // R_len_t last_i = -1; for (R_len_t i = pattern_cont.vectorize_init(); i != pattern_cont.vectorize_end(); i = pattern_cont.vectorize_next(i)) { if ((pattern_cont).isNA(i) || (pattern_cont).get(i).length() <= 0) { if (!(pattern_cont).isNA(i)) Rf_warning(MSG__EMPTY_SEARCH_PATTERN_UNSUPPORTED); SEXP ans; STRI__PROTECT(ans = stri__matrix_NA_INTEGER(1, 2)); if (capture_groups1) { SEXP 
ans2; STRI__PROTECT(ans2 = Rf_allocVector(VECSXP, 0)); Rf_setAttrib(ans, Rf_ScalarString(Rf_mkChar("capture_groups")), ans2); STRI__UNPROTECT(1); } SET_VECTOR_ELT(ret, i, ans); STRI__UNPROTECT(1); continue; } UErrorCode status = U_ZERO_ERROR; RegexMatcher *matcher = pattern_cont.getMatcher(i); // will be deleted automatically deque< pair > occurrences; vector< deque< pair > > cg_occurrences; R_len_t pattern_cur_groups = matcher->groupCount(); if (capture_groups1 && pattern_cur_groups > 0) cg_occurrences.resize(pattern_cur_groups); if (!(str_cont).isNA(i)) { matcher->reset(str_cont.get(i)); int found = (int)matcher->find(status); STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) while (found) { UErrorCode status = U_ZERO_ERROR; int start = (int)matcher->start(status); STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) int end = (int)matcher->end(status); STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) occurrences.push_back(pair(start, end)); if (capture_groups1) { for (R_len_t j=0; jstart(j+1, status); STRI__CHECKICUSTATUS_THROW(status, {}) end = (int)matcher->end(j+1, status); STRI__CHECKICUSTATUS_THROW(status, {}) if (start >= 0 && end >= 0) { // e.g., conditional capture group cg_occurrences[j].push_back(pair(start, end)); } else { cg_occurrences[j].push_back(pair( get_length1?-1:NA_INTEGER, get_length1?-1:NA_INTEGER )); } } } found = (int)matcher->find(status); STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) }; } SEXP ans; if (str_cont.isNA(i)) STRI__PROTECT(ans = stri__matrix_NA_INTEGER(1, 2)) else STRI__PROTECT(ans = stri__locate_get_fromto_matrix( occurrences, str_cont, i, omit_no_match1, get_length1) ); if (capture_groups1) { SEXP cgs, names; STRI__PROTECT(cgs = Rf_allocVector(VECSXP, pattern_cur_groups)); STRI__PROTECT(names = pattern_cont.getCaptureGroupRNames(i)); // TODO: reuse // last_i = i; for (R_len_t j=0; j > > cg_occurrences; //cg_occurrences[i] -- i-th capture group 
for (R_len_t i = pattern_cont.vectorize_init(); i != pattern_cont.vectorize_end(); i = pattern_cont.vectorize_next(i)) { ret_tab[i] = NA_INTEGER; ret_tab[i+vectorize_length] = NA_INTEGER; if ((pattern_cont).isNA(i) || (pattern_cont).get(i).length() <= 0) { if (!(pattern_cont).isNA(i)) Rf_warning(MSG__EMPTY_SEARCH_PATTERN_UNSUPPORTED); continue; } // if str is NA, we may still be generating capture_groups if (!(str_cont).isNA(i) && get_length1) { ret_tab[i] = -1; ret_tab[i+vectorize_length] = -1; } RegexMatcher *matcher = pattern_cont.getMatcher(i); // will be deleted automatically R_len_t pattern_cur_groups = matcher->groupCount(); if (capture_groups1 && pattern_cur_groups > 0) { while ((R_len_t)cg_occurrences.size() < pattern_cur_groups) { cg_occurrences.push_back( deque< pair >( vectorize_length, pair( NA_INTEGER, NA_INTEGER ) ) ); } } if ((str_cont).isNA(i)) { continue; } matcher->reset(str_cont.get(i)); UErrorCode status = U_ZERO_ERROR; int m_res = (int)matcher->find(status); STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) if (!m_res) { if (capture_groups1 && get_length1) { for (R_len_t j=0; jstart(status); STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) ret_tab[i+vectorize_length] = (int)matcher->end(status); STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) if (capture_groups1) { for (R_len_t j=0; jstart(j+1, status); STRI__CHECKICUSTATUS_THROW(status, {}) int end = (int)matcher->end(j+1, status); STRI__CHECKICUSTATUS_THROW(status, {}) if (start >= 0 && end >= 0) { // e.g., conditional capture group cg_occurrences[j][i].first = start; cg_occurrences[j][i].second = end; } else { cg_occurrences[j][i].first = get_length1?-1:NA_INTEGER; cg_occurrences[j][i].second = get_length1?-1:NA_INTEGER; } } } if (first) break; // only first match m_res = (int)matcher->find(status); STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) if (!m_res) break; } // Adjust UChar index -> UChar32 index 
(1-2 byte UTF16 to 1 byte UTF32-code points) str_cont.UChar16_to_UChar32_index( i, ret_tab+i, ret_tab+i+vectorize_length, 1, 1, // 0-based index -> 1-based 0 // end returns position of next character after match ); if (get_length1 && ret_tab[i] != NA_INTEGER && ret_tab[i] >= 0) ret_tab[i+vectorize_length] -= ret_tab[i] - 1; } if (capture_groups1) { SEXP cgs; R_len_t pattern_cur_groups = (R_len_t)cg_occurrences.size(); STRI__PROTECT(cgs = Rf_allocVector(VECSXP, pattern_cur_groups)); // last_i = i; for (R_len_t j=0; j * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
#ifndef __stri_interval_h
#define __stri_interval_h


/**
 * A pair of integer endpoints (a, b) carrying a payload of type T.
 *
 * Intervals are ordered by their first endpoint only, see operator<.
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-01)
 */
template <class T>
struct StriInterval {
    int a;   ///< first endpoint; the only field used for ordering
    int b;   ///< second endpoint
    T data;  ///< user data attached to the interval

    /**
     * @param _a first endpoint
     * @param _b second endpoint
     * @param _data payload; a copy is stored
     */
    StriInterval(int _a, int _b, const T& _data)
        : a(_a), b(_b), data(_data)  // copy-construct; T need not be default-constructible
    {
        ;
    }
};


/**
 * Strict ordering of intervals by their first endpoint.
 *
 * @param i1 left-hand side
 * @param i2 right-hand side
 * @return true if i1.a is less than i2.a; b and data are ignored
 */
template <class T>
bool operator<(const StriInterval<T>& i1, const StriInterval<T>& i2)
{
    return (i1.a < i2.a);
}

#endif
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "stri_stringi.h" #include "stri_container_utf8.h" #include "stri_container_utf16.h" #include "stri_container_regex.h" /** * Replace occurrences of a regex pattern * * @param str strings to search in * @param pattern regex patterns to search for * @param replacement replacements * @param opts_regex list * @return character vector * * @version 0.1-?? (Bartek Tartanus) * * @version 0.1-?? (Marek Gagolewski) * use StriContainerUTF16's vectorization * * @version 0.1-?? (Marek Gagolewski, 2013-06-21) * use StriContainerRegexPattern + more general * * @version 0.1-?? 
(Marek Gagolewski, 2013-07-10) * BUGFIX: wrong behavior on empty str * * @version 0.3-1 (Marek Gagolewski, 2014-11-05) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc * * @version 1.0-2 (Marek Gagolewski, 2016-01-29) * Issue #214: allow a regex pattern like `.*` to match an empty string * * @version 1.0-2 (Marek Gagolewski, 2016-01-30) * Issue #210: Allow NA replacement * * @version 1.4.7 (Marek Gagolewski, 2020-08-24) * Use StriContainerRegexPattern::getRegexOptions */ SEXP stri__replace_allfirstlast_regex(SEXP str, SEXP pattern, SEXP replacement, SEXP opts_regex, int type) { PROTECT(str = stri__prepare_arg_string(str, "str")); PROTECT(replacement = stri__prepare_arg_string(replacement, "replacement")); PROTECT(pattern = stri__prepare_arg_string(pattern, "pattern")); StriRegexMatcherOptions pattern_opts = StriContainerRegexPattern::getRegexOptions(opts_regex); STRI__ERROR_HANDLER_BEGIN(3) R_len_t vectorize_length = stri__recycling_rule(true, 3, LENGTH(str), LENGTH(pattern), LENGTH(replacement)); StriContainerUTF16 str_cont(str, vectorize_length, false); // writable StriContainerRegexPattern pattern_cont(pattern, vectorize_length, pattern_opts); StriContainerUTF16 replacement_cont(replacement, vectorize_length); SEXP ret; STRI__PROTECT(ret = Rf_allocVector(STRSXP, vectorize_length)); for (R_len_t i = pattern_cont.vectorize_init(); i != pattern_cont.vectorize_end(); i = pattern_cont.vectorize_next(i)) { STRI__CONTINUE_ON_EMPTY_OR_NA_PATTERN(str_cont, pattern_cont, SET_STRING_ELT(ret, i, NA_STRING);) RegexMatcher *matcher = pattern_cont.getMatcher(i); // will be deleted automatically matcher->reset(str_cont.get(i)); UErrorCode status = U_ZERO_ERROR; if (replacement_cont.isNA(i)) { int m_res = matcher->find(status); STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) if (m_res) str_cont.setNA(i); SET_STRING_ELT(ret, i, str_cont.toR(i)); continue; } if (type == 0) { // all str_cont.set(i, matcher->replaceAll(replacement_cont.get(i), 
status)); STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) } else if (type == 1) { // first str_cont.set(i, matcher->replaceFirst(replacement_cont.get(i), status)); STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) } else if (type == -1) { // end int start = -1; int end = -1; while (1) { // find last match int m_res = matcher->find(status); STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) if (!m_res) break; start = matcher->start(status); STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) end = matcher->end(status); STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) } if (start >= 0) { matcher->find(start, status); // go back STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) UnicodeString out; matcher->appendReplacement(out, replacement_cont.get(i), status); STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) out.append(str_cont.get(i), end, str_cont.get(i).length()-end); str_cont.set(i, out); } } else { throw StriException(MSG__INTERNAL_ERROR); } SET_STRING_ELT(ret, i, str_cont.toR(i)); } STRI__UNPROTECT_ALL return ret; STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */) } /** * Replace all occurrences of a regex pattern; vectorize_all=FALSE * * @param str character vector * @param pattern character vector * @param replacement character vector * @param opts_regex a named list * @return character vector * * @version 0.3-1 (Marek Gagolewski, 2014-11-01) * * @version 0.3-1 (Marek Gagolewski, 2014-11-02) * Second version, 3x faster, 2 for loops + replaceAll * * @version 0.3-1 (Marek Gagolewski, 2014-11-05) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc * * @version 1.0-2 (Marek Gagolewski, 2016-01-30) * Issue #210: Allow NA replacement */ SEXP stri__replace_all_regex_no_vectorize_all(SEXP str, SEXP pattern, SEXP replacement, SEXP opts_regex) { // version beta PROTECT(str = 
stri__prepare_arg_string(str, "str")); // if str_n is 0, then return an empty vector R_len_t str_n = LENGTH(str); if (str_n <= 0) { UNPROTECT(1); return stri__vector_empty_strings(0); } PROTECT(pattern = stri__prepare_arg_string(pattern, "pattern")); PROTECT(replacement = stri__prepare_arg_string(replacement, "replacement")); StriRegexMatcherOptions pattern_opts = StriContainerRegexPattern::getRegexOptions(opts_regex); R_len_t pattern_n = LENGTH(pattern); R_len_t replacement_n = LENGTH(replacement); if (pattern_n < replacement_n || pattern_n <= 0 || replacement_n <= 0) { UNPROTECT(3); Rf_error(MSG__WARN_RECYCLING_RULE2); } else if (pattern_n % replacement_n != 0) Rf_warning(MSG__WARN_RECYCLING_RULE); if (pattern_n == 1) {// this will be much faster: SEXP ret; PROTECT(ret = stri__replace_allfirstlast_regex(str, pattern, replacement, opts_regex, 0)); UNPROTECT(4); return ret; } STRI__ERROR_HANDLER_BEGIN(3) StriContainerUTF16 str_cont(str, str_n, false); // writable StriContainerRegexPattern pattern_cont(pattern, pattern_n, pattern_opts); StriContainerUTF16 replacement_cont(replacement, pattern_n); for (R_len_t i = 0; ireset(str_cont.get(j)); UErrorCode status = U_ZERO_ERROR; if (replacement_cont.isNA(i)) { int m_res = matcher->find(status); STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) if (m_res) str_cont.setNA(j); continue; } str_cont.set(j, matcher->replaceAll(replacement_cont.get(i), status)); STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) } } STRI__UNPROTECT_ALL return str_cont.toR(); STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */) } // version alpha == to slow == too many toutf16 conversions //{ // PROTECT(pattern = stri__prepare_arg_string(pattern, "pattern")); // PROTECT(replacement = stri__prepare_arg_string(replacement, "replacement")); // // R_len_t pattern_n = LENGTH(pattern); // R_len_t replacement_n = LENGTH(replacement); // if (pattern_n < replacement_n || pattern_n <= 0 || replacement_n 
<= 0) // Rf_error(MSG__WARN_RECYCLING_RULE2); // if (pattern_n % replacement_n != 0) // Rf_warning(MSG__WARN_RECYCLING_RULE); // // // no str_error_handlers needed here // SEXP pattern_cur, replacement_cur; // PROTECT(pattern_cur = Rf_allocVector(STRSXP, 1)); // PROTECT(replacement_cur = Rf_allocVector(STRSXP, 1)); // // PROTECT(str); // for (R_len_t i=0; i= n) { // dangling backslash //throw StriException(MSG__INVALID_FORMAT_SPECIFIER, ""); // gsub compatibility: break; } if (x[i] == '$') buf.append("\\$"); else if (x[i] == '\\') buf.append("\\\\"); else if (x[i] >= '1' && x[i] <= '9') { // \\0 not supported buf.push_back('$'); buf.push_back(x[i]); if (i+1 < n && (x[i+1] >= '0' && x[i+1] <= '9')) buf.push_back('\\'); } else buf.push_back(x[i]); } else buf.push_back(x[i]); i++; } return Rf_mkCharLenCE(buf.data(), buf.size(), CE_UTF8); } /** * Convert \1 to $1 and $ to \$ and \a to a * (gsub vs. stri_replace replacement strings) * * @param x character vector * * @return character vector * * @version 1.6.4 (Marek Gagolewski, 2021-06-16) */ SEXP stri_replace_rstr(SEXP x) { PROTECT(x = stri__prepare_arg_string(x, "x")); R_len_t vectorize_length = LENGTH(x); if (vectorize_length <= 0) { UNPROTECT(1); return Rf_allocVector(STRSXP, 0); } STRI__ERROR_HANDLER_BEGIN(1) StriContainerUTF8 x_cont(x, vectorize_length); SEXP ret; STRI__PROTECT(ret = Rf_allocVector(STRSXP, vectorize_length)); for ( R_len_t i = x_cont.vectorize_init(); i != x_cont.vectorize_end(); i = x_cont.vectorize_next(i) ) { if (x_cont.isNA(i)) { SET_STRING_ELT(ret, i, NA_STRING); continue; } SEXP out; STRI__PROTECT(out = stri__replace_rstr_1(x_cont.get(i))); SET_STRING_ELT(ret, i, out); STRI__UNPROTECT(1); } STRI__UNPROTECT_ALL return ret; STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */) } stringi/src/stri_test.cpp0000644000176200001440000001111514750110642015243 0ustar liggesusers/* This file is part of the 'stringi' project. 
#include "stri_stringi.h"
#include "stri_container_utf8.h"
#include "stri_container_utf16.h"


/** Dummy function for measuring the performance of .Call
 *
 * @param x any R object
 * @return x itself; nothing is copied or modified
 *
 * @version 0.1-?? (Marek Gagolewski)
 */
SEXP stri_test_returnasis(SEXP x)
{
    return x;
}


/** Check R encoding marking *for testing only*
 *  This function should not be exported
 *
 * For every element of s, reports whether it is NA or, otherwise,
 * the values of its ASCII, UTF-8, Latin-1, and bytes marks.
 *
 * Results are printed on the standard output (fprintf(stdout, ...)).
 *
 * The body is compiled in only if NDEBUG is not defined;
 * otherwise, calling this function raises an R error.
 *
 * @param s character vector
 * @return R_NilValue (when NDEBUG is not defined)
 *
 * @version 0.1-?? (Marek Gagolewski)
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 */
SEXP stri_test_Rmark(SEXP s)
{
#ifndef NDEBUG
    PROTECT(s = stri__prepare_arg_string(s, "str"));
    int ns = LENGTH(s);
    for (int i=0; i < ns; ++i) {
        fprintf(stdout, "!NDEBUG: Element #%d:\n", i);
        SEXP curs = STRING_ELT(s, i);
        if (curs == NA_STRING) {
            // NA has no encoding marks worth reporting
            fprintf(stdout, "!NDEBUG: \tNA\n");
            continue;
        }
        //const char* string = CHAR(curs);
        // each mark is normalised to 0/1 before printing
        fprintf(stdout, "!NDEBUG: \tMARK_ASCII = %d\n", (IS_ASCII(curs) > 0));
        fprintf(stdout, "!NDEBUG: \tMARK_UTF8 = %d\n", (IS_UTF8(curs) > 0));
        fprintf(stdout, "!NDEBUG: \tMARK_LATIN1= %d\n", (IS_LATIN1(curs) > 0));
        fprintf(stdout, "!NDEBUG: \tMARK_BYTES = %d\n", (IS_BYTES(curs) > 0));
        fprintf(stdout, "!NDEBUG: \n");
    }
    UNPROTECT(1);
    return R_NilValue;
#else
    Rf_error("This function is enabled only if NDEBUG is undef.");
    return s; // s here avoids compiler warning
#endif
}
/** for testing efficiency of StriContainerUTF16 [internal]
 *
 * Only constructs a StriContainerUTF16 from str; the container
 * is discarded right away.
 *
 * @param str character vector
 * @return R_NilValue
 *
 * @version 0.1-?? (Marek Gagolewski)
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 */
SEXP stri_test_UnicodeContainer16(SEXP str)
{
    PROTECT(str = stri__prepare_arg_string(str, "str"));
    STRI__ERROR_HANDLER_BEGIN(1)
    StriContainerUTF16 ss(str, LENGTH(str));
    STRI__UNPROTECT_ALL
    return R_NilValue;
    STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */)
}


/** for testing efficiency of StriContainerUTF16 [internal]
 *
 * Unlike stri_test_UnicodeContainer16, this variant also converts
 * the container back to an R object with StriContainerUTF16::toR().
 *
 * @param str character vector
 * @return the result of StriContainerUTF16::toR() (not R_NilValue)
 *
 * @version 0.4-1 (Marek Gagolewski, 2014-12-03)
 */
SEXP stri_test_UnicodeContainer16b(SEXP str)
{
    PROTECT(str = stri__prepare_arg_string(str, "str"));
    STRI__ERROR_HANDLER_BEGIN(1)
    StriContainerUTF16 ss(str, LENGTH(str));
    STRI__UNPROTECT_ALL
    return ss.toR();
    STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */)
}


/** for testing efficiency of StriContainerUTF8 [internal]
 *
 * Only constructs a StriContainerUTF8 from str; the container
 * is discarded right away.
 *
 * @param str character vector
 * @return R_NilValue
 *
 * @version 0.1-?? (Marek Gagolewski)
 */
SEXP stri_test_UnicodeContainer8(SEXP str)
{
    PROTECT(str = stri__prepare_arg_string(str, "str"));
    STRI__ERROR_HANDLER_BEGIN(1)
    StriContainerUTF8 ss(str, LENGTH(str));
    STRI__UNPROTECT_ALL
    return R_NilValue;
    STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */)
}
stri_random.cpp \ stri_reverse.cpp \ stri_search_class_count.cpp \ stri_search_class_detect.cpp \ stri_search_class_extract.cpp \ stri_search_class_locate.cpp \ stri_search_class_replace.cpp \ stri_search_class_split.cpp \ stri_search_class_startsendswith.cpp \ stri_search_class_subset.cpp \ stri_search_class_trim.cpp \ stri_search_common.cpp \ stri_search_coll_count.cpp \ stri_search_coll_detect.cpp \ stri_search_coll_extract.cpp \ stri_search_coll_locate.cpp \ stri_search_coll_replace.cpp \ stri_search_coll_split.cpp \ stri_search_coll_startsendswith.cpp \ stri_search_coll_subset.cpp \ stri_search_boundaries_count.cpp \ stri_search_boundaries_extract.cpp \ stri_search_boundaries_locate.cpp \ stri_search_boundaries_split.cpp \ stri_search_fixed_count.cpp \ stri_search_fixed_detect.cpp \ stri_search_fixed_extract.cpp \ stri_search_fixed_locate.cpp \ stri_search_fixed_replace.cpp \ stri_search_fixed_split.cpp \ stri_search_fixed_subset.cpp \ stri_search_fixed_startsendswith.cpp \ stri_search_in.cpp \ stri_search_other_split.cpp \ stri_search_regex_count.cpp \ stri_search_regex_detect.cpp \ stri_search_regex_extract.cpp \ stri_search_regex_locate.cpp \ stri_search_regex_match.cpp \ stri_search_regex_replace.cpp \ stri_search_regex_split.cpp \ stri_search_regex_subset.cpp \ stri_sort.cpp \ stri_sprintf.cpp \ stri_stats.cpp \ stri_string8.cpp \ stri_stringi.cpp \ stri_sub.cpp \ stri_test.cpp \ stri_time_zone.cpp \ stri_time_calendar.cpp \ stri_time_symbols.cpp \ stri_time_format.cpp \ stri_trans_casemap.cpp \ stri_trans_other.cpp \ stri_trans_normalization.cpp \ stri_trans_transliterate.cpp \ stri_ucnv.cpp \ stri_uloc.cpp \ stri_utils.cpp \ stri_wrap.cpp stringi/src/stri_container_listraw.cpp0000644000176200001440000001212114770541312020015 0ustar liggesusers/* This file is part of the 'stringi' project. * Copyright (c) 2013-2025, Marek Gagolewski * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "stri_stringi.h" #include "stri_container_listraw.h" /** * Default constructor * */ StriContainerListRaw::StriContainerListRaw() : StriContainerBase() { data = NULL; } /** * Construct String Container from R object * @param rstr R object * * if you want nrecycle > n, call set_nrecycle * * @version 1.6.2 (Marek Gagolewski, 2021-05-14) * #354 Force the copying of ALTREP data */ StriContainerListRaw::StriContainerListRaw(SEXP rstr) { this->data = NULL; if (Rf_isNull(rstr)) { this->init_Base(1, 1, true); this->data = new String8[this->n]; // 1 string, NA if (!this->data) throw StriException(MSG__MEM_ALLOC_ERROR); } else if (isRaw(rstr)) { this->init_Base(1, 1, true); this->data = new String8[this->n]; if (!this->data) throw StriException(MSG__MEM_ALLOC_ERROR); bool memalloc = ALTREP(rstr); // #354: force copying of ALTREP data this->data[0].initialize((const char*)RAW(rstr), LENGTH(rstr), memalloc, false/*killbom*/, false/*isASCII*/); // shallow copy } else if (Rf_isVectorList(rstr)) { R_len_t nv = LENGTH(rstr); this->init_Base(nv, nv, true); this->data = new String8[this->n]; if (!this->data) throw StriException(MSG__MEM_ALLOC_ERROR); for (R_len_t i=0; in; ++i) { SEXP cur = VECTOR_ELT(rstr, i); if (!Rf_isNull(cur)) { bool memalloc = ALTREP(cur); // #354: force copying of ALTREP data this->data[i].initialize((const char*)RAW(cur), LENGTH(cur), memalloc, false/*killbom*/, false/*isASCII*/); // shallow copy } // else leave as-is, i.e., NA } } else { // it's surely a character vector (args have been checked) R_len_t nv = LENGTH(rstr); this->init_Base(nv, nv, true); this->data = new String8[this->n]; if (!this->data) throw StriException(MSG__MEM_ALLOC_ERROR); for (R_len_t i=0; in; ++i) { SEXP cur = STRING_ELT(rstr, i); if (cur != NA_STRING) { bool memalloc = ALTREP(rstr); // #354: force copying of ALTREP data this->data[i].initialize(CHAR(cur), LENGTH(cur), memalloc, false/*killbom*/, false/*isASCII*/); // shallow copy } // else leave as-is, i.e., NA } } } 
StriContainerListRaw::StriContainerListRaw(StriContainerListRaw& container) : StriContainerBase((StriContainerBase&)container) { if (container.data) { this->data = new String8[this->n]; if (!this->data) throw StriException(MSG__MEM_ALLOC_ERROR); for (int i=0; in; ++i) { this->data[i] = container.data[i]; } } else { this->data = NULL; } } StriContainerListRaw& StriContainerListRaw::operator=(StriContainerListRaw& container) { this->~StriContainerListRaw(); (StriContainerBase&) (*this) = (StriContainerBase&)container; if (container.data) { this->data = new String8[this->n]; if (!this->data) throw StriException(MSG__MEM_ALLOC_ERROR); for (int i=0; in; ++i) { this->data[i] = container.data[i]; } } else { this->data = NULL; } return *this; } StriContainerListRaw::~StriContainerListRaw() { if (data) { delete [] data; data = NULL; } } stringi/src/stri_container_bytesearch.cpp0000644000176200001440000003436614770541421020501 0ustar liggesusers/* This file is part of the 'stringi' project. * Copyright (c) 2013-2025, Marek Gagolewski * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "stri_stringi.h" #include "stri_container_bytesearch.h" #include /** * Default constructor * */ StriContainerByteSearch::StriContainerByteSearch() : StriContainerUTF8() { this->matcher = NULL; this->flags = 0; } /** * Construct String Container from R character vector * @param rstr R character vector * @param _nrecycle extend length [vectorization] */ StriContainerByteSearch::StriContainerByteSearch(SEXP rstr, R_len_t _nrecycle, uint32_t _flags) : StriContainerUTF8(rstr, _nrecycle, true) { this->flags = _flags; this->matcher = NULL; R_len_t n = get_n(); for (R_len_t i=0; imatcher = NULL; this->flags = container.flags; } /** Copy operator * @param container source * @return *this */ StriContainerByteSearch& StriContainerByteSearch::operator=(StriContainerByteSearch& container) { this->~StriContainerByteSearch(); (StriContainerUTF8&) (*this) = (StriContainerUTF8&)container; return *this; } /** Destructor * */ StriContainerByteSearch::~StriContainerByteSearch() { if (matcher) { delete matcher; matcher = NULL; } } /** * @version 0.5-1 (Marek Gagolewski, 2015-02-14) */ StriByteSearchMatcher* StriContainerByteSearch::getMatcher(R_len_t i) { if (i >= n && matcher && matcher->getPatternStr() == 
get(i).c_str()) { // matcher reuse } else { if (matcher) { delete matcher; matcher = NULL; } if (isCaseInsensitive()) matcher = new StriByteSearchMatcherKMPci(get(i).c_str(), get(i).length(), isOverlap()); else if (get(i).length() == 1) matcher = new StriByteSearchMatcher1(get(i).c_str(), get(i).length(), isOverlap()); else if (get(i).length() < 16) matcher = new StriByteSearchMatcherShort(get(i).c_str(), get(i).length(), isOverlap()); else matcher = new StriByteSearchMatcherKMP(get(i).c_str(), get(i).length(), isOverlap()); } return matcher; } /** find first match - case of short pattern * * @param startPos where to start * @return USEARCH_DONE on no match, otherwise start index * * @version 0.2-3 (Marek Gagolewski, 2014-05-11) * special procedure for patternLen <= 4 * * @version 0.2-4 (Marek Gagolewski, 2014-05-15) * BUGFIX: load of misaligned addresses * * @version 0.4-1 (Marek Gagolewski, 2014-11-30) * BUGFIX: ret USEARCH_DONE immediately if startPos is too far away */ //R_len_t StriContainerByteSearch::findFromPosFwd_short(R_len_t startPos) //{ // if (startPos > searchLen-patternLen) { // this check is OK, we do a case-sensitive search // searchPos = searchEnd = searchLen; // return USEARCH_DONE; // } // // if (patternLen == 1) { // // else not found // unsigned char pat = (unsigned char)patternStr[0]; /* TO DO: why can't this be cached? */ // for (searchPos = startPos; searchPos=0; --searchPos) { // if (pat == (unsigned char)searchStr[searchPos]) { // searchEnd = searchPos + 1; // return searchPos; // } // } // } // else if (patternLen == 2) { // // be careful: little vs big endian! 
// uint16_t pat = ((uint16_t)((unsigned char)patternStr[1])); // pat <<= 8; // pat |= ((uint16_t)((unsigned char)patternStr[0])); // // unsigned char* curstr = (unsigned char*)(searchStr+startPos); // uint16_t cur = ((uint16_t)(*curstr)); // --curstr; // for (searchPos = startPos-1; searchPos>=0; --searchPos) { // cur <<= 8; // cur |= (uint16_t)(*curstr); // --curstr; // if (pat == cur) { // searchEnd = searchPos + 2; // return searchPos; // } // } // } // else if (patternLen == 3) { // uint32_t pat = ((uint32_t)((unsigned char)patternStr[2])); // pat <<= 8; // pat |= ((uint32_t)((unsigned char)patternStr[1])); // pat <<= 8; // pat |= ((uint32_t)((unsigned char)patternStr[0])); // // unsigned char* curstr = (unsigned char*)(searchStr+startPos); // uint32_t cur = ((uint32_t)(*curstr)); // --curstr; // cur <<= 8; // cur |= (uint32_t)(*curstr); // --curstr; // // uint32_t mask = ~(((unsigned char)0xff)<<24); // // for (searchPos = startPos-2; searchPos>=0; --searchPos) { // cur <<= 8; // cur |= (uint32_t)(*curstr); // --curstr; // if ((pat&mask) == (cur&mask)) { // searchEnd = searchPos + 3; // return searchPos; // } // } // } // else if (patternLen == 4) { // uint32_t pat = ((uint32_t)((unsigned char)patternStr[3])); /* TO DO: can't this be cached? 
*/ // pat <<= 8; // pat |= ((uint32_t)((unsigned char)patternStr[2])); // pat <<= 8; // pat |= ((uint32_t)((unsigned char)patternStr[1])); // pat <<= 8; // pat |= ((uint32_t)((unsigned char)patternStr[0])); // // unsigned char* curstr = (unsigned char*)(searchStr+startPos); // uint32_t cur = ((uint32_t)(*curstr)); // --curstr; // cur <<= 8; // cur |= (uint32_t)(*curstr); // --curstr; // cur <<= 8; // cur |= (uint32_t)(*curstr); // --curstr; // // // for (searchPos = startPos-3; searchPos>=0; --searchPos) { // cur <<= 8; // cur |= (uint32_t)(*curstr); // --curstr; // if (pat == cur) { // searchEnd = searchPos + 4; // return searchPos; // } // } // } // // else not found // searchPos = searchEnd = searchLen; // return USEARCH_DONE; //} /** Read settings flags from a list * * may call Rf_error * * @param opts_fixed list * @param allow_overlap * @return flags * * @version 0.4-1 (Marek Gagolewski, 2014-12-07) * * @version 0.4-1 (Marek Gagolewski, 2014-12-08) * add `overlap` option * * @version 1.1.6 (Marek Gagolewski, 2017-11-10) * PROTECT STRING_ELT(names, i) */ uint32_t StriContainerByteSearch::getByteSearchFlags(SEXP opts_fixed, bool allow_overlap) { uint32_t flags = 0; if (!Rf_isNull(opts_fixed) && !Rf_isVectorList(opts_fixed)) Rf_error(MSG__ARG_EXPECTED_LIST, "opts_fixed"); // error() call allowed here R_len_t narg = Rf_isNull(opts_fixed)?0:LENGTH(opts_fixed); if (narg > 0) { SEXP names = PROTECT(Rf_getAttrib(opts_fixed, R_NamesSymbol)); if (names == R_NilValue || LENGTH(names) != narg) Rf_error(MSG__FIXED_CONFIG_FAILED); // error() call allowed here for (R_len_t i=0; i * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. 
Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "stri_stringi.h" #include "stri_container_utf16.h" #include "stri_container_regex.h" /** * Count the number of recurrences of \code{pattern} in \code{s} * * @param str strings to search in * @param pattern regex patterns to search for * @param opts_regex list * * @return integer vector * @version 0.1-?? (Bartek Tartanus) * * @version 0.1-?? (Marek Gagolewski) * use StriContainerUTF16's vectorization * * @version 0.1-?? (Marek Gagolewski, 2013-06-16) * make StriException-friendly * * @version 0.1-?? 
(Marek Gagolewski, 2013-06-17) * use StriContainerRegexPattern + opts_regex * * @version 0.3-1 (Marek Gagolewski, 2014-11-05) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc * * @version 1.0-2 (Marek Gagolewski, 2016-01-29) * Issue #214: allow a regex pattern like `.*` to match an empty string * * @version 1.4.7 (Marek Gagolewski, 2020-08-24) * Use StriContainerRegexPattern::getRegexOptions */ SEXP stri_count_regex(SEXP str, SEXP pattern, SEXP opts_regex) { PROTECT(str = stri__prepare_arg_string(str, "str")); PROTECT(pattern = stri__prepare_arg_string(pattern, "pattern")); R_len_t vectorize_length = stri__recycling_rule(true, 2, LENGTH(str), LENGTH(pattern)); StriRegexMatcherOptions pattern_opts = StriContainerRegexPattern::getRegexOptions(opts_regex); STRI__ERROR_HANDLER_BEGIN(2) StriContainerUTF16 str_cont(str, vectorize_length); StriContainerRegexPattern pattern_cont(pattern, vectorize_length, pattern_opts); SEXP ret; STRI__PROTECT(ret = Rf_allocVector(INTSXP, vectorize_length)); int* ret_tab = INTEGER(ret); for (R_len_t i = pattern_cont.vectorize_init(); i != pattern_cont.vectorize_end(); i = pattern_cont.vectorize_next(i)) { STRI__CONTINUE_ON_EMPTY_OR_NA_PATTERN(str_cont, pattern_cont, ret_tab[i] = NA_INTEGER) // see search_regex_detect for UText implementation (often slower) RegexMatcher *matcher = pattern_cont.getMatcher(i); // will be deleted automatically matcher->reset(str_cont.get(i)); UErrorCode status = U_ZERO_ERROR; int count = 0; while (1) { int m_res = (bool)matcher->find(status); STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) if (!m_res) break; ++count; } ret_tab[i] = count; } STRI__UNPROTECT_ALL return ret; STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */) } stringi/src/stri_container_charclass.h0000644000176200001440000001627214770541312017753 0ustar liggesusers/* This file is part of the 'stringi' project. * Copyright (c) 2013-2025, Marek Gagolewski * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef __stri_container_charclass_h #define __stri_container_charclass_h #include "stri_container_base.h" #include "stri_container_utf8.h" #include /** * A container handling charclass searches * * @version 0.1-?? 
(Marek Gagolewski, 2013-06-15) * * @version 0.2-1 (Marek Gagolewski, 2014-04-05) * Use StriContainerUTF8 to convert pattern strings in a constructor; * Use UnicodeSet instead of stringi::CharClass * * @version 0.3-1 (Marek Gagolewski, 2014-11-02) * New method: locateAll * * @version 1.6.3 (Marek Gagolewski, 2021-06-10) * negate */ class StriContainerCharClass : public StriContainerBase { private: UnicodeSet* data; // array public: StriContainerCharClass() : StriContainerBase() { data = NULL; } StriContainerCharClass(SEXP rvec, R_len_t _nrecycle, bool negate=false) { #ifndef NDEBUG if (!Rf_isString(rvec)) throw StriException("DEBUG: !Rf_isString in StriContainerCharClass::StriContainerCharClass(SEXP rvec)"); #endif int _n = LENGTH(rvec); this->init_Base(_n, _nrecycle, true); this->data = NULL; if (_n > 0) { StriContainerUTF8 rvec_cont(rvec, _n, true); this->data = new UnicodeSet[_n]; for (int i=0; i<_n; ++i) { if (rvec_cont.isNA(i)) this->data[i].setToBogus(); else { UErrorCode status = U_ZERO_ERROR; this->data[i].applyPattern( UnicodeString::fromUTF8(rvec_cont.get(i).c_str()), status ); STRI__CHECKICUSTATUS_THROW(status, {delete [] data; data = NULL;}) if (negate) this->data[i].complement(); this->data[i].freeze(); } } } } StriContainerCharClass(StriContainerCharClass& container) :StriContainerBase((StriContainerBase&)container) { if (container.data) { this->data = new UnicodeSet[container.n]; for (int i=0; idata[i] = container.data[i]; } else this->data = NULL; } ~StriContainerCharClass() { if (data) delete [] data; } StriContainerCharClass& operator=(StriContainerCharClass& container) { this->~StriContainerCharClass(); (StriContainerBase&) (*this) = (StriContainerBase&)container; if (container.data) { this->data = new UnicodeSet[container.n]; for (int i=0; idata[i] = container.data[i]; } else this->data = NULL; return *this; } /** check if the vectorized ith element is NA * @param i index * @return true if is NA */ inline bool isNA(R_len_t i) const { #ifndef 
NDEBUG if (i < 0 || i >= nrecycle) throw StriException("StriContainerCharClass::isNA(): INDEX OUT OF BOUNDS"); #endif return data[i%n].isBogus(); } /** get the vectorized ith element * @param i index * @return integer */ inline const UnicodeSet& get(R_len_t i) const { #ifndef NDEBUG if (i < 0 || i >= nrecycle) throw StriException("StriContainerCharClass::get(): INDEX OUT OF BOUNDS"); if (data[i%n].isBogus()) throw StriException("StriContainerCharClass::get(): isNA"); #endif return (data[i%n]); } /** Locate all occurrences of a charclass * * @return total number of bytes @ pattern matches (idx_codepoint==false) * or total number of codepoints matched (idx_codepoint==true) */ static R_len_t locateAll(deque< pair >& occurrences, const UnicodeSet* pattern_cur, const char* str_cur_s, R_len_t str_cur_n, bool merge_cur, bool idx_codepoint) { if (idx_codepoint) { R_len_t j, k; UChar32 chr; R_len_t sumcodepoints = 0; for (k=j=0; jcontains(chr)) { if (merge_cur && occurrences.size() > 0 && occurrences.back().second == k-1) occurrences.back().second = k; else occurrences.push_back(pair(k-1, k)); ++sumcodepoints; } } return sumcodepoints; } else { R_len_t j, jlast; UChar32 chr; R_len_t sumbytes = 0; for (jlast=j=0; jcontains(chr)) { if (merge_cur && occurrences.size() > 0 && occurrences.back().second == jlast) occurrences.back().second = j; else occurrences.push_back(pair(jlast, j)); sumbytes += j-jlast; } jlast = j; } return sumbytes; } } }; #endif stringi/src/stri_callables.h0000644000176200001440000000420114750143224015653 0ustar liggesusers/* This file is part of the 'stringi' project. * Copyright (c) 2013-2025, Marek Gagolewski * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. 
Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "stri_stringi.h" #include // the callables library is defined in stri_callables.cpp const extern R_CallMethodDef stri_callables[]; /* Third-party packages can retrieve the following functions from stringi's shared library via a call to: #include R_GetCCallable("stringi", "function_name"); If you would like to get access to any additional functions (e.g., from ICU), feel free to contact the maintainer of stringi. */ int stric_u_hasBinaryProperty(int c, int which); stringi/src/stri_messages.h0000644000176200001440000002637414770541572015571 0ustar liggesusers/* This file is part of the 'stringi' project. * Copyright (c) 2013-2025, Marek Gagolewski * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef __stri_messages_h #define __stri_messages_h // CONVENTION: start with a lowercase letter, do not end with a dot // separate messages with "; " like "hello" "; " "it is me you're looking for" // or MSG__INCORRECT_NAMED_ARG "; " MSG__EXPECTED_SMALLER // arg names and symbols should be `backticked` // string-like objects should be 'quoted' // ASCII only // ensure proper capitalisation: ASCII, Unicode, UTF-8, ... 
// see stri_exception.cpp for more messages #define MSG__INCORRECT_INTERNAL_ARG \ "incorrect argument" #define MSG__INCORRECT_NAMED_ARG \ "incorrect argument `%s`" #define MSG__INTERNAL_ERROR \ "internal error" #define MSG__ICU_ERROR \ "%s (%s)" #define MSG__ICU_ERROR_WITH_CONTEXT \ "%s (%s, context=`%s`)" #define MSG__ICU_WARNING \ "%s (%s)" /// incorrect uchar class id, see stri_char_getpropertyid and stri_char_getcategoryid #define MSG__INCORRECT_UCHAR_CLASS_ID \ "incorrect character class identifier" #define MSG__INCORRECT_MATCH_OPTION \ "incorrect option for `%s`" #define MSG__INCORRECT_COLLATOR_OPTION \ "incorrect opts_collator setting: '%s'; ignoring" #define MSG__INCORRECT_COLLATOR_OPTION_SPEC \ "incorrect collator option specifier, see ?stri_opts_collator" #define MSG__INCORRECT_BRKITER_OPTION_SPEC \ "incorrect break iterator option specifier, see ?stri_opts_brkiter" #define MSG__INCORRECT_FIXED_OPTION \ "incorrect opts_fixed setting: '%s'; ignoring" #define MSG__INCORRECT_REGEX_OPTION \ "incorrect opts_regex setting: '%s'; ignoring" #define MSG__INVALID_CODE_POINT \ "invalid Unicode code point \\U%08x" #define MSG__INVALID_CODE_POINT_FIXING \ "invalid UTF-8 code point definition; fixing" #define MSG__INVALID_CODE_POINT_REPLNA \ "invalid UTF-8 code point definition; setting string to NA, see also ?stri_enc_toutf8" #define MSG__INVALID_UTF8 \ "invalid UTF-8 byte sequence detected; try calling stri_enc_toutf8()" #define MSG__INVALID_ESCAPE \ "invalid escape sequence detected; setting NA" #define MSG__REPLACEMENT_ZERO \ "replacement has length zero" #define MSG_REPLACEMENT_MULTIPLE \ "number of items to replace is not a multiple of replacement length" #define MSG__UNCONVERTIBLE_CODE_POINT \ "the Unicode code point \\U%08x cannot be converted to destination encoding" #define MSG__UNCONVERTIBLE_BINARY_1 \ "input data \\x%02x in the current source encoding could not be converted to Unicode" #define MSG__UNCONVERTIBLE_BINARY_2 \ "input data \\x%02x\\x%02x in the 
current source encoding could not be converted to Unicode" #define MSG__UNCONVERTIBLE_BINARY_3 \ "input data \\x%02x\\x%02x\\x%02x in the current source encoding could not be converted to Unicode" #define MSG__UNCONVERTIBLE_BINARY_4 \ "input data \\x%02x\\x%02x\\x%02x\\x%02x in the current source encoding could not be converted to Unicode" #define MSG__UNCONVERTIBLE_BINARY_n \ "some input data in the current source encoding could not be converted to Unicode" #define MSG__WARN_LIST_COERCION \ "argument is not an atomic vector; coercing" /// warning when applying recycling rule to not fully recycled args #define MSG__WARN_RECYCLING_RULE \ "longer object length is not a multiple of shorter object length" #define MSG__WARN_RECYCLING_RULE2 \ "vector length not consistent with other arguments" #define MSG__WARN_EMPTY_VECTOR \ "vector has length zero" #define MSG__EXPECTED_NONNEGATIVE \ "expected a nonnegative numeric value" #define MSG__EXPECTED_POSITIVE \ "expected a positive numeric value" #define MSG__EXPECTED_SMALLER \ "value too large" #define MSG__EXPECTED_LARGER \ "value too small" #define MSG__EXPECTED_ASCII \ "expected a string that consists of ASCII characters only" #define MSG__EXPECTED_CHAR_IN_SET \ "expected a character in [%s]" #define MSG__TIMEZONE_INCORRECT_ID \ "incorrect time zone identifier" #define MSG__LOCALE_ERROR_SET \ "locale could not be set or selected" #define MSG__ENC_ERROR_GETNAME \ "character encoding name could not be fetched by the ICU converter" #define MSG__ENC_ERROR_SET \ "character encoding could not be set, queried, or selected" #define MSG__ENC_ERROR_CONVERT \ "encoding could not be converted" #define MSG__LOCALE_INCORRECT_ID \ "incorrect locale identifier" #define MSG__ENC_INCORRECT_ID \ "incorrect character encoding identifier" #define MSG__ENC_INCORRECT_ID_WHAT \ "incorrect character encoding identifier: '%s'" #define MSG__ENC_NOT8BIT \ "encoding %s is not an 8-bit encoding" #define MSG__BYTESENC \ "bytes encoding is not supported 
by this function" #define MSG__REGEX_FAILED \ "regex search failed" #define MSG__REGEX_CONFIG_FAILED \ "regex engine configuration failed" #define MSG__REGEX_FAILED_DETAILS \ "regex search failed: %s" #define MSG__FIXED_CONFIG_FAILED \ "fixed search engine configuration failed" #define MSG__STRSEARCH_FAILED \ "string search failed" #define MSG__RESOURCE_ERROR_GET \ "requested ICU resource is unavailable" #define MSG__RESOURCE_ERROR_APPLY \ "error while applying operation" #define MSG__LOCATE_DIM_START \ "start" #define MSG__LOCATE_DIM_END \ "end" #define MSG__LOCATE_DIM_LENGTH \ "length" #define MSG__NEWLINE_FOUND \ "newline character found in a string" #define MSG__NOT_EQ_N_CODEPOINTS \ "each string in `%s` should consist of exactly %d code points" #define MSG__NOT_EQ_N_WIDTH \ "each string in `%s` should consist of code points of total width %d" #define MSG__CHARCLASS_INCORRECT_WHICH \ "unknown character class '%s'; assuming NA" #define MSG__CHARCLASS_INCORRECT \ "unknown character class" #define MSG__ARG_EXPECTED_NOT_NA \ "missing values in argument `%s` is not supported" #define MSG__ARG_EXPECTED_NOT_EMPTY \ "argument `%s` should be a non-empty vector" #define MSG__ARG_EXPECTED_NOT_NULL \ "argument `%s` should not be a NULL" #define MSG__ARG_EXPECTED_1_STRING \ "argument `%s` should be a single character string; only the first element is used" #define MSG__ARG_EXPECTED_1_LOGICAL \ "argument `%s` should be a single logical value; only the first element is used" #define MSG__ARG_EXPECTED_1_INTEGER \ "argument `%s` should be a single integer value; only the first element is used" #define MSG__ARG_EXPECTED_1_NUMERIC \ "argument `%s` should be a single numeric value; only the first element is used" #define MSG__ARG_EXPECTED_STRING \ "argument `%s` should be a character vector (or an object coercible to)" #define MSG__ARG_EXPECTED_LIST \ "argument `%s` should be a list" #define MSG__ARG_EXPECTED_LIST_STRING \ "argument `%s` should be a list of character vectors (or 
an object coercible to)" #define MSG__ARG_EXPECTED_LIST_INTEGER \ "argument `%s` should be a list of integer vectors or an integer vector (or an object coercible to)" #define MSG__ARG_EXPECTED_VECTOR \ "argument `%s` should be a vector" #define MSG__ARG_EXPECTED_RAW \ "argument `%s` should be a raw vector (or an object coercible to)" #define MSG__ARG_EXPECTED_LOGICAL \ "argument `%s` should be a logical vector (or an object coercible to)" #define MSG__ARG_EXPECTED_INTEGER \ "argument `%s` should be an integer vector (or an object coercible to)" #define MSG__ARG_EXPECTED_NUMERIC \ "argument `%s` should be a numeric vector (or an object coercible to)" #define MSG__ARG_EXPECTED_POSIXct \ "argument `%s` should be an object of class POSIXct (or an object coercible to)" #define MSG__ARG_EXPECTED_STRING_NO_COERCION \ "argument `%s` should be a character vector" #define MSG__ARG_EXPECTED_RAW_IN_LIST_NO_COERCION \ "all elements in `%s` should be a raw vectors" #define MSG__ARG_EXPECTED_RAW_NO_COERCION \ "argument `%s` should be a raw vector" #define MSG__ARG_EXPECTED_LOGICAL_NO_COERCION \ "argument `%s` should be a logical vector" #define MSG__ARG_EXPECTED_INTEGER_NO_COERCION \ "argument `%s` should be an integer vector" #define MSG__ARG_EXPECTED_NUMERIC_NO_COERCION \ "argument `%s` should be a numeric vector" #define MSG__ARG_EXPECTED_MATRIX_WITH_GIVEN_COLUMNS \ "argument `%s` should be a matrix with %d columns" #define MSG__ARG_EXPECTED_NOT_MATRIX \ "argument `%s` is a matrix, which is not supported in this context" #define MSG__ARG_IGNORING \ "ignoring argument `%s` in this context" #define MSG__ARG_EXCLUSIVE \ "arguments `%s` and `%s` are mutually exclusive in this context" #define MSG__ARG_NEED_MORE \ "too few arguments" #define MSG__ARG_UNUSED \ "some arguments have not been used" #define MSG__ARG_UNUSED_1 \ "one argument has not been used" #define MSG__ARG_UNUSED_N \ "%d arguments have not been used" #define MSG__PROBLEMATIC_FORMAT_SPECIFIER_CHAR \ "conversion 
specifier '%%%c' might be non-portable/problematic" #define MSG__INVALID_FORMAT_SPECIFIER \ "conversion specifier '%%%s' is not valid" #define MSG__INVALID_FORMAT_SPECIFIER_SUB \ "conversion specifier '%%%.*s' is not valid" #define MSG__EMPTY_SEARCH_PATTERN_UNSUPPORTED \ "empty search patterns are not supported" #define MSG__OVERLAPPING_PATTERN_UNSUPPORTED \ "overlapping pattern matches are not supported" #define MSG__OVERLAPPING_OR_UNSORTED_INDEXES \ "index ranges must be sorted and mutually disjoint" #define MSG__MEM_ALLOC_ERROR \ "memory allocation or access error" #define MSG__MEM_ALLOC_ERROR_WITH_SIZE \ "memory allocation error: failed to allocate %zu bytes" #define MSG__BUF_SIZE_EXCEEDED \ "internal error: required buffer size is beyond the permitted limit" #define MSG__U_CHARSET_IS_UTF8 \ "system ICU assumes that the default character set is always UTF-8, and hence this function has no effect" #define MSG__CHARSXP_2147483647 \ "Elements of character vectors (CHARSXPs) are limited to 2^31-1 bytes" #endif stringi/src/stri_search_class_trim.cpp0000644000176200001440000001514414770541312017763 0ustar liggesusers/* This file is part of the 'stringi' project. * Copyright (c) 2013-2025, Marek Gagolewski * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "stri_stringi.h" #include "stri_container_utf8.h" #include "stri_container_charclass.h" /** * Trim characters from a charclass from left AND/OR right side of the string * * @param str character vector * @param pattern character vector * @param left from left? * @param right from right? * @return character vector * * @version 0.1-?? (Bartek Tartanus) * * @version 0.1-?? (Marek Gagolewski, 2013-06-04) * Use StriContainerUTF8 and CharClass * * @version 0.1-??
(Marek Gagolewski, 2013-06-16) * make StriException-friendly & Use StrContainerCharClass * * @version 0.2-1 (Marek Gagolewski, 2014-04-03) * detects invalid UTF-8 byte stream * * @version 0.2-1 (Marek Gagolewski, 2014-04-05) * StriContainerCharClass now relies on UnicodeSet * * @version 0.3-1 (Marek Gagolewski, 2014-11-04) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc * * @version 1.6.3 (Marek Gagolewski, 2021-06-10) negate */ SEXP stri__trim_leftright(SEXP str, SEXP pattern, bool left, bool right, bool negate) { PROTECT(str = stri__prepare_arg_string(str, "str")); PROTECT(pattern = stri__prepare_arg_string(pattern, "pattern")); R_len_t vectorize_length = stri__recycling_rule(true, 2, LENGTH(str), LENGTH(pattern)); STRI__ERROR_HANDLER_BEGIN(2) StriContainerUTF8 str_cont(str, vectorize_length); StriContainerCharClass pattern_cont(pattern, vectorize_length, negate); SEXP ret; STRI__PROTECT(ret = Rf_allocVector(STRSXP, vectorize_length)); for (R_len_t i = pattern_cont.vectorize_init(); i != pattern_cont.vectorize_end(); i = pattern_cont.vectorize_next(i)) { if (str_cont.isNA(i) || pattern_cont.isNA(i)) { SET_STRING_ELT(ret, i, NA_STRING); continue; } const UnicodeSet* pattern_cur = &pattern_cont.get(i); R_len_t str_cur_n = str_cont.get(i).length(); const char* str_cur_s = str_cont.get(i).c_str(); R_len_t jlast1 = 0; R_len_t jlast2 = str_cur_n; if (left) { UChar32 chr; for (R_len_t j=0; jcontains(chr)) { break; // break at first occurrence } jlast1 = j; } } if (right && jlast1 < str_cur_n) { UChar32 chr; for (R_len_t j=str_cur_n; j>0; ) { U8_PREV(str_cur_s, 0, j, chr); // "look behind" if (chr < 0) // invalid utf-8 sequence throw StriException(MSG__INVALID_UTF8); if (pattern_cur->contains(chr)) { break; // break at first occurrence } jlast2 = j; } } // now jlast is the index, from which we start copying SET_STRING_ELT(ret, i, Rf_mkCharLenCE(str_cur_s+jlast1, (jlast2-jlast1), CE_UTF8)); } STRI__UNPROTECT_ALL return ret; STRI__ERROR_HANDLER_END(;/* 
nothing special to be done on error */) } /** * Trim characters from a charclass from both sides of the string * * @param str character vector * @param pattern character vector * @return character vector * * @version 0.1-?? (Bartek Tartanus) * * @version 0.1-?? (Marek Gagolewski, 2013-06-04) * Use stri__trim_leftright * * @version 0.1-?? (Marek Gagolewski, 2013-06-16) * make StriException-friendly * * @version 1.6.3 (Marek Gagolewski, 2021-06-10) negate */ SEXP stri_trim_both(SEXP str, SEXP pattern, SEXP negate) { bool negate_val = stri__prepare_arg_logical_1_notNA(negate, "negate"); return stri__trim_leftright(str, pattern, true, true, negate_val); } /** * Trim characters from a charclass from the left of the string * * @param str character vector * @param pattern character vector * @return character vector * * @version 0.1-?? (Bartek Tartanus) * * @version 0.1-?? (Marek Gagolewski, 2013-06-04) * Use stri__trim_leftright * * @version 1.6.3 (Marek Gagolewski, 2021-06-10) negate */ SEXP stri_trim_left(SEXP str, SEXP pattern, SEXP negate) { bool negate_val = stri__prepare_arg_logical_1_notNA(negate, "negate"); return stri__trim_leftright(str, pattern, true, false, negate_val); } /** * Trim characters from a charclass from the right of the string * * @param str character vector * @param pattern character vector * @return character vector * * @version 0.1-?? (Bartek Tartanus) * * @version 0.1-?? (Marek Gagolewski, 2013-06-04) * Use stri__trim_leftright * * @version 1.6.3 (Marek Gagolewski, 2021-06-10) negate */ SEXP stri_trim_right(SEXP str, SEXP pattern, SEXP negate) { bool negate_val = stri__prepare_arg_logical_1_notNA(negate, "negate"); return stri__trim_leftright(str, pattern, false, true, negate_val); } stringi/src/stri_encoding_conversion.cpp0000644000176200001440000005730014770541312020331 0ustar liggesusers/* This file is part of the 'stringi' project. * Copyright (c) 2013-2025, Marek Gagolewski * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "stri_stringi.h" #include "stri_container_utf8.h" #include "stri_container_utf16.h" #include "stri_container_listraw.h" #include "stri_container_listint.h" #include "stri_string8buf.h" #include "stri_ucnv.h" #include #define BUF_MAX_LENGTH 2147483647 /** Convert from UTF-32 * * @param vec integer vector or list with integer vectors * @return character vector * * @version 0.1-?? 
(Marek Gagolewski) * * @version 0.2-1 (Marek Gagolewski, 2014-03-25) * StriException friently; * use StriContainerListInt * * @version 0.3-1 (Marek Gagolewski, 2014-11-04) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc */ SEXP stri_enc_fromutf32(SEXP vec) { PROTECT(vec = stri__prepare_arg_list_integer(vec, "vec")); STRI__ERROR_HANDLER_BEGIN(1) StriContainerListInt vec_cont(vec); R_len_t vec_n = vec_cont.get_n(); // get required buf size R_len_t bufsize = 0; for (R_len_t i=0; i bufsize) bufsize = vec_cont.get(i).size(); } bufsize = U8_MAX_LENGTH*bufsize+1; // this will surely be sufficient String8buf buf(bufsize); char* bufdata = buf.data(); SEXP ret; STRI__PROTECT(ret = Rf_allocVector(STRSXP, vec_n)); for (R_len_t i=0; i buf instead of R_alloc; * warn and set NULL on improper UTF-8 byte sequences * * @version 0.2-3 (Marek Gagolewski, 2014-05-12) * Use UChar32* instead of vector as ::data is C++11 * * @version 0.3-1 (Marek Gagolewski, 2014-11-04) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc */ SEXP stri_enc_toutf32(SEXP str) { PROTECT(str = stri__prepare_arg_string(str, "str")); R_len_t n = LENGTH(str); STRI__ERROR_HANDLER_BEGIN(1) StriContainerUTF8 str_cont(str, n); R_len_t bufsize = 1; // to avoid allocating an empty buffer for (R_len_t i=0; i bufsize) bufsize = ni; } UChar32* buf = (UChar32*)R_alloc((size_t)bufsize, (int)sizeof(UChar32)); // at most bufsize UChars32 (bufsize/4 min.) 
STRI_ASSERT(buf); if (!buf) throw StriException(MSG__MEM_ALLOC_ERROR); // deque was slower than using a common, over-sized buf SEXP ret; STRI__PROTECT(ret = Rf_allocVector(VECSXP, n)); // all for (R_len_t i=0; i= 0 && j < sn) { U8_NEXT(s, j, sn, c); buf[k++] = (int)c; } if (c < 0) { throw StriException(MSG__INVALID_UTF8); // SET_VECTOR_ELT(ret, i, R_NilValue); // continue; } else { SEXP conv; STRI__PROTECT(conv = Rf_allocVector(INTSXP, k)); memcpy(INTEGER(conv), buf, (size_t)sizeof(int)*k); SET_VECTOR_ELT(ret, i, conv); STRI__UNPROTECT(1); } } STRI__UNPROTECT_ALL return ret; STRI__ERROR_HANDLER_END({ /* do nothing on error */ }) } /** Convert character vector to UTF-8 * * @param str character vector * @param is_unknown_8bit single logical value; * if TRUE, then in case of ENC_NATIVE or ENC_LATIN1, UTF-8 * REPLACEMENT CHARACTERs (U+FFFD) are * put for codes > 127 * @param validate single logical value (or NA) * * @return character vector * * @version 0.1-XX (Marek Gagolewski) * * @version 0.1-XX (Marek Gagolewski, 2013-06-16) * make StriException-friendly * * @version 0.2-1 (Marek Gagolewski, 2014-03-26) * Use one String8buf; * is_unknown_8bit_logical and UTF-8 tries now to remove BOMs * * @version 0.2-1 (Marek Gagolewksi, 2014-03-30) * added validate arg * * @version 0.3-1 (Marek Gagolewski, 2014-11-04) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc */ SEXP stri_enc_toutf8(SEXP str, SEXP is_unknown_8bit, SEXP validate) { PROTECT(validate = stri__prepare_arg_logical_1(validate, "validate")); bool is_unknown_8bit_logical = stri__prepare_arg_logical_1_notNA(is_unknown_8bit, "is_unknown_8bit"); PROTECT(str = stri__prepare_arg_string(str, "str")); R_len_t n = LENGTH(str); STRI__ERROR_HANDLER_BEGIN(2) SEXP ret; if (!is_unknown_8bit_logical) { // Trivial - everything we need is in StriContainerUTF8 :) // which removes BOMs silently StriContainerUTF8 str_cont(str, n); STRI__PROTECT(ret = str_cont.toR()); } else { // get buf size size_t bufsize = 0; for 
(R_len_t i=0; i bufsize) bufsize = ni; } String8buf buf(bufsize*3); // either 1 byte < 127 or U+FFFD == 3 bytes UTF-8 char* bufdata = buf.data(); STRI__PROTECT(ret = Rf_allocVector(STRSXP, n)); for (R_len_t i=0; i= 3 && (uint8_t)(curs_s[0]) == UTF8_BOM_BYTE1 && (uint8_t)(curs_s[1]) == UTF8_BOM_BYTE2 && (uint8_t)(curs_s[2]) == UTF8_BOM_BYTE3) { // has BOM - get rid of it SET_STRING_ELT(ret, i, Rf_mkCharLenCE(curs_s+3, curs_n-3, CE_UTF8)); } else SET_STRING_ELT(ret, i, curs); continue; } // otherwise, we have an 8-bit encoding R_len_t curn = LENGTH(curs); const char* curs_tab = CHAR(curs); // TODO: ALTREP will be problematic? R_len_t k = 0; for (R_len_t j=0; j= 0 && j < sn) { U8_NEXT(s, j, sn, c); } if (c >= 0) continue; // valid, nothing to do if (LOGICAL(validate)[0] == NA_LOGICAL) { Rf_warning(MSG__INVALID_CODE_POINT_REPLNA); SET_STRING_ELT(ret, i, NA_STRING); } else { size_t bufsize = sn*3; // maximum: 1 byte -> U+FFFD (3 bytes) String8buf buf(bufsize); // maximum: 1 byte -> U+FFFD (3 bytes) char* bufdata = buf.data(); j = 0; size_t k = 0; UBool err = FALSE; while (!err && j < sn) { U8_NEXT(s, j, sn, c); if (c >= 0) { U8_APPEND((uint8_t*)bufdata, k, bufsize, c, err); } else { Rf_warning(MSG__INVALID_CODE_POINT_FIXING); bufdata[k++] = (char)UCHAR_REPLACEMENT_UTF8_BYTE1; bufdata[k++] = (char)UCHAR_REPLACEMENT_UTF8_BYTE2; bufdata[k++] = (char)UCHAR_REPLACEMENT_UTF8_BYTE3; } } if (err) throw StriException(MSG__INTERNAL_ERROR); SET_STRING_ELT(ret, i, Rf_mkCharLenCE(bufdata, k, CE_UTF8)); } } } STRI__UNPROTECT_ALL return ret; STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */) } /** Convert character vector to ASCII * * All charcodes > 127 are replaced with subst chars (0x1A) * * @param str character vector * @return character vector * * @version 0.1-?? (Marek Gagolewski) * * @version 0.1-?? 
(Marek Gagolewski, 2013-06-16) * make StriException-friendly * * @version 0.2-1 (Marek Gagolewski, 2014-03-30) * use single common buf; * warn on invalid utf8 byte stream * * @version 0.3-1 (Marek Gagolewski, 2014-11-04) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc */ SEXP stri_enc_toascii(SEXP str) { PROTECT(str = stri__prepare_arg_string(str, "str")); R_len_t n = LENGTH(str); STRI__ERROR_HANDLER_BEGIN(1) // get buf size size_t bufsize = 0; for (R_len_t i=0; i bufsize) bufsize = ni; } String8buf buf(bufsize); // no more bytes than this needed char* bufdata = buf.data(); SEXP ret; STRI__PROTECT(ret = Rf_allocVector(STRSXP, n)); for (R_len_t i=0; i ASCII_MAXCHARCODE) bufdata[k++] = ASCII_SUBSTITUTE; else bufdata[k++] = (char)c; } SET_STRING_ELT(ret, i, Rf_mkCharLenCE(bufdata, k, CE_UTF8)); // the string will be marked as ASCII anyway by mkCharLenCE } else { // some 8-bit encoding R_len_t k = 0; for (R_len_t j=0; j bufsize) bufsize = str_cont.get(i).length(); } bufsize = UCNV_GET_MAX_BYTES_FOR_STRING(bufsize, ucnv_getMaxCharSize(uconv_to)); // "The calculated size is guaranteed to be sufficient for this conversion." 
if (bufsize > BUF_MAX_LENGTH) bufsize = BUF_MAX_LENGTH; String8buf buf(bufsize); for (R_len_t i=0; i BUF_MAX_LENGTH) throw StriException(MSG__BUF_SIZE_EXCEEDED); buf.resize(bufneed, false/*destroy contents*/); status = U_ZERO_ERROR; ucnv_resetFromUnicode(uconv_to); bufneed = ucnv_fromUChars(uconv_to, buf.data(), buf.size(), curs_tmp, curn_tmp, &status); STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) } if (to_raw_logical) { SEXP outobj; STRI__PROTECT(outobj = Rf_allocVector(RAWSXP, bufneed)); memcpy(RAW(outobj), buf.data(), (size_t)bufneed); SET_VECTOR_ELT(ret, i, outobj); STRI__UNPROTECT(1); } else { SET_STRING_ELT(ret, i, Rf_mkCharLenCE(buf.data(), bufneed, encmark_to)); } } STRI__UNPROTECT_ALL return ret; STRI__ERROR_HANDLER_END({/* nothing special on error */}) } /** * Convert character vector between given encodings * * @param str input character/raw vector or list of raw vectors * @param from source encoding, \code{NULL} or \code{""} for default enc * @param to target encoding, \code{NULL} or \code{""} for default enc * @param to_raw single logical, should list of raw vectors be returned? * @return a converted character vector or list of raw vectors * * @version 0.1-?? (Marek Gagolewski) * * @version 0.1-?? (Marek Gagolewski) * arg to_raw_added, encoding marking * * @version 0.1-?? (Marek Gagolewski, 2013-06-16) * make StriException-friendly * * @version 0.1-?? (Marek Gagolewski, 2013-08-08) * use StriContainerListRaw * * @version 0.1-?? 
(Marek Gagolewski, 2013-11-20) * BUGFIX call stri_encode_from_marked if necessary * * @version 0.2-1 (Marek Gagolewski, 2014-03-28) * use StriUcnv * * @version 0.2-1 (Marek Gagolewski, 2014-04-01) * estimate required buf size a priori * * @version 0.3-1 (Marek Gagolewski, 2014-11-04) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc */ SEXP stri_encode(SEXP str, SEXP from, SEXP to, SEXP to_raw) { const char* selected_from = stri__prepare_arg_enc(from, "from", true); /* this is R_alloc'ed */ if (!selected_from && Rf_isVectorAtomic(str) && !isRaw(str)) return stri_encode_from_marked(str, to, to_raw); const char* selected_to = stri__prepare_arg_enc(to, "to", true); /* this is R_alloc'ed */ bool to_raw_logical = stri__prepare_arg_logical_1_notNA(to_raw, "to_raw"); // raw vector, character vector, or list of raw vectors: PROTECT(str = stri__prepare_arg_list_raw(str, "str")); STRI__ERROR_HANDLER_BEGIN(1) StriContainerListRaw str_cont(str); R_len_t str_n = str_cont.get_n(); // get the number of strings to convert; if == 0, then you know what's the result if (str_n <= 0) { STRI__UNPROTECT_ALL return Rf_allocVector(to_raw_logical?VECSXP:STRSXP, 0); } // Open converters StriUcnv ucnv1(selected_from); StriUcnv ucnv2(selected_to); UConverter* uconv_from = ucnv1.getConverter(true /*register_callbacks*/); UConverter* uconv_to = ucnv2.getConverter(true /*register_callbacks*/); // Get target encoding mark cetype_t encmark_to = to_raw_logical?CE_BYTES:ucnv2.getCE(); SEXP ret; STRI__PROTECT(ret = Rf_allocVector(to_raw_logical?VECSXP:STRSXP, str_n)); // // estimate required buf size // size_t bufsize = 0; // for (R_len_t i=0; i bufsize) // bufsize = str_cont.get(i).length(); // } // bufsize = bufsize*4; // this is just an estimate (for 8bit->utf8 conversions) // String8buf buf(bufsize); String8buf buf(0); for (R_len_t i=0; i UTF-16 [this is the slow part] if (status == U_ILLEGAL_ARGUMENT_ERROR) throw StriException(MSG__MEM_ALLOC_ERROR); // see #395 
STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) R_len_t curn_tmp = encs.length(); const UChar* curs_tmp = encs.getBuffer(); // The buffer contents is (probably) not NUL-terminated. if (!curs_tmp) { throw StriException(MSG__INTERNAL_ERROR); } size_t bufneed = UCNV_GET_MAX_BYTES_FOR_STRING(curn_tmp, ucnv_getMaxCharSize(uconv_to)); // "The calculated size is guaranteed to be sufficient for this conversion." if (bufneed > BUF_MAX_LENGTH) bufneed = BUF_MAX_LENGTH; buf.resize(bufneed, false/*destroy contents*/); // grows or stays as-is status = U_ZERO_ERROR; ucnv_resetFromUnicode(uconv_to); bufneed = ucnv_fromUChars(uconv_to, buf.data(), buf.size(), curs_tmp, curn_tmp, &status); if (bufneed <= buf.size()) { STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) } else {// larger buffer needed if (bufneed > BUF_MAX_LENGTH) throw StriException(MSG__BUF_SIZE_EXCEEDED); buf.resize(bufneed, false/*destroy contents*/); status = U_ZERO_ERROR; ucnv_resetFromUnicode(uconv_to); bufneed = ucnv_fromUChars(uconv_to, buf.data(), buf.size(), curs_tmp, curn_tmp, &status); STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) } if (to_raw_logical) { SEXP outobj; STRI__PROTECT(outobj = Rf_allocVector(RAWSXP, bufneed)); memcpy(RAW(outobj), buf.data(), (size_t)bufneed); SET_VECTOR_ELT(ret, i, outobj); STRI__UNPROTECT(1); } else { SET_STRING_ELT(ret, i, Rf_mkCharLenCE(buf.data(), bufneed, encmark_to)); } } STRI__UNPROTECT_ALL return ret; STRI__ERROR_HANDLER_END({/* no special action on error */}) } stringi/src/stri_container_base.h0000644000176200001440000000711314770541312016714 0ustar liggesusers/* This file is part of the 'stringi' project. * Copyright (c) 2013-2025, Marek Gagolewski * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. 
Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef __stri_container_base_h #define __stri_container_base_h #include "stri_external.h" #include "stri_exception.h" /** * Base class for all StriContainers * * @version 0.1-?? (Marek Gagolewski) * * @version 0.1-?? (Marek Gagolewski) * removed ucnvNative, ucnvLatin1 (not needed per-object) * * @version 0.1-?? 
(Marek Gagolewski) * removed enc array * * @version 0.2-1 (Marek Gagolewski, 2014-03-22) * added sexp field */ class StriContainerBase { protected: R_len_t n; ///< number of strings (size of \code{str}) R_len_t nrecycle; ///< number of strings for the recycle rule (can be > \code{n}) SEXP sexp; ///< #ifndef NDEBUG bool isShallow; ///< have we made only shallow copy of the strings? (=> read only) #endif StriContainerBase(); // StriContainerBase(StriContainerBase& container); // use default (shallow copy) //~StriContainerBase(); // use default void init_Base(R_len_t n, R_len_t nrecycle, bool shallowrecycle, SEXP sexp=NULL); public: //StriContainerBase& operator=(StriContainerBase& container); // use default (shallow) inline R_len_t get_n() { return n; } inline R_len_t get_nrecycle() { return nrecycle; } inline void set_nrecycle(R_len_t nval) { nrecycle = nval; } /** Loop over vectorized container - init */ inline R_len_t vectorize_init() const { if (n <= 0) return nrecycle; else return 0; } /** Loop over vectorized container - end iterator */ inline R_len_t vectorize_end() const { return nrecycle; } /** Loop over vectorized container - next iteration */ inline R_len_t vectorize_next(R_len_t i) const { if (i == nrecycle - 1 - (nrecycle%n)) return nrecycle; // this is the end i = i + n; if (i >= nrecycle) return (i % n) + 1; else return i; } }; #endif stringi/src/stri_string8buf.h0000644000176200001440000001410714770541312016034 0ustar liggesusers/* This file is part of the 'stringi' project. * Copyright (c) 2013-2025, Marek Gagolewski * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. 
Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef __stri_string8buf_h #define __stri_string8buf_h #include "stri_stringi.h" #include /** * [DEPRECATED] A class to represent a temporary string buffer * * Quite similar to std::string and/or std::vector; * actually, we could/should now * TODO: rewrite most code to use the C++11 std::string. 
* * @version 0.2-1 (Marek Gagolewski, 2014-03-24) * Separated from String8 * * @version 0.5-1 (Marek Gagolewski, 2015-02-14) * Use malloc+realloc */ class String8buf { private: char* m_str; size_t m_size; ///< physical buffer size in bytes public: /** allocate string buffer * * @param size buffer length-1 */ String8buf(size_t size=0) { this->m_size = size+1; this->m_str = (char*)malloc(sizeof(char)*this->m_size); STRI_ASSERT(this->m_str); if (!this->m_str) throw StriException(MSG__MEM_ALLOC_ERROR_WITH_SIZE, sizeof(char)*this->m_size); this->m_str[0] = '\0'; } /** destructor */ ~String8buf() { if (this->m_str) { free(this->m_str); this->m_str = NULL; } } /** copy constructor */ String8buf(const String8buf& s) { this->m_size = s.m_size; this->m_str = (char*)malloc(sizeof(char)*this->m_size); STRI_ASSERT(this->m_str); if (!this->m_str) throw StriException(MSG__MEM_ALLOC_ERROR_WITH_SIZE, sizeof(char)*this->m_size); memcpy(this->m_str, s.m_str, (size_t)this->m_size); } /** copy */ String8buf& operator=(const String8buf& s) { if (this->m_str) free(this->m_str); this->m_size = s.m_size; this->m_str = (char*)malloc(sizeof(char)*this->m_size); STRI_ASSERT(this->m_str); if (!this->m_str) throw StriException(MSG__MEM_ALLOC_ERROR_WITH_SIZE, sizeof(char)*this->m_size); memcpy(this->m_str, s.m_str, (size_t)this->m_size); return *this; } /* return data */ inline char* data() { return this->m_str; } /** buffer size in bytes */ inline size_t size() const { return this->m_size; } /** increase buffer size; * * @param size new size-1 * @param copy should the existing buffer content be retained? 
*/ inline void resize(size_t size, bool copy=true) { if (this->m_size > size) return; // do nothing (the requested buffer size is available) char* old_str = this->m_str; this->m_size = size+1; this->m_str = (char*)realloc(this->m_str, sizeof(char)*this->m_size); STRI_ASSERT(this->m_str); if (!this->m_str) throw StriException(MSG__MEM_ALLOC_ERROR_WITH_SIZE, sizeof(char)*this->m_size); if (!old_str || !copy) { this->m_str[0] = '\0'; } } /** Replace substrings with a given replacement string * * TODO: How does this relate to String8::replaceAllAtPos? Is it redundant? * * @return number of bytes written * * @version 0.3-1 (Marek Gagolewski, 2014-11-02) */ size_t replaceAllAtPos(const char* str_cur_s, size_t str_cur_n, const char* replacement_cur_s, size_t replacement_cur_n, std::deque< std::pair >& occurrences) { size_t buf_used = 0; size_t jlast = 0; std::deque< std::pair >::iterator iter = occurrences.begin(); for (; iter != occurrences.end(); ++iter) { pair match = *iter; memcpy(m_str+buf_used, str_cur_s+jlast, (size_t)(match.first-jlast)); buf_used += match.first-jlast; #ifndef NDEBUG if (buf_used > m_size) throw StriException("!NDEBUG: String8buf::replaceAllAtPos: buf_used > buf_size"); #endif jlast = match.second; memcpy(m_str+buf_used, replacement_cur_s, (size_t)(replacement_cur_n)); buf_used += replacement_cur_n; #ifndef NDEBUG if (buf_used > m_size) throw StriException("!NDEBUG: String8buf::replaceAllAtPos: buf_used > buf_size"); #endif } memcpy(m_str+buf_used, str_cur_s+jlast, (size_t)(str_cur_n-jlast)); buf_used += (str_cur_n-jlast); #ifndef NDEBUG if (buf_used > m_size) throw StriException("!NDEBUG: String8buf::replaceAllAtPos: buf_used > buf_size"); #endif return buf_used; } }; #endif stringi/src/stri_search_coll_extract.cpp0000644000176200001440000002400114770541312020276 0ustar liggesusers/* This file is part of the 'stringi' project. * Copyright (c) 2013-2025, Marek Gagolewski * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "stri_stringi.h" #include "stri_container_utf16.h" #include "stri_container_usearch.h" #include #include using namespace std; /** * Extract first occurrence of a fixed pattern in each string [with collation] * * @param str character vector * @param pattern character vector * @param opts_collator list * @param firs logical - search for the first or the last occurrence? * @return character vector * * @version 0.1-?? 
(Marek Gagolewski, 2013-06-24) * * @version 0.2-3 (Marek Gagolewski, 2014-05-08) * new fun: stri__extract_firstlast_coll (opts_collator == NA not allowed) * * @version 0.3-1 (Marek Gagolewski, 2014-11-04) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc */ SEXP stri__extract_firstlast_coll(SEXP str, SEXP pattern, SEXP opts_collator, bool first) { // call stri__ucol_open after prepare_arg: // if prepare_arg had failed, we would have a mem leak UCollator* collator = NULL; collator = stri__ucol_open(opts_collator); PROTECT(str = stri__prepare_arg_string(str, "str")); PROTECT(pattern = stri__prepare_arg_string(pattern, "pattern")); STRI__ERROR_HANDLER_BEGIN(2) R_len_t vectorize_length = stri__recycling_rule(true, 2, LENGTH(str), LENGTH(pattern)); StriContainerUTF16 str_cont(str, vectorize_length, false); // writable StriContainerUStringSearch pattern_cont(pattern, vectorize_length, collator); // collator is not owned by pattern_cont SEXP ret; STRI__PROTECT(ret = Rf_allocVector(STRSXP, vectorize_length)); for (R_len_t i = pattern_cont.vectorize_init(); i != pattern_cont.vectorize_end(); i = pattern_cont.vectorize_next(i)) { STRI__CONTINUE_ON_EMPTY_OR_NA_STR_PATTERN(str_cont, pattern_cont, SET_STRING_ELT(ret, i, NA_STRING);, SET_STRING_ELT(ret, i, NA_STRING);) UStringSearch *matcher = pattern_cont.getMatcher(i, str_cont.get(i)); usearch_reset(matcher); int start; if (first) { UErrorCode status = U_ZERO_ERROR; start = (int)usearch_first(matcher, &status); STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) } else { UErrorCode status = U_ZERO_ERROR; start = (int)usearch_last(matcher, &status); STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) } if (start == USEARCH_DONE) { SET_STRING_ELT(ret, i, NA_STRING); continue; } str_cont.getWritable(i).setTo(str_cont.get(i), (int32_t) start, (int32_t) usearch_getMatchedLength(matcher)); // str[i] will be destroyed, but it's ok - it's a deep copy SET_STRING_ELT(ret, i, 
str_cont.toR(i)); } if (collator) { ucol_close(collator); collator=NULL; } STRI__UNPROTECT_ALL return ret; STRI__ERROR_HANDLER_END( if (collator) ucol_close(collator); ) } /** * Extract first occurrence of a fixed pattern in each string [with collation] * * @param str character vector * @param pattern character vector * @param opts_collator list * @return character vector * * @version 0.1-?? (Marek Gagolewski, 2013-06-24) * * @version 0.2-3 (Marek Gagolewski, 2014-05-08) * new fun: stri_extract_first_coll (opts_collator == NA not allowed) */ SEXP stri_extract_first_coll(SEXP str, SEXP pattern, SEXP opts_collator) { return stri__extract_firstlast_coll(str, pattern, opts_collator, true); } /** * Extract last occurrence of a fixed pattern in each string [with collation] * * @param str character vector * @param pattern character vector * @param opts_collator list * @return character vector * * @version 0.1-?? (Marek Gagolewski, 2013-06-24) * * @version 0.2-3 (Marek Gagolewski, 2014-05-08) * new fun: stri_extract_last_coll (opts_collator == NA not allowed) */ SEXP stri_extract_last_coll(SEXP str, SEXP pattern, SEXP opts_collator) { return stri__extract_firstlast_coll(str, pattern, opts_collator, false); } /** * Extract all occurrences of a fixed pattern in each string [with collation] * * @param str character vector * @param pattern character vector * @param opts_collator list * @param simplify single logical value * * @return list of character vectors or character matrix * * @version 0.1-?? 
(Marek Gagolewski, 2013-06-24) * * @version 0.2-3 (Marek Gagolewski, 2014-05-08) * new fun: stri_extract_all_coll (opts_collator == NA not allowed) * * @version 0.3-1 (Marek Gagolewski, 2014-10-24) * added simplify param * * @version 0.3-1 (Marek Gagolewski, 2014-11-04) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc * * @version 0.3-1 (Marek Gagolewski, 2014-11-04) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc * * @version 0.4-1 (Marek Gagolewski, 2014-11-27) * FR #117: omit_no_match arg added * * @version 0.4-1 (Marek Gagolewski, 2014-12-04) * allow `simplify=NA` */ SEXP stri_extract_all_coll(SEXP str, SEXP pattern, SEXP simplify, SEXP omit_no_match, SEXP opts_collator) { bool omit_no_match1 = stri__prepare_arg_logical_1_notNA(omit_no_match, "omit_no_match"); PROTECT(simplify = stri__prepare_arg_logical_1(simplify, "simplify")); PROTECT(str = stri__prepare_arg_string(str, "str")); PROTECT(pattern = stri__prepare_arg_string(pattern, "pattern")); // call stri__ucol_open after prepare_arg: // if prepare_arg had failed, we would have a mem leak UCollator* collator = NULL; collator = stri__ucol_open(opts_collator); STRI__ERROR_HANDLER_BEGIN(3) R_len_t vectorize_length = stri__recycling_rule(true, 2, LENGTH(str), LENGTH(pattern)); StriContainerUTF16 str_cont(str, vectorize_length); StriContainerUStringSearch pattern_cont(pattern, vectorize_length, collator); // collator is not owned by pattern_cont SEXP ret; STRI__PROTECT(ret = Rf_allocVector(VECSXP, vectorize_length)); for (R_len_t i = pattern_cont.vectorize_init(); i != pattern_cont.vectorize_end(); i = pattern_cont.vectorize_next(i)) { STRI__CONTINUE_ON_EMPTY_OR_NA_STR_PATTERN(str_cont, pattern_cont, SET_VECTOR_ELT(ret, i, stri__vector_NA_strings(1));, SET_VECTOR_ELT(ret, i, stri__vector_NA_strings(omit_no_match1?0:1));) UStringSearch *matcher = pattern_cont.getMatcher(i, str_cont.get(i)); usearch_reset(matcher); UErrorCode status = U_ZERO_ERROR; int start = 
(int)usearch_first(matcher, &status); STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) if (start == USEARCH_DONE) { SET_VECTOR_ELT(ret, i, stri__vector_NA_strings(omit_no_match1?0:1)); continue; } deque< pair > occurrences; while (start != USEARCH_DONE) { occurrences.push_back(pair(start, usearch_getMatchedLength(matcher))); start = usearch_next(matcher, &status); STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) } R_len_t noccurrences = (R_len_t)occurrences.size(); StriContainerUTF16 out_cont(noccurrences); deque< pair >::iterator iter = occurrences.begin(); for (R_len_t j = 0; iter != occurrences.end(); ++iter, ++j) { pair match = *iter; out_cont.getWritable(j).setTo(str_cont.get(i), match.first, match.second); } SET_VECTOR_ELT(ret, i, out_cont.toR()); } if (collator) { ucol_close(collator); collator=NULL; } if (LOGICAL(simplify)[0] == NA_LOGICAL || LOGICAL(simplify)[0]) { SEXP robj_TRUE, robj_zero, robj_na_strings, robj_empty_strings; STRI__PROTECT(robj_TRUE = Rf_ScalarLogical(TRUE)); STRI__PROTECT(robj_zero = Rf_ScalarInteger(0)); STRI__PROTECT(robj_na_strings = stri__vector_NA_strings(1)); STRI__PROTECT(robj_empty_strings = stri__vector_empty_strings(1)); STRI__PROTECT(ret = stri_list2matrix(ret, robj_TRUE, (LOGICAL(simplify)[0] == NA_LOGICAL)?robj_na_strings :robj_empty_strings, robj_zero)); } STRI__UNPROTECT_ALL return ret; STRI__ERROR_HANDLER_END( if (collator) ucol_close(collator); ) } stringi/src/stri_reverse.cpp0000644000176200001440000000773714770541312015762 0ustar liggesusers/* This file is part of the 'stringi' project. * Copyright (c) 2013-2025, Marek Gagolewski * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. 
Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "stri_stringi.h" #include "stri_container_utf8.h" #include "stri_string8buf.h" /** * Reverse Each String * @param str character vector * @return character vector with every string reversed * * * @version 0.1-?? (Bartek Tartanus) * * @version 0.1-?? (Marek Gagolewski) * use StriContainerUTF16 * * @version 0.1-?? 
(Marek Gagolewski, 2013-06-16) * make StriException-friendly + StriContainerUTF8 (bug fix, do reversing manually) * * @version 0.2-1 (Marek Gagolewski, 2014-04-01) * detect incorrect utf8 byte stream * * @version 0.3-1 (Marek Gagolewski, 2014-11-04) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc */ SEXP stri_reverse(SEXP str) { PROTECT(str = stri__prepare_arg_string(str, "str")); // prepare string argument STRI__ERROR_HANDLER_BEGIN(1) R_len_t str_len = LENGTH(str); StriContainerUTF8 str_cont(str, str_len); // writable, no recycle // STEP 1. // Calculate the required buffer length R_len_t bufsize = 0; for (R_len_t i=0; i bufsize) bufsize = cursize; } // STEP 2. // Alloc buffer & result vector String8buf buf(bufsize); SEXP ret; STRI__PROTECT(ret = Rf_allocVector(STRSXP, str_len)); for (R_len_t i = str_cont.vectorize_init(); i != str_cont.vectorize_end(); i = str_cont.vectorize_next(i)) { if (str_cont.isNA(i)) { SET_STRING_ELT(ret, i, NA_STRING); continue; } R_len_t str_cur_n = str_cont.get(i).length(); const char* str_cur_s = str_cont.get(i).c_str(); R_len_t j, k; UChar32 chr; UBool isError = FALSE; for (j=str_cur_n, k=0; !isError && j>0; ) { U8_PREV(str_cur_s, 0, j, chr); // go backwards if (chr < 0) { throw StriException(MSG__INVALID_UTF8); } U8_APPEND((uint8_t*)buf.data(), k, str_cur_n, chr, isError); } if (isError) throw StriException(MSG__INTERNAL_ERROR); SET_STRING_ELT(ret, i, Rf_mkCharLenCE(buf.data(), str_cur_n, CE_UTF8)); } STRI__UNPROTECT_ALL return ret; STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */) } stringi/src/stri_uloc.cpp0000644000176200001440000001031514770541312015233 0ustar liggesusers/* This file is part of the 'stringi' project. * Copyright (c) 2013-2025, Marek Gagolewski * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. 
Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "stri_stringi.h" #include /** * Sets current (default) ICU locale * * @param loc new locale (a single character string) * @return nothing (\code{R_NilValue}) * * @version 0.1-?? (Marek Gagolewski) */ SEXP stri_locale_set(SEXP loc) { const char* qloc = stri__prepare_arg_locale(loc, "locale", false, false); /* this is R_alloc'ed */ UErrorCode status = U_ZERO_ERROR; uloc_setDefault(qloc, &status); STRI__CHECKICUSTATUS_RFERROR(status, {/* do nothing special on err */}) // error() allowed here return R_NilValue; } /** Get list of available locales * * @return character vector * * @version 0.1-?? 
(Marek Gagolewski) */ SEXP stri_locale_list() { R_len_t c = (R_len_t)uloc_countAvailable(); SEXP ret; PROTECT(ret = Rf_allocVector(STRSXP, c)); for (R_len_t i=0; i * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "stri_stringi.h" #include "stri_container_utf8.h" #include "stri_container_bytesearch.h" #include "stri_string8buf.h" //#include "stri_interval.h" #include //#include //#include using namespace std; /** * Replace all/first/last occurrences of a fixed pattern * * @param str character vector * @param pattern character vector * @param replacement character vector * @return character vector * * @version 0.1-?? (Bartek Tartanus) * * @version 0.1-?? (Marek Gagolewski, 2013-06-26) * StriException friendly & Use StriContainers * * @version 0.1-?? (Marek Gagolewski, 2013-07-10) * BUGFIX: wrong behavior on empty str * * @version 0.2-3 (Marek Gagolewski, 2014-05-08) * stri_replace_fixed now uses byte search only * * @version 0.3-1 (Marek Gagolewski, 2014-11-02) * using String8buf::replaceAllAtPos, slightly faster * * @version 0.3-1 (Marek Gagolewski, 2014-11-05) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc * * @version 0.4-1 (Marek Gagolewski, 2014-12-07) * FR #110, #23: opts_fixed arg added * * @version 1.0-2 (Marek Gagolewski, 2016-01-30) * Issue #210: Allow NA replacement */ SEXP stri__replace_allfirstlast_fixed(SEXP str, SEXP pattern, SEXP replacement, SEXP opts_fixed, int type) { uint32_t pattern_flags = StriContainerByteSearch::getByteSearchFlags(opts_fixed); PROTECT(str = stri__prepare_arg_string(str, "str")); PROTECT(pattern = stri__prepare_arg_string(pattern, "pattern")); PROTECT(replacement = stri__prepare_arg_string(replacement, "replacement")); R_len_t vectorize_length = stri__recycling_rule(true, 3, LENGTH(str), LENGTH(pattern), LENGTH(replacement)); STRI__ERROR_HANDLER_BEGIN(3) StriContainerUTF8 str_cont(str, vectorize_length); StriContainerUTF8 replacement_cont(replacement, vectorize_length); StriContainerByteSearch pattern_cont(pattern, vectorize_length, pattern_flags); SEXP ret; STRI__PROTECT(ret = Rf_allocVector(STRSXP, vectorize_length)); String8buf buf(0); for (R_len_t i = pattern_cont.vectorize_init(); i != 
pattern_cont.vectorize_end(); i = pattern_cont.vectorize_next(i)) { STRI__CONTINUE_ON_EMPTY_OR_NA_STR_PATTERN(str_cont, pattern_cont, SET_STRING_ELT(ret, i, NA_STRING);, SET_STRING_ELT(ret, i, Rf_mkCharLenCE(NULL, 0, CE_UTF8));) StriByteSearchMatcher* matcher = pattern_cont.getMatcher(i); matcher->reset(str_cont.get(i).c_str(), str_cont.get(i).length()); R_len_t start; if (type >= 0) { // first or all start = matcher->findFirst(); } else { start = matcher->findLast(); } if (start == USEARCH_DONE) { SET_STRING_ELT(ret, i, str_cont.toR(i)); continue; } if (replacement_cont.isNA(i)) { SET_STRING_ELT(ret, i, NA_STRING); continue; } R_len_t len = matcher->getMatchedLength(); R_len_t sumbytes = len; deque< pair > occurrences; occurrences.push_back(pair(start, start+len)); if (type == 0) { while (USEARCH_DONE != matcher->findNext()) { // all start = matcher->getMatchedStart(); len = matcher->getMatchedLength(); occurrences.push_back(pair(start, start+len)); sumbytes += len; } } R_len_t str_cur_n = str_cont.get(i).length(); R_len_t replacement_cur_n = replacement_cont.get(i).length(); R_len_t buf_need = str_cur_n+replacement_cur_n*(R_len_t)occurrences.size()-sumbytes; buf.resize(buf_need, false/*destroy contents*/); R_len_t buf_used = buf.replaceAllAtPos(str_cont.get(i).c_str(), str_cur_n, replacement_cont.get(i).c_str(), replacement_cur_n, occurrences); #ifndef NDEBUG if (buf_need != buf_used) throw StriException("!NDEBUG: stri__replace_allfirstlast_fixed: (buf_need != buf_used)"); #endif SET_STRING_ELT(ret, i, Rf_mkCharLenCE(buf.data(), buf_used, CE_UTF8)); } STRI__UNPROTECT_ALL return ret; STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */) } // Version 2, 2014-11-02, using String8::replaceAllAtPos, slower //SEXP stri__replace_allfirstlast_fixed(SEXP str, SEXP pattern, SEXP replacement, int type) //{ // str = stri__prepare_arg_string(str, "str"); // pattern = stri__prepare_arg_string(pattern, "pattern"); // replacement = 
stri__prepare_arg_string(replacement, "replacement"); // R_len_t vectorize_length = stri__recycling_rule(true, 3, LENGTH(str), LENGTH(pattern), LENGTH(replacement)); // // STRI__ERROR_HANDLER_BEGIN // StriContainerUTF8 str_cont(str, vectorize_length, false); // writable); // StriContainerUTF8 replacement_cont(replacement, vectorize_length); // StriContainerByteSearch pattern_cont(pattern, vectorize_length); // // for (R_len_t i = pattern_cont.vectorize_init(); // i != pattern_cont.vectorize_end(); // i = pattern_cont.vectorize_next(i)) // { // STRI__CONTINUE_ON_EMPTY_OR_NA_STR_PATTERN(str_cont, pattern_cont, // str_cont.setNA(i), // {/* zero-length string, just continue */}) // // if (replacement_cont.isNA(i)) { // str_cont.setNA(i); // continue; // } // // R_len_t start; // if (type >= 0) { // first or all // pattern_cont.setupMatcherFwd(i, str_cont.get(i).c_str(), str_cont.get(i).length()); // start = pattern_cont.findFirst(); // } else { // pattern_cont.setupMatcherBack(i, str_cont.get(i).c_str(), str_cont.get(i).length()); // start = pattern_cont.findLast(); // } // // if (start == USEARCH_DONE) { // // nothing to do, no change, leave as-is // continue; // } // // R_len_t len = pattern_cont.getMatchedLength(); // R_len_t sumbytes = len; // deque< pair > occurrences; // occurrences.push_back(pair(start, start+len)); // // if (type == 0) { // while (USEARCH_DONE != pattern_cont.findNext()) { // all // start = pattern_cont.getMatchedStart(); // len = pattern_cont.getMatchedLength(); // occurrences.push_back(pair(start, start+len)); // sumbytes += len; // } // } // // R_len_t str_cur_n = str_cont.get(i).length(); // R_len_t replacement_cur_n = replacement_cont.get(i).length(); // R_len_t buf_need = // str_cur_n+replacement_cur_n*(R_len_t)occurrences.size()-sumbytes; // // str_cont.getWritable(i).replaceAllAtPos(buf_need, // replacement_cont.get(i).c_str(), replacement_cur_n, // occurrences); // } // // STRI__UNPROTECT_ALL // return str_cont.toR(); // 
STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */) //} /** * Replace all occurrences of a fixed pattern; vectorize_all=FALSE * * @param str character vector * @param pattern character vector * @param replacement character vector * @return character vector * * @version 0.3-1 (Marek Gagolewski, 2014-11-01) * * @version 0.3-1 (Marek Gagolewski, 2014-11-02) * Complete rewrite; faster * * @version 0.3-1 (Marek Gagolewski, 2014-11-05) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc * * @version 0.4-1 (Marek Gagolewski, 2014-12-07) * FR #110, #23: opts_fixed arg added * * @version 1.0-2 (Marek Gagolewski, 2016-01-30) * Issue #210: Allow NA replacement */ SEXP stri__replace_all_fixed_no_vectorize_all(SEXP str, SEXP pattern, SEXP replacement, SEXP opts_fixed) { // version gamma: PROTECT(str = stri__prepare_arg_string(str, "str")); // if str_n is 0, then return an empty vector R_len_t str_n = LENGTH(str); if (str_n <= 0) { UNPROTECT(1); return stri__vector_empty_strings(0); } PROTECT(pattern = stri__prepare_arg_string(pattern, "pattern")); PROTECT(replacement = stri__prepare_arg_string(replacement, "replacement")); R_len_t pattern_n = LENGTH(pattern); R_len_t replacement_n = LENGTH(replacement); if (pattern_n < replacement_n || pattern_n <= 0 || replacement_n <= 0) { UNPROTECT(3); Rf_error(MSG__WARN_RECYCLING_RULE2); } if (pattern_n % replacement_n != 0) Rf_warning(MSG__WARN_RECYCLING_RULE); if (pattern_n == 1) { // this will be much faster: SEXP ret; PROTECT(ret = stri__replace_allfirstlast_fixed(str, pattern, replacement, opts_fixed, 0)); UNPROTECT(4); return ret; } uint32_t pattern_flags = StriContainerByteSearch::getByteSearchFlags(opts_fixed); STRI__ERROR_HANDLER_BEGIN(3) StriContainerUTF8 str_cont(str, str_n, false); // writable StriContainerUTF8 replacement_cont(replacement, pattern_n); StriContainerByteSearch pattern_cont(pattern, pattern_n, pattern_flags); for (R_len_t i = 0; ireset(str_cont.get(j).c_str(), 
str_cont.get(j).length()); R_len_t start = matcher->findFirst(); if (start == USEARCH_DONE) continue; // nothing to do now if (replacement_cont.isNA(i)) { str_cont.setNA(j); continue; } R_len_t len = matcher->getMatchedLength(); R_len_t sumbytes = len; deque< pair > occurrences; occurrences.push_back(pair(start, start+len)); while (USEARCH_DONE != matcher->findNext()) { // all start = matcher->getMatchedStart(); len = matcher->getMatchedLength(); occurrences.push_back(pair(start, start+len)); sumbytes += len; } R_len_t str_cur_n = str_cont.get(j).length(); R_len_t replacement_cur_n = replacement_cont.get(i).length(); R_len_t buf_need = str_cur_n+replacement_cur_n*(R_len_t)occurrences.size()-sumbytes; str_cont.getWritable(j).replaceAllAtPos(buf_need, replacement_cont.get(i).c_str(), replacement_cur_n, occurrences); } } STRI__UNPROTECT_ALL return str_cont.toR(); STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */) } // stri__replace_all_fixed_no_vectorize_all //{ // version beta: for-loop like, 2014-11-01 // PROTECT(pattern = stri__prepare_arg_string(pattern, "pattern")); // PROTECT(replacement = stri__prepare_arg_string(replacement, "replacement")); // // R_len_t pattern_n = LENGTH(pattern); // R_len_t replacement_n = LENGTH(replacement); // if (pattern_n < replacement_n || pattern_n <= 0 || replacement_n <= 0) // Rf_error(MSG__WARN_RECYCLING_RULE2); // if (pattern_n % replacement_n != 0) // Rf_warning(MSG__WARN_RECYCLING_RULE); // // // no str_error_handlers needed here // SEXP pattern_cur, replacement_cur; // PROTECT(pattern_cur = Rf_allocVector(STRSXP, 1)); // PROTECT(replacement_cur = Rf_allocVector(STRSXP, 1)); // // PROTECT(str); // for (R_len_t i=0; i > > queues(str_n); // matches // // vector which_NA(str_n, false); // which str[i] will be NA // for (R_len_t j=0; j= 1 match // if (replacement_cont.isNA(i)) { // which_NA[j] = true; // this string will be missing in result // // it may have overlapping patterns BTW, but we won't check for that 
// continue; // the same pattern, next string // } // do { // queues[j].push_back(StriInterval(match_idx, match_idx+pattern_cont.getMatchedLength(), i)); // match_idx = pattern_cont.findNext(); // } // while (match_idx != USEARCH_DONE); // } // } // // // check if there are overlapping patterns, // // determine max buf size // R_len_t bufsize = 0; // for (R_len_t i=0; i >::iterator iter = queues[i].begin(); // // StriInterval last_int = *(iter++); // bufsize_cur = bufsize_cur - pattern_cont.get(last_int.data).length() // + replacement_cont.get(last_int.data).length(); // for (; iter != queues[i].end(); ++iter) { // StriInterval cur_int = *iter; // if (cur_int.a < last_int.b) // throw StriException(MSG__OVERLAPPING_PATTERN_UNSUPPORTED); // bufsize_cur = bufsize_cur - pattern_cont.get(cur_int.data).length() // + replacement_cont.get(cur_int.data).length(); // last_int = cur_int; // } // // if (bufsize < bufsize_cur) bufsize = bufsize_cur; // } // // // construct the resulting vector // SEXP ret; // STRI__PROTECT(ret = Rf_allocVector(STRSXP, str_n)); // String8buf buf(bufsize); // for (R_len_t i=0; i >::iterator iter = queues[i].begin(); // iter != queues[i].end(); ++iter) { // StriInterval cur_int = *iter; // memcpy(curbuf+bufused, str_cur_s+last_b, cur_int.a-last_b); // bufused += (cur_int.a-last_b); // memcpy(curbuf+bufused, replacement_cont.get(cur_int.data).c_str(), // replacement_cont.get(cur_int.data).length()); // bufused += replacement_cont.get(cur_int.data).length(); // last_b = cur_int.b; // } // // // the remainder // memcpy(curbuf+bufused, str_cur_s+last_b, str_cur_n-last_b); // bufused += (str_cur_n-last_b); // SET_STRING_ELT(ret, i, Rf_mkCharLenCE(buf.data(), bufused, CE_UTF8)); // } // // STRI__UNPROTECT_ALL // return ret; // STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */) //} /** * Replace all occurrences of a fixed pattern * * @param str character vector * @param pattern character vector * @param replacement character vector * 
@param vectorize_all single logical value * @return character vector * * @version 0.1-?? (Bartek Tartanus) * * @version 0.1-?? (Marek Gagolewski, 2013-06-26) * use stri__replace_allfirstlast_fixed * * @version 0.2-3 (Marek Gagolewski, 2014-05-08) * stri_replace_fixed now uses byte search only * * @version 0.3-1 (Marek Gagolewski, 2014-11-01) * vectorize_all argument added * * @version 0.4-1 (Marek Gagolewski, 2014-12-07) * FR #110, #23: opts_fixed arg added */ SEXP stri_replace_all_fixed(SEXP str, SEXP pattern, SEXP replacement, SEXP vectorize_all, SEXP opts_fixed) { if (stri__prepare_arg_logical_1_notNA(vectorize_all, "vectorize_all")) return stri__replace_allfirstlast_fixed(str, pattern, replacement, opts_fixed, 0); else return stri__replace_all_fixed_no_vectorize_all(str, pattern, replacement, opts_fixed); } /** * Replace last occurrence of a fixed pattern * * @param str character vector * @param pattern character vector * @param replacement character vector * @return character vector * * @version 0.1-?? (Bartek Tartanus) * * @version 0.1-?? (Marek Gagolewski, 2013-06-26) * use stri__replace_allfirstlast_fixed * * @version 0.2-3 (Marek Gagolewski, 2014-05-08) * stri_replace_fixed now uses byte search only * * @version 0.4-1 (Marek Gagolewski, 2014-12-07) * FR #110, #23: opts_fixed arg added */ SEXP stri_replace_last_fixed(SEXP str, SEXP pattern, SEXP replacement, SEXP opts_fixed) { return stri__replace_allfirstlast_fixed(str, pattern, replacement, opts_fixed, -1); } /** * Replace first occurrence of a fixed pattern * * @param str character vector * @param pattern character vector * @param replacement character vector * @return character vector * * @version 0.1-?? (Bartek Tartanus) * * @version 0.1-?? 
(Marek Gagolewski, 2013-06-26) * use stri__replace_allfirstlast_fixed * * @version 0.2-3 (Marek Gagolewski, 2014-05-08) * stri_replace_fixed now uses byte search only * * @version 0.4-1 (Marek Gagolewski, 2014-12-07) * FR #110, #23: opts_fixed arg added */ SEXP stri_replace_first_fixed(SEXP str, SEXP pattern, SEXP replacement, SEXP opts_fixed) { return stri__replace_allfirstlast_fixed(str, pattern, replacement, opts_fixed, 1); } stringi/src/stri_brkiter.cpp0000644000176200001440000003340214770541312015735 0ustar liggesusers/* This file is part of the 'stringi' project. * Copyright (c) 2013-2025, Marek Gagolewski * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "stri_stringi.h" #include "stri_brkiter.h" /** Select Break Iterator * * @param opts_brkiter named list * @param _default default break iterator type * @return break iterator ID * * @version 0.3-1 (Marek Gagolewski, 2014-10-29) * * @version 0.3-1 (Marek Gagolewski, 2014-10-30) * add param `_default` * * @version 0.3-1 (Marek Gagolewski, 2014-11-04) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc * * @version 0.4-1 (Marek Gagolewski, 2014-12-02) * moved to the StriBrkIterOptions class * * @version 1.1.6 (Marek Gagolewski, 2017-04-22) * Add support for RBBI */ void StriBrkIterOptions::setType(SEXP opts_brkiter, const char* _default) { const char* type_opts[] = {"character", "line_break", "sentence", "word", NULL}; int brkiter_cur = stri__match_arg(_default, type_opts); if (Rf_isNull(opts_brkiter)) { // use default settings } else if (Rf_isVectorList(opts_brkiter)) { R_len_t narg = LENGTH(opts_brkiter); SEXP names = Rf_getAttrib(opts_brkiter, R_NamesSymbol); if (names == R_NilValue || LENGTH(names) != narg) Rf_error(MSG__INCORRECT_BRKITER_OPTION_SPEC); // error() allowed here // search for "type" option for (R_len_t i=0; irules = UnicodeString::fromUTF8(curval3); brkiter_cur = stri__match_arg(curval3, type_opts); UNPROTECT(2); break; } } } else { Rf_error(MSG__INCORRECT_BRKITER_OPTION_SPEC); } switch (brkiter_cur) { case 0: // character this->type = UBRK_CHARACTER; this->rules = UnicodeString(); break; case 1: // 
line_break this->type = UBRK_LINE; this->rules = UnicodeString(); break; case 2: // sentence this->type = UBRK_SENTENCE; this->rules = UnicodeString(); break; case 3: // word this->type = UBRK_WORD; this->rules = UnicodeString(); break; default: // do nothing - custom rules specified break; } } /** Get Break Iterator's locale * * @param opts_brkiter named list * @return locale ID, R_alloc'ed * * @version 0.3-1 (Marek Gagolewski, 2014-10-29) * * @version 0.4-1 (Marek Gagolewski, 2014-12-02) * moved to the StriBrkIterOptions class */ void StriBrkIterOptions::setLocale(SEXP opts_brkiter) { if (Rf_isNull(opts_brkiter)) { // use default locale } else if (Rf_isVectorList(opts_brkiter)) { R_len_t narg = LENGTH(opts_brkiter); SEXP names = Rf_getAttrib(opts_brkiter, R_NamesSymbol); if (names == R_NilValue || LENGTH(names) != narg) Rf_error(MSG__INCORRECT_BRKITER_OPTION_SPEC); // error() allowed here // search for "locale" option for (R_len_t i=0; ilocale = stri__prepare_arg_locale(VECTOR_ELT(opts_brkiter, i), "locale"); // this is R_alloc'ed return; } } } else { Rf_error(MSG__INCORRECT_BRKITER_OPTION_SPEC); // error() allowed here } // otherwise return default locale this->locale = stri__prepare_arg_locale(R_NilValue, "locale"); /* this is R_alloc'ed */ } /** Get Break Iterator's skip rule status * * @param opts_brkiter named list * @return vector of indexes [even, odd) -- IDs to skip * * @version 0.3-1 (Marek Gagolewski, 2014-10-29) * * @version 0.4-1 (Marek Gagolewski, 2014-12-02) * moved to the StriBrkIterOptions class */ void StriBrkIterOptions::setSkipRuleStatus(SEXP opts_brkiter) { if (Rf_isNull(opts_brkiter)) { return; // leave as-is (empty) } R_len_t tmp_size = 0; int32_t tmp_rules[32]; if (!Rf_isVectorList(opts_brkiter)) Rf_error(MSG__INCORRECT_BRKITER_OPTION_SPEC); // error() allowed here R_len_t narg = LENGTH(opts_brkiter); SEXP names = Rf_getAttrib(opts_brkiter, R_NamesSymbol); if (names == R_NilValue || LENGTH(names) != narg) 
Rf_error(MSG__INCORRECT_BRKITER_OPTION_SPEC); // error() allowed here for (R_len_t i=0; isearchStr = _searchStr; this->searchLen = _searchLen; this->searchPos = BreakIterator::DONE; UErrorCode status = U_ZERO_ERROR; this->searchText = utext_openUTF8(this->searchText, _searchStr, _searchLen, &status); STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) status = U_ZERO_ERROR; this->rbiterator->setText(this->searchText, status); STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) } /** Should a boundary be ignored by a Break Iterator * * @param brkskip vector of indexes [even, odd) -- IDs to skip * @param rule current rule * @return logical value * * @version 0.3-1 (Marek Gagolewski, 2014-10-29) * * @version 0.4-1 (Marek Gagolews, 2014-12-03) * moved to StriRuleBasedBreakIterator */ bool StriRuleBasedBreakIterator::ignoreBoundary() { #ifndef NDEBUG if (!rbiterator || !searchText) throw StriException("!NDEBUG: StriRuleBasedBreakIterator::ignoreBoundary()"); #endif if (skip_size <= 0) return false; int rule = rbiterator->getRuleStatus(); /* this is ICU 52 */ for (int i=0; i= skip_rules[i] && rule < skip_rules[i+1]) return true; } return false; // don't ignore } /** * * @ version 0.4-1 (Marek Gagolewski, 2014-12-03) */ void StriRuleBasedBreakIterator::first() { #ifndef NDBEGUG if (!rbiterator) throw StriException("!NDEBUG: StriRuleBasedBreakIterator::first"); #endif this->searchPos = rbiterator->first(); // ICU man: "The offset of the beginning of the text, zero." 
#ifndef NDBEGUG if (this->searchPos != 0) throw StriException("!NDEBUG: StriRuleBasedBreakIterator::first"); #endif } /** * * @ version 0.4-1 (Marek Gagolewski, 2014-12-03) */ bool StriRuleBasedBreakIterator::next() { while ((this->searchPos = rbiterator->next()) != BreakIterator::DONE) { if (!ignoreBoundary()) return true; } return false; } /** * * @ version 0.4-1 (Marek Gagolewski, 2014-12-03) */ bool StriRuleBasedBreakIterator::next(std::pair& bdr) { R_len_t lastPos = searchPos; while ((searchPos = rbiterator->next()) != BreakIterator::DONE) { if (!ignoreBoundary()) { bdr.first = lastPos; bdr.second = searchPos; return true; } lastPos = searchPos; } return false; } /** * * @ version 0.4-1 (Marek Gagolewski, 2014-12-05) */ void StriRuleBasedBreakIterator::last() { #ifndef NDBEGUG if (!rbiterator) throw StriException("!NDEBUG: StriRuleBasedBreakIterator::last"); if (searchLen <= 0) throw StriException("!NDEBUG: StriRuleBasedBreakIterator::last"); #endif rbiterator->first(); this->searchPos = rbiterator->last(); // ICU man: "The text's past-the-end offset. " #ifndef NDBEGUG if (this->searchPos > this->searchLen) throw StriException("!NDEBUG: StriRuleBasedBreakIterator::last"); #endif } /** * * @ version 0.4-1 (Marek Gagolewski, 2014-12-05) */ bool StriRuleBasedBreakIterator::previous(std::pair& bdr) { do { if (!ignoreBoundary()) { bdr.second = searchPos; searchPos = rbiterator->previous(); if (searchPos == BreakIterator::DONE) return false; bdr.first = searchPos; return true; } searchPos = rbiterator->previous(); } while (searchPos != BreakIterator::DONE); return false; } stringi/src/stri_time_zone.cpp0000644000176200001440000002100314770541312016256 0ustar liggesusers/* This file is part of the 'stringi' project. * Copyright (c) 2013-2025, Marek Gagolewski * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. 
Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "stri_stringi.h" #include "stri_container_utf8.h" #include #include /** List available time zone IDs * * @param offset single numeric * @param region single string * @return character vector * * @version 0.5-1 (Marek Gagolewski, 2014-12-24) */ SEXP stri_timezone_list(SEXP region, SEXP offset) { StringEnumeration* tz_enum = NULL; PROTECT(region = stri__prepare_arg_string_1(region, "region")); PROTECT(offset = stri__prepare_arg_double_1(offset, "offset")); STRI__ERROR_HANDLER_BEGIN(2) StriContainerUTF8 region_cont(region, 1); UErrorCode status = U_ZERO_ERROR; int32_t offset_hours = 0; const int32_t* o = NULL; const char* r = NULL; if (!ISNA(REAL(offset)[0])) { // 0.5 and 0.75 are represented exactly within the double type offset_hours = (int32_t)(REAL(offset)[0]*1000.0*3600.0); o = &offset_hours; } if (!region_cont.isNA(0)) r = region_cont.get(0).c_str(); tz_enum = TimeZone::createTimeZoneIDEnumeration(UCAL_ZONE_TYPE_ANY, r, o, status); STRI__CHECKICUSTATUS_RFERROR(status, {/* do nothing special on err */}) status = U_ZERO_ERROR; tz_enum->reset(status); STRI__CHECKICUSTATUS_RFERROR(status, {/* do nothing special on err */}) status = U_ZERO_ERROR; R_len_t n = (R_len_t)tz_enum->count(status); STRI__CHECKICUSTATUS_RFERROR(status, {/* do nothing special on err */}) SEXP ret; STRI__PROTECT(ret = Rf_allocVector(STRSXP, n)); // SEXP nam; // STRI__PROTECT(nam = Rf_allocVector(STRSXP, n)); // MG: I reckon that IDs are more readable than DisplayNames (which are moreover localized) for (R_len_t i=0; inext(&len, status); STRI__CHECKICUSTATUS_RFERROR(status, {/* do nothing special on err */}) SET_STRING_ELT(ret, i, Rf_mkCharLenCE(cur, len, CE_UTF8)); // TimeZone* curtz = TimeZone::createTimeZone(UnicodeString::fromUTF8(cur)); // UnicodeString curdn; // curtz->getDisplayName(locale, curdn); // delete curtz; // string out; // curdn.toUTF8String(out); // SET_STRING_ELT(nam, i, Rf_mkCharCE(out.c_str(), CE_UTF8)); } // Rf_setAttrib(ret, R_NamesSymbol, nam); if (tz_enum) { 
delete tz_enum; tz_enum = NULL; } STRI__UNPROTECT_ALL return ret; STRI__ERROR_HANDLER_END( if (tz_enum) { delete tz_enum; tz_enum = NULL; } ) } ///** Get default time zone // * // * @return single string // * // * @version 0.5-1 (Marek Gagolewski, 2014-12-24) // */ //SEXP stri_timezone_get() { // TimeZone* curtz = stri__prepare_arg_timezone(R_NilValue, "tz", /*allowdefault*/true); // // UnicodeString id; // curtz->getID(id); // delete curtz; // // std::string id2; // id.toUTF8String(id2); // // return Rf_mkString(id2.c_str()); //} /** Set default time zone * * @param tz single string * @return nothing * * @version 0.5-1 (Marek Gagolewski, 2014-12-24) */ SEXP stri_timezone_set(SEXP tz) { TimeZone* curtz = stri__prepare_arg_timezone(tz, "tz", false/*allowdefault*/); /* This call adopts the TimeZone object passed in; the client is no longer responsible for deleting it. */ TimeZone::adoptDefault(curtz); return R_NilValue; } /** Get localised time zone info * * @param tz single string or NULL * @param locale single string or NULL * @param display_type single string * @return list * * @version 0.5-1 (Marek Gagolewski, 2014-12-24) * * @version 0.5-1 (Marek Gagolewski, 2015-03-01) * new out: WindowsID, NameDaylight, new in: display_type */ SEXP stri_timezone_info(SEXP tz, SEXP locale, SEXP display_type) { TimeZone* curtz = stri__prepare_arg_timezone(tz, "tz", R_NilValue); const char* qloc = stri__prepare_arg_locale(locale, "locale"); /* this is R_alloc'ed */ const char* dtype_str = stri__prepare_arg_string_1_notNA(display_type, "display_type"); /* this is R_alloc'ed */ const char* dtype_opts[] = { "short", "long", "generic_short", "generic_long", "gmt_short", "gmt_long", "common", "generic_location", NULL }; int dtype_cur = stri__match_arg(dtype_str, dtype_opts); TimeZone::EDisplayType dtype; switch (dtype_cur) { case 0: dtype = TimeZone::SHORT; break; case 1: dtype = TimeZone::LONG; break; case 2: dtype = TimeZone::SHORT_GENERIC; break; case 3: dtype = 
TimeZone::LONG_GENERIC; break; case 4: dtype = TimeZone::SHORT_GMT; break; case 5: dtype = TimeZone::LONG_GMT; break; case 6: dtype = TimeZone::SHORT_COMMONLY_USED; break; case 7: dtype = TimeZone::GENERIC_LOCATION; break; default: Rf_error(MSG__INCORRECT_MATCH_OPTION, "display_type"); break; } const R_len_t infosize = 6; SEXP vals; PROTECT(vals = Rf_allocVector(VECSXP, infosize)); for (int i=0; igetID(val_ID); SET_VECTOR_ELT(vals, curidx, stri__make_character_vector_UnicodeString_ptr(1, &val_ID)); ++curidx; UnicodeString val_name; curtz->getDisplayName(false, dtype, Locale::createFromName(qloc), val_name); SET_VECTOR_ELT(vals, curidx, stri__make_character_vector_UnicodeString_ptr(1, &val_name)); // TODO: U_USING_DEFAULT_WARNING when qloc!=0 // TODO: If the display name is not available for the locale, // then getDisplayName returns a string in the localised GMT offset format // such as GMT[+-]HH:mm. -- we can't check+warn if it is a valid locale // otherwise other than by comparing the output to this pattern ++curidx; if ((bool)curtz->useDaylightTime()) { UnicodeString val_name2; curtz->getDisplayName(true, dtype, Locale::createFromName(qloc), val_name2); SET_VECTOR_ELT(vals, curidx, stri__make_character_vector_UnicodeString_ptr(1, &val_name2)); } else SET_VECTOR_ELT(vals, curidx, Rf_ScalarString(NA_STRING)); ++curidx; UnicodeString val_windows; UErrorCode status = U_ZERO_ERROR; #if U_ICU_VERSION_MAJOR_NUM>=52 TimeZone::getWindowsID(val_ID, val_windows, status); // Stable since ICU 52 #endif if (U_SUCCESS(status) && val_windows.length() > 0) SET_VECTOR_ELT(vals, curidx, stri__make_character_vector_UnicodeString_ptr(1, &val_windows)); else SET_VECTOR_ELT(vals, curidx, Rf_ScalarString(NA_STRING)); ++curidx; SET_VECTOR_ELT(vals, curidx, Rf_ScalarReal(curtz->getRawOffset()/1000.0/3600.0)); ++curidx; SET_VECTOR_ELT(vals, curidx, Rf_ScalarLogical((bool)curtz->useDaylightTime())); delete curtz; stri__set_names(vals, infosize, "ID", "Name", "Name.Daylight", 
"Name.Windows", "RawOffset", "UsesDaylightTime"); UNPROTECT(1); return vals; } stringi/src/stri_search_coll_detect.cpp0000644000176200001440000001165214770541312020104 0ustar liggesusers/* This file is part of the 'stringi' project. * Copyright (c) 2013-2025, Marek Gagolewski * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "stri_stringi.h" #include "stri_container_utf16.h" #include "stri_container_usearch.h" #include /** * Detect if a pattern occurs in a string [with collation] * * @param str character vector * @param pattern character vector * @param negate single bool * @param max_count single int * @param opts_collator passed to stri__ucol_open(), * if \code{NA}, then \code{stri_detect_fixed_byte} is called * @return logical vector * * @version 0.1-?? (Marek Gagolewski) * * @version 0.1-?? (Marek Gagolewski) * corrected behavior on empty str/pattern * * @version 0.1-?? (Marek Gagolewski, 2013-06-22) * make StriException-friendly, use StriContainerUStringSearch * * @version 0.2-3 (Marek Gagolewski, 2014-05-08) * new fun: stri_detect_coll (opts_collator == NA not allowed) * * @version 0.3-1 (Marek Gagolewski, 2014-11-04) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc * * @version 1.0-3 (Marek Gagolewski, 2016-02-03) * FR #216: `negate` arg added * * @version 1.3.1 (Marek Gagolewski, 2019-02-08) * #232: `max_count` arg added */ SEXP stri_detect_coll(SEXP str, SEXP pattern, SEXP negate, SEXP max_count, SEXP opts_collator) { bool negate_1 = stri__prepare_arg_logical_1_notNA(negate, "negate"); int max_count_1 = stri__prepare_arg_integer_1_notNA(max_count, "max_count"); PROTECT(str = stri__prepare_arg_string(str, "str")); PROTECT(pattern = stri__prepare_arg_string(pattern, "pattern")); // call stri__ucol_open after prepare_arg: // if prepare_arg had failed, we would have a mem leak UCollator* collator = NULL; collator = stri__ucol_open(opts_collator); STRI__ERROR_HANDLER_BEGIN(2) R_len_t vectorize_length = stri__recycling_rule(true, 2, LENGTH(str), LENGTH(pattern)); StriContainerUTF16 str_cont(str, vectorize_length); StriContainerUStringSearch pattern_cont(pattern, vectorize_length, collator); // collator is not owned by pattern_cont SEXP ret; STRI__PROTECT(ret = Rf_allocVector(LGLSXP, vectorize_length)); int* ret_tab = LOGICAL(ret); for (R_len_t i = 
pattern_cont.vectorize_init(); i != pattern_cont.vectorize_end(); i = pattern_cont.vectorize_next(i)) { if (max_count_1 == 0) { ret_tab[i] = NA_LOGICAL; continue; } STRI__CONTINUE_ON_EMPTY_OR_NA_STR_PATTERN(str_cont, pattern_cont, ret_tab[i] = NA_LOGICAL, { ret_tab[i] = negate_1; if (max_count_1 > 0 && ret_tab[i]) --max_count_1; }) UErrorCode status; UStringSearch *matcher = pattern_cont.getMatcher(i, str_cont.get(i)); usearch_reset(matcher); status = U_ZERO_ERROR; ret_tab[i] = ((int)usearch_first(matcher, &status) != USEARCH_DONE); // this is slow! :-( //ret_tab[i] = ((int)usearch_search(matcher, 0, NULL, NULL, &status)); // this is slow! :-( if (negate_1) ret_tab[i] = !ret_tab[i]; if (max_count_1 > 0 && ret_tab[i]) --max_count_1; STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) } if (collator) { ucol_close(collator); collator=NULL; } STRI__UNPROTECT_ALL return ret; STRI__ERROR_HANDLER_END( if (collator) ucol_close(collator); ) } stringi/src/stri_search_fixed_startsendswith.cpp0000644000176200001440000001622714770541312022073 0ustar liggesusers/* This file is part of the 'stringi' project. * Copyright (c) 2013-2025, Marek Gagolewski * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "stri_stringi.h" #include "stri_container_utf8_indexable.h" #include "stri_container_bytesearch.h" #include "stri_container_integer.h" /** * Detect if a string starts with a pattern match * * @param str character vector * @param pattern character vector * @param from integer vector * @return logical vector * * @version 0.3-1 (Marek Gagolewski, 2014-06-03) * * @version 0.3-1 (Marek Gagolewski, 2014-11-05) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc * * @version 0.4-1 (Marek Gagolewski, 2014-12-07) * FR #110, #23: opts_fixed arg added; * use StriContainerByteSearch::startsWith() and endsWith() * * @version 0.5-1 (Marek Gagolewski, 2015-02-14) * use String8::startsWith() and endsWith() * * @version 1.4.7 (Marek Gagolewski, 2020-08-24) * #345: `negate` arg added */ SEXP stri_startswith_fixed(SEXP str, SEXP pattern, SEXP from, SEXP negate, SEXP opts_fixed) { bool negate_1 = stri__prepare_arg_logical_1_notNA(negate, "negate"); uint32_t pattern_flags = StriContainerByteSearch::getByteSearchFlags(opts_fixed); PROTECT(str = stri__prepare_arg_string(str, "str")); PROTECT(pattern = stri__prepare_arg_string(pattern, "pattern")); PROTECT(from = stri__prepare_arg_integer(from, 
"from")); STRI__ERROR_HANDLER_BEGIN(3) int vectorize_length = stri__recycling_rule(true, 3, LENGTH(str), LENGTH(pattern), LENGTH(from)); StriContainerUTF8_indexable str_cont(str, vectorize_length); StriContainerByteSearch pattern_cont(pattern, vectorize_length, pattern_flags); StriContainerInteger from_cont(from, vectorize_length); SEXP ret; STRI__PROTECT(ret = Rf_allocVector(LGLSXP, vectorize_length)); int* ret_tab = LOGICAL(ret); for (R_len_t i = pattern_cont.vectorize_init(); i != pattern_cont.vectorize_end(); i = pattern_cont.vectorize_next(i)) { STRI__CONTINUE_ON_EMPTY_OR_NA_STR_PATTERN(str_cont, pattern_cont, ret_tab[i] = NA_LOGICAL, ret_tab[i] = negate_1) if (from_cont.isNA(i)) { ret_tab[i] = NA_LOGICAL; continue; } R_len_t from_cur = from_cont.get(i); if (from_cur == 1) from_cur = 0; /* most commonly used case */ else if (from_cur >= 0) from_cur = str_cont.UChar32_to_UTF8_index_fwd(i, from_cur-1); else from_cur = str_cont.UChar32_to_UTF8_index_back(i, -from_cur); // now surely from_cur >= 0 && from_cur <= cur_n ret_tab[i] = (int)(str_cont.get(i).startsWith(from_cur, pattern_cont.get(i).c_str(), pattern_cont.get(i).length(), pattern_cont.isCaseInsensitive())); if (negate_1) ret_tab[i] = !ret_tab[i]; } STRI__UNPROTECT_ALL return ret; STRI__ERROR_HANDLER_END( ;/* do nothing special on error */ ) } /** * Detect if a string ends with a pattern match * * @param str character vector * @param pattern character vector * @param to integer vector * @return logical vector * * @version 0.3-1 (Marek Gagolewski, 2014-06-03) * * @version 0.3-1 (Marek Gagolewski, 2014-11-05) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc * * @version 0.4-1 (Marek Gagolewski, 2014-12-07) * FR #110, #23: opts_fixed arg added * * @version 0.5-1 (Marek Gagolewski, 2015-02-14) * use String8::startsWith() and endsWith() * * @version 1.4.7 (Marek Gagolewski, 2020-08-24) * #345: `negate` arg added */ SEXP stri_endswith_fixed(SEXP str, SEXP pattern, SEXP to, SEXP negate, SEXP 
opts_fixed) { bool negate_1 = stri__prepare_arg_logical_1_notNA(negate, "negate"); uint32_t pattern_flags = StriContainerByteSearch::getByteSearchFlags(opts_fixed); PROTECT(str = stri__prepare_arg_string(str, "str")); PROTECT(pattern = stri__prepare_arg_string(pattern, "pattern")); PROTECT(to = stri__prepare_arg_integer(to, "to")); STRI__ERROR_HANDLER_BEGIN(3) int vectorize_length = stri__recycling_rule(true, 3, LENGTH(str), LENGTH(pattern), LENGTH(to)); StriContainerUTF8_indexable str_cont(str, vectorize_length); StriContainerByteSearch pattern_cont(pattern, vectorize_length, pattern_flags); StriContainerInteger to_cont(to, vectorize_length); SEXP ret; STRI__PROTECT(ret = Rf_allocVector(LGLSXP, vectorize_length)); int* ret_tab = LOGICAL(ret); for (R_len_t i = pattern_cont.vectorize_init(); i != pattern_cont.vectorize_end(); i = pattern_cont.vectorize_next(i)) { STRI__CONTINUE_ON_EMPTY_OR_NA_STR_PATTERN(str_cont, pattern_cont, ret_tab[i] = NA_LOGICAL, ret_tab[i] = negate_1) if (to_cont.isNA(i)) { ret_tab[i] = NA_LOGICAL; continue; } R_len_t to_cur = to_cont.get(i); if (to_cur == -1) to_cur = str_cont.get(i).length(); /* most commonly used case */ else if (to_cur >= 0) to_cur = str_cont.UChar32_to_UTF8_index_fwd(i, to_cur); else to_cur = str_cont.UChar32_to_UTF8_index_back(i, -to_cur-1); // now surely to_cur >= 0 && to_cur <= cur_n ret_tab[i] = (int)(str_cont.get(i).endsWith(to_cur, pattern_cont.get(i).c_str(), pattern_cont.get(i).length(), pattern_cont.isCaseInsensitive())); if (negate_1) ret_tab[i] = !ret_tab[i]; } STRI__UNPROTECT_ALL return ret; STRI__ERROR_HANDLER_END( ;/* do nothing special on error */ ) } stringi/src/stri_ucnv.h0000644000176200001440000001424514770541602014721 0ustar liggesusers/* This file is part of the 'stringi' project. * Copyright (c) 2013-2025, Marek Gagolewski * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. 
Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #ifndef __stri_ucnv_h #define __stri_ucnv_h #include "stri_stringi.h" #include #include #include /** * A class to manage an encoding converter * * @version 0.2-1 (Marek Gagolewski, 2014-03-28) * * @version 1.0.6 (Marek Gagolewski, 2017-05-25) * #270: latin-1 is windows-1252 on Windows * * @version 1.7.5.9001 (Marek Gagolewski, 2021-11-27) * #467: R-win-ucrt not marking strings as latin1 # */ class StriUcnv { private: UConverter* m_ucnv; // converter const char* m_name; // encoding, owned by caller int m_isutf8; int m_is8bit; static void STRI__UCNV_FROM_U_CALLBACK_SUBSTITUTE_WARN ( const void* context, UConverterFromUnicodeArgs* fromArgs, const UChar* codeUnits, int32_t length, UChar32 codePoint, UConverterCallbackReason reason, UErrorCode* err); static void STRI__UCNV_TO_U_CALLBACK_SUBSTITUTE_WARN ( const void* context, UConverterToUnicodeArgs* toArgs, const char* codeUnits, int32_t length, UConverterCallbackReason reason, UErrorCode* err); void openConverter(bool register_callbacks); public: StriUcnv(const char* name=NULL) { m_name = name; m_ucnv = NULL; // lazy m_isutf8 = NA_LOGICAL; m_is8bit = NA_LOGICAL; } ~StriUcnv() { if (m_ucnv) ucnv_close(m_ucnv); m_ucnv = NULL; } StriUcnv(const StriUcnv& obj) { m_name = obj.m_name; m_ucnv = NULL; m_isutf8 = NA_LOGICAL; m_is8bit = NA_LOGICAL; } StriUcnv& operator=(const StriUcnv& obj) { this->~StriUcnv(); m_name = obj.m_name; m_ucnv = NULL; m_isutf8 = NA_LOGICAL; m_is8bit = NA_LOGICAL; return *this; } bool isUTF8() { if (m_isutf8 != NA_LOGICAL) return m_isutf8; openConverter(false); UErrorCode status = U_ZERO_ERROR; // get "official" encoder name const char* ucnv_name = ucnv_getName(m_ucnv, &status); STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) m_isutf8 = !strcmp(ucnv_name, "UTF-8"); return m_isutf8; } bool is8bit() { if (m_is8bit != NA_LOGICAL) return m_is8bit; openConverter(false); m_is8bit = (ucnv_getMaxCharSize(m_ucnv) == 1); return m_is8bit; } UConverter* getConverter(bool 
register_callbacks=false); bool hasASCIIsubset(); bool is1to1Unicode(); static vector getStandards(); static const char* getFriendlyName(const char* canname); // /** restores default ICU's substitute callbacks // */ // void setCallBackSubstitute() { // openConverter(); // // UErrorCode status = U_ZERO_ERROR; // ucnv_setFromUCallBack(m_ucnv, UCNV_FROM_U_CALLBACK_SUBSTITUTE, NULL, NULL, NULL, &status); // STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) // // status = U_ZERO_ERROR; // ucnv_setToUCallBack(m_ucnv, UCNV_TO_U_CALLBACK_SUBSTITUTE, NULL, NULL, NULL, &status); // STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) // } /** * get R's cetype_t corresponding to this converter */ cetype_t getCE() { openConverter(false); UErrorCode status = U_ZERO_ERROR; const char* ucnv_name = ucnv_getName(m_ucnv, &status); STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) if (!strcmp(ucnv_name, "US-ASCII")) { m_is8bit = true; m_isutf8 = true; return CE_UTF8; } else if (!strcmp(ucnv_name, "UTF-8")) { m_isutf8 = true; m_is8bit = false; return CE_UTF8; } #if defined(_WIN32) || defined(_WIN64) // #270: latin-1 is windows-1252 on Windows // #467: R-win-ucrt not marking strings as latin1 else if ( !strcmp(ucnv_name, "windows-1252") || !strcmp(ucnv_name, "ibm-5348_P100-1997") || !strcmp(ucnv_name, "ibm-1252_P100-2000") || !strcmp(ucnv_name, "ISO-8859-1") || !strcmp(ucnv_name, "latin1") ) { #else else if ( !strcmp(ucnv_name, "ISO-8859-1") || !strcmp(ucnv_name, "latin1") ) { #endif m_is8bit = true; m_isutf8 = false; return CE_LATIN1; } else if (!strcmp(ucnv_name, ucnv_getDefaultName())) return CE_NATIVE; return CE_BYTES; } }; #endif stringi/src/stri_utils.cpp0000644000176200001440000001175714770541312015444 0ustar liggesusers/* This file is part of the 'stringi' project. * Copyright (c) 2013-2025, Marek Gagolewski * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "stri_stringi.h" #include "stri_container_utf8.h" #include "stri_container_listutf8.h" /** * Convert list to a character matrix * * @param x a list * @param fill single string * @param byrow single logical value * @param n_min single integer * @return character matrix * * @version 0.3-1 (Marek Gagolewski, 2014-10-23) * * @version 0.4-1 (Marek Gagolewski, 2014-12-04) * new arg: n_min */ SEXP stri_list2matrix(SEXP x, SEXP byrow, SEXP fill, SEXP n_min) { bool byrow2 = stri__prepare_arg_logical_1_notNA(byrow, "byrow"); R_len_t n_min2 = stri__prepare_arg_integer_1_notNA(n_min, "n_min"); if (n_min2 < 0) Rf_error(MSG__INCORRECT_NAMED_ARG "; " MSG__EXPECTED_NONNEGATIVE, "n_min"); PROTECT(x = stri__prepare_arg_list_string(x, "x")); PROTECT(fill = stri__prepare_arg_string_1(fill, "fill")); // enc2utf8 called in R STRI__ERROR_HANDLER_BEGIN(2) R_len_t n = LENGTH(x); SEXP fill2 = STRING_ELT(fill, 0); R_len_t m = n_min2; // maximal vector length for (int i=0; i m) m = k; } // TODO: the following does not re-encode strings to UTF-8, // it merely emplaces them in a matrix as-is SEXP ret; if (!byrow2) { STRI__PROTECT(ret = Rf_allocMatrix(STRSXP, m, n)); int ret_idx = 0; for (int i=0; i * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "stri_stringi.h" #include "stri_container_utf8.h" #include "stri_container_integer.h" #include "stri_string8buf.h" #include #include "stri_container_charclass.h" /** Generate random permutations of code points in each string * * @param str character vector * @return character vector * * @version 0.2-1 (Marek Gagolewski, 2014-04-04) * * @version 0.3-1 (Marek Gagolewski, 2014-11-04) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc * * @version 1.2.5 (Marek Gagolewski, 2019-07-23) * #319: Fixed overflow in `stri_rand_shuffle()`. */ SEXP stri_rand_shuffle(SEXP str) { PROTECT(str = stri__prepare_arg_string(str, "str")); R_len_t n = LENGTH(str); GetRNGstate(); STRI__ERROR_HANDLER_BEGIN(1) StriContainerUTF8 str_cont(str, n); R_len_t bufsize = 0; for (R_len_t i=0; i bufsize) bufsize = ni; } std::vector buf1(bufsize); // at most bufsize UChars32 (bufsize/4 min.) 
String8buf buf2(bufsize); SEXP ret; STRI__PROTECT(ret = Rf_allocVector(STRSXP, n)); for (R_len_t i=0; i= 0 && j < sn) { U8_NEXT(s, j, sn, c); buf1[k++] = (int)c; } if (c < 0) { throw StriException(MSG__INVALID_UTF8); // Rf_warning(...); // SET_STRING_ELT(ret, i, NA_STRING); // continue; } // do shuffle buf1 at pos 0..k-1: (Fisher-Yates shuffle) R_len_t cur_n = k; for (j=0; j n_val || n_val % length_len != 0) Rf_warning(MSG__WARN_RECYCLING_RULE2); R_len_t pattern_len = LENGTH(pattern); if (pattern_len <= 0) { UNPROTECT(2); Rf_error(MSG__ARG_EXPECTED_NOT_EMPTY, "pattern"); } else if (pattern_len > n_val || n_val % pattern_len != 0) Rf_warning(MSG__WARN_RECYCLING_RULE2); GetRNGstate(); STRI__ERROR_HANDLER_BEGIN(2) StriContainerCharClass pattern_cont(pattern, max(n_val, pattern_len)); StriContainerInteger length_cont(length, max(n_val, length_len)); // get max required bufsize int* length_tab = INTEGER(length); size_t bufsize = 0; for (R_len_t i=0; i bufsize) bufsize = length_tab[i]; } bufsize *= 4; // 1 UChar32 -> max. 
4 UTF-8 bytes String8buf buf(bufsize); char* bufdata = buf.data(); SEXP ret; STRI__PROTECT(ret = Rf_allocVector(STRSXP, n_val)); for (R_len_t i=0; isize(); // generate string: size_t j = 0; UBool err = FALSE; for (R_len_t k=0; kcharAt(idx); if (c < 0) throw StriException(MSG__INTERNAL_ERROR); U8_APPEND((uint8_t*)bufdata, j, bufsize, c, err); if (err) throw StriException(MSG__INTERNAL_ERROR); } SET_STRING_ELT(ret, i, Rf_mkCharLenCE(bufdata, j, CE_UTF8)); } PutRNGstate(); STRI__UNPROTECT_ALL return ret; STRI__ERROR_HANDLER_END({ PutRNGstate(); }) } stringi/src/icu74_common_cpp.txt0000644000176200001440000001322314700200761016424 0ustar liggesusersicu74/common/appendable.cpp \ icu74/common/bmpset.cpp \ icu74/common/brkeng.cpp \ icu74/common/brkiter.cpp \ icu74/common/bytesinkutil.cpp \ icu74/common/bytestream.cpp \ icu74/common/bytestrie.cpp \ icu74/common/bytestriebuilder.cpp \ icu74/common/bytestrieiterator.cpp \ icu74/common/caniter.cpp \ icu74/common/characterproperties.cpp \ icu74/common/chariter.cpp \ icu74/common/charstr.cpp \ icu74/common/cmemory.cpp \ icu74/common/cstr.cpp \ icu74/common/cstring.cpp \ icu74/common/cwchar.cpp \ icu74/common/dictbe.cpp \ icu74/common/dictionarydata.cpp \ icu74/common/dtintrv.cpp \ icu74/common/edits.cpp \ icu74/common/emojiprops.cpp \ icu74/common/errorcode.cpp \ icu74/common/filteredbrk.cpp \ icu74/common/filterednormalizer2.cpp \ icu74/common/icudataver.cpp \ icu74/common/icuplug.cpp \ icu74/common/loadednormalizer2impl.cpp \ icu74/common/localebuilder.cpp \ icu74/common/localematcher.cpp \ icu74/common/localeprioritylist.cpp \ icu74/common/locavailable.cpp \ icu74/common/locbased.cpp \ icu74/common/locdispnames.cpp \ icu74/common/locdistance.cpp \ icu74/common/locdspnm.cpp \ icu74/common/locid.cpp \ icu74/common/loclikely.cpp \ icu74/common/loclikelysubtags.cpp \ icu74/common/locmap.cpp \ icu74/common/locresdata.cpp \ icu74/common/locutil.cpp \ icu74/common/lsr.cpp \ icu74/common/lstmbe.cpp \ icu74/common/messagepattern.cpp 
\ icu74/common/mlbe.cpp \ icu74/common/normalizer2.cpp \ icu74/common/normalizer2impl.cpp \ icu74/common/normlzr.cpp \ icu74/common/parsepos.cpp \ icu74/common/patternprops.cpp \ icu74/common/pluralmap.cpp \ icu74/common/propname.cpp \ icu74/common/propsvec.cpp \ icu74/common/punycode.cpp \ icu74/common/putil.cpp \ icu74/common/rbbi_cache.cpp \ icu74/common/rbbi.cpp \ icu74/common/rbbidata.cpp \ icu74/common/rbbinode.cpp \ icu74/common/rbbirb.cpp \ icu74/common/rbbiscan.cpp \ icu74/common/rbbisetb.cpp \ icu74/common/rbbistbl.cpp \ icu74/common/rbbitblb.cpp \ icu74/common/resbund_cnv.cpp \ icu74/common/resbund.cpp \ icu74/common/resource.cpp \ icu74/common/restrace.cpp \ icu74/common/ruleiter.cpp \ icu74/common/schriter.cpp \ icu74/common/serv.cpp \ icu74/common/servlk.cpp \ icu74/common/servlkf.cpp \ icu74/common/servls.cpp \ icu74/common/servnotf.cpp \ icu74/common/servrbf.cpp \ icu74/common/servslkf.cpp \ icu74/common/sharedobject.cpp \ icu74/common/simpleformatter.cpp \ icu74/common/static_unicode_sets.cpp \ icu74/common/stringpiece.cpp \ icu74/common/stringtriebuilder.cpp \ icu74/common/uarrsort.cpp \ icu74/common/ubidi_props.cpp \ icu74/common/ubidi.cpp \ icu74/common/ubidiln.cpp \ icu74/common/ubiditransform.cpp \ icu74/common/ubidiwrt.cpp \ icu74/common/ubrk.cpp \ icu74/common/ucase.cpp \ icu74/common/ucasemap_titlecase_brkiter.cpp \ icu74/common/ucasemap.cpp \ icu74/common/ucat.cpp \ icu74/common/uchar.cpp \ icu74/common/ucharstrie.cpp \ icu74/common/ucharstriebuilder.cpp \ icu74/common/ucharstrieiterator.cpp \ icu74/common/uchriter.cpp \ icu74/common/ucln_cmn.cpp \ icu74/common/ucmndata.cpp \ icu74/common/ucnv_bld.cpp \ icu74/common/ucnv_cb.cpp \ icu74/common/ucnv_cnv.cpp \ icu74/common/ucnv_ct.cpp \ icu74/common/ucnv_err.cpp \ icu74/common/ucnv_ext.cpp \ icu74/common/ucnv_io.cpp \ icu74/common/ucnv_lmb.cpp \ icu74/common/ucnv_set.cpp \ icu74/common/ucnv_u16.cpp \ icu74/common/ucnv_u32.cpp \ icu74/common/ucnv_u7.cpp \ icu74/common/ucnv_u8.cpp \ 
icu74/common/ucnv.cpp \ icu74/common/ucnv2022.cpp \ icu74/common/ucnvbocu.cpp \ icu74/common/ucnvdisp.cpp \ icu74/common/ucnvhz.cpp \ icu74/common/ucnvisci.cpp \ icu74/common/ucnvlat1.cpp \ icu74/common/ucnvmbcs.cpp \ icu74/common/ucnvscsu.cpp \ icu74/common/ucnvsel.cpp \ icu74/common/ucol_swp.cpp \ icu74/common/ucptrie.cpp \ icu74/common/ucurr.cpp \ icu74/common/udata.cpp \ icu74/common/udatamem.cpp \ icu74/common/udataswp.cpp \ icu74/common/uenum.cpp \ icu74/common/uhash_us.cpp \ icu74/common/uhash.cpp \ icu74/common/uidna.cpp \ icu74/common/uinit.cpp \ icu74/common/uinvchar.cpp \ icu74/common/uiter.cpp \ icu74/common/ulist.cpp \ icu74/common/uloc_keytype.cpp \ icu74/common/uloc_tag.cpp \ icu74/common/uloc.cpp \ icu74/common/ulocale.cpp \ icu74/common/ulocbuilder.cpp \ icu74/common/umapfile.cpp \ icu74/common/umath.cpp \ icu74/common/umutablecptrie.cpp \ icu74/common/umutex.cpp \ icu74/common/unames.cpp \ icu74/common/unifiedcache.cpp \ icu74/common/unifilt.cpp \ icu74/common/unifunct.cpp \ icu74/common/uniset_closure.cpp \ icu74/common/uniset_props.cpp \ icu74/common/uniset.cpp \ icu74/common/unisetspan.cpp \ icu74/common/unistr_case_locale.cpp \ icu74/common/unistr_case.cpp \ icu74/common/unistr_cnv.cpp \ icu74/common/unistr_props.cpp \ icu74/common/unistr_titlecase_brkiter.cpp \ icu74/common/unistr.cpp \ icu74/common/unorm.cpp \ icu74/common/unormcmp.cpp \ icu74/common/uobject.cpp \ icu74/common/uprops.cpp \ icu74/common/ures_cnv.cpp \ icu74/common/uresbund.cpp \ icu74/common/uresdata.cpp \ icu74/common/usc_impl.cpp \ icu74/common/uscript_props.cpp \ icu74/common/uscript.cpp \ icu74/common/uset_props.cpp \ icu74/common/uset.cpp \ icu74/common/usetiter.cpp \ icu74/common/ushape.cpp \ icu74/common/usprep.cpp \ icu74/common/ustack.cpp \ icu74/common/ustr_cnv.cpp \ icu74/common/ustr_titlecase_brkiter.cpp \ icu74/common/ustr_wcs.cpp \ icu74/common/ustrcase_locale.cpp \ icu74/common/ustrcase.cpp \ icu74/common/ustrenum.cpp \ icu74/common/ustrfmt.cpp \ 
icu74/common/ustring.cpp \ icu74/common/ustrtrns.cpp \ icu74/common/utext.cpp \ icu74/common/utf_impl.cpp \ icu74/common/util_props.cpp \ icu74/common/util.cpp \ icu74/common/utrace.cpp \ icu74/common/utrie_swap.cpp \ icu74/common/utrie.cpp \ icu74/common/utrie2_builder.cpp \ icu74/common/utrie2.cpp \ icu74/common/uts46.cpp \ icu74/common/utypes.cpp \ icu74/common/uvector.cpp \ icu74/common/uvectr32.cpp \ icu74/common/uvectr64.cpp \ icu74/common/wintz.cpp stringi/src/stri_escape.cpp0000644000176200001440000001565314770541312015543 0ustar liggesusers/* This file is part of the 'stringi' project. * Copyright (c) 2013-2025, Marek Gagolewski * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "stri_stringi.h" #include "stri_container_utf8.h" #include "stri_container_utf16.h" #define StriEscape_BUFSIZE 12 /** * Escape Unicode code points * * @param str character vector * @return character vector * * @version 0.1-?? (Marek Gagolewski, 2013-08-17) * * @version 0.2-1 (Marek Gagolewski, 2014-04-01) * fail on incorrect utf8 byte seqs; * * @version 0.3-1 (Marek Gagolewski, 2014-11-04) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc * * @version 1.1.6 (Steve Grubb, 2017-07-20) * if ((char)c >= 32 || (char)c <= 126) should be && */ SEXP stri_escape_unicode(SEXP str) { PROTECT(str = stri__prepare_arg_string(str, "str")); // prepare string argument STRI__ERROR_HANDLER_BEGIN(1) R_len_t str_length = LENGTH(str); StriContainerUTF8 str_cont(str, str_length); SEXP ret; STRI__PROTECT(ret = Rf_allocVector(STRSXP, str_length)); std::string out; // @TODO: estimate len a priori? 
for (R_len_t i = str_cont.vectorize_init(); i != str_cont.vectorize_end(); i = str_cont.vectorize_next(i)) { if (str_cont.isNA(i)) { SET_STRING_ELT(ret, i, NA_STRING); continue; } const char* str_cur_s = str_cont.get(i).c_str(); R_len_t str_cur_n = str_cont.get(i).length(); // estimate buf size R_len_t bufsize = 0; UChar32 c; R_len_t j = 0; while (j < str_cur_n) { U8_NEXT(str_cur_s, j, str_cur_n, c); if (c < 0) throw StriException(MSG__INVALID_UTF8); else if ((char)c >= 32 && (char)c <= 126) bufsize += 1; else if (c <= 0xff) bufsize += 6; // for \a, \n this will be overestimated else bufsize += 10; } out.clear(); if ((size_t)bufsize > (size_t)out.size()) out.reserve(bufsize); // do escape j = 0; char buf[StriEscape_BUFSIZE]; while (j < str_cur_n) { U8_NEXT(str_cur_s, j, str_cur_n, c); /* if (c < 0) throw StriException(MSG__INVALID_UTF8); // this has already been checked :) else */ if (c <= ASCII_MAXCHARCODE) { switch ((char)c) { case 0x07: out.append("\\a"); break; case 0x08: out.append("\\b"); break; case 0x09: out.append("\\t"); break; case 0x0a: out.append("\\n"); break; case 0x0b: out.append("\\v"); break; case 0x0c: out.append("\\f"); break; case 0x0d: out.append("\\r"); break; // case 0x1b: out.append("\\e"); break; // R doesn't know that case 0x22: out.append("\\\""); break; case 0x27: out.append("\\'"); break; case 0x5c: out.append("\\\\"); break; default: if ((char)c >= 32 && (char)c <= 126) // printable characters out.append(1, (char)c); else { snprintf(buf, StriEscape_BUFSIZE, "\\u%04x", (uint16_t)c); out.append(buf, 6); } } } else if (c <= 0xffff) { snprintf(buf, StriEscape_BUFSIZE, "\\u%04x", (uint16_t)c); out.append(buf, 6); } else { snprintf(buf, StriEscape_BUFSIZE, "\\U%08x", (uint32_t)c); out.append(buf, 10); } } SET_STRING_ELT(ret, i, Rf_mkCharLenCE(out.c_str(), (int)out.size(), (cetype_t)CE_UTF8) ); } STRI__UNPROTECT_ALL return ret; STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */) } /** * Unescape Unicode code points * * @param 
str character vector * @return character vector * * @version 0.1-?? (Marek Gagolewski, 2013-08-17) * * @version 0.3-1 (Marek Gagolewski, 2014-11-04) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc */ SEXP stri_unescape_unicode(SEXP str) { PROTECT(str = stri__prepare_arg_string(str, "str")); // prepare string argument STRI__ERROR_HANDLER_BEGIN(1) R_len_t str_length = LENGTH(str); StriContainerUTF16 str_cont(str, str_length, false); // writable for (R_len_t i = str_cont.vectorize_init(); i != str_cont.vectorize_end(); i = str_cont.vectorize_next(i)) { if (str_cont.isNA(i) || str_cont.get(i).length() == 0) continue; // leave as-is str_cont.getWritable(i).setTo(str_cont.get(i).unescape()); if (str_cont.get(i).length() == 0) { Rf_warning(MSG__INVALID_ESCAPE); str_cont.setNA(i); // something went wrong } } STRI__UNPROTECT_ALL return str_cont.toR(); STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */) } stringi/src/stri_ucnv.cpp0000644000176200001440000003406114770541566015263 0ustar liggesusers/* This file is part of the 'stringi' project. * Copyright (c) 2013-2025, Marek Gagolewski * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "stri_stringi.h" #include "stri_ucnv.h" /** * Opens (on demand) a desired converter * * The converter is opened if necessary. * @param register_callbacks * * @version 0.1-?? (Marek Gagolewski) * * @version 0.1-?? 
(Marek Gagolewski, 2013-08-10) * Use own error callbacks * * @version 0.2-1 (Marek Gagolewski, 2014-03-28) * moved to StriUcnv; * throws StriException instead of calling Rf_error * * @version 0.4-1 (Marek Gagolewski, 2014-12-01) * don't register callbacks by default */ void StriUcnv::openConverter(bool register_callbacks) { if (m_ucnv) return; UErrorCode status = U_ZERO_ERROR; m_ucnv = ucnv_open(m_name, &status); STRI__CHECKICUSTATUS_THROW(status, { m_ucnv = NULL; }) if (register_callbacks) { status = U_ZERO_ERROR; ucnv_setFromUCallBack((UConverter*)m_ucnv, (UConverterFromUCallback)STRI__UCNV_FROM_U_CALLBACK_SUBSTITUTE_WARN, (const void *)NULL, (UConverterFromUCallback *)NULL, (const void **)NULL, &status); STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) status = U_ZERO_ERROR; ucnv_setToUCallBack ((UConverter*)m_ucnv, (UConverterToUCallback)STRI__UCNV_TO_U_CALLBACK_SUBSTITUTE_WARN, (const void *)NULL, (UConverterToUCallback *)NULL, (const void **)NULL, &status); STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) } } /** Returns a desired converted * * @return UConverter * @param register_callbacks * * @version 0.2-1 (Marek Gagolewski) * * @version 0.4-1 (Marek Gagolewski, 2014-12-01) * don't register callbacks by default */ UConverter* StriUcnv::getConverter(bool register_callbacks) { openConverter(register_callbacks); #ifndef NDEBUG if (!m_ucnv) throw StriException("!NDEBUG: StriUcnv::getConverter()"); #endif return m_ucnv; } /** Own fallback function for ucnv conversion: substitute & warn * * * @param context The function currently recognizes the callback options: * UCNV_SUB_STOP_ON_ILLEGAL: STOPS at the ILLEGAL_SEQUENCE, * returning the error code back to the caller immediately. 
* NULL: Substitutes any ILLEGAL_SEQUENCE * @param toUArgs Information about the conversion in progress * @param codeUnits Points to 'length' bytes of the concerned codepage sequence * @param length Size (in bytes) of the concerned codepage sequence * @param reason Defines the reason the callback was invoked * @param err Return value will be set to success if the callback was handled, * otherwise this value will be set to a failure status. * * @version 0.1-?? (Marek Gagolewski, 2013-08-10) * * @version 0.2-1 (Marek Gagolewski, 2014-03-28) * moved to StriUcnv */ void StriUcnv::STRI__UCNV_TO_U_CALLBACK_SUBSTITUTE_WARN ( const void *context, UConverterToUnicodeArgs *toArgs, const char* codeUnits, int32_t length, UConverterCallbackReason reason, UErrorCode * err) { bool wasSubstitute = (reason <= UCNV_IRREGULAR && (context == NULL || (*((char*)context) == *UCNV_SUB_STOP_ON_ILLEGAL && reason == UCNV_UNASSIGNED))); // "DO NOT CALL THIS FUNCTION DIRECTLY!" :> UCNV_TO_U_CALLBACK_SUBSTITUTE(context, toArgs, codeUnits, length, reason, err); if (*err == U_ZERO_ERROR && wasSubstitute) { // substitute char was induced switch (length) { case 1: Rf_warning(MSG__UNCONVERTIBLE_BINARY_1, codeUnits[0]); break; case 2: Rf_warning(MSG__UNCONVERTIBLE_BINARY_2, codeUnits[0], codeUnits[1]); break; case 3: Rf_warning(MSG__UNCONVERTIBLE_BINARY_3, codeUnits[0], codeUnits[1], codeUnits[2]); break; case 4: Rf_warning(MSG__UNCONVERTIBLE_BINARY_4, codeUnits[0], codeUnits[1], codeUnits[2], codeUnits[3]); break; default: Rf_warning(MSG__UNCONVERTIBLE_BINARY_n); break; } } } /** Own fallback function for ucnv conversion: substitute & warn * * * @param context The function currently recognizes the callback options: * UCNV_SUB_STOP_ON_ILLEGAL: STOPS at the ILLEGAL_SEQUENCE, * returning the error code back to the caller immediately. 
* NULL: Substitutes any ILLEGAL_SEQUENCE * @param fromUArgs Information about the conversion in progress * @param codeUnits Points to 'length' UChars of the concerned Unicode sequence * @param length Size (in bytes) of the concerned codepage sequence * @param codePoint Single UChar32 (UTF-32) containing the concerend Unicode codepoint. * @param reason Defines the reason the callback was invoked * @param err Return value will be set to success if the callback was handled, * otherwise this value will be set to a failure status. * @see ucnv_setSubstChars * * @version 0.1-?? (Marek Gagolewski, 2013-08-10) * * @version 0.2-1 (Marek Gagolewski, 2014-03-28) * moved to StriUcnv */ void StriUcnv::STRI__UCNV_FROM_U_CALLBACK_SUBSTITUTE_WARN ( const void *context, UConverterFromUnicodeArgs *fromArgs, const UChar* codeUnits, int32_t length, UChar32 codePoint, UConverterCallbackReason reason, UErrorCode * err) { bool wasSubstitute = (reason <= UCNV_IRREGULAR && (context == NULL || (*((char*)context) == *UCNV_SUB_STOP_ON_ILLEGAL && reason == UCNV_UNASSIGNED))); // "DO NOT CALL THIS FUNCTION DIRECTLY!" :> UCNV_FROM_U_CALLBACK_SUBSTITUTE(context, fromArgs, codeUnits, length, codePoint, reason, err); if (*err == U_ZERO_ERROR && wasSubstitute) { // substitute char was induced Rf_warning(MSG__UNCONVERTIBLE_CODE_POINT, codePoint); } } /** * Get ICU ucnv standard names and their count * * @return vector of strings owned by ICU (don't delete them) * * @version 0.1-?? 
(Marek Gagolewski) * * @version 0.2-1 (Marek Gagolewski, 2014-03-28) * moved to StriUcnv; * don't use R_alloc; * return vector */ vector StriUcnv::getStandards() { UErrorCode status = U_ZERO_ERROR; R_len_t std_n = (R_len_t)ucnv_countStandards()-1; // -1 - this is not documented in ICU4C if (std_n <= 0) throw StriException(MSG__ENC_ERROR_SET); // error() allowed here vector standards(std_n); for (R_len_t i=0; i \\u%08x (encoding=%s)", (int)ascii_last[0], c, ucnv_getName(m_ucnv, &status)); #endif return false; } // character not convertible => ignore status = U_ZERO_ERROR; if (c != UCHAR_REPLACEMENT) { ucnv_fromUChars(m_ucnv, buf, buflen, (UChar*)&c, 1, &status); if (U_FAILURE(status)) { #ifndef NDEBUG Rf_warning("Cannot convert character 0x%02x (encoding=%s)", (int)(unsigned char)ascii_last[0], ucnv_getName(m_ucnv, &status)); #endif return false; } if (buf[1] != '\0' || buf[0] != ascii_last[0]) { #ifndef NDEBUG Rf_warning("Problematic character 0x%02x -> \\u%08x -> 0x%02x (encoding=%s)", (int)(unsigned char)ascii_last[0], c, (int)buf[0], ucnv_getName(m_ucnv, &status)); #endif return false; } } // @TODO: check tolower, toupper etc. (???) ascii_last = ascii1; } return true; } stringi/src/stri_container_utf16.h0000644000176200001440000001332414770541435016756 0ustar liggesusers/* This file is part of the 'stringi' project. * Copyright (c) 2013-2025, Marek Gagolewski * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. 
Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef __stri_container_utf16_h #define __stri_container_utf16_h #include "stri_container_base.h" #include /** * A class to handle conversion between R character vectors * and UTF-16 string vectors * * @version 0.1-?? (Marek Gagolewski) * * @version 0.1-?? (Marek Gagolewski) * lastMatcher cache, supports auto-vectorization * * @version 0.1-?? (Marek Gagolewski) * improved ASCII performance (separate ucnv) * * @version 0.1-?? 
(Marek Gagolewski) * now NAs are marked as NULLs in str * * @version 0.2-1 (Marek Gagolewski, 2014-03-15) * If native encoding is UTF-8, then encode with * UnicodeString::fromUTF8 (for speedup); * str now is UnicodeString*, and not UnicodeString**; * using UnicodeString::isBogus to represent NA */ class StriContainerUTF16 : public StriContainerBase { protected: UnicodeString* str; ///< data - \code{UnicodeString}s public: StriContainerUTF16(); StriContainerUTF16(R_len_t nrecycle); StriContainerUTF16(SEXP rstr, R_len_t nrecycle, bool shallowrecycle=true); StriContainerUTF16(StriContainerUTF16& container); ~StriContainerUTF16(); StriContainerUTF16& operator=(StriContainerUTF16& container); SEXP toR(R_len_t i) const; SEXP toR() const; /** check if the vectorized ith element is NA * @param i index * @return true if is NA */ inline bool isNA(R_len_t i) const { #ifndef NDEBUG if (!str) throw StriException("StriContainerUTF16::isNA(): !str"); if (i < 0 || i >= nrecycle) throw StriException("StriContainerUTF16::isNA(): INDEX OUT OF BOUNDS"); #endif return str[i%n].isBogus(); } /** get the vectorized ith element * @param i index * @return string */ inline const UnicodeString& get(R_len_t i) const { #ifndef NDEBUG if (isNA(i)) throw StriException("StriContainerUTF16::get(): isNA"); #endif return str[i%n]; } /** get the vectorized ith element * @param i index * @return string */ inline UnicodeString& getWritable(R_len_t i) { #ifndef NDEBUG if (isShallow) throw StriException("StriContainerUTF16::getWritable(): shallow StriContainerUTF16"); if (n != nrecycle) throw StriException("StriContainerUTF16::getWritable(): n!=nrecycle"); if (i < 0 || i >= n) throw StriException("StriContainerUTF16::getWritable(): INDEX OUT OF BOUNDS"); if (isNA(i)) throw StriException("StriContainerUTF16::getWritable(): isNA"); #endif return str[i%n]; // in fact, "%n" is not necessary } /** set NA * @param i index */ inline void setNA(R_len_t i) { #ifndef NDEBUG if (isShallow) throw 
StriException("StriContainerUTF16::getWritable(): shallow StriContainerUTF16"); if (n != nrecycle) throw StriException("StriContainerUTF16::getWritable(): n!=nrecycle"); if (i < 0 || i >= n) throw StriException("StriContainerUTF16::getWritable(): INDEX OUT OF BOUNDS"); #endif str[i%n].setToBogus(); } /** set the vectorized ith element * @param i index * @param s string to be copied */ inline void set(R_len_t i, const UnicodeString& s) { #ifndef NDEBUG if (isShallow) throw StriException("StriContainerUTF16::set(): shallow StriContainerUTF16"); if (n != nrecycle) throw StriException("StriContainerUTF16::set(): n!=nrecycle"); if (i < 0 || i >= n) throw StriException("StriContainerUTF16::set(): INDEX OUT OF BOUNDS"); if (str[i%n].isBogus()) throw StriException("StriContainerUTF16::set(): isNA"); #endif str[i%n].setTo(s); // in fact, "%n" is not necessary } // @QUESTION: separate StriContainerUTF16_indexable? void UChar16_to_UChar32_index(R_len_t i, int* i1, int* i2, const int ni, int adj1, int adj2); }; SEXP stri__subset_by_logical(const StriContainerUTF16& str_cont, const std::vector& which, int result_counter); #endif stringi/src/stri_external.h0000644000176200001440000000533014750110642015555 0ustar liggesusers/* This file is part of the 'stringi' project. * Copyright (c) 2013-2025, Marek Gagolewski * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. 
Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef __stri_external_h #define __stri_external_h #ifdef U_CHARSET_IS_UTF8 // do not enable this (must be unset before including ICU headers): #undef U_CHARSET_IS_UTF8 #define U_CHARSET_IS_UTF8 0 #endif //#ifndef NDEBUG //#define U_HIDE_DRAFT_API //#define U_HIDE_DEPRECATED_API //#endif #define UNISTR_FROM_CHAR_EXPLICIT explicit #define UNISTR_FROM_STRING_EXPLICIT explicit #include #include #include #include #include #include #include #include #include #include #include #include using namespace icu; // #define USE_RINTERNALS removed 2021-08-12 - do not use anymore // #define R_NO_REMAP removed 2021-08-12 - do not use anymore #include #include #include #include #include #include #if R_VERSION >= R_Version(3, 5, 0) #else /* ALTREP is R>=3.5.0 */ #define ALTREP(x) (0) #endif #endif stringi/src/stri_search_coll_split.cpp0000644000176200001440000002132214770541312017762 0ustar liggesusers/* This file is part of the 'stringi' project. * Copyright (c) 2013-2025, Marek Gagolewski * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "stri_stringi.h" #include "stri_container_utf16.h" #include "stri_container_usearch.h" #include "stri_container_integer.h" #include "stri_container_logical.h" #include #include using namespace std; /** * Split a string into parts [with collation] * * The pattern matches identify delimiters that separate the input into fields. * The input data between the matches becomes the fields themselves. 
* * @param str character vector * @param pattern character vector * @param n integer vector * @param omit_empty logical vector * @param opts_collator passed to stri__ucol_open(), * if \code{NA}, then \code{stri_detect_fixed_byte} is called * @param tokens_only single logical value * @param simplify single logical value * * @return list of character vectors or character matrix * * * @version 0.1-?? (Bartek Tartanus) * * @version 0.1-?? (Marek Gagolewski, 2013-06-25) * StriException friendly, use StriContainerUTF16 * * @version 0.1-?? (Marek Gagolewski, 2013-07-10) * BUGFIX: wrong behavior on empty str * * @version 0.2-3 (Marek Gagolewski, 2014-05-08) * new fun: stri_split_coll (opts_collator == NA not allowed) * * @version 0.3-1 (Marek Gagolewski, 2014-10-19) * added tokens_only param * * @version 0.3-1 (Marek Gagolewski, 2014-10-23) * added split param * * @version 0.3-1 (Marek Gagolewski, 2014-10-24) * allow omit_empty=NA * * @version 0.3-1 (Marek Gagolewski, 2014-11-04) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc * * @version 0.4-1 (Marek Gagolewski, 2014-12-04) * allow `simplify=NA`; FR #126: pass n to stri_list2matrix */ SEXP stri_split_coll(SEXP str, SEXP pattern, SEXP n, SEXP omit_empty, SEXP tokens_only, SEXP simplify, SEXP opts_collator) { bool tokens_only1 = stri__prepare_arg_logical_1_notNA(tokens_only, "tokens_only"); PROTECT(str = stri__prepare_arg_string(str, "str")); PROTECT(pattern = stri__prepare_arg_string(pattern, "pattern")); PROTECT(n = stri__prepare_arg_integer(n, "n")); PROTECT(omit_empty = stri__prepare_arg_logical(omit_empty, "omit_empty")); PROTECT(simplify = stri__prepare_arg_logical_1(simplify, "simplify")); UCollator* collator = NULL; collator = stri__ucol_open(opts_collator); STRI__ERROR_HANDLER_BEGIN(5) R_len_t vectorize_length = stri__recycling_rule(true, 4, LENGTH(str), LENGTH(pattern), LENGTH(n), LENGTH(omit_empty)); StriContainerUTF16 str_cont(str, vectorize_length); StriContainerUStringSearch 
pattern_cont(pattern, vectorize_length, collator); // collator is not owned by pattern_cont StriContainerInteger n_cont(n, vectorize_length); StriContainerLogical omit_empty_cont(omit_empty, vectorize_length); SEXP ret; STRI__PROTECT(ret = Rf_allocVector(VECSXP, vectorize_length)); for (R_len_t i = pattern_cont.vectorize_init(); i != pattern_cont.vectorize_end(); i = pattern_cont.vectorize_next(i)) { if (n_cont.isNA(i)) { SET_VECTOR_ELT(ret, i, stri__vector_NA_strings(1)); continue; } int n_cur = n_cont.get(i); int omit_empty_cur = !omit_empty_cont.isNA(i) && omit_empty_cont.get(i); STRI__CONTINUE_ON_EMPTY_OR_NA_STR_PATTERN(str_cont, pattern_cont, SET_VECTOR_ELT(ret, i, stri__vector_NA_strings(1));, SET_VECTOR_ELT(ret, i, (omit_empty_cont.isNA(i))?stri__vector_NA_strings(1): stri__vector_empty_strings((omit_empty_cur || n_cur == 0)?0:1));) UStringSearch *matcher = pattern_cont.getMatcher(i, str_cont.get(i)); usearch_reset(matcher); if (n_cur >= INT_MAX-1) throw StriException(MSG__INCORRECT_NAMED_ARG "; " MSG__EXPECTED_SMALLER, "n"); else if (n_cur < 0) n_cur = INT_MAX; else if (n_cur == 0) { SET_VECTOR_ELT(ret, i, Rf_allocVector(STRSXP, 0)); continue; } else if (tokens_only1) n_cur++; // we need to do one split ahead here R_len_t k; deque< pair > fields; // byte based-indices fields.push_back(pair(0,0)); UErrorCode status = U_ZERO_ERROR; for (k=1; k < n_cur && USEARCH_DONE != usearch_next(matcher, &status) && !U_FAILURE(status); ) { R_len_t s1 = (R_len_t)usearch_getMatchedStart(matcher); R_len_t s2 = (R_len_t)usearch_getMatchedLength(matcher) + s1; if (omit_empty_cur && fields.back().first == s1) fields.back().first = s2; // don't start any new field else { fields.back().second = s1; fields.push_back(pair(s2, s2)); // start a new field here ++k; // another field } } STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) fields.back().second = str_cont.get(i).length(); if (omit_empty_cur && fields.back().first == fields.back().second) 
fields.pop_back(); if (tokens_only1 && n_cur < INT_MAX) { n_cur--; // one split ahead could have been made, see above while (fields.size() > (size_t)n_cur) fields.pop_back(); // get rid of the remainder } R_len_t noccurrences = (R_len_t)fields.size(); StriContainerUTF16 out_cont(noccurrences); deque< pair >::iterator iter = fields.begin(); for (k = 0; iter != fields.end(); ++iter, ++k) { pair curoccur = *iter; if (curoccur.second == curoccur.first && omit_empty_cont.isNA(i)) out_cont.setNA(k); else out_cont.getWritable(k).setTo(str_cont.get(i), curoccur.first, curoccur.second-curoccur.first); } SET_VECTOR_ELT(ret, i, out_cont.toR()); } if (collator) { ucol_close(collator); collator=NULL; } if (LOGICAL(simplify)[0] == NA_LOGICAL || LOGICAL(simplify)[0]) { R_len_t n_min = 0; R_len_t n_length = LENGTH(n); int* n_tab = INTEGER(n); for (R_len_t i=0; i * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef __stri_stringi_h #define __stri_stringi_h #include "stri_external.h" #include "stri_messages.h" #include "stri_macros.h" #include "stri_exception.h" #include "stri_exports.h" // common.cpp void stri__set_names(SEXP object, R_len_t numnames, ...); SEXP stri__make_character_vector_char_ptr(R_len_t numnames, ...); SEXP stri__make_character_vector_UnicodeString_ptr(R_len_t numnames, ...); R_len_t stri__recycling_rule(bool enableWarning, int n, ...); SEXP stri__vector_NA_integers(R_len_t howmany); SEXP stri__vector_NA_strings(R_len_t howmany); SEXP stri__vector_empty_strings(R_len_t howmany); SEXP stri__emptyList(); SEXP stri__matrix_NA_INTEGER(R_len_t nrow, R_len_t ncol, int filler=NA_INTEGER); // TODO: other ones can be generalised too SEXP stri__matrix_NA_STRING(R_len_t nrow, R_len_t ncol); int stri__match_arg(const char* option, const char** set); // collator.cpp: struct UCollator; UCollator* stri__ucol_open(SEXP opts_collator); // length.cpp R_len_t stri__numbytes_max(SEXP str); int stri__width_char(UChar32 c); int stri__width_char_with_context(UChar32 c, UChar32 p, bool& reset); int stri__width_string(const char* s, int n, int max_width=NA_INTEGER); int stri__length_string(const char* s, int n, int max_length=NA_INTEGER); // prepare_arg.cpp: SEXP stri__prepare_arg_string_1(SEXP x, const char* argname); SEXP stri__prepare_arg_double_1(SEXP x, const char* argname, bool factors_as_strings=true); SEXP stri__prepare_arg_integer_1(SEXP x, 
const char* argname, bool factors_as_strings=true); SEXP stri__prepare_arg_logical_1(SEXP x, const char* argname); const char* stri__copy_string_Ralloc(SEXP, const char* argname); const char* stri__prepare_arg_string_1_notNA(SEXP x, const char* argname); double stri__prepare_arg_double_1_notNA(SEXP x, const char* argname); int stri__prepare_arg_integer_1_notNA(SEXP x, const char* argname); bool stri__prepare_arg_logical_1_notNA(SEXP x, const char* argname); const char* stri__prepare_arg_string_1_NA(SEXP x, const char* argname); double stri__prepare_arg_double_1_NA(SEXP x, const char* argname); int stri__prepare_arg_logical_1_NA(SEXP x, const char* argname); int stri__prepare_arg_integer_1_NA(SEXP x, const char* argname); bool stri__is_C_locale(const char* str); const char* stri__prepare_arg_locale( SEXP loc, const char* argname, bool allowdefault=true, bool allownull=true ); const char* stri__prepare_arg_enc( SEXP loc, const char* argname, bool allowdefault ); TimeZone* stri__prepare_arg_timezone(SEXP tz, const char* argname, bool allowdefault); SEXP stri__prepare_arg_list(SEXP x, const char* argname); SEXP stri__prepare_arg_list_string(SEXP x, const char* argname); SEXP stri__prepare_arg_list_integer(SEXP x, const char* argname); SEXP stri__prepare_arg_list_raw(SEXP x, const char* argname); SEXP stri__prepare_arg_string(SEXP x, const char* argname, bool allow_error=true); SEXP stri__prepare_arg_logical(SEXP x, const char* argname, bool allow_error=true); SEXP stri__prepare_arg_double(SEXP x, const char* argname, bool factors_as_strings=true, bool allow_error=true); SEXP stri__prepare_arg_integer(SEXP x, const char* argname, bool factors_as_strings=true, bool allow_error=true); SEXP stri__prepare_arg_raw(SEXP x, const char* argname, bool factors_as_strings=true, bool allow_error=true); SEXP stri__prepare_arg_POSIXct(SEXP x, const char* argname); // search void stri__locate_set_dimnames_list( SEXP list, bool get_length=false ); void stri__locate_set_dimnames_matrix( 
SEXP matrix, bool get_length=false ); // date/time void stri__set_class_POSIXct(SEXP x); Calendar* stri__get_calendar(const char* locale_val); // ------------------------------------------------------------------------ #endif stringi/src/stri_search_class_split.cpp0000644000176200001440000002032414770541312020137 0ustar liggesusers/* This file is part of the 'stringi' project. * Copyright (c) 2013-2025, Marek Gagolewski * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "stri_stringi.h" #include "stri_container_utf8.h" #include "stri_container_charclass.h" #include "stri_container_integer.h" #include "stri_container_logical.h" #include #include using namespace std; /** * Split a string by occurrences of a character class * * @param str character vector * @param pattern character vector * @param n integer vector * @param omit_empty logical vector * @param tokens_only single logical value * @param simplify single logical value * * @return a list of character vectors or character matrix * * @version 0.1-?? (Marek Gagolewski, 2013-06-14) * * @version 0.1-?? (Marek Gagolewski, 2013-06-15) * omit_empty, use StriContainerInteger, StriContainerLogical, * and StriContainerCharClass * * @version 0.1-?? (Marek Gagolewski, 2013-06-16) * make StriException-friendly * * @version 0.2-1 (Marek Gagolewski, 2014-04-03) * detects invalid UTF-8 byte stream * * @version 0.2-1 (Marek Gagolewski, 2014-04-05) * StriContainerCharClass now relies on UnicodeSet * * @version 0.3-1 (Marek Gagolewski, 2014-10-19) * added tokens_only param * * @version 0.3-1 (Marek Gagolewski, 2014-10-23) * added split param * * @version 0.3-1 (Marek Gagolewski, 2014-10-24) * allow omit_empty=NA * * @version 0.3-1 (Marek Gagolewski, 2014-11-04) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc * * @version 0.4-1 (Marek Gagolewski, 2014-12-04) * allow `simplify=NA`; FR #126: pass n to stri_list2matrix */ SEXP stri_split_charclass(SEXP str, SEXP pattern, SEXP n, SEXP omit_empty, SEXP tokens_only, SEXP simplify) { bool tokens_only1 = stri__prepare_arg_logical_1_notNA(tokens_only, "tokens_only"); PROTECT(str = stri__prepare_arg_string(str, "str")); PROTECT(pattern = stri__prepare_arg_string(pattern, "pattern")); PROTECT(n = stri__prepare_arg_integer(n, "n")); PROTECT(omit_empty = stri__prepare_arg_logical(omit_empty, "omit_empty")); PROTECT(simplify = stri__prepare_arg_logical_1(simplify, "simplify")); R_len_t vectorize_length = 
stri__recycling_rule(true, 4, LENGTH(str), LENGTH(pattern), LENGTH(n), LENGTH(omit_empty)); STRI__ERROR_HANDLER_BEGIN(5) StriContainerUTF8 str_cont(str, vectorize_length); StriContainerInteger n_cont(n, vectorize_length); StriContainerLogical omit_empty_cont(omit_empty, vectorize_length); StriContainerCharClass pattern_cont(pattern, vectorize_length); SEXP ret; STRI__PROTECT(ret = Rf_allocVector(VECSXP, vectorize_length)); for (R_len_t i = pattern_cont.vectorize_init(); i != pattern_cont.vectorize_end(); i = pattern_cont.vectorize_next(i)) { if (str_cont.isNA(i) || pattern_cont.isNA(i) || n_cont.isNA(i)) { SET_VECTOR_ELT(ret, i, stri__vector_NA_strings(1)); continue; } const UnicodeSet* pattern_cur = &pattern_cont.get(i); int n_cur = n_cont.get(i); int omit_empty_cur = !omit_empty_cont.isNA(i) && omit_empty_cont.get(i); if (n_cur >= INT_MAX-1) throw StriException(MSG__INCORRECT_NAMED_ARG "; " MSG__EXPECTED_SMALLER, "n"); else if (n_cur < 0) n_cur = INT_MAX; else if (n_cur == 0) { SET_VECTOR_ELT(ret, i, Rf_allocVector(STRSXP, 0)); continue; } else if (tokens_only1) n_cur++; // we need to do one split ahead here R_len_t str_cur_n = str_cont.get(i).length(); const char* str_cur_s = str_cont.get(i).c_str(); R_len_t j, k; UChar32 chr; deque< pair > fields; // byte based-indices fields.push_back(pair(0,0)); for (j=0, k=1; jcontains(chr)) { if (omit_empty_cur && fields.back().second == fields.back().first) fields.back().first = fields.back().second = j; // don't start any new field else { fields.push_back(pair(j, j)); // start a new field here ++k; // another field } } else { fields.back().second = j; } } if (k == n_cur) fields.back().second = str_cur_n; if (omit_empty_cur && fields.back().first == fields.back().second) fields.pop_back(); if (tokens_only1 && n_cur < INT_MAX) { n_cur--; // one split ahead could have been made, see above while (fields.size() > (size_t)n_cur) fields.pop_back(); // get rid of the remainder } SEXP ans; STRI__PROTECT(ans = 
Rf_allocVector(STRSXP, fields.size())); deque< pair >::iterator iter = fields.begin(); for (k = 0; iter != fields.end(); ++iter, ++k) { pair curoccur = *iter; if (curoccur.second == curoccur.first && omit_empty_cont.isNA(i)) SET_STRING_ELT(ans, k, NA_STRING); else SET_STRING_ELT(ans, k, Rf_mkCharLenCE(str_cur_s+curoccur.first, curoccur.second-curoccur.first, CE_UTF8)); } SET_VECTOR_ELT(ret, i, ans); STRI__UNPROTECT(1) } if (LOGICAL(simplify)[0] == NA_LOGICAL || LOGICAL(simplify)[0]) { R_len_t n_min = 0; R_len_t n_length = LENGTH(n); int* n_tab = INTEGER(n); for (R_len_t i=0; i * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "stri_stringi.h" #include "stri_container_utf8_indexable.h" #include #include #include #include #include /** Greedy word wrap algorithm * * @param wrap_after [out] * @param nwords number of "words" * @param width_val maximal desired out line width * @param widths_orig ith word width original * @param widths_trim ith word width trimmed * @param add_para_1 * @param add_para_n * * @version 0.1-?? (Bartek Tartanus) * original implementation * * @version 0.2-2 (Marek Gagolewski, 2014-04-28) * BreakIterator usage mods * * @version 0.4-1 (Marek Gagolewski, 2014-12-06) * new args: add_para_1, add_para_n */ void stri__wrap_greedy(std::deque& wrap_after, R_len_t nwords, int width_val, const std::vector& widths_orig, const std::vector& widths_trim, int add_para_1, int add_para_n) { R_len_t cur_len = add_para_1+widths_orig[0]; for (R_len_t j = 1; j < nwords; ++j) { if (cur_len + widths_trim[j] > width_val) { cur_len = add_para_n+widths_orig[j]; wrap_after.push_back(j-1); } else { cur_len += widths_orig[j]; } } } /** Dynamic word wrap algorithm * (Knuth's word wrapping algorithm that minimizes raggedness of formatted text) * * @param wrap_after [out] * @param nwords number of "words" * @param width_val maximal desired out line width * @param exponent_val cost function exponent * @param widths_orig ith word width original * @param widths_trim ith word width trimmed * @param add_para_1 * @param add_para_a * * @version 0.1-?? 
(Bartek Tartanus) * original implementation * * @version 0.2-2 (Marek Gagolewski, 2014-04-30) * BreakIterator usage mods * * @version 0.4-1 (Marek Gagolewski, 2014-12-06) * new args: add_para_1, add_para_n, * cost of the last line is zero */ void stri__wrap_dynamic(std::deque& wrap_after, R_len_t nwords, int width_val, double exponent_val, const std::vector& widths_orig, const std::vector& widths_trim, int add_para_1, int add_para_n) { #define IDX(i,j) (i)*nwords+(j) vector cost(nwords*nwords); // where cost[IDX(i,j)] == cost of printing words i..j in a single line, i<=j // calculate costs: // there is some "punishment" for leaving blanks at the end of each line // (number of "blank" codepoints ^ exponent_val) for (int i=0; i i) { if (cost[IDX(i,j-1)] < 0.0) { // already Inf cost[IDX(i,j)] = -1.0; // Inf continue; } else { sum -= widths_trim[j-1]; sum += widths_orig[j-1]; } } sum += widths_trim[j]; int ct = width_val - sum; if (i == 0) ct -= add_para_1; else ct -= add_para_n; if (j == nwords-1) { // last line == cost 0 if (j == i || ct >= 0) cost[IDX(i,j)] = 0.0; else cost[IDX(i,j)] = -1.0/*Inf*/; } else if (j == i) // some words don't fit in a line at all -> cost 0.0 cost[IDX(i,j)] = (ct < 0) ? 0.0 : pow((double)ct, exponent_val); else cost[IDX(i,j)] = (ct < 0) ? 
-1.0/*"Inf"*/ : pow((double)ct, exponent_val); } } vector f(nwords); // f[j] == total cost of (optimally) printing words 0..j vector where(nwords*nwords, false); // where[IDX(i,j)] == false iff // we don't wrap after i-th word, i<=j // when (optimally) printing words 0..j for (int j=0; j= 0.0) { // no breaking needed: words 0..j fit in one line f[j] = cost[IDX(0,j)]; continue; } // let i = optimal way of printing of words 0..i + printing i+1..j int i = 0; while (i <= j) { if (cost[IDX(i+1,j)] >= 0.0) break; ++i; } double best_i = f[i] + cost[IDX(i+1,j)]; for (int k=i+1; kgetLocaleID(ULOC_VALID_LOCALE, status2); if (valid_locale && !strcmp(valid_locale, "root")) Rf_warning("%s", ICUError::getICUerrorName(status)); } R_len_t str_length = LENGTH(str); StriContainerUTF8_indexable str_cont(str, str_length); StriContainerUTF8 prefix_cont(prefix, 1); StriContainerUTF8 initial_cont(initial, 1); // prepare indent/exdent/prefix/initial stuff: // 1st line, 1st para (i==0, u==0): initial+indent // nth line, 1st para (i==0, u> 0): prefix +exdent // 1st line, nth para (i> 0, u==0): prefix +indent // nth line, nth para (i> 0, u> 0): prefix +exdent StriWrapLineStart ii(initial_cont.get(0), indent_val); StriWrapLineStart pi(prefix_cont.get(0), indent_val); StriWrapLineStart pe(prefix_cont.get(0), exdent_val); status = U_ZERO_ERROR; //Unicode Newline Guidelines - Unicode Technical Report #13 UnicodeSet uset_linebreaks(UnicodeString::fromUTF8("[\\u000A-\\u000D\\u0085\\u2028\\u2029]"), status); STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) uset_linebreaks.freeze(); status = U_ZERO_ERROR; UnicodeSet uset_whitespaces(UnicodeString::fromUTF8("\\p{White_space}"), status); STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) uset_whitespaces.freeze(); SEXP ret; STRI__PROTECT(ret = Rf_allocVector(VECSXP, str_length)); for (R_len_t i = 0; i < str_length; ++i) { if (str_cont.isNA(i) || prefix_cont.isNA(0) || initial_cont.isNA(0)) { SET_VECTOR_ELT(ret, 
i, stri__vector_NA_strings(1)); continue; } status = U_ZERO_ERROR; const char* str_cur_s = str_cont.get(i).c_str(); R_len_t str_cur_n = str_cont.get(i).length(); str_text = utext_openUTF8(str_text, str_cur_s, str_cont.get(i).length(), &status); STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) status = U_ZERO_ERROR; briter->setText(str_text, status); STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) // first generate a list of positions of line breaks deque< R_len_t > occurrences_list; // this could be an R_len_t queue R_len_t match = briter->first(); while (match != BreakIterator::DONE) { if (!whitespace_only_val) occurrences_list.push_back(match); else { if (match > 0 && match < str_cur_n) { UChar32 c; U8_GET((const uint8_t*)str_cur_s, 0, match-1, str_cur_n, c); if (uset_whitespaces.contains(c)) occurrences_list.push_back(match); } else occurrences_list.push_back(match); } match = briter->next(); } R_len_t noccurrences = (R_len_t)occurrences_list.size(); // number of boundaries if (noccurrences <= 1) { // no match (1 boundary == 0) SET_VECTOR_ELT(ret, i, Rf_ScalarString(str_cont.toR(i))); continue; } // the number of "words" is: R_len_t nwords = noccurrences - 1; // convert occurrences_list to a vector // in order to obtain end positions (in a string) of each "words", // noting that occurrences_list.at(0) == 0 #ifndef NDEBUG if (occurrences_list.at(0) != 0) throw StriException("NDEBUG: stri_wrap: (occurrences_list.at(0) != 0)"); #endif std::vector end_pos_orig(nwords); deque::iterator iter = ++(occurrences_list.begin()); for (R_len_t j = 0; iter != occurrences_list.end(); ++iter, ++j) { end_pos_orig[j] = (*iter); // this is a UTF-8 index } // now: // we'll get the total widths/number of code points in each "word" std::vector widths_orig(nwords); // we'll get the total widths/number of code points without trailing whitespaces std::vector widths_trim(nwords); // we'll get the end positions without trailing whitespaces 
std::vector end_pos_trim(nwords); // detect line endings (fail on a match) UChar32 p; UChar32 c = 0; bool reset = true; R_len_t j = 0; R_len_t cur_block = 0; R_len_t cur_width_orig = 0; R_len_t cur_width_trim = 0; R_len_t cur_count_orig = 0; R_len_t cur_count_trim = 0; R_len_t cur_end_pos_trim = 0; while (j < str_cur_n) { R_len_t jlast = j; p = c; U8_NEXT(str_cur_s, j, str_cur_n, c); if (c < 0) // invalid utf-8 sequence throw StriException(MSG__INVALID_UTF8); if (uset_linebreaks.contains(c)) throw StriException(MSG__NEWLINE_FOUND); // OLD: cur_width_orig += stri__width_char(c); cur_width_orig += stri__width_char_with_context(c, p, reset); ++cur_count_orig; if (uset_whitespaces.contains(c)) { // OLD: trim all white spaces from the end: // ++cur_count_trim; // [we have the normalize arg for that] // NEW: trim just one white space at the end: // OLD: cur_width_trim = stri__width_char(c); cur_width_trim = stri__width_char_with_context(c, p, reset); cur_count_trim = 1; cur_end_pos_trim = jlast; } else { cur_width_trim = 0; cur_count_trim = 0; cur_end_pos_trim = j; } if (j >= str_cur_n || end_pos_orig[cur_block] <= j) { // we'll start a new block in a moment if (use_length_val) { widths_orig[cur_block] = cur_count_orig; widths_trim[cur_block] = cur_count_orig-cur_count_trim; } else { widths_orig[cur_block] = cur_width_orig; widths_trim[cur_block] = cur_width_orig-cur_width_trim; } end_pos_trim[cur_block] = cur_end_pos_trim; cur_block++; cur_width_orig = 0; cur_width_trim = 0; cur_count_orig = 0; cur_count_trim = 0; cur_end_pos_trim = j; reset = true; } } // do wrap std::deque wrap_after; // wrap line after which word in {0..nwords-1}? 
if (exponent_val <= 0.0) { stri__wrap_greedy(wrap_after, nwords, width_val, widths_orig, widths_trim, (use_length_val)?((i==0)?ii.count:pi.count):((i==0)?ii.width:pi.width), (use_length_val)?pe.count:pe.width); } else { stri__wrap_dynamic(wrap_after, nwords, width_val, exponent_val, widths_orig, widths_trim, (use_length_val)?((i==0)?ii.count:pi.count):((i==0)?ii.width:pi.width), (use_length_val)?pe.count:pe.width); } // wrap_after.size() line breaks => wrap_after.size()+1 lines R_len_t nlines = (R_len_t)wrap_after.size()+1; R_len_t last_pos = 0; SEXP ans; STRI__PROTECT(ans = Rf_allocVector(STRSXP, nlines)); deque::iterator iter_wrap = wrap_after.begin(); for (R_len_t u = 0; iter_wrap != wrap_after.end(); ++iter_wrap, ++u) { R_len_t wrap_after_cur = *iter_wrap; R_len_t cur_pos = end_pos_trim[wrap_after_cur]; std::string cs; if (i == 0 && u == 0) cs = ii.str; else if (i > 0 && u == 0) cs = pi.str; else cs = pe.str; cs.append(str_cur_s+last_pos, cur_pos-last_pos); SET_STRING_ELT(ans, u, Rf_mkCharLenCE(cs.c_str(), cs.size(), CE_UTF8)); last_pos = end_pos_orig[wrap_after_cur]; } // last line goes here: std::string cs; if (i == 0 && nlines-1 == 0) cs = ii.str; else if (i > 0 && nlines-1 == 0) cs = pi.str; else cs = pe.str; cs.append(str_cur_s+last_pos, end_pos_trim[nwords-1]-last_pos); SET_STRING_ELT(ans, nlines-1, Rf_mkCharLenCE(cs.c_str(), cs.size(), CE_UTF8)); SET_VECTOR_ELT(ret, i, ans); STRI__UNPROTECT(1); } if (briter) { delete briter; briter = NULL; } if (str_text) { utext_close(str_text); str_text = NULL; } STRI__UNPROTECT_ALL return ret; STRI__ERROR_HANDLER_END({ if (briter) { delete briter; briter = NULL; } if (str_text) { utext_close(str_text); str_text = NULL; } }) } stringi/src/stri_container_regex.cpp0000644000176200001440000003775614770541312017467 0ustar liggesusers/* This file is part of the 'stringi' project. * Copyright (c) 2013-2025, Marek Gagolewski * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "stri_stringi.h" #include "stri_container_regex.h" /** * Default constructor * */ StriContainerRegexPattern::StriContainerRegexPattern() : StriContainerUTF16() { this->lastMatcherIndex = -1; this->lastMatcher = NULL; this->lastCaptureGroupNamesIndex = -1; //this->lastCaptureGroupNames = ... 
//this->opts = 0; } /** * Construct String Container from R character vector * @param rstr R character vector * @param nrecycle extend length [vectorization] * @param flags regexp flags */ StriContainerRegexPattern::StriContainerRegexPattern(SEXP rstr, R_len_t _nrecycle, StriRegexMatcherOptions _opts) : StriContainerUTF16(rstr, _nrecycle, true) { this->lastMatcherIndex = -1; this->lastMatcher = NULL; this->lastCaptureGroupNamesIndex = -1; //this->lastCaptureGroupNames = ... this->opts = _opts; R_len_t n = get_n(); for (R_len_t i=0; ilastMatcherIndex = -1; this->lastMatcher = NULL; this->lastCaptureGroupNamesIndex = -1; //this->lastCaptureGroupNames = ... this->opts = container.opts; } StriContainerRegexPattern& StriContainerRegexPattern::operator=(StriContainerRegexPattern& container) { this->~StriContainerRegexPattern(); (StriContainerUTF16&) (*this) = (StriContainerUTF16&)container; this->lastMatcherIndex = -1; this->lastMatcher = NULL; this->lastCaptureGroupNamesIndex = -1; //this->lastCaptureGroupNames = ... 
this->opts = container.opts; return *this; } /** Destructor * */ StriContainerRegexPattern::~StriContainerRegexPattern() { if (lastMatcher) { delete lastMatcher; lastMatcher = NULL; } } /** Get names of all capture groups in the i-th regex as an R STRSXP * allows reusing previous * * @param i index * @param include_whole_match whether the first elem should be an additional "" * @param last_i set to -1 to recompute * @param ret might copy dimnames from ret[last_i] * * @version 1.7.1 (Marek Gagolewski, 2021-06-20) #153 */ SEXP StriContainerRegexPattern::getCaptureGroupRNames( R_len_t i // TODO allow reuse ) { // TODO - refactor - too similar to getCaptureGroupRDimnames if (this->isNA(i) || this->get(i).length() <= 0) return R_NilValue; const std::vector& cgnames = this->getCaptureGroupNames(i); R_len_t pattern_cur_groups = cgnames.size(); bool has_cgnames = false; for (R_len_t j=0; j")); for (R_len_t j=0; jisNA(i) || this->get(i).length() <= 0) return R_NilValue; // last dimnames could be cached here but then // we'd have to use R_PreserveObject and R_ReleaseObject; // R-ext states "It is less efficient than the normal protection mechanism, // and should be used sparingly." // If a user calls PROTECT and then UNPROTECT on retval, how does this // interfere with R_PreserveObject? 
if (last_i >= 0 && !Rf_isNull(ret) && (last_i % this->get_n()) == (i % this->get_n())) { // reuse last dimnames SEXP tmp, dimnames; PROTECT(tmp = VECTOR_ELT(ret, last_i)); PROTECT(dimnames = Rf_getAttrib(tmp, R_DimNamesSymbol)); UNPROTECT(2); return dimnames; } else { const std::vector& cgnames = this->getCaptureGroupNames(i); R_len_t pattern_cur_groups = cgnames.size(); bool has_cgnames = false; for (R_len_t j=0; j")); for (R_len_t j=0; j= this->n the last matcher is returned * * @param i index * * @version 1.7.1 (Marek Gagolewski, 2021-06-19) #153 */ const std::vector& StriContainerRegexPattern::getCaptureGroupNames(R_len_t i) { STRI_ASSERT(lastMatcherIndex >= 0 && lastMatcherIndex == (i % n)); STRI_ASSERT(lastMatcher); STRI_ASSERT(!this->isNA(i)); STRI_ASSERT(this->get(i).length() > 0); if (this->lastCaptureGroupNamesIndex == (i % n)) { return lastCaptureGroupNames; // reuse } int ngroups = lastMatcher->groupCount(); lastCaptureGroupNames = std::vector(ngroups); this->lastCaptureGroupNamesIndex = (i % n); if (ngroups == 0) return lastCaptureGroupNames; // nothing to do #if U_ICU_VERSION_MAJOR_NUM>=55 /* Support for named capture groups has been introduced in ICU 55 This is not documented, but the named capture group names are like [A-Za-z][A-Za-z0-9]* uregex.cpp:1506 in ICU 69.1 has something like: if ((c32 >= 0x41 && c32 <= 0x5a) || // A..Z (c32 >= 0x61 && c32 <= 0x7a) || // a..z (c32 >= 0x31 && c32 <= 0x39)) { // 0..9 groupName.append(c32); } RegexPattern (regex.h) has UHashtable *fNamedCaptureMap; // Map from capture group names to numbers. 
but it's private and we're no friends with it here's a simple regex pattern "parser" to extract group names which then be queried using RegexPattern::groupNumberFromName #if U_ICU_VERSION_MAJOR_NUM>=55 #endif */ UErrorCode status = U_ZERO_ERROR; UText* p = lastMatcher->pattern().patternText(status); if (U_FAILURE(status)) throw StriException(status); UChar32 c = utext_next32From(p, 0); while (c >= 0) { // this is not necessarily bullet-proof, but, come on, // these are just labels ;) // (\\?) -- not a named capture group // \\(?.\\) -- not a capture group // [(?.)] -- not a capture group if (c == '\\') { c = utext_next32(p); // go to next c = utext_next32(p); // ignore next } else if (c == '[') { // go to ...] but ignore \] for (c = utext_next32(p); c >= 0 && c != ']'; c = utext_next32(p)) { if (c == '\\') c = utext_next32(p); // ignore what follows } c = utext_next32(p); // go to next } else if (c == '(') { c = utext_next32(p); if (c != '?') { c = utext_next32(p); continue; } c = utext_next32(p); if (c != '<') { c = utext_next32(p); continue; } std::string groupName; for ( c = utext_next32(p); (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9'); c = utext_next32(p) ) { // technically, first char should be [A-Za-z], but RegexPattern will check that anyway groupName.push_back((char)c); } if (c == '>') { status = U_ZERO_ERROR; int group = lastMatcher->pattern().groupNumberFromName(groupName.c_str(), -1, status); if (U_SUCCESS(status)) { // if not, just ignore group--; // 1-based indexing //Rprintf("%d %s\n", group, groupName.c_str()); STRI_ASSERT(group >= 0 && group < ngroups); lastCaptureGroupNames[group] = groupName; } } c = utext_next32(p);; } else c = utext_next32(p); } #endif /* U_ICU_VERSION_MAJOR_NUM>=55 */ return lastCaptureGroupNames; } /** The returned matcher shall not be deleted by the user * * it is assumed that vectorize_next() is used: * for i >= this->n the last matcher is returned * * @param i index */ RegexMatcher* 
StriContainerRegexPattern::getMatcher(R_len_t i) { if (lastMatcher) { if (this->lastMatcherIndex >= 0 && this->lastMatcherIndex == (i % n)) { return lastMatcher; // reuse } else { delete lastMatcher; // invalidate this->lastMatcher = NULL; } } UErrorCode status = U_ZERO_ERROR; lastMatcher = new RegexMatcher(this->get(i), opts.flags, status); if (U_FAILURE(status)) { if (lastMatcher) delete lastMatcher; lastMatcher = NULL; const char* context; // to ease debugging, #382 std::string s; if (str[i%n].isBogus()) context = NULL; else { str[i%n].toUTF8String(s); context = s.c_str(); } throw StriException(status, context); } if (!lastMatcher) throw StriException(MSG__MEM_ALLOC_ERROR); if (opts.stack_limit > 0) { lastMatcher->setStackLimit(opts.stack_limit, status); STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) } if (opts.time_limit > 0) { lastMatcher->setTimeLimit(opts.time_limit, status); STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) } this->lastMatcherIndex = (i % n); return lastMatcher; } /** Read regex flags from a list * * may call Rf_error * * @param opts_regex list * @return flags * * @version 0.1-?? 
(Marek Gagolewski) * * @version 0.2-3 (Marek Gagolewski, 2014-05-09) * allow NULL for opts_regex * * @version 0.3-1 (Marek Gagolewski, 2014-11-05) * Disallow NA options * * @version 1.1.6 (Marek Gagolewski, 2017-11-10) * PROTECT STRING_ELT(names, i) * * @version 1.4.7 (Marek Gagolewski, 2020-08-24) * add time_limit and stack_limit */ StriRegexMatcherOptions StriContainerRegexPattern::getRegexOptions(SEXP opts_regex) { int32_t stack_limit = 0; int32_t time_limit = 0; uint32_t flags = 0; if (!Rf_isNull(opts_regex) && !Rf_isVectorList(opts_regex)) Rf_error(MSG__ARG_EXPECTED_LIST, "opts_regex"); // error() call allowed here R_len_t narg = Rf_isNull(opts_regex)?0:LENGTH(opts_regex); if (narg > 0) { SEXP names = PROTECT(Rf_getAttrib(opts_regex, R_NamesSymbol)); if (names == R_NilValue || LENGTH(names) != narg) Rf_error(MSG__REGEX_CONFIG_FAILED); // error() call allowed here for (R_len_t i=0; i * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "stri_stringi.h" #include "stri_container_utf16.h" #include "stri_container_usearch.h" #include #include using namespace std; /** * Locate first or last occurrences of pattern in a string [with collation] * * @param str character vector * @param pattern character vector * @param opts_collator passed to stri__ucol_open(), * if \code{NA}, then \code{stri__locate_firstlast_fixed_byte} is called * @param first looking for first or last match? * @return integer matrix (2 columns) * * @version 0.1-?? (Bartlomiej Tartanus) * * @version 0.1-?? (Bartlomiej Tartanus, 2013-06-09) * StriContainerUTF16 & collator * * @version 0.1-?? 
(Marek Gagolewski, 2013-06-23) * StriException friendly, use StriContainerUStringSearch * * @version 0.2-3 (Marek Gagolewski, 2014-05-08) * new fun: stri_locate_firstlast_coll (opts_collator == NA not allowed) * * @version 0.3-1 (Marek Gagolewski, 2014-11-04) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc * * @version 1.7.1 (Marek Gagolewski, 2021-06-29) * get_length */ SEXP stri__locate_firstlast_coll(SEXP str, SEXP pattern, SEXP opts_collator, bool first, bool get_length1) { PROTECT(str = stri__prepare_arg_string(str, "str")); PROTECT(pattern = stri__prepare_arg_string(pattern, "pattern")); UCollator* collator = NULL; collator = stri__ucol_open(opts_collator); STRI__ERROR_HANDLER_BEGIN(2) R_len_t vectorize_length = stri__recycling_rule(true, 2, LENGTH(str), LENGTH(pattern)); StriContainerUTF16 str_cont(str, vectorize_length); StriContainerUStringSearch pattern_cont(pattern, vectorize_length, collator); // collator is not owned by pattern_cont SEXP ret; STRI__PROTECT(ret = Rf_allocMatrix(INTSXP, vectorize_length, 2)); stri__locate_set_dimnames_matrix(ret, get_length1); int* ret_tab = INTEGER(ret); for (R_len_t i = pattern_cont.vectorize_init(); i != pattern_cont.vectorize_end(); i = pattern_cont.vectorize_next(i)) { ret_tab[i] = NA_INTEGER; ret_tab[i+vectorize_length] = NA_INTEGER; STRI__CONTINUE_ON_EMPTY_OR_NA_STR_PATTERN( str_cont, pattern_cont, ;/*nothing on NA - keep NA_INTEGER*/, { if (get_length1) ret_tab[i] = ret_tab[i+vectorize_length] = -1; } ) UStringSearch *matcher = pattern_cont.getMatcher(i, str_cont.get(i)); usearch_reset(matcher); UErrorCode status = U_ZERO_ERROR; int start; if (first) { start = usearch_first(matcher, &status); } else { start = usearch_last(matcher, &status); } STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) if (start != USEARCH_DONE) { // there is a match ret_tab[i] = start; ret_tab[i+vectorize_length] = start + usearch_getMatchedLength(matcher); // Adjust UChar index -> UChar32 index (1-2 
byte UTF16 to 1 byte UTF32-code points) str_cont.UChar16_to_UChar32_index(i, ret_tab+i, ret_tab+i+vectorize_length, 1, 1, // 0-based index -> 1-based 0 // end returns position of next character after match ); if (get_length1) ret_tab[i+vectorize_length] -= ret_tab[i] - 1; // to->length } else if (get_length1) { // not found ret_tab[i+vectorize_length] = ret_tab[i] = -1; } // else NA_INTEGER already } if (collator) { ucol_close(collator); collator=NULL; } STRI__UNPROTECT_ALL return ret; STRI__ERROR_HANDLER_END( if (collator) ucol_close(collator); ) } /** * Locate first occurrences of pattern in a string [with collation] * * @param str character vector * @param pattern character vector * @param opts_collator list * @return integer matrix (2 columns) * * @version 0.1-?? (Bartlomiej Tartanus) * * @version 0.1-?? (Bartlomiej Tartanus, 2013-06-09) * StriContainerUTF16 & collator * * @version 0.1-?? (Marek Gagolewski, 2013-06-23) * use stri_locate_firstlast_fixed * * @version 0.2-3 (Marek Gagolewski, 2014-05-08) * new fun: stri_locate_first_coll (opts_collator == NA not allowed) * * @version 1.7.1 (Marek Gagolewski, 2021-06-29) * get_length */ SEXP stri_locate_first_coll(SEXP str, SEXP pattern, SEXP opts_collator, SEXP get_length) { bool get_length1 = stri__prepare_arg_logical_1_notNA(get_length, "get_length"); return stri__locate_firstlast_coll(str, pattern, opts_collator, true, get_length1); } /** * Locate last occurrences of pattern in a string [with collation] * * @param str character vector * @param pattern character vector * @param opts_collator list * @return integer matrix (2 columns) * * @version 0.1-?? (Bartlomiej Tartanus) * * @version 0.1-?? (Bartlomiej Tartanus, 2013-06-09) * StriContainerUTF16 & collator * * @version 0.1-?? 
(Marek Gagolewski, 2013-06-23) * use stri_locate_firstlast_fixed * * @version 0.2-3 (Marek Gagolewski, 2014-05-08) * new fun: stri_locate_last_coll (opts_collator == NA not allowed) * * @version 1.7.1 (Marek Gagolewski, 2021-06-29) * get_length */ SEXP stri_locate_last_coll(SEXP str, SEXP pattern, SEXP opts_collator, SEXP get_length) { bool get_length1 = stri__prepare_arg_logical_1_notNA(get_length, "get_length"); return stri__locate_firstlast_coll(str, pattern, opts_collator, false, get_length1); } /** * Locate all pattern occurrences in a string [with collation] * * @param str character vector * @param pattern character vector * @param opts_collator passed to stri__ucol_open(), * if \code{NA}, then \code{stri__locate_all_fixed_byte} is called * @return list of integer matrices (2 columns) * * @version 0.1-?? (Bartlomiej Tartanus) * * @version 0.1-?? (Bartlomiej Tartanus, 2013-06-09) * StriContainerUTF16 & collator * * @version 0.1-?? (Marek Gagolewski, 2013-06-23) * StriException friendly, use StriContainerUStringSearch * * * @version 0.2-3 (Marek Gagolewski, 2014-05-08) * new fun: stri_locate_all_coll (opts_collator == NA not allowed) * * @version 0.3-1 (Marek Gagolewski, 2014-11-04) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc * * @version 0.4-1 (Marek Gagolewski, 2014-11-27) * FR #117: omit_no_match arg added * * @version 1.7.1 (Marek Gagolewski, 2021-06-29) * get_length */ SEXP stri_locate_all_coll(SEXP str, SEXP pattern, SEXP omit_no_match, SEXP opts_collator, SEXP get_length) { bool omit_no_match1 = stri__prepare_arg_logical_1_notNA(omit_no_match, "omit_no_match"); bool get_length1 = stri__prepare_arg_logical_1_notNA(get_length, "get_length"); PROTECT(str = stri__prepare_arg_string(str, "str")); PROTECT(pattern = stri__prepare_arg_string(pattern, "pattern")); UCollator* collator = NULL; collator = stri__ucol_open(opts_collator); STRI__ERROR_HANDLER_BEGIN(2) R_len_t vectorize_length = stri__recycling_rule(true, 2, LENGTH(str), 
LENGTH(pattern)); StriContainerUTF16 str_cont(str, vectorize_length); StriContainerUStringSearch pattern_cont(pattern, vectorize_length, collator); // collator is not owned by pattern_cont SEXP ret; STRI__PROTECT(ret = Rf_allocVector(VECSXP, vectorize_length)); for (R_len_t i = pattern_cont.vectorize_init(); i != pattern_cont.vectorize_end(); i = pattern_cont.vectorize_next(i)) { STRI__CONTINUE_ON_EMPTY_OR_NA_STR_PATTERN(str_cont, pattern_cont, SET_VECTOR_ELT(ret, i, stri__matrix_NA_INTEGER(1, 2));, SET_VECTOR_ELT(ret, i, stri__matrix_NA_INTEGER(omit_no_match1?0:1, 2, get_length1?-1:NA_INTEGER));) UStringSearch *matcher = pattern_cont.getMatcher(i, str_cont.get(i)); usearch_reset(matcher); UErrorCode status = U_ZERO_ERROR; int start = (int)usearch_first(matcher, &status); STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) if (start == USEARCH_DONE) { SET_VECTOR_ELT(ret, i, stri__matrix_NA_INTEGER(omit_no_match1?0:1, 2, get_length1?-1:NA_INTEGER)); continue; } deque< pair > occurrences; while (start != USEARCH_DONE) { occurrences.push_back(pair(start, start+usearch_getMatchedLength(matcher))); start = usearch_next(matcher, &status); STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) } R_len_t noccurrences = (R_len_t)occurrences.size(); SEXP ans; STRI__PROTECT(ans = Rf_allocMatrix(INTSXP, noccurrences, 2)); int* ans_tab = INTEGER(ans); deque< pair >::iterator iter = occurrences.begin(); for (R_len_t j = 0; iter != occurrences.end(); ++iter, ++j) { pair match = *iter; ans_tab[j] = match.first; ans_tab[j+noccurrences] = match.second; } // Adjust UChar index -> UChar32 index (1-2 byte UTF16 to 1 byte UTF32-code points) str_cont.UChar16_to_UChar32_index(i, ans_tab, ans_tab+noccurrences, noccurrences, 1, // 0-based index -> 1-based 0 // end returns position of next character after match ); if (get_length1) { for (R_len_t j=0; j < noccurrences; ++j) ans_tab[j+noccurrences] -= ans_tab[j] - 1; // to->length } SET_VECTOR_ELT(ret, i, ans); 
STRI__UNPROTECT(1); } stri__locate_set_dimnames_list(ret, get_length1); if (collator) { ucol_close(collator); collator=NULL; } STRI__UNPROTECT_ALL return ret; STRI__ERROR_HANDLER_END( if (collator) ucol_close(collator); ) } stringi/src/Makevars.in0000644000176200001440000000130714750110642014622 0ustar liggesusers# kate: hl Makefile ## 'stringi' Makevars ## Copyright (C) 2013-2025, Marek Gagolewski @STRINGI_CXXSTD@ PKG_CPPFLAGS=@STRINGI_CPPFLAGS@ PKG_CXXFLAGS=@STRINGI_CXXFLAGS@ #PKG_CFLAGS=@STRINGI_CFLAGS@ PKG_LIBS=@STRINGI_LDFLAGS@ @STRINGI_LIBS@ STRI_SOURCES_CPP=@STRINGI_SOURCES_CPP@ STRI_OBJECTS=$(STRI_SOURCES_CPP:.cpp=.o) ICU_COMMON_SOURCES_CPP=@STRINGI_ICU_COMMON_SOURCES_CPP@ ICU_COMMON_OBJECTS=$(ICU_COMMON_SOURCES_CPP:.cpp=.o) ICU_I18N_SOURCES_CPP=@STRINGI_ICU_I18N_SOURCES_CPP@ ICU_I18N_OBJECTS=$(ICU_I18N_SOURCES_CPP:.cpp=.o) ICU_STUBDATA_SOURCES_CPP=@STRINGI_ICU_STUBDATA_SOURCES_CPP@ ICU_STUBDATA_OBJECTS=$(ICU_STUBDATA_SOURCES_CPP:.cpp=.o) OBJECTS=@STRINGI_OBJECTS@ stringi/src/stri_collator.cpp0000644000176200001440000002316014770541371016117 0ustar liggesusers/* This file is part of the 'stringi' project. * Copyright (c) 2013-2025, Marek Gagolewski * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "stri_stringi.h" #include #include /** * Create & set up an ICU Collator * * WARNING: this function is allowed to call the error() function. * Use before STRI__ERROR_HANDLER_BEGIN (with other prepareargs). * * @param opts_collator named R list * @return a Collator object that should be closed with ucol_close() after use * * * @version 0.1-?? 
(Marek Gagolewski) * * @version 0.2-1 (Marek Gagolewski, 2014-04-17) * allow for NULL opts_collator (identical to list()) * * @version 0.2-3 (Marek Gagolewski, 2014-05-09) * disallow NA as opts_collator * * @version 0.3-1 (Marek Gagolewski, 2014-11-05) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc; * + many other bugs in settings establishment * * @version 0.3-1 (Marek Gagolewski, 2014-11-06) * Fetch opts vals first to avoid memleaks (missing ucol_close calls on Rf_error) * * @version 0.4-1 (Marek Gagolewski, 2014-12-08) * #23: add `overlap` option * * @version 1.1.6 (Marek Gagolewski, 2017-11-10) * PROTECT STRING_ELT(names, i) * * @version 1.8.1 (Marek Gagolewski, 2023-11-07) * #476: Warn when falling back to the root locale, make C==en_US_POSIX */ UCollator* stri__ucol_open(SEXP opts_collator) { if (!Rf_isNull(opts_collator) && !Rf_isVectorList(opts_collator)) Rf_error(MSG__INCORRECT_COLLATOR_OPTION_SPEC); // error() allowed here R_len_t narg = Rf_isNull(opts_collator)?0:LENGTH(opts_collator); const char* default_locale = stri__prepare_arg_locale(R_NilValue, "locale"); if (narg <= 0) { // no custom settings - use default Collator UErrorCode status = U_ZERO_ERROR; UCollator* col = ucol_open(default_locale, &status); STRI__CHECKICUSTATUS_RFERROR(status, {/* do nothing special on err */}) // error() allowed here return col; } SEXP names = PROTECT(Rf_getAttrib(opts_collator, R_NamesSymbol)); if (names == R_NilValue || LENGTH(names) != narg) Rf_error(MSG__INCORRECT_COLLATOR_OPTION_SPEC); // error() allowed here /* First, let's fetch collator's options -- this process may call Rf_error, so we cannot do uloc_open yet (memleaks!) 
*/ UColAttributeValue opt_FRENCH_COLLATION = UCOL_DEFAULT; UColAttributeValue opt_ALTERNATE_HANDLING = UCOL_DEFAULT; UColAttributeValue opt_CASE_FIRST = UCOL_DEFAULT; UColAttributeValue opt_CASE_LEVEL = UCOL_DEFAULT; UColAttributeValue opt_NORMALIZATION_MODE = UCOL_DEFAULT; UColAttributeValue opt_STRENGTH = UCOL_DEFAULT_STRENGTH; UColAttributeValue opt_NUMERIC_COLLATION = UCOL_DEFAULT; // USearchAttributeValue opt_OVERLAP = USEARCH_OFF; const char* opt_LOCALE = default_locale; for (R_len_t i=0; i (int)UCOL_STRENGTH_LIMIT + 1) val = (int)UCOL_STRENGTH_LIMIT + 1; opt_STRENGTH = (UColAttributeValue)(val-1); // } else if (!strcmp(curname, "overlap") && allow_overlap) { // bool val_bool = stri__prepare_arg_logical_1_notNA(tmp_arg, "overlap"); // opt_OVERLAP = (val_bool?USEARCH_ON:USEARCH_OFF); } else if (!strcmp(curname, "alternate_shifted")) { bool val_bool = stri__prepare_arg_logical_1_notNA(tmp_arg, "alternate_shifted"); opt_ALTERNATE_HANDLING = (val_bool?UCOL_SHIFTED:UCOL_NON_IGNORABLE); } else if (!strcmp(curname, "uppercase_first")) { SEXP val; PROTECT(val = stri__prepare_arg_logical_1(tmp_arg, "uppercase_first")); opt_CASE_FIRST = (LOGICAL(val)[0]==NA_LOGICAL?UCOL_OFF: (LOGICAL(val)[0]?UCOL_UPPER_FIRST:UCOL_LOWER_FIRST)); UNPROTECT(1); } else if (!strcmp(curname, "french")) { bool val_bool = stri__prepare_arg_logical_1_notNA(tmp_arg, "french"); opt_FRENCH_COLLATION = (val_bool?UCOL_ON:UCOL_OFF); } else if (!strcmp(curname, "case_level")) { bool val_bool = stri__prepare_arg_logical_1_notNA(tmp_arg, "case_level"); opt_CASE_LEVEL = (val_bool?UCOL_ON:UCOL_OFF); } else if (!strcmp(curname, "normalization")) { bool val_bool = stri__prepare_arg_logical_1_notNA(tmp_arg, "normalization"); opt_NORMALIZATION_MODE = (val_bool?UCOL_ON:UCOL_OFF); } else if (!strcmp(curname, "numeric")) { bool val_bool = stri__prepare_arg_logical_1_notNA(tmp_arg, "numeric"); opt_NUMERIC_COLLATION = (val_bool?UCOL_ON:UCOL_OFF); } else { Rf_warning(MSG__INCORRECT_COLLATOR_OPTION, curname); } 
UNPROTECT(1); } UNPROTECT(1); /* names */ // create collator UErrorCode status = U_ZERO_ERROR; UCollator* col = ucol_open(opt_LOCALE, &status); STRI__CHECKICUSTATUS_RFERROR(status, { /* nothing special on err */ }) // error() allowed here if (status == U_USING_DEFAULT_WARNING && opt_LOCALE) { UErrorCode status2 = U_ZERO_ERROR; const char* valid_locale = ucol_getLocaleByType(col, ULOC_VALID_LOCALE, &status2); if (valid_locale && !strcmp(valid_locale, "root")) Rf_warning("%s", ICUError::getICUerrorName(status)); } // else if (status == U_USING_FALLBACK_WARNING) // warning on this would be too invasive // Rf_warning("%s", ICUError::getICUerrorName(status)); // set other opts // if (opt_OVERLAP != UCOL_OFF) { // status = U_ZERO_ERROR; // ucol_setAttribute(col, UCOL_OVERLAP, opt_OVERLAP, &status); // STRI__CHECKICUSTATUS_RFERROR(status, { ucol_close(col); }) // error() allowed here // } if (opt_STRENGTH != UCOL_DEFAULT_STRENGTH) { status = U_ZERO_ERROR; ucol_setAttribute(col, UCOL_STRENGTH, opt_STRENGTH, &status); STRI__CHECKICUSTATUS_RFERROR(status, { ucol_close(col); }) // error() allowed here } if (opt_FRENCH_COLLATION != UCOL_DEFAULT) { status = U_ZERO_ERROR; ucol_setAttribute(col, UCOL_FRENCH_COLLATION, opt_FRENCH_COLLATION, &status); STRI__CHECKICUSTATUS_RFERROR(status, { ucol_close(col); }) // error() allowed here } if (opt_ALTERNATE_HANDLING != UCOL_DEFAULT) { status = U_ZERO_ERROR; ucol_setAttribute(col, UCOL_ALTERNATE_HANDLING, opt_ALTERNATE_HANDLING, &status); STRI__CHECKICUSTATUS_RFERROR(status, { ucol_close(col); }) // error() allowed here } if (opt_CASE_FIRST != UCOL_DEFAULT) { status = U_ZERO_ERROR; ucol_setAttribute(col, UCOL_CASE_FIRST, opt_CASE_FIRST, &status); STRI__CHECKICUSTATUS_RFERROR(status, { ucol_close(col); }) // error() allowed here } if (opt_CASE_LEVEL != UCOL_DEFAULT) { status = U_ZERO_ERROR; ucol_setAttribute(col, UCOL_CASE_LEVEL, opt_CASE_LEVEL, &status); STRI__CHECKICUSTATUS_RFERROR(status, { ucol_close(col); }) // error() allowed here } 
if (opt_NORMALIZATION_MODE != UCOL_DEFAULT) { status = U_ZERO_ERROR; ucol_setAttribute(col, UCOL_NORMALIZATION_MODE, opt_NORMALIZATION_MODE, &status); STRI__CHECKICUSTATUS_RFERROR(status, { ucol_close(col); }) // error() allowed here } if (opt_NUMERIC_COLLATION != UCOL_DEFAULT) { status = U_ZERO_ERROR; ucol_setAttribute(col, UCOL_NUMERIC_COLLATION, opt_NUMERIC_COLLATION, &status); STRI__CHECKICUSTATUS_RFERROR(status, { ucol_close(col); }) // error() allowed here } return col; } stringi/src/stri_search_class_subset.cpp0000644000176200001440000001774514770541312020326 0ustar liggesusers/* This file is part of the 'stringi' project. * Copyright (c) 2013-2025, Marek Gagolewski * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "stri_stringi.h" #include "stri_container_utf8.h" #include "stri_container_charclass.h" /** * Detect if a character class occurs in a string * * @param str character vector * @param pattern character vector * @param omit_na single logical value * @return logical vector * * @version 0.3-1 (Bartek Tartanus, 2014-07-25) * * @version 0.3-1 (Marek Gagolewski, 2014-10-17) * using std::vector to avoid mem-leaks * * @version 0.3-1 (Marek Gagolewski, 2014-11-04) * #112: str_prepare_arg* retvals were not PROTECTed from gc * * @version 0.4-1 (Marek Gagolewski, 2014-12-04) * #122: omit_na arg added * * @version 1.0-3 (Marek Gagolewski, 2016-02-03) * #216: `negate` arg added * * @version 1.7.1 (Marek Gagolewski, 2021-06-17) * assure LENGTH(pattern) <= LENGTH(str) */ SEXP stri_subset_charclass(SEXP str, SEXP pattern, SEXP omit_na, SEXP negate) { bool negate_1 = stri__prepare_arg_logical_1_notNA(negate, "negate"); bool omit_na1 = stri__prepare_arg_logical_1_notNA(omit_na, "omit_na"); PROTECT(str = stri__prepare_arg_string(str, "str")); PROTECT(pattern = stri__prepare_arg_string(pattern, "pattern")); if (LENGTH(str) > 0 && LENGTH(str) < LENGTH(pattern)) Rf_error(MSG__WARN_RECYCLING_RULE2); R_len_t vectorize_length = stri__recycling_rule(true, 2, LENGTH(str), LENGTH(pattern)); if (vectorize_length == 0) { UNPROTECT(2); return Rf_allocVector(STRSXP, 0); } STRI__ERROR_HANDLER_BEGIN(2) StriContainerUTF8 str_cont(str, vectorize_length); 
StriContainerCharClass pattern_cont(pattern, vectorize_length); // BT: this cannot be done with deque, because pattern is reused so i does not // go like 0,1,2...n but 0,pat_len,2*pat_len,1,pat_len+1 and so on // MG: agreed std::vector which(vectorize_length); int result_counter = 0; for (R_len_t i = pattern_cont.vectorize_init(); i != pattern_cont.vectorize_end(); i = pattern_cont.vectorize_next(i)) { if (str_cont.isNA(i) || pattern_cont.isNA(i)) { if (omit_na1) which[i] = FALSE; else { which[i] = NA_LOGICAL; result_counter++; } continue; } const UnicodeSet* pattern_cur = &pattern_cont.get(i); R_len_t str_cur_n = str_cont.get(i).length(); const char* str_cur_s = str_cont.get(i).c_str(); UChar32 chr = 0; which[i] = FALSE; for (R_len_t j=0; jcontains(chr)) { which[i] = TRUE; break; } } if (negate_1) which[i] = !which[i]; if (which[i]) result_counter++; } SEXP ret; STRI__PROTECT(ret = stri__subset_by_logical(str_cont, which, result_counter)); STRI__UNPROTECT_ALL return ret; STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */) } /** * Substitutes vector elements if a pattern occurs in a string * * @param str character vector * @param pattern character vector * @param value character vector * @return character vector * * @version 1.0-3 (Marek Gagolewski, 2016-02-03) * #124 * * @version 1.0-3 (Marek Gagolewski, 2016-02-03) * #216: `negate` arg added * * @version 1.7.1 (Marek Gagolewski, 2021-06-17) * assure LENGTH(pattern) and LENGTH(value) <= LENGTH(str) */ SEXP stri_subset_charclass_replacement(SEXP str, SEXP pattern, SEXP negate, SEXP value) { bool negate_1 = stri__prepare_arg_logical_1_notNA(negate, "negate"); PROTECT(str = stri__prepare_arg_string(str, "str")); PROTECT(pattern = stri__prepare_arg_string(pattern, "pattern")); PROTECT(value = stri__prepare_arg_string(value, "value")); // we are subsetting `str`, therefore recycling is slightly different here if (LENGTH(value) == 0) Rf_error(MSG__REPLACEMENT_ZERO); if (LENGTH(pattern) == 0) 
Rf_error(MSG__WARN_EMPTY_VECTOR); if (LENGTH(str) == 0) { UNPROTECT(3); return Rf_allocVector(STRSXP, 0); } if (LENGTH(str) < LENGTH(pattern)) // for LENGTH(value), we emit warning later on Rf_error(MSG__WARN_RECYCLING_RULE2); if ((LENGTH(str) % LENGTH(pattern)) != 0) Rf_warning(MSG__WARN_RECYCLING_RULE); R_len_t vectorize_length = LENGTH(str); STRI__ERROR_HANDLER_BEGIN(3) R_len_t value_length = LENGTH(value); StriContainerUTF8 value_cont(value, value_length); StriContainerUTF8 str_cont(str, vectorize_length); StriContainerCharClass pattern_cont(pattern, vectorize_length); SEXP ret; STRI__PROTECT(ret = Rf_allocVector(STRSXP, vectorize_length)); std::vector detected(vectorize_length, 0); for (R_len_t i = pattern_cont.vectorize_init(); i != pattern_cont.vectorize_end(); i = pattern_cont.vectorize_next(i)) { if (pattern_cont.isNA(i)) { // behave like `[<-` detected[i] = false; continue; } if (str_cont.isNA(i)) { detected[i] = NA_INTEGER; continue; } const UnicodeSet* pattern_cur = &pattern_cont.get(i); R_len_t str_cur_n = str_cont.get(i).length(); const char* str_cur_s = str_cont.get(i).c_str(); UChar32 chr = 0; bool found = false; for (R_len_t j=0; jcontains(chr)) { found = true; break; } } detected[i] = ((found && !negate_1) || (!found && negate_1)); } R_len_t k = 0; // we must traverse `str_cont` in order now for (R_len_t i = 0; i * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. 
Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "stri_stringi.h" #include "stri_container_utf8.h" /** * General statistics for a character vector * * @param str a character vector * @return integer vector, see R man for details * * @version 0.1-?? (Marek Gagolewski) * * @version 0.1-?? (Marek Gagolewski, 2013-06-09) * Use StriContainerUTF8 * * @version 0.1-?? 
(Marek Gagolewski, 2013-06-16) * make StriException-friendly * * @version 0.2-1 (Marek Gagolewski, 2014-04-01) * detect invalid UTF-8 byte streams * * @version 0.3-1 (Marek Gagolewski, 2014-11-04) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc */ SEXP stri_stats_general(SEXP str) { PROTECT(str = stri__prepare_arg_string(str, "str")); R_len_t str_length = LENGTH(str); STRI__ERROR_HANDLER_BEGIN(1) StriContainerUTF8 str_cont(str, str_length); enum { gsNumLines = 0, gsNumLinesNonEmpty = 1, gsNumChars = 2, gsNumCharsNonWhite = 3, gsAll = 4 // always == number of elements }; SEXP ret; STRI__PROTECT(ret = Rf_allocVector(INTSXP, gsAll)); int* stats = INTEGER(ret); for (int i=0; i * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "stri_stringi.h" #include "stri_container_utf8.h" #include "stri_container_utf16.h" #include "stri_string8buf.h" #include #include #include #include #include #include # define STRI_SORTRANKORDER_SORT 1 # define STRI_SORTRANKORDER_RANK 2 # define STRI_SORTRANKORDER_ORDER 3 /** help struct for stri_order **/ struct StriSortComparer { StriContainerUTF8* cont; bool decreasing; UCollator* col; StriSortComparer(StriContainerUTF8* _cont, UCollator* _col, bool _decreasing) { this->cont = _cont; this->col = _col; this->decreasing = _decreasing; } bool operator() (int a, int b) const { // if (col) { UErrorCode status = U_ZERO_ERROR; int ret = (int)ucol_strcollUTF8(col, cont->get(a).c_str(), cont->get(a).length(), cont->get(b).c_str(), cont->get(b).length(), &status); STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) return (decreasing)?(ret > 0):(ret < 0); // } // else { // int ret = stri__cmp_codepoints( // cont->get(a).c_str(), cont->get(a).length(), // cont->get(b).c_str(), cont->get(b).length() // ); // return (decreasing)?(ret > 0):(ret < 0); // } } }; /** Sort, rank, or generate an ordering permutation * * @param str character vector * @param decreasing single logical value * @param na_last single logical value * @param opts_collator passed to stri__ucol_open() * @param _type internal, 2 for order, 1 for sort, 3 for rank * @return integer vector (permutation/ranks) or character vector * * @version 0.1-?? 
(Marek Gagolewski) * * @version 0.1-?? (Marek Gagolewski, 2013-06-16) * make StriException friendly * * @version 0.1-?? (Marek Gagolewski, 2013-06-27) * Use UTF16 as ucol_strcollUTF8 is DRAFT as of ICU 4.8 * * @version 0.2-1 (Marek Gagolewski, 2014-03-20) * using ucol_strcollUTF8 again, as we now require ICU >= 50; * performance difference only observed for sorted vectors * (UTF-8: gain, 8bit: loss); * single function for cmp with and without collation; * new param: na_last * * @version 0.2-3 (Marek Gagolewski, 2014-05-07) * opts_collator == NA no longer allowed * * @version 0.3-1 (Marek Gagolewski, 2014-11-04) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc * * @version 0.6-1 (Marek Gagolewski, 2015-07-05) * use stri_order, stri_sort * * @version 1.6.1 (Marek Gagolewski, 2021-04-30) * rank */ SEXP stri_order_rank_or_sort(SEXP str, SEXP decreasing, SEXP na_last, SEXP opts_collator, int _type) { bool decr = stri__prepare_arg_logical_1_notNA(decreasing, "decreasing"); PROTECT(na_last = stri__prepare_arg_logical_1(na_last, "na_last")); PROTECT(str = stri__prepare_arg_string(str, "str")); // prepare string argument int na_last_int = INTEGER(na_last)[0]; // type is an internal arg -- check manually if (_type < 1 || _type > 3) Rf_error(MSG__INCORRECT_INTERNAL_ARG); if ( _type == STRI_SORTRANKORDER_RANK && (decr || na_last_int == NA_LOGICAL || !na_last_int) ) { // decreasing and na_last is ignored for rank Rf_error(MSG__INCORRECT_INTERNAL_ARG); } // call stri__ucol_open after prepare_arg: // if prepare_arg had failed, we would have a mem leak UCollator* col = NULL; col = stri__ucol_open(opts_collator); STRI__ERROR_HANDLER_BEGIN(2) R_len_t vectorize_length = LENGTH(str); StriContainerUTF8 str_cont(str, vectorize_length); deque NA_pos; vector order(vectorize_length); R_len_t k = 0; for (R_len_t i=0; i::iterator it=NA_pos.begin(); it!=NA_pos.end(); ++it, ++j) SET_STRING_ELT(ret, j, NA_STRING); } for (std::vector::iterator it=order.begin(); it!=order.end(); 
++it, ++j) SET_STRING_ELT(ret, j, str_cont.toR(*it)); if (na_last_int != NA_LOGICAL && na_last_int) { // put NAs last for (std::deque::iterator it=NA_pos.begin(); it!=NA_pos.end(); ++it, ++j) SET_STRING_ELT(ret, j, NA_STRING); } } else if (_type == STRI_SORTRANKORDER_ORDER) { STRI__PROTECT(ret = Rf_allocVector(INTSXP, k+NA_pos.size())); int* ret_tab = INTEGER(ret); R_len_t j = 0; if (na_last_int != NA_LOGICAL && !na_last_int) { // put NAs first for (std::deque::iterator it=NA_pos.begin(); it!=NA_pos.end(); ++it, ++j) ret_tab[j] = (*it)+1; // 1-based indices } for (std::vector::iterator it=order.begin(); it!=order.end(); ++it, ++j) ret_tab[j] = (*it)+1; // 1-based indices if (na_last_int != NA_LOGICAL && na_last_int) { // put NAs last for (std::deque::iterator it=NA_pos.begin(); it!=NA_pos.end(); ++it, ++j) ret_tab[j] = (*it)+1; // 1-based indices } } else { // (_type == STRI_SORTRANKORDER_RANK) // NAs are always preserved, order is increasing STRI__PROTECT(ret = Rf_allocVector(INTSXP, vectorize_length)); int* ret_tab = INTEGER(ret); for (R_len_t i=0; i::iterator it=order.begin(); it!=order.end(); ++it) { cur_idx = *it; if (j_first > 1) { UErrorCode status = U_ZERO_ERROR; if ( 0 != (int)ucol_strcollUTF8( col, str_cont.get(last_idx).c_str(), str_cont.get(last_idx).length(), str_cont.get(cur_idx).c_str(), str_cont.get(cur_idx).length(), &status ) ) { j_min = j_first; } // else reuse j_min == a tie. 
STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) } ret_tab[cur_idx] = j_min; last_idx = cur_idx; j_first++; } } if (col) { ucol_close(col); col = NULL; } STRI__UNPROTECT_ALL return ret; STRI__ERROR_HANDLER_END({ if (col) { ucol_close(col); col = NULL; } }) } /** Sort a character vector * * @param str character vector * @param decreasing single logical value * @param na_last single logical value * @param opts_collator passed to stri__ucol_open() * @return charcter vector * * @version 0.6-1 (Marek Gagolewski, 2015-07-05) * Call stri_order_rank_or_sort */ SEXP stri_sort(SEXP str, SEXP decreasing, SEXP na_last, SEXP opts_collator) { return stri_order_rank_or_sort(str, decreasing, na_last, opts_collator, STRI_SORTRANKORDER_SORT); } /** Return an ordering permutation * * @param str character vector * @param decreasing single logical value * @param na_last single logical value * @param opts_collator passed to stri__ucol_open() * @return integer vector (permutation) * * @version 0.6-1 (Marek Gagolewski, 2015-07-05) * Call stri_order_rank_or_sort */ SEXP stri_order(SEXP str, SEXP decreasing, SEXP na_last, SEXP opts_collator) { return stri_order_rank_or_sort(str, decreasing, na_last, opts_collator, STRI_SORTRANKORDER_ORDER); } /** Rank strings * * @param str character vector * @param opts_collator passed to stri__ucol_open() * @return integer vector (ranks) * * @version 1.6.1 (Marek Gagolewski, 2021-04-29) */ SEXP stri_rank(SEXP str, SEXP opts_collator) { return stri_order_rank_or_sort(str, Rf_ScalarLogical(FALSE)/*decreasing*/, Rf_ScalarLogical(TRUE)/*na_last*/, opts_collator, STRI_SORTRANKORDER_RANK); } /** Get unique elements from a character vector * * @param str character vector * @param opts_collator passed to stri__ucol_open() * @return character vector * * @version 0.2-1 (Bartek Tartanus, 2014-04-17) * * @version 0.2-1 (Marek Gagolewski, 2014-04-17) * using std::deque * * @version 0.2-3 (Marek Gagolewski, 2014-05-07) * opts_collator == NA no 
longer allowed * * @version 0.3-1 (Marek Gagolewski, 2014-11-04) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc */ SEXP stri_unique(SEXP str, SEXP opts_collator) { PROTECT(str = stri__prepare_arg_string(str, "str")); // prepare string argument // call stri__ucol_open after prepare_arg: // if prepare_arg had failed, we would have a mem leak UCollator* col = NULL; col = stri__ucol_open(opts_collator); STRI__ERROR_HANDLER_BEGIN(1) R_len_t vectorize_length = LENGTH(str); StriContainerUTF8 str_cont(str, vectorize_length); StriSortComparer comp(&str_cont, col, true); set uniqueset(comp); bool was_na = false; deque temp; for (R_len_t i=0; i::iterator,bool> result = uniqueset.insert(i); if (result.second) { temp.push_back(str_cont.toR(i)); } } } SEXP ret; STRI__PROTECT(ret = Rf_allocVector(STRSXP, temp.size())); R_len_t i = 0; for (deque::iterator it = temp.begin(); it != temp.end(); it++) { SET_STRING_ELT(ret, i++, *it); } if (col) { ucol_close(col); col = NULL; } STRI__UNPROTECT_ALL return ret; STRI__ERROR_HANDLER_END({ if (col) { ucol_close(col); col = NULL; } }) } /** Determine duplicated elements * * @param str character vector * @param fromLast logical value * @param opts_collator passed to stri__ucol_open() * @return logical vector * * @version 0.2-1 (Bartek Tartanus, 2014-04-17) * * @version 0.2-3 (Marek Gagolewski, 2014-05-07) * opts_collator == NA no longer allowed * * @version 0.3-1 (Marek Gagolewski, 2014-11-04) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc */ SEXP stri_duplicated(SEXP str, SEXP fromLast, SEXP opts_collator) { PROTECT(str = stri__prepare_arg_string(str, "str")); // prepare string argument bool fromLastBool = stri__prepare_arg_logical_1_notNA(fromLast, "fromLast"); // call stri__ucol_open after prepare_arg: // if prepare_arg had failed, we would have a mem leak UCollator* col = NULL; col = stri__ucol_open(opts_collator); STRI__ERROR_HANDLER_BEGIN(1) R_len_t vectorize_length = LENGTH(str); StriContainerUTF8 
str_cont(str, vectorize_length); StriSortComparer comp(&str_cont, col, true); set uniqueset(comp); bool was_na = false; SEXP ret; STRI__PROTECT(ret = Rf_allocVector(LGLSXP, vectorize_length)); int* ret_tab = LOGICAL(ret); if (fromLastBool) { for (R_len_t i=vectorize_length-1; i>=0; --i) { if (str_cont.isNA(i)) { ret_tab[i] = was_na; if (!was_na) was_na = true; } else { pair::iterator,bool> result = uniqueset.insert(i); ret_tab[i] = !result.second; } } } else { for (R_len_t i=0; i::iterator,bool> result = uniqueset.insert(i); ret_tab[i] = !result.second; } } } if (col) { ucol_close(col); col = NULL; } STRI__UNPROTECT_ALL return ret; STRI__ERROR_HANDLER_END({ if (col) { ucol_close(col); col = NULL; } }) } /** Determine first duplicated elements * * @param str character vector * @param fromLast logical value * @param opts_collator passed to stri__ucol_open() * @return integer vector * * @version 0.2-1 (Bartek Tartanus, 2014-04-17) * * @version 0.2-3 (Marek Gagolewski, 2014-05-07) * opts_collator == NA no longer allowed * * @version 0.3-1 (Marek Gagolewski, 2014-11-04) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc */ SEXP stri_duplicated_any(SEXP str, SEXP fromLast, SEXP opts_collator) { PROTECT(str = stri__prepare_arg_string(str, "str")); // prepare string argument bool fromLastBool = stri__prepare_arg_logical_1_notNA(fromLast, "fromLast"); // call stri__ucol_open after prepare_arg: // if prepare_arg had failed, we would have a mem leak UCollator* col = NULL; col = stri__ucol_open(opts_collator); STRI__ERROR_HANDLER_BEGIN(1) R_len_t vectorize_length = LENGTH(str); StriContainerUTF8 str_cont(str, vectorize_length); StriSortComparer comp(&str_cont, col, true); set uniqueset(comp); bool was_na = false; SEXP ret; STRI__PROTECT(ret = Rf_allocVector(INTSXP, 1)); int* ret_tab = INTEGER(ret); ret_tab[0] = 0; if (fromLastBool) { for (R_len_t i=vectorize_length-1; i>=0; --i) { if (str_cont.isNA(i)) { if (!was_na) was_na = true; else { ret_tab[0] = i+1; 
break; } } else { pair::iterator,bool> result = uniqueset.insert(i); if (!result.second) { ret_tab[0] = i+1; break; } } } } else { for (R_len_t i=0; i::iterator,bool> result = uniqueset.insert(i); if (!result.second) { ret_tab[0] = i+1; break; } } } } if (col) { ucol_close(col); col = NULL; } STRI__UNPROTECT_ALL return ret; STRI__ERROR_HANDLER_END({ if (col) { ucol_close(col); col = NULL; } }) } /** Compute a character sort key * * @param str character vector * @param opts_collator passed to stri__ucol_open() * @return character vector * * @version 1.4.7 (Davis Vaughan, 2020-07-15) * @version 1.6.1 (Marek Gagolewski, 2021-04-29) * output `bytes`-encoded strings */ SEXP stri_sort_key(SEXP str, SEXP opts_collator) { PROTECT(str = stri__prepare_arg_string(str, "str")); // call stri__ucol_open after prepare_arg: // if prepare_arg had failed, we would have a mem leak UCollator* col = stri__ucol_open(opts_collator); STRI__ERROR_HANDLER_BEGIN(1) R_len_t length = LENGTH(str); StriContainerUTF16 str_cont(str, length); SEXP ret; STRI__PROTECT(ret = Rf_allocVector(STRSXP, length)); // UErrorCode status = U_ZERO_ERROR; // Allocate temporary buffer to hold the current sort key size_t key_buffer_size = 16384; String8buf key_buffer(key_buffer_size); uint8_t* p_key_buffer_u8 = (uint8_t*) key_buffer.data(); for (R_len_t i = 0; i < length; ++i) { if (str_cont.isNA(i)) { SET_STRING_ELT(ret, i, NA_STRING); continue; } const UnicodeString* p_str_cur_data = &(str_cont.get(i)); const UChar* p_str_cur = p_str_cur_data->getBuffer(); const int str_cur_length = p_str_cur_data->length(); int32_t key_size = ucol_getSortKey(col, p_str_cur, str_cur_length, p_key_buffer_u8, key_buffer_size); // Reallocate a larger buffer and retry as required if ((size_t)key_size > key_buffer_size) { const int32_t key_padding = 100; key_buffer_size = key_size + key_padding; key_buffer.resize(key_buffer_size, false); p_key_buffer_u8 = (uint8_t*) key_buffer.data(); // Try again key_size = ucol_getSortKey(col, 
p_str_cur, str_cur_length, p_key_buffer_u8, key_buffer_size); } // `key_size` includes null terminator, // which we don't want to copy into the R CHARSXP R_len_t key_char_size = key_size - 1; SET_STRING_ELT(ret, i, Rf_mkCharLenCE(key_buffer.data(), key_char_size, CE_BYTES)); } if (col) { ucol_close(col); col = NULL; } STRI__UNPROTECT_ALL return ret; STRI__ERROR_HANDLER_END({ if (col) { ucol_close(col); col = NULL; } }) } stringi/src/stri_search_fixed_count.cpp0000644000176200001440000001001114770541312020116 0ustar liggesusers/* This file is part of the 'stringi' project. * Copyright (c) 2013-2025, Marek Gagolewski * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "stri_stringi.h" #include "stri_container_base.h" #include "stri_container_utf8.h" #include "stri_container_bytesearch.h" /** * Count the number of recurrences of \code{pattern} in \code{str} * [fast but dummy bitewise compare] * * @param str strings to search in * @param pattern patterns to search for * @return integer vector * * @version 0.1-?? (Bartek Tartanus) * * @version 0.1-?? (Marek Gagolewski) * use StriContainerUTF8 * * @version 0.1-?? (Marek Gagolewski) * corrected behavior on empty str/pattern * * @version 0.1-?? 
(Marek Gagolewski, 2013-06-23) * make StriException-friendly, * use StriContainerByteSearch * * @version 0.2-3 (Marek Gagolewski, 2014-05-08) * stri_count_fixed now uses byte search only * * @version 0.3-1 (Marek Gagolewski, 2014-11-05) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc * * @version 0.4-1 (Marek Gagolewski, 2014-12-07) * FR #110, #23: opts_fixed arg added * * @version 0.5-1 (Marek Gagolewski, 2015-02-14) * use StriByteSearchMatcher */ SEXP stri_count_fixed(SEXP str, SEXP pattern, SEXP opts_fixed) { uint32_t pattern_flags = StriContainerByteSearch::getByteSearchFlags(opts_fixed, /*allow_overlap*/true); PROTECT(str = stri__prepare_arg_string(str, "str")); PROTECT(pattern = stri__prepare_arg_string(pattern, "pattern")); STRI__ERROR_HANDLER_BEGIN(2) R_len_t vectorize_length = stri__recycling_rule(true, 2, LENGTH(str), LENGTH(pattern)); StriContainerUTF8 str_cont(str, vectorize_length); StriContainerByteSearch pattern_cont(pattern, vectorize_length, pattern_flags); SEXP ret; STRI__PROTECT(ret = Rf_allocVector(INTSXP, vectorize_length)); int* ret_tab = INTEGER(ret); for (R_len_t i = pattern_cont.vectorize_init(); i != pattern_cont.vectorize_end(); i = pattern_cont.vectorize_next(i)) { STRI__CONTINUE_ON_EMPTY_OR_NA_STR_PATTERN(str_cont, pattern_cont, ret_tab[i] = NA_INTEGER, ret_tab[i] = 0) StriByteSearchMatcher* matcher = pattern_cont.getMatcher(i); matcher->reset(str_cont.get(i).c_str(), str_cont.get(i).length()); R_len_t found = 0; while (USEARCH_DONE != matcher->findNext()) ++found; ret_tab[i] = found; } STRI__UNPROTECT_ALL return ret; STRI__ERROR_HANDLER_END( ;/* do nothing special on error */ ) } stringi/src/stri_container_listutf8.cpp0000644000176200001440000001136614770541312020124 0ustar liggesusers/* This file is part of the 'stringi' project. * Copyright (c) 2013-2025, Marek Gagolewski * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "stri_stringi.h" #include "stri_container_listutf8.h" /** * Default constructor * */ StriContainerListUTF8::StriContainerListUTF8() : StriContainerBase() { data = NULL; } /** * Construct the Container from an R list * @param rvec R list vector * @param nrecycle extend length of each character vector stored [vectorization] * @param shallowrecycle will stored character vectors be ever modified? 
*/ StriContainerListUTF8::StriContainerListUTF8(SEXP rvec, R_len_t _nrecycle, bool _shallowrecycle) { this->data = NULL; #ifndef NDEBUG if (!Rf_isVectorList(rvec)) throw StriException("DEBUG: !isVectorList in StriContainerListUTF8::StriContainerListUTF8(SEXP rvec)"); #endif R_len_t rvec_length = LENGTH(rvec); this->init_Base(rvec_length, rvec_length, true); if (this->n > 0) { this->data = new StriContainerUTF8*[this->n]; if (!this->data) throw StriException(MSG__MEM_ALLOC_ERROR); for (R_len_t i=0; in; ++i) this->data[i] = NULL; // in case it fails during conversion (this is "NA") for (R_len_t i=0; in; ++i) { R_len_t strlist_cur_length = LENGTH(VECTOR_ELT(rvec, i)); if (_nrecycle % strlist_cur_length != 0) { Rf_warning(MSG__WARN_RECYCLING_RULE); break; } } for (R_len_t i=0; in; ++i) { this->data[i] = new StriContainerUTF8(VECTOR_ELT(rvec, i), _nrecycle, _shallowrecycle); if (!this->data[i]) throw StriException(MSG__MEM_ALLOC_ERROR); } } } StriContainerListUTF8::StriContainerListUTF8(StriContainerListUTF8& container) : StriContainerBase((StriContainerBase&)container) { if (container.data) { this->data = new StriContainerUTF8*[this->n]; if (!this->data) throw StriException(MSG__MEM_ALLOC_ERROR); for (int i=0; idata[i] = new StriContainerUTF8(*container.data[i]); if (!this->data[i]) throw StriException(MSG__MEM_ALLOC_ERROR); } else this->data[i] = NULL; } } else { this->data = NULL; } } StriContainerListUTF8& StriContainerListUTF8::operator=(StriContainerListUTF8& container) { this->~StriContainerListUTF8(); (StriContainerBase&) (*this) = (StriContainerBase&)container; if (container.data) { this->data = new StriContainerUTF8*[this->n]; if (!this->data) throw StriException(MSG__MEM_ALLOC_ERROR); for (int i=0; idata[i] = new StriContainerUTF8(*container.data[i]); if (!this->data[i]) throw StriException(MSG__MEM_ALLOC_ERROR); } else this->data[i] = NULL; } } else { this->data = NULL; } return *this; } StriContainerListUTF8::~StriContainerListUTF8() { if (data) { for (int 
i=0; i * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef __stri_container_listint_h #define __stri_container_listint_h #include "stri_container_base.h" #include "stri_intvec.h" /** * Contains R lists of integer vectors or single integer vectors. * Useful for encoding conversion or detection. * Each string is represented by the IntVec class, * with shallow copy of byte data. 
* @version 0.2-1 (Marek Gagolewski, 2014-03-25) */ class StriContainerListInt : public StriContainerBase { private: IntVec* data; public: StriContainerListInt(); StriContainerListInt(SEXP rlist); StriContainerListInt(StriContainerListInt& container); ~StriContainerListInt(); StriContainerListInt& operator=(StriContainerListInt& container); /** check if the vectorized ith element is NULL/NA * @param i index * @return true if is NA */ inline bool isNA(R_len_t i) const { #ifndef NDEBUG if (i < 0 || i >= nrecycle) throw StriException("StriContainerListInt::isNA(): INDEX OUT OF BOUNDS"); #endif return (data[i%n].isNA()); } /** get the vectorized ith element * @param i index * @return string, read only */ const IntVec& get(R_len_t i) const { #ifndef NDEBUG if (i < 0 || i >= nrecycle) throw StriException("StriContainerListInt::get(): INDEX OUT OF BOUNDS"); if (data[i%n].isNA()) throw StriException("StriContainerListInt::get(): isNA"); #endif return data[i%n]; } }; #endif stringi/src/stri_macros.h0000644000176200001440000001466314770542065015242 0ustar liggesusers/* This file is part of the 'stringi' project. * Copyright (c) 2013-2025, Marek Gagolewski * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. 
/* Preprocessor helpers shared by the stringi C++ sources:
 * NA/empty-pattern loop guards, byte-order readers, BOM detectors,
 * CHARSXP encoding predicates and a few character-code constants. */

#ifndef __stri_macros_h
#define __stri_macros_h

// undef R's length macro (conflicts with std::string.length())
// use LENGTH instead
#undef length


/* Loop guard: if the current string or pattern is NA, or the pattern is
 * empty, execute `naset` and `continue`.
 *
 * Must be expanded inside a loop whose index variable is named `i`.
 * It is deliberately a bare `if` block (not do { } while (0)), because the
 * `continue` has to act on the caller's loop and call sites add no `;`.
 */
#define STRI__CONTINUE_ON_EMPTY_OR_NA_PATTERN(str_cont, pattern_cont, naset)                    \
   if ((str_cont).isNA(i) || (pattern_cont).isNA(i) || (pattern_cont).get(i).length() <= 0) {   \
      naset;                                                                                    \
      continue;                                                                                 \
   }


/* As above, and additionally: if the current string is empty,
 * execute `zeroset` and `continue`. Also requires a loop index named `i`.
 */
#define STRI__CONTINUE_ON_EMPTY_OR_NA_STR_PATTERN(str_cont, pattern_cont, naset, zeroset)       \
   if ((str_cont).isNA(i) || (pattern_cont).isNA(i) || (pattern_cont).get(i).length() <= 0) {   \
      naset;                                                                                    \
      continue;                                                                                 \
   }                                                                                            \
   else if ((str_cont).get(i).length() <= 0) {                                                  \
      zeroset;                                                                                  \
      continue;                                                                                 \
   }


/* Byte-order readers: assemble an unsigned integer from the bytes starting at
 * input[index]. Every byte is widened to uint32_t *before* shifting, so a
 * leading byte >= 0x80 never gets shifted into the sign bit of a promoted int.
 * Both arguments are parenthesised, so expressions may be passed safely.
 */
#define STRI__GET_INT32_BE(input, index)                                   \
   uint32_t( (uint32_t(((const uint8_t*)(input))[(index)+0]) << 24) |      \
             (uint32_t(((const uint8_t*)(input))[(index)+1]) << 16) |      \
             (uint32_t(((const uint8_t*)(input))[(index)+2]) <<  8) |      \
              uint32_t(((const uint8_t*)(input))[(index)+3]) )

#define STRI__GET_INT32_LE(input, index)                                   \
   uint32_t( (uint32_t(((const uint8_t*)(input))[(index)+3]) << 24) |      \
             (uint32_t(((const uint8_t*)(input))[(index)+2]) << 16) |      \
             (uint32_t(((const uint8_t*)(input))[(index)+1]) <<  8) |      \
              uint32_t(((const uint8_t*)(input))[(index)+0]) )

#define STRI__GET_INT16_BE(input, index)                                   \
   uint16_t( (uint32_t(((const uint8_t*)(input))[(index)+0]) << 8) |       \
              uint32_t(((const uint8_t*)(input))[(index)+1]) )

#define STRI__GET_INT16_LE(input, index)                                   \
   uint16_t( (uint32_t(((const uint8_t*)(input))[(index)+1]) << 8) |       \
              uint32_t(((const uint8_t*)(input))[(index)+0]) )


/* BOM detectors: `s` is a byte buffer, `n` the number of bytes available. */

#define STRI__ENC_HAS_BOM_UTF8(s, n)                    \
   bool((n) >= 3 &&                                     \
   (uint8_t)((s)[0]) == (uint8_t)0xEF &&                \
   (uint8_t)((s)[1]) == (uint8_t)0xBB &&                \
   (uint8_t)((s)[2]) == (uint8_t)0xBF)

/* FF FE followed by 00 00 is the UTF-32LE signature, hence the extra test */
#define STRI__ENC_HAS_BOM_UTF16LE(s, n)                 \
   bool((n) >= 2 &&                                     \
   (uint8_t)((s)[0]) == (uint8_t)0xFF &&                \
   (uint8_t)((s)[1]) == (uint8_t)0xFE &&                \
   ((n) < 4 || ((uint8_t)((s)[2]) != (uint8_t)0x00 ||   \
                (uint8_t)((s)[3]) != (uint8_t)0x00)))

#define STRI__ENC_HAS_BOM_UTF16BE(s, n)                 \
   bool((n) >= 2 &&                                     \
   (uint8_t)((s)[0]) == (uint8_t)0xFE &&                \
   (uint8_t)((s)[1]) == (uint8_t)0xFF)

/* These two inspect their own `s` argument; they no longer depend on
 * a variable called `str_cur_s` being in scope at the call site. */
#define STRI__ENC_HAS_BOM_UTF32BE(s, n)                 \
   bool((n) >= 4 &&                                     \
   (STRI__GET_INT32_BE((s), 0) == 0x0000FEFFUL))

#define STRI__ENC_HAS_BOM_UTF32LE(s, n)                 \
   bool((n) >= 4 &&                                     \
   (STRI__GET_INT32_LE((s), 0) == 0x0000FEFFUL))


/* ************************************************************************
 * based on R's Defn.h
 * CHARSXP charset bits
 */

#ifndef BYTES_MASK
#define BYTES_MASK (1<<1)
#endif

#ifndef LATIN1_MASK
#define LATIN1_MASK (1<<2)
#endif

#ifndef UTF8_MASK
#define UTF8_MASK (1<<3)
#endif

#ifndef ASCII_MASK
#define ASCII_MASK (1<<6)
#endif

#ifndef IS_BYTES
#define IS_BYTES(x) (Rf_getCharCE(x) == CE_BYTES)
// #define IS_BYTES(x) (LEVELS(x) & BYTES_MASK)
#endif

#ifndef IS_LATIN1
#define IS_LATIN1(x) (Rf_getCharCE(x) == CE_LATIN1)
// #define IS_LATIN1(x) (LEVELS(x) & LATIN1_MASK)
#endif

#ifndef IS_ASCII
// #define IS_ASCII(x) (Rf_getCharCE(x) == CE_ANY) /* the function doesn't return this value... */
// The version test is nested so that the comparison is only evaluated when
// both R_VERSION and R_Version are defined; otherwise an unexpanded
// `R_Version(4, 5, 0)` would be a preprocessor syntax error.
#if defined(R_VERSION) && defined(R_Version)
#if R_VERSION >= R_Version(4, 5, 0)
#define IS_ASCII(x) Rf_charIsASCII(x)
#endif
#endif
#ifndef IS_ASCII
#define IS_ASCII(x) (LEVELS(x) & ASCII_MASK)
#endif
#endif

#ifndef IS_UTF8
#define IS_UTF8(x) (Rf_getCharCE(x) == CE_UTF8)
// #define IS_UTF8(x) (LEVELS(x) & UTF8_MASK)
#endif

//#ifndef ENC_KNOWN
//#define ENC_KNOWN(x) (LEVELS(x) & (LATIN1_MASK | UTF8_MASK | ASCII_MASK))
//#endif

#ifndef isRaw
#define isRaw(x) (TYPEOF(x) == RAWSXP)
#endif


/* ************************************************************************ */

/* Unicode replacement character */
#define UCHAR_REPLACEMENT 0xFFFD
#define ASCII_SUBSTITUTE  0x1A
#define ASCII_MAXCHARCODE 127
#define UCHAR_REPLACEMENT_UTF8_BYTE1 0xef
#define UCHAR_REPLACEMENT_UTF8_BYTE2 0xbf
#define UCHAR_REPLACEMENT_UTF8_BYTE3 0xbd
#define UTF8_BOM_BYTE1 ((uint8_t)0xef)
#define UTF8_BOM_BYTE2 ((uint8_t)0xbb)
#define UTF8_BOM_BYTE3 ((uint8_t)0xbf)
#define ASCII_CR 0x0D
#define ASCII_LF 0x0A
#define ASCII_FF 0x0C
#define ASCII_VT 0x0B
#define UCHAR_NEL 0x0085
#define UCHAR_LS 0x2028
#define UCHAR_PS 0x2029

#define POW_2_31_M_1 2147483647

#endif
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "stri_stringi.h" #include "stri_container_base.h" #include "stri_container_utf8.h" #include "stri_container_integer.h" #include "stri_container_listutf8.h" #include "stri_string8buf.h" #include using namespace std; /** * Prepare list argument -- ignore empty vectors if needed, used by stri_paste * * @param x a list of strings * @param ignore_null FALSE to do nothing * @return a list vector * * @version 0.4-1 (Marek Gagolewski, 2014-11-27) */ SEXP stri__prepare_arg_list_ignore_null(SEXP x, bool ignore_null) { if (!ignore_null) return x; PROTECT(x); #ifndef NDEBUG if (!Rf_isVectorList(x)) Rf_error("stri__prepare_arg_list_ignore_null:: !NDEBUG: not a list"); // error() allowed here #endif R_len_t narg = LENGTH(x); if (narg <= 0) { UNPROTECT(1); return x; } // else if (narg == 1 && LENGTH(VECTOR_ELT(x, 0)) == 0) { // UNPROTECT(1); // return Rf_allocVector(VECSXP, 0); // } SEXP ret; // if (ignore_null != NA_INTEGER && ignore_null < 0) { // remove NULL elements R_len_t nret = 0; for (R_len_t i=0; i 0) ++nret; } PROTECT(ret = Rf_allocVector(VECSXP, nret)); for (R_len_t i=0, j=0; i 0) SET_VECTOR_ELT(ret, j++, VECTOR_ELT(x, i)); } // } // else { // insert one empty string // PROTECT(ret = 
Rf_allocVector(VECSXP, narg)); // for (R_len_t i=0; i 0) // SET_VECTOR_ELT(ret, i, VECTOR_ELT(x, i)); // else if (ignore_null != NA_INTEGER) // SET_VECTOR_ELT(ret, i, stri__vector_empty_strings(1)); //// else //// SET_VECTOR_ELT(ret, i, stri__vector_NA_strings(1)); // } // } UNPROTECT(2); return ret; } /** Duplicate given strings * * * @param str character vector * @param times integer vector * @return character vector * * The function is vectorized over str and times * if str is NA or times is NA the result will be NA * if times < 0, the result will be NA * if times==0, the result will be an empty string * if str or times is an empty vector, then the result is an empty vector * * @version 0.1-?? (Marek Gagolewski) * * @version 0.1-?? (Marek Gagolewski) * use StriContainerUTF8's vectorization * * @version 0.1-?? (Marek Gagolewski, 2013-06-15) * use StriContainerInteger * * @version 0.1-?? (Marek Gagolewski, 2013-06-16) * make StriException friendly * * @version 0.3-1 (Marek Gagolewski, 2014-11-04) * #112: str_prepare_arg* retvals were not PROTECTed from gc * * @version 1.7.6.9001 (Marek Gagolewski, 2022-03-15) * #473: use size_t */ SEXP stri_dup(SEXP str, SEXP times) { PROTECT(str = stri__prepare_arg_string(str, "str")); // prepare string argument PROTECT(times = stri__prepare_arg_integer(times, "times")); // prepare string argument R_len_t vectorize_length = stri__recycling_rule(true, 2, LENGTH(str), LENGTH(times)); if (vectorize_length <= 0) { UNPROTECT(2); return Rf_allocVector(STRSXP, 0); } STRI__ERROR_HANDLER_BEGIN(2) StriContainerUTF8 str_cont(str, vectorize_length); StriContainerInteger times_cont(times, vectorize_length); // STEP 1. // Calculate the required buffer length size_t bufsize = 0; for (R_len_t i=0; i bufsize) bufsize = cursize; } if (bufsize > POW_2_31_M_1) throw StriException(MSG__CHARSXP_2147483647); // STEP 2. 
// Alloc buffer & result vector String8buf buf(bufsize); SEXP ret; STRI__PROTECT(ret = Rf_allocVector(STRSXP, vectorize_length)); // STEP 3. // Duplicate const String8* str_last = NULL; // this will allow for reusing buffer... size_t str_last_index = 0; // ...useful for stri_dup('a', 1:1000) or stri_dup('a', 1000:1) for (R_len_t i = str_cont.vectorize_init(); // this iterator allows for... i != str_cont.vectorize_end(); // ...smart buffer reusage i = str_cont.vectorize_next(i)) { R_len_t times_cur; if (str_cont.isNA(i) || times_cont.isNA(i) || (times_cur = times_cont.get(i)) < 0) { SET_STRING_ELT(ret, i, NA_STRING); continue; } const String8* str_cur = &(str_cont.get(i)); R_len_t str_cur_n = str_cur->length(); if (times_cur <= 0 || str_cur_n <= 0) { SET_STRING_ELT(ret, i, Rf_mkCharLen("", 0)); continue; } // all right, here the result will neither be NA nor an empty string if (str_cur != str_last) { // well, no reuse possible - resetting str_last = str_cur; str_last_index = 0; } // we paste only "additional" duplicates size_t max_index = str_cur_n*times_cur; for (; str_last_index < max_index; str_last_index += str_cur_n) { if (buf.size() < str_last_index+str_cur_n) { throw StriException(MSG__INTERNAL_ERROR); } memcpy(buf.data()+str_last_index, str_cur->c_str(), (size_t)str_cur_n); } // the result is always in UTF-8 SET_STRING_ELT(ret, i, Rf_mkCharLenCE(buf.data(), max_index, CE_UTF8)); } // STEP 4. // Clean up & finish STRI__UNPROTECT_ALL return ret; STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */) } /** Join two character vectors, element by element, no separator, no collapse * * Vectorized over e1 and e2. Optimized for |e1| >= |e2| * (but no harm otherwise) * * This is used by %s+% operator in stringi R code. * * @param e1 character vector * @param e2 character vector * @return character vector, res_i=s1_i + s2_i for |e1|==|e2| * if e1 or e2 is NA then result is NA * if e1 or e2 is empty, then the result is just e1 or e2 * * * @version 0.1-?? 
(Marek Gagolewski) * * @version 0.1-?? (Marek Gagolewski) * use StriContainerUTF8's vectorization * * @version 0.1-?? (Marek Gagolewski, 2013-06-16) * make StriException friendly * * @version 0.3-1 (Marek Gagolewski, 2014-11-04) * #112: str_prepare_arg* retvals were not PROTECTed from gc * */ SEXP stri_join2(SEXP e1, SEXP e2) // a.k.a. stri_join2_nocollapse { PROTECT(e1 = stri__prepare_arg_string(e1, "e1")); // prepare string argument PROTECT(e2 = stri__prepare_arg_string(e2, "e2")); // prepare string argument R_len_t e1_length = LENGTH(e1); R_len_t e2_length = LENGTH(e2); R_len_t vectorize_length = stri__recycling_rule(true, 2, e1_length, e2_length); if (e1_length <= 0) { UNPROTECT(2); return e1; } if (e2_length <= 0) { UNPROTECT(2); return e2; } STRI__ERROR_HANDLER_BEGIN(2) StriContainerUTF8 e1_cont(e1, vectorize_length); StriContainerUTF8 e2_cont(e2, vectorize_length); // 1. find maximal length of the buffer needed size_t nchar = 0; for (int i=0; i nchar) nchar = c1+c2; } // 2. Create buf & retval if (nchar > POW_2_31_M_1) throw StriException(MSG__CHARSXP_2147483647); String8buf buf(nchar); SEXP ret; STRI__PROTECT(ret = Rf_allocVector(STRSXP, vectorize_length)); // output vector // 3. Set retval const String8* last_string_1 = NULL; R_len_t last_buf_idx = 0; for (R_len_t i = e1_cont.vectorize_init(); // this iterator allows for... 
i != e1_cont.vectorize_end(); // ...smart buffer reusage i = e1_cont.vectorize_next(i)) { if (e1_cont.isNA(i) || e2_cont.isNA(i)) { SET_STRING_ELT(ret, i, NA_STRING); continue; } // If e1 has length < length of e2, this will be faster: const String8* cur_string_1 = &(e1_cont.get(i)); if (cur_string_1 != last_string_1) { last_string_1 = cur_string_1; last_buf_idx = cur_string_1->length(); memcpy(buf.data(), cur_string_1->c_str(), (size_t)last_buf_idx); } // else reuse string #1 const String8* cur_string_2 = &(e2_cont.get(i)); R_len_t cur_len_2 = cur_string_2->length(); memcpy(buf.data()+last_buf_idx, cur_string_2->c_str(), (size_t)cur_len_2); // the result is always in UTF-8 SET_STRING_ELT(ret, i, Rf_mkCharLenCE(buf.data(), last_buf_idx+cur_len_2, CE_UTF8)); } // 4. Cleanup & finish STRI__UNPROTECT_ALL return ret; STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */) } /** Join and flatten two character vectors, no separator between elements but possibly with collapse * * Vectorized over e1 and e2. * * @param e1 character vector * @param e2 character vector * @param collapse single string or NULL * @return character vector * * * @version 0.2-1 (Marek Gagolewski, 2014-03-18) * first version; * This is much faster than stri_flatten(stri_join2(...), ...) * * @version 0.3-1 (Marek Gagolewski, 2014-11-04) * #112: str_prepare_arg* retvals were not PROTECTed from gc * * @version 0.4-1 (Marek Gagolewski, 2014-11-26) * #114: inconsistent behaviour w.r.t. paste() */ SEXP stri_join2_withcollapse(SEXP e1, SEXP e2, SEXP collapse) { if (Rf_isNull(collapse)) { // no collapse - used, e.g., by the %s+% operator return stri_join2(e1, e2); // a.k.a. 
stri_join2_nocollapse } PROTECT(e1 = stri__prepare_arg_string(e1, "e1")); // prepare string argument PROTECT(e2 = stri__prepare_arg_string(e2, "e2")); // prepare string argument PROTECT(collapse = stri__prepare_arg_string_1(collapse, "collapse")); if (STRING_ELT(collapse, 0) == NA_STRING) { UNPROTECT(3); return stri__vector_NA_strings(1); } R_len_t e1_length = LENGTH(e1); R_len_t e2_length = LENGTH(e2); R_len_t vectorize_length = stri__recycling_rule(true, 2, e1_length, e2_length); if (e1_length <= 0 || e2_length <= 0) { UNPROTECT(3); return stri__vector_empty_strings(1); } STRI__ERROR_HANDLER_BEGIN(3) StriContainerUTF8 e1_cont(e1, vectorize_length); StriContainerUTF8 e2_cont(e2, vectorize_length); StriContainerUTF8 collapse_cont(collapse, 1); R_len_t collapse_nbytes = collapse_cont.get(0).length(); const char* collapse_s = collapse_cont.get(0).c_str(); // find maximal length of the buffer needed: size_t nchar = 0; for (int i=0; i return NA } nchar += e1_cont.get(i).length() + e2_cont.get(i).length() + ((i>0)?collapse_nbytes:0); } if (nchar > POW_2_31_M_1) throw StriException(MSG__CHARSXP_2147483647); String8buf buf(nchar); R_len_t last_buf_idx = 0; for (R_len_t i = 0; i < vectorize_length; ++i) // don't change this order, see #114 { // no need to detect NAs - they already have been excluded if (collapse_nbytes > 0 && i > 0) { // copy collapse (separator) memcpy(buf.data()+last_buf_idx, collapse_s, (size_t)collapse_nbytes); last_buf_idx += collapse_nbytes; } const String8* cur_string_1 = &(e1_cont.get(i)); R_len_t cur_len_1 = cur_string_1->length(); memcpy(buf.data()+last_buf_idx, cur_string_1->c_str(), (size_t)cur_len_1); last_buf_idx += cur_len_1; const String8* cur_string_2 = &(e2_cont.get(i)); R_len_t cur_len_2 = cur_string_2->length(); memcpy(buf.data()+last_buf_idx, cur_string_2->c_str(), (size_t)cur_len_2); last_buf_idx += cur_len_2; } SEXP ret; STRI__PROTECT(ret = Rf_allocVector(STRSXP, 1)); // output vector SET_STRING_ELT(ret, 0, Rf_mkCharLenCE(buf.data(), 
last_buf_idx, CE_UTF8)); STRI__UNPROTECT_ALL return ret; STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */) } /** * Concatenate Character Vectors, with no collapse * * @param strlist list of character vectors * @param sep single string * @param ignore_null single integer * @return character vector * * * @version 0.1-?? (Marek Gagolewski) * * @version 0.1-?? (Marek Gagolewski) * use StriContainerUTF8's vectorization * * @version 0.1-?? (Marek Gagolewski, 2013-06-16) * make StriException-friendly, useStriContainerListUTF8 * * @version 0.1-12 (Marek Gagolewski, 2013-12-04) * fixed bug #49 * * @version 0.2-1 (Marek Gagolewski, 2014-03-18) * stri_join has been split to stri_join_nocollapse * and stri_join_withcollapse (for efficiency reasons) * * @version 0.3-1 (Marek Gagolewski, 2014-11-04) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc * * @version 0.4-1 (Marek Gagolewski, 2014-11-27) * FR #116: ignore_null arg added */ SEXP stri_join_nocollapse(SEXP strlist, SEXP sep, SEXP ignore_null) { bool ignore_null1 = stri__prepare_arg_logical_1_notNA(ignore_null, "ignore_null"); PROTECT(strlist = stri__prepare_arg_list_ignore_null( stri__prepare_arg_list_string(strlist, "..."), ignore_null1 )); R_len_t strlist_length = LENGTH(strlist); if (strlist_length <= 0) { UNPROTECT(1); return stri__vector_empty_strings(0); } // get length of the longest character vector on the list, i.e., vectorize_length R_len_t vectorize_length = 0; for (R_len_t i=0; i vectorize_length) vectorize_length = strlist_cur_length; } PROTECT(sep = stri__prepare_arg_string_1(sep, "sep")); if (STRING_ELT(sep, 0) == NA_STRING) { UNPROTECT(2); return stri__vector_NA_strings(vectorize_length); } // * special case * if (LENGTH(STRING_ELT(sep, 0)) == 0 && strlist_length == 2) { // sep==empty string and 2 vectors -- // an often occurring case - we have some specialized functions for this :-) SEXP ret; PROTECT(ret = stri_join2(VECTOR_ELT(strlist, 0), VECTOR_ELT(strlist, 1))); // 
a.k.a. stri_join2_nocollapse UNPROTECT(3); return ret; } // note that if 1 vector is given // we cannot return VECTOR_ELT(strlist, 0) directly // -- it needs to be converted to UTF8 // so we proceed SEXP ret; STRI__ERROR_HANDLER_BEGIN(2) StriContainerUTF8 sep_cont(sep, 1); const char* sep_char = sep_cont.get(0).c_str(); R_len_t sep_len = sep_cont.get(0).length(); StriContainerListUTF8 strlist_cont(strlist, vectorize_length); // 4. Get buf size and determine where NAs will occur size_t buf_maxbytes = 0; vector whichNA(vectorize_length, false); // where are NAs in out? for (R_len_t i=0; i0)?sep_len:0); } } if (!whichNA[i] && curchar > buf_maxbytes) buf_maxbytes = curchar; } // 5. Create ret val if (buf_maxbytes > POW_2_31_M_1) throw StriException(MSG__CHARSXP_2147483647); String8buf buf(buf_maxbytes); STRI__PROTECT(ret = Rf_allocVector(STRSXP, vectorize_length)); for (R_len_t i=0; i= 0 && j > 0) { memcpy(buf.data()+cursize, sep_char, (size_t)sep_len); cursize += sep_len; } const String8* curstring = &(strlist_cont.get(j).get(i)); size_t curstring_n = curstring->length(); memcpy(buf.data()+cursize, curstring->c_str(), (size_t)curstring_n); cursize += curstring_n; } SET_STRING_ELT(ret, i, Rf_mkCharLenCE(buf.data(), cursize, CE_UTF8)); } // nothing more to do: STRI__UNPROTECT_ALL return ret; STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */) } /** * Concatenate Character Vectors, possibly with collapse * * @param strlist list of character vectors * @param sep single string * @param collapse single string or NULL * @param ignore_null single integer * @return character vector * * * @version 0.2-1 (Marek Gagolewski, 2014-03-18) * a specialized version of the original stri_join, which * called stri_flatten at the end, if it was requested; * now collapsing is done directly (for time and memory efficiency); * Now calling specialized functions * stri_join2_withcollapse and stri_flatten_withressep, if needed. 
* If collapse!=NULL and sep=NA, then the result will be single NA * (and not n*NA); * * @version 0.3-1 (Marek Gagolewski, 2014-11-04) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc * * @version 0.4-1 (Marek Gagolewski, 2014-11-27) * FR #116: ignore_null arg added */ SEXP stri_join(SEXP strlist, SEXP sep, SEXP collapse, SEXP ignore_null) { // no collapse-case is handled separately: if (Rf_isNull(collapse)) return stri_join_nocollapse(strlist, sep, ignore_null); // *result will surely be a single string* bool ignore_null1 = stri__prepare_arg_logical_1_notNA(ignore_null, "ignore_null"); PROTECT(strlist = stri__prepare_arg_list_ignore_null( stri__prepare_arg_list_string(strlist, "..."), ignore_null1 )); R_len_t strlist_length = LENGTH(strlist); if (strlist_length <= 0) { UNPROTECT(1); return stri__vector_empty_strings(1); } else if (strlist_length == 1) { // one vector + collapse string -- another frequently occurring case // sep is ignored here SEXP ret; PROTECT(ret = stri_flatten(VECTOR_ELT(strlist, 0), collapse)); // a.k.a. 
stri_flatten_withressep UNPROTECT(2); return ret; } PROTECT(sep = stri__prepare_arg_string_1(sep, "sep")); PROTECT(collapse = stri__prepare_arg_string_1(collapse, "collapse")); if (STRING_ELT(sep, 0) == NA_STRING || STRING_ELT(collapse, 0) == NA_STRING) { UNPROTECT(3); return stri__vector_NA_strings(1); } else if (LENGTH(STRING_ELT(sep, 0)) == 0 && strlist_length == 2) { // sep==empty string and 2 vectors -- // an often occurring case - we have some specialized functions for this :-) SEXP ret; PROTECT(ret = stri_join2_withcollapse(VECTOR_ELT(strlist, 0), VECTOR_ELT(strlist, 1), collapse)); UNPROTECT(4); return ret; } // get length of the longest character vector on the list, i.e., vectorize_length R_len_t vectorize_length = 0; for (R_len_t i=0; i vectorize_length) vectorize_length = strlist_cur_length; } STRI__ERROR_HANDLER_BEGIN(3) StriContainerListUTF8 strlist_cont(strlist, vectorize_length); StriContainerUTF8 sep_cont(sep, 1); // definitely not NA const char* sep_s = sep_cont.get(0).c_str(); R_len_t sep_n = sep_cont.get(0).length(); StriContainerUTF8 collapse_cont(collapse, 1); // definitely not NA const char* collapse_s = collapse_cont.get(0).c_str(); R_len_t collapse_n = collapse_cont.get(0).length(); // Get required buffer size size_t buf_maxbytes = 0; for (R_len_t i=0; i0)?sep_n:0); } if (i>0) buf_maxbytes += collapse_n; } // 5. 
Create ret val if (buf_maxbytes > POW_2_31_M_1) throw StriException(MSG__CHARSXP_2147483647); String8buf buf(buf_maxbytes); size_t last_buf_idx = 0; for (R_len_t i=0; i 0 && i > 0) { memcpy(buf.data()+last_buf_idx, collapse_s, (size_t)collapse_n); last_buf_idx += collapse_n; } for (R_len_t j=0; j 0 && j > 0) { memcpy(buf.data()+last_buf_idx, sep_s, (size_t)sep_n); last_buf_idx += sep_n; } const String8* curstring = &(strlist_cont.get(j).get(i)); size_t curstring_n = curstring->length(); memcpy(buf.data()+last_buf_idx, curstring->c_str(), (size_t)curstring_n); last_buf_idx += curstring_n; } } #ifndef NDEBUG if (buf_maxbytes != last_buf_idx) throw StriException("stri_join_withcollapse: buffer overrun"); #endif // we are done SEXP ret; STRI__PROTECT(ret = Rf_allocVector(STRSXP, 1)); SET_STRING_ELT(ret, 0, Rf_mkCharLenCE(buf.data(), last_buf_idx, CE_UTF8)); STRI__UNPROTECT_ALL return ret; STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */) } /** String vector flatten, with no separator (i.e., empty) between each string * * if any of s is NA, the result will be NA_character_ * * @param s character vector * @return if s is not empty, then a character vector of length 1 * * @version 0.1-?? (Marek Gagolewski) * * @version 0.1-?? (Marek Gagolewski) * StriContainerUTF8 - any R Encoding * * @version 0.1-?? 
(Marek Gagolewski, 2013-06-16) * make StriException friendly * * @version 0.2-1 (Marek Gagolewski, 2014-03-18) * This function hasn't been used at all before (strange, isn't it?); * From now on it's being called by stri_flatten_withressep * (a small performance gain) * * @version 0.3-1 (Marek Gagolewski, 2014-11-04) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc * * @version 1.2.1 (Marek Gagolewski, 2018-04-20) * na_empty arg added * * @version 1.6.2 (Marek Gagolewski, 2021-05-10) * #428 na_empty=NA support */ SEXP stri_flatten_noressep(SEXP str, int na_empty) { PROTECT(str = stri__prepare_arg_string(str, "str")); R_len_t str_length = LENGTH(str); if (str_length <= 0) { UNPROTECT(1); return stri__vector_empty_strings(1); } STRI__ERROR_HANDLER_BEGIN(1) StriContainerUTF8 str_cont(str, str_length); // 1. Get required buffer size size_t nchar = 0; for (int i=0; i return NA } } else { nchar += str_cont.get(i).length(); } } // 2. Fill the buf! if (nchar > POW_2_31_M_1) throw StriException(MSG__CHARSXP_2147483647); String8buf buf(nchar); size_t cur = 0; for (int i=0; i0 && !omit_empty_1)?collapse_nbytes:0); } else { STRI__UNPROTECT_ALL return stri__vector_NA_strings(1); // at least 1 NA => return NA } } else { nbytes += str_cont.get(i).length() + ((i>0)?collapse_nbytes:0); } } // 2. Fill the buf! if (nbytes > POW_2_31_M_1) throw StriException(MSG__CHARSXP_2147483647); String8buf buf(nbytes); size_t cur = 0; bool already_started = false; for (int i=0; i 0) { memcpy(buf.data()+cur, collapse_s, (size_t)collapse_nbytes); cur += collapse_nbytes; } } else already_started = true; if (!str_cont.isNA(i)) { size_t ncur = str_cont.get(i).length(); memcpy(buf.data()+cur, str_cont.get(i).c_str(), (size_t)ncur); cur += ncur; } } // 3. 
Get ret val & return SEXP ret; STRI__PROTECT(ret = Rf_allocVector(STRSXP, 1)); SET_STRING_ELT(ret, 0, Rf_mkCharLenCE(buf.data(), cur, CE_UTF8)); STRI__UNPROTECT_ALL return ret; STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */) } /** * Concatenate strings in a list * * @param x list of character vectors * @param sep single string * @param collapse single string or NULL * @return character vector * * @version 1.0-3 (Marek Gagolewski, 2016-02-07) * FR#175 */ SEXP stri_join_list(SEXP x, SEXP sep, SEXP collapse) { PROTECT(x = stri__prepare_arg_list_ignore_null( stri__prepare_arg_list_string(x, "x"), true )); R_len_t strlist_length = LENGTH(x); if (strlist_length <= 0) { UNPROTECT(1); return stri__vector_empty_strings(0); } PROTECT(sep = stri__prepare_arg_string_1(sep, "sep")); if (Rf_isNull(collapse)) PROTECT(collapse); else PROTECT(collapse = stri__prepare_arg_string_1(collapse, "collapse")); STRI__ERROR_HANDLER_BEGIN(3) SEXP ret; STRI__PROTECT(ret = Rf_allocVector(STRSXP, strlist_length)); for (R_len_t j=0; j * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef __stri_container_listraw_h #define __stri_container_listraw_h #include "stri_string8.h" #include "stri_container_base.h" /** * Contains R lists of raw vectors, single raw vectors, * or character string vectors treated as "byte"-encoded. * Useful for encoding conversion or detection. * Each string is represented by the String8 class, * with shallow copy of byte data. * * @version 0.1-?? 
(Marek Gagolewski, 2013-08-08) * * @version 0.2-1 (Marek Gagolewski, 2014-03-25) * data as String8* and not String8** (performance gain) */ class StriContainerListRaw : public StriContainerBase { private: String8* data; public: StriContainerListRaw(); StriContainerListRaw(SEXP rlist); StriContainerListRaw(StriContainerListRaw& container); ~StriContainerListRaw(); StriContainerListRaw& operator=(StriContainerListRaw& container); /** check if the vectorized ith element is NA * @param i index * @return true if is NA */ inline bool isNA(R_len_t i) const { #ifndef NDEBUG if (i < 0 || i >= nrecycle) throw StriException("StriContainerListRaw::isNA(): INDEX OUT OF BOUNDS"); #endif return (data[i%n].isNA()); } /** get the vectorized ith element * @param i index * @return string, read only */ const String8& get(R_len_t i) const { #ifndef NDEBUG if (i < 0 || i >= nrecycle) throw StriException("StriContainerListRaw::get(): INDEX OUT OF BOUNDS"); if (data[i%n].isNA()) throw StriException("StriContainerListRaw::get(): isNA"); #endif return data[i%n]; } }; #endif stringi/src/stri_search_regex_detect.cpp0000644000176200001440000001234514770541312020265 0ustar liggesusers/* This file is part of the 'stringi' project. * Copyright (c) 2013-2025, Marek Gagolewski * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "stri_stringi.h" #include "stri_container_utf16.h" #include "stri_container_utf8.h" #include "stri_container_regex.h" /** * Detect if a pattern occurs in a string * * @param str R character vector * @param pattern R character vector containing regular expressions * @param negate single bool * @param max_count single int * @param opts_regex list * * @version 0.1-?? (Marcin Bujarski) * * @version 0.1-?? (Marek Gagolewski) * use StriContainerUTF16 * * @version 0.1-?? (Marek Gagolewski) * use StriContainerUTF16's vectorization * * @version 0.1-?? 
(Marek Gagolewski, 2013-06-18) * use StriContainerRegexPattern + opts_regex * * @version 0.3-1 (Marek Gagolewski, 2014-11-05) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc * * @version 1.0-2 (Marek Gagolewski, 2016-01-29) * Issue #214: allow a regex pattern like `.*` to match an empty string * * @version 1.0-3 (Marek Gagolewski, 2016-02-03) * FR #216: `negate` arg added * * @version 1.3.1 (Marek Gagolewski, 2019-02-08) * #232: `max_count` arg added * * @version 1.4.7 (Marek Gagolewski, 2020-08-24) * Use StriContainerRegexPattern::getRegexOptions */ SEXP stri_detect_regex(SEXP str, SEXP pattern, SEXP negate, SEXP max_count, SEXP opts_regex) { bool negate_1 = stri__prepare_arg_logical_1_notNA(negate, "negate"); int max_count_1 = stri__prepare_arg_integer_1_notNA(max_count, "max_count"); PROTECT(str = stri__prepare_arg_string(str, "str")); PROTECT(pattern = stri__prepare_arg_string(pattern, "pattern")); R_len_t vectorize_length = stri__recycling_rule(true, 2, LENGTH(str), LENGTH(pattern)); StriRegexMatcherOptions pattern_opts = StriContainerRegexPattern::getRegexOptions(opts_regex); STRI__ERROR_HANDLER_BEGIN(2) StriContainerUTF16 str_cont(str, vectorize_length); // StriContainerUTF8 str_cont(str, vectorize_length); // utext_openUTF8, see below StriContainerRegexPattern pattern_cont(pattern, vectorize_length, pattern_opts); SEXP ret; STRI__PROTECT(ret = Rf_allocVector(LGLSXP, vectorize_length)); int* ret_tab = LOGICAL(ret); for (R_len_t i = pattern_cont.vectorize_init(); i != pattern_cont.vectorize_end(); i = pattern_cont.vectorize_next(i)) { if (max_count_1 == 0) { ret_tab[i] = NA_LOGICAL; continue; } STRI__CONTINUE_ON_EMPTY_OR_NA_PATTERN(str_cont, pattern_cont, ret_tab[i] = NA_LOGICAL) RegexMatcher *matcher = pattern_cont.getMatcher(i); // will be deleted automatically matcher->reset(str_cont.get(i)); UErrorCode status = U_ZERO_ERROR; ret_tab[i] = (int)matcher->find(status); // returns UBool STRI__CHECKICUSTATUS_THROW(status, {/* do nothing 
special on err */}) if (negate_1) ret_tab[i] = !ret_tab[i]; if (max_count_1 > 0 && ret_tab[i]) --max_count_1; // // mbmark-regex-detect1.R: UTF16 0.07171792 s; UText 0.10531605 s // UText* str_text = NULL; // UErrorCode status = U_ZERO_ERROR; // RegexMatcher *matcher = pattern_cont.getMatcher(i); // will be deleted automatically // str_text = utext_openUTF8(str_text, str_cont.get(i).c_str(), str_cont.get(i).length(), &status); // STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) // matcher->reset(str_text); // ret_tab[i] = (int)matcher->find(status); // returns UBool // utext_close(str_text); } STRI__UNPROTECT_ALL return ret; STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */) } stringi/src/stri_container_base.cpp0000644000176200001440000000544614770542660017265 0ustar liggesusers/* This file is part of the 'stringi' project. * Copyright (c) 2013-2025, Marek Gagolewski * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "stri_stringi.h" #include "stri_container_base.h" /** * Default constructor * */ StriContainerBase::StriContainerBase() { this->n = 0; this->nrecycle = 0; this->sexp = (SEXP)NULL; #ifndef NDEBUG this->isShallow = true; #endif } /** * Initialize object data * */ void StriContainerBase::init_Base(R_len_t _n, R_len_t _nrecycle, bool _shallowrecycle, SEXP _sexp) { #ifndef NDEBUG if (this->n != 0 || this->nrecycle != 0 || this->sexp != (SEXP)NULL) throw StriException("StriContainerBase::init_Base(...): already initialized"); this->isShallow = _shallowrecycle; #endif STRI_ASSERT(_n >= 0); STRI_ASSERT(_nrecycle >= 0); if (_n <= 0 || _nrecycle <= 0) { this->nrecycle = 0; this->n = 0; this->sexp = _sexp; } else { this->nrecycle = _nrecycle; this->n = (_shallowrecycle)?_n:_nrecycle; this->sexp = _sexp; #ifndef NDEBUG if (this->n < _n) throw StriException("StriContainerBase::init_Base(...): this->n < _n"); if (this->n > this->nrecycle) throw StriException("StriContainerBase::init_Base(...): this->n > this->nrecycle"); #endif } } stringi/src/icu74_i18n_cpp.txt0000644000176200001440000001565514700200761015726 0ustar liggesusersicu74/i18n/alphaindex.cpp \ icu74/i18n/anytrans.cpp \ icu74/i18n/astro.cpp \ icu74/i18n/basictz.cpp \ icu74/i18n/bocsu.cpp \ icu74/i18n/brktrans.cpp \ icu74/i18n/buddhcal.cpp \ icu74/i18n/calendar.cpp \ icu74/i18n/casetrn.cpp \ icu74/i18n/cecal.cpp \ icu74/i18n/chnsecal.cpp \ icu74/i18n/choicfmt.cpp \ icu74/i18n/coleitr.cpp 
\ icu74/i18n/coll.cpp \ icu74/i18n/collation.cpp \ icu74/i18n/collationbuilder.cpp \ icu74/i18n/collationcompare.cpp \ icu74/i18n/collationdata.cpp \ icu74/i18n/collationdatabuilder.cpp \ icu74/i18n/collationdatareader.cpp \ icu74/i18n/collationdatawriter.cpp \ icu74/i18n/collationfastlatin.cpp \ icu74/i18n/collationfastlatinbuilder.cpp \ icu74/i18n/collationfcd.cpp \ icu74/i18n/collationiterator.cpp \ icu74/i18n/collationkeys.cpp \ icu74/i18n/collationroot.cpp \ icu74/i18n/collationrootelements.cpp \ icu74/i18n/collationruleparser.cpp \ icu74/i18n/collationsets.cpp \ icu74/i18n/collationsettings.cpp \ icu74/i18n/collationtailoring.cpp \ icu74/i18n/collationweights.cpp \ icu74/i18n/compactdecimalformat.cpp \ icu74/i18n/coptccal.cpp \ icu74/i18n/cpdtrans.cpp \ icu74/i18n/csdetect.cpp \ icu74/i18n/csmatch.cpp \ icu74/i18n/csr2022.cpp \ icu74/i18n/csrecog.cpp \ icu74/i18n/csrmbcs.cpp \ icu74/i18n/csrsbcs.cpp \ icu74/i18n/csrucode.cpp \ icu74/i18n/csrutf8.cpp \ icu74/i18n/curramt.cpp \ icu74/i18n/currfmt.cpp \ icu74/i18n/currpinf.cpp \ icu74/i18n/currunit.cpp \ icu74/i18n/dangical.cpp \ icu74/i18n/datefmt.cpp \ icu74/i18n/dayperiodrules.cpp \ icu74/i18n/dcfmtsym.cpp \ icu74/i18n/decContext.cpp \ icu74/i18n/decimfmt.cpp \ icu74/i18n/decNumber.cpp \ icu74/i18n/displayoptions.cpp \ icu74/i18n/double-conversion-bignum-dtoa.cpp \ icu74/i18n/double-conversion-bignum.cpp \ icu74/i18n/double-conversion-cached-powers.cpp \ icu74/i18n/double-conversion-double-to-string.cpp \ icu74/i18n/double-conversion-fast-dtoa.cpp \ icu74/i18n/double-conversion-string-to-double.cpp \ icu74/i18n/double-conversion-strtod.cpp \ icu74/i18n/dtfmtsym.cpp \ icu74/i18n/dtitvfmt.cpp \ icu74/i18n/dtitvinf.cpp \ icu74/i18n/dtptngen.cpp \ icu74/i18n/dtrule.cpp \ icu74/i18n/erarules.cpp \ icu74/i18n/esctrn.cpp \ icu74/i18n/ethpccal.cpp \ icu74/i18n/fmtable_cnv.cpp \ icu74/i18n/fmtable.cpp \ icu74/i18n/format.cpp \ icu74/i18n/formatted_string_builder.cpp \ icu74/i18n/formattedval_iterimpl.cpp \ 
icu74/i18n/formattedval_sbimpl.cpp \ icu74/i18n/formattedvalue.cpp \ icu74/i18n/fphdlimp.cpp \ icu74/i18n/fpositer.cpp \ icu74/i18n/funcrepl.cpp \ icu74/i18n/gender.cpp \ icu74/i18n/gregocal.cpp \ icu74/i18n/gregoimp.cpp \ icu74/i18n/hebrwcal.cpp \ icu74/i18n/indiancal.cpp \ icu74/i18n/inputext.cpp \ icu74/i18n/islamcal.cpp \ icu74/i18n/iso8601cal.cpp \ icu74/i18n/japancal.cpp \ icu74/i18n/listformatter.cpp \ icu74/i18n/measfmt.cpp \ icu74/i18n/measunit_extra.cpp \ icu74/i18n/measunit.cpp \ icu74/i18n/measure.cpp \ icu74/i18n/msgfmt.cpp \ icu74/i18n/name2uni.cpp \ icu74/i18n/nfrs.cpp \ icu74/i18n/nfrule.cpp \ icu74/i18n/nfsubs.cpp \ icu74/i18n/nortrans.cpp \ icu74/i18n/nultrans.cpp \ icu74/i18n/number_affixutils.cpp \ icu74/i18n/number_asformat.cpp \ icu74/i18n/number_capi.cpp \ icu74/i18n/number_compact.cpp \ icu74/i18n/number_currencysymbols.cpp \ icu74/i18n/number_decimalquantity.cpp \ icu74/i18n/number_decimfmtprops.cpp \ icu74/i18n/number_fluent.cpp \ icu74/i18n/number_formatimpl.cpp \ icu74/i18n/number_grouping.cpp \ icu74/i18n/number_integerwidth.cpp \ icu74/i18n/number_longnames.cpp \ icu74/i18n/number_mapper.cpp \ icu74/i18n/number_modifiers.cpp \ icu74/i18n/number_multiplier.cpp \ icu74/i18n/number_notation.cpp \ icu74/i18n/number_output.cpp \ icu74/i18n/number_padding.cpp \ icu74/i18n/number_patternmodifier.cpp \ icu74/i18n/number_patternstring.cpp \ icu74/i18n/number_rounding.cpp \ icu74/i18n/number_scientific.cpp \ icu74/i18n/number_simple.cpp \ icu74/i18n/number_skeletons.cpp \ icu74/i18n/number_symbolswrapper.cpp \ icu74/i18n/number_usageprefs.cpp \ icu74/i18n/number_utils.cpp \ icu74/i18n/numfmt.cpp \ icu74/i18n/numparse_affixes.cpp \ icu74/i18n/numparse_compositions.cpp \ icu74/i18n/numparse_currency.cpp \ icu74/i18n/numparse_decimal.cpp \ icu74/i18n/numparse_impl.cpp \ icu74/i18n/numparse_parsednumber.cpp \ icu74/i18n/numparse_scientific.cpp \ icu74/i18n/numparse_symbols.cpp \ icu74/i18n/numparse_validators.cpp \ icu74/i18n/numrange_capi.cpp \ 
icu74/i18n/numrange_fluent.cpp \ icu74/i18n/numrange_impl.cpp \ icu74/i18n/numsys.cpp \ icu74/i18n/olsontz.cpp \ icu74/i18n/persncal.cpp \ icu74/i18n/pluralranges.cpp \ icu74/i18n/plurfmt.cpp \ icu74/i18n/plurrule.cpp \ icu74/i18n/quant.cpp \ icu74/i18n/quantityformatter.cpp \ icu74/i18n/rbnf.cpp \ icu74/i18n/rbt_data.cpp \ icu74/i18n/rbt_pars.cpp \ icu74/i18n/rbt_rule.cpp \ icu74/i18n/rbt_set.cpp \ icu74/i18n/rbt.cpp \ icu74/i18n/rbtz.cpp \ icu74/i18n/regexcmp.cpp \ icu74/i18n/regeximp.cpp \ icu74/i18n/regexst.cpp \ icu74/i18n/regextxt.cpp \ icu74/i18n/region.cpp \ icu74/i18n/reldatefmt.cpp \ icu74/i18n/reldtfmt.cpp \ icu74/i18n/rematch.cpp \ icu74/i18n/remtrans.cpp \ icu74/i18n/repattrn.cpp \ icu74/i18n/rulebasedcollator.cpp \ icu74/i18n/scientificnumberformatter.cpp \ icu74/i18n/scriptset.cpp \ icu74/i18n/search.cpp \ icu74/i18n/selfmt.cpp \ icu74/i18n/sharedbreakiterator.cpp \ icu74/i18n/simpletz.cpp \ icu74/i18n/smpdtfmt.cpp \ icu74/i18n/smpdtfst.cpp \ icu74/i18n/sortkey.cpp \ icu74/i18n/standardplural.cpp \ icu74/i18n/string_segment.cpp \ icu74/i18n/strmatch.cpp \ icu74/i18n/strrepl.cpp \ icu74/i18n/stsearch.cpp \ icu74/i18n/taiwncal.cpp \ icu74/i18n/timezone.cpp \ icu74/i18n/titletrn.cpp \ icu74/i18n/tmunit.cpp \ icu74/i18n/tmutamt.cpp \ icu74/i18n/tmutfmt.cpp \ icu74/i18n/tolowtrn.cpp \ icu74/i18n/toupptrn.cpp \ icu74/i18n/translit.cpp \ icu74/i18n/transreg.cpp \ icu74/i18n/tridpars.cpp \ icu74/i18n/tzfmt.cpp \ icu74/i18n/tzgnames.cpp \ icu74/i18n/tznames_impl.cpp \ icu74/i18n/tznames.cpp \ icu74/i18n/tzrule.cpp \ icu74/i18n/tztrans.cpp \ icu74/i18n/ucal.cpp \ icu74/i18n/ucln_in.cpp \ icu74/i18n/ucol_res.cpp \ icu74/i18n/ucol_sit.cpp \ icu74/i18n/ucol.cpp \ icu74/i18n/ucoleitr.cpp \ icu74/i18n/ucsdet.cpp \ icu74/i18n/udat.cpp \ icu74/i18n/udateintervalformat.cpp \ icu74/i18n/udatpg.cpp \ icu74/i18n/ufieldpositer.cpp \ icu74/i18n/uitercollationiterator.cpp \ icu74/i18n/ulistformatter.cpp \ icu74/i18n/ulocdata.cpp \ icu74/i18n/umsg.cpp \ 
icu74/i18n/unesctrn.cpp \ icu74/i18n/uni2name.cpp \ icu74/i18n/units_complexconverter.cpp \ icu74/i18n/units_converter.cpp \ icu74/i18n/units_data.cpp \ icu74/i18n/units_router.cpp \ icu74/i18n/unum.cpp \ icu74/i18n/unumsys.cpp \ icu74/i18n/upluralrules.cpp \ icu74/i18n/uregex.cpp \ icu74/i18n/uregexc.cpp \ icu74/i18n/uregion.cpp \ icu74/i18n/usearch.cpp \ icu74/i18n/uspoof_build.cpp \ icu74/i18n/uspoof_conf.cpp \ icu74/i18n/uspoof_impl.cpp \ icu74/i18n/uspoof.cpp \ icu74/i18n/utf16collationiterator.cpp \ icu74/i18n/utf8collationiterator.cpp \ icu74/i18n/utmscale.cpp \ icu74/i18n/utrans.cpp \ icu74/i18n/vtzone.cpp \ icu74/i18n/vzone.cpp \ icu74/i18n/windtfmt.cpp \ icu74/i18n/winnmfmt.cpp \ icu74/i18n/wintzimpl.cpp \ icu74/i18n/zonemeta.cpp \ icu74/i18n/zrule.cpp \ icu74/i18n/ztrans.cpp stringi/src/stri_container_utf8.h0000644000176200001440000001661514770541312016677 0ustar liggesusers/* This file is part of the 'stringi' project. * Copyright (c) 2013-2025, Marek Gagolewski * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef __stri_container_utf8_h #define __stri_container_utf8_h #include "stri_string8.h" #include "stri_container_base.h" #include /** * A class to handle conversion between R character vectors * and UTF-8 string vectors * * @version 0.1-?? (Marek Gagolewski) * * @version 0.1-?? (Marek Gagolewski) * Improved performance for Native enc->UTF-8 (through u_strToUTF8) * * @version 0.1-?? (Marek Gagolewski, 2013-06-01) * Now NAs are marked as NULLs in str * * @version 0.1-?? 
(Marek Gagolewski, 2013-06-01) * UChar32_to_UTF8_index_back, UChar32_to_UTF8_index_fwd added * * @version 0.1-24 (Marek Gagolewski, 2014-03-11) * Fixed unitialized fields in constructors (thanks to valgrind detect) * * @version 0.2-1 (Marek Gagolewski, 2014-03-15) * Do not try to re-encode a string if native encoding is UTF-8; * str as String8* and not String8** (performance gain) * * @version 0.2-1 (Marek Gagolewski, 2014-03-20) * BUGFIX: possible mem leaks in the constructor; * separated StriContainerUTF8_indexable * * @version 0.2-1 (Marek Gagolewski, 2014-03-23) * UTF8 BOMs are now silently removed by one of the constructors * (via String8) * * @version 0.2-2 (Marek Gagolewski, 2014-04-20) * New methods: getMaxNumBytes, getMaxLength * * @version 0.3-1 (Marek Gagolewski, 2014-11-02) * New methods: set, getWritable, isNA; * Always try to use shallow copy of char* data in SEXP-based constructor (be lazy) */ class StriContainerUTF8 : public StriContainerBase { private: String8* str; ///< data - \code{string} public: StriContainerUTF8(); StriContainerUTF8(SEXP rstr, R_len_t nrecycle, bool shallowrecycle=true); StriContainerUTF8(StriContainerUTF8& container); ~StriContainerUTF8(); StriContainerUTF8& operator=(StriContainerUTF8& container); SEXP toR(R_len_t i) const; SEXP toR() const; /** check if the vectorized ith element is NA * @param i index * @return true if is NA */ inline bool isNA(R_len_t i) const { #ifndef NDEBUG if (i < 0 || i >= nrecycle) throw StriException("StriContainerUTF8::isNA(): INDEX OUT OF BOUNDS"); #endif return (str[i%n].isNA()); } /** get the vectorized ith element * * @param i index * @return string, read only */ inline const String8& get(R_len_t i) const { #ifndef NDEBUG if (i < 0 || i >= nrecycle) throw StriException("StriContainerUTF8::get(): INDEX OUT OF BOUNDS"); if (str[i%n].isNA()) throw StriException("StriContainerUTF8::get(): isNA"); #endif return str[i%n]; } /** get the vectorized ith element, no NA check here * * @param i index * 
@return string, read only */ inline const String8& getNAble(R_len_t i) const { #ifndef NDEBUG if (i < 0 || i >= nrecycle) throw StriException("StriContainerUTF8::get(): INDEX OUT OF BOUNDS"); #endif return str[i%n]; } /** get the vectorized ith element, but not as const * * @param i index * @return string */ inline String8& getWritable(R_len_t i) { #ifndef NDEBUG if (isShallow) throw StriException("StriContainerUTF8::getWritable(): shallow StriContainerUTF8"); if (n != nrecycle) throw StriException("StriContainerUTF8::getWritable(): n!=nrecycle"); if (i < 0 || i >= n) throw StriException("StriContainerUTF8::getWritable(): INDEX OUT OF BOUNDS"); // if (str[i%n].isReadOnly()) // not needed: readOnly here => changes are possible (but not on m_str directly) // throw StriException("StriContainerUTF8::getWritable(): isReadOnly"); if (str[i%n].isNA()) throw StriException("StriContainerUTF8::getWritable(): isNA"); #endif return str[i%n]; // in fact, "%n" is not necessary } /** set NA * @param i index */ inline void setNA(R_len_t i) { #ifndef NDEBUG if (isShallow) throw StriException("StriContainerUTF8::setNA(): shallow StriContainerUTF8"); if (n != nrecycle) throw StriException("StriContainerUTF8::setNA(): n!=nrecycle"); if (i < 0 || i >= n) throw StriException("StriContainerUTF8::setNA(): INDEX OUT OF BOUNDS"); #endif str[i%n].setNA(); } /** get the number of bytes used to represent the longest string */ R_len_t getMaxNumBytes() const { R_len_t bufsize = 0; for (R_len_t i=0; i bufsize) bufsize = cursize; } return bufsize; } /** get the length of the longest string */ R_len_t getMaxLength() const { R_len_t bufsize = 0; for (R_len_t i=0; i bufsize) bufsize = cursize; } return bufsize; } /** set the vectorized ith element * @param i index * @param s string to be copied */ inline void set(R_len_t i, const String8& s) { #ifndef NDEBUG if (isShallow) throw StriException("StriContainerUTF8::set(): shallow StriContainerUTF8"); if (n != nrecycle) throw 
StriException("StriContainerUTF8::set(): n!=nrecycle"); if (i < 0 || i >= n) throw StriException("StriContainerUTF8::set(): INDEX OUT OF BOUNDS"); #endif str[i%n] = s; // in fact, "%n" is not necessary } }; SEXP stri__subset_by_logical(const StriContainerUTF8& str_cont, const std::vector& which, int result_counter); #endif stringi/src/stri_container_utf8_indexable.cpp0000644000176200001440000002511114770541312021234 0ustar liggesusers/* This file is part of the 'stringi' project. * Copyright (c) 2013-2025, Marek Gagolewski * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "stri_stringi.h" #include "stri_container_utf8_indexable.h" /** * Default constructor * * @version 0.2-1 (2014-03-20) * separated StriContainerUTF8_indexable class */ StriContainerUTF8_indexable::StriContainerUTF8_indexable() : StriContainerUTF8() { last_ind_back_str = NULL; last_ind_fwd_str = NULL; } /** * Construct String Container from R character vector * @param rstr R character vector * @param nrecycle extend length [vectorization] * @param shallowrecycle will \code{this->str} be ever modified? 
* * @version 0.2-1 (2014-03-20) * separated StriContainerUTF8_indexable class */
StriContainerUTF8_indexable::StriContainerUTF8_indexable(SEXP rstr, R_len_t _nrecycle, bool _shallowrecycle)
    : StriContainerUTF8(rstr, _nrecycle, _shallowrecycle)
{
    // both index caches start out unbound
    last_ind_back_str = NULL;
    last_ind_fwd_str = NULL;
}


/** Copy constructor
 *
 * The string data is copied by the base class; the index caches are not
 * carried over and start out unbound in the new object.
 *
 * @param container object to copy from
 *
 * @version 0.2-1 (2014-03-20)
 *          separated StriContainerUTF8_indexable class
 */
StriContainerUTF8_indexable::StriContainerUTF8_indexable(StriContainerUTF8_indexable& container)
    : StriContainerUTF8((StriContainerUTF8&)container)
{
    last_ind_back_str = NULL;
    last_ind_fwd_str = NULL;
}


/** Copy/assignment operator
 *
 * Releases the current base-class state, copies the base-class part of
 * \code{container}, and unbinds both index caches.
 * Assigning an object to itself is a no-op.
 *
 * @param container object to copy from
 * @return reference to this object
 *
 * @version 0.2-1 (2014-03-20)
 *          separated StriContainerUTF8_indexable class
 */
StriContainerUTF8_indexable& StriContainerUTF8_indexable::operator=(StriContainerUTF8_indexable& container)
{
    // Self-assignment guard: the base sub-object is destroyed below before
    // it is copied from, so without this check `x = x` would copy from an
    // object that has just been torn down.
    if (this == &container)
        return *this;

    ((StriContainerUTF8*)this)->~StriContainerUTF8();
    (StriContainerUTF8&) (*this) = (StriContainerUTF8&)container;

    // cached positions referred to the old strings
    last_ind_back_str = NULL;
    last_ind_fwd_str = NULL;
    return *this;
}


/** Convert BACKWARD UChar32-based index to UTF-8 based * * @param i string index (in container) * @param wh UChar32 character's position to look for, * counting starts from 0 == byte after last character in the i-th string * @return UTF-8 (byte) index * * * @version 0.1-?? (Bartek Tartanus) * stri_sub * * @version 0.1-?? (Marek Gagolewski) * stri__UChar32_to_UTF8_index * * @version 0.1-?? 
(Marek Gagolewski, 2013-06-01) * moved to StriContainerUTF8 * * @version 0.2-1 (Marek Gagolewski, 2014-03-20) * moved to StriContainerUTF8_indexable * * @version 0.5-1 (Marek Gagolewski, 2015-02-14) * use String8::isASCII * * @version 1.1.3 (Marek Gagolewski, 2017-03-21) * Issue#227: buffering bug in stri_sub */ R_len_t StriContainerUTF8_indexable::UChar32_to_UTF8_index_back(R_len_t i, R_len_t wh) { R_len_t cur_n = get(i).length(); if (wh <= 0) return cur_n; if (get(i).isASCII()) return std::max(cur_n-wh, 0); const char* cur_s = get(i).c_str(); #ifndef NDEBUG if (!cur_s) throw StriException("StriContainerUTF8::UChar32_to_UTF8_index_back: NULL cur_s"); #endif if (last_ind_back_str != cur_s) { // starting search in a different string last_ind_back_codepoint = 0; last_ind_back_utf8 = cur_n; last_ind_back_str = cur_s; } R_len_t j = 0; R_len_t jres = cur_n; if (last_ind_back_codepoint > 0) { if (wh < last_ind_back_codepoint) { // check if it makes sense to go towards the end of the string // or maybe it will be better to start from the end and move backwards if ((last_ind_back_codepoint-wh) < (wh-0)) { // less code points will be considered when going backwards j = last_ind_back_codepoint; jres = last_ind_back_utf8; while (j > wh && jres < cur_n) { U8_FWD_1((const uint8_t*)cur_s, jres, cur_n); --j; } last_ind_back_codepoint = wh; last_ind_back_utf8 = jres; return jres; // stop right now } // else } else { //if (wh >= last_ind_back_codepoint) // continue last search j = last_ind_back_codepoint; jres = last_ind_back_utf8; } } // go backward while (j < wh && jres > 0) { U8_BACK_1((const uint8_t*)cur_s, 0, jres); ++j; } last_ind_back_codepoint = j; // it's not wh, as we can advance at the end of the string, compare #227 last_ind_back_utf8 = jres; return jres; } /** Convert FORWARD UChar32-based index to UTF-8 based * * @param i string index (in container) * @param wh UChar32 character's position to look for, * counting starts from 0 == first character in i-th string * 
@return UTF-8 (byte) index * * * @version 0.1-?? (Bartek Tartanus) * stri_sub * * @version 0.1-?? (Marek Gagolewski) * stri__UChar32_to_UTF8_index * * @version 0.1-?? (Marek Gagolewski, 2013-06-01) * moved to StriContainerUTF8 * * @version 0.2-1 (Marek Gagolewski, 2014-03-20) * moved to StriContainerUTF8_indexable * * @version 0.5-1 (Marek Gagolewski, 2015-02-14) * use String8::isASCII * * @version 1.1.3 (Marek Gagolewski, 2017-03-21) * Issue#227: buffering bug in stri_sub */ R_len_t StriContainerUTF8_indexable::UChar32_to_UTF8_index_fwd(R_len_t i, R_len_t wh) { if (wh <= 0) return 0; if (get(i).isASCII()) return std::min(wh, get(i).length()); R_len_t cur_n = get(i).length(); const char* cur_s = get(i).c_str(); #ifndef NDEBUG if (!cur_s) throw StriException("StriContainerUTF8::UChar32_to_UTF8_index_fwd: NULL cur_s"); #endif if (last_ind_fwd_str != cur_s) { // starting search in a different string last_ind_fwd_codepoint = 0; last_ind_fwd_utf8 = 0; last_ind_fwd_str = cur_s; } R_len_t j = 0; R_len_t jres = 0; if (last_ind_fwd_codepoint > 0) { if (wh < last_ind_fwd_codepoint) { // check if it makes sense to go backwards from last position, // or it is better to start from scratch if ((last_ind_fwd_codepoint-wh) < (wh-0)) { // less code points will be considered when going backwards j = last_ind_fwd_codepoint; jres = last_ind_fwd_utf8; while (j > wh && jres > 0) { U8_BACK_1((const uint8_t*)cur_s, 0, jres); --j; } last_ind_fwd_codepoint = wh; last_ind_fwd_utf8 = jres; return jres; // stop right now } // else } else { //if (wh >= last_ind_fwd_codepoint) // continue last search j = last_ind_fwd_codepoint; jres = last_ind_fwd_utf8; } } // go forward while (j < wh && jres < cur_n) { U8_FWD_1((const uint8_t*)cur_s, jres, cur_n); ++j; } last_ind_fwd_codepoint = j; // it's not wh, as we can advance at the end of the string, compare #227 last_ind_fwd_utf8 = jres; return jres; } /** Convert UTF8-byte indexes to Unicode32 (code points) * * \code{i1} and \code{i2} must be sorted 
increasingly * * @param i element index * @param i1 indexes, 1-based [in/out] * @param i2 indexes, 1-based [in/out] * @param ni size of \code{i1} and \code{i2} * @param adj1 adjust for \code{i1} * @param adj2 adjust for \code{i2} * * * @version 0.1-?? (Marek Gagolewski) * * @version 0.2-1 (Marek Gagolewski, 2014-03-20) * moved to StriContainerUTF8_indexable * * @version 0.5-1 (Marek Gagolewski, 2015-02-14) * use String8::isASCII */ void StriContainerUTF8_indexable::UTF8_to_UChar32_index(R_len_t i, int* i1, int* i2, const int ni, int adj1, int adj2) { if (get(i).isASCII()) { for (int i=0; i= i1[j1+1]) throw StriException("DEBUG: stri__UTF8_to_UChar32_index"); #endif i1[j1] = i32 + adj1; ++j1; } if (j2 < ni && i2[j2] <= i8) { #ifndef NDEBUG if (j2 < ni-1 && i2[j2] >= i2[j2+1]) throw StriException("DEBUG: stri__UTF8_to_UChar32_index"); #endif i2[j2] = i32 + adj2; ++j2; } // Next UChar32 U8_FWD_1(cstr, i8, nstr); ++i32; } // CONVERT LAST: if (j1 < ni && i1[j1] <= nstr) { #ifndef NDEBUG if (j1 < ni-1 && i1[j1] >= i1[j1+1]) throw StriException("DEBUG: stri__UTF8_to_UChar32_index"); #endif i1[j1] = i32 + adj1; ++j1; } if (j2 < ni && i2[j2] <= nstr) { #ifndef NDEBUG if (j2 < ni-1 && i2[j2] >= i2[j2+1]) throw StriException("DEBUG: stri__UTF8_to_UChar32_index"); #endif i2[j2] = i32 + adj2; ++j2; } // CHECK: #ifndef NDEBUG if (i8 >= nstr && (j1 < ni || j2 < ni)) throw StriException("DEBUG: stri__UTF8_to_UChar32_index()"); #endif } stringi/src/install.libs.R.in0000644000176200001440000000610114750110642015644 0ustar liggesusers## This file is part of the R package 'stringi'. 
## Copyright (c) 2013-2025, Marek Gagolewski

# copy lib:
libfile <- paste("stringi", SHLIB_EXT, sep="")
dest <- file.path(R_PACKAGE_DIR, paste("libs", R_ARCH, sep=""))
dir.create(dest, recursive=TRUE, showWarnings=FALSE)
file.copy(libfile, dest, overwrite=TRUE)

# copy icudt when building ICU from sources:
copyicudt <- !as.logical(@ICU_FOUND@)
if (copyicudt) {
    source('../R/install.R')
    outdir <- file.path(R_PACKAGE_DIR, "libs")
    if (length(dir(outdir, pattern=glob2rx("*.dat"))) == 0) {
        # avoids multiple download attempts while multiarch building
        if (!stri_install_icudt(outdir, "@ICUDT_DIR@", @ICU_BUNDLE_VERSION@))
            stop("icudt could not be downloaded; check your internet connectivity")
    }
}

# Copy symbols.rds:
if (file.exists('symbols.rds'))
    file.copy('symbols.rds', dest, overwrite=TRUE)

# Create ../include/stringi.h:
dir.create(file.path(R_PACKAGE_DIR, 'include'), showWarnings=FALSE)
file.copy('stri_exports.h', file.path(R_PACKAGE_DIR, 'include', 'stringi.h'))

# Create ../include/stringi.cpp:
f <- file(file.path(R_PACKAGE_DIR, 'include', 'stringi.cpp'), open='w')
copyright <- readLines("stri_exports.h")
# index of the first empty line, i.e., the end of the leading comment block
i <- which.min(nchar(copyright) > 0)
writeLines(copyright[1:i], con=f)
# NOTE(review): the header names in the two "#include " strings below look
# as if they were lost in this copy of the file -- confirm against upstream
writeLines("#include ", con=f)
writeLines("#include ", con=f)

# Extract all matches of a single Perl-style regex `p`, together with its
# capture groups, from each element of the character vector `s`.
#
# Returns a list as long as `s`. An element is NULL if the corresponding
# string is NA or has no match; otherwise it is a character matrix with one
# row per match: column 1 holds the whole match, the following columns hold
# the capture groups.
match_all_perl <- function(s, p) {
    stopifnot(is.character(s), is.character(p), length(p) == 1)
    out <- vector("list", length(s))
    nna <- which(!is.na(s))
    m <- gregexpr(enc2utf8(p), enc2utf8(s[nna]), perl=TRUE)
    for (j in seq_along(m)) {
        if (any(m[[j]] < 0)) next # no match
        k <- nna[j]  # m only covers the non-NA strings; k is the index in s and out
        nmatch <- length(m[[j]])
        ncapt <- ncol(attr(m[[j]], "capture.start"))
        out[[k]] <- matrix(substring(s[k], m[[j]], m[[j]]+attr(m[[j]], "match.length")-1), nrow=nmatch, ncol=ncapt+1)
        if (ncapt > 0) {
            cs <- as.integer(attr(m[[j]], "capture.start"))
            cl <- as.integer(attr(m[[j]], "capture.length"))
            out[[k]][,-1] <- substring(s[k], cs, cs+cl-1)
        }
    }
    out
}

exported1 <- match_all_perl(readLines("stri_exports.h"), "^SEXP[ ]+([A-Z0-9a-z_]+)\\(")
exported1 <-
t(simplify2array(exported1[!sapply(exported1, is.null)], higher=FALSE))[,-1] exported2 <- match_all_perl(readLines("stri_stringi.cpp"), "^[ ]*STRI__MK_CALL\\(\"([A-Z0-9a-z_]+)\",[ ]*([A-Z0-9a-z_]+),[ ]*([0-9]+)") exported2 <- t(simplify2array(exported2[!sapply(exported2, is.null)], higher=FALSE))[,-1] i <- match(exported1, exported2[,2]) for (j in i) { narg <- as.integer(exported2[j,3]) cat(sprintf('SEXP %s(%s) { static SEXP(*fun)(%s) = NULL; if (!fun) fun = (SEXP(*)(%s)) R_GetCCallable("stringi", "%s"); return fun(%s); }\n\n', exported2[j,2], if (narg > 0) paste("SEXP e", seq_len(narg), sep="", collapse=", ") else "", paste(rep("SEXP", narg), collapse=","), paste(rep("SEXP", narg), collapse=","), exported2[j,1], if (narg > 0) paste("e", seq_len(narg), sep="", collapse=", ") else "" ), file=f) } close(f) stringi/src/stri_container_utf8.cpp0000644000176200001440000003031514770541312017223 0ustar liggesusers/* This file is part of the 'stringi' project. * Copyright (c) 2013-2025, Marek Gagolewski * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "stri_stringi.h" #include "stri_container_utf8.h" #include "stri_ucnv.h" #include "stri_string8buf.h" /** * Default constructor * */ StriContainerUTF8::StriContainerUTF8() : StriContainerBase() { str = NULL; } /** * Construct String Container from R character vector * * @param rstr R character vector * @param nrecycle extend length [vectorization] * @param shallowrecycle will \code{this->str} be ever modified? * * @version 1.0.6 (Marek Gagolewski, 2017-05-25) * #270 latin-1 is windows-1252 on Windows * * @version 1.6.2 (Marek Gagolewski, 2021-05-14) * #354 Force the copying of ALTREP data */ StriContainerUTF8::StriContainerUTF8(SEXP rstr, R_len_t _nrecycle, bool _shallowrecycle) { this->str = NULL; #ifndef NDEBUG if (!Rf_isString(rstr)) throw StriException("DEBUG: !Rf_isString in StriContainerUTF8::StriContainerUTF8(SEXP rstr)"); #endif R_len_t nrstr = LENGTH(rstr); this->init_Base(nrstr, _nrecycle, _shallowrecycle, rstr); // calling LENGTH(rstr) fails on constructor call if (this->n == 0) return; /* nothing more to do */ STRI_ASSERT(this->n > 0); this->str = new String8[this->n]; STRI_ASSERT(this->str); if (!this->str) throw StriException(MSG__MEM_ALLOC_ERROR_WITH_SIZE, this->n*sizeof(String8)); /* Important: ICU provides full internationalization functionality without any conversion table data. 
The common library contains code to handle several important encodings algorithmically: US-ASCII, ISO-8859-1, UTF-7/8/16/32, SCSU, BOCU-1, CESU-8, and IMAP-mailbox-name */ // for conversion from non-UTF-8/ASCII native charsets: #if defined(_WIN32) || defined(_WIN64) // #270: latin-1 is windows-1252 on Windows StriUcnv ucnvLatin1("WINDOWS-1252"); #else // TODO: WINDOWS-1252 is a superset of ISO-8859-1, use the former?? StriUcnv ucnvLatin1("ISO-8859-1"); #endif StriUcnv ucnvNative(NULL); R_len_t outbufsize = -1; String8buf outbuf(0); // int tmpbufsize = -1; // UChar* tmpbuf = NULL; for (R_len_t i=0; istr[i].initialize(CHAR(curs), LENGTH(curs), memalloc/*!_shallowrecycle*/, false/*killbom*/, true/*isASCII*/); } else if (IS_UTF8(curs)) { // UTF-8 - ultra fast bool memalloc = ALTREP(rstr); // #354: force copying of ALTREP data this->str[i].initialize(CHAR(curs), LENGTH(curs), memalloc/*!_shallowrecycle*/, true/*killbom*/, false/*isASCII*/); // the same is done for native encoding && ucnvNative_isUTF8 // @TODO: use macro (here & ucnvNative_isUTF8 below) } else if (IS_BYTES(curs)) { // "bytes encoding" is not allowed except // for some special functions which do encoding themselves throw StriException(MSG__BYTESENC); } else { // LATIN1 ------- OR ------ Native encoding UConverter* ucnvCurrent; if (IS_LATIN1(curs)) { ucnvCurrent = ucnvLatin1.getConverter(); } else { // "unknown" (native) encoding // an "unknown" (native) encoding may be set to UTF-8 (speedup) if (ucnvNative.isUTF8()) { // UTF-8 - ultra fast // @TODO: use macro bool memalloc = ALTREP(rstr); // #354: force copying of ALTREP data this->str[i].initialize(CHAR(curs), LENGTH(curs), memalloc/*!_shallowrecycle*/, true/*killbom*/, false/*isASCII*/); continue; } ucnvCurrent = ucnvNative.getConverter(); } if (outbufsize < 0) { // calculate max string length R_len_t maxlen = LENGTH(curs); for (R_len_t z=i+1; z UTF16 UErrorCode status = U_ZERO_ERROR; UnicodeString tmp(CHAR(curs), LENGTH(curs), ucnvCurrent, status); 
STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) // UTF-16 -> UTF-8 // // this is not faster than u_strToUTF8 // const UChar* tmpbuf = tmp.getBuffer(); // int tmpbufsize = tmp.length(); // // tmpbuf is a well-formed UTF-16 string // int i1 = 0, outrealsize = 0; // UChar32 c; // while (i1 < tmpbufsize) { // U16_NEXT_UNSAFE(tmpbuf, i1, c); // U8_APPEND_UNSAFE(outbuf, outrealsize, c); //#ifndef NDEBUG // if (outrealsize > outbufsize) // throw StriException(U_BUFFER_OVERFLOW_ERROR); //#endif // } int outrealsize = 0; u_strToUTF8(outbuf.data(), outbuf.size(), &outrealsize, tmp.getBuffer(), tmp.length(), &status); STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) this->str[i].initialize(outbuf.data(), outrealsize, true/*memalloc*/, false/*killbom*/, false/*isASCII*/); // version 3: use tmpbuf (slower than v2) // UErrorCode status = U_ZERO_ERROR; // int tmprealsize = ucnv_toUChars(ucnvCurrent, tmpbuf, tmpbufsize, // CHAR(curs), LENGTH(curs), &status); // if (U_FAILURE(status)) { // STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) // CLEANUP_FAILURE_StriContainerUTF8 // throw StriException(status); // } // // // UTF-16 -> UTF-8 // int outrealsize = ucnv_fromUChars(ucnvUTF8, // outbuf, outbufsize, tmpbuf, tmprealsize, &status); // if (U_FAILURE(status)) { // STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) // CLEANUP_FAILURE_StriContainerUTF8 // throw StriException(status); // } } } if (!_shallowrecycle) { for (R_len_t i=nrstr; in; ++i) { this->str[i] = str[i%nrstr]; } } } StriContainerUTF8::StriContainerUTF8(StriContainerUTF8& container) : StriContainerBase((StriContainerBase&)container) { if (container.str) { this->str = new String8[this->n]; STRI_ASSERT(this->str); if (!this->str) throw StriException(MSG__MEM_ALLOC_ERROR_WITH_SIZE, this->n*sizeof(String8)); for (int i=0; in; ++i) { this->str[i] = container.str[i]; } } else { this->str = NULL; } } StriContainerUTF8& 
StriContainerUTF8::operator=(StriContainerUTF8& container) { this->~StriContainerUTF8(); (StriContainerBase&) (*this) = (StriContainerBase&)container; if (container.str) { this->str = new String8[this->n]; STRI_ASSERT(this->str); if (!this->str) throw StriException(MSG__MEM_ALLOC_ERROR_WITH_SIZE, this->n*sizeof(String8)); for (int i=0; in; ++i) { this->str[i] = container.str[i]; } } else { this->str = NULL; } return *this; } StriContainerUTF8::~StriContainerUTF8() { if (str) { // for (int i=0; itoR(i)); } UNPROTECT(1); return ret; } /** Export string to R * THE OUTPUT IS ALWAYS IN UTF-8 * * @param i index [with recycle] * @return CHARSXP * * @version 0.1-?? (Marek Gagolewski) * * @version 0.2-1 (Marek Gagolewski, 2014-03-22) * returns original CHARSXP if possible for increased performance */ SEXP StriContainerUTF8::toR(R_len_t i) const { #ifndef NDEBUG if (i < 0 || i >= nrecycle) throw StriException("StriContainerUTF8::toR(): INDEX OUT OF BOUNDS"); #endif String8* curs = &(str[i%n]); if (curs->isNA()) { return NA_STRING; } else if (curs->isReadOnly()) { // if ReadOnly, then surely in ASCII or UTF-8 and without BOMs (see SEXP-constructor) return STRING_ELT(sexp, i%n); } else { // This is already in UTF-8 return Rf_mkCharLenCE(curs->c_str(), curs->length(), CE_UTF8); } } stringi/src/stri_time_format.cpp0000644000176200001440000004312114770541312016600 0ustar liggesusers/* This file is part of the 'stringi' project. * Copyright (c) 2013-2025, Marek Gagolewski * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. 
Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "stri_stringi.h" #include "stri_container_utf16.h" #include "stri_container_utf8.h" #include "stri_container_double.h" #include "stri_container_integer.h" #include #include #include /** * Get date format * * @version 1.6.3 (Marek Gagolewski, 2021-05-24) * refactor from stri_datetime_parse */ DateFormat* stri__get_date_format( const char* format_val, const char* locale_val, UErrorCode status ) { DateFormat* fmt = NULL; // "format" may be one of: const char* format_opts[] = { "date_full", "date_long", "date_medium", "date_short", "date_relative_full", "date_relative_long", "date_relative_medium", "date_relative_short", "time_full", "time_long", "time_medium", "time_short", "time_relative_full", "time_relative_long", "time_relative_medium", "time_relative_short", "datetime_full", "datetime_long", "datetime_medium", "datetime_short", "datetime_relative_full", "datetime_relative_long", "datetime_relative_medium", "datetime_relative_short", NULL }; int format_cur = stri__match_arg(format_val, format_opts); if (format_cur >= 0) { DateFormat::EStyle style = DateFormat::kNone; switch (format_cur % 8) { case 0: style = DateFormat::kFull; break; case 1: style = DateFormat::kLong; break; case 2: style = DateFormat::kMedium; break; case 3: style = DateFormat::kShort; break; case 4: style = DateFormat::kFullRelative; break; case 5: style = DateFormat::kLongRelative; break; case 6: style = DateFormat::kMediumRelative; break; case 7: style = DateFormat::kShortRelative; break; default: style = DateFormat::kNone; break; } /* ICU 54.1: Relative time styles are not currently supported. 
*/ switch (format_cur / 8) { case 0: fmt = DateFormat::createDateInstance( style, Locale::createFromName(locale_val) ); break; case 1: fmt = DateFormat::createTimeInstance( (DateFormat::EStyle)(style & ~DateFormat::kRelative), Locale::createFromName(locale_val) ); break; case 2: fmt = DateFormat::createDateTimeInstance( style, (DateFormat::EStyle)(style & ~DateFormat::kRelative), Locale::createFromName(locale_val) ); break; default: fmt = NULL; break; } } else { UnicodeString format_str(format_val); fmt = new SimpleDateFormat( format_str, Locale::createFromName(locale_val), status ); } return fmt; } /** * Format date-time objects * * @param time * @param format * @param tz * @param locale * * @return character vector * * @version 0.5-1 (Marek Gagolewski, 2015-01-05) * @version 0.5-1 (Marek Gagolewski, 2015-02-22) use tz * @version 1.6.3 (Marek Gagolewski, 2021-05-24) #434: vectorise wrt format */ SEXP stri_datetime_format(SEXP time, SEXP format, SEXP tz, SEXP locale) { const char* locale_val = stri__prepare_arg_locale(locale, "locale"); PROTECT(time = stri__prepare_arg_POSIXct(time, "time")); PROTECT(format = stri__prepare_arg_string(format, "format")); R_len_t vectorize_length = stri__recycling_rule(true, 2, LENGTH(time), LENGTH(format)); if (vectorize_length <= 0) { UNPROTECT(2); return Rf_allocVector(STRSXP, 0); } TimeZone* tz_val = stri__prepare_arg_timezone(tz, "tz", true/*allowdefault*/); Calendar* cal = NULL; DateFormat* fmt = NULL; STRI__ERROR_HANDLER_BEGIN(2) StriContainerDouble time_cont(time, vectorize_length); StriContainerUTF8 format_cont(format, vectorize_length); cal = stri__get_calendar(locale_val); cal->adoptTimeZone(tz_val); tz_val = NULL; /* The Calendar takes ownership of the TimeZone. 
*/ UErrorCode status = U_ZERO_ERROR; SEXP ret; STRI__PROTECT(ret = Rf_allocVector(STRSXP, vectorize_length)); const String8* format_last = NULL; // this will allow for formatter reuse for (R_len_t i = format_cont.vectorize_init(); i != format_cont.vectorize_end(); i = format_cont.vectorize_next(i)) { if (time_cont.isNA(i) || format_cont.isNA(i)) { SET_STRING_ELT(ret, i, NA_STRING); continue; } const String8* format_cur = &(format_cont.get(i)); if (format_cur != format_last) { // well, no reuse possible - resetting format_last = format_cur; if (fmt) { delete fmt; fmt = NULL; } status = U_ZERO_ERROR; fmt = stri__get_date_format(format_cur->c_str(), locale_val, status); STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) } status = U_ZERO_ERROR; cal->setTime((UDate)(time_cont.get(i)*1000.0), status); STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) FieldPosition pos; UnicodeString out; fmt->format(*cal, out, pos); std::string s; out.toUTF8String(s); SET_STRING_ELT(ret, i, Rf_mkCharLenCE(s.c_str(), (int)s.length(), (cetype_t)CE_UTF8)); } if (tz_val) { delete tz_val; tz_val = NULL; } if (fmt) { delete fmt; fmt = NULL; } if (cal) { delete cal; cal = NULL; } STRI__UNPROTECT_ALL return ret; STRI__ERROR_HANDLER_END({ if (tz_val) { delete tz_val; tz_val = NULL; } if (fmt) { delete fmt; fmt = NULL; } if (cal) { delete cal; cal = NULL; } }) } /** * Parse date-time objects * * @param str * @param format * @param tz * @param lenient * @param locale * * @return character vector * * @version 0.5-1 (Marek Gagolewski, 2015-01-08) * @version 0.5-1 (Marek Gagolewski, 2015-01-11) lenient arg added * @version 0.5-1 (Marek Gagolewski, 2015-02-22) use tz * @version 0.5-1 (Marek Gagolewski, 2015-03-01) set tzone attrib on retval * @version 1.6.3 (Marek Gagolewski, 2021-05-24) #434: vectorise wrt format * @version 1.6.3 (Marek Gagolewski, 2021-06-07) empty retval should have a class too * @version 1.8.1 (Marek Gagolewski, 2023-11-08) #469: default time 
is midnight today */ SEXP stri_datetime_parse(SEXP str, SEXP format, SEXP lenient, SEXP tz, SEXP locale) { const char* locale_val = stri__prepare_arg_locale(locale, "locale"); PROTECT(str = stri__prepare_arg_string(str, "str")); PROTECT(format = stri__prepare_arg_string(format, "format")); bool lenient_val = stri__prepare_arg_logical_1_notNA(lenient, "lenient"); if (!Rf_isNull(tz)) PROTECT(tz = stri__prepare_arg_string_1(tz, "tz")); else PROTECT(tz); /* needed to set tzone attrib */ R_len_t vectorize_length = stri__recycling_rule(true, 2, LENGTH(str), LENGTH(format)); if (vectorize_length <= 0) { SEXP ret; PROTECT(ret = Rf_allocVector(REALSXP, 0)); if (!Rf_isNull(tz)) Rf_setAttrib(ret, Rf_ScalarString(Rf_mkChar("tzone")), tz); stri__set_class_POSIXct(ret); UNPROTECT(4); return ret; } TimeZone* tz_val = stri__prepare_arg_timezone(tz, "tz", true/*allowdefault*/); Calendar* cal = NULL; DateFormat* fmt = NULL; STRI__ERROR_HANDLER_BEGIN(3) StriContainerUTF16 str_cont(str, vectorize_length); StriContainerUTF8 format_cont(format, vectorize_length); cal = stri__get_calendar(locale_val); cal->adoptTimeZone(tz_val); tz_val = NULL; /* The Calendar takes ownership of the TimeZone. 
*/ cal->setLenient(lenient_val); UDate now = cal->getNow(); UErrorCode status = U_ZERO_ERROR; SEXP ret; STRI__PROTECT(ret = Rf_allocVector(REALSXP, vectorize_length)); const String8* format_last = NULL; // this will allow for formatter reuse for (R_len_t i = format_cont.vectorize_init(); i != format_cont.vectorize_end(); i = format_cont.vectorize_next(i)) { if (str_cont.isNA(i) || format_cont.isNA(i)) { REAL(ret)[i] = NA_REAL; continue; } const String8* format_cur = &(format_cont.get(i)); if (format_cur != format_last) { // well, no reuse possible - resetting format_last = format_cur; if (fmt) { delete fmt; fmt = NULL; } status = U_ZERO_ERROR; fmt = stri__get_date_format(format_cur->c_str(), locale_val, status); STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) } status = U_ZERO_ERROR; cal->setTime(now, status); STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) // weirdly, all the time fields must be reset cal->clear(UCAL_MILLISECOND); cal->clear(UCAL_SECOND); cal->clear(UCAL_MINUTE); cal->clear(UCAL_AM_PM); cal->clear(UCAL_HOUR); cal->clear(UCAL_HOUR_OF_DAY); cal->clear(UCAL_MILLISECONDS_IN_DAY); ParsePosition pos; fmt->parse(str_cont.get(i), *cal, pos); if (pos.getErrorIndex() >= 0) REAL(ret)[i] = NA_REAL; else { status = U_ZERO_ERROR; REAL(ret)[i] = ((double)cal->getTime(status))/1000.0; if (U_FAILURE(status)) REAL(ret)[i] = NA_REAL; } } if (!Rf_isNull(tz)) Rf_setAttrib(ret, Rf_ScalarString(Rf_mkChar("tzone")), tz); stri__set_class_POSIXct(ret); if (tz_val) { delete tz_val; tz_val = NULL; } if (fmt) { delete fmt; fmt = NULL; } if (cal) { delete cal; cal = NULL; } STRI__UNPROTECT_ALL return ret; STRI__ERROR_HANDLER_END({ if (tz_val) { delete tz_val; tz_val = NULL; } if (fmt) { delete fmt; fmt = NULL; } if (cal) { delete cal; cal = NULL; } }) } /** * Converts a single strptime/strftime format to the one used by ICU * * @param x * @return a single R string */ SEXP stri__datetime_fstr_1(const String8& _x) { 
STRI_ASSERT(!_x.isNA()); R_len_t n = _x.length(); const char* x = _x.c_str(); std::string buf; buf.reserve(n+1); // whatever R_len_t i=0; bool literal_substring = false; while (i < n) { // consume everything up to the next '%' if (x[i] == '\'') { if (!literal_substring) { literal_substring = true; buf.push_back('\''); } buf.push_back('\\'); buf.push_back('\''); i++; continue; } if (x[i] != '%') { if (!literal_substring) { literal_substring = true; buf.push_back('\''); } buf.push_back(x[i]); i++; continue; } // '%' found. i++; if (i >= n) // dangling % throw StriException(MSG__INVALID_FORMAT_SPECIFIER, ""); // if "%%", then output '%' and continue looking for the next '%' if (x[i] == '%') { if (!literal_substring) { literal_substring = true; buf.push_back('\''); } buf.push_back('%'); i++; continue; } if (literal_substring) { literal_substring = false; buf.push_back('\''); } char spec = x[i++]; switch (spec) { case 'U': case 'V': case 'x': case 'X': case 'u': case 'w': case 'r': case 'g': case 'G': case 'c': Rf_warning(MSG__PROBLEMATIC_FORMAT_SPECIFIER_CHAR, spec); break; default: break; } switch (spec) { case 'U': buf.append("ww" ); break; case 'W': buf.append("ww" ); break; case 'g': buf.append("yy" ); break; case 'G': buf.append("Y" ); break; case 'a': buf.append("ccc" ); break; case 'A': buf.append("cccc" ); break; case 'b': buf.append("MMM" ); break; case 'B': buf.append("MMMM" ); break; case 'c': buf.append("eee MMM d HH:mm:ss yyyy"); break; case 'd': buf.append("dd" ); break; case 'D': buf.append("MM/dd/yy" ); break; case 'e': buf.append("d" ); break; case 'F': buf.append("yyyy-MM-dd" ); break; case 'h': buf.append("MMM" ); break; case 'H': buf.append("HH" ); break; case 'I': buf.append("hh" ); break; case 'j': buf.append("D" ); break; case 'm': buf.append("MM" ); break; case 'M': buf.append("mm" ); break; case 'n': buf.append("\n" ); break; case 'p': buf.append("a" ); break; case 'r': buf.append("hh:mm:ss" ); break; case 'R': buf.append("HH:mm" ); break; case 
'S': buf.append("ss" ); break; case 't': buf.append("\t" ); break; case 'T': buf.append("HH:mm:ss" ); break; case 'u': buf.append("c" ); break; case 'V': buf.append("ww" ); break; case 'w': buf.append("c" ); break; case 'x': buf.append("yy/MM/dd" ); break; case 'X': buf.append("HH:mm:ss" ); break; case 'y': buf.append("yy" ); break; case 'Y': buf.append("yyyy" ); break; case 'z': buf.append("Z" ); break; case 'Z': buf.append("z" ); break; default: throw StriException(MSG__INVALID_FORMAT_SPECIFIER_SUB, 1, x+i-1); } } if (literal_substring) { literal_substring = false; buf.push_back('\''); } return Rf_mkCharLenCE(buf.data(), buf.size(), CE_UTF8); } /** * Convert %Y-%m-%d to yyyy'-'MM'-'dd and stuff (for strptime/strftime <-> ICU) * * @param x character vector * * @return character vector * * @version 1.6.4 (Marek Gagolewski, 2021-06-07) */ SEXP stri_datetime_fstr(SEXP x) { PROTECT(x = stri__prepare_arg_string(x, "x")); R_len_t vectorize_length = LENGTH(x); if (vectorize_length <= 0) { UNPROTECT(1); return Rf_allocVector(STRSXP, 0); } STRI__ERROR_HANDLER_BEGIN(1) StriContainerUTF8 x_cont(x, vectorize_length); SEXP ret; STRI__PROTECT(ret = Rf_allocVector(STRSXP, vectorize_length)); for ( R_len_t i = x_cont.vectorize_init(); i != x_cont.vectorize_end(); i = x_cont.vectorize_next(i) ) { if (x_cont.isNA(i)) { SET_STRING_ELT(ret, i, NA_STRING); continue; } SEXP out; STRI__PROTECT(out = stri__datetime_fstr_1(x_cont.get(i))); SET_STRING_ELT(ret, i, out); STRI__UNPROTECT(1); } STRI__UNPROTECT_ALL return ret; STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */) } stringi/src/stri_search_class_extract.cpp0000644000176200001440000002304314770541312020457 0ustar liggesusers/* This file is part of the 'stringi' project. * Copyright (c) 2013-2025, Marek Gagolewski * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. 
Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "stri_stringi.h" #include "stri_container_utf8.h" #include "stri_container_charclass.h" #include "stri_container_logical.h" #include #include using namespace std; /** * Extract first or last occurrences of a character class in each string * * @param str character vector * @param pattern character vector * @return character vector * * @version 0.1-?? (Marek Gagolewski, 2013-06-08) * * @version 0.1-?? (Marek Gagolewski, 2013-06-15) * Use StrContainerCharClass * * @version 0.1-?? 
(Marek Gagolewski, 2013-06-16) * make StriException-friendly * * @version 0.2-1 (Marek Gagolewski, 2014-04-03) * detects invalid UTF-8 byte stream * * @version 0.2-1 (Marek Gagolewski, 2014-04-05) * StriContainerCharClass now relies on UnicodeSet * * @version 0.3-1 (Marek Gagolewski, 2014-11-04) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc */ SEXP stri__extract_firstlast_charclass(SEXP str, SEXP pattern, bool first) { PROTECT(str = stri__prepare_arg_string(str, "str")); PROTECT(pattern = stri__prepare_arg_string(pattern, "pattern")); R_len_t vectorize_length = stri__recycling_rule(true, 2, LENGTH(str), LENGTH(pattern)); STRI__ERROR_HANDLER_BEGIN(2) StriContainerUTF8 str_cont(str, vectorize_length); StriContainerCharClass pattern_cont(pattern, vectorize_length); SEXP ret; STRI__PROTECT(ret = Rf_allocVector(STRSXP, vectorize_length)); for (R_len_t i = pattern_cont.vectorize_init(); i != pattern_cont.vectorize_end(); i = pattern_cont.vectorize_next(i)) { SET_STRING_ELT(ret, i, NA_STRING); if (str_cont.isNA(i) || pattern_cont.isNA(i)) continue; const UnicodeSet* pattern_cur = &pattern_cont.get(i); R_len_t str_cur_n = str_cont.get(i).length(); const char* str_cur_s = str_cont.get(i).c_str(); R_len_t j, jlast; UChar32 chr; if (first) { for (jlast=j=0; jcontains(chr)) { SET_STRING_ELT(ret, i, Rf_mkCharLenCE(str_cur_s+jlast, j-jlast, CE_UTF8)); break; // that's enough for first } jlast = j; } } else { for (jlast=j=str_cur_n; j>0; ) { U8_PREV(str_cur_s, 0, j, chr); // go backwards if (chr < 0) // invalid utf-8 sequence throw StriException(MSG__INVALID_UTF8); if (pattern_cur->contains(chr)) { SET_STRING_ELT(ret, i, Rf_mkCharLenCE(str_cur_s+j, jlast-j, CE_UTF8)); break; // that's enough for last } jlast = j; } } } STRI__UNPROTECT_ALL return ret; STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */) } /** * Extract first occurrence of a character class in each string * * @param str character vector * @param pattern character vector * @return 
character vector * * @version 0.1-?? (Marek Gagolewski, 2013-06-08) */ SEXP stri_extract_first_charclass(SEXP str, SEXP pattern) { return stri__extract_firstlast_charclass(str, pattern, true); } /** * Extract last occurrence of a character class in each string * * @param str character vector * @param pattern character vector * @return character vector * * @version 0.1-?? (Marek Gagolewski, 2013-06-08) */ SEXP stri_extract_last_charclass(SEXP str, SEXP pattern) { return stri__extract_firstlast_charclass(str, pattern, false); } /** * Extract all occurrences of a character class in each string * * @param str character vector * @param pattern character vector * @param simplify single logical value * * @return list of character vectors or character matrix * * @version 0.1-?? (Marek Gagolewski, 2013-06-08) * * @version 0.1-?? (Marek Gagolewski, 2013-06-15) * Use StrContainerCharClass * * @version 0.1-?? (Marek Gagolewski, 2013-06-16) * make StriException-friendly * * @version 0.2-1 (Marek Gagolewski, 2014-04-03) * detects invalid UTF-8 byte stream * * @version 0.2-1 (Marek Gagolewski, 2014-04-05) * StriContainerCharClass now relies on UnicodeSet * * @version 0.3-1 (Marek Gagolewski, 2014-10-24) * added simplify param * * @version 0.3-1 (Marek Gagolewski, 2014-11-02) * using StriContainerCharClass::locateAll; * no longer vectorized over merge * * @version 0.3-1 (Marek Gagolewski, 2014-11-04) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc * * @version 0.4-1 (Marek Gagolewski, 2014-11-27) * FR #117: omit_no_match arg added * * @version 0.4-1 (Marek Gagolewski, 2014-12-04) * allow `simplify=NA` */ SEXP stri_extract_all_charclass(SEXP str, SEXP pattern, SEXP merge, SEXP simplify, SEXP omit_no_match) { bool merge_cur = stri__prepare_arg_logical_1_notNA(merge, "merge"); bool omit_no_match1 = stri__prepare_arg_logical_1_notNA(omit_no_match, "omit_no_match"); PROTECT(simplify = stri__prepare_arg_logical_1(simplify, "simplify")); PROTECT(str = 
stri__prepare_arg_string(str, "str")); PROTECT(pattern = stri__prepare_arg_string(pattern, "pattern")); R_len_t vectorize_length = stri__recycling_rule(true, 2, LENGTH(str), LENGTH(pattern)); STRI__ERROR_HANDLER_BEGIN(3) StriContainerUTF8 str_cont(str, vectorize_length); StriContainerCharClass pattern_cont(pattern, vectorize_length); SEXP ret; STRI__PROTECT(ret = Rf_allocVector(VECSXP, vectorize_length)); for (R_len_t i = pattern_cont.vectorize_init(); i != pattern_cont.vectorize_end(); i = pattern_cont.vectorize_next(i)) { if (pattern_cont.isNA(i) || str_cont.isNA(i)) { SET_VECTOR_ELT(ret, i, stri__vector_NA_strings(1)); continue; } R_len_t str_cur_n = str_cont.get(i).length(); const char* str_cur_s = str_cont.get(i).c_str(); deque< pair > occurrences; StriContainerCharClass::locateAll( occurrences, &pattern_cont.get(i), str_cur_s, str_cur_n, merge_cur, false /* byte-based indexes */ ); R_len_t noccurrences = (R_len_t)occurrences.size(); if (noccurrences == 0) { SET_VECTOR_ELT(ret, i, stri__vector_NA_strings(omit_no_match1?0:1)); continue; } SEXP cur_res; STRI__PROTECT(cur_res = Rf_allocVector(STRSXP, noccurrences)); deque< pair >::iterator iter = occurrences.begin(); for (R_len_t f = 0; iter != occurrences.end(); ++iter, ++f) { pair curo = *iter; SET_STRING_ELT(cur_res, f, Rf_mkCharLenCE(str_cur_s+curo.first, curo.second-curo.first, CE_UTF8)); } SET_VECTOR_ELT(ret, i, cur_res); STRI__UNPROTECT(1) } if (LOGICAL(simplify)[0] == NA_LOGICAL || LOGICAL(simplify)[0]) { SEXP robj_TRUE, robj_zero, robj_na_strings, robj_empty_strings; STRI__PROTECT(robj_TRUE = Rf_ScalarLogical(TRUE)); STRI__PROTECT(robj_zero = Rf_ScalarInteger(0)); STRI__PROTECT(robj_na_strings = stri__vector_NA_strings(1)); STRI__PROTECT(robj_empty_strings = stri__vector_empty_strings(1)); STRI__PROTECT(ret = stri_list2matrix(ret, robj_TRUE, (LOGICAL(simplify)[0] == NA_LOGICAL)?robj_na_strings :robj_empty_strings, robj_zero)); } STRI__UNPROTECT_ALL return ret; STRI__ERROR_HANDLER_END(;/* nothing special 
to be done on error */) } stringi/src/stri_sub.cpp0000644000176200001440000007033214770541312015067 0ustar liggesusers/* This file is part of the 'stringi' project. * Copyright (c) 2013-2025, Marek Gagolewski * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "stri_stringi.h" #include "stri_container_utf8_indexable.h" #include "stri_string8buf.h" #include /** * used both in stri_sub and stri_sub_replacement * * @return number of objects PROTECTEd * * @version ??? (Marek Gagolewski, 20??-??-??) 
* * @version 1.7.1 (Marek Gagolewski, 2021-06-30) allow (from,length) matrices * * @version 1.7.1 (Marek Gagolewski, 2021-07-08) use_matrix */ R_len_t stri__sub_prepare_from_to_length(SEXP& from, SEXP& to, SEXP& length, R_len_t& from_len, R_len_t& to_len, R_len_t& length_len, int*& from_tab, int*& to_tab, int*& length_tab, bool use_matrix_1) { R_len_t sub_protected = 0; bool from_ismatrix = use_matrix_1 && Rf_isMatrix(from); if (from_ismatrix) { SEXP t; PROTECT(t = Rf_getAttrib(from, R_DimSymbol)); if (INTEGER(t)[1] == 1) from_ismatrix = false; /* it's a column vector */ else if (INTEGER(t)[1] > 2) { /* error() is allowed here */ UNPROTECT(1); // t Rf_error(MSG__ARG_EXPECTED_MATRIX_WITH_GIVEN_COLUMNS, "from", 2); } UNPROTECT(1); // t } sub_protected++; PROTECT(from = stri__prepare_arg_integer(from, "from")); /* may remove R_DimSymbol */ if (from_ismatrix) { bool fromlength_matrix = false; SEXP t; PROTECT(t = Rf_getAttrib(from, R_DimNamesSymbol)); if (!Rf_isNull(t)) { SEXP t2; PROTECT(t2 = VECTOR_ELT(t, 1)); if ( Rf_isString(t2) && LENGTH(t2) == 2 && strcmp("length", CHAR(STRING_ELT(t2, 1))) == 0 ) { fromlength_matrix = true; } UNPROTECT(1); // t2 } UNPROTECT(1); // t if (fromlength_matrix) { from_len = LENGTH(from)/2; length_len = from_len; from_tab = INTEGER(from); length_tab = from_tab+from_len; } else { from_len = LENGTH(from)/2; to_len = from_len; from_tab = INTEGER(from); to_tab = from_tab+from_len; } //PROTECT(to); /* fake - not to provoke stack imbalance */ //PROTECT(length); /* fake - not to provoke stack imbalance */ } else if (Rf_isNull(length)) { sub_protected++; PROTECT(to = stri__prepare_arg_integer(to, "to")); from_len = LENGTH(from); from_tab = INTEGER(from); to_len = LENGTH(to); to_tab = INTEGER(to); //PROTECT(length); /* fake - not to provoke stack imbalance */ } else { sub_protected++; PROTECT(length= stri__prepare_arg_integer(length, "length")); from_len = LENGTH(from); from_tab = INTEGER(from); length_len = LENGTH(length); length_tab = 
INTEGER(length); //PROTECT(to); /* fake - not to provoke stack imbalance */ } return sub_protected; /* rchk reports that this function * [PB] has possible protection stack imbalance * * well, of course it does!! -> this is by design, UPROTECTing somewhere else. */ } /** * used both in stri_sub and stri_sub_replacement */ inline void stri__sub_get_indices(StriContainerUTF8_indexable& str_cont, R_len_t& i, R_len_t& cur_from, R_len_t& cur_to, R_len_t& cur_from2, R_len_t& cur_to2) { if (cur_from >= 0) { cur_from--; /* 1-based -> 0-based index */ cur_from2 = str_cont.UChar32_to_UTF8_index_fwd(i, cur_from); } else { cur_from = -cur_from; cur_from2 = str_cont.UChar32_to_UTF8_index_back(i, cur_from); } if (cur_to >= 0) { ; /* do nothing with cur_to ; 1-based -> 0-based index */ /* but +1 as we need the next one (bound) */ cur_to2 = str_cont.UChar32_to_UTF8_index_fwd(i, cur_to); } else { cur_to = -cur_to - 1; cur_to2 = str_cont.UChar32_to_UTF8_index_back(i, cur_to); } } /** * Get substring * * * @param str character vector * @param from integer vector (possibly with negative indices) * @param to integer vector (possibly with negative indices) or NULL * @param length integer vector or NULL * @return character vector * * @version 0.1-?? (Bartek Tartanus) * stri_sub * * @version 0.1-?? (Marek Gagolewski) * Use StriContainerUTF8 and stri__UChar32_to_UTF8_index * * @version 0.1-?? (Marek Gagolewski, 2013-06-01) * Use StriContainerUTF8's UChar32-to-UTF8 index * * @version 0.1-?? 
(Marek Gagolewski, 2013-06-16) * Make StriException-friendly * * @version 0.2-1 (Marek Gagolewski, 2014-03-20) * Use StriContainerUTF8_indexable * * @version 0.2-1 (Marek Gagolewski, 2014-04-03) * Use stri__sub_prepare_from_to_length() * * @version 0.3-1 (Marek Gagolewski, 2014-11-04) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc * * @version 0.5-9003 (Marek Gagolewski, 2015-08-05) * Bugfix #183: floating point exception when to or length is an empty vector * * @version 1.7.1 (Marek Gagolewski, 2021-06-28) * Negative length yields NA * * @version 1.7.1 (Marek Gagolewski, 2021-07-08) * use_matrix, ignore_negative_length */ SEXP stri_sub(SEXP str, SEXP from, SEXP to, SEXP length, SEXP use_matrix, SEXP ignore_negative_length) { PROTECT(str = stri__prepare_arg_string(str, "str")); bool use_matrix_1 = stri__prepare_arg_logical_1_notNA(use_matrix, "use_matrix"); bool ignore_negative_length_1 = stri__prepare_arg_logical_1_notNA(ignore_negative_length, "ignore_negative_length"); R_len_t str_len = LENGTH(str); R_len_t from_len = 0; R_len_t to_len = 0; R_len_t length_len = 0; int* from_tab = 0; int* to_tab = 0; int* length_tab = 0; R_len_t sub_protected = 1+ /* how many objects to PROTECT on ret? 
*/ stri__sub_prepare_from_to_length(from, to, length, from_len, to_len, length_len, from_tab, to_tab, length_tab, use_matrix_1); R_len_t vectorize_len = stri__recycling_rule(true, 3, str_len, from_len, (to_len>length_len)?to_len:length_len); if (vectorize_len <= 0) { UNPROTECT(sub_protected); return Rf_allocVector(STRSXP, 0); } STRI__ERROR_HANDLER_BEGIN(sub_protected) StriContainerUTF8_indexable str_cont(str, vectorize_len); SEXP ret; STRI__PROTECT(ret = Rf_allocVector(STRSXP, vectorize_len)); R_len_t num_negative_length = 0; for (R_len_t i = str_cont.vectorize_init(); i != str_cont.vectorize_end(); i = str_cont.vectorize_next(i)) { R_len_t cur_from = from_tab[i % from_len]; R_len_t cur_to = (to_tab)?to_tab[i % to_len]:length_tab[i % length_len]; if (str_cont.isNA(i) || cur_from == NA_INTEGER || cur_to == NA_INTEGER) { SET_STRING_ELT(ret, i, NA_STRING); continue; } if (length_tab) { if (cur_to == 0) { SET_STRING_ELT(ret, i, R_BlankString); continue; } else if (cur_to < 0) { SET_STRING_ELT(ret, i, NA_STRING); num_negative_length++; continue; } cur_to = cur_from + cur_to - 1; if (cur_from < 0 && cur_to >= 0) cur_to = -1; } const char* str_cur_s = str_cont.get(i).c_str(); R_len_t cur_from2; // UTF-8 byte indices R_len_t cur_to2; // UTF-8 byte indices stri__sub_get_indices(str_cont, i, cur_from, cur_to, cur_from2, cur_to2); if (cur_to2 > cur_from2) { // just copy SET_STRING_ELT(ret, i, Rf_mkCharLenCE(str_cur_s+cur_from2, cur_to2-cur_from2, CE_UTF8)); } else { // maybe a warning here? 
SET_STRING_ELT(ret, i, Rf_mkCharLen(NULL, 0)); } } if (num_negative_length > 0 && ignore_negative_length_1) { // stringx: ignore items corresponding to length<0 STRI_ASSERT(length_tab) SEXP ret_old = ret; STRI__PROTECT(ret = Rf_allocVector(STRSXP, vectorize_len-num_negative_length)); R_len_t k = 0; for (R_len_t i = str_cont.vectorize_init(); i != str_cont.vectorize_end(); i = str_cont.vectorize_next(i)) { R_len_t cur_from = from_tab[i % from_len]; R_len_t cur_to = length_tab[i % length_len]; if (!str_cont.isNA(i) && cur_from != NA_INTEGER && cur_to != NA_INTEGER && cur_to < 0) { // ignore } else { SET_STRING_ELT(ret, k, STRING_ELT(ret_old, i)); ++k; } } } STRI__UNPROTECT_ALL return ret; STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */) } /** * Substring replacement function * * * @param str character vector * @param from integer vector (possibly with negative indices) * @param to integer vector (possibly with negative indices) or NULL * @param length integer vector or NULL * @param omit_na logical scalar * @param value character vector replacement * @return character vector * * @version 0.1-?? (Bartek Tartanus) * * @version 0.1-?? (Marek Gagolewski) * use StriContainerUTF8 and stri__UChar32_to_UTF8_index * * @version 0.1-?? (Marek Gagolewski, 2013-06-01) * use StriContainerUTF8's UChar32-to-UTF8 index * * @version 0.1-?? 
(Marek Gagolewski, 2013-06-16) * make StriException-friendly * * @version 0.2-1 (Marek Gagolewski, 2014-03-20) * Use StriContainerUTF8_indexable * * @version 0.2-1 (Marek Gagolewski, 2014-04-03) * Use stri__sub_prepare_from_to_length() * * @version 0.3-1 (Marek Gagolewski, 2014-11-04) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc * * @version 0.5-9003 (Marek Gagolewski, 2015-08-05) * Bugfix #183: floating point exception when to or length is an empty vector * * @version 1.0-2 (Marek Gagolewski, 2016-01-31) * FR #199: new arg: `omit_na` * FR #207: allow insertions * * * @version 1.4.3 (Marek Gagolewski, 2019-03-12) * #346: na_omit for `value` * * @version 1.7.1 (Marek Gagolewski, 2021-06-28) * negative length does not alter input * * @version 1.7.1 (Marek Gagolewski, 2021-07-08) * use_matrix */ SEXP stri_sub_replacement(SEXP str, SEXP from, SEXP to, SEXP length, SEXP omit_na, SEXP value, SEXP use_matrix) { PROTECT(str = stri__prepare_arg_string(str, "str")); PROTECT(value = stri__prepare_arg_string(value, "value")); bool omit_na_1 = stri__prepare_arg_logical_1_notNA(omit_na, "omit_na"); bool use_matrix_1 = stri__prepare_arg_logical_1_notNA(use_matrix, "use_matrix"); R_len_t value_len = LENGTH(value); R_len_t str_len = LENGTH(str); R_len_t from_len = 0; // see below R_len_t to_len = 0; // see below R_len_t length_len = 0; // see below int* from_tab = 0; // see below int* to_tab = 0; // see below int* length_tab = 0; // see below R_len_t sub_protected = 2+ /* how many objects to PROTECT on ret? 
*/ stri__sub_prepare_from_to_length(from, to, length, from_len, to_len, length_len, from_tab, to_tab, length_tab, use_matrix_1); R_len_t vectorize_len = stri__recycling_rule(true, 4, str_len, value_len, from_len, (to_len>length_len)?to_len:length_len); if (vectorize_len <= 0) { UNPROTECT(sub_protected); return Rf_allocVector(STRSXP, 0); } STRI__ERROR_HANDLER_BEGIN(sub_protected) StriContainerUTF8_indexable str_cont(str, vectorize_len); StriContainerUTF8 value_cont(value, vectorize_len); SEXP ret; STRI__PROTECT(ret = Rf_allocVector(STRSXP, vectorize_len)); String8buf buf(0); // @TODO: estimate bufsize a priori for (R_len_t i = str_cont.vectorize_init(); i != str_cont.vectorize_end(); i = str_cont.vectorize_next(i)) { R_len_t cur_from = from_tab[i % from_len]; R_len_t cur_to = (to_tab)?to_tab[i % to_len]:length_tab[i % length_len]; if (str_cont.isNA(i)) { SET_STRING_ELT(ret, i, NA_STRING); continue; } if (cur_from == NA_INTEGER || cur_to == NA_INTEGER || value_cont.isNA(i)) { if (omit_na_1) { SET_STRING_ELT(ret, i, str_cont.toR(i)); } else { SET_STRING_ELT(ret, i, NA_STRING); } continue; } if (!to_tab && cur_to/*length*/ < 0) { // so not NA SET_STRING_ELT(ret, i, str_cont.toR(i)); continue; } if (length_tab) { if (cur_to <= 0) { // SET_STRING_ELT(ret, i, R_BlankString); // continue; cur_to = 0; } else { cur_to = cur_from + cur_to - 1; if (cur_from < 0 && cur_to >= 0) cur_to = -1; } } const char* str_cur_s = str_cont.get(i).c_str(); R_len_t str_cur_n = str_cont.get(i).length(); const char* value_cur_s = value_cont.get(i).c_str(); R_len_t value_cur_n = value_cont.get(i).length(); R_len_t cur_from2; // UTF-8 byte indices R_len_t cur_to2; // UTF-8 byte indices stri__sub_get_indices(str_cont, i, cur_from, cur_to, cur_from2, cur_to2); if (cur_to2 < cur_from2) cur_to2 = cur_from2; R_len_t buflen = str_cur_n-(cur_to2-cur_from2)+value_cur_n; buf.resize(buflen, false/*destroy contents*/); if (cur_from2 > 0) memcpy(buf.data(), str_cur_s, (size_t)cur_from2); if (value_cur_n > 0) 
memcpy(buf.data()+cur_from2, value_cur_s, (size_t)value_cur_n); if (str_cur_n-cur_to2 > 0) memcpy(buf.data()+cur_from2+value_cur_n, str_cur_s+cur_to2, (size_t)str_cur_n-cur_to2); SET_STRING_ELT(ret, i, Rf_mkCharLenCE(buf.data(), buflen, CE_UTF8)); } STRI__UNPROTECT_ALL return ret; STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */) } /** * Extract multiple substrings * * * @param str character vector * @param from list * @param to list * @param length list * @return list of character vectors * * @version 1.3.2 (Marek Gagolewski, 2019-02-21) * #30: new function * * @version 1.7.1 (Marek Gagolewski, 2021-06-28) * negative length yields NA * * @version 1.7.1 (Marek Gagolewski, 2021-07-08) * use_matrix, ignore_negative_length */ SEXP stri_sub_all(SEXP str, SEXP from, SEXP to, SEXP length, SEXP use_matrix, SEXP ignore_negative_length) { PROTECT(str = stri__prepare_arg_string(str, "str")); PROTECT(from = stri__prepare_arg_list(from, "from")); PROTECT(to = stri__prepare_arg_list(to, "to")); PROTECT(length = stri__prepare_arg_list(length, "length")); // bool use_matrix_1 = stri__prepare_arg_logical_1_notNA(use_matrix, "use_matrix"); R_len_t str_len = LENGTH(str); R_len_t from_len = LENGTH(from); // R_len_t to_len = LENGTH(to); // R_len_t length_len = LENGTH(length); R_len_t vectorize_len; if (!Rf_isNull(to)) vectorize_len = stri__recycling_rule(true, 3, str_len, from_len, LENGTH(to)); else if (!Rf_isNull(length)) vectorize_len = stri__recycling_rule(true, 3, str_len, from_len, LENGTH(length)); else vectorize_len = stri__recycling_rule(true, 2, str_len, from_len); if (vectorize_len <= 0) { UNPROTECT(4); return Rf_allocVector(VECSXP, 0); } // no STRI__ERROR_HANDLER_BEGIN block ---- stri_sub can longjmp with Rf_error... 
SEXP ret, str_tmp, tmp; PROTECT(ret = Rf_allocVector(VECSXP, vectorize_len)); //5 PROTECT(str_tmp = Rf_allocVector(STRSXP, 1)); //6 for (R_len_t i = 0; ilength_len)?to_len:length_len); if (vectorize_len <= 0) { // "nothing" is being replaced -> return the input as-is UNPROTECT(sub_protected); return curs; } if (value_len <= 0) { // things are supposed to be replaced with "nothing"... UNPROTECT(sub_protected); Rf_warning(MSG__REPLACEMENT_ZERO); return NA_STRING; } const char* curs_s = CHAR(curs); // already in UTF-8 // TODO: ALTREP will be problematic? R_len_t curs_n = LENGTH(curs); // first check for NAs.... if (!omit_na_1) { for (R_len_t i=0; i buf; // convenience >> speed R_len_t num_replaced = 0; R_len_t last_pos = 0; R_len_t byte_pos = 0; for (R_len_t i=0; i 0-based index if (cur_from >= curs_m) cur_from = curs_m; // cur_from is in [0, curs_m] if (length_tab) { if (cur_to < 0) cur_to = 0; cur_to = cur_from+cur_to; } else { if (cur_to < 0) cur_to = curs_m+cur_to+1; if (cur_to < cur_from) cur_to = cur_from; // insertion } if (cur_to >= curs_m) cur_to = curs_m; // the chunk to replace is at code points [cur_from, cur_to) // Rprintf("orig [%d,%d) repl [%d,%d)\n", last_pos, cur_from, cur_from, cur_to); if (last_pos > cur_from) throw StriException(MSG__OVERLAPPING_OR_UNSORTED_INDEXES); // first, copy [last_pos, cur_from) R_len_t byte_pos_last = byte_pos; while (last_pos < cur_from) { U8_FWD_1_UNSAFE(curs_s, byte_pos); ++last_pos; } if (byte_pos-byte_pos_last > 0) { R_len_t buf_size = buf.size(); buf.resize(buf_size+byte_pos-byte_pos_last); if (!buf.data() || !curs_s) throw StriException(MSG__MEM_ALLOC_ERROR); memcpy(buf.data()+buf_size, curs_s+byte_pos_last, byte_pos-byte_pos_last); } // then, copy the corresponding replacement string SEXP value_cur = STRING_ELT(value, i%value_len); const char* value_s = CHAR(value_cur); // TODO: ALTREP will be problematic? 
R_len_t value_n = LENGTH(value_cur); if (value_n > 0) { R_len_t buf_size = buf.size(); buf.resize(buf_size+value_n); if (!buf.data() || !value_s) throw StriException(MSG__MEM_ALLOC_ERROR); memcpy(buf.data()+buf_size, value_s, value_n); } // lastly, update last_pos // ---> last_pos = cur_to; while (last_pos < cur_to) { U8_FWD_1_UNSAFE(curs_s, byte_pos); ++last_pos; } } // finally, copy [last_pos, curs_m) if (curs_n-byte_pos > 0) { R_len_t buf_size = buf.size(); buf.resize(buf_size+curs_n-byte_pos); if (!buf.data() || !curs_s) throw StriException(MSG__MEM_ALLOC_ERROR); memcpy(buf.data()+buf_size, curs_s+byte_pos, curs_n-byte_pos); } // only warn if not NA if (num_replaced > 0 && vectorize_len % value_len != 0) Rf_warning(MSG__WARN_RECYCLING_RULE2); SEXP ret; STRI__PROTECT(ret = Rf_mkCharLenCE(buf.data(), buf.size(), CE_UTF8)); STRI__UNPROTECT_ALL return ret; STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */) } /** * Replace multiple substrings * * * @param str character vector * @param from integer vector (possibly with negative indices) * @param to integer vector (possibly with negative indices) or NULL * @param length integer vector or NULL * @param omit_na logical scalar * @param value character vector replacement * @return character vector * * @version 1.3.2 (Marek Gagolewski, 2019-02-22) * #30: new function * * * @version 1.4.3 (Marek Gagolewski, 2019-03-12) * #346: na_omit for `value` * * @version 1.7.1 (Marek Gagolewski, 2021-06-28) * negative length does not alter input * * @version 1.7.1 (Marek Gagolewski, 2021-07-08) * use_matrix */ SEXP stri_sub_replacement_all(SEXP str, SEXP from, SEXP to, SEXP length, SEXP omit_na, SEXP value, SEXP use_matrix) { //PROTECT(str = stri__prepare_arg_string(str, "str")); PROTECT(str = stri_enc_toutf8(str, Rf_ScalarLogical(FALSE), Rf_ScalarLogical(FALSE))); PROTECT(from = stri__prepare_arg_list(from, "from")); PROTECT(to = stri__prepare_arg_list(to, "to")); PROTECT(length = stri__prepare_arg_list(length, 
"length")); PROTECT(value = stri__prepare_arg_list(value, "value")); bool omit_na_1 = stri__prepare_arg_logical_1_notNA(omit_na, "omit_na"); bool use_matrix_1 = stri__prepare_arg_logical_1_notNA(use_matrix, "use_matrix"); R_len_t str_len = LENGTH(str); R_len_t from_len = LENGTH(from); R_len_t value_len = LENGTH(value); R_len_t vectorize_len; if (!Rf_isNull(to)) vectorize_len = stri__recycling_rule(true, 4, str_len, from_len, value_len, LENGTH(to)); else if (!Rf_isNull(length)) vectorize_len = stri__recycling_rule(true, 4, str_len, from_len, value_len, LENGTH(length)); else vectorize_len = stri__recycling_rule(true, 3, str_len, from_len, value_len); if (vectorize_len <= 0) { UNPROTECT(5); return Rf_allocVector(STRSXP, 0); } // no STRI__ERROR_HANDLER_BEGIN block ---- below we can longjmp with Rf_error... SEXP ret, curs, tmp; PROTECT(ret = Rf_allocVector(STRSXP, vectorize_len)); // 6 for (R_len_t i = 0; i * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef __stri_exception_h #define __stri_exception_h #include "stri_external.h" #include "stri_messages.h" #include using namespace std; #define StriException_BUFSIZE 4096 #define STRI__ERROR_HANDLER_BEGIN(nprotect) \ int __stri_protected_sexp_num = nprotect; \ char* __stri_error_msg = (char*)NULL; \ try { #define STRI__ERROR_HANDLER_END(cleanup) \ } \ catch (StriException e) { \ cleanup; \ STRI__UNPROTECT_ALL \ /*e.throwRerror()----don't do this, memleaks!: */ \ __stri_error_msg = R_alloc(StriException_BUFSIZE, 1); \ strncpy(__stri_error_msg, e.getMessage(), StriException_BUFSIZE); \ /*return R_NilValue;*/ \ } \ /* call Rf_error here, when e is deleted, no memleaks */ \ Rf_error("%s", __stri_error_msg); /* msg may feature %s */ \ /* to avoid compiler warning: */ \ return R_NilValue; #define STRI__PROTECT(s) { \ PROTECT(s); \ ++__stri_protected_sexp_num; } #ifndef NDEBUG #define STRI__UNPROTECT(n) { \ UNPROTECT(n); \ if (n > __stri_protected_sexp_num) \ Rf_warning("STRI__UNPROTECT: stack imbalance!"); \ __stri_protected_sexp_num -= n; } #else #define STRI__UNPROTECT(n) { \ UNPROTECT(n); \ __stri_protected_sexp_num -= n; } #endif #define STRI__UNPROTECT_ALL { \ UNPROTECT(__stri_protected_sexp_num); \ __stri_protected_sexp_num = 0; } #define STRI__CHECKICUSTATUS_THROW(status, onerror) { \ if (U_FAILURE(status)) { \ onerror; \ throw StriException(status); \ } \ } #define STRI__CHECKICUSTATUS_RFERROR(status, onerror) { \ if (U_FAILURE(status)) { 
\ onerror; \ Rf_error(MSG__ICU_ERROR, \ ICUError::getICUerrorName(status), \ u_errorName(status)); \ } \ } /** Translates ICU error code to an informative message. * * @version 1.4.7 (Marek Gagolewski, 2020-08-21) * make independent from StriException */ class ICUError { public: static const char* getICUerrorName(UErrorCode status); }; #ifndef NDEBUG /* *************** !NDEBUG *************************************************** */ #ifndef STRI_ASSERT #define __STRI_ASSERT_STR(x) #x #define STRI_ASSERT_STR(x) __STRI_ASSERT_STR(x) #define STRI_ASSERT(EXPR) { if (!(EXPR)) \ REprintf("stringi: Assertion %s failed in %s:%d", #EXPR, __FILE__, __LINE__); } #endif /** * A class representing exceptions for !NDEBUG * * @version 1.4.7 (Marek Gagolewski, 2020-08-21) * Improve !NDEBUG diagnostics * * @version 1.6.3 (Marek Gagolewski, 2021-05-21) snprintf */ class __StriException { private: char msg[StriException_BUFSIZE]; ///< message to be passed to error() public: __StriException(const char* file, int line, const char* format, ...) { snprintf(msg, StriException_BUFSIZE, "[!NDEBUG] Error in %s:%d: ", file, line); va_list args; va_start(args, format); R_len_t msg_size = strlen(msg); vsnprintf(msg+msg_size, StriException_BUFSIZE-msg_size, format, args); va_end(args); } __StriException(const char* file, int line, UErrorCode status, const char* context = NULL) { snprintf(msg, StriException_BUFSIZE, "[!NDEBUG: Error in %s:%d] ", file, line); R_len_t msg_size = strlen(msg); if (context) { snprintf(msg+msg_size, StriException_BUFSIZE-msg_size, MSG__ICU_ERROR_WITH_CONTEXT, ICUError::getICUerrorName(status), u_errorName(status), context); } else { snprintf(msg+msg_size, StriException_BUFSIZE-msg_size, MSG__ICU_ERROR, ICUError::getICUerrorName(status), u_errorName(status)); } } void throwRerror() { Rf_error("%s", msg); // avoids treating %'s as special chars } const char* getMessage() const { return msg; } }; #define StriException(...) 
__StriException(__FILE__, __LINE__, __VA_ARGS__) typedef __StriException StriException; /* *************** !NDEBUG *************************************************** */ #else /* *************** NDEBUG *************************************************** */ #ifndef STRI_ASSERT #define STRI_ASSERT(EXPR) { ; } #endif /** * A class representing exceptions * * @version 0.1-?? (Marek Gagolewski, 2013-06-16) * * @version 0.2-1 (Marek Gagolewski, 2014-04-18) * do not use R_alloc for msg * * @version 1.6.3 (Marek Gagolewski, 2021-05-21) snprintf */ class StriException { private: char msg[StriException_BUFSIZE]; ///< message to be passed to error() public: StriException(const char* format, ...) { va_list args; va_start(args, format); vsnprintf(msg, StriException_BUFSIZE, format, args); va_end(args); } StriException(UErrorCode status, const char* context = NULL) { if (context) { snprintf(msg, StriException_BUFSIZE, MSG__ICU_ERROR_WITH_CONTEXT, ICUError::getICUerrorName(status), u_errorName(status), context); } else { snprintf(msg, StriException_BUFSIZE, MSG__ICU_ERROR, ICUError::getICUerrorName(status), u_errorName(status)); } } void throwRerror() { Rf_error("%s", msg); // avoids treating %'s as special chars } const char* getMessage() const { return msg; } }; /* *************** NDEBUG *************************************************** */ #endif #endif stringi/src/stri_search_boundaries_extract.cpp0000644000176200001440000001527014770541312021510 0ustar liggesusers/* This file is part of the 'stringi' project. * Copyright (c) 2013-2025, Marek Gagolewski * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. 
Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "stri_stringi.h" #include "stri_container_utf8_indexable.h" #include "stri_container_integer.h" #include "stri_brkiter.h" /** * Extract first or last text between boundaries * * @param str character vector * @param opts_brkiter list * @param first looking for first or last match? 
* @return character vector * * @version 0.5-1 (Marek Gagolewski, 2014-12-19) */ SEXP stri__extract_firstlast_boundaries(SEXP str, SEXP opts_brkiter, bool first) { PROTECT(str = stri__prepare_arg_string(str, "str")); StriBrkIterOptions opts_brkiter2(opts_brkiter, "line_break"); STRI__ERROR_HANDLER_BEGIN(1) R_len_t str_length = LENGTH(str); StriContainerUTF8_indexable str_cont(str, str_length); StriRuleBasedBreakIterator brkiter(opts_brkiter2); SEXP ret; STRI__PROTECT(ret = Rf_allocVector(STRSXP, str_length)); for (R_len_t i = 0; i < str_length; ++i) { SET_STRING_ELT(ret, i, NA_STRING); if (str_cont.isNA(i) || str_cont.get(i).length() == 0) continue; brkiter.setupMatcher(str_cont.get(i).c_str(), str_cont.get(i).length()); pair curpair; if (first) { brkiter.first(); if (!brkiter.next(curpair)) continue; } else { brkiter.last(); if (!brkiter.previous(curpair)) continue; } SET_STRING_ELT(ret, i, Rf_mkCharLenCE(str_cont.get(i).c_str()+curpair.first, curpair.second-curpair.first, CE_UTF8)); } STRI__UNPROTECT_ALL return ret; STRI__ERROR_HANDLER_END( ;/* do nothing special on error */ ) } /** * Extract first text between boundaries * * @param str character vector * @param opts_brkiter list * @return character vector * * @version 0.5-1 (Marek Gagolewski, 2014-12-19) */ SEXP stri_extract_first_boundaries(SEXP str, SEXP opts_brkiter) { return stri__extract_firstlast_boundaries(str, opts_brkiter, true); } /** * Extract last text between boundaries * * @param str character vector * @param opts_brkiter list * @return character vector * * @version 0.5-1 (Marek Gagolewski, 2014-12-19) */ SEXP stri_extract_last_boundaries(SEXP str, SEXP opts_brkiter) { return stri__extract_firstlast_boundaries(str, opts_brkiter, false); } /** Extract all text between boundaries * * @param str character vector * @param simplify logical * @param omit_no_match logical * @param opts_brkiter named list * @return list or matrix * * @version 0.5-1 (Marek Gagolewski, 2014-12-19) */ SEXP 
stri_extract_all_boundaries(SEXP str, SEXP simplify, SEXP omit_no_match, SEXP opts_brkiter) { bool omit_no_match1 = stri__prepare_arg_logical_1_notNA(omit_no_match, "omit_no_match"); PROTECT(simplify = stri__prepare_arg_logical_1(simplify, "simplify")); PROTECT(str = stri__prepare_arg_string(str, "str")); StriBrkIterOptions opts_brkiter2(opts_brkiter, "line_break"); STRI__ERROR_HANDLER_BEGIN(2) R_len_t str_length = LENGTH(str); StriContainerUTF8_indexable str_cont(str, str_length); StriRuleBasedBreakIterator brkiter(opts_brkiter2); SEXP ret; STRI__PROTECT(ret = Rf_allocVector(VECSXP, str_length)); for (R_len_t i = 0; i < str_length; ++i) { if (str_cont.isNA(i)) { SET_VECTOR_ELT(ret, i, stri__vector_NA_strings(1)); continue; } brkiter.setupMatcher(str_cont.get(i).c_str(), str_cont.get(i).length()); brkiter.first(); deque< pair > occurrences; pair curpair; while (brkiter.next(curpair)) occurrences.push_back(curpair); R_len_t noccurrences = (R_len_t)occurrences.size(); if (noccurrences <= 0) { SET_VECTOR_ELT(ret, i, stri__vector_NA_strings(omit_no_match1?0:1)); continue; } const char* str_cur_s = str_cont.get(i).c_str(); SEXP cur_res; STRI__PROTECT(cur_res = Rf_allocVector(STRSXP, noccurrences)); deque< pair >::iterator iter = occurrences.begin(); for (R_len_t j = 0; iter != occurrences.end(); ++iter, ++j) { pair curo = *iter; SET_STRING_ELT(cur_res, j, Rf_mkCharLenCE(str_cur_s+curo.first, curo.second-curo.first, CE_UTF8)); } SET_VECTOR_ELT(ret, i, cur_res); STRI__UNPROTECT(1); } if (LOGICAL(simplify)[0] == NA_LOGICAL || LOGICAL(simplify)[0]) { SEXP robj_TRUE, robj_zero, robj_na_strings, robj_empty_strings; STRI__PROTECT(robj_TRUE = Rf_ScalarLogical(TRUE)); STRI__PROTECT(robj_zero = Rf_ScalarInteger(0)); STRI__PROTECT(robj_na_strings = stri__vector_NA_strings(1)); STRI__PROTECT(robj_empty_strings = stri__vector_empty_strings(1)); STRI__PROTECT(ret = stri_list2matrix(ret, robj_TRUE, (LOGICAL(simplify)[0] == NA_LOGICAL)?robj_na_strings :robj_empty_strings, robj_zero)); 
} STRI__UNPROTECT_ALL return ret; STRI__ERROR_HANDLER_END({/* no-op */}) } stringi/src/stri_container_integer.h0000644000176200001440000000727014770541312017443 0ustar liggesusers/* This file is part of the 'stringi' project. * Copyright (c) 2013-2025, Marek Gagolewski * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef __stri_container_integer_h #define __stri_container_integer_h #include "stri_container_base.h" /** * A wrapper-class for R integer vectors * * @version 0.1-?? 
(Marek Gagolewski, 2013-06-15) */ class StriContainerInteger : public StriContainerBase { private: int* data; public: StriContainerInteger() : StriContainerBase() { data = NULL; } StriContainerInteger(SEXP rvec, R_len_t _nrecycle) { this->data = NULL; #ifndef NDEBUG if (!Rf_isInteger(rvec)) throw StriException("DEBUG: !isInteger in StriContainerInteger"); #endif R_len_t ndata = LENGTH(rvec); this->init_Base(ndata, _nrecycle, true); this->data = INTEGER(rvec); // TODO: ALTREP will be problematic? } // StriContainerInteger(StriContainerInteger& container); // default-shallow // ~StriContainerInteger(); // default-shallow // StriContainerInteger& operator=(StriContainerInteger& container); // default-shallow /** check if the vectorized ith element is NA * @param i index * @return true if is NA */ inline bool isNA(R_len_t i) const { #ifndef NDEBUG if (i < 0 || i >= nrecycle) throw StriException("StriContainerInteger::isNA(): INDEX OUT OF BOUNDS"); #endif return (data[i%n] == NA_INTEGER); } /** get the vectorized ith element * * @param i index * @return integer */ inline int get(R_len_t i) const { #ifndef NDEBUG if (i < 0 || i >= nrecycle) throw StriException("StriContainerInteger::get(): INDEX OUT OF BOUNDS"); if (data[i%n] == NA_INTEGER) throw StriException("StriContainerInteger::get(): isNA"); #endif return (data[i%n]); } /** get the vectorized ith element, no NA check here * * @param i index * @return integer */ inline int getNAble(R_len_t i) const { #ifndef NDEBUG if (i < 0 || i >= nrecycle) throw StriException("StriContainerInteger::get(): INDEX OUT OF BOUNDS"); #endif return (data[i%n]); } }; #endif stringi/src/stri_intvec.h0000644000176200001440000000564514770541312015240 0ustar liggesusers/* This file is part of the 'stringi' project. * Copyright (c) 2013-2025, Marek Gagolewski * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. 
Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef __stri_intvec_h #define __stri_intvec_h #include "stri_stringi.h" /** * A class to represent an integer vector or NULL/NA * * Currently each int vector is read-only. * It may be used as a simple wrapper for R integer vectors. 
* * @version 0.2-1 (Marek Gagolewski, 2014-03-25) */ class IntVec { private: const int* m_data; R_len_t m_n; public: /** default constructor * */ IntVec() { this->m_data = NULL; this->m_n = 0; } /** used to set data * * @param data * @param n */ void initialize(const int* data, R_len_t n) { this->m_data = data; this->m_n = n; } /** constructor * @param data buffer * @param n buffer length (not including NUL) */ IntVec(const int* data, R_len_t n) // TODO: bool memalloc { initialize(data, n); } inline bool isNA() const { return m_data == NULL; } inline const int* data() const { #ifndef NDEBUG if (isNA()) throw StriException("IntVec::isNA() in data()"); #endif return this->m_data; } inline R_len_t size() const { #ifndef NDEBUG if (isNA()) throw StriException("IntVec::isNA() in size()"); #endif return this->m_n; } }; #endif stringi/src/stri_string8.h0000644000176200001440000001557214770541312015346 0ustar liggesusers/* This file is part of the 'stringi' project. * Copyright (c) 2013-2025, Marek Gagolewski * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef __stri_string8_h #define __stri_string8_h #include "stri_stringi.h" #include /** * A class to represent a (TODO: read-only?) UTF-8 string. * * It can mark string as missing (NA), count the number of Unicode code points, * remove BOMs * * @version 0.1-?? (Marek Gagolewski) * * @version 0.1-?? (Marek Gagolewski, 2013-06-13) * added resize() method and m_size field * * @version 0.1-24 (Marek Gagolewski, 2014-03-11) * Fixed array over-runs detected with valgrind * * @version 0.2-1 (Marek Gagolewski, 2014-03-15) * m_str == NULL now denotes a missing value, * isNA(), initialize() methods added * * @version 0.2-1 (Marek Gagolewski, 2014-03-23) * initialize() now can kill UTF8 BOMs. * separated String8buf * * @version 0.2-2 (Marek Gagolewski, 2014-04-20) * new method: countCodePoints() * * @version 0.3-1 (Marek Gagolewski, 2014-11-02) * BUGFIX?: Added explicit zero bytes at the end of each array; * new methods: replaceAllAtPos(), setNA() * * @version 0.5-1 (Marek Gagolewski, 2015-02-14) * new field: m_isASCII */ class String8 { private: char* m_str; ///< character data in UTF-8, NULL denotes NA R_len_t m_n; ///< string length (in bytes), not including NUL bool m_memalloc; ///< should the memory be freed at the end? bool m_isASCII; ///< ASCII or UTF-8? TODO: is it used anywhere? 
public: /** default constructor * */ String8() { this->m_str = NULL; // a missing value this->m_n = 0; this->m_memalloc = false; this->m_isASCII = false; } /** used to set data (construct already created, * but NA-initialized object) * * @param str character buffer * @param n buffer length (not including NUL) * @param memalloc should a deep copy of the buffer be done? * @param killbom whether to detect and delete UTF-8 BOMs * @param isASCII */ void initialize(const char* str, R_len_t n, bool memalloc, bool killbom, bool isASCII); /** constructor * @param str character buffer * @param n buffer length (not including NUL) * @param memalloc should a deep copy of the buffer be done? * @param killbom whether to detect and delete UTF-8 BOMs * @param isASCII */ String8(const char* str, R_len_t n, bool memalloc, bool killbom, bool isASCII) { this->m_str = NULL; // a missing value initialize(str, n, memalloc, killbom, isASCII); } /** destructor */ ~String8() { if (this->m_str && this->m_memalloc) { delete [] this->m_str; } this->m_str = NULL; } /** destructor */ inline void setNA() { if (this->m_str) { if (this->m_memalloc) { delete [] this->m_str; } this->m_str = NULL; } } /** copy constructor */ String8(const String8& s); /** copy */ String8& operator=(const String8& s); /** does this String8 represent a missing value? */ inline bool isNA() const { return !this->m_str; } /** is this String8 in ASCII? */ inline bool isASCII() const { return this->m_isASCII; } /** is this String8 in UTF-8? */ inline bool isUTF8() const { return !this->m_isASCII; } /** Misleading name: did we allocate mem in String8 * or is this string a shallow copy of some "external" resource? 
*/ inline bool isReadOnly() const { return !this->m_memalloc; } /** return the char buffer */ inline const char* c_str() const { #ifndef NDEBUG if (isNA()) throw StriException("String8::isNA() in c_str()"); #endif return this->m_str; } /** string length in bytes */ inline R_len_t length() const { #ifndef NDEBUG if (isNA()) throw StriException("String8::isNA() in length()"); #endif return this->m_n; } /** The number of Unicode code points */ inline R_len_t countCodePoints() const { #ifndef NDEBUG if (isNA()) throw StriException("String8::isNA() in countCodePoints()"); #endif if (m_isASCII) return m_n; else return stri__length_string(m_str, m_n); } /** * * @version 0.4-1 (Marek Gagolewski, 2014-12-07) * * @version 0.5-1 (Marek Gagolewski, 2015-02-14) * moved from StriContainerByteSearch to String8 */ bool endsWith( R_len_t byteindex, const char* patternStr, R_len_t patternLen, bool caseInsensitive ) const; /** * * @version 0.4-1 (Marek Gagolewski, 2014-12-07) * * @version 0.5-1 (Marek Gagolewski, 2015-02-14) * moved from StriContainerByteSearch to String8 */ bool startsWith( R_len_t byteindex, const char* patternStr, R_len_t patternLen, bool caseInsensitive ) const; /** Replace substrings with a given replacement string * * * @version 0.3-1 (Marek Gagolewski, 2014-11-02) */ void replaceAllAtPos( R_len_t buf_size, const char* replacement_cur_s, R_len_t replacement_cur_n, std::deque< std::pair >& occurrences ); }; #endif stringi/src/stri_exception.cpp0000644000176200001440000004070514750110642016271 0ustar liggesusers/* This file is part of the 'stringi' project. * Copyright (c) 2013-2025, Marek Gagolewski * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. 
Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "stri_stringi.h" /** Get ICU error name * * @param status error code * @return string * * @version 0.1-?? 
(Marek Gagolewski) * * @version 0.4-1 (Marek Gagolewski, 2014-11-30) * ICU warning info added * * @version 1.1.2 (Marek Gagolewski, 2017-01-07) _LIMIT, _COUNT deprecated * * @version 1.4.7 (Marek Gagolewski, 2020-08-21) Rename class, make * independent from StriException * * TODO: these should all start with a lowercase letter and not end with a dot */ const char* ICUError::getICUerrorName(UErrorCode status) { switch(status) { case U_USING_FALLBACK_WARNING: return "A resource bundle lookup returned a result from a fallback (more general) locale."; // (not an error) case U_USING_DEFAULT_WARNING: return "A resource bundle lookup returned a result either from the root or the default locale."; // (not an error) case U_SAFECLONE_ALLOCATED_WARNING: return "A SafeClone operation required allocating memory. (informational only)"; case U_STATE_OLD_WARNING: return "ICU has to use compatibility layer to construct the service. Expect performance/memory usage degradation. Consider upgrading"; case U_STRING_NOT_TERMINATED_WARNING: return "An output string could not be NUL-terminated because output length==destCapacity."; case U_SORT_KEY_TOO_SHORT_WARNING: return "Number of levels requested in getBound is higher than the number of levels in the sort key."; case U_AMBIGUOUS_ALIAS_WARNING: return "This converter alias can go to different converter implementations."; case U_DIFFERENT_UCA_VERSION: return "ucol_open encountered a mismatch between UCA version and collator image version, so the collator was constructed from rules. No impact to further function."; case U_PLUGIN_CHANGED_LEVEL_WARNING: return "A plugin caused a level change. May not be an error, but later plugins may not load."; case U_ZERO_ERROR: return "No error, no warning (Why have stringi reported this? 
Please file a bug report.)"; case U_ILLEGAL_ARGUMENT_ERROR: return "Illegal argument."; case U_MISSING_RESOURCE_ERROR: return "The requested ICU resource cannot be found."; case U_INVALID_FORMAT_ERROR: return "Data format is not what is expected."; case U_FILE_ACCESS_ERROR: // udata.cpp only return "The requested ICU resource file cannot be found."; case U_INTERNAL_PROGRAM_ERROR: return "Internal ICU error, might be a bug in the library code."; case U_MESSAGE_PARSE_ERROR: return "Unable to parse a message (message format)."; case U_MEMORY_ALLOCATION_ERROR: return "Memory allocation error."; case U_INDEX_OUTOFBOUNDS_ERROR: return "Trying to access the index that is out of bounds."; case U_PARSE_ERROR: return "ICU Parse error."; case U_INVALID_CHAR_FOUND: return "Character conversion: Unmappable input sequence / Invalid character."; case U_TRUNCATED_CHAR_FOUND: return "Character conversion: Incomplete input sequence."; case U_ILLEGAL_CHAR_FOUND: return "Character conversion: Illegal input sequence/combination of input units."; case U_INVALID_TABLE_FORMAT: return "Conversion table file found, but corrupted."; case U_INVALID_TABLE_FILE: return "Conversion table file not found."; case U_BUFFER_OVERFLOW_ERROR: return "A result would not fit in the supplied buffer."; case U_UNSUPPORTED_ERROR: return "Requested operation not supported in current context."; case U_RESOURCE_TYPE_MISMATCH: return "An operation is requested over a resource that does not support it"; case U_ILLEGAL_ESCAPE_SEQUENCE: return "ISO-2022 illegal escape sequence."; case U_UNSUPPORTED_ESCAPE_SEQUENCE: return "ISO-2022 unsupported escape sequence."; case U_NO_SPACE_AVAILABLE: return "No space available for in-buffer expansion for Arabic shaping."; case U_CE_NOT_FOUND_ERROR: return "Currently used only while setting variable top, but can be used generally."; case U_PRIMARY_TOO_LONG_ERROR: return "User tried to set variable top to a primary that is longer than two bytes."; case U_STATE_TOO_OLD_ERROR: 
return "ICU cannot construct a service from this state, as it is no longer supported."; case U_TOO_MANY_ALIASES_ERROR: return "There are too many aliases in the path to the requested resource. It is very possible that a circular alias definition has occurred"; case U_ENUM_OUT_OF_SYNC_ERROR: return "UEnumeration out of sync with underlying collection."; case U_INVARIANT_CONVERSION_ERROR: return "Unable to convert a UChar* string to char* with the invariant converter."; case U_INVALID_STATE_ERROR: return "Requested operation can not be completed with ICU in its current state."; case U_COLLATOR_VERSION_MISMATCH: return "Collator version is not compatible with the base version."; case U_USELESS_COLLATOR_ERROR: return "Collator is options only and no base is specified."; case U_NO_WRITE_PERMISSION: return "Attempt to modify read-only or constant data."; // case U_STANDARD_ERROR_LIMIT: // return "This must always be the last value to indicate the limit for standard errors."; case U_BAD_VARIABLE_DEFINITION: return "Missing '$' or duplicate variable name."; // case U_PARSE_ERROR_START: // return "Start of Transliterator errors."; case U_MALFORMED_RULE: return "Elements of a rule are misplaced."; case U_MALFORMED_SET: return "A UnicodeSet pattern is invalid."; // case U_MALFORMED_SYMBOL_REFERENCE: // return "UNUSED as of ICU 2.4."; case U_MALFORMED_UNICODE_ESCAPE: return "A Unicode escape pattern is invalid."; case U_MALFORMED_VARIABLE_DEFINITION: return "A variable definition is invalid."; case U_MALFORMED_VARIABLE_REFERENCE: return "A variable reference is invalid."; // case U_MISMATCHED_SEGMENT_DELIMITERS: // return "UNUSED as of ICU 2.4."; case U_MISPLACED_ANCHOR_START: return "A start anchor appears at an illegal position."; case U_MISPLACED_CURSOR_OFFSET: return "A cursor offset occurs at an illegal position."; case U_MISPLACED_QUANTIFIER: return "A quantifier appears after a segment close delimiter."; case U_MISSING_OPERATOR: return "A rule contains no operator."; // 
case U_MISSING_SEGMENT_CLOSE: // return "UNUSED as of ICU 2.4."; case U_MULTIPLE_ANTE_CONTEXTS: return "More than one ante context."; case U_MULTIPLE_CURSORS: return "More than one cursor."; case U_MULTIPLE_POST_CONTEXTS: return "More than one post context."; case U_TRAILING_BACKSLASH: return "A dangling backslash."; case U_UNDEFINED_SEGMENT_REFERENCE: return "A segment reference does not correspond to a defined segment."; case U_UNDEFINED_VARIABLE: return "A variable reference does not correspond to a defined variable."; case U_UNQUOTED_SPECIAL: return "A special character was not quoted or escaped."; case U_UNTERMINATED_QUOTE: return "A closing single quote is missing."; case U_RULE_MASK_ERROR: return "A rule is hidden by an earlier more general rule."; case U_MISPLACED_COMPOUND_FILTER: return "A compound filter is in an invalid location."; case U_MULTIPLE_COMPOUND_FILTERS: return "More than one compound filter."; case U_INVALID_RBT_SYNTAX: return "A '::id' rule was passed to the RuleBasedTransliterator parser."; // case U_INVALID_PROPERTY_PATTERN: // return "UNUSED as of ICU 2.4."; case U_MALFORMED_PRAGMA: return "A 'use' pragma is invlalid."; case U_UNCLOSED_SEGMENT: return "A closing ')' is missing."; // case U_ILLEGAL_CHAR_IN_SEGMENT: // return "UNUSED as of ICU 2.4."; case U_VARIABLE_RANGE_EXHAUSTED: return "Too many stand-ins generated for the given variable range."; case U_VARIABLE_RANGE_OVERLAP: return "The variable range overlaps characters used in rules."; case U_ILLEGAL_CHARACTER: return "A special character is outside its allowed context."; case U_INTERNAL_TRANSLITERATOR_ERROR: return "Internal transliterator system error."; case U_INVALID_ID: return "A '::id' rule specifies an unknown transliterator."; case U_INVALID_FUNCTION: return "A '&fn()' rule specifies an unknown transliterator."; // case U_PARSE_ERROR_LIMIT: // return "The limit for Transliterator errors."; case U_UNEXPECTED_TOKEN: return "Syntax error in format pattern."; // case 
U_FMT_PARSE_ERROR_START: // return "Start of format library errors."; case U_MULTIPLE_DECIMAL_SEPARATORS: // case U_MULTIPLE_DECIMAL_SEPERATORS: return "More than one decimal separator in number pattern."; case U_MULTIPLE_EXPONENTIAL_SYMBOLS: return "More than one exponent symbol in number pattern."; case U_MALFORMED_EXPONENTIAL_PATTERN: return "Grouping symbol in exponent pattern."; case U_MULTIPLE_PERCENT_SYMBOLS: return "More than one percent symbol in number pattern."; case U_MULTIPLE_PERMILL_SYMBOLS: return "More than one permill symbol in number pattern."; case U_MULTIPLE_PAD_SPECIFIERS: return "More than one pad symbol in number pattern."; case U_PATTERN_SYNTAX_ERROR: return "Syntax error in format pattern."; case U_ILLEGAL_PAD_POSITION: return "Pad symbol misplaced in number pattern."; case U_UNMATCHED_BRACES: return "Braces do not match in message pattern."; // case U_UNSUPPORTED_PROPERTY: // return "UNUSED as of ICU 2.4."; // case U_UNSUPPORTED_ATTRIBUTE: // return "UNUSED as of ICU 2.4."; case U_ARGUMENT_TYPE_MISMATCH: return "Argument name and argument index mismatch in MessageFormat functions."; case U_DUPLICATE_KEYWORD: return "Duplicate keyword in PluralFormat."; case U_UNDEFINED_KEYWORD: return "Undefined Plural keyword."; case U_DEFAULT_KEYWORD_MISSING: return "Missing DEFAULT rule in plural rules."; case U_DECIMAL_NUMBER_SYNTAX_ERROR: return "Decimal number syntax error."; case U_FORMAT_INEXACT_ERROR: return "Cannot format a number exactly and rounding mode is ROUND_UNNECESSARY."; // case U_FMT_PARSE_ERROR_LIMIT: // return "The limit for format library errors."; case U_BRK_INTERNAL_ERROR: return "An internal error (bug) was detected in ICU."; // case U_BRK_ERROR_START: // return "Start of codes indicating Break Iterator failures."; case U_BRK_HEX_DIGITS_EXPECTED: return "Hex digits expected as part of a escaped char in a rule."; case U_BRK_SEMICOLON_EXPECTED: return "Missing ';' at the end of a RBBI rule."; case U_BRK_RULE_SYNTAX: return "Syntax 
error in RBBI rule."; case U_BRK_UNCLOSED_SET: return "UnicodeSet witing an RBBI rule missing a closing ']'."; case U_BRK_ASSIGN_ERROR: return "Syntax error in RBBI rule assignment statement."; case U_BRK_VARIABLE_REDFINITION: return "RBBI rule $Variable redefined."; case U_BRK_MISMATCHED_PAREN: return "Mis-matched parentheses in an RBBI rule."; case U_BRK_NEW_LINE_IN_QUOTED_STRING: return "Missing closing quote in an RBBI rule."; case U_BRK_UNDEFINED_VARIABLE: return "Use of an undefined $Variable in an RBBI rule."; case U_BRK_INIT_ERROR: return "Initialization failure. Probable missing ICU Data."; case U_BRK_RULE_EMPTY_SET: return "Rule contains an empty Unicode Set."; case U_BRK_UNRECOGNIZED_OPTION: return "!!option in RBBI rules not recognized."; case U_BRK_MALFORMED_RULE_TAG: return "The {nnn} tag on a rule is malformed."; // case U_BRK_ERROR_LIMIT: // return "This must always be the last value to indicate the limit for Break Iterator failures."; case U_REGEX_INTERNAL_ERROR: return "An internal error (bug) was detected in ICU."; // case U_REGEX_ERROR_START: // return "Start of codes indicating regex failures."; case U_REGEX_RULE_SYNTAX: return "Syntax error in regex pattern."; case U_REGEX_INVALID_STATE: return "RegexMatcher in invalid state for requested operation."; case U_REGEX_BAD_ESCAPE_SEQUENCE: return "Unrecognized backslash escape sequence in pattern."; case U_REGEX_PROPERTY_SYNTAX: return "Incorrect Unicode property."; case U_REGEX_UNIMPLEMENTED: return "Use of regex feature that is not yet implemented."; case U_REGEX_MISMATCHED_PAREN: return "Incorrectly nested parentheses in regex pattern."; case U_REGEX_NUMBER_TOO_BIG: return "Decimal number is too large."; case U_REGEX_BAD_INTERVAL: return "Error in {min,max} interval."; case U_REGEX_MAX_LT_MIN: return "In {min,max}, max is less than min."; case U_REGEX_INVALID_BACK_REF: return "Back-reference to a non-existent capture group."; case U_REGEX_INVALID_FLAG: return "Invalid value for match mode 
flags."; case U_REGEX_LOOK_BEHIND_LIMIT: return "Look-Behind pattern matches must have a bounded maximum length."; case U_REGEX_SET_CONTAINS_STRING: return "Regexes cannot have UnicodeSets containing strings."; // case U_REGEX_OCTAL_TOO_BIG: //Deprecated ICU 54. This error cannot occur. // return "Octal character constants must be <= 0377."; case U_REGEX_MISSING_CLOSE_BRACKET: return "Missing closing bracket on a bracket expression."; case U_REGEX_INVALID_RANGE: return "In a character range [x-y], x is greater than y."; case U_REGEX_STACK_OVERFLOW: return "Regular expression backtrack stack overflow."; case U_REGEX_TIME_OUT: return "Maximum allowed match time exceeded."; case U_REGEX_STOPPED_BY_CALLER: return "Matching operation aborted by user callback fn."; // case U_REGEX_ERROR_LIMIT: // return "This must always be the last value to indicate the limit for regex errors."; // case U_PLUGIN_ERROR_START: // return "Start of codes indicating plugin failures."; case U_PLUGIN_TOO_HIGH: return "The plugin's level is too high to be loaded right now."; case U_PLUGIN_DIDNT_SET_LEVEL: return "The plugin didn't call uplug_setPlugLevel in response to a QUERY."; // case U_PLUGIN_ERROR_LIMIT: // return "This must always be the last value to indicate the limit for plugin errors. "; #if U_ICU_VERSION_MAJOR_NUM>=55 case U_REGEX_PATTERN_TOO_BIG: return "Pattern exceeds limits on size or complexity."; case U_REGEX_INVALID_CAPTURE_GROUP_NAME: return "Invalid capture group name."; #endif default: return "Unknown ICU error or warning."; } } stringi/src/stri_container_regex.h0000644000176200001440000000674114770541312017122 0ustar liggesusers/* This file is part of the 'stringi' project. * Copyright (c) 2013-2025, Marek Gagolewski * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. 
Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef __stri_container_regex_h #define __stri_container_regex_h #include #include #include #include "stri_container_utf16.h" /** Represents Regex Matcher's settings * * @version 1.4.7 (Marek Gagolewski, 2020-08-24) */ struct StriRegexMatcherOptions { uint32_t flags; int32_t stack_limit; int32_t time_limit; }; /** * A class to handle regex searches * * @version 0.1-?? 
(Marek Gagolewski, 2013-06-17) * * @version 0.2-1 (Marek Gagolewski, 2014-04-18) * BUGFIX: memleaks on StriException * * @version 0.3-1 (Marek Gagolewski, 2014-05-27) * BUGFIX: invalid matcher reuse on empty search string * * @version 1.3.1 (Marek Gagolewski, 2019-02-06) * #337: warn on empty search pattern here * * @version 1.7.1 (Marek Gagolewski, 2021-06-19) * #153: extract capture group names */ class StriContainerRegexPattern : public StriContainerUTF16 { private: StriRegexMatcherOptions opts; ///< RegexMatcher options RegexMatcher* lastMatcher; ///< recently used RegexMatcher R_len_t lastMatcherIndex; ///< used by vectorize_getMatcher std::vector lastCaptureGroupNames; R_len_t lastCaptureGroupNamesIndex; public: static StriRegexMatcherOptions getRegexOptions(SEXP opts_regex); StriContainerRegexPattern(); StriContainerRegexPattern(SEXP rstr, R_len_t nrecycle, StriRegexMatcherOptions opts); StriContainerRegexPattern(StriContainerRegexPattern& container); ~StriContainerRegexPattern(); StriContainerRegexPattern& operator=(StriContainerRegexPattern& container); RegexMatcher* getMatcher(R_len_t i); const std::vector& getCaptureGroupNames(R_len_t i); SEXP getCaptureGroupRDimnames(R_len_t i, R_len_t last_i=-1, SEXP ret=R_NilValue); SEXP getCaptureGroupRNames(R_len_t i); // TODO: allow reuse }; #endif stringi/src/stri_search_class_startsendswith.cpp0000644000176200001440000001555114770541312022100 0ustar liggesusers/* This file is part of the 'stringi' project. * Copyright (c) 2013-2025, Marek Gagolewski * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. 
Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "stri_stringi.h" #include "stri_container_utf8_indexable.h" #include "stri_container_integer.h" #include "stri_container_charclass.h" /** * Detect if a string starts with a pattern match * * @param str character vector * @param pattern character vector * @param from integer vector * @return logical vector * * @version 0.3-1 (Marek Gagolewski, 2014-10-31) * * @version 0.3-1 (Marek Gagolewski, 2014-11-04) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc * * @version 1.4.7 (Marek Gagolewski, 2020-08-24) * #345: `negate` arg added */ SEXP stri_startswith_charclass(SEXP str, SEXP pattern, SEXP from, SEXP negate) { bool negate_1 = stri__prepare_arg_logical_1_notNA(negate, "negate"); PROTECT(str = stri__prepare_arg_string(str, "str")); PROTECT(pattern = stri__prepare_arg_string(pattern, "pattern")); PROTECT(from = stri__prepare_arg_integer(from, "from")); STRI__ERROR_HANDLER_BEGIN(3) int vectorize_length = stri__recycling_rule(true, 3, LENGTH(str), LENGTH(pattern), LENGTH(from)); StriContainerUTF8_indexable str_cont(str, vectorize_length); StriContainerCharClass pattern_cont(pattern, vectorize_length); StriContainerInteger from_cont(from, vectorize_length); SEXP ret; STRI__PROTECT(ret = Rf_allocVector(LGLSXP, vectorize_length)); int* ret_tab = LOGICAL(ret); for (R_len_t i = pattern_cont.vectorize_init(); i != pattern_cont.vectorize_end(); i = pattern_cont.vectorize_next(i)) { if (str_cont.isNA(i) || pattern_cont.isNA(i) || from_cont.isNA(i)) { ret_tab[i] = NA_LOGICAL; continue; } R_len_t from_cur = from_cont.get(i); if (from_cur == 1) from_cur = 0; /* most commonly used case */ else if (from_cur >= 0) from_cur = str_cont.UChar32_to_UTF8_index_fwd(i, from_cur-1); else from_cur = str_cont.UChar32_to_UTF8_index_back(i, -from_cur); // now surely from_cur >= 0 && from_cur <= cur_n const char* str_cur_s = str_cont.get(i).c_str(); R_len_t str_cur_n = str_cont.get(i).length(); const UnicodeSet* pattern_cur = &pattern_cont.get(i); if (from_cur > 
str_cur_n) ret_tab[i] = negate_1; else { UChar32 chr = 0; U8_NEXT(str_cur_s, from_cur, str_cur_n, chr); if (chr < 0) // invalid utf-8 sequence throw StriException(MSG__INVALID_UTF8); ret_tab[i] = pattern_cur->contains(chr); if (negate_1) ret_tab[i] = !ret_tab[i]; } } STRI__UNPROTECT_ALL return ret; STRI__ERROR_HANDLER_END( ;/* do nothing special on error */ ) } /** * Detect if a string ends with a pattern match * * @param str character vector * @param pattern character vector * @param to integer vector * @return logical vector * * @version 0.3-1 (Marek Gagolewski, 2014-10-31) * * @version 0.3-1 (Marek Gagolewski, 2014-11-04) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc * * @version 1.4.7 (Marek Gagolewski, 2020-08-24) * #345: `negate` arg added */ SEXP stri_endswith_charclass(SEXP str, SEXP pattern, SEXP to, SEXP negate) { bool negate_1 = stri__prepare_arg_logical_1_notNA(negate, "negate"); PROTECT(str = stri__prepare_arg_string(str, "str")); PROTECT(pattern = stri__prepare_arg_string(pattern, "pattern")); PROTECT(to = stri__prepare_arg_integer(to, "to")); STRI__ERROR_HANDLER_BEGIN(3) int vectorize_length = stri__recycling_rule(true, 3, LENGTH(str), LENGTH(pattern), LENGTH(to)); StriContainerUTF8_indexable str_cont(str, vectorize_length); StriContainerCharClass pattern_cont(pattern, vectorize_length); StriContainerInteger to_cont(to, vectorize_length); SEXP ret; STRI__PROTECT(ret = Rf_allocVector(LGLSXP, vectorize_length)); int* ret_tab = LOGICAL(ret); for (R_len_t i = pattern_cont.vectorize_init(); i != pattern_cont.vectorize_end(); i = pattern_cont.vectorize_next(i)) { if (str_cont.isNA(i) || pattern_cont.isNA(i) || to_cont.isNA(i)) { ret_tab[i] = NA_LOGICAL; continue; } const char* str_cur_s = str_cont.get(i).c_str(); R_len_t str_cur_n = str_cont.get(i).length(); const UnicodeSet* pattern_cur = &pattern_cont.get(i); R_len_t to_cur = to_cont.get(i); if (to_cur == -1) to_cur = str_cur_n; /* most commonly used case */ else if (to_cur >= 0) 
to_cur = str_cont.UChar32_to_UTF8_index_fwd(i, to_cur); else to_cur = str_cont.UChar32_to_UTF8_index_back(i, -to_cur-1); // now surely to_cur >= 0 && to_cur <= cur_n if (to_cur <= 0) ret_tab[i] = negate_1; else { UChar32 chr = 0; U8_PREV(str_cur_s, 0, to_cur, chr); if (chr < 0) // invalid utf-8 sequence throw StriException(MSG__INVALID_UTF8); ret_tab[i] = pattern_cur->contains(chr); if (negate_1) ret_tab[i] = !ret_tab[i]; } } STRI__UNPROTECT_ALL return ret; STRI__ERROR_HANDLER_END( ;/* do nothing special on error */ ) } stringi/src/stri_search_regex_match.cpp0000644000176200001440000003606314770541312020114 0ustar liggesusers/* This file is part of the 'stringi' project. * Copyright (c) 2013-2025, Marek Gagolewski * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "stri_stringi.h" #include "stri_container_utf8.h" #include "stri_container_regex.h" #include #include #include using namespace std; /** * Extract all capture groups of the first/last occurrence * of a regex pattern in each string * * @param str character vector * @param pattern character vector * @param opts_regex list * @param first logical - search for the first or the last occurrence? * @param cg_missing single string * @return character matrix * * @version 0.1-??? 
(Marek Gagolewski, 2013-06-22)
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-05)
 *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 *
 * @version 0.4-1 (Marek Gagolewski, 2014-12-06)
 *    new arg: cg_missing
 *
 * @version 1.0-2 (Marek Gagolewski, 2016-01-29)
 *    Issue #214: allow a regex pattern like `.*` to match an empty string
 *
 * @version 1.1.8 (Marek Gagolewski, 2018-04-09)
 *    #288: stri_match did not return correct number of columns
 *    when input was empty
 *
 * @version 1.4.7 (Marek Gagolewski, 2020-08-24)
 *    Use StriContainerRegexPattern::getRegexOptions
 *
 * @version 1.7.1 (Marek Gagolewski, 2021-06-19)
 *    #153: named capture groups
 */
SEXP stri__match_firstlast_regex(SEXP str, SEXP pattern, SEXP cg_missing, SEXP opts_regex, bool first)
{
    // @TODO: capture_groups arg (integer vector/set - which capture groups to extract)
    PROTECT(str = stri__prepare_arg_string(str, "str")); // prepare string argument
    PROTECT(pattern = stri__prepare_arg_string(pattern, "pattern")); // prepare string argument
    PROTECT(cg_missing = stri__prepare_arg_string_1(cg_missing, "cg_missing"));
    // number of result rows: str and pattern are recycled against each other
    R_len_t vectorize_length = stri__recycling_rule(true, 2, LENGTH(str), LENGTH(pattern));

    StriRegexMatcherOptions pattern_opts = StriContainerRegexPattern::getRegexOptions(opts_regex);

    // a single UText is reused for all strings; it is closed at the end
    // of the function and in the error handler
    UText* str_text = NULL; // may potentially be slower, but definitely is more convenient!

    STRI__ERROR_HANDLER_BEGIN(3)
    StriContainerUTF8 str_cont(str, vectorize_length);
    StriContainerUTF8 cg_missing_cont(cg_missing, 1);
    // from now on cg_missing is the CHARSXP used to fill unmatched capture groups
    STRI__PROTECT(cg_missing = STRING_ELT(cg_missing, 0));

    // we don't know how many capture groups are there:
    // occurrences[i][j] holds [begin, end) pointers into the i-th string
    // for capture group j (j == 0 is the whole match)
    // NOTE(review): the element type's template arguments are missing in this
    // copy of the file (text between '<' and '>' was lost) -- restore from upstream
    vector< vector< pair > > occurrences(vectorize_length);
    // number of columns in the result: 1 + the largest capture group count seen
    R_len_t occurrences_max = 1;

    // with an empty `str`, the container is sized by `pattern` so that
    // every pattern can still be inspected below
    StriContainerRegexPattern pattern_cont(pattern, (LENGTH(str)>0)?vectorize_length:LENGTH(pattern), pattern_opts);

    if (LENGTH(str) == 0 && LENGTH(pattern) > 0) {
        // we need to determine the number of capture groups anyway
        for (R_len_t i = pattern_cont.vectorize_init();
                i != pattern_cont.vectorize_end();
                i = pattern_cont.vectorize_next(i))
        {
            // NA and empty patterns contribute no columns; an empty one also warns
            if ((pattern_cont).isNA(i) || (pattern_cont).get(i).length() <= 0) {
                if (!(pattern_cont).isNA(i))
                    Rf_warning(MSG__EMPTY_SEARCH_PATTERN_UNSUPPORTED);
                continue;
            }

            RegexMatcher *matcher = pattern_cont.getMatcher(i); // will be deleted automatically
            int pattern_cur_groups = matcher->groupCount();
            if (occurrences_max < pattern_cur_groups+1) occurrences_max=pattern_cur_groups+1;
        }
    }
    else {
        for (R_len_t i = pattern_cont.vectorize_init();
                i != pattern_cont.vectorize_end();
                i = pattern_cont.vectorize_next(i))
        {
            // NA/empty pattern: occurrences[i] stays empty
            if ((pattern_cont).isNA(i) || (pattern_cont).get(i).length() <= 0) {
                if (!(pattern_cont).isNA(i))
                    Rf_warning(MSG__EMPTY_SEARCH_PATTERN_UNSUPPORTED);
                continue;
            }

            UErrorCode status = U_ZERO_ERROR;
            RegexMatcher *matcher = pattern_cont.getMatcher(i); // will be deleted automatically
            int pattern_cur_groups = matcher->groupCount();
            // the column count is updated before the NA check, so a pattern
            // paired with an NA string still widens the result
            if (occurrences_max < pattern_cur_groups+1) occurrences_max=pattern_cur_groups+1;

            if ((str_cont).isNA(i)) {
                continue;
            }

            str_text = utext_openUTF8(str_text, str_cont.get(i).c_str(), str_cont.get(i).length(), &status);
            STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})

            const char* str_cur_s = str_cont.get(i).c_str();

            // NOTE(review): template arguments missing here as well (see above)
            occurrences[i] = vector< pair >(pattern_cur_groups+1);
            matcher->reset(str_text);
            // every successful find() overwrites occurrences[i]; with `first`
            // the loop stops after one match, otherwise the last match is
            // what remains once find() fails
            while (1) {
                int m_res = (int)matcher->find(status);
                STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})
                if (!m_res) break;

                // column 0: the whole match
                occurrences[i][0].first = str_cur_s+(int)matcher->start(status);
                occurrences[i][0].second = str_cur_s+(int)matcher->end(status);
                // columns 1..groupCount: individual capture groups
                for (R_len_t j=1; j<=pattern_cur_groups; ++j) {
                    int m_start = (int)matcher->start(j, status);
                    STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})
                    int m_end = (int)matcher->end(j, status);
                    STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})
                    // a negative position is recorded as a NULL pair, which is
                    // replaced by cg_missing when the result is filled in
                    if (m_start < 0 || m_end < 0) {
                        occurrences[i][j].first = NULL;
                        occurrences[i][j].second = NULL;
                    }
                    else {
                        occurrences[i][j].first = str_cur_s+m_start;
                        occurrences[i][j].second = str_cur_s+m_end;
                    }
                }

                if (first) break;
            }
        }
    }

    if (str_text) {
        utext_close(str_text);
        str_text = NULL;
    }

    // result: vectorize_length rows, occurrences_max columns, column-major;
    // cells that are not overwritten below keep their initial value
    SEXP ret;
    STRI__PROTECT(ret = stri__matrix_NA_STRING(vectorize_length, occurrences_max));
    // NOTE(review): the headers of the two nested loops over i (rows) and
    // j (columns) are truncated in this copy of the file (text between '<'
    // and '>' was lost) -- restore from upstream before compiling
    for (R_len_t i=0; i retij = occurrences[i][j];
            if (retij.first != NULL && retij.second != NULL)
                SET_STRING_ELT(ret, i+j*vectorize_length,
                    Rf_mkCharLenCE(retij.first, (R_len_t)(retij.second-retij.first), CE_UTF8));
            else
                SET_STRING_ELT(ret, i+j*vectorize_length, cg_missing);
        }
    }

    if (pattern_cont.get_n() == 1) {
        // only if there's 1 pattern, otherwise how to agree names?
        SEXP dimnames;
        STRI__PROTECT(dimnames = pattern_cont.getCaptureGroupRDimnames(0));  // reuses last matcher btw
        if (!Rf_isNull(dimnames))
            Rf_setAttrib(ret, R_DimNamesSymbol, dimnames);
        STRI__UNPROTECT(1);
    }

    STRI__UNPROTECT_ALL
    return ret;
    // on error, the shared UText must still be released
    STRI__ERROR_HANDLER_END(if (str_text) utext_close(str_text);)
}


/**
 * Extract all capture groups of the first occurrence of a regex pattern in each string
 *
 * @param str character vector
 * @param pattern character vector
 * @param cg_missing single string
 * @param opts_regex list
 * @return character matrix
 *
 * @version 0.1-??
(Marek Gagolewski, 2013-06-22)
 *
 * @version 0.4-1 (Marek Gagolewski, 2014-12-06)
 *    new arg: cg_missing
 */
SEXP stri_match_first_regex(SEXP str, SEXP pattern, SEXP cg_missing, SEXP opts_regex)
{
    // delegate to the shared first/last implementation, asking for the first match
    const bool look_for_first = true;
    return stri__match_firstlast_regex(str, pattern, cg_missing, opts_regex, look_for_first);
}


/**
 * For each string, get the capture groups of the last match of a regex pattern
 *
 * Thin wrapper around stri__match_firstlast_regex() with `first` set to false.
 *
 * @param str character vector
 * @param pattern character vector
 * @param cg_missing single string
 * @param opts_regex list
 * @return character matrix
 *
 * @version 0.1-?? (Marek Gagolewski, 2013-06-22)
 *
 * @version 0.4-1 (Marek Gagolewski, 2014-12-06)
 *    new arg: cg_missing
 */
SEXP stri_match_last_regex(SEXP str, SEXP pattern, SEXP cg_missing, SEXP opts_regex)
{
    // delegate to the shared first/last implementation, asking for the last match
    const bool look_for_first = false;
    return stri__match_firstlast_regex(str, pattern, cg_missing, opts_regex, look_for_first);
}


/**
 * Extract all capture groups of all occurrences of a regex pattern in each string
 *
 * @param str character vector
 * @param pattern character vector
 * @param opts_regex list
 * @param cg_missing single string
 * @return list of character matrices
 *
 * @version 0.1-??
(Marek Gagolewski, 2013-06-22)
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-05)
 *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 *
 * @version 0.4-1 (Marek Gagolewski, 2014-11-27)
 *    FR #117: omit_no_match arg added
 *
 * @version 0.4-1 (Marek Gagolewski, 2014-12-06)
 *    new arg: cg_missing
 *
 * @version 1.0-2 (Marek Gagolewski, 2016-01-29)
 *    Issue #214: allow a regex pattern like `.*` to match an empty string
 *
 * @version 1.7.1 (Marek Gagolewski, 2021-06-19)
 *    #153: named capture groups
 */
SEXP stri_match_all_regex(SEXP str, SEXP pattern, SEXP omit_no_match, SEXP cg_missing, SEXP opts_regex)
{
    bool omit_no_match1 = stri__prepare_arg_logical_1_notNA(omit_no_match, "omit_no_match");
    PROTECT(str = stri__prepare_arg_string(str, "str")); // prepare string argument
    PROTECT(pattern = stri__prepare_arg_string(pattern, "pattern")); // prepare string argument
    PROTECT(cg_missing = stri__prepare_arg_string_1(cg_missing, "cg_missing"));
    // length of the resulting list: str and pattern are recycled against each other
    R_len_t vectorize_length = stri__recycling_rule(true, 2, LENGTH(str), LENGTH(pattern));

    StriRegexMatcherOptions pattern_opts = StriContainerRegexPattern::getRegexOptions(opts_regex);

    // a single UText is reused for all strings; it is closed at the end
    // of the function and in the error handler
    UText* str_text = NULL; // may potentially be slower, but definitely is more convenient!

    STRI__ERROR_HANDLER_BEGIN(3)
    StriContainerUTF8 str_cont(str, vectorize_length);
    StriContainerRegexPattern pattern_cont(pattern, vectorize_length, pattern_opts);
    StriContainerUTF8 cg_missing_cont(cg_missing, 1);
    // from now on cg_missing is the CHARSXP used to fill unmatched capture groups
    STRI__PROTECT(cg_missing = STRING_ELT(cg_missing, 0));

    SEXP ret;
    STRI__PROTECT(ret = Rf_allocVector(VECSXP, vectorize_length));

    // index of the previously processed element, handed to
    // getCaptureGroupRDimnames() together with `ret`
    R_len_t last_i = -1;
    for (R_len_t i = pattern_cont.vectorize_init();
            i != pattern_cont.vectorize_end();
            i = pattern_cont.vectorize_next(i))
    {
        // NA or empty pattern: 1x1 matrix; an empty pattern also warns
        if ((pattern_cont).isNA(i) || (pattern_cont).get(i).length() <= 0) {
            if (!(pattern_cont).isNA(i))
                Rf_warning(MSG__EMPTY_SEARCH_PATTERN_UNSUPPORTED);
            SET_VECTOR_ELT(ret, i, stri__matrix_NA_STRING(1, 1));
            continue;
        }

        UErrorCode status = U_ZERO_ERROR;
        RegexMatcher *matcher = pattern_cont.getMatcher(i); // will be deleted automatically
        R_len_t pattern_cur_groups = matcher->groupCount();

        SEXP cur_res, dimnames;  // all 2 will be PROTECT'd below
        STRI__PROTECT(dimnames = pattern_cont.getCaptureGroupRDimnames(i, last_i, ret));
        last_i = i;

        // NA string: a single row with one column per group plus the whole match
        if ((str_cont).isNA(i)) {
            STRI__PROTECT(cur_res = stri__matrix_NA_STRING(1, pattern_cur_groups+1));
            if (!Rf_isNull(dimnames)) Rf_setAttrib(cur_res, R_DimNamesSymbol, dimnames);
            SET_VECTOR_ELT(ret, i, cur_res);
            STRI__UNPROTECT(2);  // cur_res, dimnames
            continue;
        }

        str_text = utext_openUTF8(str_text, str_cont.get(i).c_str(), str_cont.get(i).length(), &status);
        STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})

        matcher->reset(str_text);

        // flat queue of (start, end) positions: for every match, the whole
        // match first, followed by each of its capture groups
        // NOTE(review): the pair's template arguments and the header of the
        // inner loop over capture groups are missing in this copy of the file
        // (text between '<' and '>' was lost) -- restore from upstream
        deque< pair > occurrences;
        while (1) {
            int m_res = (int)matcher->find(status);
            STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})
            if (!m_res) break;

            occurrences.push_back(pair((R_len_t)matcher->start(status), (R_len_t)matcher->end(status)));
            STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})

            for (R_len_t j=0; j((R_len_t)matcher->start(j+1, status), (R_len_t)matcher->end(j+1, status)));
            STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})
        }

        // each match contributed (pattern_cur_groups+1) queue entries
        R_len_t noccurrences = (R_len_t)occurrences.size()/(pattern_cur_groups+1);
        if (noccurrences <= 0) {
            // no match: zero rows if omit_no_match is set, one row otherwise
            STRI__PROTECT(cur_res = stri__matrix_NA_STRING(omit_no_match1?0:1, pattern_cur_groups+1));
            if (!Rf_isNull(dimnames)) Rf_setAttrib(cur_res, R_DimNamesSymbol, dimnames);
            SET_VECTOR_ELT(ret, i, cur_res);
            STRI__UNPROTECT(2);  // cur_res, dimnames
            continue;
        }

        // one row per match, column-major storage
        STRI__PROTECT(cur_res = Rf_allocMatrix(STRSXP, noccurrences, pattern_cur_groups+1));
        if (!Rf_isNull(dimnames)) Rf_setAttrib(cur_res, R_DimNamesSymbol, dimnames);

        const char* str_cur_s = str_cont.get(i).c_str();
        // NOTE(review): template arguments missing here as well (see above)
        deque< pair >::iterator iter = occurrences.begin();
        for (R_len_t j = 0; iter != occurrences.end(); ++j) {
            // first queue entry of the j-th match: the whole match (column 0)
            pair curo = *iter;
            SET_STRING_ELT(cur_res, j, Rf_mkCharLenCE(str_cur_s+curo.first, curo.second-curo.first, CE_UTF8));
            ++iter;
            // the following pattern_cur_groups entries: capture groups (columns 1..)
            for (R_len_t k = 0; iter != occurrences.end() && k < pattern_cur_groups; ++iter, ++k) {
                curo = *iter;
                // a negative position is reported as cg_missing
                if (curo.first < 0 || curo.second < 0)
                    SET_STRING_ELT(cur_res, j+(k+1)*noccurrences, cg_missing);
                else
                    SET_STRING_ELT(cur_res, j+(k+1)*noccurrences,
                        Rf_mkCharLenCE(str_cur_s+curo.first, curo.second-curo.first, CE_UTF8));
            }
        }
        SET_VECTOR_ELT(ret, i, cur_res);
        STRI__UNPROTECT(2);  // cur_res, dimnames
    }

    if (str_text) {
        utext_close(str_text);
        str_text = NULL;
    }

    STRI__UNPROTECT_ALL
    return ret;
    // on error, the shared UText must still be released
    STRI__ERROR_HANDLER_END(if (str_text) utext_close(str_text);)
}
stringi/src/stri_trans_normalization.cpp0000644000176200001440000002370014770541312020370 0ustar liggesusers/* This file is part of the 'stringi' project.
 * Copyright (c) 2013-2025, Marek Gagolewski
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2.
Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "stri_stringi.h" #include "stri_container_utf16.h" #include #define STRI_UNINORM_NFC 10 #define STRI_UNINORM_NFD 20 #define STRI_UNINORM_NFKC 11 #define STRI_UNINORM_NFKD 21 #define STRI_UNINORM_NFKC_CF 12 /** Get Desired Normalizer2 instance * * @param type R object, will be tested whether it's an integer vector of length 1 * @return unmodifiable singleton instance. Do not delete it. * * @version 0.1-?? (Marek Gagolewski) * * @version 0.1-?? 
(Marek Gagolewski, 2013-06-29) * don't use getNFCInstance as it's in ICU DRAFT API * * @version 0.2-1 (Marek Gagolewski, 2014-03-23) * getNFCInstance is stable as of ICU 49 and we require ICU >= 50 */ const Normalizer2* stri__normalizer_get(int _type) { UErrorCode status = U_ZERO_ERROR; const Normalizer2* normalizer = NULL; switch (_type) { case STRI_UNINORM_NFC: // normalizer = Normalizer2::getInstance(NULL, "nfc", UNORM2_COMPOSE, status); normalizer = Normalizer2::getNFCInstance(status); break; case STRI_UNINORM_NFD: // normalizer = Normalizer2::getInstance(NULL, "nfc", UNORM2_DECOMPOSE, status); normalizer = Normalizer2::getNFDInstance(status); break; case STRI_UNINORM_NFKC: // normalizer = Normalizer2::getInstance(NULL, "nfkc", UNORM2_COMPOSE, status); normalizer = Normalizer2::getNFKCInstance(status); break; case STRI_UNINORM_NFKD: // normalizer = Normalizer2::getInstance(NULL, "nfkc", UNORM2_DECOMPOSE, status); normalizer = Normalizer2::getNFKDInstance(status); break; case STRI_UNINORM_NFKC_CF: // normalizer = Normalizer2::getInstance(NULL, "nfkc_cf", UNORM2_COMPOSE, status); normalizer = Normalizer2::getNFKCCasefoldInstance(status); break; default: Rf_error(MSG__INCORRECT_INTERNAL_ARG); // error() allowed here } STRI__CHECKICUSTATUS_RFERROR(status, {/* do nothing special on err */}) /* Rf_error */ return normalizer; } /** * Perform Unicode Normalization * * @param str character vector * @param type normalization type [internal] * @return character vector * * @version 0.1 (Marek Gagolewski) * * @version 0.1-?? (Marek Gagolewski) * use StriContainerUTF16 & ICU facilities * * @version 0.1-?? 
(Marek Gagolewski, 2013-06-16)
 *          make StriException-friendly
 *
 * @version 0.2-2 (Marek Gagolewski, 2014-04-19)
 *          renamed: stri_enc_nf -> stri_trans_nf
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 *
 * @version 0.6-1 (Marek Gagolewski, 2015-07-11)
 *    This is now an internal function
 */
SEXP stri_trans_nf(SEXP str, int type)
{
    // As of ICU 52.1 (Unicode 6.3.0), the "most expansive" decomposition
    // is 1 UChar -> 18 UChars (data/unidata/norm2/nfkc.txt)
    // FDFA>0635 0644 0649 0020 0627 0644 0644 0647 0020
    //      0639 0644 064A 0647 0020 0648 0633 0644 0645

    // C API will not be faster here
    // In ICU 52.1 unorm2_normalize does UnicodeString destString(dest, 0, capacity);
    // and so on, thus it is a simple wrapper for C++ API

    // fetched first: stri__normalizer_get() may raise an R error,
    // which must happen before anything is PROTECTed below
    const Normalizer2* normalizer =
        stri__normalizer_get(type); // auto `type` check here, call before ERROR_HANDLER
    PROTECT(str = stri__prepare_arg_string(str, "str"));    // prepare string argument
    R_len_t str_length = LENGTH(str);

    STRI__ERROR_HANDLER_BEGIN(1)
    StriContainerUTF16 str_cont(str, str_length, false); // writable, no recycle

    // NOTE(review): the loop header and the start of the statement that calls
    // normalize() are truncated in this copy of the file (text between '<'
    // and '>' was lost) -- restore from upstream before compiling
    for (R_len_t i=0; inormalize(str_cont.get(i), status));
        STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})
    }

    // normalizer shall not be deleted at all
    STRI__UNPROTECT_ALL
    // the container itself is converted back into the R result
    return str_cont.toR();
    STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */)
}


/**
 * Check if String is in NF*
 *
 * @param str character vector
 * @param type normalization type [internal]
 * @return logical vector
 *
 * @version 0.1-?? (Marek Gagolewski)
 *          use StriContainerUTF16
 *
 * @version 0.1-??
(Marek Gagolewski, 2013-06-16) * make StriException-friendly * * @version 0.2-2 (Marek Gagolewski, 2014-04-19) * renamed: stri_enc_nf -> stri_trans_nf * * @version 0.3-1 (Marek Gagolewski, 2014-11-04) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc * * @version 0.6-1 (Marek Gagolewski, 2015-07-11) * This is now an internal function */ SEXP stri_trans_isnf(SEXP str, int type) { const Normalizer2* normalizer = stri__normalizer_get(type); // auto `type` check here, call before ERROR_HANDLER PROTECT(str = stri__prepare_arg_string(str, "str")); // prepare string argument R_len_t str_length = LENGTH(str); STRI__ERROR_HANDLER_BEGIN(1) StriContainerUTF16 str_cont(str, str_length); SEXP ret; STRI__PROTECT(ret = Rf_allocVector(LGLSXP, str_length)); int* ret_tab = LOGICAL(ret); for (R_len_t i = str_cont.vectorize_init(); i != str_cont.vectorize_end(); i = str_cont.vectorize_next(i)) { if (str_cont.isNA(i)) { ret_tab[i] = NA_LOGICAL; continue; } // C API will not be faster here // as it is a simple wrapper for C++ API UErrorCode status = U_ZERO_ERROR; ret_tab[i] = normalizer->isNormalized(str_cont.get(i), status) ? 
TRUE : FALSE; STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) } // normalizer shall not be deleted at all STRI__UNPROTECT_ALL return ret; STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */) } /** * Check if String is in NFC * * @param str character vector * @return logical vector * * @version 0.6-1 (Marek Gagolewski, 2015-07-11) * call stri_trans_nf */ SEXP stri_trans_nfc(SEXP str) { return stri_trans_nf(str, STRI_UNINORM_NFC); } /** * Check if String is in NFD * * @param str character vector * @return logical vector * * @version 0.6-1 (Marek Gagolewski, 2015-07-11) * call stri_trans_nf */ SEXP stri_trans_nfd(SEXP str) { return stri_trans_nf(str, STRI_UNINORM_NFD); } /** * Check if String is in NFKD * * @param str character vector * @return logical vector * * @version 0.6-1 (Marek Gagolewski, 2015-07-11) * call stri_trans_nf */ SEXP stri_trans_nfkd(SEXP str) { return stri_trans_nf(str, STRI_UNINORM_NFKD); } /** * Check if String is in NFKC * * @param str character vector * @return logical vector * * @version 0.6-1 (Marek Gagolewski, 2015-07-11) * call stri_trans_nf */ SEXP stri_trans_nfkc(SEXP str) { return stri_trans_nf(str, STRI_UNINORM_NFKC); } /** * Check if String is in NFKC-CASEFOLD * * @param str character vector * @return logical vector * * @version 0.6-1 (Marek Gagolewski, 2015-07-11) * call stri_trans_nf */ SEXP stri_trans_nfkc_casefold(SEXP str) { return stri_trans_nf(str, STRI_UNINORM_NFKC_CF); } /** * Convert string to NFC * * @param str character vector * @return character vector * * @version 0.6-1 (Marek Gagolewski, 2015-07-11) * call stri_trans_isnf */ SEXP stri_trans_isnfc(SEXP str) { return stri_trans_isnf(str, STRI_UNINORM_NFC); } /** * Convert string to NFD * * @param str character vector * @return character vector * * @version 0.6-1 (Marek Gagolewski, 2015-07-11) * call stri_trans_isnf */ SEXP stri_trans_isnfd(SEXP str) { return stri_trans_isnf(str, STRI_UNINORM_NFD); } /** * Convert string to NFKD * * 
@param str character vector * @return character vector * * @version 0.6-1 (Marek Gagolewski, 2015-07-11) * call stri_trans_isnf */ SEXP stri_trans_isnfkd(SEXP str) { return stri_trans_isnf(str, STRI_UNINORM_NFKD); } /** * Convert string to NFKC * * @param str character vector * @return character vector * * @version 0.6-1 (Marek Gagolewski, 2015-07-11) * call stri_trans_isnf */ SEXP stri_trans_isnfkc(SEXP str) { return stri_trans_isnf(str, STRI_UNINORM_NFKC); } /** * Convert string to NFKC-CASEFOLD * * @param str character vector * @return character vector * * @version 0.6-1 (Marek Gagolewski, 2015-07-11) * call stri_trans_isnf */ SEXP stri_trans_isnfkc_casefold(SEXP str) { return stri_trans_isnf(str, STRI_UNINORM_NFKC_CF); } stringi/src/stri_container_logical.h0000644000176200001440000000654214770541312017421 0ustar liggesusers/* This file is part of the 'stringi' project. * Copyright (c) 2013-2025, Marek Gagolewski * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef __stri_container_logical_h #define __stri_container_logical_h #include "stri_container_base.h" /** * A wrapper-class for R logical vectors * * @version 0.1-?? (Marek Gagolewski, 2013-06-15) */ class StriContainerLogical : public StriContainerBase { private: int* data; public: StriContainerLogical() : StriContainerBase() { data = NULL; } StriContainerLogical(SEXP rvec, R_len_t _nrecycle) { this->data = NULL; #ifndef NDEBUG if (!Rf_isLogical(rvec)) throw StriException("DEBUG: !Rf_isLogical in StriContainerLogical"); #endif R_len_t ndata = LENGTH(rvec); this->init_Base(ndata, _nrecycle, true); this->data = LOGICAL(rvec); // TODO: ALTREP will be problematic? 
} // StriContainerLogical(StriContainerLogical& container); // default-shallow // ~StriContainerLogical(); // default-shallow // StriContainerLogical& operator=(StriContainerLogical& container); // default-shallow /** check if the vectorized ith element is NA * @param i index * @return true if is NA */ inline bool isNA(R_len_t i) const { #ifndef NDEBUG if (i < 0 || i >= nrecycle) throw StriException("StriContainerLogical::isNA(): INDEX OUT OF BOUNDS"); #endif return (data[i%n] == NA_LOGICAL); } /** get the vectorized ith element * @param i index * @return integer */ inline int get(R_len_t i) const { #ifndef NDEBUG if (i < 0 || i >= nrecycle) throw StriException("StriContainerLogical::get(): INDEX OUT OF BOUNDS"); if (data[i%n] == NA_LOGICAL) throw StriException("StriContainerLogical::get(): isNA"); #endif return (data[i%n]); } }; #endif stringi/src/stri_search_boundaries_locate.cpp0000644000176200001440000002075414770541312021310 0ustar liggesusers/* This file is part of the 'stringi' project. * Copyright (c) 2013-2025, Marek Gagolewski * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "stri_stringi.h" #include "stri_container_utf8_indexable.h" #include "stri_container_integer.h" #include "stri_brkiter.h" /** * Locate first or last boundaries * * @param str character vector * @param opts_brkiter list * @param first looking for first or last match? 
*
 * @return integer matrix (2 columns)
 *
 * @version 0.4-1 (Marek Gagolewski, 2014-12-05)
 *
 * @version 1.7.1 (Marek Gagolewski, 2021-06-29)
 *     get_length
 */
SEXP stri__locate_firstlast_boundaries(
    SEXP str, SEXP opts_brkiter, bool first, bool get_length1
) {
    PROTECT(str = stri__prepare_arg_string(str, "str"));
    StriBrkIterOptions opts_brkiter2(opts_brkiter, "line_break");

    STRI__ERROR_HANDLER_BEGIN(1)
    R_len_t str_length = LENGTH(str);
    StriContainerUTF8_indexable str_cont(str, str_length);
    StriRuleBasedBreakIterator brkiter(opts_brkiter2);

    // one row per string; column 1 at ret_tab[i], column 2 at ret_tab[i+str_length]
    SEXP ret;
    STRI__PROTECT(ret = Rf_allocMatrix(INTSXP, str_length, 2));
    stri__locate_set_dimnames_matrix(ret, get_length1);
    int* ret_tab = INTEGER(ret);

    for (R_len_t i = 0; i < str_length; ++i) {
        // default: both columns NA (stays that way for NA strings)
        ret_tab[i]            = NA_INTEGER;
        ret_tab[i+str_length] = NA_INTEGER;

        if (str_cont.isNA(i)) continue;

        // in get_length mode, "nothing found" is reported as -1 in both columns
        if (get_length1) {
            ret_tab[i]            = -1;
            ret_tab[i+str_length] = -1;
        }

        if (str_cont.get(i).length() == 0) {
            continue;
        }

        brkiter.setupMatcher(str_cont.get(i).c_str(), str_cont.get(i).length());

        // NOTE(review): the pair's template arguments are missing in this copy
        // of the file (text between '<' and '>' was lost) -- restore from upstream
        pair curpair;
        // take the first chunk when iterating forwards from the start,
        // or the last one when iterating backwards from the end
        if (first) {
            brkiter.first();
            if (!brkiter.next(curpair)) {
                continue;
            }
        }
        else {
            brkiter.last();
            if (!brkiter.previous(curpair)) {
                continue;
            }
        }

        ret_tab[i]            = curpair.first;
        ret_tab[i+str_length] = curpair.second;

        // Adjust UTF8 byte index -> UChar32 index
        str_cont.UTF8_to_UChar32_index(i, ret_tab+i, ret_tab+i+str_length, 1,
                                       1, // 0-based index -> 1-based
                                       0  // end returns position of next character after match
                                      );

        // replace the end position with the length: to - from + 1
        if (get_length1) ret_tab[i+str_length] -= ret_tab[i] - 1;  // to->length
    }

    STRI__UNPROTECT_ALL
    return ret;
    STRI__ERROR_HANDLER_END( ;/* do nothing special on error */ )
}


/**
 * Locate first boundary
 *
 * @param str character vector
 * @param opts_brkiter list
 * @param get_length single logical value; passed on as get_length1
 * @return integer matrix (2 columns)
 *
 * @version 0.4-1 (Marek Gagolewski, 2014-12-05)
 *
 * @version 1.7.1 (Marek Gagolewski, 2021-06-29)
 *     get_length
 */
SEXP stri_locate_first_boundaries(SEXP str, SEXP opts_brkiter, SEXP get_length)
{
    bool get_length1 = stri__prepare_arg_logical_1_notNA(get_length, "get_length");
    return stri__locate_firstlast_boundaries(str, opts_brkiter, true, get_length1);
}


/**
 * Locate last boundary
 *
 * @param str character vector
 * @param opts_brkiter list
 * @param get_length single logical value; passed on as get_length1
 * @return integer matrix (2 columns)
 *
 * @version 0.4-1 (Marek Gagolewski, 2014-12-05)
 *
 * @version 1.7.1 (Marek Gagolewski, 2021-06-29)
 *     get_length
 */
SEXP stri_locate_last_boundaries(SEXP str, SEXP opts_brkiter, SEXP get_length)
{
    bool get_length1 = stri__prepare_arg_logical_1_notNA(get_length, "get_length");
    return stri__locate_firstlast_boundaries(str, opts_brkiter, false, get_length1);
}


/** Locate all BreakIterator boundaries
 *
 * @param str character vector
 * @param omit_no_match logical
 * @param opts_brkiter named list
 * @param get_length single logical value; if set, the second column of each
 *        matrix holds lengths instead of end positions
 * @return list
 *
 * @version 0.2-2 (Marek Gagolewski, 2014-04-22)
 *
 * @version 0.2-2 (Marek Gagolewski, 2014-04-23)
 *          removed "title": For Unicode 4.0 and above title boundary
 *          iteration, please use Word Boundary iterator.
 *
 * @version 0.2-2 (Marek Gagolewski, 2014-04-25)
 *          use stri__split_or_locate_boundaries
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-10-29)
 *          use opts_brkiter
 *
 * @version 0.4-1 (Marek Gagolewski, 2014-11-28)
 *          new args: omit_no_match
 *
 * @version 0.4-1 (Marek Gagolewski, 2014-12-02)
 *          use StriRuleBasedBreakIterator
 *
 * @version 1.7.1 (Marek Gagolewski, 2021-06-29)
 *     get_length
 */
SEXP stri_locate_all_boundaries(SEXP str, SEXP omit_no_match, SEXP opts_brkiter, SEXP get_length)
{
    bool omit_no_match1 = stri__prepare_arg_logical_1_notNA(omit_no_match, "omit_no_match");
    bool get_length1 = stri__prepare_arg_logical_1_notNA(get_length, "get_length");
    PROTECT(str = stri__prepare_arg_string(str, "str"));
    StriBrkIterOptions opts_brkiter2(opts_brkiter, "line_break");

    STRI__ERROR_HANDLER_BEGIN(1)
    R_len_t str_length = LENGTH(str);
    StriContainerUTF8_indexable str_cont(str, str_length);
    StriRuleBasedBreakIterator brkiter(opts_brkiter2);

    // one 2-column integer matrix per input string
    SEXP ret;
    STRI__PROTECT(ret = Rf_allocVector(VECSXP, str_length));

    for (R_len_t i = 0; i < str_length; ++i)
    {
        // NA string: 1x2 matrix from stri__matrix_NA_INTEGER (regardless of get_length)
        if (str_cont.isNA(i)) {
            SET_VECTOR_ELT(ret, i, stri__matrix_NA_INTEGER(1, 2));
            continue;
        }

        brkiter.setupMatcher(str_cont.get(i).c_str(), str_cont.get(i).length());
        brkiter.first();

        // collect every chunk reported by the iterator
        // NOTE(review): the pair's template arguments are missing in this copy
        // of the file (text between '<' and '>' was lost) -- restore from upstream
        deque< pair > occurrences;
        pair curpair;
        while (brkiter.next(curpair))
            occurrences.push_back(curpair);

        R_len_t noccurrences = (R_len_t)occurrences.size();
        if (noccurrences <= 0) {
            // nothing found: zero rows if omit_no_match is set, otherwise one
            // row filled with -1 (get_length mode) or NA
            SET_VECTOR_ELT(
                ret, i,
                stri__matrix_NA_INTEGER(omit_no_match1?0:1, 2, get_length1?-1:NA_INTEGER)
            );
            continue;
        }

        // column 1 at ans_tab[j], column 2 at ans_tab[j+noccurrences]
        SEXP ans;
        STRI__PROTECT(ans = Rf_allocMatrix(INTSXP, noccurrences, 2));
        int* ans_tab = INTEGER(ans);
        // NOTE(review): template arguments missing here as well (see above)
        deque< pair >::iterator iter = occurrences.begin();
        for (R_len_t j = 0; iter != occurrences.end(); ++iter, ++j) {
            pair cur_match = *iter;
            ans_tab[j]             = cur_match.first;
            ans_tab[j+noccurrences] = cur_match.second;
        }

        // Adjust UTF-8 byte index -> UChar32 (code point) index
        str_cont.UTF8_to_UChar32_index(i, ans_tab,
                                       ans_tab+noccurrences, noccurrences,
                                       1, // 0-based index -> 1-based
                                       0  // end returns position of next character after match
                                      );

        // replace each end position with the length: to - from + 1
        if (get_length1) {
            for (R_len_t j=0; j < noccurrences; ++j)
                ans_tab[j+noccurrences] -= ans_tab[j] - 1;  // to->length
        }

        SET_VECTOR_ELT(ret, i, ans);
        STRI__UNPROTECT(1);
    }

    stri__locate_set_dimnames_list(ret, get_length1);
    STRI__UNPROTECT_ALL
    return ret;
    STRI__ERROR_HANDLER_END({ /* nothing special t.b.d. on error */ })
}
stringi/src/stri_string8.cpp0000644000176200001440000001665014770541312015677 0ustar liggesusers/* This file is part of the 'stringi' project.
 * Copyright (c) 2013-2025, Marek Gagolewski
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2.
Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "stri_string8.h" void String8::initialize(const char* str, R_len_t n, bool memalloc, bool killbom, bool isASCII) { #ifndef NDEBUG if (!isNA()) throw StriException("string8::!isNA() in initialize()"); #endif if (killbom && n >= 3 && (uint8_t)(str[0]) == UTF8_BOM_BYTE1 && (uint8_t)(str[1]) == UTF8_BOM_BYTE2 && (uint8_t)(str[2]) == UTF8_BOM_BYTE3) { // has BOM - get rid of it this->m_memalloc = true; // ignore memalloc val this->m_n = n-3; this->m_isASCII = isASCII; this->m_str = new char[this->m_n+1]; STRI_ASSERT(this->m_str); if (!this->m_str) throw StriException(MSG__MEM_ALLOC_ERROR_WITH_SIZE, this->m_n+1); memcpy(this->m_str, str+3, (size_t)this->m_n); this->m_str[this->m_n] = '\0'; } else { this->m_memalloc = memalloc; this->m_n = n; this->m_isASCII = isASCII; if (memalloc) { this->m_str = new char[this->m_n+1]; STRI_ASSERT(this->m_str); if (!this->m_str) throw StriException(MSG__MEM_ALLOC_ERROR_WITH_SIZE, this->m_n+1); // memcpy may be very fast in some libc implementations memcpy(this->m_str, str, (size_t)this->m_n); this->m_str[this->m_n] = '\0'; } else { this->m_str = (char*)(str); // we know what we're doing // str is zero-terminated } } } /** copy constructor */ String8::String8(const String8& s) { this->m_memalloc = s.m_memalloc; this->m_n = s.m_n; this->m_isASCII = s.m_isASCII; if (s.m_memalloc) { this->m_str = new char[this->m_n+1]; STRI_ASSERT(this->m_str); if (!this->m_str) throw StriException(MSG__MEM_ALLOC_ERROR_WITH_SIZE, this->m_n+1); memcpy(this->m_str, s.m_str, (size_t)this->m_n); this->m_str[this->m_n] = '\0'; } else { this->m_str = s.m_str; } } /** copy */ String8& String8::operator=(const String8& s) { if (this->m_str && this->m_memalloc) delete [] this->m_str; this->m_memalloc = s.m_memalloc; this->m_n = s.m_n; this->m_isASCII = s.m_isASCII; if (s.m_memalloc) { this->m_str = new char[this->m_n+1]; STRI_ASSERT(this->m_str); if (!this->m_str) throw StriException(MSG__MEM_ALLOC_ERROR_WITH_SIZE, this->m_n+1); memcpy(this->m_str, 
s.m_str, (size_t)this->m_n); this->m_str[this->m_n] = '\0'; } else { this->m_str = s.m_str; } return *this; } bool String8::endsWith(R_len_t byteindex, const char* patternStr, R_len_t patternLen, bool caseInsensitive) const { if (caseInsensitive) { R_len_t k = patternLen; UChar32 c1; UChar32 c2; while (k > 0) { if (byteindex <= 0) return false; U8_PREV(m_str, 0, byteindex, c1); U8_PREV(patternStr, 0, k, c2); if (u_toupper(c1) != u_toupper(c2)) return false; } return true; } else { if (byteindex-patternLen < 0) return false; for (R_len_t k=0; k < patternLen; ++k) if (m_str[byteindex-k-1] != patternStr[patternLen-k-1]) return false; return true; // found } } bool String8::startsWith(R_len_t byteindex, const char* patternStr, R_len_t patternLen, bool caseInsensitive) const { if (caseInsensitive) { R_len_t k = 0; UChar32 c1; UChar32 c2; while (k < patternLen) { if (byteindex >= m_n) return false; U8_NEXT(m_str, byteindex, m_n, c1); U8_NEXT(patternStr, k, patternLen, c2); if (u_toupper(c1) != u_toupper(c2)) return false; } return true; } else { if (byteindex+patternLen > m_n) return false; for (R_len_t k=0; k < patternLen; ++k) if (m_str[byteindex+k] != patternStr[k]) return false; return true; // found } } void String8::replaceAllAtPos(R_len_t buf_size, const char* replacement_cur_s, R_len_t replacement_cur_n, std::deque< std::pair >& occurrences) { #ifndef NDEBUG if (isNA()) throw StriException("String8::isNA() in replaceAllAtPos()"); #endif char* old_str = this->m_str; int old_n = this->m_n; bool old_memalloc = this->m_memalloc; this->m_str = new char[buf_size+1]; this->m_n = buf_size; this->m_memalloc = true; this->m_isASCII = true; /* TO DO */ R_len_t buf_used = 0; R_len_t jlast = 0; std::deque< std::pair >::iterator iter = occurrences.begin(); for (; iter != occurrences.end(); ++iter) { pair match = *iter; memcpy(m_str+buf_used, old_str+jlast, (size_t)(match.first-jlast)); buf_used += match.first-jlast; #ifndef NDEBUG if (buf_used > buf_size) throw 
StriException("!NDEBUG: String8::replaceAllAtPos: buf_used > buf_size"); #endif jlast = match.second; memcpy(m_str+buf_used, replacement_cur_s, (size_t)(replacement_cur_n)); buf_used += replacement_cur_n; #ifndef NDEBUG if (buf_used > buf_size) throw StriException("!NDEBUG: String8::replaceAllAtPos: buf_used > buf_size"); #endif } memcpy(m_str+buf_used, old_str+jlast, (size_t)(old_n-jlast)); buf_used += (old_n-jlast); #ifndef NDEBUG if (buf_used > buf_size) throw StriException("!NDEBUG: String8::replaceAllAtPos: buf_used > buf_size"); #endif #ifndef NDEBUG if (buf_used != this->m_n) throw StriException("!NDEBUG: String8::replaceAllAtPos: buf_used > buf_size"); #endif this->m_str[this->m_n] = '\0'; if (old_str && old_memalloc) delete [] old_str; } stringi/src/stri_search_other_split.cpp0000644000176200001440000002226514770541312020161 0ustar liggesusers/* This file is part of the 'stringi' project. * Copyright (c) 2013-2025, Marek Gagolewski * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "stri_stringi.h" #include "stri_container_utf8.h" #include "stri_container_utf16.h" #include "stri_container_usearch.h" #include "stri_container_bytesearch.h" #include "stri_container_integer.h" #include "stri_container_logical.h" #include #include #include #include using namespace std; /** * Split a single string into text lines * * @param str character vector * * @return character vector * * @version 0.1-?? (Marek Gagolewski, 2013-08-04) * * @version 0.3-1 (Marek Gagolewski, 2014-11-05) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc */ SEXP stri_split_lines1(SEXP str) { PROTECT(str = stri__prepare_arg_string_1(str, "str")); R_len_t vectorize_length = LENGTH(str); STRI__ERROR_HANDLER_BEGIN(1) StriContainerUTF8 str_cont(str, vectorize_length); if (str_cont.isNA(0)) { STRI__UNPROTECT_ALL return str; } const char* str_cur_s = str_cont.get(0).c_str(); R_len_t str_cur_n = str_cont.get(0).length(); UChar32 c; R_len_t jlast; deque< pair > occurrences; occurrences.push_back(pair(0, 0)); for (R_len_t j=0; j < str_cur_n; /* null */) { jlast = j; U8_NEXT(str_cur_s, j, str_cur_n, c); switch (c) { case ASCII_CR: /* CR */ /* check if next is LF */ if (str_cur_s[j] == ASCII_LF) { // look ahead one byte j++; // just one byte } break; case ASCII_LF: /* LF */ break; case UCHAR_NEL: /* NEL */ break; case ASCII_VT: /* VT */ break; case ASCII_FF: /* FF */ break; case UCHAR_LS: /* LS */ break; case UCHAR_PS: /* PS */ break; default: /* 
not a newline character */ occurrences.back().second = j; continue; } occurrences.back().second = jlast; if (j < str_cur_n) occurrences.push_back(pair(j, j)); } SEXP ans; STRI__PROTECT(ans = Rf_allocVector(STRSXP, (R_len_t)occurrences.size())); deque< pair >::iterator iter = occurrences.begin(); for (R_len_t k = 0; iter != occurrences.end(); ++iter, ++k) { pair curoccur = *iter; SET_STRING_ELT(ans, k, Rf_mkCharLenCE(str_cur_s+curoccur.first, curoccur.second-curoccur.first, CE_UTF8)); } STRI__UNPROTECT_ALL return ans; STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */) } /** * Split a string into text lines * * @param str character vector * @param omit_empty logical vector * * @return list of character vectors * * @version 0.1-?? (Marek Gagolewski, 2013-08-04) * * @version 0.3-1 (Marek Gagolewski, 2014-10-30) * removed `n_max` arg, as it doesn't make sense * * @version 0.3-1 (Marek Gagolewski, 2014-11-05) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc */ SEXP stri_split_lines(SEXP str, SEXP omit_empty) { PROTECT(str = stri__prepare_arg_string(str, "str")); // n_max = stri__prepare_arg_integer(n_max, "n_max"); PROTECT(omit_empty = stri__prepare_arg_logical(omit_empty, "omit_empty")); R_len_t vectorize_length = stri__recycling_rule(true, 2, LENGTH(str), /*LENGTH(n_max), */LENGTH(omit_empty)); STRI__ERROR_HANDLER_BEGIN(2) StriContainerUTF8 str_cont(str, vectorize_length); // StriContainerInteger n_max_cont(n_max, vectorize_length); StriContainerLogical omit_empty_cont(omit_empty, vectorize_length); SEXP ret; STRI__PROTECT(ret = Rf_allocVector(VECSXP, vectorize_length)); for (R_len_t i = str_cont.vectorize_init(); i != str_cont.vectorize_end(); i = str_cont.vectorize_next(i)) { if (str_cont.isNA(i)) { SET_VECTOR_ELT(ret, i, stri__vector_NA_strings(1)); continue; } const char* str_cur_s = str_cont.get(i).c_str(); R_len_t str_cur_n = str_cont.get(i).length(); // int n_max_cur = n_max_cont.get(i); int omit_empty_cur = 
omit_empty_cont.get(i); // if (n_max_cur < 0) // n_max_cur = INT_MAX; // else if (n_max_cur == 0) { // SET_VECTOR_ELT(ret, i, Rf_allocVector(STRSXP, 0)); // continue; // } //#define STRI_INDEX_NEWLINE_CR 0 //#define STRI_INDEX_NEWLINE_LF 1 //#define STRI_INDEX_NEWLINE_CRLF 2 //#define STRI_INDEX_NEWLINE_NEL 3 //#define STRI_INDEX_NEWLINE_VT 4 //#define STRI_INDEX_NEWLINE_FF 5 //#define STRI_INDEX_NEWLINE_LS 6 //#define STRI_INDEX_NEWLINE_PS 7 //#define STRI_INDEX_NEWLINE_LAST 8 // int counts[STRI_INDEX_NEWLINE_LAST]; // for (R_len_t j=0; j > occurrences; occurrences.push_back(pair(0, 0)); for (R_len_t j=0; j < str_cur_n /*&& k < n_max_cur*/; /* null */) { jlast = j; U8_NEXT(str_cur_s, j, str_cur_n, c); switch (c) { case ASCII_CR: /* CR */ // counts[STRI_INDEX_NEWLINE_CR]++; /* check if next is LF */ if (str_cur_s[j] == ASCII_LF) { // look ahead one byte // counts[STRI_INDEX_NEWLINE_LF]++; // counts[STRI_INDEX_NEWLINE_CRLF]++; j++; // just one byte } break; case ASCII_LF: /* LF */ // counts[STRI_INDEX_NEWLINE_LF]++; break; case UCHAR_NEL: /* NEL */ // counts[STRI_INDEX_NEWLINE_NEL]++; break; case ASCII_VT: /* VT */ // counts[STRI_INDEX_NEWLINE_VT]++; break; case ASCII_FF: /* FF */ // counts[STRI_INDEX_NEWLINE_FF]++; break; case UCHAR_LS: /* LS */ // counts[STRI_INDEX_NEWLINE_LS]++; break; case UCHAR_PS: /* PS */ // counts[STRI_INDEX_NEWLINE_PS]++; break; default: /* not a newline character */ occurrences.back().second = j; continue; } // if here, then at newline if (omit_empty_cur && occurrences.back().second == occurrences.back().first) occurrences.back().first = occurrences.back().second = j; // don't start any new field else { occurrences.back().second = jlast; occurrences.push_back(pair(j, j)); ++k; // another field } } // if (k == n_max_cur) // occurrences.back().second = str_cur_n; if (omit_empty_cur && occurrences.back().first == occurrences.back().second) occurrences.pop_back(); SEXP ans; STRI__PROTECT(ans = Rf_allocVector(STRSXP, 
(R_len_t)occurrences.size())); deque< pair >::iterator iter = occurrences.begin(); for (R_len_t l = 0; iter != occurrences.end(); ++iter, ++l) { pair curoccur = *iter; SET_STRING_ELT(ans, l, Rf_mkCharLenCE(str_cur_s+curoccur.first, curoccur.second-curoccur.first, CE_UTF8)); } SET_VECTOR_ELT(ret, i, ans); STRI__UNPROTECT(1); } STRI__UNPROTECT_ALL return ret; STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */) } stringi/src/stri_container_listutf8.h0000644000176200001440000000625314770541312017570 0ustar liggesusers/* This file is part of the 'stringi' project. * Copyright (c) 2013-2025, Marek Gagolewski * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT
 * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
 * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */


#ifndef __stri_container_listutf8_h
#define __stri_container_listutf8_h

#include "stri_container_utf8.h"


/**
 * A class to handle conversion between R lists of character
 * vectors and lists of UTF-8 string vectors
 *
 * Elements are addressed modulo n, so an index in [0, nrecycle) is
 * mapped onto the stored items.
 * NOTE(review): n and nrecycle are not declared here; presumably they are
 * inherited from StriContainerBase -- confirm.
 *
 * @version 0.1-?? (Marek Gagolewski, 2013-06-16)
 * @version 0.5-3 (Marek Gagolewski, 2015-06-27)
 *    warning on recycling rule, #174
 */
class StriContainerListUTF8 : public StriContainerBase {

private:

    // one container per list element; a NULL entry is reported as NA by isNA()
    StriContainerUTF8 **data;


public:

    StriContainerListUTF8();
    StriContainerListUTF8(SEXP rlist, R_len_t nrecycle, bool shallowrecycle=true);
    StriContainerListUTF8(StriContainerListUTF8& container);
    ~StriContainerListUTF8();
    StriContainerListUTF8& operator=(StriContainerListUTF8& container);
    SEXP toR(R_len_t i) const;
    SEXP toR() const;


    /** check if the vectorized ith element is NA
     * @param i index; range-checked only when NDEBUG is not defined
     * @return true if is NA
     */
    inline bool isNA(R_len_t i) const {
#ifndef NDEBUG
        if (i < 0 || i >= nrecycle)
            throw StriException("StriContainerListUTF8::isNA(): INDEX OUT OF BOUNDS");
#endif
        return (data[i%n] == NULL);
    }


    /** get the vectorized ith element
     * @param i index; range- and NA-checked only when NDEBUG is not defined,
     *        so callers should test isNA(i) first
     * @return string, read only
     */
    const StriContainerUTF8& get(R_len_t i) const {
#ifndef NDEBUG
        if (i < 0 || i >= nrecycle)
            throw StriException("StriContainerListUTF8::get(): INDEX OUT OF BOUNDS");
        if (data[i%n] == NULL)
            throw StriException("StriContainerListUTF8::get(): isNA");
#endif
        return (*(data[i%n]));
    }
};

#endif
stringi/src/stri_common.cpp0000644000176200001440000002131614770541312015564 0ustar liggesusers/* This file is part of the 'stringi' project. * Copyright (c) 2013-2025, Marek Gagolewski * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "stri_stringi.h" #include /** * Set names attribute for an R object * * @param object an R object * @param numnames number of names to set * @param ... variable number of C strings * * @version 0.1-?? 
(Marek Gagolewski) * * @version 0.5-1 (Marek Gagolewski, 2015-03-01) * assume UTF-8 */ void stri__set_names(SEXP object, R_len_t numnames, ...) { va_list arguments; SEXP names; PROTECT(names = Rf_allocVector(STRSXP, numnames)); va_start(arguments, numnames); for (R_len_t i = 0; i < numnames; ++i) SET_STRING_ELT(names, i, Rf_mkCharCE(va_arg(arguments, char*), CE_UTF8)); va_end(arguments); Rf_setAttrib(object, R_NamesSymbol, names); UNPROTECT(1); } /** * Create a character vector with given C strings * * @param numnames number of strings * @param ... variable number of C strings * @return character vector * * @version 0.1-?? (Marek Gagolewski, 2013-06-16) * * @version 0.5-1 (Marek Gagolewski, 2015-03-01) * assume UTF-8 */ SEXP stri__make_character_vector_char_ptr(R_len_t numnames, ...) { va_list arguments; SEXP names; PROTECT(names = Rf_allocVector(STRSXP, numnames)); va_start(arguments, numnames); for (R_len_t i = 0; i < numnames; ++i) SET_STRING_ELT(names, i, Rf_mkCharCE(va_arg(arguments, char*), CE_UTF8)); va_end(arguments); UNPROTECT(1); return names; } /** * Create a character vector with given UnicodeStrings * * @param numnames number of strings * @param ... variable number of pointers to UnicodeString * @return character vector * * @version 0.5-1 (Marek Gagolewski, 2015-03-01) */ SEXP stri__make_character_vector_UnicodeString_ptr(R_len_t numnames, ...) 
{ va_list arguments; SEXP names; PROTECT(names = Rf_allocVector(STRSXP, numnames)); va_start(arguments, numnames); for (R_len_t i = 0; i < numnames; ++i) { UnicodeString* cur_str16 = (UnicodeString*)va_arg(arguments, UnicodeString*); std::string cur_str8; cur_str16->toUTF8String(cur_str8); SET_STRING_ELT(names, i, Rf_mkCharCE(cur_str8.c_str(), CE_UTF8)); } va_end(arguments); UNPROTECT(1); return names; } /** * Calculate the length of the output vector when applying a vectorized * operation on >= 2 vectors * * For nonconforming lengths, a warning is given * * @param enableWarning enable warning in case of multiple calls to this function * @param n number of vectors to recycle * @param ... vector lengths * @return max of the given lengths or 0 iff any ns* is <= 0 * * @version 0.1-?? (Marek Gagolewski) * * @version 0.1-?? (Marek Gagolewski) * variable args length */ R_len_t stri__recycling_rule(bool enableWarning, int n, ...) { R_len_t nsm = 0; va_list arguments; va_start(arguments, n); for (R_len_t i = 0; i < n; ++i) { R_len_t curlen = va_arg(arguments, R_len_t); if (curlen <= 0) return 0; if (curlen > nsm) nsm = curlen; } va_end(arguments); if (enableWarning) { va_start(arguments, n); for (R_len_t i = 0; i < n; ++i) { R_len_t curlen = va_arg(arguments, R_len_t); if (nsm % curlen != 0) { Rf_warning(MSG__WARN_RECYCLING_RULE); break; } } va_end(arguments); } return nsm; } /** * Creates a character vector filled with NA_character_ * * @param howmany length of the vector, howmany >= 0 * @return a character vector of length howmany * * @version 0.1-?? (Marek Gagolewski) */ SEXP stri__vector_NA_strings(R_len_t howmany) { if (howmany < 0) { Rf_warning(MSG__EXPECTED_NONNEGATIVE); howmany = 0; } SEXP ret; PROTECT(ret = Rf_allocVector(STRSXP, howmany)); for (R_len_t i=0; i= 0 * @return a character vector of length howmany * * @version 0.1-?? 
(Marek Gagolewski) */ SEXP stri__vector_NA_integers(R_len_t howmany) { if (howmany < 0) { Rf_warning(MSG__EXPECTED_NONNEGATIVE); howmany = 0; } SEXP ret; PROTECT(ret = Rf_allocVector(INTSXP, howmany)); for (R_len_t i=0; i= 0 * @return a character vector of length howmany * * @version 0.1-?? (Marek Gagolewski) */ SEXP stri__vector_empty_strings(R_len_t howmany) { if (howmany < 0) { Rf_warning(MSG__EXPECTED_NONNEGATIVE); howmany = 0; } SEXP ret; PROTECT(ret = Rf_allocVector(STRSXP, howmany)); for (R_len_t i=0; i excluded((size_t)set_length, false); for (int k=0; option[k] != '\0'; ++k) { for (int i=0; i * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef uconfig_local_h #define uconfig_local_h #define STRI_ICU_FOUND @ICU_FOUND@ // used on Windows only, some do not have it #define STRI_DISABLE_RESOLVE_LOCALE_NAME @DISABLE_RESOLVE_LOCALE_NAME@ #if STRI_DISABLE_RESOLVE_LOCALE_NAME #define UCONFIG_USE_WINDOWS_LCID_MAPPING_API 0 #endif /* This caused a serious bug on Solaris, see #94: // #define U_DISABLE_RENAMING 1 // do not turn on! */ /* #if defined(__GNUC__) && __GNUC__ >= 3 #define NORET __attribute__((noreturn)) #else #define NORET #endif extern "C" void NORET Rf_error(const char *, ...); */ #define R_NO_REMAP #include #define UPRV_UNREACHABLE_EXIT (Rf_error("ICU internal error: UPRV_UNREACHABLE")) #define DOUBLE_CONVERSION_UNIMPLEMENTED() (Rf_error("ICU internal error: DOUBLE_CONVERSION_UNIMPLEMENTED")) #define DOUBLE_CONVERSION_UNREACHABLE() (Rf_error("ICU internal error: DOUBLE_CONVERSION_UNREACHABLE")) #if !STRI_ICU_FOUND // if compiling ICU from sources, use: /* fixes #335 (and parts of #314 that were not reverted by #335) */ #define U_LIB_SUFFIX_C_NAME _stringi #ifdef U_CHARSET_IS_UTF8 #undef U_CHARSET_IS_UTF8 #endif #define U_CHARSET_IS_UTF8 0 #endif #ifdef __cplusplus #ifdef STRINGI_MAX_ALIGN_T_STD /* #431: Check for std::max_align_t GCC Bug 56019 - max_align_t should be in std namespace https://gcc.gnu.org/bugzilla/show_bug.cgi?id=56019 */ #include extern "C++" { namespace std { using ::max_align_t; } } #endif #endif /* localtime_r is not a C++98 nor C99 function: it is 
POSIX. Solaris has it, but only for C (thanks to Kurt Hornik for pointing this out) */ /* #if U_PLATFORM == U_PF_SOLARIS || defined(__SUNPRO_CC) */ /*extern struct tm *localtime_r(const time_t *, struct tm *); */ #ifndef _REENTRANT #define _REENTRANT 1 #endif /* #endif */ #endif stringi/src/stri_container_utf8_indexable.h0000644000176200001440000000575414770541312020714 0ustar liggesusers/* This file is part of the 'stringi' project. * Copyright (c) 2013-2025, Marek Gagolewski * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/


#ifndef __stri_container_utf8_indexable_h
#define __stri_container_utf8_indexable_h

#include "stri_container_utf8.h"


/**
 * A class to handle conversion between R character
 * vectors and UTF-8 string vectors,
 * with UChar32 to UTF-8 indexes translation
 *
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-03-20)
 *          separated from StriContainerUTF8
 *
 * @version 0.5-1 (Marek Gagolewski, 2015-02-14)
 *          use String8::isASCII
 */
class StriContainerUTF8_indexable : public StriContainerUTF8 {

private:

    // the following are used in UChar32_to_UTF8_index_back
    // and UChar32_to_UTF8_index_fwd to speed up computations
    // on the same strings
    // (judging by the names: the code point index, the matching UTF-8 byte
    // index and the string they refer to, kept separately for the forward
    // and the backward search -- TODO confirm in the .cpp file)
    R_len_t last_ind_fwd_codepoint;
    R_len_t last_ind_fwd_utf8;
    const char* last_ind_fwd_str;
    R_len_t last_ind_back_codepoint;
    R_len_t last_ind_back_utf8;
    const char* last_ind_back_str;


public:

    StriContainerUTF8_indexable();
    StriContainerUTF8_indexable(SEXP rstr, R_len_t nrecycle, bool shallowrecycle=true);
    StriContainerUTF8_indexable(StriContainerUTF8_indexable& container);
    StriContainerUTF8_indexable& operator=(StriContainerUTF8_indexable& container);

    // index translation for the i-th (vectorized) string; the definitions
    // are not part of this header
    void UTF8_to_UChar32_index(R_len_t i, int* i1, int* i2, const int ni, int adj1, int adj2);
    R_len_t UChar32_to_UTF8_index_back(R_len_t i, R_len_t wh);
    R_len_t UChar32_to_UTF8_index_fwd(R_len_t i, R_len_t wh);
};

#endif
stringi/src/stri_search_coll_startsendswith.cpp0000644000176200001440000002001014770541312021706 0ustar liggesusers/* This file is part of the 'stringi' project.
 * Copyright (c) 2013-2025, Marek Gagolewski
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2.
Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/


#include "stri_stringi.h"
#include "stri_container_utf16.h"
#include "stri_container_usearch.h"
#include "stri_container_integer.h"


/**
 * Detect if a string starts with a pattern match
 *
 * The pattern has to match exactly at the position indicated by `from`.
 *
 * @param str character vector
 * @param pattern character vector
 * @param from integer vector; 1-based code point index of the position to
 *        test; negative values count from the end of the string
 * @param negate single logical value; result used for "no match" is
 *        negate, and for "match" is !negate
 * @param opts_collator named list
 * @return logical vector
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-01)
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 *
 * @version 1.4.7 (Marek Gagolewski, 2020-08-24)
 *    #345: `negate` arg added
 */
SEXP stri_startswith_coll(SEXP str, SEXP pattern, SEXP from, SEXP negate, SEXP opts_collator)
{
    bool negate_1 = stri__prepare_arg_logical_1_notNA(negate, "negate");
    PROTECT(str = stri__prepare_arg_string(str, "str"));
    PROTECT(pattern = stri__prepare_arg_string(pattern, "pattern"));
    PROTECT(from = stri__prepare_arg_integer(from, "from"));

    // opened after all arguments have been prepared; closed explicitly on
    // both the normal and the error path below
    UCollator* collator = NULL;
    collator = stri__ucol_open(opts_collator);

    STRI__ERROR_HANDLER_BEGIN(3)
    int vectorize_length = stri__recycling_rule(true, 3,
                           LENGTH(str), LENGTH(pattern), LENGTH(from));
    StriContainerUTF16 str_cont(str, vectorize_length);
    StriContainerUStringSearch pattern_cont(pattern, vectorize_length, collator);  // collator is not owned by pattern_cont
    StriContainerInteger from_cont(from, vectorize_length);

    SEXP ret;
    STRI__PROTECT(ret = Rf_allocVector(LGLSXP, vectorize_length));
    int* ret_tab = LOGICAL(ret);

    for (R_len_t i = pattern_cont.vectorize_init();
            i != pattern_cont.vectorize_end();
            i = pattern_cont.vectorize_next(i))
    {
        // NOTE(review): macro defined elsewhere; judging by its name and
        // arguments it stores NA for NA input / empty pattern and negate_1
        // for an empty string, then continues -- confirm
        STRI__CONTINUE_ON_EMPTY_OR_NA_STR_PATTERN(str_cont, pattern_cont,
                ret_tab[i] = NA_LOGICAL,
                ret_tab[i] = negate_1)

        if (from_cont.isNA(i)) {
            ret_tab[i] = NA_LOGICAL;
            continue;
        }

        const UnicodeString* str_cur_data = &(str_cont.get(i));
        const UChar* str_cur_s = str_cur_data->getBuffer();
        const int str_cur_n = str_cur_data->length();

        // translate the code point index `from` into a UTF-16 code unit offset
        R_len_t from_cur = from_cont.get(i);
        if (from_cur == 1)
            from_cur = 0; /* most commonly used case */
        else if (from_cur >= 0) {
            R_len_t nskip = from_cur-1;
            from_cur = 0;
            U16_FWD_N(str_cur_s, from_cur, str_cur_n, nskip);
        }
        else {
            R_len_t nskip = -from_cur;
            from_cur = str_cur_n;
            U16_BACK_N(str_cur_s, 0, from_cur, nskip);
        }
        // now surely from_cur >= 0 && from_cur <= str_cur_n

        ret_tab[i] = negate_1;

        if (from_cur >= str_cur_n)
            continue; // no match

        // search only the part of the text that begins at from_cur;
        // a hit counts only if it is located at the very start of that part
        UStringSearch *matcher = pattern_cont.getMatcher(i, str_cur_s+from_cur, str_cur_n-from_cur);
        usearch_reset(matcher);
        UErrorCode status = U_ZERO_ERROR;
        int start = usearch_first(matcher, &status);
        STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})
        if (start != USEARCH_DONE && start == 0)
            ret_tab[i] = !negate_1;
    }

    if (collator) {
        ucol_close(collator);
        collator=NULL;
    }
    STRI__UNPROTECT_ALL
    return ret;
    STRI__ERROR_HANDLER_END(
        if (collator) ucol_close(collator);
    )
}


/**
 * Detect if a string ends with a pattern match
 *
 * The pattern has to match so that the match ends exactly at the
 * position indicated by `to`.
 *
 * @param str character vector
 * @param pattern character vector
 * @param to integer vector; 1-based code point index of the last character
 *        to consider; negative values count from the end of the string
 *        (-1 denotes the last character)
 * @param negate single logical value; result used for "no match" is
 *        negate, and for "match" is !negate
 * @param opts_collator named list
 * @return logical vector
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-01)
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 *
 * @version 1.4.7 (Marek Gagolewski, 2020-08-24)
 *    #345: `negate` arg added
 */
SEXP stri_endswith_coll(SEXP str, SEXP pattern, SEXP to, SEXP negate, SEXP opts_collator)
{
    bool negate_1 = stri__prepare_arg_logical_1_notNA(negate, "negate");
    PROTECT(str = stri__prepare_arg_string(str, "str"));
    PROTECT(pattern = stri__prepare_arg_string(pattern, "pattern"));
    PROTECT(to = stri__prepare_arg_integer(to, "to"));

    // opened after all arguments have been prepared; closed explicitly on
    // both the normal and the error path below
    UCollator* collator = NULL;
    collator = stri__ucol_open(opts_collator);

    STRI__ERROR_HANDLER_BEGIN(3)
    int vectorize_length = stri__recycling_rule(true, 3,
                           LENGTH(str), LENGTH(pattern), LENGTH(to));
    StriContainerUTF16 str_cont(str, vectorize_length);
    StriContainerUStringSearch pattern_cont(pattern, vectorize_length, collator);  // collator is not owned by pattern_cont
    StriContainerInteger to_cont(to, vectorize_length);

    SEXP ret;
    STRI__PROTECT(ret = Rf_allocVector(LGLSXP, vectorize_length));
    int* ret_tab = LOGICAL(ret);

    for (R_len_t i = pattern_cont.vectorize_init();
            i != pattern_cont.vectorize_end();
            i = pattern_cont.vectorize_next(i))
    {
        // NOTE(review): macro defined elsewhere; judging by its name and
        // arguments it stores NA for NA input / empty pattern and negate_1
        // for an empty string, then continues -- confirm
        STRI__CONTINUE_ON_EMPTY_OR_NA_STR_PATTERN(str_cont, pattern_cont,
                ret_tab[i] = NA_LOGICAL,
                ret_tab[i] = negate_1)

        if (to_cont.isNA(i)) {
            ret_tab[i] = NA_LOGICAL;
            continue;
        }

        const UnicodeString* str_cur_data = &(str_cont.get(i));
        const UChar* str_cur_s = str_cur_data->getBuffer();
        const int str_cur_n = str_cur_data->length();

        // translate the code point index `to` into a UTF-16 code unit offset
        // (one past the last code unit to be considered)
        R_len_t to_cur = to_cont.get(i);
        if (to_cur == -1)
            to_cur = str_cur_n; /* most commonly used case */
        else if (to_cur >= 0) {
            R_len_t nskip = to_cur;
            to_cur = 0;
            U16_FWD_N(str_cur_s, to_cur, str_cur_n, nskip);
        }
        else {
            R_len_t nskip = -to_cur-1;
            to_cur = str_cur_n;
            U16_BACK_N(str_cur_s, 0, to_cur, nskip);
        }
        // now surely to_cur >= 0 && to_cur <= str_cur_n

        ret_tab[i] = negate_1;

        if (to_cur <= 0)
            continue; // no match

        // search only the first to_cur code units of the text;
        // a hit counts only if it extends to the very end of that part
        UStringSearch *matcher = pattern_cont.getMatcher(i, str_cur_s, to_cur);
        usearch_reset(matcher);
        UErrorCode status = U_ZERO_ERROR;
        int start = usearch_last(matcher, &status);
        STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})
        if (start != USEARCH_DONE && start+usearch_getMatchedLength(matcher) == to_cur)
            ret_tab[i] = !negate_1;
    }

    if (collator) {
        ucol_close(collator);
        collator=NULL;
    }
    STRI__UNPROTECT_ALL
    return ret;
    STRI__ERROR_HANDLER_END(
        if (collator) ucol_close(collator);
    )
}
stringi/src/stri_search_coll_subset.cpp0000644000176200001440000002122014770541312020131 0ustar liggesusers/* This file is part of the 'stringi' project.
 * Copyright (c) 2013-2025, Marek Gagolewski
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1.
Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "stri_stringi.h" #include "stri_container_utf16.h" #include "stri_container_utf8.h" #include "stri_container_usearch.h" #include /** * Detect if a pattern occurs in a string [with collation] * * @param str character vector * @param pattern character vector * @param omit_na single logical value * @param opts_collator passed to stri__ucol_open(), * if {NA}, then {stri_detect_fixed_byte} is called * * @return character vector * * @version 0.3-1 (Bartek Tartanus, 2014-07-25) * * @version 0.3-1 (Marek Gagolewski, 2014-10-17) * using std::vector to avoid mem-leaks * * @version 0.3-1 (Marek Gagolewski, 2014-11-04) * #112: str_prepare_arg* retvals were not PROTECTed from gc * * @version 0.3-1 (Marek Gagolewski, 2014-11-06) * Added missing ucol_close * * @version 0.4-1 (Marek Gagolewski, 2014-12-04) * #122: omit_na arg added * * @version 1.0-3 (Marek Gagolewski, 2016-02-03) * #216: `negate` arg added * * @version 1.7.1 (Marek Gagolewski, 2021-06-17) * assure LENGTH(pattern) <= LENGTH(str) */ SEXP stri_subset_coll(SEXP str, SEXP pattern, SEXP omit_na, SEXP negate, SEXP opts_collator) { bool negate_1 = stri__prepare_arg_logical_1_notNA(negate, "negate"); bool omit_na1 = stri__prepare_arg_logical_1_notNA(omit_na, "omit_na"); PROTECT(str = stri__prepare_arg_string(str, "str")); PROTECT(pattern = stri__prepare_arg_string(pattern, "pattern")); if (LENGTH(str) > 0 && LENGTH(str) < LENGTH(pattern)) Rf_error(MSG__WARN_RECYCLING_RULE2); R_len_t vectorize_length = stri__recycling_rule(true, 2, LENGTH(str), LENGTH(pattern)); if (vectorize_length == 0) { UNPROTECT(2); return Rf_allocVector(STRSXP, 0); } // call stri__ucol_open after prepare_arg: // if prepare_arg had failed, we would have a mem leak UCollator* collator = NULL; collator = stri__ucol_open(opts_collator); STRI__ERROR_HANDLER_BEGIN(2) StriContainerUTF16 str_cont(str, vectorize_length); StriContainerUStringSearch pattern_cont(pattern, vectorize_length, collator); // collator is not owned by pattern_cont // BT: 
this cannot be done with deque, because pattern is reused so i does not // go like 0,1,2...n but 0,pat_len,2*pat_len,1,pat_len+1 and so on // MG: agreed std::vector which(vectorize_length); int result_counter = 0; for (R_len_t i = pattern_cont.vectorize_init(); i != pattern_cont.vectorize_end(); i = pattern_cont.vectorize_next(i)) { STRI__CONTINUE_ON_EMPTY_OR_NA_STR_PATTERN(str_cont, pattern_cont, {if (omit_na1) which[i] = FALSE; else { which[i] = NA_LOGICAL; result_counter++; } }, {which[i] = negate_1; if (which[i]) result_counter++;}) UStringSearch *matcher = pattern_cont.getMatcher(i, str_cont.get(i)); usearch_reset(matcher); UErrorCode status = U_ZERO_ERROR; which[i] = ((int)usearch_first(matcher, &status) != USEARCH_DONE); // this is F*G slow! :-( if (negate_1) which[i] = !which[i]; if (which[i]) result_counter++; STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) } if (collator) { ucol_close(collator); collator = NULL; } SEXP ret; STRI__PROTECT(ret = stri__subset_by_logical(str_cont, which, result_counter)); STRI__UNPROTECT_ALL return ret; STRI__ERROR_HANDLER_END( if (collator) { ucol_close(collator); collator = NULL; } ) } /** * Substitutes vector elements if a pattern occurs in a string * * @param str character vector * @param pattern character vector * @param value character vector * @param opts_collator list * @return character vector * * @version 1.0-3 (Marek Gagolewski, 2016-02-03) * #124 * * @version 1.0-3 (Marek Gagolewski, 2016-02-03) * #216: `negate` arg added * * @version 1.7.1 (Marek Gagolewski, 2021-06-17) * assure LENGTH(pattern) and LENGTH(value) <= LENGTH(str) */ SEXP stri_subset_coll_replacement(SEXP str, SEXP pattern, SEXP negate, SEXP opts_collator, SEXP value) { bool negate_1 = stri__prepare_arg_logical_1_notNA(negate, "negate"); PROTECT(str = stri__prepare_arg_string(str, "str")); PROTECT(pattern = stri__prepare_arg_string(pattern, "pattern")); PROTECT(value = stri__prepare_arg_string(value, "value")); // we are 
subsetting `str`, therefore recycling is slightly different here if (LENGTH(value) == 0) Rf_error(MSG__REPLACEMENT_ZERO); if (LENGTH(pattern) == 0) Rf_error(MSG__WARN_EMPTY_VECTOR); if (LENGTH(str) == 0) { UNPROTECT(3); return Rf_allocVector(STRSXP, 0); } if (LENGTH(str) < LENGTH(pattern)) // for LENGTH(value), we emit warning later on Rf_error(MSG__WARN_RECYCLING_RULE2); if ((LENGTH(str) % LENGTH(pattern)) != 0) Rf_warning(MSG__WARN_RECYCLING_RULE); R_len_t vectorize_length = LENGTH(str); // call stri__ucol_open after prepare_arg: // if prepare_arg had failed, we would have a mem leak UCollator* collator = NULL; collator = stri__ucol_open(opts_collator); STRI__ERROR_HANDLER_BEGIN(3) R_len_t value_length = LENGTH(value); StriContainerUTF8 value_cont(value, value_length); StriContainerUTF16 str_cont(str, vectorize_length); StriContainerUStringSearch pattern_cont(pattern, vectorize_length, collator); // collator is not owned by pattern_cont SEXP ret; STRI__PROTECT(ret = Rf_allocVector(STRSXP, vectorize_length)); std::vector detected(vectorize_length, 0); for (R_len_t i = pattern_cont.vectorize_init(); i != pattern_cont.vectorize_end(); i = pattern_cont.vectorize_next(i)) { if (pattern_cont.isNA(i)) { // behave like `[<-` detected[i] = false; continue; } STRI__CONTINUE_ON_EMPTY_OR_NA_STR_PATTERN(str_cont, pattern_cont, {detected[i] = NA_INTEGER;}, {detected[i] = negate_1;}) UStringSearch *matcher = pattern_cont.getMatcher(i, str_cont.get(i)); usearch_reset(matcher); UErrorCode status = U_ZERO_ERROR; detected[i] = (((int)usearch_first(matcher, &status) != USEARCH_DONE && !negate_1) || (usearch_first(matcher, &status) == USEARCH_DONE && negate_1)); STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) } R_len_t k = 0; // we must traverse `str_cont` in order now for (R_len_t i = 0; i * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "stri_stringi.h" #include "stri_container_utf8.h" #include "stri_string8buf.h" #include #include /** * fill a vector with CHARSXPs == individual code points in s of length n */ void stri__split_codepoints(vector& out, const char* s, int n) { UChar32 c = 0; R_len_t j = 0; // current pos while (j < n) { U8_NEXT(s, j, n, c); out.push_back(c); if (c < 0) throw StriException(MSG__INVALID_UTF8); } } /** * Translate code points * * * @param str character vector * @param pattern character vector * @param replacement character vector * @return character vector * * @version 0.5-1 (Marek Gagolewski, 2015-04-06) * * @version 1.3.2 (Marek Gagolewski, 2019-02-20) * BUGFIX: overlapping maps (#343) */ SEXP stri_trans_char(SEXP str, SEXP pattern, SEXP replacement) { PROTECT(str = stri__prepare_arg_string(str, "str")); PROTECT(pattern = stri__prepare_arg_string_1(pattern, "pattern")); PROTECT(replacement = stri__prepare_arg_string_1(replacement, "replacement")); R_len_t vectorize_length = LENGTH(str); STRI__ERROR_HANDLER_BEGIN(3) StriContainerUTF8 replacement_cont(replacement, 1); StriContainerUTF8 pattern_cont(pattern, 1); if (replacement_cont.isNA(0) || pattern_cont.isNA(0)) { STRI__UNPROTECT_ALL return stri__vector_NA_strings(LENGTH(str)); } const String8* s_pat = &pattern_cont.get(0); const String8* s_rep = &replacement_cont.get(0); std::vector d_pat; stri__split_codepoints(d_pat, s_pat->c_str(), s_pat->length()); std::vector d_rep; stri__split_codepoints(d_rep, s_rep->c_str(), s_rep->length()); R_len_t m = std::min(d_rep.size(), d_pat.size()); if (d_pat.size() != d_rep.size()) { Rf_warning(MSG__WARN_RECYCLING_RULE2); } StriContainerUTF8 str_cont(str, vectorize_length); if (m == 0) { // nothing to do STRI__UNPROTECT_ALL return str_cont.toR(); // assure UTF-8 } SEXP ret; STRI__PROTECT(ret = Rf_allocVector(STRSXP, vectorize_length)); std::vector buf; for (R_len_t i = str_cont.vectorize_init(); i != str_cont.vectorize_end(); i = str_cont.vectorize_next(i)) { if 
(str_cont.isNA(i)) { SET_STRING_ELT(ret, i, NA_STRING); continue; } buf.clear(); const char* s = str_cont.get(i).c_str(); R_len_t n = str_cont.get(i).length(); UChar32 c = 0; R_len_t j = 0; // current pos while (j < n) { U8_NEXT(s, j, n, c); if (c < 0) throw StriException(MSG__INVALID_UTF8); // considering only the first m elements in d_pat and d_rep, from last for (R_len_t k=m-1; k>=0; --k) { if (d_pat[k] == c) { c = d_rep[k]; break; } } // U8_APPEND_UNSAFE(buf, /end/, c) uint32_t __uc=(c); if(__uc<=0x7f) { buf.push_back((uint8_t)__uc); } else { if(__uc<=0x7ff) { buf.push_back((uint8_t)((__uc>>6)|0xc0)); } else { if(__uc<=0xffff) { buf.push_back((uint8_t)((__uc>>12)|0xe0)); } else { buf.push_back((uint8_t)((__uc>>18)|0xf0)); buf.push_back((uint8_t)(((__uc>>12)&0x3f)|0x80)); } buf.push_back((uint8_t)(((__uc>>6)&0x3f)|0x80)); } buf.push_back((uint8_t)((__uc&0x3f)|0x80)); } } SET_STRING_ELT(ret, i, Rf_mkCharLenCE(buf.data(), buf.size(), CE_UTF8)); } STRI__UNPROTECT_ALL return ret; STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */) } stringi/src/stri_container_listint.cpp0000644000176200001440000001011614770541312020020 0ustar liggesusers/* This file is part of the 'stringi' project. * Copyright (c) 2013-2025, Marek Gagolewski * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "stri_stringi.h" #include "stri_container_listint.h" /** * Default constructor * */ StriContainerListInt::StriContainerListInt() : StriContainerBase() { data = NULL; } /** * Construct Container from R cobject * @param rstr R object * * if you want nrecycle > n, call set_nrecycle */ StriContainerListInt::StriContainerListInt(SEXP rstr) { this->data = NULL; if (Rf_isNull(rstr)) { this->init_Base(1, 1, true); this->data = new IntVec[this->n]; // 1 vector, NA/NULL if (!this->data) throw StriException(MSG__MEM_ALLOC_ERROR); } else if (Rf_isInteger(rstr)) { this->init_Base(1, 1, true); this->data = new IntVec[this->n]; if (!this->data) throw StriException(MSG__MEM_ALLOC_ERROR); this->data[0].initialize((const int*)INTEGER(rstr), LENGTH(rstr)); // shallow copy // TODO: ALTREP will be problematic? 
} else // if (Rf_isVectorList(rstr)) -- args already checked { R_len_t nv = LENGTH(rstr); this->init_Base(nv, nv, true); this->data = new IntVec[this->n]; if (!this->data) throw StriException(MSG__MEM_ALLOC_ERROR); for (R_len_t i=0; in; ++i) { SEXP cur = VECTOR_ELT(rstr, i); if (!Rf_isNull(cur)) this->data[i].initialize((const int*)INTEGER(cur), LENGTH(cur)); // shallow copy // TODO: ALTREP will be problematic? // else leave as-is, i.e., NULL/NA } } } StriContainerListInt::StriContainerListInt(StriContainerListInt& container) : StriContainerBase((StriContainerBase&)container) { if (container.data) { this->data = new IntVec[this->n]; if (!this->data) throw StriException(MSG__MEM_ALLOC_ERROR); for (int i=0; in; ++i) { this->data[i] = container.data[i]; } } else { this->data = NULL; } } StriContainerListInt& StriContainerListInt::operator=(StriContainerListInt& container) { this->~StriContainerListInt(); (StriContainerBase&) (*this) = (StriContainerBase&)container; if (container.data) { this->data = new IntVec[this->n]; if (!this->data) throw StriException(MSG__MEM_ALLOC_ERROR); for (int i=0; in; ++i) { this->data[i] = container.data[i]; } } else { this->data = NULL; } return *this; } StriContainerListInt::~StriContainerListInt() { if (data) { delete [] data; data = NULL; } } stringi/src/stri_search_regex_extract.cpp0000644000176200001440000002520014770541312020461 0ustar liggesusers/* This file is part of the 'stringi' project. * Copyright (c) 2013-2025, Marek Gagolewski * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. 
Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "stri_stringi.h" #include "stri_container_utf8.h" #include "stri_container_regex.h" #include #include using namespace std; /** * Extract first occurrence of a regex pattern in each string * * @param str character vector * @param pattern character vector * @param opts_regex list * @param first logical - search for the first or the last occurrence? * @return character vector * * @version 0.1-?? 
(Marek Gagolewski, 2013-06-20) * * @version 0.3-1 (Marek Gagolewski, 2014-11-05) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc * * @version 1.0-2 (Marek Gagolewski, 2016-01-29) * Issue #214: allow a regex pattern like `.*` to match an empty string * * @version 1.4.7 (Marek Gagolewski, 2020-08-24) * Use StriContainerRegexPattern::getRegexOptions */ SEXP stri__extract_firstlast_regex(SEXP str, SEXP pattern, SEXP opts_regex, bool first) { PROTECT(str = stri__prepare_arg_string(str, "str")); // prepare string argument PROTECT(pattern = stri__prepare_arg_string(pattern, "pattern")); // prepare string argument R_len_t vectorize_length = stri__recycling_rule(true, 2, LENGTH(str), LENGTH(pattern)); StriRegexMatcherOptions pattern_opts = StriContainerRegexPattern::getRegexOptions(opts_regex); UText* str_text = NULL; // may potentially be slower, but definitely is more convenient! STRI__ERROR_HANDLER_BEGIN(2) StriContainerUTF8 str_cont(str, vectorize_length); StriContainerRegexPattern pattern_cont(pattern, vectorize_length, pattern_opts); SEXP ret; STRI__PROTECT(ret = Rf_allocVector(STRSXP, vectorize_length)); for (R_len_t i = pattern_cont.vectorize_init(); i != pattern_cont.vectorize_end(); i = pattern_cont.vectorize_next(i)) { STRI__CONTINUE_ON_EMPTY_OR_NA_PATTERN(str_cont, pattern_cont, SET_STRING_ELT(ret, i, NA_STRING);) UErrorCode status = U_ZERO_ERROR; RegexMatcher *matcher = pattern_cont.getMatcher(i); // will be deleted automatically str_text = utext_openUTF8(str_text, str_cont.get(i).c_str(), str_cont.get(i).length(), &status); STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) int m_start = -1; int m_end = -1; int m_res; matcher->reset(str_text); m_res = (int)matcher->find(status); STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) if (m_res) { // find first match m_start = (int)matcher->start(status); // The **native** position in the input string :-) STRI__CHECKICUSTATUS_THROW(status, {/* do nothing 
special on err */}) m_end = (int)matcher->end(status); STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) } else { SET_STRING_ELT(ret, i, NA_STRING); continue; } if (!first) { // continue searching while (1) { m_res = (int)matcher->find(status); STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) if (!m_res) break; m_start = (int)matcher->start(status); m_end = (int)matcher->end(status); STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) } } SET_STRING_ELT(ret, i, Rf_mkCharLenCE(str_cont.get(i).c_str()+m_start, m_end-m_start, CE_UTF8)); } if (str_text) { utext_close(str_text); str_text = NULL; } STRI__UNPROTECT_ALL return ret; STRI__ERROR_HANDLER_END(if (str_text) utext_close(str_text);) } /** * Extract first occurrence of a regex pattern in each string * * @param str character vector * @param pattern character vector * @param opts_regex list * @return character vector * * @version 0.1-?? (Marek Gagolewski, 2013-06-20) */ SEXP stri_extract_first_regex(SEXP str, SEXP pattern, SEXP opts_regex) { return stri__extract_firstlast_regex(str, pattern, opts_regex, true); } /** * Extract last occurrence of a regex pattern in each string * * @param str character vector * @param pattern character vector * @param opts_regex list * @return character vector * * @version 0.1-?? (Marek Gagolewski, 2013-06-20) */ SEXP stri_extract_last_regex(SEXP str, SEXP pattern, SEXP opts_regex) { return stri__extract_firstlast_regex(str, pattern, opts_regex, false); } /** * Extract all occurrences of a regex pattern in each string * * @param str character vector * @param pattern character vector * @param opts_regex list * @param simplify single logical value * * @return list of character vectors or character matrix * * @version 0.1-?? 
(Marek Gagolewski, 2013-06-20) * * @version 0.3-1 (Marek Gagolewski, 2014-10-24) * added simplify param * * @version 0.3-1 (Marek Gagolewski, 2014-11-05) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc * * @version 0.4-1 (Marek Gagolewski, 2014-11-27) * FR #117: omit_no_match arg added * * @version 0.4-1 (Marek Gagolewski, 2014-12-04) * allow `simplify=NA` * * @version 1.0-2 (Marek Gagolewski, 2016-01-29) * Issue #214: allow a regex pattern like `.*` to match an empty string */ SEXP stri_extract_all_regex(SEXP str, SEXP pattern, SEXP simplify, SEXP omit_no_match, SEXP opts_regex) { StriRegexMatcherOptions pattern_opts = StriContainerRegexPattern::getRegexOptions(opts_regex); bool omit_no_match1 = stri__prepare_arg_logical_1_notNA(omit_no_match, "omit_no_match"); PROTECT(simplify = stri__prepare_arg_logical_1(simplify, "simplify")); PROTECT(str = stri__prepare_arg_string(str, "str")); // prepare string argument PROTECT(pattern = stri__prepare_arg_string(pattern, "pattern")); // prepare string argument R_len_t vectorize_length = stri__recycling_rule(true, 2, LENGTH(str), LENGTH(pattern)); UText* str_text = NULL; // may potentially be slower, but definitely is more convenient! 
STRI__ERROR_HANDLER_BEGIN(3) StriContainerUTF8 str_cont(str, vectorize_length); StriContainerRegexPattern pattern_cont(pattern, vectorize_length, pattern_opts); SEXP ret; STRI__PROTECT(ret = Rf_allocVector(VECSXP, vectorize_length)); for (R_len_t i = pattern_cont.vectorize_init(); i != pattern_cont.vectorize_end(); i = pattern_cont.vectorize_next(i)) { STRI__CONTINUE_ON_EMPTY_OR_NA_PATTERN(str_cont, pattern_cont, SET_VECTOR_ELT(ret, i, stri__vector_NA_strings(1));) UErrorCode status = U_ZERO_ERROR; RegexMatcher *matcher = pattern_cont.getMatcher(i); // will be deleted automatically str_text = utext_openUTF8(str_text, str_cont.get(i).c_str(), str_cont.get(i).length(), &status); STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) matcher->reset(str_text); deque< pair > occurrences; int m_res; while (1) { m_res = (int)matcher->find(status); STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) if (!m_res) break; occurrences.push_back(pair( (R_len_t)matcher->start(status), (R_len_t)matcher->end(status) )); STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) } R_len_t noccurrences = (R_len_t)occurrences.size(); if (noccurrences <= 0) { SET_VECTOR_ELT(ret, i, stri__vector_NA_strings(omit_no_match1?0:1)); continue; } const char* str_cur_s = str_cont.get(i).c_str(); SEXP cur_res; STRI__PROTECT(cur_res = Rf_allocVector(STRSXP, noccurrences)); deque< pair >::iterator iter = occurrences.begin(); for (R_len_t j = 0; iter != occurrences.end(); ++iter, ++j) { pair curo = *iter; SET_STRING_ELT(cur_res, j, Rf_mkCharLenCE(str_cur_s+curo.first, curo.second-curo.first, CE_UTF8)); } SET_VECTOR_ELT(ret, i, cur_res); STRI__UNPROTECT(1); } if (str_text) { utext_close(str_text); str_text = NULL; } if (LOGICAL(simplify)[0] == NA_LOGICAL || LOGICAL(simplify)[0]) { SEXP robj_TRUE, robj_zero, robj_na_strings, robj_empty_strings; STRI__PROTECT(robj_TRUE = Rf_ScalarLogical(TRUE)); STRI__PROTECT(robj_zero = Rf_ScalarInteger(0)); 
STRI__PROTECT(robj_na_strings = stri__vector_NA_strings(1)); STRI__PROTECT(robj_empty_strings = stri__vector_empty_strings(1)); STRI__PROTECT(ret = stri_list2matrix(ret, robj_TRUE, (LOGICAL(simplify)[0] == NA_LOGICAL)?robj_na_strings :robj_empty_strings, robj_zero)); } STRI__UNPROTECT_ALL return ret; STRI__ERROR_HANDLER_END(if (str_text) utext_close(str_text);) } stringi/src/stri_trans_transliterate.cpp0000644000176200001440000001241014770541312020357 0ustar liggesusers/* This file is part of the 'stringi' project. * Copyright (c) 2013-2025, Marek Gagolewski * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "stri_stringi.h" #include "stri_container_utf16.h" #include #include #include /** List available transliterators * * @return character vector * * @version 0.2-2 (Marek Gagolewski, 2014-04-19) * * @version 0.2-3 (Marek Gagolewski, 2015-05-12) * uses Transliterator::getAvailableIDs * as getAvailableID is obsolete as of ICU 3.x * * @version 0.3-1 (Marek Gagolewski, 2014-11-04) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc */ SEXP stri_trans_list() { StringEnumeration* trans_enum = NULL; STRI__ERROR_HANDLER_BEGIN(0) UErrorCode status = U_ZERO_ERROR; trans_enum = Transliterator::getAvailableIDs(status); /*The caller should delete this object when done using it. 
*/ STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) trans_enum->reset(status); STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) R_len_t n = (R_len_t)trans_enum->count(status); STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) SEXP ret; STRI__PROTECT(ret = Rf_allocVector(STRSXP, n)); // MG: I reckon than IDs are more readable than DisplayNames for (R_len_t i=0; inext(&len, status); STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) SET_STRING_ELT(ret, i, Rf_mkCharLenCE(cur, len, CE_UTF8)); } if (trans_enum) { delete trans_enum; trans_enum = NULL; } STRI__UNPROTECT_ALL return ret; STRI__ERROR_HANDLER_END( if (trans_enum) { delete trans_enum; trans_enum = NULL; } ) } /** General text transform with ICU Transliterator * * @param str character vector * @param id single string * @param rules single bool * @param forward single bool * @return character vector * * @version 0.2-2 (Marek Gagolewski, 2014-04-19) * @version 1.6.3 (Marek Gagolewski, 2021-06-03) rules, forward */ SEXP stri_trans_general(SEXP str, SEXP id, SEXP rules, SEXP forward) { PROTECT(str = stri__prepare_arg_string(str, "str")); PROTECT(id = stri__prepare_arg_string_1(id, "id")); bool rules_val = stri__prepare_arg_logical_1_notNA(rules, "rules"); bool forward_val = stri__prepare_arg_logical_1_notNA(forward, "forward"); R_len_t str_length = LENGTH(str); Transliterator* trans = NULL; STRI__ERROR_HANDLER_BEGIN(2) StriContainerUTF16 id_cont(id, 1); if (id_cont.isNA(0)) { STRI__UNPROTECT_ALL return stri__vector_NA_strings(str_length); } UErrorCode status = U_ZERO_ERROR; UParseError parserr; if (!rules_val) trans = Transliterator::createInstance( id_cont.get(0), (forward_val?UTRANS_FORWARD:UTRANS_REVERSE), status ); else trans = Transliterator::createFromRules( UnicodeString("Rule-based Transliterator"), // can be anything id_cont.get(0), (forward_val?UTRANS_FORWARD:UTRANS_REVERSE), parserr, status ); 
STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) StriContainerUTF16 str_cont(str, str_length, false); // writable, no recycle for (R_len_t i=0; itransliterate(str_cont.getWritable(i)); } if (trans) { delete trans; trans = NULL; } STRI__UNPROTECT_ALL return str_cont.toR(); STRI__ERROR_HANDLER_END( if (trans) { delete trans; trans = NULL; } ) } stringi/src/stri_time_symbols.cpp0000644000176200001440000001670414770541312017007 0ustar liggesusers/* This file is part of the 'stringi' project. * Copyright (c) 2013-2025, Marek Gagolewski * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "stri_stringi.h" #include "stri_string8buf.h" #include "stri_container_utf8.h" #include #include #include /** List Localizable Date-Time Formatting Data * * @param locale single string or NULL * @param context single string * @param width single string * @return list * * @version 0.5-1 (Marek Gagolewski, 2014-12-25) * * @version 0.5-1 (Marek Gagolewski, 2015-01-01) * use calendar keyword in locale */ SEXP stri_datetime_symbols(SEXP locale, SEXP context, SEXP width) { const char* qloc = stri__prepare_arg_locale(locale, "locale"); /* this is R_alloc'ed */ const char* context_str = stri__prepare_arg_string_1_notNA(context, "context"); const char* context_opts[] = {"format", "standalone", NULL}; int context_cur = stri__match_arg(context_str, context_opts); const char* width_str = stri__prepare_arg_string_1_notNA(width, "width"); const char* width_opts[] = {"abbreviated", "wide", "narrow", NULL}; int width_cur = stri__match_arg(width_str, width_opts); DateFormatSymbols::DtContextType context_val; if (context_cur == 0) context_val = DateFormatSymbols::FORMAT; else if (context_cur == 1) context_val = DateFormatSymbols::STANDALONE; else Rf_error(MSG__INCORRECT_MATCH_OPTION, "context"); DateFormatSymbols::DtWidthType width_val; if (width_cur == 0) width_val = DateFormatSymbols::ABBREVIATED; else if (width_cur == 1) width_val = DateFormatSymbols::WIDE; else if (width_cur == 2) width_val = DateFormatSymbols::NARROW; else 
Rf_error(MSG__INCORRECT_MATCH_OPTION, "width"); UErrorCode status = U_ZERO_ERROR; String8buf calendar_type(128); Locale loc = Locale::createFromName(qloc); int32_t kvlen = loc.getKeywordValue("calendar", calendar_type.data(), calendar_type.size(), status); STRI__CHECKICUSTATUS_RFERROR(status, {/* do nothing special on err */}) status = U_ZERO_ERROR; DateFormatSymbols sym(status); status = U_ZERO_ERROR; if (kvlen == 0) sym = DateFormatSymbols(loc, status); else sym = DateFormatSymbols(loc, calendar_type.data(), status); STRI__CHECKICUSTATUS_RFERROR(status, {/* do nothing special on err */}) if (status == U_USING_DEFAULT_WARNING && qloc) { //UErrorCode status2 = U_ZERO_ERROR; //const char* valid_locale = sym.getLocale(ULOC_VALID_LOCALE, status2).getBaseName(); // NOTE! It does not fall back to the "root" locale! //if (valid_locale && !strcmp(valid_locale, "root")) Rf_warning("%s", ICUError::getICUerrorName(status)); } const R_len_t infosize = 5; SEXP vals; R_len_t j = -1; PROTECT(vals = Rf_allocVector(VECSXP, infosize)); for (int i=0; i 0 && ret[0].length() == 0) { // this always(?) returns an emty string at the beginning --count; ++ret; } SET_VECTOR_ELT(vals, j, Rf_allocVector(STRSXP, count)); for (int32_t i=0; i * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "stri_stringi.h" #include "stri_ucnv.h" #include "stri_container_utf8.h" /** * Get the largest number of bytes amongst the strings in a character vector * (useful for allocating temporary buffers) * * If all strings are NA or an empty character vector is given, -1 is returned. * Prior to memory allocation, you should check for < 0! * * Note that ICU permits only strings of length < 2^31. * @param s character vector * @return maximal number of bytes * * @version 0.1-?? (Marek Gagolewski) */ R_len_t stri__numbytes_max(SEXP str) { // STRI_ASSERT - str is a character vector R_len_t ns = LENGTH(str); if (ns <= 0) return -1; R_len_t maxlen = -1; for (R_len_t i=0; i maxlen) maxlen = cns; } } return maxlen; // TODO: overload this function for StriContainers..... } /** * Count the number of characters/code points in a string * * Note that ICU permits only strings of length < 2^31. * * @param s character vector * @return integer vector * * @version 0.1-?? (Marcin Bujarski) * * @version 0.1-?? (Marek Gagolewski) * Multiple input encoding support * * @version 0.1-?? 
(Marek Gagolewski, 2013-06-16) * make StriException-friendly * * @version 0.2-1 (Marek Gagolewski, 2014-03-27) * using StriUcnv; * warn on invalid UTF-8 sequences * * @version 0.3-1 (Marek Gagolewski, 2014-11-04) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc * * @version 1.6.3 (Marek Gagolewski, 2021-05-22) * use stri__length_string for UTF-8 */ SEXP stri_length(SEXP str) { PROTECT(str = stri__prepare_arg_string(str, "str")); STRI__ERROR_HANDLER_BEGIN(1) R_len_t str_n = LENGTH(str); SEXP ret; STRI__PROTECT(ret = Rf_allocVector(INTSXP, str_n)); int* retint = INTEGER(ret); StriUcnv ucnvNative(NULL); for (R_len_t k = 0; k < str_n; k++) { SEXP curs = STRING_ELT(str, k); if (curs == NA_STRING) { retint[k] = NA_INTEGER; continue; } R_len_t curs_n = LENGTH(curs); // O(1) - stored in SEXPREC if (IS_ASCII(curs) || IS_LATIN1(curs)) { retint[k] = curs_n; } else if (IS_BYTES(curs)) { throw StriException(MSG__BYTESENC); } else if (IS_UTF8(curs) || ucnvNative.isUTF8()) { // UTF-8 or native is UTF-8 const char* curs_s = CHAR(curs); // TODO: ALTREP will be problematic? retint[k] = stri__length_string(curs_s, curs_n); } else if (ucnvNative.is8bit()) { // native-8bit retint[k] = curs_n; } else { // native encoding, not 8 bit UConverter* uconv = ucnvNative.getConverter(); // native encoding which is neither 8-bit nor UTF-8 (e.g., 'Big5') // this is weird, but we're prepared UErrorCode status = U_ZERO_ERROR; const char* source = CHAR(curs); // TODO: ALTREP will be problematic? const char* sourceLimit = source + curs_n; R_len_t j; for (j = 0; source != sourceLimit; j++) { /*ignore_retval=*/ucnv_getNextUChar(uconv, &source, sourceLimit, &status); STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) } retint[k] = j; // all right, we got it! } } STRI__UNPROTECT_ALL return ret; STRI__ERROR_HANDLER_END({ /* no special action on error */ }) } /** * Get number of bytes in each string * * Note that ICU permits only strings of length < 2^31. 
* * @param s R object coercible to a character vector * @return integer vector * * @version 0.1-?? (Marcin Bujarski) * * @version 0.2-1 (Marek Gagolewski, 2014-04-01) * StriException-friendly * * @version 0.3-1 (Marek Gagolewski, 2014-11-04) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc */ SEXP stri_numbytes(SEXP str) { PROTECT(str = stri__prepare_arg_string(str, "str")); // prepare string argument R_len_t str_n = LENGTH(str); STRI__ERROR_HANDLER_BEGIN(1) SEXP ret; STRI__PROTECT(ret = Rf_allocVector(INTSXP, str_n)); int* retint = INTEGER(ret); for (R_len_t i=0; i width = 0 */ if (U_GET_GC_MASK(c) & (U_GC_MN_MASK | U_GC_ME_MASK | U_GC_CF_MASK | U_GC_CC_MASK)) return 0; /* Hangul Jamo medial vowels and final consonants have width 0 */ int hangul = (int)u_getIntPropertyValue(c, UCHAR_HANGUL_SYLLABLE_TYPE); if (hangul == U_HST_VOWEL_JAMO || hangul == U_HST_TRAILING_JAMO) return 0; /* Variation Selectors */ if (c >= (UChar32)0xFE00 && c <= (UChar32)0xFE0F) return 0; #if U_ICU_VERSION_MAJOR_NUM>=57 // UCHAR_EMOJI_* is ICU >= 57 if ( u_hasBinaryProperty(c, UCHAR_EMOJI_MODIFIER) ) { return 0; } #endif if (width == U_EA_FULLWIDTH || width == U_EA_WIDE) return 2; /* v1.6.1 had U_EA_AMBIGUOUS set to 2, this was not a good idea if (width == U_EA_AMBIGUOUS) return 2; // 'a' is narrow // 'a with ogonek' is neutral // 'Eszett' is ambiguous // 'grave accent' is narrow */ /* v1.6.1 GC=So -> width = 2 */ if (U_GET_GC_MASK(c) & (U_GC_SO_MASK)) return 2; /* v1.6.1 had GC=Sk of width = 0 but there are exceptions: \u005E \N{CIRCUMFLEX ACCENT} ^ is Sk \u0060 \N{GRAVE ACCENT} ` is Sk generally, it's not a good idea */ #if U_ICU_VERSION_MAJOR_NUM>=57 // UCHAR_EMOJI_* is ICU >= 57 if (width == U_EA_NEUTRAL && u_hasBinaryProperty(c, UCHAR_EMOJI_PRESENTATION)) return 2; #endif /* any other characters have width 1 */ return 1; } /** Get width of a single character (context-dependent) * * inspired by http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c * and 
https://github.com/nodejs/node/blob/master/src/node_i18n.cc * but with extras * * @version 1.6.3 (Marek Gagolewski, 2021-06-14) * stand-alone fun * * @param c code point * @param p previous code point * @return int */ int stri__width_char_with_context(UChar32 c, UChar32 p, bool& reset) { if (reset) { p = 0; reset = false; } #if U_ICU_VERSION_MAJOR_NUM>=57 // UCHAR_EMOJI_* is ICU >= 57 if ( /*j > 0 &&*/ p == 0x200D /* ZERO WIDTH JOINER */ && ( u_hasBinaryProperty(c, UCHAR_EMOJI_MODIFIER) || u_hasBinaryProperty(c, UCHAR_EMOJI_PRESENTATION) || c == 0x2640 /* FEMALE */ || c == 0x2642 /* MALE */ || c == 0x26A7 /* TRANSGENDER */ || c == 0x2695 /* HEALTH */ || c == 0x2696 /* JUDGE */ || c == 0x1F5E8 /* SPEECH */ || c == 0x1F32B /* CLOUDS */ || c == 0x2708 /* PLANE */ || c == 0x2764 /* HEART */ || c == 0x2744 /* SNOWFLAKE */ || c == 0x2620 /* SKULL AND CROSSBONES */ ) ) { // emoji sequence - ignore (display might not support it) return 0; } else if ( /*j > 0 &&*/ (p >= 0x1F1E6 && p <= 0x1F1FF) && (c >= 0x1F1E6 && c <= 0x1F1FF) ) { // E2.0 flag (p counted as of width=2 already) reset = true; // allow the next flag to be recognised return 0; } else { return stri__width_char(c); } #else // U_ICU_VERSION_MAJOR_NUM < 57 - no emoji support return stri__width_char(c); #endif } /** Get the length (number of Unicode code points) of a single UTF-8 string * or get the position where a substring of <= max_length ends * * @param str_cur_s string * @param str_cur_n number of bytes in str_cur_s * @param max_length * @return length of the whole string (if max_length==NA_INTEGER) or index * * @version 1.6.3 (Marek Gagolewski, 2021-05-22) * extracted from stri_length */ int stri__length_string(const char* str_cur_s, int str_cur_n, int max_length) { // is string is in ASCII, then length == str_cur_n, but with // merely str_cur_s ptr we are unable to tell that here UChar32 c = 0; R_len_t j = 0; R_len_t cur_length = 0; while (j < str_cur_n) { R_len_t prevj = j; U8_NEXT(str_cur_s, j, str_cur_n, 
c); // faster that U8_FWD_1 & gives bad UChar32s if (c < 0) throw StriException(MSG__INVALID_UTF8); cur_length++; if (max_length != NA_INTEGER && cur_length > max_length) return prevj; } if (max_length == NA_INTEGER) return cur_length; else return str_cur_n; // the whole string has length <= max_length } /** Get the width of a single UTF-8 string or get the position where * a substring of <= max_width ends * * @param str_cur_s string * @param str_cur_n number of bytes in str_cur_s * @param max_width * @return width of the whole string (if max_width==NA_INTEGER) * or index * * @version 1.6.1 (Marek Gagolewski) * most in https://unicode.org/Public/emoji/13.1/emoji-test.txt of width=2 * * @version 1.6.2 (Marek Gagolewski, 2021-05-13) * bugfixes * * @version 1.6.3 (Marek Gagolewski, 2021-05-22) * max_width */ int stri__width_string(const char* str_cur_s, int str_cur_n, int max_width) { int cur_width = 0; UChar32 p; // previous UChar32 c = 0; // current R_len_t j = 0; bool reset = true; while (j < str_cur_n) { R_len_t prevj = j; p = c; U8_NEXT(str_cur_s, j, str_cur_n, c); if (c < 0) throw StriException(MSG__INVALID_UTF8); cur_width += stri__width_char_with_context(c, p, reset); // test if max_width exceeded (here; there may be zero-width chars) if (max_width != NA_INTEGER && cur_width > max_width) return prevj; } if (max_width == NA_INTEGER) return cur_width; else return str_cur_n; // the whole string has width <= max_width } /** * Determine the width of strings * * @param str character vector * @return integer vector * * @version 0.5-1 (Marek Gagolewski, 2015-04-22) */ SEXP stri_width(SEXP str) { PROTECT(str = stri__prepare_arg_string(str, "str")); // prepare string argument STRI__ERROR_HANDLER_BEGIN(1) R_len_t str_n = LENGTH(str); StriContainerUTF8 str_cont(str, str_n); SEXP ret; STRI__PROTECT(ret = Rf_allocVector(INTSXP, str_n)); int* retint = INTEGER(ret); for (R_len_t i = str_cont.vectorize_init(); i != str_cont.vectorize_end(); i = str_cont.vectorize_next(i)) { if 
(str_cont.isNA(i)) { retint[i] = NA_INTEGER; continue; } const char* str_cur_s = str_cont.get(i).c_str(); R_len_t str_cur_n = str_cont.get(i).length(); retint[i] = stri__width_string(str_cur_s, str_cur_n); } STRI__UNPROTECT_ALL return ret; STRI__ERROR_HANDLER_END({ /* no special action on error */ }) } stringi/src/stri_container_usearch.cpp0000644000176200001440000001215614770541312017772 0ustar liggesusers/* This file is part of the 'stringi' project. * Copyright (c) 2013-2025, Marek Gagolewski * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "stri_stringi.h" #include "stri_container_usearch.h" /** * Default constructor * */ StriContainerUStringSearch::StriContainerUStringSearch() : StriContainerUTF16() { this->lastMatcherIndex = -1; this->str = NULL; this->col = NULL; } /** * Construct String Container from R character vector * @param rstr R character vector * @param nrecycle extend length [vectorization] * @param col Collator; owned by external caller */ StriContainerUStringSearch::StriContainerUStringSearch(SEXP rstr, R_len_t _nrecycle, UCollator* _col) : StriContainerUTF16(rstr, _nrecycle, true) { this->lastMatcherIndex = -1; this->lastMatcher = NULL; this->col = _col; R_len_t n = get_n(); for (R_len_t i=0; ilastMatcherIndex = -1; this->lastMatcher = NULL; this->col = container.col; } StriContainerUStringSearch& StriContainerUStringSearch::operator=(StriContainerUStringSearch& container) { this->~StriContainerUStringSearch(); (StriContainerUTF16&) (*this) = (StriContainerUTF16&)container; this->lastMatcherIndex = -1; this->lastMatcher = NULL; this->col = container.col; return *this; } /** Destructor * */ StriContainerUStringSearch::~StriContainerUStringSearch() { if (lastMatcher) { usearch_close(lastMatcher); lastMatcher = NULL; } col = NULL; // col is owned by the caller } /** the returned matcher shall not be deleted by the user * * it is assumed that \code{vectorize_next()} is used: * for \code{i >= this->n} the last matcher is returned * * * @param i index * @param searchStr string to search in */ UStringSearch* StriContainerUStringSearch::getMatcher(R_len_t i, const UnicodeString& searchStr) { return getMatcher(i, searchStr.getBuffer(), searchStr.length()); } /** the returned matcher shall not be deleted by the user * * it is assumed that \code{vectorize_next()} is used: * for \code{i >= this->n} the last matcher is returned * * * @param i index * @param searchStr string to search in * @param searchStr_len string length in UChars */ UStringSearch* 
StriContainerUStringSearch::getMatcher(R_len_t i, const UChar* searchStr, int32_t searchStr_len) { if (!lastMatcher) { this->lastMatcherIndex = (i % n); UErrorCode status = U_ZERO_ERROR; lastMatcher = usearch_openFromCollator( this->get(i).getBuffer(), this->get(i).length(), searchStr, searchStr_len, this->col, NULL, &status); STRI__CHECKICUSTATUS_THROW(status, {usearch_close(lastMatcher); lastMatcher = NULL;}) return lastMatcher; } if (this->lastMatcherIndex == (i % n)) { // do nothing => matcher reuse } else { this->lastMatcherIndex = (i % n); UErrorCode status = U_ZERO_ERROR; usearch_setPattern(lastMatcher, this->get(i).getBuffer(), this->get(i).length(), &status); STRI__CHECKICUSTATUS_THROW(status, {usearch_close(lastMatcher); lastMatcher = NULL;}) } UErrorCode status = U_ZERO_ERROR; usearch_setText(lastMatcher, searchStr, searchStr_len, &status); STRI__CHECKICUSTATUS_THROW(status, {usearch_close(lastMatcher); lastMatcher = NULL;}) return lastMatcher; } stringi/src/stri_encoding_detection.cpp0000644000176200001440000010514414770541452020127 0ustar liggesusers/* This file is part of the 'stringi' project. * Copyright (c) 2013-2025, Marek Gagolewski * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "stri_stringi.h" #include #include #include #include #include #include #include #include #include #include "stri_container_listraw.h" #include "stri_container_logical.h" #include "stri_ucnv.h" using namespace std; /** Check if a string may be valid 8-bit (including UTF-8) encoded * * simple check whether all charcodes are nonzero * * @param str_cur_s character vector * @param str_cur_n number of bytes * @param get_confidence determine confidence value or do exact check * * @return confidence value in [0,1] * * @version 0.1-?? (Bartek Tartanus) * * @version 0.1-?? (Marek Gagolewski, 2013-08-06) * separate func * * @version 0.1-?? 
(Marek Gagolewski, 2013-08-13) * warnchars count added */ double stri__enc_check_8bit(const char* str_cur_s, R_len_t str_cur_n, bool get_confidence) { R_len_t warnchars = 0; for (R_len_t j=0; j < str_cur_n; ++j) { if (str_cur_s[j] == 0) return 0.0; if (get_confidence && (str_cur_s[j] <= 31 || str_cur_s[j] == 127)) { switch (str_cur_s[j]) { case 9: // \t case 10: // \n case 13: // \r case 26: // ASCII SUBSTITUTE break; // ignore default: warnchars++; } } } return (get_confidence?(double)warnchars/double(str_cur_n):1.0); } /** Check if a string is valid ASCII * * simple check whether charcodes are in [1..127] * by using U8_IS_SINGLE * * @param str_cur_s character vector * @param str_cur_n number of bytes * @param get_confidence determine confidence value or do exact check * * @return confidence value in [0,1] * * @version 0.1-?? (Bartek Tartanus) * * @version 0.1-?? (Marek Gagolewski, 2013-08-06) * separate func * * @version 0.1-?? (Marek Gagolewski, 2013-08-13) * warnchars count added */ double stri__enc_check_ascii(const char* str_cur_s, R_len_t str_cur_n, bool get_confidence) { R_len_t warnchars = 0; for (R_len_t j=0; j < str_cur_n; ++j) { if (!U8_IS_SINGLE(str_cur_s[j]) || str_cur_s[j] == 0) // i.e., 0 < c <= 127 return 0.0; if (get_confidence && (str_cur_s[j] <= 31 || str_cur_s[j] == 127)) { switch (str_cur_s[j]) { case 9: // \t case 10: // \n case 13: // \r case 26: // ASCII SUBSTITUTE break; // ignore default: warnchars++; } } } return (get_confidence?(double)(str_cur_n-warnchars)/double(str_cur_n):1.0); } /** Check if a string is valid UTF-8 * * checks if a string is probably UTF-8-encoded; * simple check with U8_NEXT * * * @param str_cur_s character vector * @param str_cur_n number of bytes * @param get_confidence determine confidence value or do exact check * * @return confidence value in [0,1] * * @version 0.1-?? (Bartek Tartanus) * * @version 0.1-?? (Marek Gagolewski, 2013-08-06) * separate func * * @version 0.1-?? 
(Marek Gagolewski, 2013-08-13) * confidence calculation basing on ICU's i18n/csrutf8.cpp */ double stri__enc_check_utf8(const char* str_cur_s, R_len_t str_cur_n, bool get_confidence) { if (!get_confidence) { UChar32 c; for (R_len_t j=0; j < str_cur_n; ) { if (str_cur_s[j] == 0) return 0.0; // definitely not valid UTF-8 U8_NEXT(str_cur_s, j, str_cur_n, c); if (c < 0) // ICU utf8.h doc for U8_NEXT: c -> output UChar32 variable, set to <0 in case of an error return 0.0; // definitely not valid UTF-8 } return 1.0; } else { // Based on ICU's i18n/csrutf8.cpp [with own mods] bool hasBOM = (str_cur_n >= 3 && (uint8_t)(str_cur_s[0]) == UTF8_BOM_BYTE1 && (uint8_t)(str_cur_s[1]) == UTF8_BOM_BYTE2 && (uint8_t)(str_cur_s[2]) == UTF8_BOM_BYTE3); R_len_t numValid = 0; // counts only valid UTF-8 multibyte seqs R_len_t numInvalid = 0; // Scan for multi-byte sequences for (R_len_t i=0; i < str_cur_n; i += 1) { uint32_t b = str_cur_s[i]; if ((b & 0x80) == 0) { continue; // ASCII => OK } // Hi bit on char found. Figure out how long the sequence should be R_len_t trailBytes = 0; if ((b & 0x0E0) == 0x0C0) trailBytes = 1; else if ((b & 0x0F0) == 0x0E0) trailBytes = 2; else if ((b & 0x0F8) == 0xF0) trailBytes = 3; else { numInvalid += 1; if (numInvalid > 5) break; // that's enough => not UTF-8 continue; } // Verify that we've got the right number of trail bytes in the sequence while (true) { i += 1; if (i >= str_cur_n) break; b = str_cur_s[i]; if ((b & 0xC0) != 0x080) { numInvalid += 1; break; } if (--trailBytes == 0) { numValid += 1; break; } } } // Cook up some sort of confidence score, based on BOM's presence // and the existence of valid and/or invalid multi-byte sequences. 
if (hasBOM && numInvalid == 0) return 1.0; else if (hasBOM && numValid > numInvalid*10) return 0.75; else if (numValid > 3 && numInvalid == 0) return 1.0; else if (numValid > 0 && numInvalid == 0) return 0.50; // too few multibyte UTF-8 seqs to be quite sure else if (numValid == 0 && numInvalid == 0) // Plain ASCII. => It's OK for UTF-8 return 0.50; else if (numValid > numInvalid*10) // Probably corrupt utf-8 data. Valid sequences aren't likely by chance. return 0.25; else return 0.0; } } /** Check if a string is valid UTF-16LE or UTF-16BE * * @param str_cur_s character vector * @param str_cur_n number of bytes * @param get_confidence determine confidence value or do exact check * @param le check for UTF-16LE? * * @return confidence value in [0,1] * * @version 0.1-?? (Marek Gagolewski, 2013-08-09) * * @version 0.1-?? (Marek Gagolewski, 2013-08-14) * confidence calculation basing on ICU's i18n/csucode.cpp */ double stri__enc_check_utf16(const char* str_cur_s, R_len_t str_cur_n, bool get_confidence, bool le) { if (str_cur_n % 2 != 0) return 0.0; bool hasLE_BOM = STRI__ENC_HAS_BOM_UTF16LE(str_cur_s, str_cur_n); bool hasBE_BOM = STRI__ENC_HAS_BOM_UTF16BE(str_cur_s, str_cur_n); if ((!le && hasLE_BOM) || (le && hasBE_BOM)) return 0.0; R_len_t warnchars = 0; for (R_len_t i=0; i= 0x0530) // last cyrrilic supplement warnchars += 2; continue; } if (!U16_IS_SURROGATE_LEAD(c)) return 0.0; i += 2; if (i >= str_cur_n) return 0.0; c = (le)? STRI__GET_INT16_LE(str_cur_s, i): STRI__GET_INT16_BE(str_cur_s, i); if (!U16_IS_SURROGATE_TRAIL(c)) return 0.0; } return (get_confidence?(double)(str_cur_n-warnchars)/double(str_cur_n):1.0); } /** Check if a string is valid UTF-16BE * * @param str_cur_s character vector * @param str_cur_n number of bytes * @param get_confidence determine confidence value or to exact check * * @return confidence value in [0,1] * * @version 0.1-?? 
(Marek Gagolewski, 2013-08-09) */ double stri__enc_check_utf16be(const char* str_cur_s, R_len_t str_cur_n, bool get_confidence) { return stri__enc_check_utf16(str_cur_s, str_cur_n, get_confidence, false); } /** Check if a string is valid UTF-16LE * * @param str_cur_s character vector * @param str_cur_n number of bytes * @param get_confidence determine confidence value or do exact check * * @return confidence value in [0,1] * * @version 0.1-?? (Marek Gagolewski, 2013-08-09) */ double stri__enc_check_utf16le(const char* str_cur_s, R_len_t str_cur_n, bool get_confidence) { return stri__enc_check_utf16(str_cur_s, str_cur_n, get_confidence, true); } /** Check if a string is valid UTF-32LE or UTF-32BE * * @param str_cur_s character vector * @param str_cur_n number of bytes * @param get_confidence determine confidence value or do exact check * @param le check for UTF-32LE? * * @return confidence value in [0,1] * * @version 0.1-?? (Marek Gagolewski, 2013-08-09) * * @version 0.1-?? (Marek Gagolewski, 2013-08-13) * confidence calculation basing on ICU's i18n/csucode.cpp */ double stri__enc_check_utf32(const char* str_cur_s, R_len_t str_cur_n, bool get_confidence, bool le) { if (str_cur_n % 4 != 0) return 0.0; bool hasLE_BOM = STRI__ENC_HAS_BOM_UTF32LE(str_cur_s, str_cur_n); bool hasBE_BOM = STRI__ENC_HAS_BOM_UTF32BE(str_cur_s, str_cur_n); if ((!le && hasLE_BOM) || (le && hasBE_BOM)) return 0.0; R_len_t numValid = 0; R_len_t numInvalid = 0; for (R_len_t i=0; i= 0x10FFFF || (ch >= 0xD800 && ch <= 0xDFFF)) { if (!get_confidence) return 0.0; else numInvalid++; } else numValid++; } if (!get_confidence) return 1.0; if ((hasLE_BOM || hasBE_BOM) && numInvalid==0) return 1.0; else if ((hasLE_BOM || hasBE_BOM) && numValid > numInvalid*10) return 0.80; else if (numValid > 3 && numInvalid == 0) return 1.0; else if (numValid > 0 && numInvalid == 0) return 0.80; else if (numValid > numInvalid*10) return 0.25; // Probably corruput UTF-32BE data. Valid sequences aren't likely by chance. 
else return 0.0; } /** Check if a string is valid UTF-32BE * * @param str_cur_s character vector * @param str_cur_n number of bytes * @param get_confidence determine confidence value or do exact check * * @return confidence value in [0,1] * * @version 0.1-?? (Marek Gagolewski, 2013-08-13) */ double stri__enc_check_utf32be(const char* str_cur_s, R_len_t str_cur_n, bool get_confidence) { return stri__enc_check_utf32(str_cur_s, str_cur_n, get_confidence, false); } /** Check if a string is valid UTF-32LE * * @param str_cur_s character vector * @param str_cur_n number of bytes * @param get_confidence determine confidence value or do exact check * * @return confidence value in [0,1] * * @version 0.1-?? (Marek Gagolewski, 2013-08-13) */ double stri__enc_check_utf32le(const char* str_cur_s, R_len_t str_cur_n, bool get_confidence) { return stri__enc_check_utf32(str_cur_s, str_cur_n, get_confidence, true); } /** Which string is in given encoding * * * @param str character vector or raw vector or list of raw vectors * @param type (single integer, internal) * @return logical vector * * @version 0.1-?? (Bartek Tartanus) * * @version 0.1-?? (Marek Gagolewski, 2013-08-08) * use StriContainerListRaw * * @version 0.1-?? 
(Marek Gagolewski, 2013-08-09) * one function for is_*, do dispatch * * @version 0.3-1 (Marek Gagolewski, 2014-11-04) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc * * @version 0.6-1 (Marek Gagolewski, 2015-07-11) * this is internal function now */ SEXP stri_enc_isenc(SEXP str, int _type) { double (*isenc)(const char*, R_len_t, bool) = NULL; switch (_type) { case 1: isenc = stri__enc_check_ascii; break; case 2: isenc = stri__enc_check_utf8; break; case 3: isenc = stri__enc_check_utf16be; break; case 4: isenc = stri__enc_check_utf16le; break; case 5: isenc = stri__enc_check_utf32be; break; case 6: isenc = stri__enc_check_utf32le; break; default: Rf_error(MSG__INCORRECT_INTERNAL_ARG); // error() call allowed here } PROTECT(str = stri__prepare_arg_list_raw(str, "str")); STRI__ERROR_HANDLER_BEGIN(1) StriContainerListRaw str_cont(str); R_len_t str_length = str_cont.get_n(); SEXP ret; STRI__PROTECT(ret = Rf_allocVector(LGLSXP, str_length)); int* ret_tab = LOGICAL(ret); // may be faster than LOGICAL(ret)[i] all the time for (R_len_t i=0; i < str_length; ++i) { if (str_cont.isNA(i)) { ret_tab[i] = NA_LOGICAL; continue; } bool get_confidence = false; // TO BE DONE ret_tab[i] = isenc(str_cont.get(i).c_str(), str_cont.get(i).length(), get_confidence) != 0.0; } STRI__UNPROTECT_ALL return ret; STRI__ERROR_HANDLER_END({ /* no-op on error */ }) } /** Which string is in ASCII * * @param str character vector or raw vector or list of raw vectors * @return logical vector * * @version 0.6-1 (Marek Gagolewski, 2015-07-11) * call stri_enc_isenc */ SEXP stri_enc_isascii(SEXP str) { return stri_enc_isenc(str, 1); } /** Which string is in UTF8 * * @param str character vector or raw vector or list of raw vectors * @return logical vector * * @version 0.6-1 (Marek Gagolewski, 2015-07-11) * call stri_enc_isenc */ SEXP stri_enc_isutf8(SEXP str) { return stri_enc_isenc(str, 2); } /** Which string is in UTF-16BE * * @param str character vector or raw vector or list of raw 
vectors
 * @return logical vector
 *
 * Thin wrapper: delegates to stri_enc_isenc() with internal type code 3.
 *
 * @version 0.6-1 (Marek Gagolewski, 2015-07-11)
 *    call stri_enc_isenc
 */
SEXP stri_enc_isutf16be(SEXP str)
{
    // type code 3 is dispatched to stri__enc_check_utf16be by stri_enc_isenc
    return stri_enc_isenc(str, 3);
}


/** Which string is in UTF-16LE
 *
 * Thin wrapper: delegates to stri_enc_isenc() with internal type code 4.
 *
 * @param str character vector or raw vector or list of raw vectors
 * @return logical vector
 *
 * @version 0.6-1 (Marek Gagolewski, 2015-07-11)
 *    call stri_enc_isenc
 */
SEXP stri_enc_isutf16le(SEXP str)
{
    // type code 4 is dispatched to stri__enc_check_utf16le by stri_enc_isenc
    return stri_enc_isenc(str, 4);
}


/** Which string is in UTF-32BE
 *
 * Thin wrapper: delegates to stri_enc_isenc() with internal type code 5.
 *
 * @param str character vector or raw vector or list of raw vectors
 * @return logical vector
 *
 * @version 0.6-1 (Marek Gagolewski, 2015-07-11)
 *    call stri_enc_isenc
 */
SEXP stri_enc_isutf32be(SEXP str)
{
    // type code 5 is dispatched to stri__enc_check_utf32be by stri_enc_isenc
    return stri_enc_isenc(str, 5);
}


/** Which string is in UTF-32LE
 *
 * Thin wrapper: delegates to stri_enc_isenc() with internal type code 6.
 *
 * @param str character vector or raw vector or list of raw vectors
 * @return logical vector
 *
 * @version 0.6-1 (Marek Gagolewski, 2015-07-11)
 *    call stri_enc_isenc
 */
SEXP stri_enc_isutf32le(SEXP str)
{
    // type code 6 is dispatched to stri__enc_check_utf32le by stri_enc_isenc
    return stri_enc_isenc(str, 6);
}


/** Detect encoding and language
 *
 * @param str character vector
 * @param filter_angle_brackets logical vector
 *
 * @return list
 *
 * @version 0.1-?? (Marek Gagolewski, 2013-08-03)
 *
 * @version 0.1-??
(Marek Gagolewski, 2013-08-08) * use StriContainerListRaw + BUGFIX * * @version 0.3-1 (Marek Gagolewski, 2014-11-04) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc */ SEXP stri_enc_detect(SEXP str, SEXP filter_angle_brackets) { PROTECT(str = stri__prepare_arg_list_raw(str, "str")); PROTECT(filter_angle_brackets = stri__prepare_arg_logical(filter_angle_brackets, "filter_angle_brackets")); UCharsetDetector* ucsdet = NULL; STRI__ERROR_HANDLER_BEGIN(2) UErrorCode status = U_ZERO_ERROR; ucsdet = ucsdet_open(&status); STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) StriContainerListRaw str_cont(str); R_len_t str_n = str_cont.get_n(); R_len_t vectorize_length = stri__recycling_rule(true, 2, str_n, LENGTH(filter_angle_brackets)); str_cont.set_nrecycle(vectorize_length); // must be set after container creation SEXP ret, names, wrong; STRI__PROTECT(ret = Rf_allocVector(VECSXP, vectorize_length)); STRI__PROTECT(names = Rf_allocVector(STRSXP, 3)); SET_STRING_ELT(names, 0, Rf_mkChar("Encoding")); SET_STRING_ELT(names, 1, Rf_mkChar("Language")); SET_STRING_ELT(names, 2, Rf_mkChar("Confidence")); STRI__PROTECT(wrong = Rf_allocVector(VECSXP, 3)); SET_VECTOR_ELT(wrong, 0, stri__vector_NA_strings(1)); SET_VECTOR_ELT(wrong, 1, stri__vector_NA_strings(1)); SET_VECTOR_ELT(wrong, 2, stri__vector_NA_integers(1)); Rf_setAttrib(wrong, R_NamesSymbol, names); StriContainerLogical filter(filter_angle_brackets, vectorize_length); for (R_len_t i=0; i curmap; const char* text_start = allChars+1; const char* text_end = allChars+256; ucnv_reset(ucnv); for (R_len_t i=1; i<256; ++i) { UErrorCode status = U_ZERO_ERROR; UChar32 c = ucnv_getNextUChar(ucnv, &text_start, text_end, &status); if (U_FAILURE(status)) { return; } if (i >= 32 && i <= 127 && c != (UChar32)i) { // allow only ASCII supersets return; } if (c == UCHAR_REPLACEMENT || c < 0) { badChars[i] = true; } else { if (!u_isdefined(c) || u_isalpha(c)) badChars[i] = true; curset.add(c); curmap[c] = 
(uint8_t)i; } } if (!curset.containsAll(*exset)) { // not all characters are representable in given encoding return; } // now mark all characters form exset to be counted R_len_t exset_size = exset->size(); for (R_len_t k=0; kcharAt(k); if (c >= 0) { uint8_t ind = curmap[c]; countChars[ind] = true; } } isNA = false; this->name = _name; this->friendlyname = _friendlyname; } }; // ----------------------------------------------------------------------- // ----------------------------------------------------------------------- // ----------------------------------------------------------------------- /** Guesses text encoding; help struct for stri_enc_detect2 [DEPRECATED] * * @version 0.1-?? (Marek Gagolewski) * * @version 0.1-?? (Marek Gagolewski, 2013-08-18) * locale-dependent, use ulocdata * * @version 0.1-?? (Marek Gagolewski, 2013-11-13) * allow qloc==NULL in 8bit check * * @version 0.5-1 (Marek Gagolewski, 2015-02-24) * #146 warnings removed */ struct EncGuess { const char* name; const char* friendlyname; double confidence; EncGuess(const char* _name, const char* _friendlyname, double _confidence) { name = _name; friendlyname = _friendlyname; confidence = _confidence; } bool operator<(const EncGuess& e2) const { return (this->confidence > e2.confidence); // decreasing sort } static void do_utf32(vector& guesses, const char* str_cur_s, R_len_t str_cur_n) { /* check UTF-32LE, UTF-32BE or UTF-32+BOM */ double isutf32le = stri__enc_check_utf32le(str_cur_s, str_cur_n, true); double isutf32be = stri__enc_check_utf32be(str_cur_s, str_cur_n, true); if (isutf32le >= 0.25 && isutf32be >= 0.25) { // no BOM, both valid // i think this will never happen guesses.push_back(EncGuess("UTF-32LE", "UTF-32LE", isutf32le)); guesses.push_back(EncGuess("UTF-32BE", "UTF-32BE", isutf32be)); } else if (isutf32le >= 0.25) { if (STRI__ENC_HAS_BOM_UTF32LE(str_cur_s, str_cur_n)) guesses.push_back(EncGuess("UTF-32", "UTF-32", isutf32le)); // with BOM else guesses.push_back(EncGuess("UTF-32LE", 
"UTF-32LE", isutf32le)); } else if (isutf32be >= 0.25) { if (STRI__ENC_HAS_BOM_UTF32BE(str_cur_s, str_cur_n)) guesses.push_back(EncGuess("UTF-32", "UTF-32", isutf32be)); // with BOM else guesses.push_back(EncGuess("UTF-32BE", "UTF-32BE", isutf32be)); } } static void do_utf16(vector& guesses, const char* str_cur_s, R_len_t str_cur_n) { /* check UTF-16LE, UTF-16BE or UTF-16+BOM */ double isutf16le = stri__enc_check_utf16le(str_cur_s, str_cur_n, true); double isutf16be = stri__enc_check_utf16be(str_cur_s, str_cur_n, true); if (isutf16le >= 0.25 && isutf16be >= 0.25) { // no BOM, both valid // this may sometimes happen guesses.push_back(EncGuess("UTF-16LE", "UTF-16LE", isutf16le)); guesses.push_back(EncGuess("UTF-16BE", "UTF-16BE", isutf16be)); } else if (isutf16le >= 0.25) { if (STRI__ENC_HAS_BOM_UTF16LE(str_cur_s, str_cur_n)) guesses.push_back(EncGuess("UTF-16", "UTF-16", isutf16le)); // with BOM else guesses.push_back(EncGuess("UTF-16LE", "UTF-16LE", isutf16le)); } else if (isutf16be >= 0.25) { if (STRI__ENC_HAS_BOM_UTF16BE(str_cur_s, str_cur_n)) guesses.push_back(EncGuess("UTF-16", "UTF-16", isutf16be)); // with BOM else guesses.push_back(EncGuess("UTF-16BE", "UTF-16BE", isutf16be)); } } static void do_8bit(vector& guesses, const char* str_cur_s, R_len_t str_cur_n, const char* qloc) { double is8bit = stri__enc_check_8bit(str_cur_s, str_cur_n, false); if (is8bit != 0.0) { // may be an 8-bit encoding double isascii = stri__enc_check_ascii(str_cur_s, str_cur_n, true); if (isascii >= 0.25) // i.e., equal to 1.0 => nothing more to check guesses.push_back(EncGuess("US-ASCII", "US-ASCII", isascii)); else { // not ascii double isutf8 = stri__enc_check_utf8(str_cur_s, str_cur_n, true); if (isutf8 >= 0.25) guesses.push_back(EncGuess("UTF-8", "UTF-8", isutf8)); if (isutf8 < 1.0 && qloc) { do_8bit_locale(guesses, str_cur_s, str_cur_n, qloc); } } } } static void do_8bit_locale(vector& guesses, const char* str_cur_s, R_len_t str_cur_n, const char* qloc) { vector converters; if 
(!qloc) throw StriException(MSG__INTERNAL_ERROR); // just to be sure UErrorCode status = U_ZERO_ERROR; ULocaleData* uld = ulocdata_open(qloc, &status); STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) USet* exset_tmp = ulocdata_getExemplarSet(uld, NULL, USET_ADD_CASE_MAPPINGS, ULOCDATA_ES_STANDARD, &status); STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) UnicodeSet* exset = UnicodeSet::fromUSet(exset_tmp); // don't delete, just a pointer exset->removeAllStrings(); R_len_t ucnv_count = (R_len_t)ucnv_countAvailable(); for (R_len_t i=0; i= 128 in str_cur_s R_len_t counts[256]; R_len_t countsge128 = 0; // total count for (R_len_t k=0; k<256; ++k) counts[k] = 0; // reset tab for (R_len_t j=0; j= (uint8_t)128) { counts[(uint8_t)(str_cur_s[j])]++; countsge128++; } } // assert: countsge128 > 0 (otherwise ASCII, so this function hasn't been not called) std::vector badCounts(converters.size(), 0); // filled with 0 std::vector desiredCounts(converters.size(),0); R_len_t maxDesiredCounts = 0; for (R_len_t j=0; j<(R_len_t)converters.size(); ++j) { // for each converter for (R_len_t k=128; k<256; ++k) { // for each character // 1. Count bytes that are BAD and NOT COUNTED in this encoding if (converters[j].badChars[k] && !converters[j].countChars[k]) { badCounts[j] += (int)counts[k]; } // 2. 
Count indicated characters if (converters[j].countChars[k]) { desiredCounts[j] += (int)counts[k]; } } if (desiredCounts[j] > maxDesiredCounts) maxDesiredCounts = desiredCounts[j]; } // add guesses for (R_len_t j=0; j<(R_len_t)converters.size(); ++j) { // for each converter // some heuristic: double conf = min(1.0, max(0.0, (double)(countsge128-0.5*badCounts[j]-maxDesiredCounts+desiredCounts[j])/ (double)(countsge128))); if (conf > 0.25) guesses.push_back(EncGuess(converters[j].name, converters[j].friendlyname, conf)); } } }; // ----------------------------------------------------------------------- // ----------------------------------------------------------------------- /** Detect encoding with initial guess [DEPRECATED] * * @param str character or raw vector or a list of raw vectors * @param loc locale id * * @return list * * @version 0.1-?? (2013-08-15, Marek Gagolewski) * * @version 0.1-?? (2013-08-18, Marek Gagolewski) * improved 8-bit confidence measurement, * some code moved to structs, use locale & ICU locdata * * @version 0.1-?? 
(2013-11-13, Marek Gagolewski) * added loc NA handling (no locale) * * @version 0.3-1 (Marek Gagolewski, 2014-11-04) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc */ SEXP stri_enc_detect2(SEXP str, SEXP loc) { const char* qloc = /* this is R_alloc'ed */ stri__prepare_arg_locale(loc, "locale"); // raw vector, character vector, or list of raw vectors: PROTECT(str = stri__prepare_arg_list_raw(str, "str")); STRI__ERROR_HANDLER_BEGIN(1) StriContainerListRaw str_cont(str); R_len_t str_n = str_cont.get_n(); SEXP ret, names, wrong; STRI__PROTECT(ret = Rf_allocVector(VECSXP, str_n)); STRI__PROTECT(names = Rf_allocVector(STRSXP, 3)); SET_STRING_ELT(names, 0, Rf_mkChar("Encoding")); SET_STRING_ELT(names, 1, Rf_mkChar("Language")); SET_STRING_ELT(names, 2, Rf_mkChar("Confidence")); STRI__PROTECT(wrong = Rf_allocVector(VECSXP, 3)); SET_VECTOR_ELT(wrong, 0, stri__vector_NA_strings(1)); SET_VECTOR_ELT(wrong, 1, stri__vector_NA_strings(1)); SET_VECTOR_ELT(wrong, 2, stri__vector_NA_integers(1)); Rf_setAttrib(wrong, R_NamesSymbol, names); for (R_len_t i=0; i guesses; guesses.reserve(6); EncGuess::do_utf32(guesses, str_cur_s, str_cur_n); EncGuess::do_utf16(guesses, str_cur_s, str_cur_n); EncGuess::do_8bit(guesses, str_cur_s, str_cur_n, qloc); // includes UTF-8 R_len_t matchesFound = (R_len_t)guesses.size(); if (matchesFound <= 0) { SET_VECTOR_ELT(ret, i, wrong); continue; } std::stable_sort(guesses.begin(), guesses.end()); SEXP val_enc, val_lang, val_conf; STRI__PROTECT(val_enc = Rf_allocVector(STRSXP, matchesFound)); STRI__PROTECT(val_lang = Rf_allocVector(STRSXP, matchesFound)); STRI__PROTECT(val_conf = Rf_allocVector(REALSXP, matchesFound)); for (R_len_t j=0; j * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. 
* * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "stri_stringi.h" #include "stri_container_utf8.h" #include "stri_container_utf16.h" #include #include #include #include #include // !!!! no longer used since stringi_0.2-3 !!!! ///** Compare 2 strings in UTF8, codepoint-wise [internal] // * // * Used by stri_order_codepoints and stri_cmp_codepoints // * // * @param str1 string in UTF8 // * @param str2 string in UTF8 // * @param n1 length of str1 // * @param n2 length of str2 // * @return -1, 0, or 1, like in strcmp // * // * @version 0.1-?? 
(Marek Gagolewski) // * // * @version 0.2-1 (Marek Gagolewski, 2014-03-19) // * BUGFIX: possibly incorrect results for strings of inequal number // * of codepoints // * // * @version 0.2-1 (Marek Gagolewski, 2014-04-02) // * detect invalid UTF-8 byte stream // */ //int stri__cmp_codepoints(const char* str1, R_len_t n1, const char* str2, R_len_t n2) //{ // // @NOTE: strangely, this is being outperformed by ucol_strcollUTF8 // // in some UTF-8 benchmarks... // int i1 = 0; // int i2 = 0; // UChar32 c1 = 0; // UChar32 c2 = 0; // while (c1 == c2 && i1 < n1 && i2 < n2) { // U8_NEXT(str1, i1, n1, c1); // U8_NEXT(str2, i2, n2, c2); // if (c1 < 0 || c2 < 0) // throw StriException(MSG__INVALID_UTF8); // } // // if (c1 < c2) // return -1; // else if (c1 > c2) // return 1; // // // reached here => first i1==i2 codepoints are the same // if (i1 < n1) return 1; // else if (i2 < n2) return -1; // else return 0; //} /* ************************************************************************* STRI_CMP_CODEPOINTS ************************************************************************* */ /** * Compare elements in 2 character vectors, without collation [INTERNAL] * * @param e1 character vector * @param e2 character vector * @param _negate [internal] integer; 0 or 1 (whether to negate the results) * * @return logical vector * * @version 0.2-3 (Marek Gagolewski, 2014-05-07) * * @version 0.3-1 (Marek Gagolewski, 2014-11-04) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc */ SEXP stri_cmp_codepoints(SEXP e1, SEXP e2, int _negate) { // _negate is an internal arg, check manually, error() allowed here if (_negate < 0 || _negate > 1) Rf_error(MSG__INCORRECT_INTERNAL_ARG); PROTECT(e1 = stri__prepare_arg_string(e1, "e1")); // prepare string argument PROTECT(e2 = stri__prepare_arg_string(e2, "e2")); // prepare string argument STRI__ERROR_HANDLER_BEGIN(2) R_len_t vectorize_length = stri__recycling_rule(true, 2, LENGTH(e1), LENGTH(e2)); StriContainerUTF8 e1_cont(e1, 
vectorize_length); StriContainerUTF8 e2_cont(e2, vectorize_length); SEXP ret; STRI__PROTECT(ret = Rf_allocVector(LGLSXP, vectorize_length)); int* ret_tab = LOGICAL(ret); for (R_len_t i = 0; i < vectorize_length; ++i) { if (e1_cont.isNA(i) || e2_cont.isNA(i)) { ret_tab[i] = NA_LOGICAL; continue; } R_len_t cur1_n = e1_cont.get(i).length(); const char* cur1_s = e1_cont.get(i).c_str(); R_len_t cur2_n = e2_cont.get(i).length(); const char* cur2_s = e2_cont.get(i).c_str(); if (cur1_n != cur2_n) // different number of bytes => not equal ret_tab[i] = FALSE; else ret_tab[i] = (memcmp(cur1_s, cur2_s, cur1_n) == 0); if (_negate) ret_tab[i] = !ret_tab[i]; } STRI__UNPROTECT_ALL return ret; STRI__ERROR_HANDLER_END({/* no-op on err */}) } /** * Test if elements in 2 character vectors are equal, without collation * * @param e1 character vector * @param e2 character vector * * @return logical vector * * @version 0.6-1 (Marek Gagolewski, 2015-07-05) * use stri_cmp_codepoints */ SEXP stri_cmp_eq(SEXP e1, SEXP e2) { return stri_cmp_codepoints(e1, e2, 0); } /** * Test if elements in 2 character vectors are non-equal, without collation * * @param e1 character vector * @param e2 character vector * * @return logical vector * * @version 0.6-1 (Marek Gagolewski, 2015-07-05) * use stri_cmp_codepoints */ SEXP stri_cmp_neq(SEXP e1, SEXP e2) { return stri_cmp_codepoints(e1, e2, 1); } /* ************************************************************************* STRI_CMP_LOGICAL ************************************************************************* */ /** * Compare elements in 2 character vectors, with collation [INTERNAL] * * @param e1 character vector * @param e2 character vector * @param opts_collator passed to stri__ucol_open() * @param type [internal] vector of length 2, * type[0]: 0 for ==, -1 for < and 1 for >, * type[1]: 0 or 1 (whether to negate the results) * * @return logical vector * * @version 0.2-1 (Marek Gagolewski, 2014-03-19) * * @version 0.2-3 (Marek Gagolewski, 2014-05-07) * 
opts_collator == NA no longer allowed * * @version 0.3-1 (Marek Gagolewski, 2014-11-04) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc */ SEXP stri__cmp_logical(SEXP e1, SEXP e2, SEXP opts_collator, int _type, int _negate) { // we'll perform a collator-based cmp // type is an internal arg, check manually, error() allowed here if (_type > 1 || _type < -1 || _negate < 0 || _negate > 1) Rf_error(MSG__INCORRECT_INTERNAL_ARG); PROTECT(e1 = stri__prepare_arg_string(e1, "e1")); // prepare string argument PROTECT(e2 = stri__prepare_arg_string(e2, "e2")); // prepare string argument // call stri__ucol_open after prepare_arg: // if prepare_arg had failed, we would have a mem leak UCollator* col = NULL; col = stri__ucol_open(opts_collator); STRI__ERROR_HANDLER_BEGIN(2) R_len_t vectorize_length = stri__recycling_rule(true, 2, LENGTH(e1), LENGTH(e2)); StriContainerUTF8 e1_cont(e1, vectorize_length); StriContainerUTF8 e2_cont(e2, vectorize_length); SEXP ret; STRI__PROTECT(ret = Rf_allocVector(LGLSXP, vectorize_length)); int* ret_tab = LOGICAL(ret); for (R_len_t i = 0; i < vectorize_length; ++i) { if (e1_cont.isNA(i) || e2_cont.isNA(i)) { ret_tab[i] = NA_LOGICAL; continue; } R_len_t cur1_n = e1_cont.get(i).length(); const char* cur1_s = e1_cont.get(i).c_str(); R_len_t cur2_n = e2_cont.get(i).length(); const char* cur2_s = e2_cont.get(i).c_str(); // with collation UErrorCode status = U_ZERO_ERROR; ret_tab[i] = (_type == (int)ucol_strcollUTF8(col, cur1_s, cur1_n, cur2_s, cur2_n, &status )); STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) if (_negate) ret_tab[i] = !ret_tab[i]; } if (col) { ucol_close(col); col = NULL; } STRI__UNPROTECT_ALL return ret; STRI__ERROR_HANDLER_END({ if (col) { ucol_close(col); col = NULL; } }) } /** * Compare elements in 2 character vectors, with collation [INTERNAL] * * @param e1 character vector * @param e2 character vector * @param opts_collator passed to stri__ucol_open() * * @return logical vector * * @version 
0.6-1 (Marek Gagolewski, 2015-07-05)
 *    use stri__cmp_logical
 */
SEXP stri_cmp_equiv(SEXP e1, SEXP e2, SEXP opts_collator)
{
    const int relation = 0;  // collation outcome to look for: 0, i.e., ==
    const int negate = 0;    // report the outcome as is
    return stri__cmp_logical(e1, e2, opts_collator, relation, negate);
}


/**
 * Test if elements in 2 character vectors are not equivalent, with collation
 *
 * Obtained from stri__cmp_logical as the negation of the == test.
 *
 * @param e1 character vector
 * @param e2 character vector
 * @param opts_collator passed to stri__ucol_open()
 *
 * @return logical vector
 *
 * @version 0.6-1 (Marek Gagolewski, 2015-07-05)
 *    use stri__cmp_logical
 */
SEXP stri_cmp_nequiv(SEXP e1, SEXP e2, SEXP opts_collator)
{
    const int relation = 0;  // collation outcome to look for: 0, i.e., ==
    const int negate = 1;    // ... and flip it
    return stri__cmp_logical(e1, e2, opts_collator, relation, negate);
}


/**
 * Test if elements of e1 are less than those of e2, with collation
 *
 * Obtained from stri__cmp_logical as the < test.
 *
 * @param e1 character vector
 * @param e2 character vector
 * @param opts_collator passed to stri__ucol_open()
 *
 * @return logical vector
 *
 * @version 0.6-1 (Marek Gagolewski, 2015-07-05)
 *    use stri__cmp_logical
 */
SEXP stri_cmp_lt(SEXP e1, SEXP e2, SEXP opts_collator)
{
    const int relation = -1; // collation outcome to look for: -1, i.e., <
    const int negate = 0;    // report the outcome as is
    return stri__cmp_logical(e1, e2, opts_collator, relation, negate);
}


/**
 * Test if elements of e1 are greater than those of e2, with collation
 *
 * Obtained from stri__cmp_logical as the > test.
 *
 * @param e1 character vector
 * @param e2 character vector
 * @param opts_collator passed to stri__ucol_open()
 *
 * @return logical vector
 *
 * @version 0.6-1 (Marek Gagolewski, 2015-07-05)
 *    use stri__cmp_logical
 */
SEXP stri_cmp_gt(SEXP e1, SEXP e2, SEXP opts_collator)
{
    const int relation = 1;  // collation outcome to look for: 1, i.e., >
    const int negate = 0;    // report the outcome as is
    return stri__cmp_logical(e1, e2, opts_collator, relation, negate);
}


/**
 * Test if elements of e1 are less than or equal to those of e2,
 * with collation
 *
 * Obtained from stri__cmp_logical as the negation of the > test.
 *
 * @param e1 character vector
 * @param e2 character vector
 * @param opts_collator passed to stri__ucol_open()
 *
 * @return logical vector
 *
 * @version 0.6-1 (Marek Gagolewski, 2015-07-05)
 *    use stri__cmp_logical
 */
SEXP stri_cmp_le(SEXP e1, SEXP e2, SEXP opts_collator)
{
    const int relation = 1;  // collation outcome to look for: 1, i.e., >
    const int negate = 1;    // ... and flip it, which gives <=
    return stri__cmp_logical(e1, e2, opts_collator, relation, negate);
}


/**
 * Compare elements in 2 character vectors, with collation [INTERNAL]
 *
 * @param e1 character vector
 * @param e2 character vector
 * @param opts_collator passed to stri__ucol_open()
 *
 * @return logical 
vector * * @version 0.6-1 (Marek Gagolewski, 2015-07-05) * use stri__cmp_logical */ SEXP stri_cmp_ge(SEXP e1, SEXP e2, SEXP opts_collator) { return stri__cmp_logical(e1, e2, opts_collator, -1, 1); } /* ************************************************************************* STRI_CMP ************************************************************************* */ /** * Compare character vectors, possibly with collation * * @param e1 character vector * @param e2 character vector * @param opts_collator passed to stri__ucol_open() * * @return integer vector, like strcmp in C * * @version 0.1-?? (Marek Gagolewski) * * @version 0.1-?? (Marek Gagolewski, 2013-06-16) * make StriException friendly * * @version 0.1-?? (Marek Gagolewski, 2013-06-27) * moved to UTF16, as ucol_strcollUTF8 is DRAFT * * @version 0.2-1 (Marek Gagolewski, 2014-03-16) * using ucol_strcollUTF8 again, as we now require ICU >= 50 * [4x speedup utf8, 2x slowdown 8bit] * * @version 0.2-1 (Marek Gagolewski, 2014-03-19) * one function for cmp with and without collation * * @version 0.2-3 (Marek Gagolewski, 2014-05-07) * opts_collator == NA no longer allowed * * @version 0.3-1 (Marek Gagolewski, 2014-11-04) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc */ SEXP stri_cmp(SEXP e1, SEXP e2, SEXP opts_collator) { PROTECT(e1 = stri__prepare_arg_string(e1, "e1")); PROTECT(e2 = stri__prepare_arg_string(e2, "e2")); // call stri__ucol_open after prepare_arg: // if prepare_arg had failed, we would have a mem leak UCollator* col = NULL; col = stri__ucol_open(opts_collator); STRI__ERROR_HANDLER_BEGIN(2) R_len_t vectorize_length = stri__recycling_rule(true, 2, LENGTH(e1), LENGTH(e2)); StriContainerUTF8 e1_cont(e1, vectorize_length); StriContainerUTF8 e2_cont(e2, vectorize_length); SEXP ret; STRI__PROTECT(ret = Rf_allocVector(INTSXP, vectorize_length)); int* ret_int = INTEGER(ret); for (R_len_t i = 0; i < vectorize_length; ++i) { if (e1_cont.isNA(i) || e2_cont.isNA(i)) { ret_int[i] = NA_INTEGER; continue; 
} R_len_t cur1_n = e1_cont.get(i).length(); const char* cur1_s = e1_cont.get(i).c_str(); R_len_t cur2_n = e2_cont.get(i).length(); const char* cur2_s = e2_cont.get(i).c_str(); // cmp with collation UErrorCode status = U_ZERO_ERROR; ret_int[i] = (int)ucol_strcollUTF8(col, cur1_s, cur1_n, cur2_s, cur2_n, &status ); STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) } if (col) { ucol_close(col); col = NULL; } STRI__UNPROTECT_ALL return ret; STRI__ERROR_HANDLER_END({ if (col) { ucol_close(col); col = NULL; } }) } stringi/src/stri_sprintf.cpp0000644000176200001440000010265314770541312015765 0ustar liggesusers/* This file is part of the 'stringi' project. * Copyright (c) 2013-2025, Marek Gagolewski * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "stri_stringi.h" #include "stri_container_utf8.h" #include "stri_container_integer.h" #include "stri_container_logical.h" #include "stri_container_double.h" #include "stri_string8buf.h" #include #include #include #define STRI_SPRINTF_NOT_PROVIDED (NA_INTEGER+1) /* -2**31+2 */ #define STRI_SPRINTF_SPEC_INTEGER "dioxX" #define STRI_SPRINTF_SPEC_DOUBLE "feEgGaA" #define STRI_SPRINTF_SPEC_STRING "s" #define STRI_SPRINTF_SPEC_TYPE ( \ STRI_SPRINTF_SPEC_INTEGER \ STRI_SPRINTF_SPEC_DOUBLE \ STRI_SPRINTF_SPEC_STRING \ ) #define STRI_SPRINTF_FLAGS "-+ 0#" // TODO: Single UNIX Specification has "'" flag too, we can use it for formatting with ICU #define STRI_SPRINTF_ACCEPTED_CHARS ( \ STRI_SPRINTF_SPEC_INTEGER \ STRI_SPRINTF_SPEC_DOUBLE \ STRI_SPRINTF_SPEC_STRING \ STRI_SPRINTF_FLAGS \ ".*$" \ "0123456789" \ ) /** data types for sprintf * * @version 1.6.2 (Marek Gagolewski, 2021-05-24) */ typedef enum { STRI_SPRINTF_TYPE_UNDEFINED=0, STRI_SPRINTF_TYPE_INTEGER, STRI_SPRINTF_TYPE_DOUBLE, STRI_SPRINTF_TYPE_STRING, } StriSprintfType; /** data types for sprintf * * @version 1.6.2 (Marek Gagolewski, 2021-05-24) */ typedef enum { STRI_SPRINTF_FORMAT_STATUS_OK=0, STRI_SPRINTF_FORMAT_STATUS_IS_NA, STRI_SPRINTF_FORMAT_STATUS_NEEDS_PADDING } StriSprintfFormatStatus; /** * if delim found, stops right after delim, modifies jc in place * if delim not found, returns STRI_SPRINTF_NOT_PROVIDED or throws an error * ignores leading 0s * non-negative values only * 
* @version 1.6.2 (Marek Gagolewski, 2021-05-24) * @version 1.6.3 (Marek Gagolewski, 2021-06-10) * return STRI_SPRINTF_NOT_PROVIDED instead of NA_INTEGER */ int stri__atoi_to_delim( const char* f, R_len_t& jc, R_len_t j0, R_len_t j1, char delim, bool throw_error=true, int max_val=99999 ) { R_len_t j = jc; STRI_ASSERT(j0 <= j && j <= j1) if (f[j] < '0' || f[j] > '9') throw StriException( MSG__INVALID_FORMAT_SPECIFIER_SUB "; " MSG__EXPECTED_NONNEGATIVE, j1-j0+1, f+j0); int val = (int)f[j++]-(int)'0'; while (true) { if (f[j] == delim) { j++; break; } if (j >= j1 || f[j] < '0' || f[j] > '9') { if (throw_error) throw StriException( MSG__INVALID_FORMAT_SPECIFIER_SUB, // TODO: error details j1-j0+1, f+j0); else return STRI_SPRINTF_NOT_PROVIDED; } val = val*10 + ((int)f[j++]-(int)'0'); // this ignores leading 0s if (val > max_val) throw StriException( MSG__INVALID_FORMAT_SPECIFIER_SUB "; " MSG__EXPECTED_SMALLER, j1-j0+1, f+j0); } // found. jc = j; // passed by reference return val; } /** * stops at a non-digit, modifies jc in place * ignores leading 0s * non-negative values only * * @version 1.6.2 (Marek Gagolewski, 2021-05-24) */ int stri__atoi_to_other(const char* f, R_len_t& jc, R_len_t j0, R_len_t j1, int max_val=99999) { STRI_ASSERT(j0 <= jc && jc < j1) if (f[jc] < '0' || f[jc] > '9') throw StriException( MSG__INVALID_FORMAT_SPECIFIER_SUB "; " MSG__EXPECTED_NONNEGATIVE, j1-j0+1, f+j0); int val = (int)f[jc++]-(int)'0'; while (jc < j1) { if (f[jc] < '0' || f[jc] > '9') break; val = val*10 + ((int)f[jc++]-(int)'0'); // this ignores leading 0s if (val > max_val) throw StriException( MSG__INVALID_FORMAT_SPECIFIER_SUB "; " MSG__EXPECTED_SMALLER, j1-j0+1, f+j0); } return val; } /** * preflight - get something which possibly is a format spec * throws an error on any chars outside of [0-9*$. 
+0#-] * * @returns index of the first char in STRI_SPRINTF_SPEC_TYPE * * @version 1.6.2 (Marek Gagolewski, 2021-05-24) */ int stri__find_type_spec(const char* f, R_len_t j0, R_len_t n) { R_len_t j1 = j0; STRI_ASSERT(f[j0-1] == '%'); while (true) { if (j1 >= n) { // TODO: note that this will display UTF-8 also on non-UTF-8 outputs throw StriException(MSG__INVALID_FORMAT_SPECIFIER, f+j0); // dangling %... } else if (strchr(STRI_SPRINTF_SPEC_TYPE, f[j1]) != nullptr) break; else if (strchr(STRI_SPRINTF_FLAGS, f[j1]) != nullptr) ; else if (f[j1] == '*' || f[j1] == '$' || f[j1] == '.') ; else if (f[j1] >= '0' && f[j1] <= '9') ; else { // TODO: note that this will display UTF-8 also on non-UTF-8 outputs throw StriException( MSG__INVALID_FORMAT_SPECIFIER "; " MSG__EXPECTED_CHAR_IN_SET, (f+j0), STRI_SPRINTF_ACCEPTED_CHARS); } j1++; } return j1; } /** Enables the fetching of the i-th/next integer/real/string datum from `...`. * * @version 1.6.2 (Marek Gagolewski, 2021-05-24) * @version 1.7.4 (Marek Gagolewski, 2021-08-05) * #449: segfaults; use R_PreserveObject */ class StriSprintfDataProvider { private: SEXP x; // protected outside R_len_t narg; R_len_t vectorize_length; std::vector< StriContainerInteger* > x_integer; std::vector< StriContainerDouble* > x_double; std::vector< StriContainerUTF8* > x_string; std::deque< SEXP > protected_objects; R_len_t cur_elem; // 0..vectorize_length-1 R_len_t cur_item; // 0..narg-1 public: bool warn_if_arg_unused; public: StriSprintfDataProvider(SEXP x, R_len_t vectorize_length) : x(x), narg(LENGTH(x)), vectorize_length(vectorize_length), x_integer(narg, nullptr), x_double(narg, nullptr), x_string(narg, nullptr) { STRI_ASSERT(Rf_isVectorList(x)); cur_elem = -1; warn_if_arg_unused = false; } ~StriSprintfDataProvider() { R_len_t num_unused = 0; for (R_len_t j=0; j 1) Rf_warning(MSG__ARG_UNUSED_N, num_unused); } } void reset(R_len_t elem) { cur_elem = elem; cur_item = 0; } /** Gets the next (i negative) or the i-th integer datum * Can be NA, 
so check with ... == NA_INTEGER. * * i == STRI_SPRINTF_NOT_PROVIDED 0 means "get next unconsumed" */ int getIntegerOrNA(int i=STRI_SPRINTF_NOT_PROVIDED) { if (i == STRI_SPRINTF_NOT_PROVIDED) i = (cur_item++); // else do not advance cur_item if (i < 0) throw StriException(MSG__EXPECTED_LARGER); else if (i >= narg) throw StriException(MSG__ARG_NEED_MORE); if (x_integer[i] == nullptr) { SEXP y; // the following may call Rf_error: PROTECT(y = stri__prepare_arg_integer(VECTOR_ELT(x, i), "...", false/*factors_as_strings*/, false/*allow_error*/)); R_PreserveObject(y); protected_objects.push_back(y); UNPROTECT(1); if (Rf_isNull(y)) throw StriException(MSG__ARG_EXPECTED_INTEGER, "..."); x_integer[i] = new StriContainerInteger(y, vectorize_length); } return x_integer[i]->getNAble(cur_elem); } /** Gets the next (i negative) or the i-th real datum; * Can be NA, so check with ISNA(...). * * i == STRI_SPRINTF_NOT_PROVIDED means "get next unconsumed" */ double getDoubleOrNA(int i=STRI_SPRINTF_NOT_PROVIDED) { if (i == STRI_SPRINTF_NOT_PROVIDED) i = (cur_item++); // else do not advance cur_item if (i < 0) throw StriException(MSG__EXPECTED_LARGER); else if (i >= narg) throw StriException(MSG__ARG_NEED_MORE); if (x_double[i] == nullptr) { SEXP y; // the following may call Rf_error: PROTECT(y = stri__prepare_arg_double(VECTOR_ELT(x, i), "...", false/*factors_as_strings*/, false/*allow_error*/)); R_PreserveObject(y); protected_objects.push_back(y); UNPROTECT(1); if (Rf_isNull(y)) throw StriException(MSG__ARG_EXPECTED_NUMERIC, "..."); x_double[i] = new StriContainerDouble(y, vectorize_length); } return x_double[i]->getNAble(cur_elem); } /** Gets the next (i negative) or the i-th real datum * Can be NA, so check with ....isNA(). 
* * i == STRI_SPRINTF_NOT_PROVIDED means "get next unconsumed" */ const String8& getStringOrNA(int i=STRI_SPRINTF_NOT_PROVIDED) { if (i == STRI_SPRINTF_NOT_PROVIDED) i = (cur_item++); // else do not advance cur_item if (i < 0) throw StriException(MSG__EXPECTED_LARGER); else if (i >= narg) throw StriException(MSG__ARG_NEED_MORE); if (x_string[i] == nullptr) { SEXP y; // the following may call Rf_error: PROTECT(y = stri__prepare_arg_string(VECTOR_ELT(x, i), "...", false/*allow_error*/)); R_PreserveObject(y); protected_objects.push_back(y); UNPROTECT(1); if (Rf_isNull(y)) throw StriException(MSG__ARG_EXPECTED_STRING, "..."); x_string[i] = new StriContainerUTF8(y, vectorize_length); } return x_string[i]->getNAble(cur_elem); } }; /** Parses and stores info on a single sprintf format (conversion) specifier * * @version 1.6.2 (Marek Gagolewski, 2021-05-24) * @version 1.6.3 (Marek Gagolewski, 2021-06-10) * distinguish between NA_INTEGER and STRI_SPRINTF_NOT_PROVIDED */ class StriSprintfFormatSpec { private: StriSprintfDataProvider* data; const String8& na_string; const String8& inf_string; const String8& nan_string; bool use_length; StriSprintfType type; char type_spec; int which_datum; // can be STRI_SPRINTF_NOT_PROVIDED (== consume next datum) // see normalise() for info on which options are mutually exclusive etc. 
bool pad_from_right; // '-' bool pad_zero; // '0' bool sign_space; // ' ' bool sign_plus; // '+' bool alternate_output; // '#' int min_width; // can be NA_INTEGER or STRI_SPRINTF_NOT_PROVIDED int precision; // can be NA_INTEGER or STRI_SPRINTF_NOT_PROVIDED or negative (but then like '-') // TODO: flag "'" -- localised formatting with ICU public: StriSprintfFormatSpec( const char* f, R_len_t j0, R_len_t j1, StriSprintfDataProvider* data, const String8& na_string, const String8& inf_string, const String8& nan_string, bool use_length ) : data(data), na_string(na_string), inf_string(inf_string), nan_string(nan_string), use_length(use_length) { // f[j0..j1] may be a format specifier (without the preceding %) // %<.PRECISION> // 1 2 3 4 5 == f[j1] STRI_ASSERT(f[j0-1] == '%') type_spec = f[j1]; if (strchr(STRI_SPRINTF_SPEC_INTEGER, type_spec) != nullptr) type = STRI_SPRINTF_TYPE_INTEGER; else if (strchr(STRI_SPRINTF_SPEC_DOUBLE, type_spec) != nullptr) type = STRI_SPRINTF_TYPE_DOUBLE; else type = STRI_SPRINTF_TYPE_STRING; pad_from_right = false; pad_zero = false; sign_space = false; sign_plus = false; alternate_output = false; min_width = STRI_SPRINTF_NOT_PROVIDED; precision = STRI_SPRINTF_NOT_PROVIDED; // eEfFgG - default precision = 6 // aA - default precision = depends on the input // dioxX - default precision = 1 // s - default precision - unspecified // gG uses eE if precision <= exponent < -4 R_len_t jc = j0; // 1. optional [0-9]*\$ - which datum is to be formatted? 
which_datum = STRI_SPRINTF_NOT_PROVIDED; if (f[jc] >= '0' && f[jc] <= '9') { // trailing 0s will be ignored // arg pos spec if digits followed by '$' // we can also have '0' flag at this pos, but this will not be // followed by '$' and the call below will return NA_INTEGER which_datum = stri__atoi_to_delim( f, /*by reference*/jc, j0, j1, /*delimiter*/'$', false/*throw_error*/ ); // result can be < 0; incorrect indexes will be caught by get* if (which_datum != STRI_SPRINTF_NOT_PROVIDED) which_datum--; /*0-based indexing*/ } // 2. optional flags [ +0#-] while (true) { if (f[jc] == ' ') sign_space = true; else if (f[jc] == '+') sign_plus = true; else if (f[jc] == '0') pad_zero = true; else if (f[jc] == '-') pad_from_right = true; else if (f[jc] == '#') alternate_output = true; else break; jc++; } // 3. optional field width: none / 123 / * / *0123$ if (f[jc] >= '1' && f[jc] <= '9') { // note that 0 is treated above min_width = stri__atoi_to_other(f, /*by reference*/jc, j0, j1); } else if (f[jc] == '*') { // take from ... args jc++; int which_width = STRI_SPRINTF_NOT_PROVIDED; if (f[jc] >= '0' && f[jc] <= '9') { which_width = stri__atoi_to_delim( f, /*by reference*/jc, j0, j1, /*delimiter*/'$' ); if (which_width != STRI_SPRINTF_NOT_PROVIDED) which_width--; /*0-based indexing*/ } min_width = data->getIntegerOrNA(which_width); } // else if . -- treated below // else if type spec like dfgxo -- treated below // else probably an error, will be caught below // 4. optional field precision: none / .0123 / . / .* / .0123$ if (f[jc] == '.') { jc++; if (jc == j1) { // precision "." is ".0" precision = 0; } if (f[jc] >= '0' && f[jc] <= '9') { // trailing 0s will be ignored precision = stri__atoi_to_other(f, /*by reference*/jc, j0, j1); } else if (f[jc] == '*') { // take from ... 
args jc++; int which_precision = STRI_SPRINTF_NOT_PROVIDED; if (f[jc] >= '0' && f[jc] <= '9') { which_precision = stri__atoi_to_delim( f, /*by reference*/jc, j0, j1, /*delimiter*/'$' ); if (which_precision != STRI_SPRINTF_NOT_PROVIDED) which_precision--; /*0-based indexing*/ } precision = data->getIntegerOrNA(which_precision); } // else error, exception thrown below } // now we should be at the conversion specifier if (jc != j1) throw StriException(MSG__INVALID_FORMAT_SPECIFIER_SUB, j1-j0+1, f+j0); normalise(); } std::string getFormatString(bool use_sign=true, bool use_pad=true) { // note that trimming based on width/length is done elsewhere normalise(); std::string f("%"); if (alternate_output) f.push_back('#'); if (use_sign && sign_space) f.push_back(' '); if (use_sign && sign_plus) f.push_back('+'); if (use_pad && pad_from_right) f.push_back('-'); if (use_pad && pad_zero) f.push_back('0'); if (use_pad && min_width > 0) // and hence not STRI_SPRINTF_NOT_PROVIDED or NA_INTEGER f.append(std::to_string(min_width)); if (precision >= 0) { // and hence not STRI_SPRINTF_NOT_PROVIDED or NA_INTEGER f.push_back('.'); f.append(std::to_string(precision)); } f.push_back(type_spec); return f; } void normalise() { if (type_spec == 'i') type_spec = 'd'; // synonym // TODO: warnings when switching off the flags? 
if (min_width == NA_INTEGER) ; else if (min_width == STRI_SPRINTF_NOT_PROVIDED) ; else if (min_width == 0) min_width = STRI_SPRINTF_NOT_PROVIDED; else if (min_width < 0) { min_width = -min_width; pad_from_right = true; } if (precision == NA_INTEGER) ; else if (precision == STRI_SPRINTF_NOT_PROVIDED) ; else if (precision < 0) precision = STRI_SPRINTF_NOT_PROVIDED; if (pad_from_right) pad_zero = false; if (sign_plus) sign_space = false; if (type == STRI_SPRINTF_TYPE_STRING) { pad_zero = false; // [-Wformat=] even warns about this sign_plus = false; // [-Wformat=] even warns about this sign_space = false; // [-Wformat=] even warns about this alternate_output = false; // precision = maximum width/length, yes, we support it. } else if (type == STRI_SPRINTF_TYPE_INTEGER) { // precision -- minimal number of digits that must appear if (type_spec != 'd') { // and not i, because i->d sign_plus = false; // [-Wformat=] even warns about this sign_space = false; // [-Wformat=] even warns about this } } } StriSprintfFormatStatus formatDatum(std::string& preformatted_datum) { StriSprintfFormatStatus status; if (type == STRI_SPRINTF_TYPE_INTEGER) { int datum = data->getIntegerOrNA(which_datum); status = preformatDatum_doxX(preformatted_datum/*by reference*/, datum); } else if (type == STRI_SPRINTF_TYPE_DOUBLE) { double datum = data->getDoubleOrNA(which_datum); status = preformatDatum_feEgGaA(preformatted_datum/*by reference*/, datum); } else { // string const String8& datum = data->getStringOrNA(which_datum); status = preformatDatum_s(preformatted_datum, datum); } if (status != STRI_SPRINTF_FORMAT_STATUS_NEEDS_PADDING) return status; if (min_width <= 0) // includes NA_INTEGER and STRI_SPRINTF_NOT_PROVIDED return STRI_SPRINTF_FORMAT_STATUS_OK; // no trimming needed STRI_ASSERT(min_width > 0); R_len_t datum_size; if (use_length) // number of code points datum_size = stri__length_string(preformatted_datum.c_str(), preformatted_datum.length()); else datum_size = 
stri__width_string(preformatted_datum.c_str(), preformatted_datum.length()); if (datum_size < min_width) { // now we need to pad with spaces from left or right up to min_width // based on width or length (use_length) // btw: pad_from_right always add spaces // btw: pad_zero "-00000" "+00000" " 00000" "0x0000" "0X0000" // but not NA/Inf/... and only numerics, // and this needs_padding no more (already dealt with) if (pad_from_right) preformatted_datum.append(min_width-datum_size, ' '); else preformatted_datum.assign(std::string(min_width-datum_size, ' ') + preformatted_datum); } return STRI_SPRINTF_FORMAT_STATUS_OK; } private: StriSprintfFormatStatus preformatDatum_doxX(std::string& preformatted_datum, int datum) { STRI_ASSERT(type_spec != 'i'); // normalised i->d bool isna = (datum == NA_INTEGER || min_width == NA_INTEGER || precision == NA_INTEGER); if (!isna) { R_len_t bufsize = std::max(0, min_width); bufsize += std::max(0, precision); bufsize += 128; // "just in case" (0x, sign, dot, and stuff) std::vector buf; buf.resize(bufsize); // oh, oh, oh, so lazy, using std::snprintf (good enough) // TODO: use ICU NumberFormat for '%d' (locale dependent) when "'" flag is set std::string format_string = getFormatString(); snprintf(buf.data(), bufsize, format_string.c_str(), datum); preformatted_datum.append(buf.data()); return STRI_SPRINTF_FORMAT_STATUS_OK; /* all in ASCII, padding done by std::snprintf */ } else if (na_string.isNA()) return STRI_SPRINTF_FORMAT_STATUS_IS_NA; else { STRI_ASSERT(type_spec == 'd' || !sign_plus); STRI_ASSERT(type_spec == 'd' || !sign_space); if (sign_plus) { // glibc produces "+nan", but we will output " nan" instead preformatted_datum.push_back(' '); } else if (sign_space) preformatted_datum.push_back(' '); // else no sign STRI_ASSERT(!na_string.isNA()); preformatted_datum.append(na_string.c_str()); return STRI_SPRINTF_FORMAT_STATUS_NEEDS_PADDING; /* might need padding (na_string can be fancy Unicode) */ } } StriSprintfFormatStatus 
preformatDatum_feEgGaA(std::string& preformatted_datum, double datum) { bool isna = (ISNA(datum) || min_width == NA_INTEGER || precision == NA_INTEGER); if (R_FINITE(datum) && !isna) { R_len_t bufsize = std::max(0, min_width); bufsize += std::max(0, precision); bufsize += 128; // "just in case" (0x, sign, dot, and stuff) std::vector buf; buf.resize(bufsize); // lazybones, using std::sprintf (the good-enough approach) // TODO: use ICU NumberFormat for '%feEgG' (locale dependent) when "'" flag is set std::string format_string = getFormatString(); snprintf(buf.data(), bufsize, format_string.c_str(), datum); preformatted_datum.append(buf.data()); return STRI_SPRINTF_FORMAT_STATUS_OK; /* all in ASCII, padding done by std::snprintf */ } else if ( (na_string.isNA() && isna) || (nan_string.isNA() && ISNAN(datum)) || (inf_string.isNA() && std::isinf(datum)) ) { return STRI_SPRINTF_FORMAT_STATUS_IS_NA; } else { if (isna || ISNAN(datum)) { if (sign_plus) { // glibc produces "+nan", but we will output " nan" instead preformatted_datum.push_back(' '); } else if (sign_space) preformatted_datum.push_back(' '); // else no sign } else if (datum < 0.0 /* minus infinity */) preformatted_datum.push_back('-'); else { // plus infinity if (sign_plus) preformatted_datum.push_back('+'); else if (sign_space) preformatted_datum.push_back(' '); // else no sign } // alternate_output has no effect (use inf_string etc. 
instead) if (isna) { STRI_ASSERT(!na_string.isNA()); preformatted_datum.append(na_string.c_str()); } else if (ISNAN(datum)) { STRI_ASSERT(!nan_string.isNA()); preformatted_datum.append(nan_string.c_str()); } else { STRI_ASSERT(!inf_string.isNA()); preformatted_datum.append(inf_string.c_str()); } return STRI_SPRINTF_FORMAT_STATUS_NEEDS_PADDING; /* might need padding (na_string can be fancy Unicode) */ } } StriSprintfFormatStatus preformatDatum_s(std::string& preformatted_datum, const String8& datum) { STRI_ASSERT(!pad_zero); STRI_ASSERT(!sign_plus); STRI_ASSERT(!sign_space); STRI_ASSERT(!alternate_output); bool isna = (datum.isNA() || min_width == NA_INTEGER || precision == NA_INTEGER); if (!isna) { R_len_t datum_size = datum.length(); // this is byte count if (precision >= 0) { if (use_length) { // ha! output no more than code points datum_size = stri__length_string(datum.c_str(), datum_size, precision); } else { // ho! output code points of total width no more than precision characters datum_size = stri__width_string(datum.c_str(), datum_size, precision); } } preformatted_datum.append(datum.c_str(), datum_size); } else if (na_string.isNA()) return STRI_SPRINTF_FORMAT_STATUS_IS_NA; else { // isNA if (na_string.isNA()) return STRI_SPRINTF_FORMAT_STATUS_IS_NA; // output na_string, possibly trimmed R_len_t na_string_size = na_string.length(); // this is byte count if (precision >= 0) { if (use_length) { // ha! output no more than code points na_string_size = stri__length_string(na_string.c_str(), na_string_size, precision); } else { // ho! 
output code points of total width no more than precision characters na_string_size = stri__width_string(na_string.c_str(), na_string_size, precision); } } preformatted_datum.append(na_string.c_str(), na_string_size); } return STRI_SPRINTF_FORMAT_STATUS_NEEDS_PADDING; /* might need padding */ } }; /** Formats a single string * * @version 1.6.2 (Marek Gagolewski, 2021-05-24) */ SEXP stri__sprintf_1( const String8& _f, StriSprintfDataProvider* data, const String8& na_string, const String8& inf_string, const String8& nan_string, bool use_length ) { STRI_ASSERT(!_f.isNA()); R_len_t n = _f.length(); const char* f = _f.c_str(); std::string buf; buf.reserve(n+1); // whatever; maybe there are no format specifiers at all R_len_t i=0; while (i < n) { // consume everything up to the next '%' if (f[i] != '%') { buf.push_back(f[i++]); continue; } // '%' found. i++; if (i >= n) // dangling % throw StriException(MSG__INVALID_FORMAT_SPECIFIER, ""); // if "%%", then output '%' and continue looking for the next '%' if (f[i] == '%') { buf.push_back('%'); i++; continue; } // We have %., where . is not a %% -- a possible format specifier // pre-flight stage -- look for the indef of a type spec (dfFgGs etc.) 
R_len_t j0 = i; // start R_len_t j1 = stri__find_type_spec(f, i, n); // stop i = j1+1; // in the next iteration, start right after the format spec // now f[j0..j1] may be a format specifier (without the preceding %) StriSprintfFormatSpec spec( f, j0, j1, data, na_string, inf_string, nan_string, use_length ); // debug: Rprintf("*** spec=%s\n", spec.toString().c_str()); // debug: buf.append(spec.toString()); std::string formatted_datum; if (spec.formatDatum(formatted_datum) == STRI_SPRINTF_FORMAT_STATUS_IS_NA) return NA_STRING; buf.append(formatted_datum); } return Rf_mkCharLenCE(buf.data(), buf.size(), CE_UTF8); } /** * Format a string * * vectorized over format and each vector in x * * @param format character vector * @param x list of vectors * @param na_string single string, can be NA * @param inf_string single string * @param nan_string single string * @param use_length single logical value * @return character vector * * @version 1.6.2 (Marek Gagolewski, 2021-05-24) */ SEXP stri_sprintf(SEXP format, SEXP x, SEXP na_string, SEXP inf_string, SEXP nan_string, SEXP use_length) { bool use_length_val = stri__prepare_arg_logical_1_notNA(use_length, "use_length"); PROTECT(x = stri__prepare_arg_list(x, "x")); PROTECT(format = stri__prepare_arg_string(format, "format")); PROTECT(na_string = stri__prepare_arg_string_1(na_string, "na_string")); PROTECT(inf_string = stri__prepare_arg_string_1(inf_string, "inf_string")); PROTECT(nan_string = stri__prepare_arg_string_1(nan_string, "nan_string")); R_len_t format_length = LENGTH(format); R_len_t vectorize_length = format_length; R_len_t narg = LENGTH(x); // TODO: allow for the Unicode plus and minus // TODO: ICU number format 1,234.567 / 1 234,567 / etc. 
for (R_len_t j=0; j 0) { R_len_t cur_length = LENGTH(VECTOR_ELT(x, j)); if (cur_length <= 0) vectorize_length = 0; else if (vectorize_length < cur_length) vectorize_length = cur_length; } } if (vectorize_length <= 0) { UNPROTECT(5); return Rf_allocVector(STRSXP, 0); } // ASSERT: vectorize_length > 0 // ASSERT: all elements in x meet Rf_isVector(VECTOR_ELT(x, j)) if (vectorize_length % format_length != 0) Rf_warning(MSG__WARN_RECYCLING_RULE); for (R_len_t j=0; jreset(i); SEXP out; STRI__PROTECT(out = stri__sprintf_1( format_cont.get(i), data, na_string_cont.getNAble(0), inf_string_cont.getNAble(0), nan_string_cont.getNAble(0), use_length_val )); SET_STRING_ELT(ret, i, out); STRI__UNPROTECT(1); } // there was no error, we may want to warn about unused args data->warn_if_arg_unused = true; delete data; // need to be deleted before UNPROTECTing the x list data = NULL; STRI__UNPROTECT_ALL return ret; STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */) } stringi/src/stri_trans_casemap.cpp0000644000176200001440000003131414770541312017113 0ustar liggesusers/* This file is part of the 'stringi' project. * Copyright (c) 2013-2025, Marek Gagolewski * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "stri_stringi.h" #include "stri_container_utf8.h" #include "stri_string8buf.h" #include "stri_brkiter.h" #include #define STRI_CASEMAP_TOLOWER 1 #define STRI_CASEMAP_TOUPPER 2 #define STRI_CASEMAP_CASEFOLD 3 /** * Convert case (TitleCase) * * * @param str character vector * @param opts_brkiter list * @return character vector * * @version 0.4-1 (Marek Gagolewski, 2014-12-03) * separated from stri_trans_casemap; * use StriUBreakIterator */ SEXP stri_trans_totitle(SEXP str, SEXP opts_brkiter) { StriBrkIterOptions opts_brkiter2(opts_brkiter, "word"); PROTECT(str = stri__prepare_arg_string(str, "str")); // prepare string argument // version 0.2-1 - Does not work with ICU 4.8 (but we require ICU >= 50) UCaseMap* ucasemap = NULL; STRI__ERROR_HANDLER_BEGIN(1) StriUBreakIterator brkiter(opts_brkiter2); UErrorCode status = U_ZERO_ERROR; ucasemap = ucasemap_open(brkiter.getLocale(), U_FOLD_CASE_DEFAULT, &status); STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) status = U_ZERO_ERROR; ucasemap_setBreakIterator(ucasemap, brkiter.getIterator(), &status); STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) brkiter.free(false); // ucasemap_setOptions(ucasemap, 
U_TITLECASE_NO_LOWERCASE, &status); // to do? // now briter is owned by ucasemap. // it will be released on ucasemap_close // (checked with ICU man & src code) R_len_t str_n = LENGTH(str); StriContainerUTF8 str_cont(str, str_n); SEXP ret; STRI__PROTECT(ret = Rf_allocVector(STRSXP, str_n)); // STEP 1. // Estimate the required buffer length // Notice: The resulting number of codepoints may be larger or smaller than // the number before casefolding R_len_t bufsize = str_cont.getMaxNumBytes(); bufsize += 10; // a small margin String8buf buf(bufsize); // STEP 2. // Do case folding for (R_len_t i = str_cont.vectorize_init(); i != str_cont.vectorize_end(); i = str_cont.vectorize_next(i)) { if (str_cont.isNA(i)) { SET_STRING_ELT(ret, i, NA_STRING); continue; } R_len_t str_cur_n = str_cont.get(i).length(); const char* str_cur_s = str_cont.get(i).c_str(); status = U_ZERO_ERROR; int buf_need = ucasemap_utf8ToTitle(ucasemap, buf.data(), buf.size(), (const char*)str_cur_s, str_cur_n, &status); if (U_FAILURE(status)) { buf.resize(buf_need, false/*destroy contents*/); status = U_ZERO_ERROR; buf_need = ucasemap_utf8ToTitle(ucasemap, buf.data(), buf.size(), (const char*)str_cur_s, str_cur_n, &status); STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) // this shouldn't happen // we do have the buffer size required to complete this op } SET_STRING_ELT(ret, i, Rf_mkCharLenCE(buf.data(), buf_need, CE_UTF8)); } if (ucasemap) { ucasemap_close(ucasemap); ucasemap = NULL; } STRI__UNPROTECT_ALL return ret; STRI__ERROR_HANDLER_END({ if (ucasemap) { ucasemap_close(ucasemap); ucasemap = NULL; } }) } /** * Convert case (upper, lowercase, fold) * * * @param str character vector * @param locale single string identifying * the locale ("" or NULL for default locale) * @return character vector * * * @version 0.1-?? (Marek Gagolewski) * * @version 0.1-?? (Marek Gagolewski) * use StriContainerUTF16 * * @version 0.1-?? 
(Marek Gagolewski, 2013-06-16)
 *          make StriException-friendly
 *
 * @version 0.1-?? (Marek Gagolewski, 2013-11-19)
 *          use UCaseMap + StriContainerUTF8
 *          **THIS DOES NOT WORK WITH ICU 4.8**, we have to revert the changes
 *          ** BTW, since stringi_0.1-25 we require ICU>=50 **
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-03-18)
 *          use UCaseMap + StriContainerUTF8
 *          (this is much faster for UTF-8 and slightly faster for 8bit enc)
 *          Estimates minimal buffer size.
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-10-24)
 *          Use a custom BreakIterator with stri_trans_totitle
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 *
 * @version 0.4-1 (Marek Gagolewski, 2014-12-03)
 *    use StriUBreakIterator
 *
 * @version 0.6-1 (Marek Gagolewski, 2015-07-11)
 *    now this is an internal function
 *
 * @version 1.6.1 (Marek Gagolewski, 2021-04-30)
 *    add casefold
 *
 * Note: _type must be in 1..3; it is compared against STRI_CASEMAP_TOLOWER
 * and STRI_CASEMAP_TOUPPER, and any other accepted value selects case folding.
 */
SEXP stri_trans_casemap(SEXP str, int _type, SEXP locale)
{
    // internal argument, checked manually
    if (_type < 1 || _type > 3)
        Rf_error(MSG__INCORRECT_INTERNAL_ARG);

    const char* qloc = stri__prepare_arg_locale(locale, "locale"); /* this is R_alloc'ed */
    PROTECT(str = stri__prepare_arg_string(str, "str")); // prepare string argument

// version 0.2-1 - Does not work with ICU 4.8 (but we require ICU >= 50)
    UCaseMap* ucasemap = NULL;

    STRI__ERROR_HANDLER_BEGIN(1)
    UErrorCode status = U_ZERO_ERROR;
    ucasemap = ucasemap_open(qloc, U_FOLD_CASE_DEFAULT, &status);
    STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})

    // TODO: U_USING_DEFAULT_WARNING when qloc!=0
    // NOTE: we can't check if the submitted locale is valid,
    //       because there is no API for it [ULOC_VALID_LOCALE]

    R_len_t str_n = LENGTH(str);
    StriContainerUTF8 str_cont(str, str_n);
    SEXP ret;
    STRI__PROTECT(ret = Rf_allocVector(STRSXP, str_n));

    // STEP 1.
    // Estimate the required buffer length
    // Notice: The resulting number of code points may be larger or smaller than
    // the number before case mapping
    R_len_t bufsize = str_cont.getMaxNumBytes();
    bufsize += 10; // a small margin
    String8buf buf(bufsize);

    // STEP 2.
    // Do case mapping (lower / upper / fold, depending on _type)
    for (R_len_t i = str_cont.vectorize_init();
            i != str_cont.vectorize_end();
            i = str_cont.vectorize_next(i))
    {
        if (str_cont.isNA(i)) {
            SET_STRING_ELT(ret, i, NA_STRING);
            continue;
        }

        R_len_t str_cur_n     = str_cont.get(i).length();
        const char* str_cur_s = str_cont.get(i).c_str();

        int buf_need;
        bool retry = false;
        // at most two passes: the second one runs after enlarging the buffer
        while (true) {
            status = U_ZERO_ERROR;

            if (_type == STRI_CASEMAP_TOLOWER) {
                buf_need = ucasemap_utf8ToLower(
                    ucasemap, buf.data(), buf.size(),
                    (const char*)str_cur_s, str_cur_n, &status
                );
            }
            else if (_type == STRI_CASEMAP_TOUPPER) {
                buf_need = ucasemap_utf8ToUpper(
                    ucasemap, buf.data(), buf.size(),
                    (const char*)str_cur_s, str_cur_n, &status
                );
            }
            else {
                buf_need = ucasemap_utf8FoldCase(
                    ucasemap, buf.data(), buf.size(),
                    (const char*)str_cur_s, str_cur_n, &status
                );
            }

            if (!U_FAILURE(status)) break;

            if (!retry) {
                buf.resize(buf_need, false/*destroy contents*/);
                // we now have the buffer size required to complete this op
                retry = true;
            }
            else {
                // second failure in a row: give up
                STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) // this shouldn't happen
            }
        }

        SET_STRING_ELT(ret, i, Rf_mkCharLenCE(buf.data(), buf_need, CE_UTF8));
    }

    if (ucasemap) {
        ucasemap_close(ucasemap);
        ucasemap = NULL;
    }
    STRI__UNPROTECT_ALL
    return ret;

    STRI__ERROR_HANDLER_END({
        // release the case map also on the error path
        if (ucasemap) {
            ucasemap_close(ucasemap);
            ucasemap = NULL;
        }
    })
}


/**
 *  Convert to lower case
 *
 *
 *  @param str character vector
 *  @param locale single string identifying
 *         the locale ("" or NULL for default locale)
 *  @return character vector
 *
 * @version 0.6-1 (Marek Gagolewski, 2015-07-11)
 *    call stri_trans_casemap
 */
SEXP stri_trans_tolower(SEXP str, SEXP locale)
{
    return stri_trans_casemap(str, STRI_CASEMAP_TOLOWER, locale);
}


/**
 *  Convert to upper case
 *
 *
@param str character vector
 *  @param locale single string identifying
 *         the locale ("" or NULL for default locale)
 *  @return character vector
 *
 * @version 0.6-1 (Marek Gagolewski, 2015-07-11)
 *    call stri_trans_casemap
 */
SEXP stri_trans_toupper(SEXP str, SEXP locale)
{
    // thin wrapper: all the work is done by stri_trans_casemap
    return stri_trans_casemap(str, STRI_CASEMAP_TOUPPER, locale);
}


/**
 *  Case folding
 *
 *  @param str character vector
 *  @return character vector
 *
 * @version 1.6.1 (Marek Gagolewski, 2021-04-30)
 *    call stri_trans_casemap
 */
SEXP stri_trans_casefold(SEXP str)
{
    // no locale argument here: R_NilValue is passed as the locale
    return stri_trans_casemap(str, STRI_CASEMAP_CASEFOLD, R_NilValue);
}


// Legacy implementation, kept for reference only (entirely commented out):
// v0.1-?? - UTF-16 - WORKS WITH ICU 4.8
// (this is much slower for UTF-8 and slightly slower for 8bit enc)
// Slower than v0.2-1
////   BreakIterator* briter = NULL;
//
//   STRI__ERROR_HANDLER_BEGIN
//
//   if (!Rf_isInteger(type) || LENGTH(type) != 1)
//      throw StriException(MSG__INCORRECT_INTERNAL_ARG); // this is an internal arg, check manually
//   int _type = INTEGER(type)[0];
//
//
//   Locale loc = Locale::createFromName(qloc); // this will be freed automatically
//   StriContainerUTF16 str_cont(str, LENGTH(str), false); // writable, no recycle
//
////   if (_type == 6) {
////      UErrorCode status = U_ZERO_ERROR;
////      briter = BreakIterator::createWordInstance(loc, status);
////      STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})
////   }
//
//   for (R_len_t i = str_cont.vectorize_init();
//         i != str_cont.vectorize_end();
//         i = str_cont.vectorize_next(i))
//   {
//      if (!str_cont.isNA(i)) {
//         switch (_type) {
//            case 1:
//               str_cont.getWritable(i).toLower(loc);
//               break;
//            case 2:
//               str_cont.getWritable(i).toUpper(loc);
//               break;
//            case 3:
//               str_cont.getWritable(i).toTitle(NULL, loc); // use default ICU's BreakIterator
//               break;
//            case 4:
//               str_cont.getWritable(i).foldCase(U_FOLD_CASE_DEFAULT);
//               break;
//            case 5:
//               str_cont.getWritable(i).foldCase(U_FOLD_CASE_EXCLUDE_SPECIAL_I);
//               break;
////            case 6:
////               str_cont.getWritable(i).toTitle(briter, loc); // how to get it working properly with English text???
////               I guess ICU doesn't support language-sensitive title casing at all...
////               break;
//            default:
//               throw StriException("stri_trans_case: incorrect case conversion type");
//         }
//      }
//   }
//
////   if (briter) { delete briter; briter = NULL; }
//   SEXP ret;
//   PROTECT(ret = str_cont.toR());
//   UNPROTECT(1);
//   return ret;
//   STRI__ERROR_HANDLER_END(/*noop*/;
////      if (briter) delete briter;
//   )
// NOTE(review): the next line looks like tar-archive header residue marking
// the start of another file (stri_search_coll_count.cpp) -- not C++ source
stringi/src/stri_search_coll_count.cpp0000644000176200001440000001017714770541312017765 0ustar liggesusers
/* This file is part of the 'stringi' project.
 * Copyright (c) 2013-2025, Marek Gagolewski <https://www.gagolewski.com/>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING,
 * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT
 * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
 * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */


#include "stri_stringi.h"
#include "stri_container_base.h"
#include "stri_container_utf16.h"
#include "stri_container_usearch.h"


/**
 * Count pattern occurrences in a string [with collation]
 *
 * @param str character vector
 * @param pattern character vector
 * @param opts_collator passed to stri__ucol_open()
 * @return integer vector
 *
 * @version 0.1-?? (Marek Gagolewski)
 *
 * @version 0.1-?? (Marek Gagolewski)
 *          corrected behavior on empty str/pattern
 *
 * @version 0.1-?? (Marek Gagolewski, 2013-06-23)
 *          make StriException-friendly,
 *          use StriContainerUStringSearch
 *
 * @version 0.2-3 (Marek Gagolewski, 2014-05-08)
 *          new fun: stri_count_coll (opts_collator == NA not allowed)
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 */
SEXP stri_count_coll(SEXP str, SEXP pattern, SEXP opts_collator)
{
    PROTECT(str = stri__prepare_arg_string(str, "str"));
    PROTECT(pattern = stri__prepare_arg_string(pattern, "pattern"));

    // call stri__ucol_open after prepare_arg:
    // if prepare_arg had failed, we would have a mem leak
    UCollator* collator = NULL;
    collator = stri__ucol_open(opts_collator);

    STRI__ERROR_HANDLER_BEGIN(2)
    R_len_t vectorize_length = stri__recycling_rule(true, 2, LENGTH(str), LENGTH(pattern));
    StriContainerUTF16 str_cont(str, vectorize_length);
    StriContainerUStringSearch pattern_cont(pattern, vectorize_length, collator);  // collator is not owned by pattern_cont

    SEXP ret;
    STRI__PROTECT(ret = Rf_allocVector(INTSXP, vectorize_length));
    int* ret_tab = INTEGER(ret);

    for (R_len_t i = pattern_cont.vectorize_init();
            i != pattern_cont.vectorize_end();
            i = pattern_cont.vectorize_next(i))
    {
        // the macro is given the result for the NA case (NA_INTEGER) and for
        // the empty case (0); judging by its name, it then skips the element
        STRI__CONTINUE_ON_EMPTY_OR_NA_STR_PATTERN(str_cont, pattern_cont,
                ret_tab[i] = NA_INTEGER,
                ret_tab[i] = 0)

        UStringSearch *matcher = pattern_cont.getMatcher(i, str_cont.get(i));
        usearch_reset(matcher);

        // count the matches until the search is exhausted or ICU reports an error
        UErrorCode status = U_ZERO_ERROR;
        R_len_t found = 0;
        while (!U_FAILURE(status) && ((int)usearch_next(matcher, &status) != USEARCH_DONE))
            ++found;
        STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})

        ret_tab[i] = found;
    }

    if (collator) {
        ucol_close(collator);
        collator=NULL;
    }
    STRI__UNPROTECT_ALL
    return ret;
    STRI__ERROR_HANDLER_END(
        // release the collator also on the error path
        if (collator) ucol_close(collator);
    )
}
// NOTE(review): the next line looks like tar-archive header residue marking
// the start of another file (stri_time_calendar.cpp) -- not C++ source
stringi/src/stri_time_calendar.cpp0000644000176200001440000003770414770542105017074 0ustar liggesusers
/* This file is part of the 'stringi' project.
 * Copyright (c) 2013-2025, Marek Gagolewski <https://www.gagolewski.com/>
 * All rights reserved.
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "stri_stringi.h" #include "stri_container_utf8.h" #include "stri_container_double.h" #include "stri_container_integer.h" #include #include /** Set POSIXct class on a given object * * @param x R object * * @version 0.5-1 (Marek Gagolewski, 2014-12-29) */ void stri__set_class_POSIXct(SEXP x) { SEXP cl; PROTECT(cl = Rf_allocVector(STRSXP, 2)); // SET_STRING_ELT(cl, 0, Rf_mkChar("POSIXst")); SET_STRING_ELT(cl, 0, Rf_mkChar("POSIXct")); SET_STRING_ELT(cl, 1, Rf_mkChar("POSIXt")); Rf_setAttrib(x, R_ClassSymbol, cl); UNPROTECT(1); } /** Get current date-time * * @return POSIXct * * @version 0.5-1 (Marek Gagolewski, 2014-12-29) */ SEXP stri_datetime_now() { UDate now = Calendar::getNow(); SEXP ret; PROTECT(ret = Rf_ScalarReal(((double)now)/1000.0)); // msec.->sec. stri__set_class_POSIXct(ret); UNPROTECT(1); return ret; } /** Get calendar * * @return Calendar * * @version 1.8.1 (Marek Gagolewski, 2023-11-07) */ Calendar* stri__get_calendar(const char* locale_val) { UErrorCode status = U_ZERO_ERROR; Calendar* cal = Calendar::createInstance(Locale::createFromName(locale_val), status); STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) // NOTE: unfortunately, in ICU 74.1 U_USING_DEFAULT_WARNING is never emitted if (status == U_USING_DEFAULT_WARNING && cal && locale_val) { UErrorCode status2 = U_ZERO_ERROR; const char* valid_locale = cal->getLocaleID(ULOC_VALID_LOCALE, status2); if (valid_locale && !strcmp(valid_locale, "root")) Rf_warning("%s", ICUError::getICUerrorName(status)); } return cal; } /** Date-time arithmetic * * @param time * @param value * @param units * @param tz * @param locale * * @return POSIXct * * @version 0.5-1 (Marek Gagolewski, 2014-12-30) * * @version 0.5-1 (Marek Gagolewski, 2015-03-06) tz arg added * * @version 1.8.1 (Marek Gagolewski, 2023-11-07) * #476: Warn when falling back to the root locale, make C==en_US_POSIX */ SEXP stri_datetime_add(SEXP time, SEXP value, SEXP units, SEXP tz, SEXP locale) { PROTECT(time = 
stri__prepare_arg_POSIXct(time, "time")); PROTECT(value = stri__prepare_arg_integer(value, "value")); if (!Rf_isNull(tz)) PROTECT(tz = stri__prepare_arg_string_1(tz, "tz")); else PROTECT(tz); /* needed to set tzone attrib */ R_len_t vectorize_length = stri__recycling_rule(true, 2, LENGTH(time), LENGTH(value)); const char* units_val = stri__prepare_arg_string_1_notNA(units, "units"); const char* units_opts[] = {"years", "months", "weeks", "days", "hours", "minutes", "seconds", "milliseconds", NULL}; int units_cur = stri__match_arg(units_val, units_opts); const char* locale_val = stri__prepare_arg_locale(locale, "locale"); TimeZone* tz_val = stri__prepare_arg_timezone(tz, "tz", true/*allowdefault*/); Calendar* cal = NULL; STRI__ERROR_HANDLER_BEGIN(3) StriContainerDouble time_cont(time, vectorize_length); StriContainerInteger value_cont(value, vectorize_length); UCalendarDateFields units_field; switch (units_cur) { case 0: units_field = UCAL_YEAR; break; case 1: units_field = UCAL_MONTH; break; case 2: units_field = UCAL_WEEK_OF_YEAR; break; case 3: units_field = UCAL_DAY_OF_MONTH; break; case 4: units_field = UCAL_HOUR_OF_DAY; break; case 5: units_field = UCAL_MINUTE; break; case 6: units_field = UCAL_SECOND; break; case 7: units_field = UCAL_MILLISECOND; break; default: throw StriException(MSG__INCORRECT_MATCH_OPTION, "units"); } cal = stri__get_calendar(locale_val); cal->adoptTimeZone(tz_val); tz_val = NULL; /* The Calendar takes ownership of the TimeZone. 
*/ UErrorCode status = U_ZERO_ERROR; SEXP ret; STRI__PROTECT(ret = Rf_allocVector(REALSXP, vectorize_length)); double* ret_val = REAL(ret); for (R_len_t i=0; isetTime((UDate)(time_cont.get(i)*1000.0), status); STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) status = U_ZERO_ERROR; cal->add(units_field, value_cont.get(i), status); STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) status = U_ZERO_ERROR; ret_val[i] = ((double)cal->getTime(status))/1000.0; STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) } if (!Rf_isNull(tz)) Rf_setAttrib(ret, Rf_ScalarString(Rf_mkChar("tzone")), tz); stri__set_class_POSIXct(ret); if (tz_val) { delete tz_val; tz_val = NULL; } if (cal) { delete cal; cal = NULL; } STRI__UNPROTECT_ALL return ret; STRI__ERROR_HANDLER_END({ if (tz_val) { delete tz_val; tz_val = NULL; } if (cal) { delete cal; cal = NULL; } }) } /** * Get values of date-time fields * * @param time * @param locale * @param tz * * @return list * * @version 0.5-1 (Marek Gagolewski, 2015-01-01) * * @version 0.5-1 (Marek Gagolewski, 2015-03-03) tz arg added * * @version 1.8.1 (Marek Gagolewski, 2023-11-07) * #476: Warn when falling back to the root locale, make C==en_US_POSIX */ SEXP stri_datetime_fields(SEXP time, SEXP tz, SEXP locale) { PROTECT(time = stri__prepare_arg_POSIXct(time, "time")); const char* locale_val = stri__prepare_arg_locale(locale, "locale"); if (!Rf_isNull(tz)) PROTECT(tz = stri__prepare_arg_string_1(tz, "tz")); else PROTECT(tz); /* needed to set tzone attrib */ TimeZone* tz_val = stri__prepare_arg_timezone(tz, "tz", true/*allowdefault*/); Calendar* cal = NULL; STRI__ERROR_HANDLER_BEGIN(2) R_len_t vectorize_length = LENGTH(time); StriContainerDouble time_cont(time, vectorize_length); cal = stri__get_calendar(locale_val); cal->adoptTimeZone(tz_val); tz_val = NULL; /* The Calendar takes ownership of the TimeZone. 
*/ UErrorCode status = U_ZERO_ERROR; SEXP ret; #define STRI__FIELDS_NUM 14 STRI__PROTECT(ret = Rf_allocVector(VECSXP, STRI__FIELDS_NUM)); for (R_len_t j=0; jsetTime((UDate)(time_cont.get(i)*1000.0), status); STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) for (R_len_t j=0; j SecondsInDay // UCAL_AM_PM -> "AM" or "PM" (localized? or factor?+index in stri_datetime_symbols) add arg use_symbols???? // UCAL_DAY_OF_WEEK -> (localized? or factor?) SUNDAY, MONDAY // UCAL_DAY_OF_YEAR ' // isWekend status = U_ZERO_ERROR; INTEGER(VECTOR_ELT(ret, j))[i] = cal->get(units_field, status); STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) if (units_field == UCAL_MONTH) ++INTEGER(VECTOR_ELT(ret, j))[i]; // month + 1 else if (units_field == UCAL_AM_PM) ++INTEGER(VECTOR_ELT(ret, j))[i]; // ampm + 1 else if (units_field == UCAL_ERA) ++INTEGER(VECTOR_ELT(ret, j))[i]; // era + 1 } } stri__set_names(ret, STRI__FIELDS_NUM, "Year", "Month", "Day", "Hour", "Minute", "Second", "Millisecond", "WeekOfYear", "WeekOfMonth","DayOfYear", "DayOfWeek", "Hour12", "AmPm", "Era"); if (tz_val) { delete tz_val; tz_val = NULL; } if (cal) { delete cal; cal = NULL; } STRI__UNPROTECT_ALL return ret; STRI__ERROR_HANDLER_END({ if (tz_val) { delete tz_val; tz_val = NULL; } if (cal) { delete cal; cal = NULL; } }) } /** * Create a date-time object * * @param year * @param month * @param day * @param hours * @param minutes * @param seconds * @param tz * @param lenient * @param locale * * @return POSIXct * * @version 0.5-1 (Marek Gagolewski, 2015-01-01) * @version 0.5-1 (Marek Gagolewski, 2015-01-11) lenient arg added * @version 0.5-1 (Marek Gagolewski, 2015-03-02) tz arg added * @version 1.1.2 (Marek Gagolewski, 2016-09-30) round() is not C++98 * * @version 1.8.1 (Marek Gagolewski, 2023-11-07) * #476: Warn when falling back to the root locale, make C==en_US_POSIX */ SEXP stri_datetime_create( SEXP year, SEXP month, SEXP day, SEXP hour, SEXP minute, SEXP second, SEXP 
lenient, SEXP tz, SEXP locale ) { PROTECT(year = stri__prepare_arg_integer(year, "year")); PROTECT(month = stri__prepare_arg_integer(month, "month")); PROTECT(day = stri__prepare_arg_integer(day, "day")); PROTECT(hour = stri__prepare_arg_integer(hour, "hour")); PROTECT(minute = stri__prepare_arg_integer(minute, "minute")); PROTECT(second = stri__prepare_arg_double(second, "second")); const char* locale_val = stri__prepare_arg_locale(locale, "locale"); bool lenient_val = stri__prepare_arg_logical_1_notNA(lenient, "lenient"); if (!Rf_isNull(tz)) PROTECT(tz = stri__prepare_arg_string_1(tz, "tz")); else PROTECT(tz); /* needed to set tzone attrib */ R_len_t vectorize_length = stri__recycling_rule(true, 6, LENGTH(year), LENGTH(month), LENGTH(day), LENGTH(hour), LENGTH(minute), LENGTH(second)); TimeZone* tz_val = stri__prepare_arg_timezone(tz, "tz", true/*allowdefault*/); Calendar* cal = NULL; STRI__ERROR_HANDLER_BEGIN(7) StriContainerInteger year_cont(year, vectorize_length); StriContainerInteger month_cont(month, vectorize_length); StriContainerInteger day_cont(day, vectorize_length); StriContainerInteger hour_cont(hour, vectorize_length); StriContainerInteger minute_cont(minute, vectorize_length); StriContainerDouble second_cont(second, vectorize_length); cal = stri__get_calendar(locale_val); cal->setLenient(lenient_val); cal->adoptTimeZone(tz_val); tz_val = NULL; /* The Calendar takes ownership of the TimeZone. 
*/ UErrorCode status = U_ZERO_ERROR; SEXP ret; STRI__PROTECT(ret = Rf_allocVector(REALSXP, vectorize_length)); double* ret_val = REAL(ret); for (R_len_t i=0; iset(UCAL_EXTENDED_YEAR, year_cont.get(i)); cal->set(UCAL_MONTH, month_cont.get(i)-1); cal->set(UCAL_DATE, day_cont.get(i)); cal->set(UCAL_HOUR_OF_DAY, hour_cont.get(i)); cal->set(UCAL_MINUTE, minute_cont.get(i)); cal->set(UCAL_SECOND, (int)floor(second_cont.get(i))); cal->set(UCAL_MILLISECOND, (int)fround((second_cont.get(i)-floor(second_cont.get(i)))*1000.0, 0)); status = U_ZERO_ERROR; ret_val[i] = ((double)cal->getTime(status))/1000.0; if (U_FAILURE(status)) REAL(ret)[i] = NA_REAL; } if (!Rf_isNull(tz)) Rf_setAttrib(ret, Rf_ScalarString(Rf_mkChar("tzone")), tz); stri__set_class_POSIXct(ret); if (tz_val) { delete tz_val; tz_val = NULL; } if (cal) { delete cal; cal = NULL; } STRI__UNPROTECT_ALL return ret; STRI__ERROR_HANDLER_END({ if (tz_val) { delete tz_val; tz_val = NULL; } if (cal) { delete cal; cal = NULL; } }) } // /** // * @param x list // * @return POSIXct // * // * @version 0.5-1 (Marek Gagolewski, 2015-03-07) // */ // SEXP stri_c_posixst(SEXP x) { // if (!Rf_isVectorList(x)) Rf_error(MSG__INTERNAL_ERROR); // R_len_t n = LENGTH(x); // R_len_t m = 0; // for (R_len_t i=0; i * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "stri_stringi.h" #include // for R_tryCatchError ------------------------------------------------------- SEXP stri__call_as_character(void* data) { SEXP call; SEXP x = (SEXP)data; PROTECT(call = Rf_lang2(Rf_install("as.character"), x)); PROTECT(x = Rf_eval(call, R_BaseEnv)); // Q: BaseEnv has the generic as.* UNPROTECT(2); return x; } SEXP stri__call_as_integer(void* data) { SEXP call; SEXP x = (SEXP)data; PROTECT(call = Rf_lang2(Rf_install("as.integer"), x)); PROTECT(x = Rf_eval(call, R_BaseEnv)); // Q: BaseEnv has the generic as.* UNPROTECT(2); return x; } SEXP stri__call_as_double(void* data) { SEXP call; SEXP x = (SEXP)data; PROTECT(call = Rf_lang2(Rf_install("as.double"), x)); PROTECT(x = Rf_eval(call, R_BaseEnv)); // Q: BaseEnv has the generic as.* UNPROTECT(2); return x; } SEXP stri__call_as_logical(void* data) { SEXP call; SEXP x = (SEXP)data; PROTECT(call = Rf_lang2(Rf_install("as.logical"), x)); PROTECT(x = Rf_eval(call, R_BaseEnv)); // Q: BaseEnv has the generic as.* UNPROTECT(2); return x; } SEXP stri__call_as_raw(void* data) { SEXP call; SEXP x = (SEXP)data; PROTECT(call = Rf_lang2(Rf_install("as.raw"), x)); PROTECT(x = Rf_eval(call, R_BaseEnv)); // Q: BaseEnv has the generic 
as.* UNPROTECT(2); return x; } SEXP stri__call_as_POSIXct(void* data) { SEXP call; SEXP x = (SEXP)data; PROTECT(call = Rf_lang2(Rf_install("as.POSIXct"), x)); PROTECT(x = Rf_eval(call, R_BaseEnv)); // Q: BaseEnv has the generic as.* UNPROTECT(2); return x; } SEXP stri__handler_null(SEXP /*cond*/, void* /*data*/) { return R_NilValue; } // --------------------------------------------------------------------------- /** check if a list is empty or is a list of atomic vectors each of length 1 */ bool stri__check_list_of_scalars(SEXP x) { STRI_ASSERT(Rf_isVectorList(x)); R_len_t nv = LENGTH(x); for (R_len_t i=0; i= R_Version(3, 5, 0) if (allow_error) return stri__call_as_character((void*)x); else return R_tryCatchError(stri__call_as_character, (void*)x, stri__handler_null, NULL); #else return stri__call_as_character((void*)x); #endif } else if ((bool)Rf_isString(x)) return x; // return as-is else if (Rf_isVectorAtomic(x) || Rf_isNull(x)) return Rf_coerceVector(x, STRSXP); else if ((bool)Rf_isSymbol(x)) return Rf_ScalarString(PRINTNAME(x)); Rf_error(MSG__ARG_EXPECTED_STRING, argname); // allowed here return x; // avoid compiler warning } /** * Prepare numeric vector argument * * If the object cannot be coerced, then an error will be generated * * WARNING: By default (allow_error=true), this function is allowed to call the error() function. * Use before STRI__ERROR_HANDLER_BEGIN (with other prepareargs). * * * @param x a numeric vector or an object that can be coerced to a numeric vector * @param argname argument name (message formatting) * @param factors_as_strings treat factors with as.character or as.double? * @param allow_error if false, protect a call to as.* and return NilValue on fail. * @return numeric vector * * @version 0.1-?? (Bartek Tartanus) * * @version 0.1-?? 
(Marek Gagolewski) * argname added * * @version 0.4-1 (Marek Gagolewski, 2014-11-19) * BUGFIX: PROTECT mem from GC in factor object given * * @version 0.4-1 (Marek Gagolewski, 2014-11-27) * treat NULLs as empty vectors * * @version 0.5-1 (Marek Gagolewski, 2015-05-01) * #154 - the class attribute set fires up an as.xxxx call * * @version 1.2.1 (Marek Gagolewski, 2018-04-21) * #285: warn if coercing from a non-trivial list * * @version 1.6.3 (Marek Gagolewski, 2021-05-19) * factors_as_strings, allow_error */ SEXP stri__prepare_arg_double(SEXP x, const char* argname, bool factors_as_strings, bool allow_error) { if ((SEXP*)argname == (SEXP*)R_NilValue) argname = ""; if (factors_as_strings && Rf_isFactor(x)) { // SEXP call; // PROTECT(call = Rf_lang2(Rf_install("as.character"), x)); // PROTECT(x = Rf_eval(call, R_GlobalEnv)); // this will mark its encoding manually // PROTECT(x = Rf_coerceVector(x, REALSXP)); // UNPROTECT(3); // return x; // as.double(as.character(x)) #if defined(R_VERSION) && R_VERSION >= R_Version(3, 5, 0) if (allow_error) PROTECT(x = stri__call_as_character((void*)x)); else { PROTECT(x = R_tryCatchError(stri__call_as_character, (void*)x, stri__handler_null, NULL)); if (Rf_isNull(x)) { UNPROTECT(1); return x; } } #else PROTECT(x = stri__call_as_character((void*)x)); #endif PROTECT(x = Rf_coerceVector(x, REALSXP)); UNPROTECT(2); return x; } else if (Rf_isVectorList(x) || Rf_isObject(x)) // factor is an object too { if (Rf_isVectorList(x) && !stri__check_list_of_scalars(x)) Rf_warning(MSG__WARN_LIST_COERCION); #if defined(R_VERSION) && R_VERSION >= R_Version(3, 5, 0) if (allow_error) return stri__call_as_double((void*)x); else return R_tryCatchError(stri__call_as_double, (void*)x, stri__handler_null, NULL); #else return stri__call_as_double((void*)x); #endif } else if ((bool)Rf_isReal(x)) return x; //return as-is else if (Rf_isVectorAtomic(x) || Rf_isNull(x)) return Rf_coerceVector(x, REALSXP); Rf_error(MSG__ARG_EXPECTED_NUMERIC, argname); // allowed 
here return x; // avoid compiler warning } /** * Prepare integer vector argument * * If the object cannot be coerced, then an error will be generated * * WARNING: By default (allow_error=true), this function is allowed to call the error() function. * Use before STRI__ERROR_HANDLER_BEGIN (with other prepareargs). * * * @param x an integer vector or an object that can be coerced to an integer vector * @param argname argument name (message formatting) * @param factors_as_strings treat factors with as.character or as.integer? * @param allow_error if false, protect a call to as.* and return NilValue on fail. * @return integer vector * * @version 0.1?? (Bartek Tartanus) * * @version 0.1-?? (Marek Gagolewski) * argname added * * @version 0.4-1 (Marek Gagolewski, 2014-11-19) * BUGFIX: PROTECT mem from GC in factor object given * * @version 0.4-1 (Marek Gagolewski, 2014-11-27) * treat NULLs as empty vectors * * @version 0.5-1 (Marek Gagolewski, 2015-05-01) * #154 - the class attribute set fires up an as.xxxx call * * @version 1.2.1 (Marek Gagolewski, 2018-04-21) * #285: warn if coercing from a non-trivial list * * @version 1.6.3 (Marek Gagolewski, 2021-05-19) * factors_as_strings, allow_error */ SEXP stri__prepare_arg_integer(SEXP x, const char* argname, bool factors_as_strings, bool allow_error) { if ((SEXP*)argname == (SEXP*)R_NilValue) argname = ""; if (factors_as_strings && Rf_isFactor(x)) // factors must be checked first (as they are currently represented as integer vectors) { // SEXP call; // PROTECT(call = Rf_lang2(Rf_install("as.character"), x)); // PROTECT(x = Rf_eval(call, R_GlobalEnv)); // this will mark its encoding manually // PROTECT(x = Rf_coerceVector(x, INTSXP)); // UNPROTECT(3); // return x; // as.integer(as.character(x)) #if defined(R_VERSION) && R_VERSION >= R_Version(3, 5, 0) if (allow_error) PROTECT(x = stri__call_as_character((void*)x)); else { PROTECT(x = R_tryCatchError(stri__call_as_character, (void*)x, stri__handler_null, NULL)); if (Rf_isNull(x)) 
{ UNPROTECT(1); return x; } } #else PROTECT(x = stri__call_as_character((void*)x)); #endif PROTECT(x = Rf_coerceVector(x, INTSXP)); UNPROTECT(2); return x; } else if (Rf_isVectorList(x) || Rf_isObject(x)) // factor is an object too { if (Rf_isVectorList(x) && !stri__check_list_of_scalars(x)) Rf_warning(MSG__WARN_LIST_COERCION); #if defined(R_VERSION) && R_VERSION >= R_Version(3, 5, 0) if (allow_error) return stri__call_as_integer((void*)x); else return R_tryCatchError(stri__call_as_integer, (void*)x, stri__handler_null, NULL); #else return stri__call_as_integer((void*)x); #endif } else if (Rf_isInteger(x)) return x; // return as-is else if (Rf_isVectorAtomic(x) || Rf_isNull(x)) return Rf_coerceVector(x, INTSXP); Rf_error(MSG__ARG_EXPECTED_INTEGER, argname); //allowed here return x; // avoid compiler warning } /** * Prepare logical vector argument * * If the object cannot be coerced, then an error will be generated * * WARNING: this function is allowed to call the error() function. * Use before STRI__ERROR_HANDLER_BEGIN (with other prepareargs). * * * @param x a logical vector or an object that can be coerced to a logical vector * @param argname argument name (message formatting) * @return logical vector * * @version 0.1-?? (Bartek Tartanus) * * @version 0.1-?? 
(Marek Gagolewski) * argname added * * @version 0.4-1 (Marek Gagolewski, 2014-11-19) * BUGFIX: PROTECT mem from GC in factor object given * * @version 0.4-1 (Marek Gagolewski, 2014-11-27) * treat NULLs as empty vectors * * @version 0.5-1 (Marek Gagolewski, 2015-05-01) * #154 - the class attribute set fires up an as.xxxx call * * @version 1.2.1 (Marek Gagolewski, 2018-04-21) * #285: warn if coercing from a non-trivial list * * @version 1.6.2 (Marek Gagolewski, 2021-05-19) * call as.logical on factors (not as.character+coerce to LGLSXP) * * @version 1.6.3 (Marek Gagolewski, 2021-05-20) * allow_error */ SEXP stri__prepare_arg_logical(SEXP x, const char* argname, bool allow_error) { if ((SEXP*)argname == (SEXP*)R_NilValue) argname = ""; if (Rf_isFactor(x)) { // SEXP call; // PROTECT(call = Rf_lang2(Rf_install("as.character"), x)); // PROTECT(x = Rf_eval(call, R_GlobalEnv)); // this will mark its encoding manually // PROTECT(x = Rf_coerceVector(x, LGLSXP)); // UNPROTECT(3); #if defined(R_VERSION) && R_VERSION >= R_Version(3, 5, 0) if (allow_error) return stri__call_as_logical((void*)x); else return R_tryCatchError(stri__call_as_logical, (void*)x, stri__handler_null, NULL); #else return stri__call_as_logical((void*)x); #endif } else if (Rf_isVectorList(x) || Rf_isObject(x)) { if (Rf_isVectorList(x) && !stri__check_list_of_scalars(x)) Rf_warning(MSG__WARN_LIST_COERCION); #if defined(R_VERSION) && R_VERSION >= R_Version(3, 5, 0) if (allow_error) return stri__call_as_logical((void*)x); else return R_tryCatchError(stri__call_as_logical, (void*)x, stri__handler_null, NULL); #else return stri__call_as_logical((void*)x); #endif } else if ((bool)Rf_isLogical(x)) return x; // return as-is else if (Rf_isVectorAtomic(x) || Rf_isNull(x)) return Rf_coerceVector(x, LGLSXP); Rf_error(MSG__ARG_EXPECTED_LOGICAL, argname); // allowed here return x; // avoid compiler warning } /** * Prepare raw vector argument * * If the object cannot be coerced, then an error will be generated * * 
WARNING: this function is allowed to call the error() function. * Use before STRI__ERROR_HANDLER_BEGIN (with other prepareargs). * * * @param x a raw vector or an object that can be coerced to a raw vector * @param argname argument name (message formatting) * @return raw vector * * @version 0.1-?? (Marek Gagolewski) * * @version 0.4-1 (Marek Gagolewski, 2014-11-19) * BUGFIX: PROTECT mem from GC in factor object given * * @version 0.4-1 (Marek Gagolewski, 2014-11-27) * treat NULLs as empty vectors * * @version 0.5-1 (Marek Gagolewski, 2015-05-01) * #154 - the class attribute set fires up an as.xxxx call * * @version 1.2.1 (Marek Gagolewski, 2018-04-21) * #285: warn if coercing from a non-trivial list * * @version 1.6.3 (Marek Gagolewski, 2021-05-19) * factors_as_strings, allow_error */ SEXP stri__prepare_arg_raw(SEXP x, const char* argname, bool factors_as_strings, bool allow_error) { if ((SEXP*)argname == (SEXP*)R_NilValue) argname = ""; if (factors_as_strings && Rf_isFactor(x)) { // SEXP call; // PROTECT(call = Rf_lang2(Rf_install("as.character"), x)); // PROTECT(x = Rf_eval(call, R_GlobalEnv)); // this will mark its encoding manually // PROTECT(x = Rf_coerceVector(x, RAWSXP)); // UNPROTECT(3); // return x; // as.raw(as.character(x)) #if defined(R_VERSION) && R_VERSION >= R_Version(3, 5, 0) if (allow_error) PROTECT(x = stri__call_as_character((void*)x)); else { PROTECT(x = R_tryCatchError(stri__call_as_character, (void*)x, stri__handler_null, NULL)); if (Rf_isNull(x)) { UNPROTECT(1); return x; } } #else PROTECT(x = stri__call_as_character((void*)x)); #endif PROTECT(x = Rf_coerceVector(x, RAWSXP)); UNPROTECT(2); return x; } else if (Rf_isVectorList(x) || Rf_isObject(x)) { if (Rf_isVectorList(x) && !stri__check_list_of_scalars(x)) Rf_warning(MSG__WARN_LIST_COERCION); #if defined(R_VERSION) && R_VERSION >= R_Version(3, 5, 0) if (allow_error) return stri__call_as_raw((void*)x); else return R_tryCatchError(stri__call_as_raw, (void*)x, stri__handler_null, NULL); #else 
return stri__call_as_raw((void*)x); #endif } else if (TYPEOF(x) == RAWSXP) return x; // return as-is else if (Rf_isVectorAtomic(x) || Rf_isNull(x)) return Rf_coerceVector(x, RAWSXP); Rf_error(MSG__ARG_EXPECTED_RAW, argname); // allowed here return x; // avoid compiler warning } /** * POSIXt * * If the object cannot be coerced, then an error will be generated * * WARNING: this function is allowed to call the error() function. * Use before STRI__ERROR_HANDLER_BEGIN (with other prepareargs). * * * @param x a numeric vector with class POSIXct or something coercible to * @param argname argument name (message formatting) * @return numeric vector * * @version 0.5-1 (Marek Gagolewski, 2014-12-30) * @version 1.1.6 (Marek Gagolewski, 2020-02-17) bugfix #370 * @version 1.6.3 (Marek Gagolewski, 2021-05-26) call as.POSIXct more eagerly */ SEXP stri__prepare_arg_POSIXct(SEXP x, const char* argname) { int num_protect = 0; if ((SEXP*)argname == (SEXP*)R_NilValue) argname = ""; if (/*factors_as_strings &&*/ Rf_isFactor(x)) { PROTECT(x = stri__call_as_character((void*)x)); num_protect += 1; // will convert from character below } if (Rf_inherits(x, "POSIXlt") || Rf_inherits(x, "Date") || (TYPEOF(x) == STRSXP)) { PROTECT(x = stri__call_as_POSIXct((void*)x)); num_protect += 1; } if (!Rf_inherits(x, "POSIXct")) { Rf_error(MSG__ARG_EXPECTED_POSIXct, argname); } SEXP attrib_class, attrib_tzone, robj_class, robj_tzone; PROTECT(robj_class = Rf_ScalarString(Rf_mkChar("class"))); PROTECT(robj_tzone = Rf_ScalarString(Rf_mkChar("tzone"))); PROTECT(attrib_class = Rf_getAttrib(x, robj_class)); PROTECT(attrib_tzone = Rf_getAttrib(x, robj_tzone)); PROTECT(x = stri__prepare_arg_double(x, argname)); Rf_setAttrib(x, robj_class, attrib_class); Rf_setAttrib(x, robj_tzone, attrib_tzone); UNPROTECT(num_protect+5); return x; } /** Prepare string argument - one string * * If there are 0 elements -> error * If there are >1 elements -> warning * * WARNING: this function is allowed to call the error() 
function. * Use before STRI__ERROR_HANDLER_BEGIN (with other prepareargs). * * * @param x R object to be checked/coerced * @param argname argument name (message formatting) * @return always an R character vector with >=1 element * * @version 0.1-?? (Marek Gagolewski) * * @version 0.3-1 (Marek Gagolewski, 2014-11-05) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc * * @version 1.2.1 (Marek Gagolewski, 2018-04-21) * #285: warn if coercing from a non-trivial list * refactor: use stri__prepare_arg_xxx (again, as in pre-64651ed-commits) */ SEXP stri__prepare_arg_string_1(SEXP x, const char* argname) { if ((SEXP*)argname == (SEXP*)R_NilValue) argname = ""; PROTECT(x = stri__prepare_arg_string(x, argname)); int nprotect = 1; // if ((SEXP*)argname == (SEXP*)R_NilValue) // argname = ""; // int nprotect = 0; // if (Rf_isFactor(x)) // { // SEXP call; // PROTECT(call = Rf_lang2(Rf_install("as.character"), x)); // PROTECT(x = Rf_eval(call, R_GlobalEnv)); // this will mark its encoding manually // nprotect = 2; // } // else if (Rf_isVectorList(x) || Rf_isObject(x)) // { // if (Rf_isVectorList(x)) { // R_len_t nv = LENGTH(x); // for (R_len_t i=0; i 1) { Rf_warning(MSG__ARG_EXPECTED_1_STRING, argname); SEXP xold = x; PROTECT(x = Rf_allocVector(STRSXP, 1)); nprotect++; SET_STRING_ELT(x, 0, STRING_ELT(xold, 0)); UNPROTECT(nprotect); return x; } else { // if (nx == 1) UNPROTECT(nprotect); return x; } } /** Prepare double argument - one value * * If there are 0 elements -> error * If there are >1 elements -> warning * * WARNING: this function is allowed to call the error() function. * Use before STRI__ERROR_HANDLER_BEGIN (with other prepareargs). * * * @param x R object to be checked/coerced * @param argname argument name (message formatting) * @return always an R double vector with >=1 element * * @version 0.1-?? 
(Marek Gagolewski) * * @version 0.3-1 (Marek Gagolewski, 2014-11-05) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc * * @version 1.2.1 (Marek Gagolewski, 2018-04-21) * #285: warn if coercing from a non-trivial list * * @version 1.6.3 (Marek Gagolewski, 2021-05-19) * factors_as_strings * refactor: use stri__prepare_arg_xxx (again, as in pre-64651ed-commits) */ SEXP stri__prepare_arg_double_1(SEXP x, const char* argname, bool factors_as_strings) { if ((SEXP*)argname == (SEXP*)R_NilValue) argname = ""; PROTECT(x = stri__prepare_arg_double(x, argname, factors_as_strings)); int nprotect = 1; // if ((SEXP*)argname == (SEXP*)R_NilValue) // argname = ""; // // int nprotect = 0; // if (factors_as_strings && Rf_isFactor(x)) // { // SEXP call; // PROTECT(call = Rf_lang2(Rf_install("as.character"), x)); // PROTECT(x = Rf_eval(call, R_GlobalEnv)); // this will mark its encoding manually // PROTECT(x = Rf_coerceVector(x, REALSXP)); // nprotect = 3; // } // else if (Rf_isVectorList(x) || Rf_isObject(x)) // { // if (Rf_isVectorList(x)) { // R_len_t nv = LENGTH(x); // for (R_len_t i=0; i 1) { Rf_warning(MSG__ARG_EXPECTED_1_NUMERIC, argname); double x0 = REAL(x)[0]; PROTECT(x = Rf_allocVector(REALSXP, 1)); nprotect++; REAL(x)[0] = x0; UNPROTECT(nprotect); return x; } else {// if (nx == 1) UNPROTECT(nprotect); return x; } } /** Prepare integer argument - one value * * If there are 0 elements -> error * If there are >1 elements -> warning * * WARNING: this function is allowed to call the error() function. * Use before STRI__ERROR_HANDLER_BEGIN (with other prepareargs). * * * @param x R object to be checked/coerced * @param argname argument name (message formatting) * @return always an R integer vector with >=1 element * * @version 0.1-?? 
(Marek Gagolewski) * * @version 0.3-1 (Marek Gagolewski, 2014-11-05) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc * * @version 1.2.1 (Marek Gagolewski, 2018-04-21) * #285: warn if coercing from a non-trivial list * * @version 1.6.3 (Marek Gagolewski, 2021-05-19) * factors_as_strings * refactor: use stri__prepare_arg_xxx (again, as in pre-64651ed-commits) */ SEXP stri__prepare_arg_integer_1(SEXP x, const char* argname, bool factors_as_strings) { if ((SEXP*)argname == (SEXP*)R_NilValue) argname = ""; PROTECT(x = stri__prepare_arg_integer(x, argname, factors_as_strings)); int nprotect = 1; // if ((SEXP*)argname == (SEXP*)R_NilValue) // argname = ""; // // int nprotect = 0; // if (factors_as_strings && Rf_isFactor(x)) // factors must be checked first (as they are currently represented as integer vectors) // { // SEXP call; // PROTECT(call = Rf_lang2(Rf_install("as.character"), x)); // PROTECT(x = Rf_eval(call, R_GlobalEnv)); // this will mark its encoding manually // PROTECT(x = Rf_coerceVector(x, INTSXP)); // nprotect = 3; // } // else if (Rf_isVectorList(x) || Rf_isObject(x)) // { // if (Rf_isVectorList(x)) { // R_len_t nv = LENGTH(x); // for (R_len_t i=0; i 1) { Rf_warning(MSG__ARG_EXPECTED_1_INTEGER, argname); int x0 = INTEGER(x)[0]; PROTECT(x = Rf_allocVector(INTSXP, 1)); nprotect++; INTEGER(x)[0] = x0; UNPROTECT(nprotect); return x; } else { // if (nx == 1) UNPROTECT(nprotect); return x; } } /** Prepare logical argument - one value * * If there are 0 elements -> error * If there are >1 elements -> warning * * WARNING: this function is allowed to call the error() function. * Use before STRI__ERROR_HANDLER_BEGIN (with other prepareargs). * * * @param x R object to be checked/coerced * @param argname argument name (message formatting) * @return always an R logical vector with >=1 element * * @version 0.1-?? 
(Marek Gagolewski) * * @version 0.3-1 (Marek Gagolewski, 2014-11-05) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc * * @version 1.2.1 (Marek Gagolewski, 2018-04-21) * #285: warn if coercing from a non-trivial list * * @version 1.6.2 (Marek Gagolewski, 2021-05-19) * call as.logical on factors (not as.character+coerce to LGLSXP) * * @version 1.6.3 (Marek Gagolewski, 2021-05-20) * refactor: use stri__prepare_arg_xxx (again, as in pre-64651ed-commits) */ SEXP stri__prepare_arg_logical_1(SEXP x, const char* argname) { if ((SEXP*)argname == (SEXP*)R_NilValue) argname = ""; PROTECT(x = stri__prepare_arg_logical(x, argname)); int nprotect = 1; // int nprotect = 0; // // if ((SEXP*)argname == (SEXP*)R_NilValue) // argname = ""; // // if (Rf_isFactor(x)) // { // SEXP call; // // PROTECT(call = Rf_lang2(Rf_install("as.character"), x)); // // PROTECT(x = Rf_eval(call, R_GlobalEnv)); // this will mark its encoding manually // // PROTECT(x = Rf_coerceVector(x, LGLSXP)); // // nprotect = 3; // PROTECT(call = Rf_lang2(Rf_install("as.logical"), x)); // PROTECT(x = Rf_eval(call, R_GlobalEnv)); // this will mark its encoding manually // nprotect = 2; // } // else if (Rf_isVectorList(x) || Rf_isObject(x)) // { // if (Rf_isVectorList(x)) { // R_len_t nv = LENGTH(x); // for (R_len_t i=0; i 1) { Rf_warning(MSG__ARG_EXPECTED_1_LOGICAL, argname); int x0 = LOGICAL(x)[0]; PROTECT(x = Rf_allocVector(LGLSXP, 1)); nprotect++; LOGICAL(x)[0] = x0; UNPROTECT(nprotect); return x; } else { // if (nx == 1) UNPROTECT(nprotect); return x; } } /** Prepare logical argument - one value, not NA * * If there are 0 elements -> error * If there are >1 elements -> warning * * WARNING: this function is allowed to call the error() function. * Use before STRI__ERROR_HANDLER_BEGIN (with other prepareargs). * * * @param x R object to be checked/coerced * @param argname argument name (message formatting) * @return a boolean value * * @version 0.1-?? 
(Marek Gagolewski) * * @version 0.3-1 (Marek Gagolewski, 2014-11-05) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc */ bool stri__prepare_arg_logical_1_notNA(SEXP x, const char* argname) { PROTECT(x = stri__prepare_arg_logical_1(x, argname)); int xval = LOGICAL(x)[0]; UNPROTECT(1); if (xval == NA_LOGICAL) Rf_error(MSG__ARG_EXPECTED_NOT_NA, argname); // allowed here return (bool)xval; } /** Prepare logical argument - one value, can be NA * * If there are 0 elements -> error * If there are >1 elements -> warning * * WARNING: this function is allowed to call the error() function. * Use before STRI__ERROR_HANDLER_BEGIN (with other prepareargs). * * * @param x R object to be checked/coerced * @param argname argument name (message formatting) * @return int value, test (xval == NA_LOGICAL) * * @version 1.6.2 (Marek Gagolewski, 2021-05-10) */ int stri__prepare_arg_logical_1_NA(SEXP x, const char* argname) { PROTECT(x = stri__prepare_arg_logical_1(x, argname)); int xval = LOGICAL(x)[0]; UNPROTECT(1); return xval; } /** Prepare integer argument - one value, not NA * * If there are 0 elements -> error * If there are >1 elements -> warning * * WARNING: this function is allowed to call the error() function. * Use before STRI__ERROR_HANDLER_BEGIN (with other prepareargs). * * * @param x R object to be checked/coerced * @param argname argument name (message formatting) * @return an integer value * * @version 0.1-?? 
(Marek Gagolewski) * * @version 0.3-1 (Marek Gagolewski, 2014-11-05) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc */ int stri__prepare_arg_integer_1_notNA(SEXP x, const char* argname) { PROTECT(x = stri__prepare_arg_integer_1(x, argname)); int xval = INTEGER(x)[0]; UNPROTECT(1); if (xval == NA_INTEGER) Rf_error(MSG__ARG_EXPECTED_NOT_NA, argname); // allowed here return (int)xval; } /** Prepare integer argument - one value, can be NA * * If there are 0 elements -> error * If there are >1 elements -> warning * * WARNING: this function is allowed to call the error() function. * Use before STRI__ERROR_HANDLER_BEGIN (with other prepareargs). * * * @param x R object to be checked/coerced * @param argname argument name (message formatting) * @return an integer value, test (xval == NA_INTEGER) * * @version 1.6.2 (Marek Gagolewski, 2021-05-10) */ int stri__prepare_arg_integer_1_NA(SEXP x, const char* argname) { PROTECT(x = stri__prepare_arg_integer_1(x, argname)); int xval = INTEGER(x)[0]; UNPROTECT(1); return (int)xval; } /** Prepare double argument - one value, not NA * * If there are 0 elements -> error * If there are >1 elements -> warning * * WARNING: this function is allowed to call the error() function. * Use before STRI__ERROR_HANDLER_BEGIN (with other prepareargs). 
* * * @param x R object to be checked/coerced * @param argname argument name (message formatting) * @return a double value * * @version 0.2-2 (Marek Gagolewski, 2014-04-26) * * @version 0.3-1 (Marek Gagolewski, 2014-11-05) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc */ double stri__prepare_arg_double_1_notNA(SEXP x, const char* argname) { PROTECT(x = stri__prepare_arg_double_1(x, argname)); double xval = REAL(x)[0]; UNPROTECT(1); if (ISNA(xval)) Rf_error(MSG__ARG_EXPECTED_NOT_NA, argname); // allowed here return (double)xval; } /** Prepare double argument - one value, can be NA * * If there are 0 elements -> error * If there are >1 elements -> warning * * WARNING: this function is allowed to call the error() function. * Use before STRI__ERROR_HANDLER_BEGIN (with other prepareargs). * * * @param x R object to be checked/coerced * @param argname argument name (message formatting) * @return a double value, test (ISNA(xval)) * * @version 1.6.2 (Marek Gagolewski, 2021-05-10) */ double stri__prepare_arg_double_1_NA(SEXP x, const char* argname) { PROTECT(x = stri__prepare_arg_double_1(x, argname)); double xval = REAL(x)[0]; UNPROTECT(1); return (double)xval; } /** * This is a helper function to avoid UNPROTECTED var names warning * when playing with CHARSXP directly * * @param x an R STRING object (from STRING_ELT(charactervector, index)) * @return an Ralloc'd character string * * @version 1.1.6 (Marek Gagolewski, 2017-11-10) */ const char* stri__copy_string_Ralloc(SEXP x, const char* argname) { PROTECT(x); if (x == NA_STRING) { UNPROTECT(1); Rf_error(MSG__ARG_EXPECTED_NOT_NA, argname); // allowed here } const char* ret_tmp = (const char*)CHAR(x); // ret may be gc'ed size_t ret_n = strlen(ret_tmp); /* R_alloc == Here R will reclaim the memory at the end of the call to .Call */ char* ret = R_alloc(ret_n+1, (int)sizeof(char)); STRI_ASSERT(ret); if (!ret) { UNPROTECT(1); Rf_error(MSG__MEM_ALLOC_ERROR); } memcpy(ret, ret_tmp, ret_n+1); UNPROTECT(1); 
return ret; } /** Prepare string argument - one value, can be NA [no re-encoding done!!!] * * If there are 0 elements -> error * If there are >1 elements -> warning * * WARNING: this function is allowed to call the error() function. * Use before STRI__ERROR_HANDLER_BEGIN (with other prepareargs). * * * @param x R object to be checked/coerced * @param argname argument name (message formatting) * @return a character string or NULL * * @version 1.6.3 (Marek Gagolewski, 2021-05-21) */ const char* stri__prepare_arg_string_1_NA(SEXP x, const char* argname) { PROTECT(x = stri__prepare_arg_string_1(x, argname)); if (STRING_ELT(x, 0) == NA_STRING) { UNPROTECT(1); return nullptr; } const char* ret_tmp = (const char*)CHAR(STRING_ELT(x, 0)); // ret may be gc'ed size_t ret_n = strlen(ret_tmp); /* R_alloc == Here R will reclaim the memory at the end of the call to .Call */ char* ret = R_alloc(ret_n+1, (int)sizeof(char)); STRI_ASSERT(ret); if (!ret) { UNPROTECT(1); Rf_error(MSG__MEM_ALLOC_ERROR); } memcpy(ret, ret_tmp, ret_n+1); UNPROTECT(1); return ret; } /** Prepare string argument - one value, not NA [no re-encoding done!!!] * * If there are 0 elements -> error * If there are >1 elements -> warning * * WARNING: this function is allowed to call the error() function. * Use before STRI__ERROR_HANDLER_BEGIN (with other prepareargs). * * * @param x R object to be checked/coerced * @param argname argument name (message formatting) * @return a character string * * @version 0.5-1 (Marek Gagolewski, 2014-12-25) */ const char* stri__prepare_arg_string_1_notNA(SEXP x, const char* argname) { const char* ret = stri__prepare_arg_string_1_NA(x, argname); if (ret == nullptr) Rf_error(MSG__ARG_EXPECTED_NOT_NA, argname); // allowed here return ret; } /** * Check if we are dealing with the 'C' locale (it should be resolved to * en_US_POSIX) * * "C", "c", "C.UTF-8", "c.UTF-8", "C.any_other_encoding", etc. 
* * @version 1.8.1 (Marek Gagolewski, 2023-11-09) * * @param str string * @return bool */ bool stri__is_C_locale(const char* str) { return str && ((str[0] == 'C' || str[0] == 'c') && (str[1] == '\0' || str[1] == '.')); } /** * Prepare character vector argument that will be used to choose a locale * * If the \code{loc} argument is incorrect, the an error is generated. * If something goes wrong, a warning is given. * * WARNING: this function is allowed to call the error() function. * Use before STRI__ERROR_HANDLER_BEGIN (with other prepareargs). * * * @param loc generally, a single character string * @param allowdefault do we allow \code{R_NilValue} or a single empty string * to work as a default locale selector? * @param allowna do we allow \code{NA} in \code{loc}? * This will return \code{NULL} as result [DEPRECATED, only used in stri_enc_detect2] * @param argname argument name (message formatting) * @return string a \code{C} string with extracted locale name; * can be NULL if allownull is TRUE * * * @version 0.1-?? (Marek Gagolewski) * * @version 0.1-?? 
(Marek Gagolewski) * argname added * * @version 0.3-1 (Marek Gagolewski, 2014-11-05) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc * * @version 0.3-1 (Marek Gagolewski, 2014-11-06) * Use R_alloc for the string returned * * @version 0.5-1 (Marek Gagolewski, 2015-01-01) * "@keyword=value" may use default locale from now; also, loc is trimmed * * @version 1.5.4 (Marek Gagolewski, 2021-04-07) * BUGFIX: locale='' is the default * * @version 1.8.1 (Marek Gagolewski, 2023-11-07) * C is an alias of en_US_POSIX; allowna argument dropped, * new argument: allownull */ const char* stri__prepare_arg_locale( SEXP loc, const char* argname, bool allowdefault, bool allownull ) { const char* default_locale = (allownull)?NULL:uloc_getDefault(); if (default_locale && stri__is_C_locale(default_locale)) default_locale = "en_US_POSIX"; if (Rf_isNull(loc)) { if (allowdefault) return default_locale; else Rf_error(MSG__ARG_EXPECTED_NOT_NULL, argname); // Rf_error allowed here } PROTECT(loc = stri__prepare_arg_string_1(loc, argname)); if (STRING_ELT(loc, 0) == NA_STRING) { UNPROTECT(1); Rf_error(MSG__ARG_EXPECTED_NOT_NA, argname); // Rf_error allowed here } const char* qloc = CHAR(STRING_ELT(loc, 0)); if (qloc[0] == '\0') { // empty string UNPROTECT(1); if (allowdefault) return default_locale; else Rf_error(MSG__LOCALE_INCORRECT_ID); // Rf_error allowed here } UErrorCode err = U_ZERO_ERROR; char buf[ULOC_FULLNAME_CAPACITY]; uloc_canonicalize(qloc, buf, ULOC_FULLNAME_CAPACITY, &err); UNPROTECT(1); // qloc, loc no longer used STRI__CHECKICUSTATUS_RFERROR(err, {;}) R_len_t ret_n = strlen(buf); char* ret = R_alloc(ret_n+1, (int)sizeof(char)); memcpy(ret, buf, ret_n+1); // right-trim while (ret_n > 0 && (ret[ret_n-1] == ' ' || ret[ret_n-1] == '\t' || ret[ret_n-1] == '\n' || ret[ret_n-1] == '\r')) ret[--ret_n] = '\0'; // left-trim while (ret[0] == ' ' || ret[0] == '\t' || ret[0] == '\n' || ret[0] == '\r') { ++ret; --ret_n; } if (ret_n == 0) { if (allowdefault) return 
default_locale; else Rf_error(MSG__LOCALE_INCORRECT_ID); // Rf_error allowed here } if (stri__is_C_locale(ret)) return "en_US_POSIX"; if (ret[0] == ULOC_KEYWORD_SEPARATOR) { // length is > 0 // no locale specifier, just keywords if (!allowdefault) { Rf_error(MSG__LOCALE_INCORRECT_ID); } const char* ret_default; if (default_locale) ret_default = default_locale; else { ret_default = uloc_getDefault(); if (stri__is_C_locale(ret_default)) ret_default = "en_US_POSIX"; } R_len_t ret_detault_n = strlen(ret_default); const char* ret_tmp2 = ret; ret = R_alloc(ret_detault_n+ret_n+1, (int)sizeof(char)); memcpy(ret, ret_default, ret_detault_n); memcpy(ret+ret_detault_n, ret_tmp2, ret_n+1); } return ret; } /** * Prepare character vector argument that will be used to choose a time zone * * If the \code{tz} argument is incorrect, then an error is generated. * If something goes wrong, a warning is given. * * WARNING: this function is allowed to call the error() function. * Use before STRI__ERROR_HANDLER_BEGIN (with other prepareargs). 
* * * @param tz generally, a single character string or NULL * @param defaulttz default time zone to be used here * @return TimeZone object - owned by the caller * * * @version 0.5-1 (Marek Gagolewski, 2014-12-24) */ TimeZone* stri__prepare_arg_timezone(SEXP tz, const char* argname, bool allowdefault) { UnicodeString tz_val(""); if (!Rf_isNull(tz)) { PROTECT(tz = stri__prepare_arg_string_1(tz, argname)); if (STRING_ELT(tz, 0) == NA_STRING) { UNPROTECT(1); Rf_error(MSG__ARG_EXPECTED_NOT_NA, argname); // Rf_error allowed here } tz_val.setTo(UnicodeString((const char*)CHAR(STRING_ELT(tz, 0)))); UNPROTECT(1); } // if (tz_val.length() == 0 && !Rf_isNull(defaulttz)) { // PROTECT(defaulttz = stri__prepare_arg_string_1(defaulttz, argname)); // if (STRING_ELT(defaulttz, 0) == NA_STRING) { // UNPROTECT(1); // Rf_error(MSG__ARG_EXPECTED_NOT_NA, argname); // Rf_error allowed here // } // tz_val.setTo(UnicodeString((const char*)CHAR(STRING_ELT(defaulttz, 0)))); // UNPROTECT(1); // } if (tz_val.length() == 0) { if (!allowdefault) Rf_error(MSG__TIMEZONE_INCORRECT_ID); return TimeZone::createDefault(); } else { TimeZone* ret = TimeZone::createTimeZone(tz_val); if (*ret == TimeZone::getUnknown()) { delete ret; Rf_error(MSG__TIMEZONE_INCORRECT_ID); // allowed here } else return ret; } // won't arrive here anyway return NULL; // avoid compiler warning } /** * Prepare character vector argument that will be used to choose a character encoding * * If the \code{enc} argument is incorrect, the an error is generated. * If something goes wrong, a warning is given. * * WARNING: this function is allowed to call the error() function. * Use before STRI__ERROR_HANDLER_BEGIN (with other prepareargs). * * * @param enc generally, a single character string * @param allowdefault do we allow \code{R_NilValue} or a single empty string * to work as a default charset selector? 
(defaults \code{true}) * @param argname argument name (message formatting) * @return a \code{C} string with extracted locale name * (NULL for default charset so that it can be passed to ICU's \code{ucnv_open()}) * Do not delete. * * @version 0.1-?? (Marek Gagolewski) * * @version 0.1-?? (Marek Gagolewski) * argname added * * @version 0.3-1 (Marek Gagolewski, 2014-11-05) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc * * @version 0.3-1 (Marek Gagolewski, 2014-11-06) * Use R_alloc for the string returned */ const char* stri__prepare_arg_enc(SEXP enc, const char* argname, bool allowdefault) { if (allowdefault && Rf_isNull(enc)) return (const char*)NULL; else { PROTECT(enc = stri__prepare_arg_string_1(enc, argname)); if (STRING_ELT(enc, 0) == NA_STRING) { UNPROTECT(1); Rf_error(MSG__ARG_EXPECTED_NOT_NA, argname); // allowed here } if (LENGTH(STRING_ELT(enc, 0)) == 0) { UNPROTECT(1); if (allowdefault) return (const char*)NULL; else Rf_error(MSG__ENC_INCORRECT_ID); // allowed here } else { const char* ret_tmp = (const char*)CHAR(STRING_ELT(enc, 0)); // ret may be gc'ed size_t ret_n = strlen(ret_tmp); /* R_alloc == Here R will reclaim the memory at the end of the call to .Call */ char* ret = R_alloc(ret_n+1, (int)sizeof(char)); STRI_ASSERT(ret); if (!ret) { UNPROTECT(1); Rf_error(MSG__MEM_ALLOC_ERROR); } memcpy(ret, ret_tmp, ret_n+1); UNPROTECT(1); return ret; } } // won't come here anyway return NULL; // avoid compiler warning } /* Wrapper for stri__prepare_arg_*, mainly for testing purposes * * Can call error() * * @param x R object * @param argname single string * @return R object of desired type * * @version 1.6.3 (Marek Gagolewski, 2021-05-21) */ SEXP stri_prepare_arg_string_1(SEXP x, SEXP argname) { const char* argname_s = stri__prepare_arg_string_1_notNA(argname, "argname"); return stri__prepare_arg_string_1(x, argname_s); } /* Wrapper for stri__prepare_arg_*, mainly for testing purposes * * Can call error() * * @param x R object * @param argname 
single string * @return R object of desired type * * @version 1.6.3 (Marek Gagolewski, 2021-05-21) */ SEXP stri_prepare_arg_double_1(SEXP x, SEXP argname) // TODO: factors_as_strings { const char* argname_s = stri__prepare_arg_string_1_notNA(argname, "argname"); return stri__prepare_arg_double_1(x, argname_s); } /* Wrapper for stri__prepare_arg_*, mainly for testing purposes * * Can call error() * * @param x R object * @param argname single string * @return R object of desired type * * @version 1.6.3 (Marek Gagolewski, 2021-05-21) */ SEXP stri_prepare_arg_integer_1(SEXP x, SEXP argname) // TODO: factors_as_strings { const char* argname_s = stri__prepare_arg_string_1_notNA(argname, "argname"); return stri__prepare_arg_integer_1(x, argname_s); } /* Wrapper for stri__prepare_arg_*, mainly for testing purposes * * Can call error() * * @param x R object * @param argname single string * @return R object of desired type * * @version 1.6.3 (Marek Gagolewski, 2021-05-21) */ SEXP stri_prepare_arg_logical_1(SEXP x, SEXP argname) { const char* argname_s = stri__prepare_arg_string_1_notNA(argname, "argname"); return stri__prepare_arg_logical_1(x, argname_s); } /* Wrapper for stri__prepare_arg_*, mainly for testing purposes * * Can call error() * * @param x R object * @param argname single string * @return R object of desired type * * @version 1.6.3 (Marek Gagolewski, 2021-05-21) */ SEXP stri_prepare_arg_string(SEXP x, SEXP argname) { const char* argname_s = stri__prepare_arg_string_1_notNA(argname, "argname"); return stri__prepare_arg_string(x, argname_s); } /* Wrapper for stri__prepare_arg_*, mainly for testing purposes * * Can call error() * * @param x R object * @param argname single string * @return R object of desired type * * @version 1.6.3 (Marek Gagolewski, 2021-05-21) */ SEXP stri_prepare_arg_double(SEXP x, SEXP argname) // TODO: factors_as_strings { const char* argname_s = stri__prepare_arg_string_1_notNA(argname, "argname"); return stri__prepare_arg_double(x, 
argname_s); } /* Wrapper for stri__prepare_arg_*, mainly for testing purposes * * Can call error() * * @param x R object * @param argname single string * @return R object of desired type * * @version 1.6.3 (Marek Gagolewski, 2021-05-21) */ SEXP stri_prepare_arg_integer(SEXP x, SEXP argname) // TODO: factors_as_strings { const char* argname_s = stri__prepare_arg_string_1_notNA(argname, "argname"); return stri__prepare_arg_integer(x, argname_s); } /* Wrapper for stri__prepare_arg_*, mainly for testing purposes * * Can call error() * * @param x R object * @param argname single string * @return R object of desired type * * @version 1.6.3 (Marek Gagolewski, 2021-05-21) */ SEXP stri_prepare_arg_logical(SEXP x, SEXP argname) { const char* argname_s = stri__prepare_arg_string_1_notNA(argname, "argname"); return stri__prepare_arg_logical(x, argname_s); } /* Wrapper for stri__prepare_arg_*, mainly for testing purposes * * Can call error() * * @param x R object * @param argname single string * @return R object of desired type * * @version 1.6.3 (Marek Gagolewski, 2021-05-21) */ SEXP stri_prepare_arg_raw(SEXP x, SEXP argname) // TODO: factors_as_strings { const char* argname_s = stri__prepare_arg_string_1_notNA(argname, "argname"); return stri__prepare_arg_raw(x, argname_s); } stringi/src/icu74/0000755000176200001440000000000014771247052013464 5ustar liggesusersstringi/src/icu74/i18n/0000755000176200001440000000000014771224007014236 5ustar liggesusersstringi/src/icu74/i18n/number_affixutils.cpp0000644000176200001440000003640614700200761020472 0ustar liggesusers// © 2017 and later: Unicode, Inc. and others. 
// License & terms of use: http://www.unicode.org/copyright.html #include "unicode/utypes.h" #if !UCONFIG_NO_FORMATTING #include "number_affixutils.h" #include "unicode/utf16.h" #include "unicode/uniset.h" using namespace icu; using namespace icu::number; using namespace icu::number::impl; TokenConsumer::~TokenConsumer() = default; SymbolProvider::~SymbolProvider() = default; int32_t AffixUtils::estimateLength(const UnicodeString &patternString, UErrorCode &status) { AffixPatternState state = STATE_BASE; int32_t offset = 0; int32_t length = 0; for (; offset < patternString.length();) { UChar32 cp = patternString.char32At(offset); switch (state) { case STATE_BASE: if (cp == u'\'') { // First quote state = STATE_FIRST_QUOTE; } else { // Unquoted symbol length++; } break; case STATE_FIRST_QUOTE: if (cp == u'\'') { // Repeated quote length++; state = STATE_BASE; } else { // Quoted code point length++; state = STATE_INSIDE_QUOTE; } break; case STATE_INSIDE_QUOTE: if (cp == u'\'') { // End of quoted sequence state = STATE_AFTER_QUOTE; } else { // Quoted code point length++; } break; case STATE_AFTER_QUOTE: if (cp == u'\'') { // Double quote inside of quoted sequence length++; state = STATE_INSIDE_QUOTE; } else { // Unquoted symbol length++; } break; default: UPRV_UNREACHABLE_EXIT; } offset += U16_LENGTH(cp); } switch (state) { case STATE_FIRST_QUOTE: case STATE_INSIDE_QUOTE: status = U_ILLEGAL_ARGUMENT_ERROR; break; default: break; } return length; } UnicodeString AffixUtils::escape(const UnicodeString &input) { AffixPatternState state = STATE_BASE; int32_t offset = 0; UnicodeString output; for (; offset < input.length();) { UChar32 cp = input.char32At(offset); switch (cp) { case u'\'': output.append(u"''", -1); break; case u'-': case u'+': case u'%': case u'‰': case u'¤': if (state == STATE_BASE) { output.append(u'\''); output.append(cp); state = STATE_INSIDE_QUOTE; } else { output.append(cp); } break; default: if (state == STATE_INSIDE_QUOTE) { output.append(u'\''); 
output.append(cp); state = STATE_BASE; } else { output.append(cp); } break; } offset += U16_LENGTH(cp); } if (state == STATE_INSIDE_QUOTE) { output.append(u'\''); } return output; } Field AffixUtils::getFieldForType(AffixPatternType type) { switch (type) { case TYPE_MINUS_SIGN: return {UFIELD_CATEGORY_NUMBER, UNUM_SIGN_FIELD}; case TYPE_PLUS_SIGN: return {UFIELD_CATEGORY_NUMBER, UNUM_SIGN_FIELD}; case TYPE_APPROXIMATELY_SIGN: return {UFIELD_CATEGORY_NUMBER, UNUM_APPROXIMATELY_SIGN_FIELD}; case TYPE_PERCENT: return {UFIELD_CATEGORY_NUMBER, UNUM_PERCENT_FIELD}; case TYPE_PERMILLE: return {UFIELD_CATEGORY_NUMBER, UNUM_PERMILL_FIELD}; case TYPE_CURRENCY_SINGLE: return {UFIELD_CATEGORY_NUMBER, UNUM_CURRENCY_FIELD}; case TYPE_CURRENCY_DOUBLE: return {UFIELD_CATEGORY_NUMBER, UNUM_CURRENCY_FIELD}; case TYPE_CURRENCY_TRIPLE: return {UFIELD_CATEGORY_NUMBER, UNUM_CURRENCY_FIELD}; case TYPE_CURRENCY_QUAD: return {UFIELD_CATEGORY_NUMBER, UNUM_CURRENCY_FIELD}; case TYPE_CURRENCY_QUINT: return {UFIELD_CATEGORY_NUMBER, UNUM_CURRENCY_FIELD}; case TYPE_CURRENCY_OVERFLOW: return {UFIELD_CATEGORY_NUMBER, UNUM_CURRENCY_FIELD}; default: UPRV_UNREACHABLE_EXIT; } } int32_t AffixUtils::unescape(const UnicodeString &affixPattern, FormattedStringBuilder &output, int32_t position, const SymbolProvider &provider, Field field, UErrorCode &status) { int32_t length = 0; AffixTag tag; while (hasNext(tag, affixPattern)) { tag = nextToken(tag, affixPattern, status); if (U_FAILURE(status)) { return length; } if (tag.type == TYPE_CURRENCY_OVERFLOW) { // Don't go to the provider for this special case length += output.insertCodePoint( position + length, 0xFFFD, {UFIELD_CATEGORY_NUMBER, UNUM_CURRENCY_FIELD}, status); } else if (tag.type < 0) { length += output.insert( position + length, provider.getSymbol(tag.type), getFieldForType(tag.type), status); } else { length += output.insertCodePoint(position + length, tag.codePoint, field, status); } } return length; } int32_t 
AffixUtils::unescapedCodePointCount(const UnicodeString &affixPattern, const SymbolProvider &provider, UErrorCode &status) { int32_t length = 0; AffixTag tag; while (hasNext(tag, affixPattern)) { tag = nextToken(tag, affixPattern, status); if (U_FAILURE(status)) { return length; } if (tag.type == TYPE_CURRENCY_OVERFLOW) { length += 1; } else if (tag.type < 0) { length += provider.getSymbol(tag.type).length(); } else { length += U16_LENGTH(tag.codePoint); } } return length; } bool AffixUtils::containsType(const UnicodeString &affixPattern, AffixPatternType type, UErrorCode &status) { if (affixPattern.length() == 0) { return false; } AffixTag tag; while (hasNext(tag, affixPattern)) { tag = nextToken(tag, affixPattern, status); if (U_FAILURE(status)) { return false; } if (tag.type == type) { return true; } } return false; } bool AffixUtils::hasCurrencySymbols(const UnicodeString &affixPattern, UErrorCode &status) { if (affixPattern.length() == 0) { return false; } AffixTag tag; while (hasNext(tag, affixPattern)) { tag = nextToken(tag, affixPattern, status); if (U_FAILURE(status)) { return false; } if (tag.type < 0 && getFieldForType(tag.type) == Field(UFIELD_CATEGORY_NUMBER, UNUM_CURRENCY_FIELD)) { return true; } } return false; } UnicodeString AffixUtils::replaceType(const UnicodeString &affixPattern, AffixPatternType type, char16_t replacementChar, UErrorCode &status) { UnicodeString output(affixPattern); // copy if (affixPattern.length() == 0) { return output; } AffixTag tag; while (hasNext(tag, affixPattern)) { tag = nextToken(tag, affixPattern, status); if (U_FAILURE(status)) { return output; } if (tag.type == type) { output.replace(tag.offset - 1, 1, replacementChar); } } return output; } bool AffixUtils::containsOnlySymbolsAndIgnorables(const UnicodeString& affixPattern, const UnicodeSet& ignorables, UErrorCode& status) { if (affixPattern.length() == 0) { return true; } AffixTag tag; while (hasNext(tag, affixPattern)) { tag = nextToken(tag, affixPattern, 
status); if (U_FAILURE(status)) { return false; } if (tag.type == TYPE_CODEPOINT && !ignorables.contains(tag.codePoint)) { return false; } } return true; } void AffixUtils::iterateWithConsumer(const UnicodeString& affixPattern, TokenConsumer& consumer, UErrorCode& status) { if (affixPattern.length() == 0) { return; } AffixTag tag; while (hasNext(tag, affixPattern)) { tag = nextToken(tag, affixPattern, status); if (U_FAILURE(status)) { return; } consumer.consumeToken(tag.type, tag.codePoint, status); if (U_FAILURE(status)) { return; } } } AffixTag AffixUtils::nextToken(AffixTag tag, const UnicodeString &patternString, UErrorCode &status) { int32_t offset = tag.offset; int32_t state = tag.state; for (; offset < patternString.length();) { UChar32 cp = patternString.char32At(offset); int32_t count = U16_LENGTH(cp); switch (state) { case STATE_BASE: switch (cp) { case u'\'': state = STATE_FIRST_QUOTE; offset += count; // continue to the next code point break; case u'-': return makeTag(offset + count, TYPE_MINUS_SIGN, STATE_BASE, 0); case u'+': return makeTag(offset + count, TYPE_PLUS_SIGN, STATE_BASE, 0); case u'~': return makeTag(offset + count, TYPE_APPROXIMATELY_SIGN, STATE_BASE, 0); case u'%': return makeTag(offset + count, TYPE_PERCENT, STATE_BASE, 0); case u'‰': return makeTag(offset + count, TYPE_PERMILLE, STATE_BASE, 0); case u'¤': state = STATE_FIRST_CURR; offset += count; // continue to the next code point break; default: return makeTag(offset + count, TYPE_CODEPOINT, STATE_BASE, cp); } break; case STATE_FIRST_QUOTE: if (cp == u'\'') { return makeTag(offset + count, TYPE_CODEPOINT, STATE_BASE, cp); } else { return makeTag(offset + count, TYPE_CODEPOINT, STATE_INSIDE_QUOTE, cp); } case STATE_INSIDE_QUOTE: if (cp == u'\'') { state = STATE_AFTER_QUOTE; offset += count; // continue to the next code point break; } else { return makeTag(offset + count, TYPE_CODEPOINT, STATE_INSIDE_QUOTE, cp); } case STATE_AFTER_QUOTE: if (cp == u'\'') { return makeTag(offset + 
count, TYPE_CODEPOINT, STATE_INSIDE_QUOTE, cp); } else { state = STATE_BASE; // re-evaluate this code point break; } case STATE_FIRST_CURR: if (cp == u'¤') { state = STATE_SECOND_CURR; offset += count; // continue to the next code point break; } else { return makeTag(offset, TYPE_CURRENCY_SINGLE, STATE_BASE, 0); } case STATE_SECOND_CURR: if (cp == u'¤') { state = STATE_THIRD_CURR; offset += count; // continue to the next code point break; } else { return makeTag(offset, TYPE_CURRENCY_DOUBLE, STATE_BASE, 0); } case STATE_THIRD_CURR: if (cp == u'¤') { state = STATE_FOURTH_CURR; offset += count; // continue to the next code point break; } else { return makeTag(offset, TYPE_CURRENCY_TRIPLE, STATE_BASE, 0); } case STATE_FOURTH_CURR: if (cp == u'¤') { state = STATE_FIFTH_CURR; offset += count; // continue to the next code point break; } else { return makeTag(offset, TYPE_CURRENCY_QUAD, STATE_BASE, 0); } case STATE_FIFTH_CURR: if (cp == u'¤') { state = STATE_OVERFLOW_CURR; offset += count; // continue to the next code point break; } else { return makeTag(offset, TYPE_CURRENCY_QUINT, STATE_BASE, 0); } case STATE_OVERFLOW_CURR: if (cp == u'¤') { offset += count; // continue to the next code point and loop back to this state break; } else { return makeTag(offset, TYPE_CURRENCY_OVERFLOW, STATE_BASE, 0); } default: UPRV_UNREACHABLE_EXIT; } } // End of string switch (state) { case STATE_BASE: // No more tokens in string. return {-1}; case STATE_FIRST_QUOTE: case STATE_INSIDE_QUOTE: // For consistent behavior with the JDK and ICU 58, set an error here. status = U_ILLEGAL_ARGUMENT_ERROR; return {-1}; case STATE_AFTER_QUOTE: // No more tokens in string. 
return {-1}; case STATE_FIRST_CURR: return makeTag(offset, TYPE_CURRENCY_SINGLE, STATE_BASE, 0); case STATE_SECOND_CURR: return makeTag(offset, TYPE_CURRENCY_DOUBLE, STATE_BASE, 0); case STATE_THIRD_CURR: return makeTag(offset, TYPE_CURRENCY_TRIPLE, STATE_BASE, 0); case STATE_FOURTH_CURR: return makeTag(offset, TYPE_CURRENCY_QUAD, STATE_BASE, 0); case STATE_FIFTH_CURR: return makeTag(offset, TYPE_CURRENCY_QUINT, STATE_BASE, 0); case STATE_OVERFLOW_CURR: return makeTag(offset, TYPE_CURRENCY_OVERFLOW, STATE_BASE, 0); default: UPRV_UNREACHABLE_EXIT; } } bool AffixUtils::hasNext(const AffixTag &tag, const UnicodeString &string) { // First check for the {-1} and default initializer syntax. if (tag.offset < 0) { return false; } else if (tag.offset == 0) { return string.length() > 0; } // The rest of the fields are safe to use now. // Special case: the last character in string is an end quote. if (tag.state == STATE_INSIDE_QUOTE && tag.offset == string.length() - 1 && string.charAt(tag.offset) == u'\'') { return false; } else if (tag.state != STATE_BASE) { return true; } else { return tag.offset < string.length(); } } #endif /* #if !UCONFIG_NO_FORMATTING */ stringi/src/icu74/i18n/upluralrules.cpp0000644000176200001440000001357214700200761017502 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ***************************************************************************************** * Copyright (C) 2010-2012, International Business Machines * Corporation and others. All Rights Reserved. 
***************************************************************************************** */ #include "unicode/utypes.h" #if !UCONFIG_NO_FORMATTING #include "unicode/upluralrules.h" #include "unicode/plurrule.h" #include "unicode/locid.h" #include "unicode/unistr.h" #include "unicode/unum.h" #include "unicode/numfmt.h" #include "unicode/unumberformatter.h" #include "number_decimalquantity.h" #include "number_utypes.h" #include "numrange_impl.h" U_NAMESPACE_USE namespace { /** * Given a number and a format, returns the keyword of the first applicable * rule for the PluralRules object. * @param rules The plural rules. * @param obj The numeric object for which the rule should be determined. * @param fmt The NumberFormat specifying how the number will be formatted * (this can affect the plural form, e.g. "1 dollar" vs "1.0 dollars"). * @param status Input/output parameter. If at entry this indicates a * failure status, the method returns immediately; otherwise * this is set to indicate the outcome of the call. * @return The keyword of the selected rule. Undefined in the case of an error. 
*/ UnicodeString select(const PluralRules &rules, const Formattable& obj, const NumberFormat& fmt, UErrorCode& status) { if (U_SUCCESS(status)) { const DecimalFormat *decFmt = dynamic_cast(&fmt); if (decFmt != nullptr) { number::impl::DecimalQuantity dq; decFmt->formatToDecimalQuantity(obj, dq, status); if (U_SUCCESS(status)) { return rules.select(dq); } } else { double number = obj.getDouble(status); if (U_SUCCESS(status)) { return rules.select(number); } } } return UnicodeString(); } } // namespace U_CAPI UPluralRules* U_EXPORT2 uplrules_open(const char *locale, UErrorCode *status) { return uplrules_openForType(locale, UPLURAL_TYPE_CARDINAL, status); } U_CAPI UPluralRules* U_EXPORT2 uplrules_openForType(const char *locale, UPluralType type, UErrorCode *status) { return (UPluralRules*)PluralRules::forLocale(Locale(locale), type, *status); } U_CAPI void U_EXPORT2 uplrules_close(UPluralRules *uplrules) { delete (PluralRules*)uplrules; } U_CAPI int32_t U_EXPORT2 uplrules_select(const UPluralRules *uplrules, double number, char16_t *keyword, int32_t capacity, UErrorCode *status) { if (U_FAILURE(*status)) { return 0; } if (keyword == nullptr ? capacity != 0 : capacity < 0) { *status = U_ILLEGAL_ARGUMENT_ERROR; return 0; } UnicodeString result = ((PluralRules*)uplrules)->select(number); return result.extract(keyword, capacity, *status); } U_CAPI int32_t U_EXPORT2 uplrules_selectFormatted(const UPluralRules *uplrules, const UFormattedNumber* number, char16_t *keyword, int32_t capacity, UErrorCode *status) { if (U_FAILURE(*status)) { return 0; } if (keyword == nullptr ? 
capacity != 0 : capacity < 0) { *status = U_ILLEGAL_ARGUMENT_ERROR; return 0; } const number::impl::DecimalQuantity* dq = number::impl::validateUFormattedNumberToDecimalQuantity(number, *status); if (U_FAILURE(*status)) { return 0; } UnicodeString result = ((PluralRules*)uplrules)->select(*dq); return result.extract(keyword, capacity, *status); } U_CAPI int32_t U_EXPORT2 uplrules_selectForRange(const UPluralRules *uplrules, const UFormattedNumberRange* urange, char16_t *keyword, int32_t capacity, UErrorCode *status) { if (U_FAILURE(*status)) { return 0; } if (keyword == nullptr ? capacity != 0 : capacity < 0) { *status = U_ILLEGAL_ARGUMENT_ERROR; return 0; } const number::impl::UFormattedNumberRangeData* impl = number::impl::validateUFormattedNumberRange(urange, *status); UnicodeString result = ((PluralRules*)uplrules)->select(impl, *status); return result.extract(keyword, capacity, *status); } U_CAPI int32_t U_EXPORT2 uplrules_selectWithFormat(const UPluralRules *uplrules, double number, const UNumberFormat *fmt, char16_t *keyword, int32_t capacity, UErrorCode *status) { if (U_FAILURE(*status)) { return 0; } const PluralRules* plrules = reinterpret_cast(uplrules); const NumberFormat* nf = reinterpret_cast(fmt); if (plrules == nullptr || nf == nullptr || ((keyword == nullptr)? 
capacity != 0 : capacity < 0)) { *status = U_ILLEGAL_ARGUMENT_ERROR; return 0; } Formattable obj(number); UnicodeString result = select(*plrules, obj, *nf, *status); return result.extract(keyword, capacity, *status); } U_CAPI UEnumeration* U_EXPORT2 uplrules_getKeywords(const UPluralRules *uplrules, UErrorCode *status) { if (U_FAILURE(*status)) { return nullptr; } const PluralRules* plrules = reinterpret_cast(uplrules); if (plrules == nullptr) { *status = U_ILLEGAL_ARGUMENT_ERROR; return nullptr; } StringEnumeration *senum = plrules->getKeywords(*status); if (U_FAILURE(*status)) { return nullptr; } if (senum == nullptr) { *status = U_MEMORY_ALLOCATION_ERROR; return nullptr; } return uenum_openFromStringEnumeration(senum, status); } #endif /* #if !UCONFIG_NO_FORMATTING */ stringi/src/icu74/i18n/dtptngen_impl.h0000644000176200001440000002607614700200761017257 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ******************************************************************************* * Copyright (C) 2007-2016, International Business Machines Corporation and * others. All Rights Reserved. ******************************************************************************* * * File DTPTNGEN.H * ******************************************************************************* */ #ifndef __DTPTNGEN_IMPL_H__ #define __DTPTNGEN_IMPL_H__ #include "unicode/udatpg.h" #include "unicode/strenum.h" #include "unicode/unistr.h" #include "uvector.h" // TODO(claireho): Split off Builder class. // TODO(claireho): If splitting off Builder class: As subclass or independent? 
#define MAX_PATTERN_ENTRIES 52 #define MAX_CLDR_FIELD_LEN 60 #define MAX_DT_TOKEN 50 #define MAX_RESOURCE_FIELD 12 #define MAX_AVAILABLE_FORMATS 12 #define NONE 0 #define EXTRA_FIELD 0x10000 #define MISSING_FIELD 0x1000 #define MAX_STRING_ENUMERATION 200 #define SINGLE_QUOTE ((char16_t)0x0027) #define FORWARDSLASH ((char16_t)0x002F) #define BACKSLASH ((char16_t)0x005C) #define SPACE ((char16_t)0x0020) #define QUOTATION_MARK ((char16_t)0x0022) #define ASTERISK ((char16_t)0x002A) #define PLUSSITN ((char16_t)0x002B) #define COMMA ((char16_t)0x002C) #define HYPHEN ((char16_t)0x002D) #define DOT ((char16_t)0x002E) #define COLON ((char16_t)0x003A) #define CAP_A ((char16_t)0x0041) #define CAP_B ((char16_t)0x0042) #define CAP_C ((char16_t)0x0043) #define CAP_D ((char16_t)0x0044) #define CAP_E ((char16_t)0x0045) #define CAP_F ((char16_t)0x0046) #define CAP_G ((char16_t)0x0047) #define CAP_H ((char16_t)0x0048) #define CAP_J ((char16_t)0x004A) #define CAP_K ((char16_t)0x004B) #define CAP_L ((char16_t)0x004C) #define CAP_M ((char16_t)0x004D) #define CAP_O ((char16_t)0x004F) #define CAP_Q ((char16_t)0x0051) #define CAP_S ((char16_t)0x0053) #define CAP_T ((char16_t)0x0054) #define CAP_U ((char16_t)0x0055) #define CAP_V ((char16_t)0x0056) #define CAP_W ((char16_t)0x0057) #define CAP_X ((char16_t)0x0058) #define CAP_Y ((char16_t)0x0059) #define CAP_Z ((char16_t)0x005A) #define LOWLINE ((char16_t)0x005F) #define LOW_A ((char16_t)0x0061) #define LOW_B ((char16_t)0x0062) #define LOW_C ((char16_t)0x0063) #define LOW_D ((char16_t)0x0064) #define LOW_E ((char16_t)0x0065) #define LOW_F ((char16_t)0x0066) #define LOW_G ((char16_t)0x0067) #define LOW_H ((char16_t)0x0068) #define LOW_I ((char16_t)0x0069) #define LOW_J ((char16_t)0x006A) #define LOW_K ((char16_t)0x006B) #define LOW_L ((char16_t)0x006C) #define LOW_M ((char16_t)0x006D) #define LOW_N ((char16_t)0x006E) #define LOW_O ((char16_t)0x006F) #define LOW_P ((char16_t)0x0070) #define LOW_Q ((char16_t)0x0071) #define LOW_R 
((char16_t)0x0072) #define LOW_S ((char16_t)0x0073) #define LOW_T ((char16_t)0x0074) #define LOW_U ((char16_t)0x0075) #define LOW_V ((char16_t)0x0076) #define LOW_W ((char16_t)0x0077) #define LOW_X ((char16_t)0x0078) #define LOW_Y ((char16_t)0x0079) #define LOW_Z ((char16_t)0x007A) #define DT_NARROW -0x101 #define DT_SHORTER -0x102 #define DT_SHORT -0x103 #define DT_LONG -0x104 #define DT_NUMERIC 0x100 #define DT_DELTA 0x10 U_NAMESPACE_BEGIN const int32_t UDATPG_FRACTIONAL_MASK = 1< skeleton; UnicodeString pattern; UBool skeletonWasSpecified; // if specified in availableFormats, not derived LocalPointer next; PtnElem(const UnicodeString &basePattern, const UnicodeString &pattern); virtual ~PtnElem(); }; class FormatParser : public UMemory { public: UnicodeString items[MAX_DT_TOKEN]; int32_t itemNumber; FormatParser(); virtual ~FormatParser(); void set(const UnicodeString& patternString); void getQuoteLiteral(UnicodeString& quote, int32_t *itemIndex); UBool isPatternSeparator(const UnicodeString& field) const; static UBool isQuoteLiteral(const UnicodeString& s); static int32_t getCanonicalIndex(const UnicodeString& s) { return getCanonicalIndex(s, true); } static int32_t getCanonicalIndex(const UnicodeString& s, UBool strict); private: typedef enum TokenStatus { START, ADD_TOKEN, SYNTAX_ERROR, DONE } TokenStatus; TokenStatus status; virtual TokenStatus setTokens(const UnicodeString& pattern, int32_t startPos, int32_t *len); }; class DistanceInfo : public UMemory { public: int32_t missingFieldMask; int32_t extraFieldMask; DistanceInfo() {} virtual ~DistanceInfo(); void clear() { missingFieldMask = extraFieldMask = 0; } void setTo(const DistanceInfo& other); void addMissing(int32_t field) { missingFieldMask |= (1< matcher; PatternMap *patternMap; }; class DTSkeletonEnumeration : public StringEnumeration { public: DTSkeletonEnumeration(PatternMap& patternMap, dtStrEnum type, UErrorCode& status); virtual ~DTSkeletonEnumeration(); static UClassID U_EXPORT2 
getStaticClassID(); virtual UClassID getDynamicClassID() const override; virtual const UnicodeString* snext(UErrorCode& status) override; virtual void reset(UErrorCode& status) override; virtual int32_t count(UErrorCode& status) const override; private: int32_t pos; UBool isCanonicalItem(const UnicodeString& item); LocalPointer fSkeletons; }; class DTRedundantEnumeration : public StringEnumeration { public: DTRedundantEnumeration(); virtual ~DTRedundantEnumeration(); static UClassID U_EXPORT2 getStaticClassID(); virtual UClassID getDynamicClassID() const override; virtual const UnicodeString* snext(UErrorCode& status) override; virtual void reset(UErrorCode& status) override; virtual int32_t count(UErrorCode& status) const override; void add(const UnicodeString &pattern, UErrorCode& status); private: int32_t pos; UBool isCanonicalItem(const UnicodeString& item) const; LocalPointer fPatterns; }; U_NAMESPACE_END #endif stringi/src/icu74/i18n/quant.h0000644000176200001440000000766514700200761015546 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ********************************************************************** * Copyright (C) 2001-2011, International Business Machines Corporation * and others. All Rights Reserved. ********************************************************************** * Date Name Description * 07/26/01 aliu Creation. ********************************************************************** */ #ifndef QUANT_H #define QUANT_H #include "unicode/utypes.h" #if !UCONFIG_NO_TRANSLITERATION #include "unicode/unifunct.h" #include "unicode/unimatch.h" U_NAMESPACE_BEGIN class Quantifier : public UnicodeFunctor, public UnicodeMatcher { public: enum { MAX = 0x7FFFFFFF }; Quantifier(UnicodeFunctor *adoptedMatcher, uint32_t minCount, uint32_t maxCount); Quantifier(const Quantifier& o); virtual ~Quantifier(); /** * UnicodeFunctor API. 
Cast 'this' to a UnicodeMatcher* pointer * and return the pointer. * @return the UnicodeMatcher pointer. */ virtual UnicodeMatcher* toMatcher() const override; /** * Implement UnicodeFunctor * @return a copy of the object. */ virtual Quantifier* clone() const override; /** * Implement UnicodeMatcher * @param text the text to be matched * @param offset on input, the index into text at which to begin * matching. On output, the limit of the matched text. The * number of matched characters is the output value of offset * minus the input value. Offset should always point to the * HIGH SURROGATE (leading code unit) of a pair of surrogates, * both on entry and upon return. * @param limit the limit index of text to be matched. Greater * than offset for a forward direction match, less than offset for * a backward direction match. The last character to be * considered for matching will be text.charAt(limit-1) in the * forward direction or text.charAt(limit+1) in the backward * direction. * @param incremental if true, then assume further characters may * be inserted at limit and check for partial matching. Otherwise * assume the text as given is complete. * @return a match degree value indicating a full match, a partial * match, or a mismatch. If incremental is false then * U_PARTIAL_MATCH should never be returned. */ virtual UMatchDegree matches(const Replaceable& text, int32_t& offset, int32_t limit, UBool incremental) override; /** * Implement UnicodeMatcher * @param result Output param to receive the pattern. * @param escapeUnprintable if True then escape the unprintable characters. * @return A reference to 'result'. */ virtual UnicodeString& toPattern(UnicodeString& result, UBool escapeUnprintable = false) const override; /** * Implement UnicodeMatcher * @param v the given index value. * @return true if this rule matches the given index value. 
*/ virtual UBool matchesIndexValue(uint8_t v) const override; /** * Implement UnicodeMatcher */ virtual void addMatchSetTo(UnicodeSet& toUnionTo) const override; /** * UnicodeFunctor API */ virtual void setData(const TransliterationRuleData*) override; /** * ICU "poor man's RTTI", returns a UClassID for the actual class. */ virtual UClassID getDynamicClassID() const override; /** * ICU "poor man's RTTI", returns a UClassID for this class. */ static UClassID U_EXPORT2 getStaticClassID(); private: UnicodeFunctor* matcher; // owned uint32_t minCount; uint32_t maxCount; }; U_NAMESPACE_END #endif /* #if !UCONFIG_NO_TRANSLITERATION */ #endif stringi/src/icu74/i18n/number_modifiers.cpp0000644000176200001440000004375614700200761020303 0ustar liggesusers// © 2017 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html #include "unicode/utypes.h" #if !UCONFIG_NO_FORMATTING #include "umutex.h" #include "ucln_cmn.h" #include "ucln_in.h" #include "number_modifiers.h" using namespace icu; using namespace icu::number; using namespace icu::number::impl; namespace { // TODO: This is copied from simpleformatter.cpp const int32_t ARG_NUM_LIMIT = 0x100; // These are the default currency spacing UnicodeSets in CLDR. // Pre-compute them for performance. // The Java unit test testCurrencySpacingPatternStability() will start failing if these change in CLDR. 
icu::UInitOnce gDefaultCurrencySpacingInitOnce {}; UnicodeSet *UNISET_DIGIT = nullptr; UnicodeSet *UNISET_NOTSZ = nullptr; UBool U_CALLCONV cleanupDefaultCurrencySpacing() { delete UNISET_DIGIT; UNISET_DIGIT = nullptr; delete UNISET_NOTSZ; UNISET_NOTSZ = nullptr; gDefaultCurrencySpacingInitOnce.reset(); return true; } void U_CALLCONV initDefaultCurrencySpacing(UErrorCode &status) { ucln_i18n_registerCleanup(UCLN_I18N_CURRENCY_SPACING, cleanupDefaultCurrencySpacing); UNISET_DIGIT = new UnicodeSet(UnicodeString(u"[:digit:]"), status); UNISET_NOTSZ = new UnicodeSet(UnicodeString(u"[[:^S:]&[:^Z:]]"), status); if (UNISET_DIGIT == nullptr || UNISET_NOTSZ == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; return; } UNISET_DIGIT->freeze(); UNISET_NOTSZ->freeze(); } } // namespace Modifier::~Modifier() = default; Modifier::Parameters::Parameters() : obj(nullptr) {} Modifier::Parameters::Parameters( const ModifierStore* _obj, Signum _signum, StandardPlural::Form _plural) : obj(_obj), signum(_signum), plural(_plural) {} ModifierStore::~ModifierStore() = default; AdoptingSignumModifierStore::~AdoptingSignumModifierStore() { for (const Modifier *mod : mods) { delete mod; } } AdoptingSignumModifierStore& AdoptingSignumModifierStore::operator=(AdoptingSignumModifierStore&& other) noexcept { for (size_t i=0; imods[i] = other.mods[i]; other.mods[i] = nullptr; } return *this; } int32_t ConstantAffixModifier::apply(FormattedStringBuilder &output, int leftIndex, int rightIndex, UErrorCode &status) const { // Insert the suffix first since inserting the prefix will change the rightIndex int length = output.insert(rightIndex, fSuffix, fField, status); length += output.insert(leftIndex, fPrefix, fField, status); return length; } int32_t ConstantAffixModifier::getPrefixLength() const { return fPrefix.length(); } int32_t ConstantAffixModifier::getCodePointCount() const { return fPrefix.countChar32() + fSuffix.countChar32(); } bool ConstantAffixModifier::isStrong() const { return fStrong; } 
bool ConstantAffixModifier::containsField(Field field) const { (void)field; // This method is not currently used. UPRV_UNREACHABLE_EXIT; } void ConstantAffixModifier::getParameters(Parameters& output) const { (void)output; // This method is not currently used. UPRV_UNREACHABLE_EXIT; } bool ConstantAffixModifier::semanticallyEquivalent(const Modifier& other) const { auto* _other = dynamic_cast(&other); if (_other == nullptr) { return false; } return fPrefix == _other->fPrefix && fSuffix == _other->fSuffix && fField == _other->fField && fStrong == _other->fStrong; } SimpleModifier::SimpleModifier(const SimpleFormatter &simpleFormatter, Field field, bool strong) : SimpleModifier(simpleFormatter, field, strong, {}) {} SimpleModifier::SimpleModifier(const SimpleFormatter &simpleFormatter, Field field, bool strong, const Modifier::Parameters parameters) : fCompiledPattern(simpleFormatter.compiledPattern), fField(field), fStrong(strong), fParameters(parameters) { int32_t argLimit = SimpleFormatter::getArgumentLimit( fCompiledPattern.getBuffer(), fCompiledPattern.length()); if (argLimit == 0) { // No arguments in compiled pattern fPrefixLength = fCompiledPattern.charAt(1) - ARG_NUM_LIMIT; U_ASSERT(2 + fPrefixLength == fCompiledPattern.length()); // Set suffixOffset = -1 to indicate no arguments in compiled pattern. 
fSuffixOffset = -1; fSuffixLength = 0; } else { U_ASSERT(argLimit == 1); if (fCompiledPattern.charAt(1) != 0) { // Found prefix fPrefixLength = fCompiledPattern.charAt(1) - ARG_NUM_LIMIT; fSuffixOffset = 3 + fPrefixLength; } else { // No prefix fPrefixLength = 0; fSuffixOffset = 2; } if (3 + fPrefixLength < fCompiledPattern.length()) { // Found suffix fSuffixLength = fCompiledPattern.charAt(fSuffixOffset) - ARG_NUM_LIMIT; } else { // No suffix fSuffixLength = 0; } } } SimpleModifier::SimpleModifier() : fField(kUndefinedField), fStrong(false), fPrefixLength(0), fSuffixLength(0) { } int32_t SimpleModifier::apply(FormattedStringBuilder &output, int leftIndex, int rightIndex, UErrorCode &status) const { return formatAsPrefixSuffix(output, leftIndex, rightIndex, status); } int32_t SimpleModifier::getPrefixLength() const { return fPrefixLength; } int32_t SimpleModifier::getCodePointCount() const { int32_t count = 0; if (fPrefixLength > 0) { count += fCompiledPattern.countChar32(2, fPrefixLength); } if (fSuffixLength > 0) { count += fCompiledPattern.countChar32(1 + fSuffixOffset, fSuffixLength); } return count; } bool SimpleModifier::isStrong() const { return fStrong; } bool SimpleModifier::containsField(Field field) const { (void)field; // This method is not currently used. 
UPRV_UNREACHABLE_EXIT; } void SimpleModifier::getParameters(Parameters& output) const { output = fParameters; } bool SimpleModifier::semanticallyEquivalent(const Modifier& other) const { auto* _other = dynamic_cast(&other); if (_other == nullptr) { return false; } if (fParameters.obj != nullptr) { return fParameters.obj == _other->fParameters.obj; } return fCompiledPattern == _other->fCompiledPattern && fField == _other->fField && fStrong == _other->fStrong; } int32_t SimpleModifier::formatAsPrefixSuffix(FormattedStringBuilder &result, int32_t startIndex, int32_t endIndex, UErrorCode &status) const { if (fSuffixOffset == -1 && fPrefixLength + fSuffixLength > 0) { // There is no argument for the inner number; overwrite the entire segment with our string. return result.splice(startIndex, endIndex, fCompiledPattern, 2, 2 + fPrefixLength, fField, status); } else { if (fPrefixLength > 0) { result.insert(startIndex, fCompiledPattern, 2, 2 + fPrefixLength, fField, status); } if (fSuffixLength > 0) { result.insert( endIndex + fPrefixLength, fCompiledPattern, 1 + fSuffixOffset, 1 + fSuffixOffset + fSuffixLength, fField, status); } return fPrefixLength + fSuffixLength; } } int32_t SimpleModifier::formatTwoArgPattern(const SimpleFormatter& compiled, FormattedStringBuilder& result, int32_t index, int32_t* outPrefixLength, int32_t* outSuffixLength, Field field, UErrorCode& status) { const UnicodeString& compiledPattern = compiled.compiledPattern; int32_t argLimit = SimpleFormatter::getArgumentLimit( compiledPattern.getBuffer(), compiledPattern.length()); if (argLimit != 2) { status = U_INTERNAL_PROGRAM_ERROR; return 0; } int32_t offset = 1; // offset into compiledPattern int32_t length = 0; // chars added to result int32_t prefixLength = compiledPattern.charAt(offset); offset++; if (prefixLength < ARG_NUM_LIMIT) { // No prefix prefixLength = 0; } else { prefixLength -= ARG_NUM_LIMIT; result.insert(index + length, compiledPattern, offset, offset + prefixLength, field, status); 
offset += prefixLength; length += prefixLength; offset++; } int32_t infixLength = compiledPattern.charAt(offset); offset++; if (infixLength < ARG_NUM_LIMIT) { // No infix infixLength = 0; } else { infixLength -= ARG_NUM_LIMIT; result.insert(index + length, compiledPattern, offset, offset + infixLength, field, status); offset += infixLength; length += infixLength; offset++; } int32_t suffixLength; if (offset == compiledPattern.length()) { // No suffix suffixLength = 0; } else { suffixLength = compiledPattern.charAt(offset) - ARG_NUM_LIMIT; offset++; result.insert(index + length, compiledPattern, offset, offset + suffixLength, field, status); length += suffixLength; } *outPrefixLength = prefixLength; *outSuffixLength = suffixLength; return length; } int32_t ConstantMultiFieldModifier::apply(FormattedStringBuilder &output, int leftIndex, int rightIndex, UErrorCode &status) const { int32_t length = output.insert(leftIndex, fPrefix, status); if (fOverwrite) { length += output.splice( leftIndex + length, rightIndex + length, UnicodeString(), 0, 0, kUndefinedField, status); } length += output.insert(rightIndex + length, fSuffix, status); return length; } int32_t ConstantMultiFieldModifier::getPrefixLength() const { return fPrefix.length(); } int32_t ConstantMultiFieldModifier::getCodePointCount() const { return fPrefix.codePointCount() + fSuffix.codePointCount(); } bool ConstantMultiFieldModifier::isStrong() const { return fStrong; } bool ConstantMultiFieldModifier::containsField(Field field) const { return fPrefix.containsField(field) || fSuffix.containsField(field); } void ConstantMultiFieldModifier::getParameters(Parameters& output) const { output = fParameters; } bool ConstantMultiFieldModifier::semanticallyEquivalent(const Modifier& other) const { auto* _other = dynamic_cast(&other); if (_other == nullptr) { return false; } if (fParameters.obj != nullptr) { return fParameters.obj == _other->fParameters.obj; } return fPrefix.contentEquals(_other->fPrefix) && 
fSuffix.contentEquals(_other->fSuffix) && fOverwrite == _other->fOverwrite && fStrong == _other->fStrong; } CurrencySpacingEnabledModifier::CurrencySpacingEnabledModifier(const FormattedStringBuilder &prefix, const FormattedStringBuilder &suffix, bool overwrite, bool strong, const DecimalFormatSymbols &symbols, UErrorCode &status) : ConstantMultiFieldModifier(prefix, suffix, overwrite, strong) { // Check for currency spacing. Do not build the UnicodeSets unless there is // a currency code point at a boundary. if (prefix.length() > 0 && prefix.fieldAt(prefix.length() - 1) == Field(UFIELD_CATEGORY_NUMBER, UNUM_CURRENCY_FIELD)) { int prefixCp = prefix.getLastCodePoint(); UnicodeSet prefixUnicodeSet = getUnicodeSet(symbols, IN_CURRENCY, PREFIX, status); if (prefixUnicodeSet.contains(prefixCp)) { fAfterPrefixUnicodeSet = getUnicodeSet(symbols, IN_NUMBER, PREFIX, status); fAfterPrefixUnicodeSet.freeze(); fAfterPrefixInsert = getInsertString(symbols, PREFIX, status); } else { fAfterPrefixUnicodeSet.setToBogus(); fAfterPrefixInsert.setToBogus(); } } else { fAfterPrefixUnicodeSet.setToBogus(); fAfterPrefixInsert.setToBogus(); } if (suffix.length() > 0 && suffix.fieldAt(0) == Field(UFIELD_CATEGORY_NUMBER, UNUM_CURRENCY_FIELD)) { int suffixCp = suffix.getFirstCodePoint(); UnicodeSet suffixUnicodeSet = getUnicodeSet(symbols, IN_CURRENCY, SUFFIX, status); if (suffixUnicodeSet.contains(suffixCp)) { fBeforeSuffixUnicodeSet = getUnicodeSet(symbols, IN_NUMBER, SUFFIX, status); fBeforeSuffixUnicodeSet.freeze(); fBeforeSuffixInsert = getInsertString(symbols, SUFFIX, status); } else { fBeforeSuffixUnicodeSet.setToBogus(); fBeforeSuffixInsert.setToBogus(); } } else { fBeforeSuffixUnicodeSet.setToBogus(); fBeforeSuffixInsert.setToBogus(); } } int32_t CurrencySpacingEnabledModifier::apply(FormattedStringBuilder &output, int leftIndex, int rightIndex, UErrorCode &status) const { // Currency spacing logic int length = 0; if (rightIndex - leftIndex > 0 && !fAfterPrefixUnicodeSet.isBogus() 
&& fAfterPrefixUnicodeSet.contains(output.codePointAt(leftIndex))) { // TODO: Should we use the CURRENCY field here? length += output.insert( leftIndex, fAfterPrefixInsert, kUndefinedField, status); } if (rightIndex - leftIndex > 0 && !fBeforeSuffixUnicodeSet.isBogus() && fBeforeSuffixUnicodeSet.contains(output.codePointBefore(rightIndex))) { // TODO: Should we use the CURRENCY field here? length += output.insert( rightIndex + length, fBeforeSuffixInsert, kUndefinedField, status); } // Call super for the remaining logic length += ConstantMultiFieldModifier::apply(output, leftIndex, rightIndex + length, status); return length; } int32_t CurrencySpacingEnabledModifier::applyCurrencySpacing(FormattedStringBuilder &output, int32_t prefixStart, int32_t prefixLen, int32_t suffixStart, int32_t suffixLen, const DecimalFormatSymbols &symbols, UErrorCode &status) { int length = 0; bool hasPrefix = (prefixLen > 0); bool hasSuffix = (suffixLen > 0); bool hasNumber = (suffixStart - prefixStart - prefixLen > 0); // could be empty string if (hasPrefix && hasNumber) { length += applyCurrencySpacingAffix(output, prefixStart + prefixLen, PREFIX, symbols, status); } if (hasSuffix && hasNumber) { length += applyCurrencySpacingAffix(output, suffixStart + length, SUFFIX, symbols, status); } return length; } int32_t CurrencySpacingEnabledModifier::applyCurrencySpacingAffix(FormattedStringBuilder &output, int32_t index, EAffix affix, const DecimalFormatSymbols &symbols, UErrorCode &status) { // NOTE: For prefix, output.fieldAt(index-1) gets the last field type in the prefix. // This works even if the last code point in the prefix is 2 code units because the // field value gets populated to both indices in the field array. Field affixField = (affix == PREFIX) ? output.fieldAt(index - 1) : output.fieldAt(index); if (affixField != Field(UFIELD_CATEGORY_NUMBER, UNUM_CURRENCY_FIELD)) { return 0; } int affixCp = (affix == PREFIX) ? 
output.codePointBefore(index) : output.codePointAt(index); UnicodeSet affixUniset = getUnicodeSet(symbols, IN_CURRENCY, affix, status); if (!affixUniset.contains(affixCp)) { return 0; } int numberCp = (affix == PREFIX) ? output.codePointAt(index) : output.codePointBefore(index); UnicodeSet numberUniset = getUnicodeSet(symbols, IN_NUMBER, affix, status); if (!numberUniset.contains(numberCp)) { return 0; } UnicodeString spacingString = getInsertString(symbols, affix, status); // NOTE: This next line *inserts* the spacing string, triggering an arraycopy. // It would be more efficient if this could be done before affixes were attached, // so that it could be prepended/appended instead of inserted. // However, the build code path is more efficient, and this is the most natural // place to put currency spacing in the non-build code path. // TODO: Should we use the CURRENCY field here? return output.insert(index, spacingString, kUndefinedField, status); } UnicodeSet CurrencySpacingEnabledModifier::getUnicodeSet(const DecimalFormatSymbols &symbols, EPosition position, EAffix affix, UErrorCode &status) { // Ensure the static defaults are initialized: umtx_initOnce(gDefaultCurrencySpacingInitOnce, &initDefaultCurrencySpacing, status); if (U_FAILURE(status)) { return UnicodeSet(); } const UnicodeString& pattern = symbols.getPatternForCurrencySpacing( position == IN_CURRENCY ? 
UNUM_CURRENCY_MATCH : UNUM_CURRENCY_SURROUNDING_MATCH, affix == SUFFIX, status); if (pattern.compare(u"[:digit:]", -1) == 0) { return *UNISET_DIGIT; } else if (pattern.compare(u"[[:^S:]&[:^Z:]]", -1) == 0) { return *UNISET_NOTSZ; } else { return UnicodeSet(pattern, status); } } UnicodeString CurrencySpacingEnabledModifier::getInsertString(const DecimalFormatSymbols &symbols, EAffix affix, UErrorCode &status) { return symbols.getPatternForCurrencySpacing(UNUM_CURRENCY_INSERT, affix == SUFFIX, status); } #endif /* #if !UCONFIG_NO_FORMATTING */ stringi/src/icu74/i18n/shareddateformatsymbols.h0000644000176200001440000000256714700200761021340 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ****************************************************************************** * Copyright (C) 2014, International Business Machines * Corporation and others. All Rights Reserved. ****************************************************************************** * shareddateformatsymbols.h */ #ifndef __SHARED_DATEFORMATSYMBOLS_H__ #define __SHARED_DATEFORMATSYMBOLS_H__ #include "unicode/utypes.h" #if !UCONFIG_NO_FORMATTING #include "sharedobject.h" #include "unicode/dtfmtsym.h" #include "unifiedcache.h" U_NAMESPACE_BEGIN class U_I18N_API SharedDateFormatSymbols : public SharedObject { public: SharedDateFormatSymbols( const Locale &loc, const char *type, UErrorCode &status) : dfs(loc, type, status) { } virtual ~SharedDateFormatSymbols(); const DateFormatSymbols &get() const { return dfs; } private: DateFormatSymbols dfs; SharedDateFormatSymbols(const SharedDateFormatSymbols &) = delete; SharedDateFormatSymbols &operator=(const SharedDateFormatSymbols &) = delete; }; template<> U_I18N_API const SharedDateFormatSymbols * LocaleCacheKey::createObject( const void * /*unusedContext*/, UErrorCode &status) const; U_NAMESPACE_END #endif /* !UCONFIG_NO_FORMATTING */ #endif 
stringi/src/icu74/i18n/unumsys.cpp0000644000176200001440000000442114700200761016457 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ***************************************************************************************** * Copyright (C) 2013, International Business Machines Corporation and others. * All Rights Reserved. ***************************************************************************************** */ #include "unicode/utypes.h" #if !UCONFIG_NO_FORMATTING #include "unicode/unumsys.h" #include "unicode/numsys.h" #include "unicode/uenum.h" U_NAMESPACE_USE U_CAPI UNumberingSystem* U_EXPORT2 unumsys_open(const char *locale, UErrorCode *status) { // createInstance returns immediately if status indicates error return (UNumberingSystem*)NumberingSystem::createInstance(Locale(locale), *status); } U_CAPI UNumberingSystem* U_EXPORT2 unumsys_openByName(const char *name, UErrorCode *status) { // createInstanceByName does NOT return immediately if status indicates error if (U_FAILURE(*status)) { return nullptr; } return (UNumberingSystem*)NumberingSystem::createInstanceByName(name, *status); } U_CAPI void U_EXPORT2 unumsys_close(UNumberingSystem *unumsys) { delete ((NumberingSystem*)unumsys); } U_CAPI UEnumeration* U_EXPORT2 unumsys_openAvailableNames(UErrorCode *status) { // getAvailableNames returns immediately if status indicates error return uenum_openFromStringEnumeration(NumberingSystem::getAvailableNames(*status), status); } U_CAPI const char * U_EXPORT2 unumsys_getName(const UNumberingSystem *unumsys) { return ((NumberingSystem*)unumsys)->getName(); } U_CAPI int32_t U_EXPORT2 unumsys_getRadix(const UNumberingSystem *unumsys) { return ((NumberingSystem*)unumsys)->getRadix(); } U_CAPI UBool U_EXPORT2 unumsys_isAlgorithmic(const UNumberingSystem *unumsys) { return ((NumberingSystem*)unumsys)->isAlgorithmic(); } U_CAPI int32_t U_EXPORT2 unumsys_getDescription(const 
UNumberingSystem *unumsys, char16_t *result, int32_t resultLength, UErrorCode *status) { if (U_FAILURE(*status)) { return -1; } // implement UnicodeString descrip = ((NumberingSystem*)unumsys)->getDescription(); return descrip.extract(result, resultLength, *status); } #endif /* #if !UCONFIG_NO_FORMATTING */ stringi/src/icu74/i18n/double-conversion-fast-dtoa.h0000644000176200001440000001073714700200761021725 0ustar liggesusers// © 2018 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html // // From the double-conversion library. Original license: // // Copyright 2010 the V8 project authors. All rights reserved. // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are // met: // // * Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // * Redistributions in binary form must reproduce the above // copyright notice, this list of conditions and the following // disclaimer in the documentation and/or other materials provided // with the distribution. // * Neither the name of Google Inc. nor the names of its // contributors may be used to endorse or promote products derived // from this software without specific prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR // A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // ICU PATCH: ifdef around UCONFIG_NO_FORMATTING #include "unicode/utypes.h" #if !UCONFIG_NO_FORMATTING #ifndef DOUBLE_CONVERSION_FAST_DTOA_H_ #define DOUBLE_CONVERSION_FAST_DTOA_H_ // ICU PATCH: Customize header file paths for ICU. #include "double-conversion-utils.h" // ICU PATCH: Wrap in ICU namespace U_NAMESPACE_BEGIN namespace double_conversion { enum FastDtoaMode { // Computes the shortest representation of the given input. The returned // result will be the most accurate number of this length. Longer // representations might be more accurate. FAST_DTOA_SHORTEST, // Same as FAST_DTOA_SHORTEST but for single-precision floats. FAST_DTOA_SHORTEST_SINGLE, // Computes a representation where the precision (number of digits) is // given as input. The precision is independent of the decimal point. FAST_DTOA_PRECISION }; // FastDtoa will produce at most kFastDtoaMaximalLength digits. This does not // include the terminating '\0' character. static const int kFastDtoaMaximalLength = 17; // Same for single-precision numbers. static const int kFastDtoaMaximalSingleLength = 9; // Provides a decimal representation of v. // The result should be interpreted as buffer * 10^(point - length). // // Precondition: // * v must be a strictly positive finite double. // // Returns true if it succeeds, otherwise the result can not be trusted. // There will be *length digits inside the buffer followed by a null terminator. 
// If the function returns true and mode equals // - FAST_DTOA_SHORTEST, then // the parameter requested_digits is ignored. // The result satisfies // v == (double) (buffer * 10^(point - length)). // The digits in the buffer are the shortest representation possible. E.g. // if 0.099999999999 and 0.1 represent the same double then "1" is returned // with point = 0. // The last digit will be closest to the actual v. That is, even if several // digits might correctly yield 'v' when read again, the buffer will contain // the one closest to v. // - FAST_DTOA_PRECISION, then // the buffer contains requested_digits digits. // the difference v - (buffer * 10^(point-length)) is closest to zero for // all possible representations of requested_digits digits. // If there are two values that are equally close, then FastDtoa returns // false. // For both modes the buffer must be large enough to hold the result. bool FastDtoa(double d, FastDtoaMode mode, int requested_digits, Vector buffer, int* length, int* decimal_point); } // namespace double_conversion // ICU PATCH: Close ICU namespace U_NAMESPACE_END #endif // DOUBLE_CONVERSION_FAST_DTOA_H_ #endif // ICU PATCH: close #if !UCONFIG_NO_FORMATTING stringi/src/icu74/i18n/tzrule.cpp0000644000176200001440000004544714700200761016276 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ******************************************************************************* * Copyright (C) 2007-2012, International Business Machines Corporation and * others. All Rights Reserved. 
******************************************************************************* */ #include "utypeinfo.h" // for 'typeid' to work #include "unicode/utypes.h" #if !UCONFIG_NO_FORMATTING #include "unicode/tzrule.h" #include "unicode/ucal.h" #include "gregoimp.h" #include "cmemory.h" #include "uarrsort.h" U_CDECL_BEGIN // UComparator function for sorting start times static int32_t U_CALLCONV compareDates(const void * /*context*/, const void *left, const void *right) { UDate l = *((UDate*)left); UDate r = *((UDate*)right); int32_t res = l < r ? -1 : (l == r ? 0 : 1); return res; } U_CDECL_END U_NAMESPACE_BEGIN TimeZoneRule::TimeZoneRule(const UnicodeString& name, int32_t rawOffset, int32_t dstSavings) : UObject(), fName(name), fRawOffset(rawOffset), fDSTSavings(dstSavings) { } TimeZoneRule::TimeZoneRule(const TimeZoneRule& source) : UObject(source), fName(source.fName), fRawOffset(source.fRawOffset), fDSTSavings(source.fDSTSavings) { } TimeZoneRule::~TimeZoneRule() { } TimeZoneRule& TimeZoneRule::operator=(const TimeZoneRule& right) { if (this != &right) { fName = right.fName; fRawOffset = right.fRawOffset; fDSTSavings = right.fDSTSavings; } return *this; } bool TimeZoneRule::operator==(const TimeZoneRule& that) const { return ((this == &that) || (typeid(*this) == typeid(that) && fName == that.fName && fRawOffset == that.fRawOffset && fDSTSavings == that.fDSTSavings)); } bool TimeZoneRule::operator!=(const TimeZoneRule& that) const { return !operator==(that); } UnicodeString& TimeZoneRule::getName(UnicodeString& name) const { name = fName; return name; } int32_t TimeZoneRule::getRawOffset() const { return fRawOffset; } int32_t TimeZoneRule::getDSTSavings() const { return fDSTSavings; } UBool TimeZoneRule::isEquivalentTo(const TimeZoneRule& other) const { return ((this == &other) || (typeid(*this) == typeid(other) && fRawOffset == other.fRawOffset && fDSTSavings == other.fDSTSavings)); } UOBJECT_DEFINE_RTTI_IMPLEMENTATION(InitialTimeZoneRule) 
InitialTimeZoneRule::InitialTimeZoneRule(const UnicodeString& name, int32_t rawOffset, int32_t dstSavings) : TimeZoneRule(name, rawOffset, dstSavings) { } InitialTimeZoneRule::InitialTimeZoneRule(const InitialTimeZoneRule& source) : TimeZoneRule(source) { } InitialTimeZoneRule::~InitialTimeZoneRule() { } InitialTimeZoneRule* InitialTimeZoneRule::clone() const { return new InitialTimeZoneRule(*this); } InitialTimeZoneRule& InitialTimeZoneRule::operator=(const InitialTimeZoneRule& right) { if (this != &right) { TimeZoneRule::operator=(right); } return *this; } bool InitialTimeZoneRule::operator==(const TimeZoneRule& that) const { return ((this == &that) || (typeid(*this) == typeid(that) && TimeZoneRule::operator==(that))); } bool InitialTimeZoneRule::operator!=(const TimeZoneRule& that) const { return !operator==(that); } UBool InitialTimeZoneRule::isEquivalentTo(const TimeZoneRule& other) const { if (this == &other) { return true; } if (typeid(*this) != typeid(other) || TimeZoneRule::isEquivalentTo(other) == false) { return false; } return true; } UBool InitialTimeZoneRule::getFirstStart(int32_t /*prevRawOffset*/, int32_t /*prevDSTSavings*/, UDate& /*result*/) const { return false; } UBool InitialTimeZoneRule::getFinalStart(int32_t /*prevRawOffset*/, int32_t /*prevDSTSavings*/, UDate& /*result*/) const { return false; } UBool InitialTimeZoneRule::getNextStart(UDate /*base*/, int32_t /*prevRawOffset*/, int32_t /*prevDSTSavings*/, UBool /*inclusive*/, UDate& /*result*/) const { return false; } UBool InitialTimeZoneRule::getPreviousStart(UDate /*base*/, int32_t /*prevRawOffset*/, int32_t /*prevDSTSavings*/, UBool /*inclusive*/, UDate& /*result*/) const { return false; } UOBJECT_DEFINE_RTTI_IMPLEMENTATION(AnnualTimeZoneRule) const int32_t AnnualTimeZoneRule::MAX_YEAR = 0x7FFFFFFF; /* max signed int32 */ AnnualTimeZoneRule::AnnualTimeZoneRule(const UnicodeString& name, int32_t rawOffset, int32_t dstSavings, const DateTimeRule& dateTimeRule, int32_t startYear, int32_t 
endYear) : TimeZoneRule(name, rawOffset, dstSavings), fDateTimeRule(new DateTimeRule(dateTimeRule)), fStartYear(startYear), fEndYear(endYear) { } AnnualTimeZoneRule::AnnualTimeZoneRule(const UnicodeString& name, int32_t rawOffset, int32_t dstSavings, DateTimeRule* dateTimeRule, int32_t startYear, int32_t endYear) : TimeZoneRule(name, rawOffset, dstSavings), fDateTimeRule(dateTimeRule), fStartYear(startYear), fEndYear(endYear) { } AnnualTimeZoneRule::AnnualTimeZoneRule(const AnnualTimeZoneRule& source) : TimeZoneRule(source), fDateTimeRule(new DateTimeRule(*(source.fDateTimeRule))), fStartYear(source.fStartYear), fEndYear(source.fEndYear) { } AnnualTimeZoneRule::~AnnualTimeZoneRule() { delete fDateTimeRule; } AnnualTimeZoneRule* AnnualTimeZoneRule::clone() const { return new AnnualTimeZoneRule(*this); } AnnualTimeZoneRule& AnnualTimeZoneRule::operator=(const AnnualTimeZoneRule& right) { if (this != &right) { TimeZoneRule::operator=(right); delete fDateTimeRule; fDateTimeRule = right.fDateTimeRule->clone(); fStartYear = right.fStartYear; fEndYear = right.fEndYear; } return *this; } bool AnnualTimeZoneRule::operator==(const TimeZoneRule& that) const { if (this == &that) { return true; } if (typeid(*this) != typeid(that)) { return false; } AnnualTimeZoneRule *atzr = (AnnualTimeZoneRule*)&that; return (*fDateTimeRule == *(atzr->fDateTimeRule) && fStartYear == atzr->fStartYear && fEndYear == atzr->fEndYear); } bool AnnualTimeZoneRule::operator!=(const TimeZoneRule& that) const { return !operator==(that); } const DateTimeRule* AnnualTimeZoneRule::getRule() const { return fDateTimeRule; } int32_t AnnualTimeZoneRule::getStartYear() const { return fStartYear; } int32_t AnnualTimeZoneRule::getEndYear() const { return fEndYear; } UBool AnnualTimeZoneRule::getStartInYear(int32_t year, int32_t prevRawOffset, int32_t prevDSTSavings, UDate &result) const { if (year < fStartYear || year > fEndYear) { return false; } double ruleDay; DateTimeRule::DateRuleType type = 
fDateTimeRule->getDateRuleType();
    if (type == DateTimeRule::DOM) {
        // Exact day-of-month rule: the day follows directly from year/month/day.
        ruleDay = Grego::fieldsToDay(year, fDateTimeRule->getRuleMonth(), fDateTimeRule->getRuleDayOfMonth());
    } else {
        // Day-of-week based rules: pick an anchor day first, then move forward
        // ('after' == true) or backward to the requested day of week.
        UBool after = true;
        if (type == DateTimeRule::DOW) {
            // Normalize DOW rule into DOW_GEQ_DOM or DOW_LEQ_DOM
            int32_t weeks = fDateTimeRule->getRuleWeekInMonth();
            if (weeks > 0) {
                // Anchor at the 1st of the month, advanced by (weeks - 1) whole weeks.
                ruleDay = Grego::fieldsToDay(year, fDateTimeRule->getRuleMonth(), 1);
                ruleDay += 7 * (weeks - 1);
            } else {
                // Non-positive week number: anchor at the last day of the month,
                // moved by (weeks + 1) whole weeks, and search backward.
                after = false;
                ruleDay = Grego::fieldsToDay(year, fDateTimeRule->getRuleMonth(),
                    Grego::monthLength(year, fDateTimeRule->getRuleMonth()));
                ruleDay += 7 * (weeks + 1);
            }
        } else {
            int32_t month = fDateTimeRule->getRuleMonth();
            int32_t dom = fDateTimeRule->getRuleDayOfMonth();
            if (type == DateTimeRule::DOW_LEQ_DOM) {
                after = false;
                // Handle Feb <=29
                if (month == UCAL_FEBRUARY && dom == 29 && !Grego::isLeapYear(year)) {
                    dom--;
                }
            }
            ruleDay = Grego::fieldsToDay(year, month, dom);
        }
        // Distance from the anchor's day of week to the rule's day of week,
        // folded into [0, 6] when searching forward and [-6, 0] when backward.
        int32_t dow = Grego::dayOfWeek(ruleDay);
        int32_t delta = fDateTimeRule->getRuleDayOfWeek() - dow;
        if (after) {
            delta = delta < 0 ? delta + 7 : delta;
        } else {
            delta = delta > 0 ?
delta - 7 : delta; } ruleDay += delta; } result = ruleDay*U_MILLIS_PER_DAY + fDateTimeRule->getRuleMillisInDay(); if (fDateTimeRule->getTimeRuleType() != DateTimeRule::UTC_TIME) { result -= prevRawOffset; } if (fDateTimeRule->getTimeRuleType() == DateTimeRule::WALL_TIME) { result -= prevDSTSavings; } return true; } UBool AnnualTimeZoneRule::isEquivalentTo(const TimeZoneRule& other) const { if (this == &other) { return true; } if (typeid(*this) != typeid(other) || TimeZoneRule::isEquivalentTo(other) == false) { return false; } AnnualTimeZoneRule* that = (AnnualTimeZoneRule*)&other; return (*fDateTimeRule == *(that->fDateTimeRule) && fStartYear == that->fStartYear && fEndYear == that->fEndYear); } UBool AnnualTimeZoneRule::getFirstStart(int32_t prevRawOffset, int32_t prevDSTSavings, UDate& result) const { return getStartInYear(fStartYear, prevRawOffset, prevDSTSavings, result); } UBool AnnualTimeZoneRule::getFinalStart(int32_t prevRawOffset, int32_t prevDSTSavings, UDate& result) const { if (fEndYear == MAX_YEAR) { return false; } return getStartInYear(fEndYear, prevRawOffset, prevDSTSavings, result); } UBool AnnualTimeZoneRule::getNextStart(UDate base, int32_t prevRawOffset, int32_t prevDSTSavings, UBool inclusive, UDate& result) const { int32_t year, month, dom, dow, doy, mid; Grego::timeToFields(base, year, month, dom, dow, doy, mid); if (year < fStartYear) { return getFirstStart(prevRawOffset, prevDSTSavings, result); } UDate tmp; if (getStartInYear(year, prevRawOffset, prevDSTSavings, tmp)) { if (tmp < base || (!inclusive && (tmp == base))) { // Return the next one return getStartInYear(year + 1, prevRawOffset, prevDSTSavings, result); } else { result = tmp; return true; } } return false; } UBool AnnualTimeZoneRule::getPreviousStart(UDate base, int32_t prevRawOffset, int32_t prevDSTSavings, UBool inclusive, UDate& result) const { int32_t year, month, dom, dow, doy, mid; Grego::timeToFields(base, year, month, dom, dow, doy, mid); if (year > fEndYear) { return 
getFinalStart(prevRawOffset, prevDSTSavings, result);
    }
    UDate tmp;
    if (getStartInYear(year, prevRawOffset, prevDSTSavings, tmp)) {
        if (tmp > base || (!inclusive && (tmp == base))) {
            // Return the previous one
            return getStartInYear(year - 1, prevRawOffset, prevDSTSavings, result);
        } else {
            result = tmp;
            return true;
        }
    }
    return false;
}

UOBJECT_DEFINE_RTTI_IMPLEMENTATION(TimeArrayTimeZoneRule)

// Builds a rule from an array of start times. The array is copied (and
// sorted) by initStartTimes(); any error it reports through 'status' is
// currently dropped, as a constructor has no way to return it.
TimeArrayTimeZoneRule::TimeArrayTimeZoneRule(const UnicodeString& name,
                                             int32_t rawOffset,
                                             int32_t dstSavings,
                                             const UDate* startTimes,
                                             int32_t numStartTimes,
                                             DateTimeRule::TimeRuleType timeRuleType)
: TimeZoneRule(name, rawOffset, dstSavings), fTimeRuleType(timeRuleType),
  fStartTimes(nullptr) {
    UErrorCode status = U_ZERO_ERROR;
    initStartTimes(startTimes, numStartTimes, status);
    //TODO - status?
}

// Copy constructor: the source's start times are re-copied through
// initStartTimes(); a failure there is silently ignored.
TimeArrayTimeZoneRule::TimeArrayTimeZoneRule(const TimeArrayTimeZoneRule& source)
: TimeZoneRule(source), fTimeRuleType(source.fTimeRuleType), fStartTimes(nullptr) {
    UErrorCode status = U_ZERO_ERROR;
    initStartTimes(source.fStartTimes, source.fNumStartTimes, status);
    //TODO - status?
}

// Frees the start-time array only when it is not the in-object buffer
// fLocalStartTimes.
TimeArrayTimeZoneRule::~TimeArrayTimeZoneRule() {
    if (fStartTimes != nullptr && fStartTimes != fLocalStartTimes) {
        uprv_free(fStartTimes);
    }
}

// Returns a heap-allocated copy of this rule.
TimeArrayTimeZoneRule*
TimeArrayTimeZoneRule::clone() const {
    return new TimeArrayTimeZoneRule(*this);
}

// Assignment: copies the base state, then re-initializes the start-time
// array from the right-hand side (initStartTimes releases the old array).
TimeArrayTimeZoneRule&
TimeArrayTimeZoneRule::operator=(const TimeArrayTimeZoneRule& right) {
    if (this != &right) {
        TimeZoneRule::operator=(right);
        UErrorCode status = U_ZERO_ERROR;
        initStartTimes(right.fStartTimes, right.fNumStartTimes, status);
        //TODO - status?
fTimeRuleType = right.fTimeRuleType; } return *this; } bool TimeArrayTimeZoneRule::operator==(const TimeZoneRule& that) const { if (this == &that) { return true; } if (typeid(*this) != typeid(that) || !TimeZoneRule::operator==(that)) { return false; } TimeArrayTimeZoneRule *tatzr = (TimeArrayTimeZoneRule*)&that; if (fTimeRuleType != tatzr->fTimeRuleType || fNumStartTimes != tatzr->fNumStartTimes) { return false; } // Compare start times bool res = true; for (int32_t i = 0; i < fNumStartTimes; i++) { if (fStartTimes[i] != tatzr->fStartTimes[i]) { res = false; break; } } return res; } bool TimeArrayTimeZoneRule::operator!=(const TimeZoneRule& that) const { return !operator==(that); } DateTimeRule::TimeRuleType TimeArrayTimeZoneRule::getTimeType() const { return fTimeRuleType; } UBool TimeArrayTimeZoneRule::getStartTimeAt(int32_t index, UDate& result) const { if (index >= fNumStartTimes || index < 0) { return false; } result = fStartTimes[index]; return true; } int32_t TimeArrayTimeZoneRule::countStartTimes() const { return fNumStartTimes; } UBool TimeArrayTimeZoneRule::isEquivalentTo(const TimeZoneRule& other) const { if (this == &other) { return true; } if (typeid(*this) != typeid(other) || TimeZoneRule::isEquivalentTo(other) == false) { return false; } TimeArrayTimeZoneRule* that = (TimeArrayTimeZoneRule*)&other; if (fTimeRuleType != that->fTimeRuleType || fNumStartTimes != that->fNumStartTimes) { return false; } // Compare start times UBool res = true; for (int32_t i = 0; i < fNumStartTimes; i++) { if (fStartTimes[i] != that->fStartTimes[i]) { res = false; break; } } return res; } UBool TimeArrayTimeZoneRule::getFirstStart(int32_t prevRawOffset, int32_t prevDSTSavings, UDate& result) const { if (fNumStartTimes <= 0 || fStartTimes == nullptr) { return false; } result = getUTC(fStartTimes[0], prevRawOffset, prevDSTSavings); return true; } UBool TimeArrayTimeZoneRule::getFinalStart(int32_t prevRawOffset, int32_t prevDSTSavings, UDate& result) const { if 
(fNumStartTimes <= 0 || fStartTimes == nullptr) { return false; } result = getUTC(fStartTimes[fNumStartTimes - 1], prevRawOffset, prevDSTSavings); return true; } UBool TimeArrayTimeZoneRule::getNextStart(UDate base, int32_t prevRawOffset, int32_t prevDSTSavings, UBool inclusive, UDate& result) const { int32_t i = fNumStartTimes - 1; for (; i >= 0; i--) { UDate time = getUTC(fStartTimes[i], prevRawOffset, prevDSTSavings); if (time < base || (!inclusive && time == base)) { break; } result = time; } if (i == fNumStartTimes - 1) { return false; } return true; } UBool TimeArrayTimeZoneRule::getPreviousStart(UDate base, int32_t prevRawOffset, int32_t prevDSTSavings, UBool inclusive, UDate& result) const { int32_t i = fNumStartTimes - 1; for (; i >= 0; i--) { UDate time = getUTC(fStartTimes[i], prevRawOffset, prevDSTSavings); if (time < base || (inclusive && time == base)) { result = time; return true; } } return false; } // ---- private methods ------ UBool TimeArrayTimeZoneRule::initStartTimes(const UDate source[], int32_t size, UErrorCode& status) { // Free old array if (fStartTimes != nullptr && fStartTimes != fLocalStartTimes) { uprv_free(fStartTimes); } // Allocate new one if needed if (size > TIMEARRAY_STACK_BUFFER_SIZE) { fStartTimes = (UDate*)uprv_malloc(sizeof(UDate)*size); if (fStartTimes == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; fNumStartTimes = 0; return false; } } else { fStartTimes = (UDate*)fLocalStartTimes; } uprv_memcpy(fStartTimes, source, sizeof(UDate)*size); fNumStartTimes = size; // Sort dates uprv_sortArray(fStartTimes, fNumStartTimes, (int32_t)sizeof(UDate), compareDates, nullptr, true, &status); if (U_FAILURE(status)) { if (fStartTimes != nullptr && fStartTimes != fLocalStartTimes) { uprv_free(fStartTimes); } fNumStartTimes = 0; return false; } return true; } UDate TimeArrayTimeZoneRule::getUTC(UDate time, int32_t raw, int32_t dst) const { if (fTimeRuleType != DateTimeRule::UTC_TIME) { time -= raw; } if (fTimeRuleType == 
DateTimeRule::WALL_TIME) { time -= dst; } return time; } U_NAMESPACE_END #endif /* #if !UCONFIG_NO_FORMATTING */ //eof stringi/src/icu74/i18n/quant.cpp0000644000176200001440000000775214700200761016076 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ********************************************************************** * Copyright (C) 2001-2012, International Business Machines * Corporation and others. All Rights Reserved. ********************************************************************** * Date Name Description * 07/26/01 aliu Creation. ********************************************************************** */ #include "unicode/utypes.h" #if !UCONFIG_NO_TRANSLITERATION #include "quant.h" #include "unicode/unistr.h" #include "util.h" U_NAMESPACE_BEGIN UOBJECT_DEFINE_RTTI_IMPLEMENTATION(Quantifier) Quantifier::Quantifier(UnicodeFunctor *adoptedMatcher, uint32_t _minCount, uint32_t _maxCount) { // assert(adopted != 0); // assert(minCount <= maxCount); matcher = adoptedMatcher; this->minCount = _minCount; this->maxCount = _maxCount; } Quantifier::Quantifier(const Quantifier& o) : UnicodeFunctor(o), UnicodeMatcher(o), matcher(o.matcher->clone()), minCount(o.minCount), maxCount(o.maxCount) { } Quantifier::~Quantifier() { delete matcher; } /** * Implement UnicodeFunctor */ Quantifier* Quantifier::clone() const { return new Quantifier(*this); } /** * UnicodeFunctor API. Cast 'this' to a UnicodeMatcher* pointer * and return the pointer. 
*/ UnicodeMatcher* Quantifier::toMatcher() const { Quantifier *nonconst_this = const_cast(this); UnicodeMatcher *nonconst_base = static_cast(nonconst_this); return nonconst_base; } UMatchDegree Quantifier::matches(const Replaceable& text, int32_t& offset, int32_t limit, UBool incremental) { int32_t start = offset; uint32_t count = 0; while (count < maxCount) { int32_t pos = offset; UMatchDegree m = matcher->toMatcher()->matches(text, offset, limit, incremental); if (m == U_MATCH) { ++count; if (pos == offset) { // If offset has not moved we have a zero-width match. // Don't keep matching it infinitely. break; } } else if (incremental && m == U_PARTIAL_MATCH) { return U_PARTIAL_MATCH; } else { break; } } if (incremental && offset == limit) { return U_PARTIAL_MATCH; } if (count >= minCount) { return U_MATCH; } offset = start; return U_MISMATCH; } /** * Implement UnicodeMatcher */ UnicodeString& Quantifier::toPattern(UnicodeString& result, UBool escapeUnprintable) const { result.truncate(0); matcher->toMatcher()->toPattern(result, escapeUnprintable); if (minCount == 0) { if (maxCount == 1) { return result.append((char16_t)63); /*?*/ } else if (maxCount == MAX) { return result.append((char16_t)42); /***/ } // else fall through } else if (minCount == 1 && maxCount == MAX) { return result.append((char16_t)43); /*+*/ } result.append((char16_t)123); /*{*/ ICU_Utility::appendNumber(result, minCount); result.append((char16_t)44); /*,*/ if (maxCount != MAX) { ICU_Utility::appendNumber(result, maxCount); } result.append((char16_t)125); /*}*/ return result; } /** * Implement UnicodeMatcher */ UBool Quantifier::matchesIndexValue(uint8_t v) const { return (minCount == 0) || matcher->toMatcher()->matchesIndexValue(v); } /** * Implement UnicodeMatcher */ void Quantifier::addMatchSetTo(UnicodeSet& toUnionTo) const { if (maxCount > 0) { matcher->toMatcher()->addMatchSetTo(toUnionTo); } } /** * Implement UnicodeFunctor */ void Quantifier::setData(const TransliterationRuleData* d) { 
matcher->setData(d); } U_NAMESPACE_END #endif /* #if !UCONFIG_NO_TRANSLITERATION */ //eof stringi/src/icu74/i18n/datefmt.cpp0000644000176200001440000005251614700200761016370 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ******************************************************************************* * Copyright (C) 1997-2015, International Business Machines Corporation and * * others. All Rights Reserved. * ******************************************************************************* * * File DATEFMT.CPP * * Modification History: * * Date Name Description * 02/19/97 aliu Converted from java. * 03/31/97 aliu Modified extensively to work with 50 locales. * 04/01/97 aliu Added support for centuries. * 08/12/97 aliu Fixed operator== to use Calendar::equivalentTo. * 07/20/98 stephen Changed ParsePosition initialization ******************************************************************************** */ #include "unicode/utypes.h" #if !UCONFIG_NO_FORMATTING #include "unicode/ures.h" #include "unicode/datefmt.h" #include "unicode/smpdtfmt.h" #include "unicode/dtptngen.h" #include "unicode/udisplaycontext.h" #include "reldtfmt.h" #include "sharedobject.h" #include "unifiedcache.h" #include "uarrsort.h" #include "cstring.h" #include "windtfmt.h" #if defined( U_DEBUG_CALSVC ) || defined (U_DEBUG_CAL) #include #endif // ***************************************************************************** // class DateFormat // ***************************************************************************** U_NAMESPACE_BEGIN class DateFmtBestPattern : public SharedObject { public: UnicodeString fPattern; DateFmtBestPattern(const UnicodeString &pattern) : fPattern(pattern) { } ~DateFmtBestPattern(); }; DateFmtBestPattern::~DateFmtBestPattern() { } template<> const DateFmtBestPattern *LocaleCacheKey::createObject( const void * /*creationContext*/, UErrorCode &status) const { status = U_UNSUPPORTED_ERROR; 
return nullptr; } class DateFmtBestPatternKey : public LocaleCacheKey { private: UnicodeString fSkeleton; protected: virtual bool equals(const CacheKeyBase &other) const override { if (!LocaleCacheKey::equals(other)) { return false; } // We know that this and other are of same class if we get this far. return operator==(static_cast(other)); } public: DateFmtBestPatternKey( const Locale &loc, const UnicodeString &skeleton, UErrorCode &status) : LocaleCacheKey(loc), fSkeleton(DateTimePatternGenerator::staticGetSkeleton(skeleton, status)) { } DateFmtBestPatternKey(const DateFmtBestPatternKey &other) : LocaleCacheKey(other), fSkeleton(other.fSkeleton) { } virtual ~DateFmtBestPatternKey(); virtual int32_t hashCode() const override { return (int32_t)(37u * (uint32_t)LocaleCacheKey::hashCode() + (uint32_t)fSkeleton.hashCode()); } inline bool operator==(const DateFmtBestPatternKey &other) const { return fSkeleton == other.fSkeleton; } virtual CacheKeyBase *clone() const override { return new DateFmtBestPatternKey(*this); } virtual const DateFmtBestPattern *createObject( const void * /*unused*/, UErrorCode &status) const override { LocalPointer dtpg( DateTimePatternGenerator::createInstance(fLoc, status)); if (U_FAILURE(status)) { return nullptr; } LocalPointer pattern( new DateFmtBestPattern( dtpg->getBestPattern(fSkeleton, status)), status); if (U_FAILURE(status)) { return nullptr; } DateFmtBestPattern *result = pattern.orphan(); result->addRef(); return result; } }; DateFmtBestPatternKey::~DateFmtBestPatternKey() { } DateFormat::DateFormat() : fCalendar(0), fNumberFormat(0), fCapitalizationContext(UDISPCTX_CAPITALIZATION_NONE) { } //---------------------------------------------------------------------- DateFormat::DateFormat(const DateFormat& other) : Format(other), fCalendar(0), fNumberFormat(0), fCapitalizationContext(UDISPCTX_CAPITALIZATION_NONE) { *this = other; } //---------------------------------------------------------------------- DateFormat& 
DateFormat::operator=(const DateFormat& other) { if (this != &other) { delete fCalendar; delete fNumberFormat; if(other.fCalendar) { fCalendar = other.fCalendar->clone(); } else { fCalendar = nullptr; } if(other.fNumberFormat) { fNumberFormat = other.fNumberFormat->clone(); } else { fNumberFormat = nullptr; } fBoolFlags = other.fBoolFlags; fCapitalizationContext = other.fCapitalizationContext; } return *this; } //---------------------------------------------------------------------- DateFormat::~DateFormat() { delete fCalendar; delete fNumberFormat; } //---------------------------------------------------------------------- bool DateFormat::operator==(const Format& other) const { if (this == &other) { return true; } if (!(Format::operator==(other))) { return false; } // Format::operator== guarantees that this cast is safe DateFormat* fmt = (DateFormat*)&other; return fCalendar&&(fCalendar->isEquivalentTo(*fmt->fCalendar)) && (fNumberFormat && *fNumberFormat == *fmt->fNumberFormat) && (fCapitalizationContext == fmt->fCapitalizationContext); } //---------------------------------------------------------------------- UnicodeString& DateFormat::format(const Formattable& obj, UnicodeString& appendTo, FieldPosition& fieldPosition, UErrorCode& status) const { if (U_FAILURE(status)) return appendTo; // if the type of the Formattable is double or long, treat it as if it were a Date UDate date = 0; switch (obj.getType()) { case Formattable::kDate: date = obj.getDate(); break; case Formattable::kDouble: date = (UDate)obj.getDouble(); break; case Formattable::kLong: date = (UDate)obj.getLong(); break; default: status = U_ILLEGAL_ARGUMENT_ERROR; return appendTo; } // Is this right? 
//if (fieldPosition.getBeginIndex() == fieldPosition.getEndIndex()) // status = U_ILLEGAL_ARGUMENT_ERROR; return format(date, appendTo, fieldPosition); } //---------------------------------------------------------------------- UnicodeString& DateFormat::format(const Formattable& obj, UnicodeString& appendTo, FieldPositionIterator* posIter, UErrorCode& status) const { if (U_FAILURE(status)) return appendTo; // if the type of the Formattable is double or long, treat it as if it were a Date UDate date = 0; switch (obj.getType()) { case Formattable::kDate: date = obj.getDate(); break; case Formattable::kDouble: date = (UDate)obj.getDouble(); break; case Formattable::kLong: date = (UDate)obj.getLong(); break; default: status = U_ILLEGAL_ARGUMENT_ERROR; return appendTo; } // Is this right? //if (fieldPosition.getBeginIndex() == fieldPosition.getEndIndex()) // status = U_ILLEGAL_ARGUMENT_ERROR; return format(date, appendTo, posIter, status); } //---------------------------------------------------------------------- // Default implementation for backwards compatibility, subclasses should implement. 
UnicodeString&
DateFormat::format(Calendar& /* unused cal */,
                   UnicodeString& appendTo,
                   FieldPositionIterator* /* unused posIter */,
                   UErrorCode& status) const {
    // Nothing is formatted here: a not-yet-failed status is turned into
    // U_UNSUPPORTED_ERROR and appendTo is handed back unchanged.
    if (U_SUCCESS(status)) {
        status = U_UNSUPPORTED_ERROR;
    }
    return appendTo;
}

//----------------------------------------------------------------------

// Formats 'date' with a throw-away clone of this formatter's calendar.
// Nothing is appended when there is no calendar, when cloning fails, or when
// the date cannot be set on the clone.
UnicodeString&
DateFormat::format(UDate date, UnicodeString& appendTo, FieldPosition& fieldPosition) const {
    if (fCalendar == nullptr) {
        return appendTo;
    }
    // Use a clone of our calendar instance
    Calendar* workCal = fCalendar->clone();
    if (workCal == nullptr) {
        return appendTo;
    }
    UErrorCode ec = U_ZERO_ERROR;
    workCal->setTime(date, ec);
    if (U_SUCCESS(ec)) {
        format(*workCal, appendTo, fieldPosition);
    }
    delete workCal;
    return appendTo;
}

//----------------------------------------------------------------------

// FieldPositionIterator variant of the overload above; errors raised while
// setting the time or formatting are reported through 'status'.
UnicodeString&
DateFormat::format(UDate date, UnicodeString& appendTo, FieldPositionIterator* posIter,
                   UErrorCode& status) const {
    if (fCalendar == nullptr) {
        return appendTo;
    }
    Calendar* workCal = fCalendar->clone();
    if (workCal == nullptr) {
        return appendTo;
    }
    workCal->setTime(date, status);
    if (U_SUCCESS(status)) {
        format(*workCal, appendTo, posIter, status);
    }
    delete workCal;
    return appendTo;
}

//----------------------------------------------------------------------

UnicodeString&
DateFormat::format(UDate date, UnicodeString& appendTo) const
{
    // Note that any error information is just lost.  That's okay
    // for this convenience method.
FieldPosition fpos(FieldPosition::DONT_CARE); return format(date, appendTo, fpos); } //---------------------------------------------------------------------- UDate DateFormat::parse(const UnicodeString& text, ParsePosition& pos) const { UDate d = 0; // Error return UDate is 0 (the epoch) if (fCalendar != nullptr) { Calendar* calClone = fCalendar->clone(); if (calClone != nullptr) { int32_t start = pos.getIndex(); calClone->clear(); parse(text, *calClone, pos); if (pos.getIndex() != start) { UErrorCode ec = U_ZERO_ERROR; d = calClone->getTime(ec); if (U_FAILURE(ec)) { // We arrive here if fCalendar => calClone is non-lenient and // there is an out-of-range field. We don't know which field // was illegal so we set the error index to the start. pos.setIndex(start); pos.setErrorIndex(start); d = 0; } } delete calClone; } } return d; } //---------------------------------------------------------------------- UDate DateFormat::parse(const UnicodeString& text, UErrorCode& status) const { if (U_FAILURE(status)) return 0; ParsePosition pos(0); UDate result = parse(text, pos); if (pos.getIndex() == 0) { #if defined (U_DEBUG_CAL) fprintf(stderr, "%s:%d - - failed to parse - err index %d\n" , __FILE__, __LINE__, pos.getErrorIndex() ); #endif status = U_ILLEGAL_ARGUMENT_ERROR; } return result; } //---------------------------------------------------------------------- void DateFormat::parseObject(const UnicodeString& source, Formattable& result, ParsePosition& pos) const { result.setDate(parse(source, pos)); } //---------------------------------------------------------------------- DateFormat* U_EXPORT2 DateFormat::createTimeInstance(DateFormat::EStyle style, const Locale& aLocale) { return createDateTimeInstance(kNone, style, aLocale); } //---------------------------------------------------------------------- DateFormat* U_EXPORT2 DateFormat::createDateInstance(DateFormat::EStyle style, const Locale& aLocale) { return createDateTimeInstance(style, kNone, aLocale); } 
//---------------------------------------------------------------------- DateFormat* U_EXPORT2 DateFormat::createDateTimeInstance(EStyle dateStyle, EStyle timeStyle, const Locale& aLocale) { if(dateStyle != kNone) { dateStyle = (EStyle) (dateStyle + kDateOffset); } return create(timeStyle, dateStyle, aLocale); } //---------------------------------------------------------------------- DateFormat* U_EXPORT2 DateFormat::createInstance() { return createDateTimeInstance(kShort, kShort, Locale::getDefault()); } //---------------------------------------------------------------------- UnicodeString U_EXPORT2 DateFormat::getBestPattern( const Locale &locale, const UnicodeString &skeleton, UErrorCode &status) { UnifiedCache *cache = UnifiedCache::getInstance(status); if (U_FAILURE(status)) { return UnicodeString(); } DateFmtBestPatternKey key(locale, skeleton, status); const DateFmtBestPattern *patternPtr = nullptr; cache->get(key, patternPtr, status); if (U_FAILURE(status)) { return UnicodeString(); } UnicodeString result(patternPtr->fPattern); patternPtr->removeRef(); return result; } DateFormat* U_EXPORT2 DateFormat::createInstanceForSkeleton( Calendar *calendarToAdopt, const UnicodeString& skeleton, const Locale &locale, UErrorCode &status) { LocalPointer calendar(calendarToAdopt); if (U_FAILURE(status)) { return nullptr; } if (calendar.isNull()) { status = U_ILLEGAL_ARGUMENT_ERROR; return nullptr; } Locale localeWithCalendar = locale; localeWithCalendar.setKeywordValue("calendar", calendar->getType(), status); if (U_FAILURE(status)) { return nullptr; } DateFormat *result = createInstanceForSkeleton(skeleton, localeWithCalendar, status); if (U_FAILURE(status)) { return nullptr; } result->adoptCalendar(calendar.orphan()); return result; } DateFormat* U_EXPORT2 DateFormat::createInstanceForSkeleton( const UnicodeString& skeleton, const Locale &locale, UErrorCode &status) { if (U_FAILURE(status)) { return nullptr; } LocalPointer df( new SimpleDateFormat( 
getBestPattern(locale, skeleton, status), locale, status), status); return U_SUCCESS(status) ? df.orphan() : nullptr; } DateFormat* U_EXPORT2 DateFormat::createInstanceForSkeleton( const UnicodeString& skeleton, UErrorCode &status) { return createInstanceForSkeleton( skeleton, Locale::getDefault(), status); } //---------------------------------------------------------------------- DateFormat* U_EXPORT2 DateFormat::create(EStyle timeStyle, EStyle dateStyle, const Locale& locale) { UErrorCode status = U_ZERO_ERROR; #if U_PLATFORM_USES_ONLY_WIN32_API char buffer[8]; int32_t count = locale.getKeywordValue("compat", buffer, sizeof(buffer), status); // if the locale has "@compat=host", create a host-specific DateFormat... if (count > 0 && uprv_strcmp(buffer, "host") == 0) { Win32DateFormat *f = new Win32DateFormat(timeStyle, dateStyle, locale, status); if (U_SUCCESS(status)) { return f; } delete f; } #endif // is it relative? if(/*((timeStyle!=UDAT_NONE)&&(timeStyle & UDAT_RELATIVE)) || */((dateStyle!=kNone)&&((dateStyle-kDateOffset) & UDAT_RELATIVE))) { RelativeDateFormat *r = new RelativeDateFormat((UDateFormatStyle)timeStyle, (UDateFormatStyle)(dateStyle-kDateOffset), locale, status); if(U_SUCCESS(status)) return r; delete r; status = U_ZERO_ERROR; } // Try to create a SimpleDateFormat of the desired style. SimpleDateFormat *f = new SimpleDateFormat(timeStyle, dateStyle, locale, status); if (U_SUCCESS(status)) return f; delete f; // If that fails, try to create a format using the default pattern and // the DateFormatSymbols for this locale. status = U_ZERO_ERROR; f = new SimpleDateFormat(locale, status); if (U_SUCCESS(status)) return f; delete f; // This should never really happen, because the preceding constructor // should always succeed. If the resource data is unavailable, a last // resort object should be returned. 
return 0; } //---------------------------------------------------------------------- const Locale* U_EXPORT2 DateFormat::getAvailableLocales(int32_t& count) { // Get the list of installed locales. // Even if root has the correct date format for this locale, // it's still a valid locale (we don't worry about data fallbacks). return Locale::getAvailableLocales(count); } //---------------------------------------------------------------------- void DateFormat::adoptCalendar(Calendar* newCalendar) { delete fCalendar; fCalendar = newCalendar; } //---------------------------------------------------------------------- void DateFormat::setCalendar(const Calendar& newCalendar) { Calendar* newCalClone = newCalendar.clone(); if (newCalClone != nullptr) { adoptCalendar(newCalClone); } } //---------------------------------------------------------------------- const Calendar* DateFormat::getCalendar() const { return fCalendar; } //---------------------------------------------------------------------- void DateFormat::adoptNumberFormat(NumberFormat* newNumberFormat) { delete fNumberFormat; fNumberFormat = newNumberFormat; newNumberFormat->setParseIntegerOnly(true); newNumberFormat->setGroupingUsed(false); } //---------------------------------------------------------------------- void DateFormat::setNumberFormat(const NumberFormat& newNumberFormat) { NumberFormat* newNumFmtClone = newNumberFormat.clone(); if (newNumFmtClone != nullptr) { adoptNumberFormat(newNumFmtClone); } } //---------------------------------------------------------------------- const NumberFormat* DateFormat::getNumberFormat() const { return fNumberFormat; } //---------------------------------------------------------------------- void DateFormat::adoptTimeZone(TimeZone* zone) { if (fCalendar != nullptr) { fCalendar->adoptTimeZone(zone); } } //---------------------------------------------------------------------- void DateFormat::setTimeZone(const TimeZone& zone) { if (fCalendar != nullptr) { 
fCalendar->setTimeZone(zone); } } //---------------------------------------------------------------------- const TimeZone& DateFormat::getTimeZone() const { if (fCalendar != nullptr) { return fCalendar->getTimeZone(); } // If calendar doesn't exists, create default timezone. // fCalendar is rarely null return *(TimeZone::createDefault()); } //---------------------------------------------------------------------- void DateFormat::setLenient(UBool lenient) { if (fCalendar != nullptr) { fCalendar->setLenient(lenient); } UErrorCode status = U_ZERO_ERROR; setBooleanAttribute(UDAT_PARSE_ALLOW_WHITESPACE, lenient, status); setBooleanAttribute(UDAT_PARSE_ALLOW_NUMERIC, lenient, status); } //---------------------------------------------------------------------- UBool DateFormat::isLenient() const { UBool lenient = true; if (fCalendar != nullptr) { lenient = fCalendar->isLenient(); } UErrorCode status = U_ZERO_ERROR; return lenient && getBooleanAttribute(UDAT_PARSE_ALLOW_WHITESPACE, status) && getBooleanAttribute(UDAT_PARSE_ALLOW_NUMERIC, status); } void DateFormat::setCalendarLenient(UBool lenient) { if (fCalendar != nullptr) { fCalendar->setLenient(lenient); } } //---------------------------------------------------------------------- UBool DateFormat::isCalendarLenient() const { if (fCalendar != nullptr) { return fCalendar->isLenient(); } // fCalendar is rarely null return false; } //---------------------------------------------------------------------- void DateFormat::setContext(UDisplayContext value, UErrorCode& status) { if (U_FAILURE(status)) return; if ( (UDisplayContextType)((uint32_t)value >> 8) == UDISPCTX_TYPE_CAPITALIZATION ) { fCapitalizationContext = value; } else { status = U_ILLEGAL_ARGUMENT_ERROR; } } //---------------------------------------------------------------------- UDisplayContext DateFormat::getContext(UDisplayContextType type, UErrorCode& status) const { if (U_FAILURE(status)) return (UDisplayContext)0; if (type != UDISPCTX_TYPE_CAPITALIZATION) { 
status = U_ILLEGAL_ARGUMENT_ERROR; return (UDisplayContext)0; } return fCapitalizationContext; } //---------------------------------------------------------------------- DateFormat& DateFormat::setBooleanAttribute(UDateFormatBooleanAttribute attr, UBool newValue, UErrorCode &status) { if(!fBoolFlags.isValidValue(newValue)) { status = U_ILLEGAL_ARGUMENT_ERROR; } else { fBoolFlags.set(attr, newValue); } return *this; } //---------------------------------------------------------------------- UBool DateFormat::getBooleanAttribute(UDateFormatBooleanAttribute attr, UErrorCode &/*status*/) const { return static_cast(fBoolFlags.get(attr)); } U_NAMESPACE_END #endif /* #if !UCONFIG_NO_FORMATTING */ //eof stringi/src/icu74/i18n/collationsettings.h0000644000176200001440000002422414700200761020151 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ******************************************************************************* * Copyright (C) 2013-2015, International Business Machines * Corporation and others. All Rights Reserved. ******************************************************************************* * collationsettings.h * * created on: 2013feb07 * created by: Markus W. Scherer */ #ifndef __COLLATIONSETTINGS_H__ #define __COLLATIONSETTINGS_H__ #include "unicode/utypes.h" #if !UCONFIG_NO_COLLATION #include "unicode/ucol.h" #include "collation.h" #include "sharedobject.h" #include "umutex.h" U_NAMESPACE_BEGIN struct CollationData; /** * Collation settings/options/attributes. * These are the values that can be changed via API. */ struct U_I18N_API CollationSettings : public SharedObject { /** * Options bit 0: Perform the FCD check on the input text and deliver normalized text. */ static const int32_t CHECK_FCD = 1; /** * Options bit 1: Numeric collation. * Also known as CODAN = COllate Digits As Numbers. 
* * Treat digit sequences as numbers with CE sequences in numeric order, * rather than returning a normal CE for each digit. */ static const int32_t NUMERIC = 2; /** * "Shifted" alternate handling, see ALTERNATE_MASK. */ static const int32_t SHIFTED = 4; /** * Options bits 3..2: Alternate-handling mask. 0 for non-ignorable. * Reserve values 8 and 0xc for shift-trimmed and blanked. */ static const int32_t ALTERNATE_MASK = 0xc; /** * Options bits 6..4: The 3-bit maxVariable value bit field is shifted by this value. */ static const int32_t MAX_VARIABLE_SHIFT = 4; /** maxVariable options bit mask before shifting. */ static const int32_t MAX_VARIABLE_MASK = 0x70; /** Options bit 7: Reserved/unused/0. */ /** * Options bit 8: Sort uppercase first if caseLevel or caseFirst is on. */ static const int32_t UPPER_FIRST = 0x100; /** * Options bit 9: Keep the case bits in the tertiary weight (they trump other tertiary values) * unless case level is on (when they are *moved* into the separate case level). * By default, the case bits are removed from the tertiary weight (ignored). * * When CASE_FIRST is off, UPPER_FIRST must be off too, corresponding to * the tri-value UCOL_CASE_FIRST attribute: UCOL_OFF vs. UCOL_LOWER_FIRST vs. UCOL_UPPER_FIRST. */ static const int32_t CASE_FIRST = 0x200; /** * Options bit mask for caseFirst and upperFirst, before shifting. * Same value as caseFirst==upperFirst. */ static const int32_t CASE_FIRST_AND_UPPER_MASK = CASE_FIRST | UPPER_FIRST; /** * Options bit 10: Insert the case level between the secondary and tertiary levels. */ static const int32_t CASE_LEVEL = 0x400; /** * Options bit 11: Compare secondary weights backwards. ("French secondary") */ static const int32_t BACKWARD_SECONDARY = 0x800; /** * Options bits 15..12: The 4-bit strength value bit field is shifted by this value. * It is the top used bit field in the options. (No need to mask after shifting.) 
*/ static const int32_t STRENGTH_SHIFT = 12; /** Strength options bit mask before shifting. */ static const int32_t STRENGTH_MASK = 0xf000; /** maxVariable values */ enum MaxVariable { MAX_VAR_SPACE, MAX_VAR_PUNCT, MAX_VAR_SYMBOL, MAX_VAR_CURRENCY }; CollationSettings() : options((UCOL_DEFAULT_STRENGTH << STRENGTH_SHIFT) | (MAX_VAR_PUNCT << MAX_VARIABLE_SHIFT)), variableTop(0), reorderTable(nullptr), minHighNoReorder(0), reorderRanges(nullptr), reorderRangesLength(0), reorderCodes(nullptr), reorderCodesLength(0), reorderCodesCapacity(0), fastLatinOptions(-1) {} CollationSettings(const CollationSettings &other); virtual ~CollationSettings(); bool operator==(const CollationSettings &other) const; inline bool operator!=(const CollationSettings &other) const { return !operator==(other); } int32_t hashCode() const; void resetReordering(); void aliasReordering(const CollationData &data, const int32_t *codes, int32_t length, const uint32_t *ranges, int32_t rangesLength, const uint8_t *table, UErrorCode &errorCode); void setReordering(const CollationData &data, const int32_t *codes, int32_t codesLength, UErrorCode &errorCode); void copyReorderingFrom(const CollationSettings &other, UErrorCode &errorCode); inline UBool hasReordering() const { return reorderTable != nullptr; } static UBool reorderTableHasSplitBytes(const uint8_t table[256]); inline uint32_t reorder(uint32_t p) const { uint8_t b = reorderTable[p >> 24]; if(b != 0 || p <= Collation::NO_CE_PRIMARY) { return ((uint32_t)b << 24) | (p & 0xffffff); } else { return reorderEx(p); } } void setStrength(int32_t value, int32_t defaultOptions, UErrorCode &errorCode); static int32_t getStrength(int32_t options) { return options >> STRENGTH_SHIFT; } int32_t getStrength() const { return getStrength(options); } /** Sets the options bit for an on/off attribute. 
*/ void setFlag(int32_t bit, UColAttributeValue value, int32_t defaultOptions, UErrorCode &errorCode); UColAttributeValue getFlag(int32_t bit) const { return ((options & bit) != 0) ? UCOL_ON : UCOL_OFF; } void setCaseFirst(UColAttributeValue value, int32_t defaultOptions, UErrorCode &errorCode); UColAttributeValue getCaseFirst() const { int32_t option = options & CASE_FIRST_AND_UPPER_MASK; return (option == 0) ? UCOL_OFF : (option == CASE_FIRST) ? UCOL_LOWER_FIRST : UCOL_UPPER_FIRST; } void setAlternateHandling(UColAttributeValue value, int32_t defaultOptions, UErrorCode &errorCode); UColAttributeValue getAlternateHandling() const { return ((options & ALTERNATE_MASK) == 0) ? UCOL_NON_IGNORABLE : UCOL_SHIFTED; } void setMaxVariable(int32_t value, int32_t defaultOptions, UErrorCode &errorCode); MaxVariable getMaxVariable() const { return (MaxVariable)((options & MAX_VARIABLE_MASK) >> MAX_VARIABLE_SHIFT); } /** * Include case bits in the tertiary level if caseLevel=off and caseFirst!=off. */ static inline UBool isTertiaryWithCaseBits(int32_t options) { return (options & (CASE_LEVEL | CASE_FIRST)) == CASE_FIRST; } static uint32_t getTertiaryMask(int32_t options) { // Remove the case bits from the tertiary weight when caseLevel is on or caseFirst is off. return isTertiaryWithCaseBits(options) ? Collation::CASE_AND_TERTIARY_MASK : Collation::ONLY_TERTIARY_MASK; } static UBool sortsTertiaryUpperCaseFirst(int32_t options) { // On tertiary level, consider case bits and sort uppercase first // if caseLevel is off and caseFirst==upperFirst. return (options & (CASE_LEVEL | CASE_FIRST_AND_UPPER_MASK)) == CASE_FIRST_AND_UPPER_MASK; } inline UBool dontCheckFCD() const { return (options & CHECK_FCD) == 0; } inline UBool hasBackwardSecondary() const { return (options & BACKWARD_SECONDARY) != 0; } inline UBool isNumeric() const { return (options & NUMERIC) != 0; } /** CHECK_FCD etc. */ int32_t options; /** Variable-top primary weight. 
*/ uint32_t variableTop; /** * 256-byte table for reordering permutation of primary lead bytes; nullptr if no reordering. * A 0 entry at a non-zero index means that the primary lead byte is "split" * (there are different offsets for primaries that share that lead byte) * and the reordering offset must be determined via the reorderRanges. */ const uint8_t *reorderTable; /** Limit of last reordered range. 0 if no reordering or no split bytes. */ uint32_t minHighNoReorder; /** * Primary-weight ranges for script reordering, * to be used by reorder(p) for split-reordered primary lead bytes. * * Each entry is a (limit, offset) pair. * The upper 16 bits of the entry are the upper 16 bits of the * exclusive primary limit of a range. * Primaries between the previous limit and this one have their lead bytes * modified by the signed offset (-0xff..+0xff) stored in the lower 16 bits. * * CollationData::makeReorderRanges() writes a full list where the first range * (at least for terminators and separators) has a 0 offset. * The last range has a non-zero offset. * minHighNoReorder is set to the limit of that last range. * * In the settings object, the initial ranges before the first split lead byte * are omitted for efficiency; they are handled by reorder(p) via the reorderTable. * If there are no split-reordered lead bytes, then no ranges are needed. */ const uint32_t *reorderRanges; int32_t reorderRangesLength; /** Array of reorder codes; ignored if reorderCodesLength == 0. */ const int32_t *reorderCodes; /** Number of reorder codes; 0 if no reordering. */ int32_t reorderCodesLength; /** * Capacity of reorderCodes. * If 0, then the codes, the ranges, and the table are aliases. * Otherwise, this object owns the memory via the reorderCodes pointer; * the codes, the ranges, and the table are in the same memory block, in that order. */ int32_t reorderCodesCapacity; /** Options for CollationFastLatin. Negative if disabled. 
*/ int32_t fastLatinOptions; uint16_t fastLatinPrimaries[0x180]; private: void setReorderArrays(const int32_t *codes, int32_t codesLength, const uint32_t *ranges, int32_t rangesLength, const uint8_t *table, UErrorCode &errorCode); uint32_t reorderEx(uint32_t p) const; }; U_NAMESPACE_END #endif // !UCONFIG_NO_COLLATION #endif // __COLLATIONSETTINGS_H__ stringi/src/icu74/i18n/collationfastlatin.cpp0000644000176200001440000012344114700200761020632 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ******************************************************************************* * Copyright (C) 2013-2015, International Business Machines * Corporation and others. All Rights Reserved. ******************************************************************************* * collationfastlatin.cpp * * created on: 2013aug18 * created by: Markus W. Scherer */ #include "unicode/utypes.h" #if !UCONFIG_NO_COLLATION #include "unicode/ucol.h" #include "collationdata.h" #include "collationfastlatin.h" #include "collationsettings.h" #include "uassert.h" U_NAMESPACE_BEGIN int32_t CollationFastLatin::getOptions(const CollationData *data, const CollationSettings &settings, uint16_t *primaries, int32_t capacity) { const uint16_t *table = data->fastLatinTable; if(table == nullptr) { return -1; } U_ASSERT(capacity == LATIN_LIMIT); if(capacity != LATIN_LIMIT) { return -1; } uint32_t miniVarTop; if((settings.options & CollationSettings::ALTERNATE_MASK) == 0) { // No mini primaries are variable, set a variableTop just below the // lowest long mini primary. 
miniVarTop = MIN_LONG - 1; } else { int32_t headerLength = *table & 0xff; int32_t i = 1 + settings.getMaxVariable(); if(i >= headerLength) { return -1; // variableTop >= digits, should not occur } miniVarTop = table[i]; } UBool digitsAreReordered = false; if(settings.hasReordering()) { uint32_t prevStart = 0; uint32_t beforeDigitStart = 0; uint32_t digitStart = 0; uint32_t afterDigitStart = 0; for(int32_t group = UCOL_REORDER_CODE_FIRST; group < UCOL_REORDER_CODE_FIRST + CollationData::MAX_NUM_SPECIAL_REORDER_CODES; ++group) { uint32_t start = data->getFirstPrimaryForGroup(group); start = settings.reorder(start); if(group == UCOL_REORDER_CODE_DIGIT) { beforeDigitStart = prevStart; digitStart = start; } else if(start != 0) { if(start < prevStart) { // The permutation affects the groups up to Latin. return -1; } // In the future, there might be a special group between digits & Latin. if(digitStart != 0 && afterDigitStart == 0 && prevStart == beforeDigitStart) { afterDigitStart = start; } prevStart = start; } } uint32_t latinStart = data->getFirstPrimaryForGroup(USCRIPT_LATIN); latinStart = settings.reorder(latinStart); if(latinStart < prevStart) { return -1; } if(afterDigitStart == 0) { afterDigitStart = latinStart; } if(!(beforeDigitStart < digitStart && digitStart < afterDigitStart)) { digitsAreReordered = true; } } table += (table[0] & 0xff); // skip the header for(UChar32 c = 0; c < LATIN_LIMIT; ++c) { uint32_t p = table[c]; if(p >= MIN_SHORT) { p &= SHORT_PRIMARY_MASK; } else if(p > miniVarTop) { p &= LONG_PRIMARY_MASK; } else { p = 0; } primaries[c] = (uint16_t)p; } if(digitsAreReordered || (settings.options & CollationSettings::NUMERIC) != 0) { // Bail out for digits. for(UChar32 c = 0x30; c <= 0x39; ++c) { primaries[c] = 0; } } // Shift the miniVarTop above other options. 
return ((int32_t)miniVarTop << 16) | settings.options; } int32_t CollationFastLatin::compareUTF16(const uint16_t *table, const uint16_t *primaries, int32_t options, const char16_t *left, int32_t leftLength, const char16_t *right, int32_t rightLength) { // This is a modified copy of CollationCompare::compareUpToQuaternary(), // optimized for common Latin text. // Keep them in sync! // Keep compareUTF16() and compareUTF8() in sync very closely! U_ASSERT((table[0] >> 8) == VERSION); table += (table[0] & 0xff); // skip the header uint32_t variableTop = (uint32_t)options >> 16; // see getOptions() options &= 0xffff; // needed for CollationSettings::getStrength() to work // Check for supported characters, fetch mini CEs, and compare primaries. int32_t leftIndex = 0, rightIndex = 0; /** * Single mini CE or a pair. * The current mini CE is in the lower 16 bits, the next one is in the upper 16 bits. * If there is only one, then it is in the lower bits, and the upper bits are 0. */ uint32_t leftPair = 0, rightPair = 0; for(;;) { // We fetch CEs until we get a non-ignorable primary or reach the end. 
while(leftPair == 0) { if(leftIndex == leftLength) { leftPair = EOS; break; } UChar32 c = left[leftIndex++]; if(c <= LATIN_MAX) { leftPair = primaries[c]; if(leftPair != 0) { break; } if(c <= 0x39 && c >= 0x30 && (options & CollationSettings::NUMERIC) != 0) { return BAIL_OUT_RESULT; } leftPair = table[c]; } else if(PUNCT_START <= c && c < PUNCT_LIMIT) { leftPair = table[c - PUNCT_START + LATIN_LIMIT]; } else { leftPair = lookup(table, c); } if(leftPair >= MIN_SHORT) { leftPair &= SHORT_PRIMARY_MASK; break; } else if(leftPair > variableTop) { leftPair &= LONG_PRIMARY_MASK; break; } else { leftPair = nextPair(table, c, leftPair, left, nullptr, leftIndex, leftLength); if(leftPair == BAIL_OUT) { return BAIL_OUT_RESULT; } leftPair = getPrimaries(variableTop, leftPair); } } while(rightPair == 0) { if(rightIndex == rightLength) { rightPair = EOS; break; } UChar32 c = right[rightIndex++]; if(c <= LATIN_MAX) { rightPair = primaries[c]; if(rightPair != 0) { break; } if(c <= 0x39 && c >= 0x30 && (options & CollationSettings::NUMERIC) != 0) { return BAIL_OUT_RESULT; } rightPair = table[c]; } else if(PUNCT_START <= c && c < PUNCT_LIMIT) { rightPair = table[c - PUNCT_START + LATIN_LIMIT]; } else { rightPair = lookup(table, c); } if(rightPair >= MIN_SHORT) { rightPair &= SHORT_PRIMARY_MASK; break; } else if(rightPair > variableTop) { rightPair &= LONG_PRIMARY_MASK; break; } else { rightPair = nextPair(table, c, rightPair, right, nullptr, rightIndex, rightLength); if(rightPair == BAIL_OUT) { return BAIL_OUT_RESULT; } rightPair = getPrimaries(variableTop, rightPair); } } if(leftPair == rightPair) { if(leftPair == EOS) { break; } leftPair = rightPair = 0; continue; } uint32_t leftPrimary = leftPair & 0xffff; uint32_t rightPrimary = rightPair & 0xffff; if(leftPrimary != rightPrimary) { // Return the primary difference. return (leftPrimary < rightPrimary) ? 
UCOL_LESS : UCOL_GREATER; } if(leftPair == EOS) { break; } leftPair >>= 16; rightPair >>= 16; } // In the following, we need to re-fetch each character because we did not buffer the CEs, // but we know that the string is well-formed and // only contains supported characters and mappings. // We might skip the secondary level but continue with the case level // which is turned on separately. if(CollationSettings::getStrength(options) >= UCOL_SECONDARY) { leftIndex = rightIndex = 0; leftPair = rightPair = 0; for(;;) { while(leftPair == 0) { if(leftIndex == leftLength) { leftPair = EOS; break; } UChar32 c = left[leftIndex++]; if(c <= LATIN_MAX) { leftPair = table[c]; } else if(PUNCT_START <= c && c < PUNCT_LIMIT) { leftPair = table[c - PUNCT_START + LATIN_LIMIT]; } else { leftPair = lookup(table, c); } if(leftPair >= MIN_SHORT) { leftPair = getSecondariesFromOneShortCE(leftPair); break; } else if(leftPair > variableTop) { leftPair = COMMON_SEC_PLUS_OFFSET; break; } else { leftPair = nextPair(table, c, leftPair, left, nullptr, leftIndex, leftLength); leftPair = getSecondaries(variableTop, leftPair); } } while(rightPair == 0) { if(rightIndex == rightLength) { rightPair = EOS; break; } UChar32 c = right[rightIndex++]; if(c <= LATIN_MAX) { rightPair = table[c]; } else if(PUNCT_START <= c && c < PUNCT_LIMIT) { rightPair = table[c - PUNCT_START + LATIN_LIMIT]; } else { rightPair = lookup(table, c); } if(rightPair >= MIN_SHORT) { rightPair = getSecondariesFromOneShortCE(rightPair); break; } else if(rightPair > variableTop) { rightPair = COMMON_SEC_PLUS_OFFSET; break; } else { rightPair = nextPair(table, c, rightPair, right, nullptr, rightIndex, rightLength); rightPair = getSecondaries(variableTop, rightPair); } } if(leftPair == rightPair) { if(leftPair == EOS) { break; } leftPair = rightPair = 0; continue; } uint32_t leftSecondary = leftPair & 0xffff; uint32_t rightSecondary = rightPair & 0xffff; if(leftSecondary != rightSecondary) { if((options & 
CollationSettings::BACKWARD_SECONDARY) != 0) { // Full support for backwards secondary requires backwards contraction matching // and moving backwards between merge separators. return BAIL_OUT_RESULT; } return (leftSecondary < rightSecondary) ? UCOL_LESS : UCOL_GREATER; } if(leftPair == EOS) { break; } leftPair >>= 16; rightPair >>= 16; } } if((options & CollationSettings::CASE_LEVEL) != 0) { UBool strengthIsPrimary = CollationSettings::getStrength(options) == UCOL_PRIMARY; leftIndex = rightIndex = 0; leftPair = rightPair = 0; for(;;) { while(leftPair == 0) { if(leftIndex == leftLength) { leftPair = EOS; break; } UChar32 c = left[leftIndex++]; leftPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c); if(leftPair < MIN_LONG) { leftPair = nextPair(table, c, leftPair, left, nullptr, leftIndex, leftLength); } leftPair = getCases(variableTop, strengthIsPrimary, leftPair); } while(rightPair == 0) { if(rightIndex == rightLength) { rightPair = EOS; break; } UChar32 c = right[rightIndex++]; rightPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c); if(rightPair < MIN_LONG) { rightPair = nextPair(table, c, rightPair, right, nullptr, rightIndex, rightLength); } rightPair = getCases(variableTop, strengthIsPrimary, rightPair); } if(leftPair == rightPair) { if(leftPair == EOS) { break; } leftPair = rightPair = 0; continue; } uint32_t leftCase = leftPair & 0xffff; uint32_t rightCase = rightPair & 0xffff; if(leftCase != rightCase) { if((options & CollationSettings::UPPER_FIRST) == 0) { return (leftCase < rightCase) ? UCOL_LESS : UCOL_GREATER; } else { return (leftCase < rightCase) ? UCOL_GREATER : UCOL_LESS; } } if(leftPair == EOS) { break; } leftPair >>= 16; rightPair >>= 16; } } if(CollationSettings::getStrength(options) <= UCOL_SECONDARY) { return UCOL_EQUAL; } // Remove the case bits from the tertiary weight when caseLevel is on or caseFirst is off. 
UBool withCaseBits = CollationSettings::isTertiaryWithCaseBits(options); leftIndex = rightIndex = 0; leftPair = rightPair = 0; for(;;) { while(leftPair == 0) { if(leftIndex == leftLength) { leftPair = EOS; break; } UChar32 c = left[leftIndex++]; leftPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c); if(leftPair < MIN_LONG) { leftPair = nextPair(table, c, leftPair, left, nullptr, leftIndex, leftLength); } leftPair = getTertiaries(variableTop, withCaseBits, leftPair); } while(rightPair == 0) { if(rightIndex == rightLength) { rightPair = EOS; break; } UChar32 c = right[rightIndex++]; rightPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c); if(rightPair < MIN_LONG) { rightPair = nextPair(table, c, rightPair, right, nullptr, rightIndex, rightLength); } rightPair = getTertiaries(variableTop, withCaseBits, rightPair); } if(leftPair == rightPair) { if(leftPair == EOS) { break; } leftPair = rightPair = 0; continue; } uint32_t leftTertiary = leftPair & 0xffff; uint32_t rightTertiary = rightPair & 0xffff; if(leftTertiary != rightTertiary) { if(CollationSettings::sortsTertiaryUpperCaseFirst(options)) { // Pass through EOS and MERGE_WEIGHT // and keep real tertiary weights larger than the MERGE_WEIGHT. // Tertiary CEs (secondary ignorables) are not supported in fast Latin. if(leftTertiary > MERGE_WEIGHT) { leftTertiary ^= CASE_MASK; } if(rightTertiary > MERGE_WEIGHT) { rightTertiary ^= CASE_MASK; } } return (leftTertiary < rightTertiary) ? UCOL_LESS : UCOL_GREATER; } if(leftPair == EOS) { break; } leftPair >>= 16; rightPair >>= 16; } if(CollationSettings::getStrength(options) <= UCOL_TERTIARY) { return UCOL_EQUAL; } leftIndex = rightIndex = 0; leftPair = rightPair = 0; for(;;) { while(leftPair == 0) { if(leftIndex == leftLength) { leftPair = EOS; break; } UChar32 c = left[leftIndex++]; leftPair = (c <= LATIN_MAX) ? 
table[c] : lookup(table, c); if(leftPair < MIN_LONG) { leftPair = nextPair(table, c, leftPair, left, nullptr, leftIndex, leftLength); } leftPair = getQuaternaries(variableTop, leftPair); } while(rightPair == 0) { if(rightIndex == rightLength) { rightPair = EOS; break; } UChar32 c = right[rightIndex++]; rightPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c); if(rightPair < MIN_LONG) { rightPair = nextPair(table, c, rightPair, right, nullptr, rightIndex, rightLength); } rightPair = getQuaternaries(variableTop, rightPair); } if(leftPair == rightPair) { if(leftPair == EOS) { break; } leftPair = rightPair = 0; continue; } uint32_t leftQuaternary = leftPair & 0xffff; uint32_t rightQuaternary = rightPair & 0xffff; if(leftQuaternary != rightQuaternary) { return (leftQuaternary < rightQuaternary) ? UCOL_LESS : UCOL_GREATER; } if(leftPair == EOS) { break; } leftPair >>= 16; rightPair >>= 16; } return UCOL_EQUAL; } int32_t CollationFastLatin::compareUTF8(const uint16_t *table, const uint16_t *primaries, int32_t options, const uint8_t *left, int32_t leftLength, const uint8_t *right, int32_t rightLength) { // Keep compareUTF16() and compareUTF8() in sync very closely! U_ASSERT((table[0] >> 8) == VERSION); table += (table[0] & 0xff); // skip the header uint32_t variableTop = (uint32_t)options >> 16; // see RuleBasedCollator::getFastLatinOptions() options &= 0xffff; // needed for CollationSettings::getStrength() to work // Check for supported characters, fetch mini CEs, and compare primaries. int32_t leftIndex = 0, rightIndex = 0; /** * Single mini CE or a pair. * The current mini CE is in the lower 16 bits, the next one is in the upper 16 bits. * If there is only one, then it is in the lower bits, and the upper bits are 0. */ uint32_t leftPair = 0, rightPair = 0; // Note: There is no need to assemble the code point. // We only need to look up the table entry for the character, // and nextPair() looks for whether c==0. 
for(;;) { // We fetch CEs until we get a non-ignorable primary or reach the end. while(leftPair == 0) { if(leftIndex == leftLength) { leftPair = EOS; break; } UChar32 c = left[leftIndex++]; uint8_t t; if(c <= 0x7f) { leftPair = primaries[c]; if(leftPair != 0) { break; } if(c <= 0x39 && c >= 0x30 && (options & CollationSettings::NUMERIC) != 0) { return BAIL_OUT_RESULT; } leftPair = table[c]; } else if(c <= LATIN_MAX_UTF8_LEAD && 0xc2 <= c && leftIndex != leftLength && 0x80 <= (t = left[leftIndex]) && t <= 0xbf) { ++leftIndex; c = ((c - 0xc2) << 6) + t; leftPair = primaries[c]; if(leftPair != 0) { break; } leftPair = table[c]; } else { leftPair = lookupUTF8(table, c, left, leftIndex, leftLength); } if(leftPair >= MIN_SHORT) { leftPair &= SHORT_PRIMARY_MASK; break; } else if(leftPair > variableTop) { leftPair &= LONG_PRIMARY_MASK; break; } else { leftPair = nextPair(table, c, leftPair, nullptr, left, leftIndex, leftLength); if(leftPair == BAIL_OUT) { return BAIL_OUT_RESULT; } leftPair = getPrimaries(variableTop, leftPair); } } while(rightPair == 0) { if(rightIndex == rightLength) { rightPair = EOS; break; } UChar32 c = right[rightIndex++]; uint8_t t; if(c <= 0x7f) { rightPair = primaries[c]; if(rightPair != 0) { break; } if(c <= 0x39 && c >= 0x30 && (options & CollationSettings::NUMERIC) != 0) { return BAIL_OUT_RESULT; } rightPair = table[c]; } else if(c <= LATIN_MAX_UTF8_LEAD && 0xc2 <= c && rightIndex != rightLength && 0x80 <= (t = right[rightIndex]) && t <= 0xbf) { ++rightIndex; c = ((c - 0xc2) << 6) + t; rightPair = primaries[c]; if(rightPair != 0) { break; } rightPair = table[c]; } else { rightPair = lookupUTF8(table, c, right, rightIndex, rightLength); } if(rightPair >= MIN_SHORT) { rightPair &= SHORT_PRIMARY_MASK; break; } else if(rightPair > variableTop) { rightPair &= LONG_PRIMARY_MASK; break; } else { rightPair = nextPair(table, c, rightPair, nullptr, right, rightIndex, rightLength); if(rightPair == BAIL_OUT) { return BAIL_OUT_RESULT; } rightPair = 
getPrimaries(variableTop, rightPair); } } if(leftPair == rightPair) { if(leftPair == EOS) { break; } leftPair = rightPair = 0; continue; } uint32_t leftPrimary = leftPair & 0xffff; uint32_t rightPrimary = rightPair & 0xffff; if(leftPrimary != rightPrimary) { // Return the primary difference. return (leftPrimary < rightPrimary) ? UCOL_LESS : UCOL_GREATER; } if(leftPair == EOS) { break; } leftPair >>= 16; rightPair >>= 16; } // In the following, we need to re-fetch each character because we did not buffer the CEs, // but we know that the string is well-formed and // only contains supported characters and mappings. // We might skip the secondary level but continue with the case level // which is turned on separately. if(CollationSettings::getStrength(options) >= UCOL_SECONDARY) { leftIndex = rightIndex = 0; leftPair = rightPair = 0; for(;;) { while(leftPair == 0) { if(leftIndex == leftLength) { leftPair = EOS; break; } UChar32 c = left[leftIndex++]; if(c <= 0x7f) { leftPair = table[c]; } else if(c <= LATIN_MAX_UTF8_LEAD) { leftPair = table[((c - 0xc2) << 6) + left[leftIndex++]]; } else { leftPair = lookupUTF8Unsafe(table, c, left, leftIndex); } if(leftPair >= MIN_SHORT) { leftPair = getSecondariesFromOneShortCE(leftPair); break; } else if(leftPair > variableTop) { leftPair = COMMON_SEC_PLUS_OFFSET; break; } else { leftPair = nextPair(table, c, leftPair, nullptr, left, leftIndex, leftLength); leftPair = getSecondaries(variableTop, leftPair); } } while(rightPair == 0) { if(rightIndex == rightLength) { rightPair = EOS; break; } UChar32 c = right[rightIndex++]; if(c <= 0x7f) { rightPair = table[c]; } else if(c <= LATIN_MAX_UTF8_LEAD) { rightPair = table[((c - 0xc2) << 6) + right[rightIndex++]]; } else { rightPair = lookupUTF8Unsafe(table, c, right, rightIndex); } if(rightPair >= MIN_SHORT) { rightPair = getSecondariesFromOneShortCE(rightPair); break; } else if(rightPair > variableTop) { rightPair = COMMON_SEC_PLUS_OFFSET; break; } else { rightPair = nextPair(table, c, 
rightPair, nullptr, right, rightIndex, rightLength); rightPair = getSecondaries(variableTop, rightPair); } } if(leftPair == rightPair) { if(leftPair == EOS) { break; } leftPair = rightPair = 0; continue; } uint32_t leftSecondary = leftPair & 0xffff; uint32_t rightSecondary = rightPair & 0xffff; if(leftSecondary != rightSecondary) { if((options & CollationSettings::BACKWARD_SECONDARY) != 0) { // Full support for backwards secondary requires backwards contraction matching // and moving backwards between merge separators. return BAIL_OUT_RESULT; } return (leftSecondary < rightSecondary) ? UCOL_LESS : UCOL_GREATER; } if(leftPair == EOS) { break; } leftPair >>= 16; rightPair >>= 16; } } if((options & CollationSettings::CASE_LEVEL) != 0) { UBool strengthIsPrimary = CollationSettings::getStrength(options) == UCOL_PRIMARY; leftIndex = rightIndex = 0; leftPair = rightPair = 0; for(;;) { while(leftPair == 0) { if(leftIndex == leftLength) { leftPair = EOS; break; } UChar32 c = left[leftIndex++]; leftPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, left, leftIndex); if(leftPair < MIN_LONG) { leftPair = nextPair(table, c, leftPair, nullptr, left, leftIndex, leftLength); } leftPair = getCases(variableTop, strengthIsPrimary, leftPair); } while(rightPair == 0) { if(rightIndex == rightLength) { rightPair = EOS; break; } UChar32 c = right[rightIndex++]; rightPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, right, rightIndex); if(rightPair < MIN_LONG) { rightPair = nextPair(table, c, rightPair, nullptr, right, rightIndex, rightLength); } rightPair = getCases(variableTop, strengthIsPrimary, rightPair); } if(leftPair == rightPair) { if(leftPair == EOS) { break; } leftPair = rightPair = 0; continue; } uint32_t leftCase = leftPair & 0xffff; uint32_t rightCase = rightPair & 0xffff; if(leftCase != rightCase) { if((options & CollationSettings::UPPER_FIRST) == 0) { return (leftCase < rightCase) ? UCOL_LESS : UCOL_GREATER; } else { return (leftCase < rightCase) ? 
UCOL_GREATER : UCOL_LESS; } } if(leftPair == EOS) { break; } leftPair >>= 16; rightPair >>= 16; } } if(CollationSettings::getStrength(options) <= UCOL_SECONDARY) { return UCOL_EQUAL; } // Remove the case bits from the tertiary weight when caseLevel is on or caseFirst is off. UBool withCaseBits = CollationSettings::isTertiaryWithCaseBits(options); leftIndex = rightIndex = 0; leftPair = rightPair = 0; for(;;) { while(leftPair == 0) { if(leftIndex == leftLength) { leftPair = EOS; break; } UChar32 c = left[leftIndex++]; leftPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, left, leftIndex); if(leftPair < MIN_LONG) { leftPair = nextPair(table, c, leftPair, nullptr, left, leftIndex, leftLength); } leftPair = getTertiaries(variableTop, withCaseBits, leftPair); } while(rightPair == 0) { if(rightIndex == rightLength) { rightPair = EOS; break; } UChar32 c = right[rightIndex++]; rightPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, right, rightIndex); if(rightPair < MIN_LONG) { rightPair = nextPair(table, c, rightPair, nullptr, right, rightIndex, rightLength); } rightPair = getTertiaries(variableTop, withCaseBits, rightPair); } if(leftPair == rightPair) { if(leftPair == EOS) { break; } leftPair = rightPair = 0; continue; } uint32_t leftTertiary = leftPair & 0xffff; uint32_t rightTertiary = rightPair & 0xffff; if(leftTertiary != rightTertiary) { if(CollationSettings::sortsTertiaryUpperCaseFirst(options)) { // Pass through EOS and MERGE_WEIGHT // and keep real tertiary weights larger than the MERGE_WEIGHT. // Tertiary CEs (secondary ignorables) are not supported in fast Latin. if(leftTertiary > MERGE_WEIGHT) { leftTertiary ^= CASE_MASK; } if(rightTertiary > MERGE_WEIGHT) { rightTertiary ^= CASE_MASK; } } return (leftTertiary < rightTertiary) ? 
UCOL_LESS : UCOL_GREATER; } if(leftPair == EOS) { break; } leftPair >>= 16; rightPair >>= 16; } if(CollationSettings::getStrength(options) <= UCOL_TERTIARY) { return UCOL_EQUAL; } leftIndex = rightIndex = 0; leftPair = rightPair = 0; for(;;) { while(leftPair == 0) { if(leftIndex == leftLength) { leftPair = EOS; break; } UChar32 c = left[leftIndex++]; leftPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, left, leftIndex); if(leftPair < MIN_LONG) { leftPair = nextPair(table, c, leftPair, nullptr, left, leftIndex, leftLength); } leftPair = getQuaternaries(variableTop, leftPair); } while(rightPair == 0) { if(rightIndex == rightLength) { rightPair = EOS; break; } UChar32 c = right[rightIndex++]; rightPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, right, rightIndex); if(rightPair < MIN_LONG) { rightPair = nextPair(table, c, rightPair, nullptr, right, rightIndex, rightLength); } rightPair = getQuaternaries(variableTop, rightPair); } if(leftPair == rightPair) { if(leftPair == EOS) { break; } leftPair = rightPair = 0; continue; } uint32_t leftQuaternary = leftPair & 0xffff; uint32_t rightQuaternary = rightPair & 0xffff; if(leftQuaternary != rightQuaternary) { return (leftQuaternary < rightQuaternary) ? UCOL_LESS : UCOL_GREATER; } if(leftPair == EOS) { break; } leftPair >>= 16; rightPair >>= 16; } return UCOL_EQUAL; } uint32_t CollationFastLatin::lookup(const uint16_t *table, UChar32 c) { U_ASSERT(c > LATIN_MAX); if(PUNCT_START <= c && c < PUNCT_LIMIT) { return table[c - PUNCT_START + LATIN_LIMIT]; } else if(c == 0xfffe) { return MERGE_WEIGHT; } else if(c == 0xffff) { return MAX_SHORT | COMMON_SEC | LOWER_CASE | COMMON_TER; } else { return BAIL_OUT; } } uint32_t CollationFastLatin::lookupUTF8(const uint16_t *table, UChar32 c, const uint8_t *s8, int32_t &sIndex, int32_t sLength) { // The caller handled ASCII and valid/supported Latin. 
U_ASSERT(c > 0x7f); int32_t i2 = sIndex + 1; if(i2 < sLength || sLength < 0) { uint8_t t1 = s8[sIndex]; uint8_t t2 = s8[i2]; sIndex += 2; if(c == 0xe2 && t1 == 0x80 && 0x80 <= t2 && t2 <= 0xbf) { return table[(LATIN_LIMIT - 0x80) + t2]; // 2000..203F -> 0180..01BF } else if(c == 0xef && t1 == 0xbf) { if(t2 == 0xbe) { return MERGE_WEIGHT; // U+FFFE } else if(t2 == 0xbf) { return MAX_SHORT | COMMON_SEC | LOWER_CASE | COMMON_TER; // U+FFFF } } } return BAIL_OUT; } uint32_t CollationFastLatin::lookupUTF8Unsafe(const uint16_t *table, UChar32 c, const uint8_t *s8, int32_t &sIndex) { // The caller handled ASCII. // The string is well-formed and contains only supported characters. U_ASSERT(c > 0x7f); if(c <= LATIN_MAX_UTF8_LEAD) { return table[((c - 0xc2) << 6) + s8[sIndex++]]; // 0080..017F } uint8_t t2 = s8[sIndex + 1]; sIndex += 2; if(c == 0xe2) { return table[(LATIN_LIMIT - 0x80) + t2]; // 2000..203F -> 0180..01BF } else if(t2 == 0xbe) { return MERGE_WEIGHT; // U+FFFE } else { return MAX_SHORT | COMMON_SEC | LOWER_CASE | COMMON_TER; // U+FFFF } } uint32_t CollationFastLatin::nextPair(const uint16_t *table, UChar32 c, uint32_t ce, const char16_t *s16, const uint8_t *s8, int32_t &sIndex, int32_t &sLength) { if(ce >= MIN_LONG || ce < CONTRACTION) { return ce; // simple or special mini CE } else if(ce >= EXPANSION) { int32_t index = NUM_FAST_CHARS + (ce & INDEX_MASK); return ((uint32_t)table[index + 1] << 16) | table[index]; } else /* ce >= CONTRACTION */ { if(c == 0 && sLength < 0) { sLength = sIndex - 1; return EOS; } // Contraction list: Default mapping followed by // 0 or more single-character contraction suffix mappings. int32_t index = NUM_FAST_CHARS + (ce & INDEX_MASK); if(sIndex != sLength) { // Read the next character. 
int32_t c2; int32_t nextIndex = sIndex; if(s16 != nullptr) { c2 = s16[nextIndex++]; if(c2 > LATIN_MAX) { if(PUNCT_START <= c2 && c2 < PUNCT_LIMIT) { c2 = c2 - PUNCT_START + LATIN_LIMIT; // 2000..203F -> 0180..01BF } else if(c2 == 0xfffe || c2 == 0xffff) { c2 = -1; // U+FFFE & U+FFFF cannot occur in contractions. } else { return BAIL_OUT; } } } else { c2 = s8[nextIndex++]; if(c2 > 0x7f) { uint8_t t; if(c2 <= 0xc5 && 0xc2 <= c2 && nextIndex != sLength && 0x80 <= (t = s8[nextIndex]) && t <= 0xbf) { c2 = ((c2 - 0xc2) << 6) + t; // 0080..017F ++nextIndex; } else { int32_t i2 = nextIndex + 1; if(i2 < sLength || sLength < 0) { if(c2 == 0xe2 && s8[nextIndex] == 0x80 && 0x80 <= (t = s8[i2]) && t <= 0xbf) { c2 = (LATIN_LIMIT - 0x80) + t; // 2000..203F -> 0180..01BF } else if(c2 == 0xef && s8[nextIndex] == 0xbf && ((t = s8[i2]) == 0xbe || t == 0xbf)) { c2 = -1; // U+FFFE & U+FFFF cannot occur in contractions. } else { return BAIL_OUT; } } else { return BAIL_OUT; } nextIndex += 2; } } } if(c2 == 0 && sLength < 0) { sLength = sIndex; c2 = -1; } // Look for the next character in the contraction suffix list, // which is in ascending order of single suffix characters. int32_t i = index; int32_t head = table[i]; // first skip the default mapping int32_t x; do { i += head >> CONTR_LENGTH_SHIFT; head = table[i]; x = head & CONTR_CHAR_MASK; } while(x < c2); if(x == c2) { index = i; sIndex = nextIndex; } } // Return the CE or CEs for the default or contraction mapping. 
int32_t length = table[index] >> CONTR_LENGTH_SHIFT; if(length == 1) { return BAIL_OUT; } ce = table[index + 1]; if(length == 2) { return ce; } else { return ((uint32_t)table[index + 2] << 16) | ce; } } } uint32_t CollationFastLatin::getSecondaries(uint32_t variableTop, uint32_t pair) { if(pair <= 0xffff) { // one mini CE if(pair >= MIN_SHORT) { pair = getSecondariesFromOneShortCE(pair); } else if(pair > variableTop) { pair = COMMON_SEC_PLUS_OFFSET; } else if(pair >= MIN_LONG) { pair = 0; // variable } // else special mini CE } else { uint32_t ce = pair & 0xffff; if(ce >= MIN_SHORT) { pair = (pair & TWO_SECONDARIES_MASK) + TWO_SEC_OFFSETS; } else if(ce > variableTop) { pair = TWO_COMMON_SEC_PLUS_OFFSET; } else { U_ASSERT(ce >= MIN_LONG); pair = 0; // variable } } return pair; } uint32_t CollationFastLatin::getCases(uint32_t variableTop, UBool strengthIsPrimary, uint32_t pair) { // Primary+caseLevel: Ignore case level weights of primary ignorables. // Otherwise: Ignore case level weights of secondary ignorables. // For details see the comments in the CollationCompare class. // Tertiary CEs (secondary ignorables) are not supported in fast Latin. if(pair <= 0xffff) { // one mini CE if(pair >= MIN_SHORT) { // A high secondary weight means we really have two CEs, // a primary CE and a secondary CE. 
uint32_t ce = pair; pair &= CASE_MASK; // explicit weight of primary CE if(!strengthIsPrimary && (ce & SECONDARY_MASK) >= MIN_SEC_HIGH) { pair |= LOWER_CASE << 16; // implied weight of secondary CE } } else if(pair > variableTop) { pair = LOWER_CASE; } else if(pair >= MIN_LONG) { pair = 0; // variable } // else special mini CE } else { // two mini CEs, same primary groups, neither expands like above uint32_t ce = pair & 0xffff; if(ce >= MIN_SHORT) { if(strengthIsPrimary && (pair & (SHORT_PRIMARY_MASK << 16)) == 0) { pair &= CASE_MASK; } else { pair &= TWO_CASES_MASK; } } else if(ce > variableTop) { pair = TWO_LOWER_CASES; } else { U_ASSERT(ce >= MIN_LONG); pair = 0; // variable } } return pair; } uint32_t CollationFastLatin::getTertiaries(uint32_t variableTop, UBool withCaseBits, uint32_t pair) { if(pair <= 0xffff) { // one mini CE if(pair >= MIN_SHORT) { // A high secondary weight means we really have two CEs, // a primary CE and a secondary CE. uint32_t ce = pair; if(withCaseBits) { pair = (pair & CASE_AND_TERTIARY_MASK) + TER_OFFSET; if((ce & SECONDARY_MASK) >= MIN_SEC_HIGH) { pair |= (LOWER_CASE | COMMON_TER_PLUS_OFFSET) << 16; } } else { pair = (pair & TERTIARY_MASK) + TER_OFFSET; if((ce & SECONDARY_MASK) >= MIN_SEC_HIGH) { pair |= COMMON_TER_PLUS_OFFSET << 16; } } } else if(pair > variableTop) { pair = (pair & TERTIARY_MASK) + TER_OFFSET; if(withCaseBits) { pair |= LOWER_CASE; } } else if(pair >= MIN_LONG) { pair = 0; // variable } // else special mini CE } else { // two mini CEs, same primary groups, neither expands like above uint32_t ce = pair & 0xffff; if(ce >= MIN_SHORT) { if(withCaseBits) { pair &= TWO_CASES_MASK | TWO_TERTIARIES_MASK; } else { pair &= TWO_TERTIARIES_MASK; } pair += TWO_TER_OFFSETS; } else if(ce > variableTop) { pair = (pair & TWO_TERTIARIES_MASK) + TWO_TER_OFFSETS; if(withCaseBits) { pair |= TWO_LOWER_CASES; } } else { U_ASSERT(ce >= MIN_LONG); pair = 0; // variable } } return pair; } uint32_t 
CollationFastLatin::getQuaternaries(uint32_t variableTop, uint32_t pair) { // Return the primary weight of a variable CE, // or the maximum primary weight for a non-variable, not-completely-ignorable CE. if(pair <= 0xffff) { // one mini CE if(pair >= MIN_SHORT) { // A high secondary weight means we really have two CEs, // a primary CE and a secondary CE. if((pair & SECONDARY_MASK) >= MIN_SEC_HIGH) { pair = TWO_SHORT_PRIMARIES_MASK; } else { pair = SHORT_PRIMARY_MASK; } } else if(pair > variableTop) { pair = SHORT_PRIMARY_MASK; } else if(pair >= MIN_LONG) { pair &= LONG_PRIMARY_MASK; // variable } // else special mini CE } else { // two mini CEs, same primary groups, neither expands like above uint32_t ce = pair & 0xffff; if(ce > variableTop) { pair = TWO_SHORT_PRIMARIES_MASK; } else { U_ASSERT(ce >= MIN_LONG); pair &= TWO_LONG_PRIMARIES_MASK; // variable } } return pair; } U_NAMESPACE_END #endif // !UCONFIG_NO_COLLATION stringi/src/icu74/i18n/fphdlimp.cpp0000644000176200001440000000642514700200761016545 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ******************************************************************************* * Copyright (C) 2009-2015, International Business Machines Corporation and * others. All Rights Reserved. 
******************************************************************************* */ #include "unicode/utypes.h" #if !UCONFIG_NO_FORMATTING #include "fphdlimp.h" #include "uvectr32.h" U_NAMESPACE_BEGIN // utility FieldPositionHandler // base class, null implementation FieldPositionHandler::~FieldPositionHandler() { } void FieldPositionHandler::setShift(int32_t delta) { fShift = delta; } // utility subclass FieldPositionOnlyHandler FieldPositionOnlyHandler::FieldPositionOnlyHandler(FieldPosition& _pos) : pos(_pos) { } FieldPositionOnlyHandler::~FieldPositionOnlyHandler() { } void FieldPositionOnlyHandler::addAttribute(int32_t id, int32_t start, int32_t limit) { if (pos.getField() == id && (!acceptFirstOnly || !seenFirst)) { seenFirst = true; pos.setBeginIndex(start + fShift); pos.setEndIndex(limit + fShift); } } void FieldPositionOnlyHandler::shiftLast(int32_t delta) { if (delta != 0 && pos.getField() != FieldPosition::DONT_CARE && pos.getBeginIndex() != -1) { pos.setBeginIndex(delta + pos.getBeginIndex()); pos.setEndIndex(delta + pos.getEndIndex()); } } UBool FieldPositionOnlyHandler::isRecording() const { return pos.getField() != FieldPosition::DONT_CARE; } void FieldPositionOnlyHandler::setAcceptFirstOnly(UBool acceptFirstOnly) { this->acceptFirstOnly = acceptFirstOnly; } // utility subclass FieldPositionIteratorHandler FieldPositionIteratorHandler::FieldPositionIteratorHandler(FieldPositionIterator* posIter, UErrorCode& _status) : iter(posIter), vec(nullptr), status(_status), fCategory(UFIELD_CATEGORY_UNDEFINED) { if (iter && U_SUCCESS(status)) { vec = new UVector32(status); } } FieldPositionIteratorHandler::FieldPositionIteratorHandler( UVector32* vec, UErrorCode& status) : iter(nullptr), vec(vec), status(status), fCategory(UFIELD_CATEGORY_UNDEFINED) { } FieldPositionIteratorHandler::~FieldPositionIteratorHandler() { // setData adopts the vec regardless of status, so it's safe to null it if (iter) { iter->setData(vec, status); } // if iter is null, we never 
allocated vec, so no need to free it vec = nullptr; } void FieldPositionIteratorHandler::addAttribute(int32_t id, int32_t start, int32_t limit) { if (vec && U_SUCCESS(status) && start < limit) { int32_t size = vec->size(); vec->addElement(fCategory, status); vec->addElement(id, status); vec->addElement(start + fShift, status); vec->addElement(limit + fShift, status); if (!U_SUCCESS(status)) { vec->setSize(size); } } } void FieldPositionIteratorHandler::shiftLast(int32_t delta) { if (U_SUCCESS(status) && delta != 0) { int32_t i = vec->size(); if (i > 0) { --i; vec->setElementAt(delta + vec->elementAti(i), i); --i; vec->setElementAt(delta + vec->elementAti(i), i); } } } UBool FieldPositionIteratorHandler::isRecording() const { return U_SUCCESS(status); } U_NAMESPACE_END #endif /* !UCONFIG_NO_FORMATTING */ stringi/src/icu74/i18n/tridpars.h0000644000176200001440000003567714700200761016252 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ************************************************************************** * Copyright (c) 2002-2010, International Business Machines Corporation * * and others. All Rights Reserved. * ************************************************************************** * Date Name Description * * 01/28/2002 aliu Creation. * ************************************************************************** */ #ifndef TRIDPARS_H #define TRIDPARS_H #include "unicode/utypes.h" #if !UCONFIG_NO_TRANSLITERATION #include "unicode/uobject.h" #include "unicode/unistr.h" U_NAMESPACE_BEGIN class Transliterator; class UnicodeSet; class UVector; /** * Parsing component for transliterator IDs. This class contains only * static members; it cannot be instantiated. Methods in this class * parse various ID formats, including the following: * * A basic ID, which contains source, target, and variant, but no * filter and no explicit inverse. Examples include * "Latin-Greek/UNGEGN" and "Null". 
*
 * A single ID, which is a basic ID plus optional filter and optional
 * explicit inverse.  Examples include "[a-zA-Z] Latin-Greek" and
 * "Lower (Upper)".
 *
 * A compound ID, which is a sequence of one or more single IDs,
 * separated by semicolons, with optional forward and reverse global
 * filters.  The global filters are UnicodeSet patterns prepended or
 * appended to the IDs, separated by semicolons.  An appended filter
 * must be enclosed in parentheses and applies in the reverse
 * direction.
 *
 * @author Alan Liu
 */
class TransliteratorIDParser /* not : public UObject because all methods are static */ {

 public:

    /**
     * A structure containing the parsed data of a filtered ID, that
     * is, a basic ID optionally with a filter.
     *
     * 'source' and 'target' will always be non-null.  The 'variant'
     * will be non-null only if a non-empty variant was parsed.
     *
     * 'sawSource' is true if there was an explicit source in the
     * parsed id.  If there was no explicit source, then an implied
     * source of ANY is returned and 'sawSource' is set to false.
     *
     * 'filter' is the parsed filter pattern, or null if there was no
     * filter.
     *
     * NOTE(review): the members below are UnicodeString values, so "null"
     * in this comment presumably means "empty" -- confirm against the
     * implementation.
     */
    class Specs : public UMemory {
    public:
        UnicodeString source; // not null
        UnicodeString target; // not null
        UnicodeString variant; // may be null
        UnicodeString filter; // may be null
        UBool sawSource;
        Specs(const UnicodeString& s, const UnicodeString& t,
              const UnicodeString& v, UBool sawS,
              const UnicodeString& f);

    private:

        Specs(const Specs &other); // forbid copying of this class
        Specs &operator=(const Specs &other); // forbid copying of this class
    };

    /**
     * A structure containing the canonicalized data of a filtered ID,
     * that is, a basic ID optionally with a filter.
     *
     * 'canonID' is always non-null.  It may be the empty string "".
     * It is the id that should be assigned to the created
     * transliterator.  It _cannot_ be instantiated directly.
     *
     * 'basicID' is always non-null and non-empty.  It is always of
     * the form S-T or S-T/V.  It is designed to be fed to low-level
     * instantiation code that only understands these two formats.
     *
     * 'filter' may be null, if there is none, or non-null and
     * non-empty.
     */
    class SingleID : public UMemory {
    public:
        UnicodeString canonID;
        UnicodeString basicID;
        UnicodeString filter;
        SingleID(const UnicodeString& c, const UnicodeString& b,
                 const UnicodeString& f);
        SingleID(const UnicodeString& c, const UnicodeString& b);
        Transliterator* createInstance();

    private:

        SingleID(const SingleID &other); // forbid copying of this class
        SingleID &operator=(const SingleID &other); // forbid copying of this class
    };

    /**
     * Parse a filter ID, that is, an ID of the general form
     * "[f1] s1-t1/v1", with the filters optional, and the variants optional.
     * @param id the id to be parsed
     * @param pos INPUT-OUTPUT parameter.  On input, the position of
     * the first character to parse.  On output, the position after
     * the last character parsed.
     * @return a SingleID object or null if the parse fails
     */
    static SingleID* parseFilterID(const UnicodeString& id, int32_t& pos);

    /**
     * Parse a single ID, that is, an ID of the general form
     * "[f1] s1-t1/v1 ([f2] s2-t3/v2)", with the parenthesized element
     * optional, the filters optional, and the variants optional.
     * @param id the id to be parsed
     * @param pos INPUT-OUTPUT parameter.  On input, the position of
     * the first character to parse.  On output, the position after
     * the last character parsed.
     * @param dir the direction.  If the direction is REVERSE then the
     * SingleID is constructed for the reverse direction.
     * @param status error code reference; presumably set on failure --
     * confirm against the implementation.
     * @return a SingleID object or null
     */
    static SingleID* parseSingleID(const UnicodeString& id, int32_t& pos,
                                   int32_t dir, UErrorCode& status);

    /**
     * Parse a global filter of the form "[f]" or "([f])", depending
     * on 'withParens'.
     * @param id the pattern to parse
     * @param pos INPUT-OUTPUT parameter.  On input, the position of
     * the first character to parse.  On output, the position after
     * the last character parsed.
     * @param dir the direction.
* @param withParens INPUT-OUTPUT parameter.  On entry, if
     * withParens is 0, then parens are disallowed.  If it is 1,
     * then parens are required.  If it is -1, then parens are
     * optional, and the return result will be set to 0 or 1.
     * @param canonID OUTPUT parameter.  The pattern for the filter
     * added to the canonID, either at the end, if dir is FORWARD, or
     * at the start, if dir is REVERSE.  The pattern will be enclosed
     * in parentheses if appropriate, and will be suffixed with an
     * ID_DELIM character.  May be null.
     * @return a UnicodeSet object or null.  A non-null result
     * indicates a successful parse, regardless of whether the filter
     * applies to the given direction.  The caller should discard it
     * if withParens != (dir == REVERSE).
     */
    static UnicodeSet* parseGlobalFilter(const UnicodeString& id, int32_t& pos,
                                         int32_t dir,
                                         int32_t& withParens,
                                         UnicodeString* canonID);

    /**
     * Parse a compound ID, consisting of an optional forward global
     * filter, a separator, one or more single IDs delimited by
     * separators, and an optional reverse global filter.  The
     * separator is a semicolon.  The global filters are UnicodeSet
     * patterns.  The reverse global filter must be enclosed in
     * parentheses.
     * @param id the pattern to parse
     * @param dir the direction.
     * @param canonID OUTPUT parameter that receives the canonical ID,
     * consisting of canonical IDs for all elements, as returned by
     * parseSingleID(), separated by semicolons.  Previous contents
     * are discarded.
     * @param list OUTPUT parameter that receives a list of SingleID
     * objects representing the parsed IDs.  Previous contents are
     * discarded.
     * @param globalFilter OUTPUT parameter that receives a pointer to
     * a newly created global filter for this ID in this direction, or
     * null if there is none.
     * @return true if the parse succeeds, that is, if the entire
     * id is consumed without syntax error.
*/
    static UBool parseCompoundID(const UnicodeString& id, int32_t dir,
                                 UnicodeString& canonID,
                                 UVector& list,
                                 UnicodeSet*& globalFilter);

    /**
     * Convert the elements of the 'list' vector, which are SingleID
     * objects, into actual Transliterator objects.  In the course of
     * this, some (or all) entries may be removed.  If all entries
     * are removed, the Null transliterator will be added.
     *
     * Delete entries with empty basicIDs; these are generated by
     * elements like "(A)" in the forward direction, or "A()" in
     * the reverse.  THIS MAY RESULT IN AN EMPTY VECTOR.  Convert
     * SingleID entries to actual transliterators.
     *
     * This function returns nothing.  (Earlier text described a returned
     * "insertIndex"; this signature has no such parameter or return value.)
     *
     * @param list vector of SingleID objects.  On exit, vector
     * of one or more Transliterators.
     * @param ec Output param to receive a success or an error code.
     */
    static void instantiateList(UVector& list,
                                UErrorCode& ec);

    /**
     * Parse an ID into pieces.  Take IDs of the form T, T/V, S-T,
     * S-T/V, or S/V-T.  If the source is missing, return a source of
     * ANY.
     *
     * This function returns nothing; the results are delivered through the
     * source, target, variant and isSourcePresent reference parameters.
     * The target may be empty if the id is not well-formed.  The variant
     * may be empty.
     *
     * @param id the id string, in any of several forms
     * @param source receives the source; ANY if the id has no source.
     * @param target receives the target.
     * @param variant receives the variant.
     * @param isSourcePresent If true then the source is present.
     * If the source is not present, ANY will be
     * given as the source (presumably with isSourcePresent set to
     * false -- confirm against the implementation).
     */
    static void IDtoSTV(const UnicodeString& id,
                        UnicodeString& source,
                        UnicodeString& target,
                        UnicodeString& variant,
                        UBool& isSourcePresent);

    /**
     * Given source, target, and variant strings, concatenate them into a
     * full ID.
If the source is empty, then "Any" will be used for the
     * source, so the ID will always be of the form s-t/v or s-t.
     */
    static void STVtoID(const UnicodeString& source,
                        const UnicodeString& target,
                        const UnicodeString& variant,
                        UnicodeString& id);

    /**
     * Register two targets as being inverses of one another.  For
     * example, calling registerSpecialInverse("NFC", "NFD", true) causes
     * Transliterator to form the following inverse relationships:
     *
     * <pre>NFC => NFD
     * Any-NFC => Any-NFD
     * NFD => NFC
     * Any-NFD => Any-NFC</pre>
     *
     * (Without the special inverse registration, the inverse of NFC
     * would be NFC-Any.)  Note that NFD is shorthand for Any-NFD, but
     * that the presence or absence of "Any-" is preserved.
     *
     * <p>The relationship is symmetrical; registering (a, b) is
     * equivalent to registering (b, a).
     *
     * <p>The relevant IDs must still be registered separately as
     * factories or classes.
     *
     * <p>Only the targets are specified.  Special inverses always
     * have the form Any-Target1 <=> Any-Target2.  The target should
     * have canonical casing (the casing desired to be produced when
     * an inverse is formed) and should contain no whitespace or other
     * extraneous characters.
     *
     * @param target the target against which to register the inverse
     * @param inverseTarget the inverse of target, that is
     * Any-target.getInverse() => Any-inverseTarget
     * @param bidirectional if true, register the reverse relation
     * as well, that is, Any-inverseTarget.getInverse() => Any-target
     * @param status error code reference; presumably set on failure --
     * confirm against the implementation.
     */
    static void registerSpecialInverse(const UnicodeString& target,
                                       const UnicodeString& inverseTarget,
                                       UBool bidirectional,
                                       UErrorCode &status);

    /**
     * Free static memory.
     */
    static void cleanup();

 private:
    //----------------------------------------------------------------
    // Private implementation
    //----------------------------------------------------------------

    // forbid instantiation
    TransliteratorIDParser();

    /**
     * Parse an ID into component pieces.  Take IDs of the form T,
     * T/V, S-T, S-T/V, or S/V-T.  If the source is missing, return a
     * source of ANY.
     * @param id the id string, in any of several forms
     * @param pos INPUT-OUTPUT parameter.  On input, pos is the
     * offset of the first character to parse in id.  On output,
     * pos is the offset after the last parsed character.  If the
     * parse failed, pos will be unchanged.
     * @param allowFilter if true, a UnicodeSet pattern is allowed
     * at any location between specs or delimiters, and is returned
     * in the 'filter' field of the Specs object.
     * @return a Specs object, or null if the parse failed.  If
     * neither source nor target was seen in the parsed id, then the
     * parse fails.  If allowFilter is true, then the parsed filter
     * pattern is returned in the Specs object, otherwise the returned
     * filter reference is null.  If the parse fails for any reason
     * null is returned.
*/
    static Specs* parseFilterID(const UnicodeString& id, int32_t& pos,
                                UBool allowFilter);

    /**
     * Given a Specs object, convert it to a SingleID object.  The
     * Specs object is a more unprocessed parse result.  The SingleID
     * object contains information about canonical and basic IDs.
     * @param specs the given Specs object.
     * @param dir either FORWARD or REVERSE.
     * @return a SingleID; never returns null.  Returned object always
     * has 'filter' field of null.
     */
    static SingleID* specsToID(const Specs* specs, int32_t dir);

    /**
     * Given a Specs object, return a SingleID representing the
     * special inverse of that ID.  If there is no special inverse
     * then return null.
     * @param specs the given Specs.
     * @param status error code reference; presumably set on failure --
     * confirm against the implementation.
     * @return a SingleID or null.  Returned object always has
     * 'filter' field of null.
     */
    static SingleID* specsToSpecialInverse(const Specs& specs, UErrorCode &status);

    /**
     * Glue method to get around access problems in C++.
     * @param id the id string for the transliterator, in any of several forms
     * @param canonID the given canonical ID
     */
    static Transliterator* createBasicInstance(const UnicodeString& id,
                                               const UnicodeString* canonID);

    /**
     * Initialize static memory.
     */
    static void U_CALLCONV init(UErrorCode &status);

    friend class SingleID;
};

U_NAMESPACE_END

#endif /* #if !UCONFIG_NO_TRANSLITERATION */

#endif
stringi/src/icu74/i18n/hebrwcal.cpp0000644000176200001440000007012014700200761016522 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
******************************************************************************
* Copyright (C) 2003-2016, International Business Machines Corporation
* and others. All Rights Reserved.
****************************************************************************** * * File HEBRWCAL.CPP * * Modification History: * * Date Name Description * 12/03/2003 srl ported from java HebrewCalendar ***************************************************************************** */ #include "hebrwcal.h" #if !UCONFIG_NO_FORMATTING #include "cmemory.h" #include "cstring.h" #include "umutex.h" #include #include "gregoimp.h" // ClockMath #include "astro.h" // CalendarCache #include "uhash.h" #include "ucln_in.h" // Hebrew Calendar implementation /** * The absolute date, in milliseconds since 1/1/1970 AD, Gregorian, * of the start of the Hebrew calendar. In order to keep this calendar's * time of day in sync with that of the Gregorian calendar, we use * midnight, rather than sunset the day before. */ //static const double EPOCH_MILLIS = -180799862400000.; // 1/1/1 HY static const int32_t LIMITS[UCAL_FIELD_COUNT][4] = { // Minimum Greatest Least Maximum // Minimum Maximum { 0, 0, 0, 0}, // ERA { -5000000, -5000000, 5000000, 5000000}, // YEAR { 0, 0, 12, 12}, // MONTH { 1, 1, 51, 56}, // WEEK_OF_YEAR {/*N/A*/-1,/*N/A*/-1,/*N/A*/-1,/*N/A*/-1}, // WEEK_OF_MONTH { 1, 1, 29, 30}, // DAY_OF_MONTH { 1, 1, 353, 385}, // DAY_OF_YEAR {/*N/A*/-1,/*N/A*/-1,/*N/A*/-1,/*N/A*/-1}, // DAY_OF_WEEK { -1, -1, 5, 5}, // DAY_OF_WEEK_IN_MONTH {/*N/A*/-1,/*N/A*/-1,/*N/A*/-1,/*N/A*/-1}, // AM_PM {/*N/A*/-1,/*N/A*/-1,/*N/A*/-1,/*N/A*/-1}, // HOUR {/*N/A*/-1,/*N/A*/-1,/*N/A*/-1,/*N/A*/-1}, // HOUR_OF_DAY {/*N/A*/-1,/*N/A*/-1,/*N/A*/-1,/*N/A*/-1}, // MINUTE {/*N/A*/-1,/*N/A*/-1,/*N/A*/-1,/*N/A*/-1}, // SECOND {/*N/A*/-1,/*N/A*/-1,/*N/A*/-1,/*N/A*/-1}, // MILLISECOND {/*N/A*/-1,/*N/A*/-1,/*N/A*/-1,/*N/A*/-1}, // ZONE_OFFSET {/*N/A*/-1,/*N/A*/-1,/*N/A*/-1,/*N/A*/-1}, // DST_OFFSET { -5000000, -5000000, 5000000, 5000000}, // YEAR_WOY {/*N/A*/-1,/*N/A*/-1,/*N/A*/-1,/*N/A*/-1}, // DOW_LOCAL { -5000000, -5000000, 5000000, 5000000}, // EXTENDED_YEAR {/*N/A*/-1,/*N/A*/-1,/*N/A*/-1,/*N/A*/-1}, // 
JULIAN_DAY {/*N/A*/-1,/*N/A*/-1,/*N/A*/-1,/*N/A*/-1}, // MILLISECONDS_IN_DAY {/*N/A*/-1,/*N/A*/-1,/*N/A*/-1,/*N/A*/-1}, // IS_LEAP_MONTH { 0, 0, 11, 12}, // ORDINAL_MONTH }; /** * The lengths of the Hebrew months. This is complicated, because there * are three different types of years, or six if you count leap years. * Due to the rules for postponing the start of the year to avoid having * certain holidays fall on the sabbath, the year can end up being three * different lengths, called "deficient", "normal", and "complete". */ static const int8_t MONTH_LENGTH[][3] = { // Deficient Normal Complete { 30, 30, 30 }, //Tishri { 29, 29, 30 }, //Heshvan { 29, 30, 30 }, //Kislev { 29, 29, 29 }, //Tevet { 30, 30, 30 }, //Shevat { 30, 30, 30 }, //Adar I (leap years only) { 29, 29, 29 }, //Adar { 30, 30, 30 }, //Nisan { 29, 29, 29 }, //Iyar { 30, 30, 30 }, //Sivan { 29, 29, 29 }, //Tammuz { 30, 30, 30 }, //Av { 29, 29, 29 }, //Elul }; /** * The cumulative # of days to the end of each month in a non-leap year * Although this can be calculated from the MONTH_LENGTH table, * keeping it around separately makes some calculations a lot faster */ static const int16_t MONTH_START[][3] = { // Deficient Normal Complete { 0, 0, 0 }, // (placeholder) { 30, 30, 30 }, // Tishri { 59, 59, 60 }, // Heshvan { 88, 89, 90 }, // Kislev { 117, 118, 119 }, // Tevet { 147, 148, 149 }, // Shevat { 147, 148, 149 }, // (Adar I placeholder) { 176, 177, 178 }, // Adar { 206, 207, 208 }, // Nisan { 235, 236, 237 }, // Iyar { 265, 266, 267 }, // Sivan { 294, 295, 296 }, // Tammuz { 324, 325, 326 }, // Av { 353, 354, 355 }, // Elul }; /** * The cumulative # of days to the end of each month in a leap year */ static const int16_t LEAP_MONTH_START[][3] = { // Deficient Normal Complete { 0, 0, 0 }, // (placeholder) { 30, 30, 30 }, // Tishri { 59, 59, 60 }, // Heshvan { 88, 89, 90 }, // Kislev { 117, 118, 119 }, // Tevet { 147, 148, 149 }, // Shevat { 177, 178, 179 }, // Adar I { 206, 207, 208 }, // Adar II { 
236, 237, 238 }, // Nisan { 265, 266, 267 }, // Iyar { 295, 296, 297 }, // Sivan { 324, 325, 326 }, // Tammuz { 354, 355, 356 }, // Av { 383, 384, 385 }, // Elul }; static icu::CalendarCache *gCache = nullptr; U_CDECL_BEGIN static UBool calendar_hebrew_cleanup() { delete gCache; gCache = nullptr; return true; } U_CDECL_END U_NAMESPACE_BEGIN //------------------------------------------------------------------------- // Constructors... //------------------------------------------------------------------------- /** * Constructs a default HebrewCalendar using the current time * in the default time zone with the default locale. * @internal */ HebrewCalendar::HebrewCalendar(const Locale& aLocale, UErrorCode& success) : Calendar(TimeZone::forLocaleOrDefault(aLocale), aLocale, success) { setTimeInMillis(getNow(), success); // Call this again now that the vtable is set up properly. } HebrewCalendar::~HebrewCalendar() { } const char *HebrewCalendar::getType() const { return "hebrew"; } HebrewCalendar* HebrewCalendar::clone() const { return new HebrewCalendar(*this); } HebrewCalendar::HebrewCalendar(const HebrewCalendar& other) : Calendar(other) { } //------------------------------------------------------------------------- // Rolling and adding functions overridden from Calendar // // These methods call through to the default implementation in IBMCalendar // for most of the fields and only handle the unusual ones themselves. //------------------------------------------------------------------------- /** * Add a signed amount to a specified field, using this calendar's rules. * For example, to add three days to the current date, you can call * add(Calendar.DATE, 3). *

* When adding to certain fields, the values of other fields may conflict and * need to be changed. For example, when adding one to the {@link #MONTH MONTH} field * for the date "30 Av 5758", the {@link #DAY_OF_MONTH DAY_OF_MONTH} field * must be adjusted so that the result is "29 Elul 5758" rather than the invalid * "30 Elul 5758". *

* This method is able to add to * all fields except for {@link #ERA ERA}, {@link #DST_OFFSET DST_OFFSET}, * and {@link #ZONE_OFFSET ZONE_OFFSET}. *

* Note: You should always use {@link #roll roll} and add rather * than attempting to perform arithmetic operations directly on the fields * of a HebrewCalendar. Since the {@link #MONTH MONTH} field behaves * discontinuously in non-leap years, simple arithmetic can give invalid results. *

* @param field the time field. * @param amount the amount to add to the field. * * @exception IllegalArgumentException if the field is invalid or refers * to a field that cannot be handled by this method. * @internal */ void HebrewCalendar::add(UCalendarDateFields field, int32_t amount, UErrorCode& status) { if(U_FAILURE(status)) { return; } switch (field) { case UCAL_MONTH: case UCAL_ORDINAL_MONTH: { // We can't just do a set(MONTH, get(MONTH) + amount). The // reason is ADAR_1. Suppose amount is +2 and we land in // ADAR_1 -- then we have to bump to ADAR_2 aka ADAR. But // if amount is -2 and we land in ADAR_1, then we have to // bump the other way -- down to SHEVAT. - Alan 11/00 int32_t month = get(UCAL_MONTH, status); int32_t year = get(UCAL_YEAR, status); UBool acrossAdar1; if (amount > 0) { acrossAdar1 = (month < ADAR_1); // started before ADAR_1? month += amount; for (;;) { if (acrossAdar1 && month>=ADAR_1 && !isLeapYear(year)) { ++month; } if (month <= ELUL) { break; } month -= ELUL+1; ++year; acrossAdar1 = true; } } else { acrossAdar1 = (month > ADAR_1); // started after ADAR_1? month += amount; for (;;) { if (acrossAdar1 && month<=ADAR_1 && !isLeapYear(year)) { --month; } if (month >= 0) { break; } month += ELUL+1; --year; acrossAdar1 = true; } } set(UCAL_MONTH, month); set(UCAL_YEAR, year); pinField(UCAL_DAY_OF_MONTH, status); break; } default: Calendar::add(field, amount, status); break; } } /** * @deprecated ICU 2.6 use UCalendarDateFields instead of EDateFields */ void HebrewCalendar::add(EDateFields field, int32_t amount, UErrorCode& status) { add((UCalendarDateFields)field, amount, status); } /** * Rolls (up/down) a specified amount time on the given field. For * example, to roll the current date up by three days, you can call * roll(Calendar.DATE, 3). If the * field is rolled past its maximum allowable value, it will "wrap" back * to its minimum and continue rolling. 
* For example, calling roll(Calendar.DATE, 10) * on a Hebrew calendar set to "25 Av 5758" will result in the date "5 Av 5758". *

* When rolling certain fields, the values of other fields may conflict and * need to be changed. For example, when rolling the {@link #MONTH MONTH} field * upward by one for the date "30 Av 5758", the {@link #DAY_OF_MONTH DAY_OF_MONTH} field * must be adjusted so that the result is "29 Elul 5758" rather than the invalid * "30 Elul". *

* This method is able to roll * all fields except for {@link #ERA ERA}, {@link #DST_OFFSET DST_OFFSET}, * and {@link #ZONE_OFFSET ZONE_OFFSET}. Subclasses may, of course, add support for * additional fields in their overrides of roll. *

* Note: You should always use roll and {@link #add add} rather * than attempting to perform arithmetic operations directly on the fields * of a HebrewCalendar. Since the {@link #MONTH MONTH} field behaves * discontinuously in non-leap years, simple arithmetic can give invalid results. *

* @param field the time field. * @param amount the amount by which the field should be rolled. * * @exception IllegalArgumentException if the field is invalid or refers * to a field that cannot be handled by this method. * @internal */ void HebrewCalendar::roll(UCalendarDateFields field, int32_t amount, UErrorCode& status) { if(U_FAILURE(status)) { return; } switch (field) { case UCAL_MONTH: case UCAL_ORDINAL_MONTH: { int32_t month = get(UCAL_MONTH, status); int32_t year = get(UCAL_YEAR, status); UBool leapYear = isLeapYear(year); int32_t yearLength = monthsInYear(year); int32_t newMonth = month + (amount % yearLength); // // If it's not a leap year and we're rolling past the missing month // of ADAR_1, we need to roll an extra month to make up for it. // if (!leapYear) { if (amount > 0 && month < ADAR_1 && newMonth >= ADAR_1) { newMonth++; } else if (amount < 0 && month > ADAR_1 && newMonth <= ADAR_1) { newMonth--; } } set(UCAL_MONTH, (newMonth + 13) % 13); pinField(UCAL_DAY_OF_MONTH, status); return; } default: Calendar::roll(field, amount, status); } } void HebrewCalendar::roll(EDateFields field, int32_t amount, UErrorCode& status) { roll((UCalendarDateFields)field, amount, status); } //------------------------------------------------------------------------- // Support methods //------------------------------------------------------------------------- // Hebrew date calculations are performed in terms of days, hours, and // "parts" (or halakim), which are 1/1080 of an hour, or 3 1/3 seconds. static const int32_t HOUR_PARTS = 1080; static const int32_t DAY_PARTS = 24*HOUR_PARTS; // An approximate value for the length of a lunar month. // It is used to calculate the approximate year and month of a given // absolute date. 
static const int32_t MONTH_DAYS = 29; static const int32_t MONTH_FRACT = 12*HOUR_PARTS + 793; static const int32_t MONTH_PARTS = MONTH_DAYS*DAY_PARTS + MONTH_FRACT; // The time of the new moon (in parts) on 1 Tishri, year 1 (the epoch) // counting from noon on the day before. BAHARAD is an abbreviation of // Bet (Monday), Hey (5 hours from sunset), Resh-Daled (204). static const int32_t BAHARAD = 11*HOUR_PARTS + 204; /** * Finds the day # of the first day in the given Hebrew year. * To do this, we want to calculate the time of the Tishri 1 new moon * in that year. *

* The algorithm here is similar to ones described in a number of * references, including: *

*/ int32_t HebrewCalendar::startOfYear(int32_t year, UErrorCode &status) { ucln_i18n_registerCleanup(UCLN_I18N_HEBREW_CALENDAR, calendar_hebrew_cleanup); int32_t day = CalendarCache::get(&gCache, year, status); if (day == 0) { // # of months before year int32_t months = (int32_t)ClockMath::floorDivide((235 * (int64_t)year - 234), (int64_t)19); int64_t frac = (int64_t)months * MONTH_FRACT + BAHARAD; // Fractional part of day # day = months * 29 + (int32_t)(frac / DAY_PARTS); // Whole # part of calculation frac = frac % DAY_PARTS; // Time of day int32_t wd = (day % 7); // Day of week (0 == Monday) if (wd == 2 || wd == 4 || wd == 6) { // If the 1st is on Sun, Wed, or Fri, postpone to the next day day += 1; wd = (day % 7); } if (wd == 1 && frac > 15*HOUR_PARTS+204 && !isLeapYear(year) ) { // If the new moon falls after 3:11:20am (15h204p from the previous noon) // on a Tuesday and it is not a leap year, postpone by 2 days. // This prevents 356-day years. day += 2; } else if (wd == 0 && frac > 21*HOUR_PARTS+589 && isLeapYear(year-1) ) { // If the new moon falls after 9:32:43 1/3am (21h589p from yesterday noon) // on a Monday and *last* year was a leap year, postpone by 1 day. // Prevents 382-day years. day += 1; } CalendarCache::put(&gCache, year, day, status); } return day; } /** * Find the day of the week for a given day * * @param day The # of days since the start of the Hebrew calendar, * 1-based (i.e. 1/1/1 AM is day 1). */ int32_t HebrewCalendar::absoluteDayToDayOfWeek(int32_t day) { // We know that 1/1/1 AM is a Monday, which makes the math easy... return (day % 7) + 1; } /** * Returns the the type of a given year. * 0 "Deficient" year with 353 or 383 days * 1 "Normal" year with 354 or 384 days * 2 "Complete" year with 355 or 385 days */ int32_t HebrewCalendar::yearType(int32_t year) const { int32_t yearLength = handleGetYearLength(year); if (yearLength > 380) { yearLength -= 30; // Subtract length of leap month. 
} int type = 0; switch (yearLength) { case 353: type = 0; break; case 354: type = 1; break; case 355: type = 2; break; default: //throw new RuntimeException("Illegal year length " + yearLength + " in year " + year); type = 1; } return type; } /** * Determine whether a given Hebrew year is a leap year * * The rule here is that if (year % 19) == 0, 3, 6, 8, 11, 14, or 17. * The formula below performs the same test, believe it or not. */ UBool HebrewCalendar::isLeapYear(int32_t year) { //return (year * 12 + 17) % 19 >= 12; int32_t x = (year*12 + 17) % 19; return x >= ((x < 0) ? -7 : 12); } int32_t HebrewCalendar::monthsInYear(int32_t year) { return isLeapYear(year) ? 13 : 12; } //------------------------------------------------------------------------- // Calendar framework //------------------------------------------------------------------------- /** * @internal */ int32_t HebrewCalendar::handleGetLimit(UCalendarDateFields field, ELimitType limitType) const { return LIMITS[field][limitType]; } /** * Returns the length of the given month in the given year * @internal */ int32_t HebrewCalendar::handleGetMonthLength(int32_t extendedYear, int32_t month) const { // Resolve out-of-range months. This is necessary in order to // obtain the correct year. We correct to // a 12- or 13-month year (add/subtract 12 or 13, depending // on the year) but since we _always_ number from 0..12, and // the leap year determines whether or not month 5 (Adar 1) // is present, we allow 0..12 in any given year. 
while (month < 0) { month += monthsInYear(--extendedYear); } // Careful: allow 0..12 in all years while (month > 12) { month -= monthsInYear(extendedYear++); } switch (month) { case HESHVAN: case KISLEV: // These two month lengths can vary return MONTH_LENGTH[month][yearType(extendedYear)]; default: // The rest are a fixed length return MONTH_LENGTH[month][0]; } } /** * Returns the number of days in the given Hebrew year * @internal */ int32_t HebrewCalendar::handleGetYearLength(int32_t eyear) const { UErrorCode status = U_ZERO_ERROR; return startOfYear(eyear+1, status) - startOfYear(eyear, status); } void HebrewCalendar::validateField(UCalendarDateFields field, UErrorCode &status) { if ((field == UCAL_MONTH || field == UCAL_ORDINAL_MONTH) && !isLeapYear(handleGetExtendedYear()) && internalGetMonth() == ADAR_1) { status = U_ILLEGAL_ARGUMENT_ERROR; return; } Calendar::validateField(field, status); } //------------------------------------------------------------------------- // Functions for converting from milliseconds to field values //------------------------------------------------------------------------- /** * Subclasses may override this method to compute several fields * specific to each calendar system. These are: * *
  • ERA *
  • YEAR *
  • MONTH *
  • DAY_OF_MONTH *
  • DAY_OF_YEAR *
  • EXTENDED_YEAR
* * Subclasses can refer to the DAY_OF_WEEK and DOW_LOCAL fields, * which will be set when this method is called. Subclasses can * also call the getGregorianXxx() methods to obtain Gregorian * calendar equivalents for the given Julian day. * *

In addition, subclasses should compute any subclass-specific * fields, that is, fields from BASE_FIELD_COUNT to * getFieldCount() - 1. * @internal */ void HebrewCalendar::handleComputeFields(int32_t julianDay, UErrorCode &status) { int32_t d = julianDay - 347997; double m = ClockMath::floorDivide((d * (double)DAY_PARTS), (double) MONTH_PARTS); // Months (approx) int32_t year = (int32_t)(ClockMath::floorDivide((19. * m + 234.), 235.) + 1.); // Years (approx) int32_t ys = startOfYear(year, status); // 1st day of year int32_t dayOfYear = (d - ys); // Because of the postponement rules, it's possible to guess wrong. Fix it. while (dayOfYear < 1) { year--; ys = startOfYear(year, status); dayOfYear = (d - ys); } // Now figure out which month we're in, and the date within that month int32_t type = yearType(year); UBool isLeap = isLeapYear(year); int32_t month = 0; int32_t momax = UPRV_LENGTHOF(MONTH_START); while (month < momax && dayOfYear > ( isLeap ? LEAP_MONTH_START[month][type] : MONTH_START[month][type] ) ) { month++; } if (month >= momax || month<=0) { // TODO: I found dayOfYear could be out of range when // a large value is set to julianDay. I patched startOfYear // to reduce the chace, but it could be still reproduced either // by startOfYear or other places. For now, we check // the month is in valid range to avoid out of array index // access problem here. However, we need to carefully review // the calendar implementation to check the extreme limit of // each calendar field and the code works well for any values // in the valid value range. -yoshito status = U_ILLEGAL_ARGUMENT_ERROR; return; } month--; int dayOfMonth = dayOfYear - (isLeap ? 
LEAP_MONTH_START[month][type] : MONTH_START[month][type]); internalSet(UCAL_ERA, 0); internalSet(UCAL_YEAR, year); internalSet(UCAL_EXTENDED_YEAR, year); int32_t ordinal_month = month; if (!isLeap && ordinal_month > ADAR_1) { ordinal_month--; } internalSet(UCAL_ORDINAL_MONTH, ordinal_month); internalSet(UCAL_MONTH, month); internalSet(UCAL_DAY_OF_MONTH, dayOfMonth); internalSet(UCAL_DAY_OF_YEAR, dayOfYear); } //------------------------------------------------------------------------- // Functions for converting from field values to milliseconds //------------------------------------------------------------------------- /** * @internal */ int32_t HebrewCalendar::handleGetExtendedYear() { int32_t year; if (newerField(UCAL_EXTENDED_YEAR, UCAL_YEAR) == UCAL_EXTENDED_YEAR) { year = internalGet(UCAL_EXTENDED_YEAR, 1); // Default to year 1 } else { year = internalGet(UCAL_YEAR, 1); // Default to year 1 } return year; } /** * Return JD of start of given month/year. * @internal */ int32_t HebrewCalendar::handleComputeMonthStart(int32_t eyear, int32_t month, UBool /*useMonth*/) const { UErrorCode status = U_ZERO_ERROR; // Resolve out-of-range months. This is necessary in order to // obtain the correct year. We correct to // a 12- or 13-month year (add/subtract 12 or 13, depending // on the year) but since we _always_ number from 0..12, and // the leap year determines whether or not month 5 (Adar 1) // is present, we allow 0..12 in any given year. 
while (month < 0) { month += monthsInYear(--eyear); } // Careful: allow 0..12 in all years while (month > 12) { month -= monthsInYear(eyear++); } int32_t day = startOfYear(eyear, status); if(U_FAILURE(status)) { return 0; } if (month != 0) { if (isLeapYear(eyear)) { day += LEAP_MONTH_START[month][yearType(eyear)]; } else { day += MONTH_START[month][yearType(eyear)]; } } return (int) (day + 347997); } constexpr uint32_t kHebrewRelatedYearDiff = -3760; int32_t HebrewCalendar::getRelatedYear(UErrorCode &status) const { int32_t year = get(UCAL_EXTENDED_YEAR, status); if (U_FAILURE(status)) { return 0; } return year + kHebrewRelatedYearDiff; } void HebrewCalendar::setRelatedYear(int32_t year) { // set extended year set(UCAL_EXTENDED_YEAR, year - kHebrewRelatedYearDiff); } /** * The system maintains a static default century start date and Year. They are * initialized the first time they are used. Once the system default century date * and year are set, they do not change. */ static UDate gSystemDefaultCenturyStart = DBL_MIN; static int32_t gSystemDefaultCenturyStartYear = -1; static icu::UInitOnce gSystemDefaultCenturyInit {}; UBool HebrewCalendar::haveDefaultCentury() const { return true; } static void U_CALLCONV initializeSystemDefaultCentury() { // initialize systemDefaultCentury and systemDefaultCenturyYear based // on the current time. They'll be set to 80 years before // the current time. UErrorCode status = U_ZERO_ERROR; HebrewCalendar calendar(Locale("@calendar=hebrew"),status); if (U_SUCCESS(status)) { calendar.setTime(Calendar::getNow(), status); calendar.add(UCAL_YEAR, -80, status); gSystemDefaultCenturyStart = calendar.getTime(status); gSystemDefaultCenturyStartYear = calendar.get(UCAL_YEAR, status); } // We have no recourse upon failure unless we want to propagate the failure // out. 
} UDate HebrewCalendar::defaultCenturyStart() const { // lazy-evaluate systemDefaultCenturyStart umtx_initOnce(gSystemDefaultCenturyInit, &initializeSystemDefaultCentury); return gSystemDefaultCenturyStart; } int32_t HebrewCalendar::defaultCenturyStartYear() const { // lazy-evaluate systemDefaultCenturyStartYear umtx_initOnce(gSystemDefaultCenturyInit, &initializeSystemDefaultCentury); return gSystemDefaultCenturyStartYear; } bool HebrewCalendar::inTemporalLeapYear(UErrorCode& status) const { if (U_FAILURE(status)) return false; int32_t eyear = get(UCAL_EXTENDED_YEAR, status); if (U_FAILURE(status)) return false; return isLeapYear(eyear); } static const char * const gTemporalMonthCodesForHebrew[] = { "M01", "M02", "M03", "M04", "M05", "M05L", "M06", "M07", "M08", "M09", "M10", "M11", "M12", nullptr }; const char* HebrewCalendar::getTemporalMonthCode(UErrorCode& status) const { int32_t month = get(UCAL_MONTH, status); if (U_FAILURE(status)) return nullptr; return gTemporalMonthCodesForHebrew[month]; } void HebrewCalendar::setTemporalMonthCode(const char* code, UErrorCode& status ) { if (U_FAILURE(status)) return; int32_t len = static_cast(uprv_strlen(code)); if (len == 3 || len == 4) { for (int m = 0; gTemporalMonthCodesForHebrew[m] != nullptr; m++) { if (uprv_strcmp(code, gTemporalMonthCodesForHebrew[m]) == 0) { set(UCAL_MONTH, m); return; } } } status = U_ILLEGAL_ARGUMENT_ERROR; } int32_t HebrewCalendar::internalGetMonth() const { if (resolveFields(kMonthPrecedence) == UCAL_ORDINAL_MONTH) { int32_t ordinalMonth = internalGet(UCAL_ORDINAL_MONTH); HebrewCalendar *nonConstThis = (HebrewCalendar*)this; // cast away const int32_t year = nonConstThis->handleGetExtendedYear(); return ordinalMonth + (((!isLeapYear(year)) && (ordinalMonth > ADAR_1)) ? 
1: 0); } return Calendar::internalGetMonth(); } UOBJECT_DEFINE_RTTI_IMPLEMENTATION(HebrewCalendar) U_NAMESPACE_END #endif // UCONFIG_NO_FORMATTING stringi/src/icu74/i18n/numparse_scientific.h0000644000176200001440000000235714700200761020441 0ustar liggesusers// © 2018 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html #include "unicode/utypes.h" #if !UCONFIG_NO_FORMATTING #ifndef __NUMPARSE_SCIENTIFIC_H__ #define __NUMPARSE_SCIENTIFIC_H__ #include "numparse_types.h" #include "numparse_decimal.h" #include "numparse_symbols.h" #include "unicode/numberformatter.h" using icu::number::impl::Grouper; U_NAMESPACE_BEGIN namespace numparse { namespace impl { class ScientificMatcher : public NumberParseMatcher, public UMemory { public: ScientificMatcher() = default; // WARNING: Leaves the object in an unusable state ScientificMatcher(const DecimalFormatSymbols& dfs, const Grouper& grouper); bool match(StringSegment& segment, ParsedNumber& result, UErrorCode& status) const override; bool smokeTest(const StringSegment& segment) const override; UnicodeString toString() const override; private: UnicodeString fExponentSeparatorString; DecimalMatcher fExponentMatcher; IgnorablesMatcher fIgnorablesMatcher; UnicodeString fCustomMinusSign; UnicodeString fCustomPlusSign; }; } // namespace impl } // namespace numparse U_NAMESPACE_END #endif //__NUMPARSE_SCIENTIFIC_H__ #endif /* #if !UCONFIG_NO_FORMATTING */ stringi/src/icu74/i18n/csrmbcs.cpp0000644000176200001440000003731214700200761016375 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ********************************************************************** * Copyright (C) 2005-2016, International Business Machines * Corporation and others. All Rights Reserved. 
********************************************************************** */ #include "unicode/utypes.h" #if !UCONFIG_NO_CONVERSION #include "cmemory.h" #include "csmatch.h" #include "csrmbcs.h" #include U_NAMESPACE_BEGIN #define min(x,y) (((x)<(y))?(x):(y)) static const uint16_t commonChars_sjis [] = { // TODO: This set of data comes from the character frequency- // of-occurrence analysis tool. The data needs to be moved // into a resource and loaded from there. 0x8140, 0x8141, 0x8142, 0x8145, 0x815b, 0x8169, 0x816a, 0x8175, 0x8176, 0x82a0, 0x82a2, 0x82a4, 0x82a9, 0x82aa, 0x82ab, 0x82ad, 0x82af, 0x82b1, 0x82b3, 0x82b5, 0x82b7, 0x82bd, 0x82be, 0x82c1, 0x82c4, 0x82c5, 0x82c6, 0x82c8, 0x82c9, 0x82cc, 0x82cd, 0x82dc, 0x82e0, 0x82e7, 0x82e8, 0x82e9, 0x82ea, 0x82f0, 0x82f1, 0x8341, 0x8343, 0x834e, 0x834f, 0x8358, 0x835e, 0x8362, 0x8367, 0x8375, 0x8376, 0x8389, 0x838a, 0x838b, 0x838d, 0x8393, 0x8e96, 0x93fa, 0x95aa}; static const uint16_t commonChars_euc_jp[] = { // TODO: This set of data comes from the character frequency- // of-occurrence analysis tool. The data needs to be moved // into a resource and loaded from there. 
0xa1a1, 0xa1a2, 0xa1a3, 0xa1a6, 0xa1bc, 0xa1ca, 0xa1cb, 0xa1d6, 0xa1d7, 0xa4a2, 0xa4a4, 0xa4a6, 0xa4a8, 0xa4aa, 0xa4ab, 0xa4ac, 0xa4ad, 0xa4af, 0xa4b1, 0xa4b3, 0xa4b5, 0xa4b7, 0xa4b9, 0xa4bb, 0xa4bd, 0xa4bf, 0xa4c0, 0xa4c1, 0xa4c3, 0xa4c4, 0xa4c6, 0xa4c7, 0xa4c8, 0xa4c9, 0xa4ca, 0xa4cb, 0xa4ce, 0xa4cf, 0xa4d0, 0xa4de, 0xa4df, 0xa4e1, 0xa4e2, 0xa4e4, 0xa4e8, 0xa4e9, 0xa4ea, 0xa4eb, 0xa4ec, 0xa4ef, 0xa4f2, 0xa4f3, 0xa5a2, 0xa5a3, 0xa5a4, 0xa5a6, 0xa5a7, 0xa5aa, 0xa5ad, 0xa5af, 0xa5b0, 0xa5b3, 0xa5b5, 0xa5b7, 0xa5b8, 0xa5b9, 0xa5bf, 0xa5c3, 0xa5c6, 0xa5c7, 0xa5c8, 0xa5c9, 0xa5cb, 0xa5d0, 0xa5d5, 0xa5d6, 0xa5d7, 0xa5de, 0xa5e0, 0xa5e1, 0xa5e5, 0xa5e9, 0xa5ea, 0xa5eb, 0xa5ec, 0xa5ed, 0xa5f3, 0xb8a9, 0xb9d4, 0xbaee, 0xbbc8, 0xbef0, 0xbfb7, 0xc4ea, 0xc6fc, 0xc7bd, 0xcab8, 0xcaf3, 0xcbdc, 0xcdd1}; static const uint16_t commonChars_euc_kr[] = { // TODO: This set of data comes from the character frequency- // of-occurrence analysis tool. The data needs to be moved // into a resource and loaded from there. 0xb0a1, 0xb0b3, 0xb0c5, 0xb0cd, 0xb0d4, 0xb0e6, 0xb0ed, 0xb0f8, 0xb0fa, 0xb0fc, 0xb1b8, 0xb1b9, 0xb1c7, 0xb1d7, 0xb1e2, 0xb3aa, 0xb3bb, 0xb4c2, 0xb4cf, 0xb4d9, 0xb4eb, 0xb5a5, 0xb5b5, 0xb5bf, 0xb5c7, 0xb5e9, 0xb6f3, 0xb7af, 0xb7c2, 0xb7ce, 0xb8a6, 0xb8ae, 0xb8b6, 0xb8b8, 0xb8bb, 0xb8e9, 0xb9ab, 0xb9ae, 0xb9cc, 0xb9ce, 0xb9fd, 0xbab8, 0xbace, 0xbad0, 0xbaf1, 0xbbe7, 0xbbf3, 0xbbfd, 0xbcad, 0xbcba, 0xbcd2, 0xbcf6, 0xbdba, 0xbdc0, 0xbdc3, 0xbdc5, 0xbec6, 0xbec8, 0xbedf, 0xbeee, 0xbef8, 0xbefa, 0xbfa1, 0xbfa9, 0xbfc0, 0xbfe4, 0xbfeb, 0xbfec, 0xbff8, 0xc0a7, 0xc0af, 0xc0b8, 0xc0ba, 0xc0bb, 0xc0bd, 0xc0c7, 0xc0cc, 0xc0ce, 0xc0cf, 0xc0d6, 0xc0da, 0xc0e5, 0xc0fb, 0xc0fc, 0xc1a4, 0xc1a6, 0xc1b6, 0xc1d6, 0xc1df, 0xc1f6, 0xc1f8, 0xc4a1, 0xc5cd, 0xc6ae, 0xc7cf, 0xc7d1, 0xc7d2, 0xc7d8, 0xc7e5, 0xc8ad}; static const uint16_t commonChars_big5[] = { // TODO: This set of data comes from the character frequency- // of-occurrence analysis tool. 
The data needs to be moved // into a resource and loaded from there. 0xa140, 0xa141, 0xa142, 0xa143, 0xa147, 0xa149, 0xa175, 0xa176, 0xa440, 0xa446, 0xa447, 0xa448, 0xa451, 0xa454, 0xa457, 0xa464, 0xa46a, 0xa46c, 0xa477, 0xa4a3, 0xa4a4, 0xa4a7, 0xa4c1, 0xa4ce, 0xa4d1, 0xa4df, 0xa4e8, 0xa4fd, 0xa540, 0xa548, 0xa558, 0xa569, 0xa5cd, 0xa5e7, 0xa657, 0xa661, 0xa662, 0xa668, 0xa670, 0xa6a8, 0xa6b3, 0xa6b9, 0xa6d3, 0xa6db, 0xa6e6, 0xa6f2, 0xa740, 0xa751, 0xa759, 0xa7da, 0xa8a3, 0xa8a5, 0xa8ad, 0xa8d1, 0xa8d3, 0xa8e4, 0xa8fc, 0xa9c0, 0xa9d2, 0xa9f3, 0xaa6b, 0xaaba, 0xaabe, 0xaacc, 0xaafc, 0xac47, 0xac4f, 0xacb0, 0xacd2, 0xad59, 0xaec9, 0xafe0, 0xb0ea, 0xb16f, 0xb2b3, 0xb2c4, 0xb36f, 0xb44c, 0xb44e, 0xb54c, 0xb5a5, 0xb5bd, 0xb5d0, 0xb5d8, 0xb671, 0xb7ed, 0xb867, 0xb944, 0xbad8, 0xbb44, 0xbba1, 0xbdd1, 0xc2c4, 0xc3b9, 0xc440, 0xc45f}; static const uint16_t commonChars_gb_18030[] = { // TODO: This set of data comes from the character frequency- // of-occurrence analysis tool. The data needs to be moved // into a resource and loaded from there. 
0xa1a1, 0xa1a2, 0xa1a3, 0xa1a4, 0xa1b0, 0xa1b1, 0xa1f1, 0xa1f3, 0xa3a1, 0xa3ac, 0xa3ba, 0xb1a8, 0xb1b8, 0xb1be, 0xb2bb, 0xb3c9, 0xb3f6, 0xb4f3, 0xb5bd, 0xb5c4, 0xb5e3, 0xb6af, 0xb6d4, 0xb6e0, 0xb7a2, 0xb7a8, 0xb7bd, 0xb7d6, 0xb7dd, 0xb8b4, 0xb8df, 0xb8f6, 0xb9ab, 0xb9c9, 0xb9d8, 0xb9fa, 0xb9fd, 0xbacd, 0xbba7, 0xbbd6, 0xbbe1, 0xbbfa, 0xbcbc, 0xbcdb, 0xbcfe, 0xbdcc, 0xbecd, 0xbedd, 0xbfb4, 0xbfc6, 0xbfc9, 0xc0b4, 0xc0ed, 0xc1cb, 0xc2db, 0xc3c7, 0xc4dc, 0xc4ea, 0xc5cc, 0xc6f7, 0xc7f8, 0xc8ab, 0xc8cb, 0xc8d5, 0xc8e7, 0xc9cf, 0xc9fa, 0xcab1, 0xcab5, 0xcac7, 0xcad0, 0xcad6, 0xcaf5, 0xcafd, 0xccec, 0xcdf8, 0xceaa, 0xcec4, 0xced2, 0xcee5, 0xcfb5, 0xcfc2, 0xcfd6, 0xd0c2, 0xd0c5, 0xd0d0, 0xd0d4, 0xd1a7, 0xd2aa, 0xd2b2, 0xd2b5, 0xd2bb, 0xd2d4, 0xd3c3, 0xd3d0, 0xd3fd, 0xd4c2, 0xd4da, 0xd5e2, 0xd6d0}; static int32_t binarySearch(const uint16_t *array, int32_t len, uint16_t value) { int32_t start = 0, end = len-1; int32_t mid = (start+end)/2; while(start <= end) { if(array[mid] == value) { return mid; } if(array[mid] < value){ start = mid+1; } else { end = mid-1; } mid = (start+end)/2; } return -1; } IteratedChar::IteratedChar() : charValue(0), index(-1), nextIndex(0), error(false), done(false) { // nothing else to do. } /*void IteratedChar::reset() { charValue = 0; index = -1; nextIndex = 0; error = false; done = false; }*/ int32_t IteratedChar::nextByte(InputText *det) { if (nextIndex >= det->fRawLength) { done = true; return -1; } return det->fRawInput[nextIndex++]; } CharsetRecog_mbcs::~CharsetRecog_mbcs() { // nothing to do. 
} int32_t CharsetRecog_mbcs::match_mbcs(InputText *det, const uint16_t commonChars[], int32_t commonCharsLen) const { int32_t doubleByteCharCount = 0; int32_t commonCharCount = 0; int32_t badCharCount = 0; int32_t totalCharCount = 0; int32_t confidence = 0; IteratedChar iter; while (nextChar(&iter, det)) { totalCharCount++; if (iter.error) { badCharCount++; } else { if (iter.charValue > 0xFF) { doubleByteCharCount++; if (commonChars != 0) { if (binarySearch(commonChars, commonCharsLen, static_cast(iter.charValue)) >= 0){ commonCharCount += 1; } } } } if (badCharCount >= 2 && badCharCount*5 >= doubleByteCharCount) { // Bail out early if the byte data is not matching the encoding scheme. // break detectBlock; return confidence; } } if (doubleByteCharCount <= 10 && badCharCount == 0) { // Not many multi-byte chars. if (doubleByteCharCount == 0 && totalCharCount < 10) { // There weren't any multibyte sequences, and there was a low density of non-ASCII single bytes. // We don't have enough data to have any confidence. // Statistical analysis of single byte non-ASCII characters would probably help here. confidence = 0; } else { // ASCII or ISO file? It's probably not our encoding, // but is not incompatible with our encoding, so don't give it a zero. confidence = 10; } return confidence; } // // No match if there are too many characters that don't fit the encoding scheme. // (should we have zero tolerance for these?) // if (doubleByteCharCount < 20*badCharCount) { confidence = 0; return confidence; } if (commonChars == 0) { // We have no statistics on frequently occurring characters. // Assess confidence purely on having a reasonable number of // multi-byte characters (the more the better) confidence = 30 + doubleByteCharCount - 20*badCharCount; if (confidence > 100) { confidence = 100; } } else { // // Frequency of occurrence statistics exist. 
// double maxVal = log((double)doubleByteCharCount / 4); /*(float)?*/ double scaleFactor = 90.0 / maxVal; confidence = (int32_t)(log((double)commonCharCount+1) * scaleFactor + 10.0); confidence = min(confidence, 100); } if (confidence < 0) { confidence = 0; } return confidence; } CharsetRecog_sjis::~CharsetRecog_sjis() { // nothing to do } UBool CharsetRecog_sjis::nextChar(IteratedChar* it, InputText* det) const { it->index = it->nextIndex; it->error = false; int32_t firstByte = it->charValue = it->nextByte(det); if (firstByte < 0) { return false; } if (firstByte <= 0x7F || (firstByte > 0xA0 && firstByte <= 0xDF)) { return true; } int32_t secondByte = it->nextByte(det); if (secondByte >= 0) { it->charValue = (firstByte << 8) | secondByte; } // else we'll handle the error later. if (! ((secondByte >= 0x40 && secondByte <= 0x7F) || (secondByte >= 0x80 && secondByte <= 0xFE))) { // Illegal second byte value. it->error = true; } return true; } UBool CharsetRecog_sjis::match(InputText* det, CharsetMatch *results) const { int32_t confidence = match_mbcs(det, commonChars_sjis, UPRV_LENGTHOF(commonChars_sjis)); results->set(det, this, confidence); return (confidence > 0); } const char *CharsetRecog_sjis::getName() const { return "Shift_JIS"; } const char *CharsetRecog_sjis::getLanguage() const { return "ja"; } CharsetRecog_euc::~CharsetRecog_euc() { // nothing to do } UBool CharsetRecog_euc::nextChar(IteratedChar* it, InputText* det) const { int32_t firstByte = 0; int32_t secondByte = 0; int32_t thirdByte = 0; it->index = it->nextIndex; it->error = false; firstByte = it->charValue = it->nextByte(det); if (firstByte < 0) { // Ran off the end of the input data return false; } if (firstByte <= 0x8D) { // single byte char return true; } secondByte = it->nextByte(det); if (secondByte >= 0) { it->charValue = (it->charValue << 8) | secondByte; } // else we'll handle the error later. 
if (firstByte >= 0xA1 && firstByte <= 0xFE) { // Two byte Char if (secondByte < 0xA1) { it->error = true; } return true; } if (firstByte == 0x8E) { // Code Set 2. // In EUC-JP, total char size is 2 bytes, only one byte of actual char value. // In EUC-TW, total char size is 4 bytes, three bytes contribute to char value. // We don't know which we've got. // Treat it like EUC-JP. If the data really was EUC-TW, the following two // bytes will look like a well formed 2 byte char. if (secondByte < 0xA1) { it->error = true; } return true; } if (firstByte == 0x8F) { // Code set 3. // Three byte total char size, two bytes of actual char value. thirdByte = it->nextByte(det); it->charValue = (it->charValue << 8) | thirdByte; if (thirdByte < 0xa1) { // Bad second byte or ran off the end of the input data with a non-ASCII first byte. it->error = true; } } return true; } CharsetRecog_euc_jp::~CharsetRecog_euc_jp() { // nothing to do } const char *CharsetRecog_euc_jp::getName() const { return "EUC-JP"; } const char *CharsetRecog_euc_jp::getLanguage() const { return "ja"; } UBool CharsetRecog_euc_jp::match(InputText *det, CharsetMatch *results) const { int32_t confidence = match_mbcs(det, commonChars_euc_jp, UPRV_LENGTHOF(commonChars_euc_jp)); results->set(det, this, confidence); return (confidence > 0); } CharsetRecog_euc_kr::~CharsetRecog_euc_kr() { // nothing to do } const char *CharsetRecog_euc_kr::getName() const { return "EUC-KR"; } const char *CharsetRecog_euc_kr::getLanguage() const { return "ko"; } UBool CharsetRecog_euc_kr::match(InputText *det, CharsetMatch *results) const { int32_t confidence = match_mbcs(det, commonChars_euc_kr, UPRV_LENGTHOF(commonChars_euc_kr)); results->set(det, this, confidence); return (confidence > 0); } CharsetRecog_big5::~CharsetRecog_big5() { // nothing to do } UBool CharsetRecog_big5::nextChar(IteratedChar* it, InputText* det) const { int32_t firstByte; it->index = it->nextIndex; it->error = false; firstByte = it->charValue = 
it->nextByte(det); if (firstByte < 0) { return false; } if (firstByte <= 0x7F || firstByte == 0xFF) { // single byte character. return true; } int32_t secondByte = it->nextByte(det); if (secondByte >= 0) { it->charValue = (it->charValue << 8) | secondByte; } // else we'll handle the error later. if (secondByte < 0x40 || secondByte == 0x7F || secondByte == 0xFF) { it->error = true; } return true; } const char *CharsetRecog_big5::getName() const { return "Big5"; } const char *CharsetRecog_big5::getLanguage() const { return "zh"; } UBool CharsetRecog_big5::match(InputText *det, CharsetMatch *results) const { int32_t confidence = match_mbcs(det, commonChars_big5, UPRV_LENGTHOF(commonChars_big5)); results->set(det, this, confidence); return (confidence > 0); } CharsetRecog_gb_18030::~CharsetRecog_gb_18030() { // nothing to do } UBool CharsetRecog_gb_18030::nextChar(IteratedChar* it, InputText* det) const { int32_t firstByte = 0; int32_t secondByte = 0; int32_t thirdByte = 0; int32_t fourthByte = 0; it->index = it->nextIndex; it->error = false; firstByte = it->charValue = it->nextByte(det); if (firstByte < 0) { // Ran off the end of the input data return false; } if (firstByte <= 0x80) { // single byte char return true; } secondByte = it->nextByte(det); if (secondByte >= 0) { it->charValue = (it->charValue << 8) | secondByte; } // else we'll handle the error later. if (firstByte >= 0x81 && firstByte <= 0xFE) { // Two byte Char if ((secondByte >= 0x40 && secondByte <= 0x7E) || (secondByte >=80 && secondByte <= 0xFE)) { return true; } // Four byte char if (secondByte >= 0x30 && secondByte <= 0x39) { thirdByte = it->nextByte(det); if (thirdByte >= 0x81 && thirdByte <= 0xFE) { fourthByte = it->nextByte(det); if (fourthByte >= 0x30 && fourthByte <= 0x39) { it->charValue = (it->charValue << 16) | (thirdByte << 8) | fourthByte; return true; } } } // Something wasn't valid, or we ran out of data (-1). 
it->error = true; } return true; } const char *CharsetRecog_gb_18030::getName() const { return "GB18030"; } const char *CharsetRecog_gb_18030::getLanguage() const { return "zh"; } UBool CharsetRecog_gb_18030::match(InputText *det, CharsetMatch *results) const { int32_t confidence = match_mbcs(det, commonChars_gb_18030, UPRV_LENGTHOF(commonChars_gb_18030)); results->set(det, this, confidence); return (confidence > 0); } U_NAMESPACE_END #endif stringi/src/icu74/i18n/collationsets.h0000644000176200001440000001107014700200761017262 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ******************************************************************************* * Copyright (C) 2013-2014, International Business Machines * Corporation and others. All Rights Reserved. ******************************************************************************* * collationsets.h * * created on: 2013feb09 * created by: Markus W. Scherer */ #ifndef __COLLATIONSETS_H__ #define __COLLATIONSETS_H__ #include "unicode/utypes.h" #if !UCONFIG_NO_COLLATION #include "unicode/uniset.h" #include "collation.h" U_NAMESPACE_BEGIN struct CollationData; /** * Finds the set of characters and strings that sort differently in the tailoring * from the base data. * * Every mapping in the tailoring needs to be compared to the base, * because some mappings are copied for optimization, and * all contractions for a character are copied if any contractions for that character * are added, modified or removed. * * It might be simpler to re-parse the rule string, but: * - That would require duplicating some of the from-rules builder code. * - That would make the runtime code depend on the builder. * - That would only work if we have the rule string, and we allow users to * omit the rule string from data files. 
class TailoredSet : public UMemory {
public:
    /**
     * Stores t as the output set; the data pointers and suffix start out null
     * and errorCode starts as U_ZERO_ERROR.
     */
    TailoredSet(UnicodeSet *t)
            : data(nullptr), baseData(nullptr),
              tailored(t),
              suffix(nullptr),
              errorCode(U_ZERO_ERROR) {}

    void forData(const CollationData *d, UErrorCode &errorCode);

    /**
     * @return U_SUCCESS(errorCode) in C++, void in Java
     * @internal only public for access by callback
     */
    UBool handleCE32(UChar32 start, UChar32 end, uint32_t ce32);

private:
    void compare(UChar32 c, uint32_t ce32, uint32_t baseCE32);
    void comparePrefixes(UChar32 c, const char16_t *p, const char16_t *q);
    void compareContractions(UChar32 c, const char16_t *p, const char16_t *q);

    void addPrefixes(const CollationData *d, UChar32 c, const char16_t *p);
    void addPrefix(const CollationData *d, const UnicodeString &pfx, UChar32 c, uint32_t ce32);
    void addContractions(UChar32 c, const char16_t *p);

    void addSuffix(UChar32 c, const UnicodeString &sfx);
    void add(UChar32 c);

    /** Prefixes are reversed in the data structure. */
    void setPrefix(const UnicodeString &pfx) {
        // Keep a copy of pfx with its code units in the opposite order.
        unreversedPrefix = pfx;
        unreversedPrefix.reverse();
    }
    /** Empties the stored prefix. */
    void resetPrefix() {
        unreversedPrefix.remove();
    }

    const CollationData *data;
    const CollationData *baseData;
    UnicodeSet *tailored;            // set passed to the constructor
    UnicodeString unreversedPrefix;  // maintained by setPrefix()/resetPrefix()
    const UnicodeString *suffix;
    UErrorCode errorCode;
};

class ContractionsAndExpansions : public UMemory {
public:
    /** Abstract callback interface; both handlers are pure virtual. */
    class CESink : public UMemory {
    public:
        virtual ~CESink();
        virtual void handleCE(int64_t ce) = 0;
        virtual void handleExpansion(const int64_t ces[], int32_t length) = 0;
    };

    /**
     * Stores the two output sets, the sink and the prefixes flag;
     * checkTailored starts at 0 and errorCode at U_ZERO_ERROR.
     */
    ContractionsAndExpansions(UnicodeSet *con, UnicodeSet *exp, CESink *s, UBool prefixes)
            : data(nullptr),
              contractions(con), expansions(exp),
              sink(s),
              addPrefixes(prefixes),
              checkTailored(0),
              suffix(nullptr),
              errorCode(U_ZERO_ERROR) {}

    void forData(const CollationData *d, UErrorCode &errorCode);
    void forCodePoint(const CollationData *d, UChar32 c, UErrorCode &ec);

    // all following: @internal, only public for access by callback

    void handleCE32(UChar32 start, UChar32 end, uint32_t ce32);

    void handlePrefixes(UChar32 start, UChar32 end, uint32_t ce32);
    void handleContractions(UChar32 start, UChar32 end, uint32_t ce32);

    void addExpansions(UChar32 start, UChar32 end);
    void addStrings(UChar32 start, UChar32 end, UnicodeSet *set);

    /** Prefixes are reversed in the data structure. */
    void setPrefix(const UnicodeString &pfx) {
        // Keep a copy of pfx with its code units in the opposite order.
        unreversedPrefix = pfx;
        unreversedPrefix.reverse();
    }
    /** Empties the stored prefix. */
    void resetPrefix() {
        unreversedPrefix.remove();
    }

    const CollationData *data;
    UnicodeSet *contractions;        // first constructor argument
    UnicodeSet *expansions;          // second constructor argument
    CESink *sink;                    // third constructor argument
    UBool addPrefixes;               // fourth constructor argument
    int8_t checkTailored;  // -1: collected tailored  +1: exclude tailored
    UnicodeSet tailored;
    UnicodeSet ranges;
    UnicodeString unreversedPrefix;  // maintained by setPrefix()/resetPrefix()
    const UnicodeString *suffix;
    int64_t ces[Collation::MAX_EXPANSION_LENGTH];
    UErrorCode errorCode;
};
// // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // ICU PATCH: ifdef around UCONFIG_NO_FORMATTING #include "unicode/utypes.h" #if !UCONFIG_NO_FORMATTING // ICU PATCH: Customize header file paths for ICU. #include "double-conversion-fast-dtoa.h" #include "double-conversion-cached-powers.h" #include "double-conversion-diy-fp.h" #include "double-conversion-ieee.h" // ICU PATCH: Wrap in ICU namespace U_NAMESPACE_BEGIN namespace double_conversion { // The minimal and maximal target exponent define the range of w's binary // exponent, where 'w' is the result of multiplying the input by a cached power // of ten. // // A different range might be chosen on a different platform, to optimize digit // generation, but a smaller range requires more powers of ten to be cached. static const int kMinimalTargetExponent = -60; static const int kMaximalTargetExponent = -32; // Adjusts the last digit of the generated number, and screens out generated // solutions that may be inaccurate. A solution may be inaccurate if it is // outside the safe interval, or if we cannot prove that it is closer to the // input than a neighboring representation of the same length. 
// // Input: * buffer containing the digits of too_high / 10^kappa // * the buffer's length // * distance_too_high_w == (too_high - w).f() * unit // * unsafe_interval == (too_high - too_low).f() * unit // * rest = (too_high - buffer * 10^kappa).f() * unit // * ten_kappa = 10^kappa * unit // * unit = the common multiplier // Output: returns true if the buffer is guaranteed to contain the closest // representable number to the input. // Modifies the generated digits in the buffer to approach (round towards) w. static bool RoundWeed(Vector buffer, int length, uint64_t distance_too_high_w, uint64_t unsafe_interval, uint64_t rest, uint64_t ten_kappa, uint64_t unit) { uint64_t small_distance = distance_too_high_w - unit; uint64_t big_distance = distance_too_high_w + unit; // Let w_low = too_high - big_distance, and // w_high = too_high - small_distance. // Note: w_low < w < w_high // // The real w (* unit) must lie somewhere inside the interval // ]w_low; w_high[ (often written as "(w_low; w_high)") // Basically the buffer currently contains a number in the unsafe interval // ]too_low; too_high[ with too_low < w < too_high // // too_high - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // ^v 1 unit ^ ^ ^ ^ // boundary_high --------------------- . . . . // ^v 1 unit . . . . // - - - - - - - - - - - - - - - - - - - + - - + - - - - - - . . // . . ^ . . // . big_distance . . . // . . . . rest // small_distance . . . . // v . . . . // w_high - - - - - - - - - - - - - - - - - - . . . . // ^v 1 unit . . . . // w ---------------------------------------- . . . . // ^v 1 unit v . . . // w_low - - - - - - - - - - - - - - - - - - - - - . . . // . . v // buffer --------------------------------------------------+-------+-------- // . . // safe_interval . // v . // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - . // ^v 1 unit . 
// boundary_low ------------------------- unsafe_interval // ^v 1 unit v // too_low - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // // // Note that the value of buffer could lie anywhere inside the range too_low // to too_high. // // boundary_low, boundary_high and w are approximations of the real boundaries // and v (the input number). They are guaranteed to be precise up to one unit. // In fact the error is guaranteed to be strictly less than one unit. // // Anything that lies outside the unsafe interval is guaranteed not to round // to v when read again. // Anything that lies inside the safe interval is guaranteed to round to v // when read again. // If the number inside the buffer lies inside the unsafe interval but not // inside the safe interval then we simply do not know and bail out (returning // false). // // Similarly we have to take into account the imprecision of 'w' when finding // the closest representation of 'w'. If we have two potential // representations, and one is closer to both w_low and w_high, then we know // it is closer to the actual value v. // // By generating the digits of too_high we got the largest (closest to // too_high) buffer that is still in the unsafe interval. In the case where // w_high < buffer < too_high we try to decrement the buffer. // This way the buffer approaches (rounds towards) w. // There are 3 conditions that stop the decrementation process: // 1) the buffer is already below w_high // 2) decrementing the buffer would make it leave the unsafe interval // 3) decrementing the buffer would yield a number below w_high and farther // away than the current number. In other words: // (buffer{-1} < w_high) && w_high - buffer{-1} > buffer - w_high // Instead of using the buffer directly we use its distance to too_high. // Conceptually rest ~= too_high - buffer // We need to do the following tests in this order to avoid over- and // underflows. 
DOUBLE_CONVERSION_ASSERT(rest <= unsafe_interval); while (rest < small_distance && // Negated condition 1 unsafe_interval - rest >= ten_kappa && // Negated condition 2 (rest + ten_kappa < small_distance || // buffer{-1} > w_high small_distance - rest >= rest + ten_kappa - small_distance)) { buffer[length - 1]--; rest += ten_kappa; } // We have approached w+ as much as possible. We now test if approaching w- // would require changing the buffer. If yes, then we have two possible // representations close to w, but we cannot decide which one is closer. if (rest < big_distance && unsafe_interval - rest >= ten_kappa && (rest + ten_kappa < big_distance || big_distance - rest > rest + ten_kappa - big_distance)) { return false; } // Weeding test. // The safe interval is [too_low + 2 ulp; too_high - 2 ulp] // Since too_low = too_high - unsafe_interval this is equivalent to // [too_high - unsafe_interval + 4 ulp; too_high - 2 ulp] // Conceptually we have: rest ~= too_high - buffer return (2 * unit <= rest) && (rest <= unsafe_interval - 4 * unit); } // Rounds the buffer upwards if the result is closer to v by possibly adding // 1 to the buffer. If the precision of the calculation is not sufficient to // round correctly, return false. // The rounding might shift the whole buffer in which case the kappa is // adjusted. For example "99", kappa = 3 might become "10", kappa = 4. // // If 2*rest > ten_kappa then the buffer needs to be round up. // rest can have an error of +/- 1 unit. This function accounts for the // imprecision and returns false, if the rounding direction cannot be // unambiguously determined. // // Precondition: rest < ten_kappa. static bool RoundWeedCounted(Vector buffer, int length, uint64_t rest, uint64_t ten_kappa, uint64_t unit, int* kappa) { DOUBLE_CONVERSION_ASSERT(rest < ten_kappa); // The following tests are done in a specific order to avoid overflows. They // will work correctly with any uint64 values of rest < ten_kappa and unit. 
// // If the unit is too big, then we don't know which way to round. For example // a unit of 50 means that the real number lies within rest +/- 50. If // 10^kappa == 40 then there is no way to tell which way to round. if (unit >= ten_kappa) return false; // Even if unit is just half the size of 10^kappa we are already completely // lost. (And after the previous test we know that the expression will not // over/underflow.) if (ten_kappa - unit <= unit) return false; // If 2 * (rest + unit) <= 10^kappa we can safely round down. if ((ten_kappa - rest > rest) && (ten_kappa - 2 * rest >= 2 * unit)) { return true; } // If 2 * (rest - unit) >= 10^kappa, then we can safely round up. if ((rest > unit) && (ten_kappa - (rest - unit) <= (rest - unit))) { // Increment the last digit recursively until we find a non '9' digit. buffer[length - 1]++; for (int i = length - 1; i > 0; --i) { if (buffer[i] != '0' + 10) break; buffer[i] = '0'; buffer[i - 1]++; } // If the first digit is now '0'+ 10 we had a buffer with all '9's. With the // exception of the first digit all digits are now '0'. Simply switch the // first digit to '1' and adjust the kappa. Example: "99" becomes "10" and // the power (the kappa) is increased. if (buffer[0] == '0' + 10) { buffer[0] = '1'; (*kappa) += 1; } return true; } return false; } // Returns the biggest power of ten that is less than or equal to the given // number. We furthermore receive the maximum number of bits 'number' has. // // Returns power == 10^(exponent_plus_one-1) such that // power <= number < power * 10. // If number_bits == 0 then 0^(0-1) is returned. // The number of bits must be <= 32. // Precondition: number < (1 << (number_bits + 1)). 
// Inspired by the method for finding an integer log base 10 from here: // http://graphics.stanford.edu/~seander/bithacks.html#IntegerLog10 static unsigned int const kSmallPowersOfTen[] = {0, 1, 10, 100, 1000, 10000, 100000, 1000000, 10000000, 100000000, 1000000000}; static void BiggestPowerTen(uint32_t number, int number_bits, uint32_t* power, int* exponent_plus_one) { DOUBLE_CONVERSION_ASSERT(number < (1u << (number_bits + 1))); // 1233/4096 is approximately 1/lg(10). int exponent_plus_one_guess = ((number_bits + 1) * 1233 >> 12); // We increment to skip over the first entry in the kPowersOf10 table. // Note: kPowersOf10[i] == 10^(i-1). exponent_plus_one_guess++; // We don't have any guarantees that 2^number_bits <= number. if (number < kSmallPowersOfTen[exponent_plus_one_guess]) { exponent_plus_one_guess--; } *power = kSmallPowersOfTen[exponent_plus_one_guess]; *exponent_plus_one = exponent_plus_one_guess; } // Generates the digits of input number w. // w is a floating-point number (DiyFp), consisting of a significand and an // exponent. Its exponent is bounded by kMinimalTargetExponent and // kMaximalTargetExponent. // Hence -60 <= w.e() <= -32. // // Returns false if it fails, in which case the generated digits in the buffer // should not be used. // Preconditions: // * low, w and high are correct up to 1 ulp (unit in the last place). That // is, their error must be less than a unit of their last digits. // * low.e() == w.e() == high.e() // * low < w < high, and taking into account their error: low~ <= high~ // * kMinimalTargetExponent <= w.e() <= kMaximalTargetExponent // Postconditions: returns false if procedure fails. // otherwise: // * buffer is not null-terminated, but len contains the number of digits. // * buffer contains the shortest possible decimal digit-sequence // such that LOW < buffer * 10^kappa < HIGH, where LOW and HIGH are the // correct values of low and high (without their error). 
// * if more than one decimal representation gives the minimal number of // decimal digits then the one closest to W (where W is the correct value // of w) is chosen. // Remark: this procedure takes into account the imprecision of its input // numbers. If the precision is not enough to guarantee all the postconditions // then false is returned. This usually happens rarely (~0.5%). // // Say, for the sake of example, that // w.e() == -48, and w.f() == 0x1234567890abcdef // w's value can be computed by w.f() * 2^w.e() // We can obtain w's integral digits by simply shifting w.f() by -w.e(). // -> w's integral part is 0x1234 // w's fractional part is therefore 0x567890abcdef. // Printing w's integral part is easy (simply print 0x1234 in decimal). // In order to print its fraction we repeatedly multiply the fraction by 10 and // get each digit. Example the first digit after the point would be computed by // (0x567890abcdef * 10) >> 48. -> 3 // The whole thing becomes slightly more complicated because we want to stop // once we have enough digits. That is, once the digits inside the buffer // represent 'w' we can stop. Everything inside the interval low - high // represents w. However we have to pay attention to low, high and w's // imprecision. static bool DigitGen(DiyFp low, DiyFp w, DiyFp high, Vector buffer, int* length, int* kappa) { DOUBLE_CONVERSION_ASSERT(low.e() == w.e() && w.e() == high.e()); DOUBLE_CONVERSION_ASSERT(low.f() + 1 <= high.f() - 1); DOUBLE_CONVERSION_ASSERT(kMinimalTargetExponent <= w.e() && w.e() <= kMaximalTargetExponent); // low, w and high are imprecise, but by less than one ulp (unit in the last // place). // If we remove (resp. add) 1 ulp from low (resp. high) we are certain that // the new numbers are outside of the interval we want the final // representation to lie in. // Inversely adding (resp. removing) 1 ulp from low (resp. high) would yield // numbers that are certain to lie in the interval. We will use this fact // later on. 
// We will now start by generating the digits within the uncertain // interval. Later we will weed out representations that lie outside the safe // interval and thus _might_ lie outside the correct interval. uint64_t unit = 1; DiyFp too_low = DiyFp(low.f() - unit, low.e()); DiyFp too_high = DiyFp(high.f() + unit, high.e()); // too_low and too_high are guaranteed to lie outside the interval we want the // generated number in. DiyFp unsafe_interval = DiyFp::Minus(too_high, too_low); // We now cut the input number into two parts: the integral digits and the // fractionals. We will not write any decimal separator though, but adapt // kappa instead. // Reminder: we are currently computing the digits (stored inside the buffer) // such that: too_low < buffer * 10^kappa < too_high // We use too_high for the digit_generation and stop as soon as possible. // If we stop early we effectively round down. DiyFp one = DiyFp(static_cast(1) << -w.e(), w.e()); // Division by one is a shift. uint32_t integrals = static_cast(too_high.f() >> -one.e()); // Modulo by one is an and. uint64_t fractionals = too_high.f() & (one.f() - 1); uint32_t divisor; int divisor_exponent_plus_one; BiggestPowerTen(integrals, DiyFp::kSignificandSize - (-one.e()), &divisor, &divisor_exponent_plus_one); *kappa = divisor_exponent_plus_one; *length = 0; // Loop invariant: buffer = too_high / 10^kappa (integer division) // The invariant holds for the first iteration: kappa has been initialized // with the divisor exponent + 1. And the divisor is the biggest power of ten // that is smaller than integrals. while (*kappa > 0) { int digit = integrals / divisor; DOUBLE_CONVERSION_ASSERT(digit <= 9); buffer[*length] = static_cast('0' + digit); (*length)++; integrals %= divisor; (*kappa)--; // Note that kappa now equals the exponent of the divisor and that the // invariant thus holds again. 
uint64_t rest = (static_cast(integrals) << -one.e()) + fractionals; // Invariant: too_high = buffer * 10^kappa + DiyFp(rest, one.e()) // Reminder: unsafe_interval.e() == one.e() if (rest < unsafe_interval.f()) { // Rounding down (by not emitting the remaining digits) yields a number // that lies within the unsafe interval. return RoundWeed(buffer, *length, DiyFp::Minus(too_high, w).f(), unsafe_interval.f(), rest, static_cast(divisor) << -one.e(), unit); } divisor /= 10; } // The integrals have been generated. We are at the point of the decimal // separator. In the following loop we simply multiply the remaining digits by // 10 and divide by one. We just need to pay attention to multiply associated // data (like the interval or 'unit'), too. // Note that the multiplication by 10 does not overflow, because w.e >= -60 // and thus one.e >= -60. DOUBLE_CONVERSION_ASSERT(one.e() >= -60); DOUBLE_CONVERSION_ASSERT(fractionals < one.f()); DOUBLE_CONVERSION_ASSERT(DOUBLE_CONVERSION_UINT64_2PART_C(0xFFFFFFFF, FFFFFFFF) / 10 >= one.f()); for (;;) { fractionals *= 10; unit *= 10; unsafe_interval.set_f(unsafe_interval.f() * 10); // Integer division by one. int digit = static_cast(fractionals >> -one.e()); DOUBLE_CONVERSION_ASSERT(digit <= 9); buffer[*length] = static_cast('0' + digit); (*length)++; fractionals &= one.f() - 1; // Modulo by one. (*kappa)--; if (fractionals < unsafe_interval.f()) { return RoundWeed(buffer, *length, DiyFp::Minus(too_high, w).f() * unit, unsafe_interval.f(), fractionals, one.f(), unit); } } } // Generates (at most) requested_digits digits of input number w. // w is a floating-point number (DiyFp), consisting of a significand and an // exponent. Its exponent is bounded by kMinimalTargetExponent and // kMaximalTargetExponent. // Hence -60 <= w.e() <= -32. // // Returns false if it fails, in which case the generated digits in the buffer // should not be used. // Preconditions: // * w is correct up to 1 ulp (unit in the last place). 
That // is, its error must be strictly less than a unit of its last digit. // * kMinimalTargetExponent <= w.e() <= kMaximalTargetExponent // // Postconditions: returns false if procedure fails. // otherwise: // * buffer is not null-terminated, but length contains the number of // digits. // * the representation in buffer is the most precise representation of // requested_digits digits. // * buffer contains at most requested_digits digits of w. If there are less // than requested_digits digits then some trailing '0's have been removed. // * kappa is such that // w = buffer * 10^kappa + eps with |eps| < 10^kappa / 2. // // Remark: This procedure takes into account the imprecision of its input // numbers. If the precision is not enough to guarantee all the postconditions // then false is returned. This usually happens rarely, but the failure-rate // increases with higher requested_digits. static bool DigitGenCounted(DiyFp w, int requested_digits, Vector buffer, int* length, int* kappa) { DOUBLE_CONVERSION_ASSERT(kMinimalTargetExponent <= w.e() && w.e() <= kMaximalTargetExponent); DOUBLE_CONVERSION_ASSERT(kMinimalTargetExponent >= -60); DOUBLE_CONVERSION_ASSERT(kMaximalTargetExponent <= -32); // w is assumed to have an error less than 1 unit. Whenever w is scaled we // also scale its error. uint64_t w_error = 1; // We cut the input number into two parts: the integral digits and the // fractional digits. We don't emit any decimal separator, but adapt kappa // instead. Example: instead of writing "1.2" we put "12" into the buffer and // increase kappa by 1. DiyFp one = DiyFp(static_cast(1) << -w.e(), w.e()); // Division by one is a shift. uint32_t integrals = static_cast(w.f() >> -one.e()); // Modulo by one is an and. 
uint64_t fractionals = w.f() & (one.f() - 1); uint32_t divisor; int divisor_exponent_plus_one; BiggestPowerTen(integrals, DiyFp::kSignificandSize - (-one.e()), &divisor, &divisor_exponent_plus_one); *kappa = divisor_exponent_plus_one; *length = 0; // Loop invariant: buffer = w / 10^kappa (integer division) // The invariant holds for the first iteration: kappa has been initialized // with the divisor exponent + 1. And the divisor is the biggest power of ten // that is smaller than 'integrals'. while (*kappa > 0) { int digit = integrals / divisor; DOUBLE_CONVERSION_ASSERT(digit <= 9); buffer[*length] = static_cast('0' + digit); (*length)++; requested_digits--; integrals %= divisor; (*kappa)--; // Note that kappa now equals the exponent of the divisor and that the // invariant thus holds again. if (requested_digits == 0) break; divisor /= 10; } if (requested_digits == 0) { uint64_t rest = (static_cast(integrals) << -one.e()) + fractionals; return RoundWeedCounted(buffer, *length, rest, static_cast(divisor) << -one.e(), w_error, kappa); } // The integrals have been generated. We are at the point of the decimal // separator. In the following loop we simply multiply the remaining digits by // 10 and divide by one. We just need to pay attention to multiply associated // data (the 'unit'), too. // Note that the multiplication by 10 does not overflow, because w.e >= -60 // and thus one.e >= -60. DOUBLE_CONVERSION_ASSERT(one.e() >= -60); DOUBLE_CONVERSION_ASSERT(fractionals < one.f()); DOUBLE_CONVERSION_ASSERT(DOUBLE_CONVERSION_UINT64_2PART_C(0xFFFFFFFF, FFFFFFFF) / 10 >= one.f()); while (requested_digits > 0 && fractionals > w_error) { fractionals *= 10; w_error *= 10; // Integer division by one. int digit = static_cast(fractionals >> -one.e()); DOUBLE_CONVERSION_ASSERT(digit <= 9); buffer[*length] = static_cast('0' + digit); (*length)++; requested_digits--; fractionals &= one.f() - 1; // Modulo by one. 
(*kappa)--; } if (requested_digits != 0) return false; return RoundWeedCounted(buffer, *length, fractionals, one.f(), w_error, kappa); } // Provides a decimal representation of v. // Returns true if it succeeds, otherwise the result cannot be trusted. // There will be *length digits inside the buffer (not null-terminated). // If the function returns true then // v == (double) (buffer * 10^decimal_exponent). // The digits in the buffer are the shortest representation possible: no // 0.09999999999999999 instead of 0.1. The shorter representation will even be // chosen even if the longer one would be closer to v. // The last digit will be closest to the actual v. That is, even if several // digits might correctly yield 'v' when read again, the closest will be // computed. static bool Grisu3(double v, FastDtoaMode mode, Vector buffer, int* length, int* decimal_exponent) { DiyFp w = Double(v).AsNormalizedDiyFp(); // boundary_minus and boundary_plus are the boundaries between v and its // closest floating-point neighbors. Any number strictly between // boundary_minus and boundary_plus will round to v when convert to a double. // Grisu3 will never output representations that lie exactly on a boundary. 
DiyFp boundary_minus, boundary_plus; if (mode == FAST_DTOA_SHORTEST) { Double(v).NormalizedBoundaries(&boundary_minus, &boundary_plus); } else { DOUBLE_CONVERSION_ASSERT(mode == FAST_DTOA_SHORTEST_SINGLE); float single_v = static_cast(v); Single(single_v).NormalizedBoundaries(&boundary_minus, &boundary_plus); } DOUBLE_CONVERSION_ASSERT(boundary_plus.e() == w.e()); DiyFp ten_mk; // Cached power of ten: 10^-k int mk; // -k int ten_mk_minimal_binary_exponent = kMinimalTargetExponent - (w.e() + DiyFp::kSignificandSize); int ten_mk_maximal_binary_exponent = kMaximalTargetExponent - (w.e() + DiyFp::kSignificandSize); PowersOfTenCache::GetCachedPowerForBinaryExponentRange( ten_mk_minimal_binary_exponent, ten_mk_maximal_binary_exponent, &ten_mk, &mk); DOUBLE_CONVERSION_ASSERT((kMinimalTargetExponent <= w.e() + ten_mk.e() + DiyFp::kSignificandSize) && (kMaximalTargetExponent >= w.e() + ten_mk.e() + DiyFp::kSignificandSize)); // Note that ten_mk is only an approximation of 10^-k. A DiyFp only contains a // 64 bit significand and ten_mk is thus only precise up to 64 bits. // The DiyFp::Times procedure rounds its result, and ten_mk is approximated // too. The variable scaled_w (as well as scaled_boundary_minus/plus) are now // off by a small amount. // In fact: scaled_w - w*10^k < 1ulp (unit in the last place) of scaled_w. // In other words: let f = scaled_w.f() and e = scaled_w.e(), then // (f-1) * 2^e < w*10^k < (f+1) * 2^e DiyFp scaled_w = DiyFp::Times(w, ten_mk); DOUBLE_CONVERSION_ASSERT(scaled_w.e() == boundary_plus.e() + ten_mk.e() + DiyFp::kSignificandSize); // In theory it would be possible to avoid some recomputations by computing // the difference between w and boundary_minus/plus (a power of 2) and to // compute scaled_boundary_minus/plus by subtracting/adding from // scaled_w. However the code becomes much less readable and the speed // enhancements are not terrific. 
DiyFp scaled_boundary_minus = DiyFp::Times(boundary_minus, ten_mk); DiyFp scaled_boundary_plus = DiyFp::Times(boundary_plus, ten_mk); // DigitGen will generate the digits of scaled_w. Therefore we have // v == (double) (scaled_w * 10^-mk). // Set decimal_exponent == -mk and pass it to DigitGen. If scaled_w is not an // integer than it will be updated. For instance if scaled_w == 1.23 then // the buffer will be filled with "123" and the decimal_exponent will be // decreased by 2. int kappa; bool result = DigitGen(scaled_boundary_minus, scaled_w, scaled_boundary_plus, buffer, length, &kappa); *decimal_exponent = -mk + kappa; return result; } // The "counted" version of grisu3 (see above) only generates requested_digits // number of digits. This version does not generate the shortest representation, // and with enough requested digits 0.1 will at some point print as 0.9999999... // Grisu3 is too imprecise for real halfway cases (1.5 will not work) and // therefore the rounding strategy for halfway cases is irrelevant. static bool Grisu3Counted(double v, int requested_digits, Vector buffer, int* length, int* decimal_exponent) { DiyFp w = Double(v).AsNormalizedDiyFp(); DiyFp ten_mk; // Cached power of ten: 10^-k int mk; // -k int ten_mk_minimal_binary_exponent = kMinimalTargetExponent - (w.e() + DiyFp::kSignificandSize); int ten_mk_maximal_binary_exponent = kMaximalTargetExponent - (w.e() + DiyFp::kSignificandSize); PowersOfTenCache::GetCachedPowerForBinaryExponentRange( ten_mk_minimal_binary_exponent, ten_mk_maximal_binary_exponent, &ten_mk, &mk); DOUBLE_CONVERSION_ASSERT((kMinimalTargetExponent <= w.e() + ten_mk.e() + DiyFp::kSignificandSize) && (kMaximalTargetExponent >= w.e() + ten_mk.e() + DiyFp::kSignificandSize)); // Note that ten_mk is only an approximation of 10^-k. A DiyFp only contains a // 64 bit significand and ten_mk is thus only precise up to 64 bits. // The DiyFp::Times procedure rounds its result, and ten_mk is approximated // too. 
The variable scaled_w (as well as scaled_boundary_minus/plus) are now // off by a small amount. // In fact: scaled_w - w*10^k < 1ulp (unit in the last place) of scaled_w. // In other words: let f = scaled_w.f() and e = scaled_w.e(), then // (f-1) * 2^e < w*10^k < (f+1) * 2^e DiyFp scaled_w = DiyFp::Times(w, ten_mk); // We now have (double) (scaled_w * 10^-mk). // DigitGen will generate the first requested_digits digits of scaled_w and // return together with a kappa such that scaled_w ~= buffer * 10^kappa. (It // will not always be exactly the same since DigitGenCounted only produces a // limited number of digits.) int kappa; bool result = DigitGenCounted(scaled_w, requested_digits, buffer, length, &kappa); *decimal_exponent = -mk + kappa; return result; } bool FastDtoa(double v, FastDtoaMode mode, int requested_digits, Vector buffer, int* length, int* decimal_point) { DOUBLE_CONVERSION_ASSERT(v > 0); DOUBLE_CONVERSION_ASSERT(!Double(v).IsSpecial()); bool result = false; int decimal_exponent = 0; switch (mode) { case FAST_DTOA_SHORTEST: case FAST_DTOA_SHORTEST_SINGLE: result = Grisu3(v, mode, buffer, length, &decimal_exponent); break; case FAST_DTOA_PRECISION: result = Grisu3Counted(v, requested_digits, buffer, length, &decimal_exponent); break; default: DOUBLE_CONVERSION_UNREACHABLE(); } if (result) { *decimal_point = *length + decimal_exponent; buffer[*length] = '\0'; } return result; } } // namespace double_conversion // ICU PATCH: Close ICU namespace U_NAMESPACE_END #endif // ICU PATCH: close #if !UCONFIG_NO_FORMATTING stringi/src/icu74/i18n/erarules.h0000644000176200001440000000614314700200761016226 0ustar liggesusers// © 2018 and later: Unicode, Inc. and others. 
// License & terms of use: http://www.unicode.org/copyright.html #ifndef ERARULES_H_ #define ERARULES_H_ #include "unicode/utypes.h" #if !UCONFIG_NO_FORMATTING #include "unicode/localpointer.h" #include "unicode/uobject.h" #include "cmemory.h" U_NAMESPACE_BEGIN // Export an explicit template instantiation of LocalMemory used as a data member of EraRules. // When building DLLs for Windows this is required even though no direct access leaks out of the i18n library. // See digitlst.h, pluralaffix.h, datefmt.h, and others for similar examples. #if U_PF_WINDOWS <= U_PLATFORM && U_PLATFORM <= U_PF_CYGWIN #if defined(_MSC_VER) // Ignore warning 4661 as LocalPointerBase does not use operator== or operator!= #pragma warning(push) #pragma warning(disable: 4661) #endif template class U_I18N_API LocalPointerBase; template class U_I18N_API LocalMemory; #if defined(_MSC_VER) #pragma warning(pop) #endif #endif class U_I18N_API EraRules : public UMemory { public: ~EraRules(); static EraRules* createInstance(const char *calType, UBool includeTentativeEra, UErrorCode& status); /** * Gets number of effective eras * @return number of effective eras */ inline int32_t getNumberOfEras() const { return numEras; } /** * Gets start date of an era * @param eraIdx Era index * @param fields Receives date fields. The result includes values of year, month, * day of month in this order. When an era has no start date, the result * will be January 1st in year whose value is minimum integer. * @param status Receives status. */ void getStartDate(int32_t eraIdx, int32_t (&fields)[3], UErrorCode& status) const; /** * Gets start year of an era * @param eraIdx Era index * @param status Receives status. * @return The first year of an era. When a era has no start date, minimum int32 * value is returned. */ int32_t getStartYear(int32_t eraIdx, UErrorCode& status) const; /** * Returns era index for the specified year/month/day. 
* @param year Year * @param month Month (1-base) * @param day Day of month * @param status Receives status * @return era index (or 0, when the specified date is before the first era) */ int32_t getEraIndex(int32_t year, int32_t month, int32_t day, UErrorCode& status) const; /** * Gets the current era index. This is calculated only once for an instance of * EraRules. The current era calculation is based on the default time zone at * the time of instantiation. * * @return era index of current era (or 0, when current date is before the first era) */ inline int32_t getCurrentEraIndex() const { return currentEra; } private: EraRules(LocalMemory& eraStartDates, int32_t numEra); void initCurrentEra(); LocalMemory startDates; int32_t numEras; int32_t currentEra; }; U_NAMESPACE_END #endif /* #if !UCONFIG_NO_FORMATTING */ #endif /* ERARULES_H_ */ stringi/src/icu74/i18n/decNumber.cpp0000644000176200001440000143005114700200761016643 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ------------------------------------------------------------------ */ /* Decimal Number arithmetic module */ /* ------------------------------------------------------------------ */ /* Copyright (c) IBM Corporation, 2000-2014. All rights reserved. */ /* */ /* This software is made available under the terms of the */ /* ICU License -- ICU 1.8.1 and later. */ /* */ /* The description and User's Guide ("The decNumber C Library") for */ /* this software is called decNumber.pdf. This document is */ /* available, together with arithmetic and format specifications, */ /* testcases, and Web links, on the General Decimal Arithmetic page. 
*/ /* */ /* Please send comments, suggestions, and corrections to the author: */ /* mfc@uk.ibm.com */ /* Mike Cowlishaw, IBM Fellow */ /* IBM UK, PO Box 31, Birmingham Road, Warwick CV34 5JL, UK */ /* ------------------------------------------------------------------ */ /* Modified version, for use from within ICU. * Renamed public functions, to avoid an unwanted export of the * standard names from the ICU library. * * Use ICU's uprv_malloc() and uprv_free() * * Revert comment syntax to plain C * * Remove a few compiler warnings. */ /* This module comprises the routines for arbitrary-precision General */ /* Decimal Arithmetic as defined in the specification which may be */ /* found on the General Decimal Arithmetic pages. It implements both */ /* the full ('extended') arithmetic and the simpler ('subset') */ /* arithmetic. */ /* */ /* Usage notes: */ /* */ /* 1. This code is ANSI C89 except: */ /* */ /* a) C99 line comments (double forward slash) are used. (Most C */ /* compilers accept these. If yours does not, a simple script */ /* can be used to convert them to ANSI C comments.) */ /* */ /* b) Types from C99 stdint.h are used. If you do not have this */ /* header file, see the User's Guide section of the decNumber */ /* documentation; this lists the necessary definitions. */ /* */ /* c) If DECDPUN>4 or DECUSE64=1, the C99 64-bit int64_t and */ /* uint64_t types may be used. To avoid these, set DECUSE64=0 */ /* and DECDPUN<=4 (see documentation). */ /* */ /* The code also conforms to C99 restrictions; in particular, */ /* strict aliasing rules are observed. */ /* */ /* 2. The decNumber format which this library uses is optimized for */ /* efficient processing of relatively short numbers; in particular */ /* it allows the use of fixed sized structures and minimizes copy */ /* and move operations. 
It does, however, support arbitrary */ /* precision (up to 999,999,999 digits) and arbitrary exponent */ /* range (Emax in the range 0 through 999,999,999 and Emin in the */ /* range -999,999,999 through 0). Mathematical functions (for */ /* example decNumberExp) as identified below are restricted more */ /* tightly: digits, emax, and -emin in the context must be <= */ /* DEC_MAX_MATH (999999), and their operand(s) must be within */ /* these bounds. */ /* */ /* 3. Logical functions are further restricted; their operands must */ /* be finite, positive, have an exponent of zero, and all digits */ /* must be either 0 or 1. The result will only contain digits */ /* which are 0 or 1 (and will have exponent=0 and a sign of 0). */ /* */ /* 4. Operands to operator functions are never modified unless they */ /* are also specified to be the result number (which is always */ /* permitted). Other than that case, operands must not overlap. */ /* */ /* 5. Error handling: the type of the error is ORed into the status */ /* flags in the current context (decContext structure). The */ /* SIGFPE signal is then raised if the corresponding trap-enabler */ /* flag in the decContext is set (is 1). */ /* */ /* It is the responsibility of the caller to clear the status */ /* flags as required. */ /* */ /* The result of any routine which returns a number will always */ /* be a valid number (which may be a special value, such as an */ /* Infinity or NaN). */ /* */ /* 6. The decNumber format is not an exchangeable concrete */ /* representation as it comprises fields which may be machine- */ /* dependent (packed or unpacked, or special length, for example). */ /* Canonical conversions to and from strings are provided; other */ /* conversions are available in separate modules. */ /* */ /* 7. Normally, input operands are assumed to be valid. Set DECCHECK */ /* to 1 for extended operand checking (including nullptr operands). 
*/ /* Results are undefined if a badly-formed structure (or a nullptr */ /* pointer to a structure) is provided, though with DECCHECK */ /* enabled the operator routines are protected against exceptions. */ /* (Except if the result pointer is nullptr, which is unrecoverable.) */ /* */ /* However, the routines will never cause exceptions if they are */ /* given well-formed operands, even if the value of the operands */ /* is inappropriate for the operation and DECCHECK is not set. */ /* (Except for SIGFPE, as and where documented.) */ /* */ /* 8. Subset arithmetic is available only if DECSUBSET is set to 1. */ /* ------------------------------------------------------------------ */ /* Implementation notes for maintenance of this module: */ /* */ /* 1. Storage leak protection: Routines which use malloc are not */ /* permitted to use return for fastpath or error exits (i.e., */ /* they follow strict structured programming conventions). */ /* Instead they have a do{}while(0); construct surrounding the */ /* code which is protected -- break may be used to exit this. */ /* Other routines can safely use the return statement inline. */ /* */ /* Storage leak accounting can be enabled using DECALLOC. */ /* */ /* 2. All loops use the for(;;) construct. Any do construct does */ /* not loop; it is for allocation protection as just described. */ /* */ /* 3. Setting status in the context must always be the very last */ /* action in a routine, as non-0 status may raise a trap and hence */ /* the call to set status may not return (if the handler uses long */ /* jump). Therefore all cleanup must be done first. In general, */ /* to achieve this status is accumulated and is only applied just */ /* before return by calling decContextSetStatus (via decStatus). */ /* */ /* Routines which allocate storage cannot, in general, use the */ /* 'top level' routines which could cause a non-returning */ /* transfer of control. 
The decXxxxOp routines are safe (do not */ /* call decStatus even if traps are set in the context) and should */ /* be used instead (they are also a little faster). */ /* */ /* 4. Exponent checking is minimized by allowing the exponent to */ /* grow outside its limits during calculations, provided that */ /* the decFinalize function is called later. Multiplication and */ /* division, and intermediate calculations in exponentiation, */ /* require more careful checks because of the risk of 31-bit */ /* overflow (the most negative valid exponent is -1999999997, for */ /* a 999999999-digit number with adjusted exponent of -999999999). */ /* */ /* 5. Rounding is deferred until finalization of results, with any */ /* 'off to the right' data being represented as a single digit */ /* residue (in the range -1 through 9). This avoids any double- */ /* rounding when more than one shortening takes place (for */ /* example, when a result is subnormal). */ /* */ /* 6. The digits count is allowed to rise to a multiple of DECDPUN */ /* during many operations, so whole Units are handled and exact */ /* accounting of digits is not needed. The correct digits value */ /* is found by decGetDigits, which accounts for leading zeros. */ /* This must be called before any rounding if the number of digits */ /* is not known exactly. */ /* */ /* 7. The multiply-by-reciprocal 'trick' is used for partitioning */ /* numbers up to four digits, using appropriate constants. This */ /* is not useful for longer numbers because overflow of 32 bits */ /* would lead to 4 multiplies, which is almost as expensive as */ /* a divide (unless a floating-point or 64-bit multiply is */ /* assumed to be available). */ /* */ /* 8. 
Unusual abbreviations that may be used in the commentary: */ /* lhs -- left hand side (operand, of an operation) */ /* lsd -- least significant digit (of coefficient) */ /* lsu -- least significant Unit (of coefficient) */ /* msd -- most significant digit (of coefficient) */ /* msi -- most significant item (in an array) */ /* msu -- most significant Unit (of coefficient) */ /* rhs -- right hand side (operand, of an operation) */ /* +ve -- positive */ /* -ve -- negative */ /* ** -- raise to the power */ /* ------------------------------------------------------------------ */ #include /* for malloc, free, etc. */ /* #include */ /* for printf [if needed] */ #include /* for strcpy */ #include /* for lower */ #include "cmemory.h" /* for uprv_malloc, etc., in ICU */ #include "decNumber.h" /* base number library */ #include "decNumberLocal.h" /* decNumber local types, etc. */ #include "uassert.h" /* Constants */ /* Public lookup table used by the D2U macro */ static const uByte d2utable[DECMAXD2U+1]=D2UTABLE; #define DECVERB 1 /* set to 1 for verbose DECCHECK */ #define powers DECPOWERS /* old internal name */ /* Local constants */ #define DIVIDE 0x80 /* Divide operators */ #define REMAINDER 0x40 /* .. */ #define DIVIDEINT 0x20 /* .. */ #define REMNEAR 0x10 /* .. */ #define COMPARE 0x01 /* Compare operators */ #define COMPMAX 0x02 /* .. */ #define COMPMIN 0x03 /* .. */ #define COMPTOTAL 0x04 /* .. */ #define COMPNAN 0x05 /* .. [NaN processing] */ #define COMPSIG 0x06 /* .. [signaling COMPARE] */ #define COMPMAXMAG 0x07 /* .. */ #define COMPMINMAG 0x08 /* .. 
*/ #define DEC_sNaN 0x40000000 /* local status: sNaN signal */ #define BADINT (Int)0x80000000 /* most-negative Int; error indicator */ /* Next two indicate an integer >= 10**6, and its parity (bottom bit) */ #define BIGEVEN (Int)0x80000002 #define BIGODD (Int)0x80000003 static const Unit uarrone[1]={1}; /* Unit array of 1, used for incrementing */ /* ------------------------------------------------------------------ */ /* round-for-reround digits */ /* ------------------------------------------------------------------ */ #if 0 static const uByte DECSTICKYTAB[10]={1,1,2,3,4,6,6,7,8,9}; /* used if sticky */ #endif /* ------------------------------------------------------------------ */ /* Powers of ten (powers[n]==10**n, 0<=n<=9) */ /* ------------------------------------------------------------------ */ static const uInt DECPOWERS[10]={1, 10, 100, 1000, 10000, 100000, 1000000, 10000000, 100000000, 1000000000}; /* Granularity-dependent code */ #if DECDPUN<=4 #define eInt Int /* extended integer */ #define ueInt uInt /* unsigned extended integer */ /* Constant multipliers for divide-by-power-of five using reciprocal */ /* multiply, after removing powers of 2 by shifting, and final shift */ /* of 17 [we only need up to **4] */ static const uInt multies[]={131073, 26215, 5243, 1049, 210}; /* QUOT10 -- macro to return the quotient of unit u divided by 10**n */ #define QUOT10(u, n) ((((uInt)(u)>>(n))*multies[n])>>17) #else /* For DECDPUN>4 non-ANSI-89 64-bit types are needed. 
*/ #if !DECUSE64 #error decNumber.c: DECUSE64 must be 1 when DECDPUN>4 #endif #define eInt Long /* extended integer */ #define ueInt uLong /* unsigned extended integer */ #endif /* Local routines */ static decNumber * decAddOp(decNumber *, const decNumber *, const decNumber *, decContext *, uByte, uInt *); static Flag decBiStr(const char *, const char *, const char *); static uInt decCheckMath(const decNumber *, decContext *, uInt *); static void decApplyRound(decNumber *, decContext *, Int, uInt *); static Int decCompare(const decNumber *lhs, const decNumber *rhs, Flag); static decNumber * decCompareOp(decNumber *, const decNumber *, const decNumber *, decContext *, Flag, uInt *); static void decCopyFit(decNumber *, const decNumber *, decContext *, Int *, uInt *); static decNumber * decDecap(decNumber *, Int); static decNumber * decDivideOp(decNumber *, const decNumber *, const decNumber *, decContext *, Flag, uInt *); static decNumber * decExpOp(decNumber *, const decNumber *, decContext *, uInt *); static void decFinalize(decNumber *, decContext *, Int *, uInt *); static Int decGetDigits(Unit *, Int); static Int decGetInt(const decNumber *); static decNumber * decLnOp(decNumber *, const decNumber *, decContext *, uInt *); static decNumber * decMultiplyOp(decNumber *, const decNumber *, const decNumber *, decContext *, uInt *); static decNumber * decNaNs(decNumber *, const decNumber *, const decNumber *, decContext *, uInt *); static decNumber * decQuantizeOp(decNumber *, const decNumber *, const decNumber *, decContext *, Flag, uInt *); static void decReverse(Unit *, Unit *); static void decSetCoeff(decNumber *, decContext *, const Unit *, Int, Int *, uInt *); static void decSetMaxValue(decNumber *, decContext *); static void decSetOverflow(decNumber *, decContext *, uInt *); static void decSetSubnormal(decNumber *, decContext *, Int *, uInt *); static Int decShiftToLeast(Unit *, Int, Int); static Int decShiftToMost(Unit *, Int, Int); static void 
decStatus(decNumber *, uInt, decContext *); static void decToString(const decNumber *, char[], Flag); static decNumber * decTrim(decNumber *, decContext *, Flag, Flag, Int *); static Int decUnitAddSub(const Unit *, Int, const Unit *, Int, Int, Unit *, Int); static Int decUnitCompare(const Unit *, Int, const Unit *, Int, Int); #if !DECSUBSET /* decFinish == decFinalize when no subset arithmetic needed */ #define decFinish(a,b,c,d) decFinalize(a,b,c,d) #else static void decFinish(decNumber *, decContext *, Int *, uInt *); static decNumber * decRoundOperand(const decNumber *, decContext *, uInt *); #endif /* Local macros */ /* masked special-values bits */ #define SPECIALARG (rhs->bits & DECSPECIAL) #define SPECIALARGS ((lhs->bits | rhs->bits) & DECSPECIAL) /* For use in ICU */ #define malloc(a) uprv_malloc(a) #define free(a) uprv_free(a) /* Diagnostic macros, etc. */ #if DECALLOC /* Handle malloc/free accounting. If enabled, our accountable routines */ /* are used; otherwise the code just goes straight to the system malloc */ /* and free routines. */ #define malloc(a) decMalloc(a) #define free(a) decFree(a) #define DECFENCE 0x5a /* corruption detector */ /* 'Our' malloc and free: */ static void *decMalloc(size_t); static void decFree(void *); uInt decAllocBytes=0; /* count of bytes allocated */ /* Note that DECALLOC code only checks for storage buffer overflow. */ /* To check for memory leaks, the decAllocBytes variable must be */ /* checked to be 0 at appropriate times (e.g., after the test */ /* harness completes a set of tests). This checking may be unreliable */ /* if the testing is done in a multi-thread environment. */ #endif #if DECCHECK /* Optional checking routines. Enabling these means that decNumber */ /* and decContext operands to operator routines are checked for */ /* correctness. This roughly doubles the execution time of the */ /* fastest routines (and adds 600+ bytes), so should not normally be */ /* used in 'production'. 
*/ /* decCheckInexact is used to check that inexact results have a full */ /* complement of digits (where appropriate -- this is not the case */ /* for Quantize, for example) */ #define DECUNRESU ((decNumber *)(void *)0xffffffff) #define DECUNUSED ((const decNumber *)(void *)0xffffffff) #define DECUNCONT ((decContext *)(void *)(0xffffffff)) static Flag decCheckOperands(decNumber *, const decNumber *, const decNumber *, decContext *); static Flag decCheckNumber(const decNumber *); static void decCheckInexact(const decNumber *, decContext *); #endif #if DECTRACE || DECCHECK /* Optional trace/debugging routines (may or may not be used) */ void decNumberShow(const decNumber *); /* displays the components of a number */ static void decDumpAr(char, const Unit *, Int); #endif /* ================================================================== */ /* Conversions */ /* ================================================================== */ /* ------------------------------------------------------------------ */ /* from-int32 -- conversion from Int or uInt */ /* */ /* dn is the decNumber to receive the integer */ /* in or uin is the integer to be converted */ /* returns dn */ /* */ /* No error is possible. 
*/ /* ------------------------------------------------------------------ */ U_CAPI decNumber * U_EXPORT2 uprv_decNumberFromInt32(decNumber *dn, Int in) { uInt unsig; if (in>=0) unsig=in; else { /* negative (possibly BADINT) */ if (in==BADINT) unsig=(uInt)1073741824*2; /* special case */ else unsig=-in; /* invert */ } /* in is now positive */ uprv_decNumberFromUInt32(dn, unsig); if (in<0) dn->bits=DECNEG; /* sign needed */ return dn; } /* decNumberFromInt32 */ U_CAPI decNumber * U_EXPORT2 uprv_decNumberFromUInt32(decNumber *dn, uInt uin) { Unit *up; /* work pointer */ uprv_decNumberZero(dn); /* clean */ if (uin==0) return dn; /* [or decGetDigits bad call] */ for (up=dn->lsu; uin>0; up++) { *up=(Unit)(uin%(DECDPUNMAX+1)); uin=uin/(DECDPUNMAX+1); } dn->digits=decGetDigits(dn->lsu, static_cast(up - dn->lsu)); return dn; } /* decNumberFromUInt32 */ /* ------------------------------------------------------------------ */ /* to-int32 -- conversion to Int or uInt */ /* */ /* dn is the decNumber to convert */ /* set is the context for reporting errors */ /* returns the converted decNumber, or 0 if Invalid is set */ /* */ /* Invalid is set if the decNumber does not have exponent==0 or if */ /* it is a NaN, Infinite, or out-of-range. */ /* ------------------------------------------------------------------ */ U_CAPI Int U_EXPORT2 uprv_decNumberToInt32(const decNumber *dn, decContext *set) { #if DECCHECK if (decCheckOperands(DECUNRESU, DECUNUSED, dn, set)) return 0; #endif /* special or too many digits, or bad exponent */ if (dn->bits&DECSPECIAL || dn->digits>10 || dn->exponent!=0) ; /* bad */ else { /* is a finite integer with 10 or fewer digits */ Int d; /* work */ const Unit *up; /* .. */ uInt hi=0, lo; /* .. 
*/ up=dn->lsu; /* -> lsu */ lo=*up; /* get 1 to 9 digits */ #if DECDPUN>1 /* split to higher */ hi=lo/10; lo=lo%10; #endif up++; /* collect remaining Units, if any, into hi */ for (d=DECDPUN; ddigits; up++, d+=DECDPUN) hi+=*up*powers[d-1]; /* now low has the lsd, hi the remainder */ if (hi>214748364 || (hi==214748364 && lo>7)) { /* out of range? */ /* most-negative is a reprieve */ if (dn->bits&DECNEG && hi==214748364 && lo==8) return 0x80000000; /* bad -- drop through */ } else { /* in-range always */ Int i=X10(hi)+lo; if (dn->bits&DECNEG) return -i; return i; } } /* integer */ uprv_decContextSetStatus(set, DEC_Invalid_operation); /* [may not return] */ return 0; } /* decNumberToInt32 */ U_CAPI uInt U_EXPORT2 uprv_decNumberToUInt32(const decNumber *dn, decContext *set) { #if DECCHECK if (decCheckOperands(DECUNRESU, DECUNUSED, dn, set)) return 0; #endif /* special or too many digits, or bad exponent, or negative (<0) */ if (dn->bits&DECSPECIAL || dn->digits>10 || dn->exponent!=0 || (dn->bits&DECNEG && !ISZERO(dn))); /* bad */ else { /* is a finite integer with 10 or fewer digits */ Int d; /* work */ const Unit *up; /* .. */ uInt hi=0, lo; /* .. 
*/ up=dn->lsu; /* -> lsu */ lo=*up; /* get 1 to 9 digits */ #if DECDPUN>1 /* split to higher */ hi=lo/10; lo=lo%10; #endif up++; /* collect remaining Units, if any, into hi */ for (d=DECDPUN; ddigits; up++, d+=DECDPUN) hi+=*up*powers[d-1]; /* now low has the lsd, hi the remainder */ if (hi>429496729 || (hi==429496729 && lo>5)) ; /* no reprieve possible */ else return X10(hi)+lo; } /* integer */ uprv_decContextSetStatus(set, DEC_Invalid_operation); /* [may not return] */ return 0; } /* decNumberToUInt32 */ /* ------------------------------------------------------------------ */ /* to-scientific-string -- conversion to numeric string */ /* to-engineering-string -- conversion to numeric string */ /* */ /* decNumberToString(dn, string); */ /* decNumberToEngString(dn, string); */ /* */ /* dn is the decNumber to convert */ /* string is the string where the result will be laid out */ /* */ /* string must be at least dn->digits+14 characters long */ /* */ /* No error is possible, and no status can be set. 
*/ /* ------------------------------------------------------------------ */ U_CAPI char * U_EXPORT2 uprv_decNumberToString(const decNumber *dn, char *string){ decToString(dn, string, 0); return string; } /* DecNumberToString */ U_CAPI char * U_EXPORT2 uprv_decNumberToEngString(const decNumber *dn, char *string){ decToString(dn, string, 1); return string; } /* DecNumberToEngString */ /* ------------------------------------------------------------------ */ /* to-number -- conversion from numeric string */ /* */ /* decNumberFromString -- convert string to decNumber */ /* dn -- the number structure to fill */ /* chars[] -- the string to convert ('\0' terminated) */ /* set -- the context used for processing any error, */ /* determining the maximum precision available */ /* (set.digits), determining the maximum and minimum */ /* exponent (set.emax and set.emin), determining if */ /* extended values are allowed, and checking the */ /* rounding mode if overflow occurs or rounding is */ /* needed. */ /* */ /* The length of the coefficient and the size of the exponent are */ /* checked by this routine, so the correct error (Underflow or */ /* Overflow) can be reported or rounding applied, as necessary. */ /* */ /* If bad syntax is detected, the result will be a quiet NaN. 
*/ /* ------------------------------------------------------------------ */ U_CAPI decNumber * U_EXPORT2 uprv_decNumberFromString(decNumber *dn, const char chars[], decContext *set) { Int exponent=0; /* working exponent [assume 0] */ uByte bits=0; /* working flags [assume +ve] */ Unit *res; /* where result will be built */ Unit resbuff[SD2U(DECBUFFER+9)];/* local buffer in case need temporary */ /* [+9 allows for ln() constants] */ Unit *allocres=nullptr; /* -> allocated result, iff allocated */ Int d=0; /* count of digits found in decimal part */ const char *dotchar=nullptr; /* where dot was found */ const char *cfirst=chars; /* -> first character of decimal part */ const char *last=nullptr; /* -> last digit of decimal part */ const char *c; /* work */ Unit *up; /* .. */ #if DECDPUN>1 Int cut, out; /* .. */ #endif Int residue; /* rounding residue */ uInt status=0; /* error code */ #if DECCHECK if (decCheckOperands(DECUNRESU, DECUNUSED, DECUNUSED, set)) return uprv_decNumberZero(dn); #endif do { /* status & malloc protection */ for (c=chars;; c++) { /* -> input character */ if (*c>='0' && *c<='9') { /* test for Arabic digit */ last=c; d++; /* count of real digits */ continue; /* still in decimal part */ } if (*c=='.' && dotchar==nullptr) { /* first '.' */ dotchar=c; /* record offset into decimal part */ if (c==cfirst) cfirst++; /* first digit must follow */ continue;} if (c==chars) { /* first in string... */ if (*c=='-') { /* valid - sign */ cfirst++; bits=DECNEG; continue;} if (*c=='+') { /* valid + sign */ cfirst++; continue;} } /* *c is not a digit, or a valid +, -, or '.' */ break; } /* c */ if (last==nullptr) { /* no digits yet */ status=DEC_Conversion_syntax;/* assume the worst */ if (*c=='\0') break; /* and no more to come... */ #if DECSUBSET /* if subset then infinities and NaNs are not allowed */ if (!set->extended) break; /* hopeless */ #endif /* Infinities and NaNs are possible, here */ if (dotchar!=nullptr) break; /* .. 
unless had a dot */ uprv_decNumberZero(dn); /* be optimistic */ if (decBiStr(c, "infinity", "INFINITY") || decBiStr(c, "inf", "INF")) { dn->bits=bits | DECINF; status=0; /* is OK */ break; /* all done */ } /* a NaN expected */ /* 2003.09.10 NaNs are now permitted to have a sign */ dn->bits=bits | DECNAN; /* assume simple NaN */ if (*c=='s' || *c=='S') { /* looks like an sNaN */ c++; dn->bits=bits | DECSNAN; } if (*c!='n' && *c!='N') break; /* check caseless "NaN" */ c++; if (*c!='a' && *c!='A') break; /* .. */ c++; if (*c!='n' && *c!='N') break; /* .. */ c++; /* now either nothing, or nnnn payload, expected */ /* -> start of integer and skip leading 0s [including plain 0] */ for (cfirst=c; *cfirst=='0';) cfirst++; if (*cfirst=='\0') { /* "NaN" or "sNaN", maybe with all 0s */ status=0; /* it's good */ break; /* .. */ } /* something other than 0s; setup last and d as usual [no dots] */ for (c=cfirst;; c++, d++) { if (*c<'0' || *c>'9') break; /* test for Arabic digit */ last=c; } if (*c!='\0') break; /* not all digits */ if (d>set->digits-1) { /* [NB: payload in a decNumber can be full length unless */ /* clamped, in which case can only be digits-1] */ if (set->clamp) break; if (d>set->digits) break; } /* too many digits? */ /* good; drop through to convert the integer to coefficient */ status=0; /* syntax is OK */ bits=dn->bits; /* for copy-back */ } /* last==nullptr */ else if (*c!='\0') { /* more to process... 
*/ /* had some digits; exponent is only valid sequence now */ Flag nege; /* 1=negative exponent */ const char *firstexp; /* -> first significant exponent digit */ status=DEC_Conversion_syntax;/* assume the worst */ if (*c!='e' && *c!='E') break; /* Found 'e' or 'E' -- now process explicit exponent */ /* 1998.07.11: sign no longer required */ nege=0; c++; /* to (possible) sign */ if (*c=='-') {nege=1; c++;} else if (*c=='+') c++; if (*c=='\0') break; for (; *c=='0' && *(c+1)!='\0';) c++; /* strip insignificant zeros */ firstexp=c; /* save exponent digit place */ uInt uexponent = 0; /* Avoid undefined behavior on signed int overflow */ for (; ;c++) { if (*c<'0' || *c>'9') break; /* not a digit */ uexponent=X10(uexponent)+(uInt)*c-(uInt)'0'; } /* c */ exponent = (Int)uexponent; /* if not now on a '\0', *c must not be a digit */ if (*c!='\0') break; /* (this next test must be after the syntax checks) */ /* if it was too long the exponent may have wrapped, so check */ /* carefully and set it to a certain overflow if wrap possible */ if (c>=firstexp+9+1) { if (c>firstexp+9+1 || *firstexp>'1') exponent=DECNUMMAXE*2; /* [up to 1999999999 is OK, for example 1E-1000000998] */ } if (nege) exponent=-exponent; /* was negative */ status=0; /* is OK */ } /* stuff after digits */ /* Here when whole string has been inspected; syntax is good */ /* cfirst->first digit (never dot), last->last digit (ditto) */ /* strip leading zeros/dot [leave final 0 if all 0's] */ if (*cfirst=='0') { /* [cfirst has stepped over .] */ for (c=cfirst; cextended) { uprv_decNumberZero(dn); /* clean result */ break; /* [could be return] */ } #endif } /* at least one leading 0 */ /* Handle decimal point... */ if (dotchar!=nullptr && dotchar(last-dotchar); /* adjust exponent */ /* [we can now ignore the .] */ /* OK, the digits string is good. 
Assemble in the decNumber, or in */ /* a temporary units array if rounding is needed */ if (d<=set->digits) res=dn->lsu; /* fits into supplied decNumber */ else { /* rounding needed */ Int needbytes=D2U(d)*sizeof(Unit);/* bytes needed */ res=resbuff; /* assume use local buffer */ if (needbytes>(Int)sizeof(resbuff)) { /* too big for local */ allocres=(Unit *)malloc(needbytes); if (allocres==nullptr) {status|=DEC_Insufficient_storage; break;} res=allocres; } } /* res now -> number lsu, buffer, or allocated storage for Unit array */ /* Place the coefficient into the selected Unit array */ /* [this is often 70% of the cost of this function when DECDPUN>1] */ #if DECDPUN>1 out=0; /* accumulator */ up=res+D2U(d)-1; /* -> msu */ cut=d-(up-res)*DECDPUN; /* digits in top unit */ for (c=cfirst;; c++) { /* along the digits */ if (*c=='.') continue; /* ignore '.' [don't decrement cut] */ out=X10(out)+(Int)*c-(Int)'0'; if (c==last) break; /* done [never get to trailing '.'] */ cut--; if (cut>0) continue; /* more for this unit */ *up=(Unit)out; /* write unit */ up--; /* prepare for unit below.. */ cut=DECDPUN; /* .. */ out=0; /* .. */ } /* c */ *up=(Unit)out; /* write lsu */ #else /* DECDPUN==1 */ up=res; /* -> lsu */ for (c=last; c>=cfirst; c--) { /* over each character, from least */ if (*c=='.') continue; /* ignore . 
[don't step up] */ *up=(Unit)((Int)*c-(Int)'0'); up++; } /* c */ #endif dn->bits=bits; dn->exponent=exponent; dn->digits=d; /* if not in number (too long) shorten into the number */ if (d>set->digits) { residue=0; decSetCoeff(dn, set, res, d, &residue, &status); /* always check for overflow or subnormal and round as needed */ decFinalize(dn, set, &residue, &status); } else { /* no rounding, but may still have overflow or subnormal */ /* [these tests are just for performance; finalize repeats them] */ if ((dn->exponent-1emin-dn->digits) || (dn->exponent-1>set->emax-set->digits)) { residue=0; decFinalize(dn, set, &residue, &status); } } /* decNumberShow(dn); */ } while(0); /* [for break] */ if (allocres!=nullptr) free(allocres); /* drop any storage used */ if (status!=0) decStatus(dn, status, set); return dn; } /* decNumberFromString */ /* ================================================================== */ /* Operators */ /* ================================================================== */ /* ------------------------------------------------------------------ */ /* decNumberAbs -- absolute value operator */ /* */ /* This computes C = abs(A) */ /* */ /* res is C, the result. C may be A */ /* rhs is A */ /* set is the context */ /* */ /* See also decNumberCopyAbs for a quiet bitwise version of this. */ /* C must have space for set->digits digits. */ /* ------------------------------------------------------------------ */ /* This has the same effect as decNumberPlus unless A is negative, */ /* in which case it has the same effect as decNumberMinus. 
*/ /* ------------------------------------------------------------------ */ U_CAPI decNumber * U_EXPORT2 uprv_decNumberAbs(decNumber *res, const decNumber *rhs, decContext *set) { decNumber dzero; /* for 0 */ uInt status=0; /* accumulator */ #if DECCHECK if (decCheckOperands(res, DECUNUSED, rhs, set)) return res; #endif uprv_decNumberZero(&dzero); /* set 0 */ dzero.exponent=rhs->exponent; /* [no coefficient expansion] */ decAddOp(res, &dzero, rhs, set, (uByte)(rhs->bits & DECNEG), &status); if (status!=0) decStatus(res, status, set); #if DECCHECK decCheckInexact(res, set); #endif return res; } /* decNumberAbs */ /* ------------------------------------------------------------------ */ /* decNumberAdd -- add two Numbers */ /* */ /* This computes C = A + B */ /* */ /* res is C, the result. C may be A and/or B (e.g., X=X+X) */ /* lhs is A */ /* rhs is B */ /* set is the context */ /* */ /* C must have space for set->digits digits. */ /* ------------------------------------------------------------------ */ /* This just calls the routine shared with Subtract */ U_CAPI decNumber * U_EXPORT2 uprv_decNumberAdd(decNumber *res, const decNumber *lhs, const decNumber *rhs, decContext *set) { uInt status=0; /* accumulator */ decAddOp(res, lhs, rhs, set, 0, &status); if (status!=0) decStatus(res, status, set); #if DECCHECK decCheckInexact(res, set); #endif return res; } /* decNumberAdd */ /* ------------------------------------------------------------------ */ /* decNumberAnd -- AND two Numbers, digitwise */ /* */ /* This computes C = A & B */ /* */ /* res is C, the result. C may be A and/or B (e.g., X=X&X) */ /* lhs is A */ /* rhs is B */ /* set is the context (used for result length and error report) */ /* */ /* C must have space for set->digits digits. */ /* */ /* Logical function restrictions apply (see above); a NaN is */ /* returned with Invalid_operation if a restriction is violated. 
*/ /* ------------------------------------------------------------------ */ U_CAPI decNumber * U_EXPORT2 uprv_decNumberAnd(decNumber *res, const decNumber *lhs, const decNumber *rhs, decContext *set) { const Unit *ua, *ub; /* -> operands */ const Unit *msua, *msub; /* -> operand msus */ Unit *uc, *msuc; /* -> result and its msu */ Int msudigs; /* digits in res msu */ #if DECCHECK if (decCheckOperands(res, lhs, rhs, set)) return res; #endif if (lhs->exponent!=0 || decNumberIsSpecial(lhs) || decNumberIsNegative(lhs) || rhs->exponent!=0 || decNumberIsSpecial(rhs) || decNumberIsNegative(rhs)) { decStatus(res, DEC_Invalid_operation, set); return res; } /* operands are valid */ ua=lhs->lsu; /* bottom-up */ ub=rhs->lsu; /* .. */ uc=res->lsu; /* .. */ msua=ua+D2U(lhs->digits)-1; /* -> msu of lhs */ msub=ub+D2U(rhs->digits)-1; /* -> msu of rhs */ msuc=uc+D2U(set->digits)-1; /* -> msu of result */ msudigs=MSUDIGITS(set->digits); /* [faster than remainder] */ for (; uc<=msuc; ua++, ub++, uc++) { /* Unit loop */ Unit a, b; /* extract units */ if (ua>msua) a=0; else a=*ua; if (ub>msub) b=0; else b=*ub; *uc=0; /* can now write back */ if (a|b) { /* maybe 1 bits to examine */ Int i, j; *uc=0; /* can now write back */ /* This loop could be unrolled and/or use BIN2BCD tables */ for (i=0; i1) { decStatus(res, DEC_Invalid_operation, set); return res; } if (uc==msuc && i==msudigs-1) break; /* just did final digit */ } /* each digit */ } /* both OK */ } /* each unit */ /* [here uc-1 is the msu of the result] */ res->digits=decGetDigits(res->lsu, static_cast(uc - res->lsu)); res->exponent=0; /* integer */ res->bits=0; /* sign=0 */ return res; /* [no status to set] */ } /* decNumberAnd */ /* ------------------------------------------------------------------ */ /* decNumberCompare -- compare two Numbers */ /* */ /* This computes C = A ? B */ /* */ /* res is C, the result. 
C may be A and/or B (e.g., X=X?X) */
/* lhs is A */
/* rhs is B */
/* set is the context */
/* */
/* C must have space for one digit (or NaN). */
/* ------------------------------------------------------------------ */
/* Delegates to decCompareOp with the COMPARE operation code.          */
U_CAPI decNumber * U_EXPORT2 uprv_decNumberCompare(decNumber *res, const decNumber *lhs,
                                                   const decNumber *rhs, decContext *set)
{
  uInt flags=0;                         /* status bits from decCompareOp */

  decCompareOp(res, lhs, rhs, set, COMPARE, &flags);
  if (flags) {
    decStatus(res, flags, set);
  }
  return res;
} /* decNumberCompare */

/* ------------------------------------------------------------------ */
/* decNumberCompareSignal -- compare, signalling on all NaNs           */
/*                                                                     */
/*   Computes C = A ? B.                                               */
/*                                                                     */
/*   res  receives C; it may be the same object as lhs and/or rhs      */
/*   lhs  is A                                                         */
/*   rhs  is B                                                         */
/*   set  is the context                                               */
/*                                                                     */
/* res needs room for one digit (or a NaN).                            */
/* ------------------------------------------------------------------ */
/* Delegates to decCompareOp with the COMPSIG operation code.          */
U_CAPI decNumber * U_EXPORT2 uprv_decNumberCompareSignal(decNumber *res, const decNumber *lhs,
                                                         const decNumber *rhs, decContext *set)
{
  uInt flags=0;                         /* status bits from decCompareOp */

  decCompareOp(res, lhs, rhs, set, COMPSIG, &flags);
  if (flags) {
    decStatus(res, flags, set);
  }
  return res;
} /* decNumberCompareSignal */

/* ------------------------------------------------------------------ */
/* decNumberCompareTotal -- compare two Numbers, using total ordering */
/* */
/* This computes C = A ? B, under total ordering */
/* */
/* res is C, the result. C may be A and/or B (e.g., X=X?X) */
/* lhs is A */
/* rhs is B */
/* set is the context */
/* */
/* C must have space for one digit; the result will always be one of */
/* -1, 0, or 1.
*/ /* ------------------------------------------------------------------ */ U_CAPI decNumber * U_EXPORT2 uprv_decNumberCompareTotal(decNumber *res, const decNumber *lhs, const decNumber *rhs, decContext *set) { uInt status=0; /* accumulator */ decCompareOp(res, lhs, rhs, set, COMPTOTAL, &status); if (status!=0) decStatus(res, status, set); return res; } /* decNumberCompareTotal */ /* ------------------------------------------------------------------ */ /* decNumberCompareTotalMag -- compare, total ordering of magnitudes */ /* */ /* This computes C = |A| ? |B|, under total ordering */ /* */ /* res is C, the result. C may be A and/or B (e.g., X=X?X) */ /* lhs is A */ /* rhs is B */ /* set is the context */ /* */ /* C must have space for one digit; the result will always be one of */ /* -1, 0, or 1. */ /* ------------------------------------------------------------------ */ U_CAPI decNumber * U_EXPORT2 uprv_decNumberCompareTotalMag(decNumber *res, const decNumber *lhs, const decNumber *rhs, decContext *set) { uInt status=0; /* accumulator */ uInt needbytes; /* for space calculations */ decNumber bufa[D2N(DECBUFFER+1)];/* +1 in case DECBUFFER=0 */ decNumber *allocbufa=nullptr; /* -> allocated bufa, iff allocated */ decNumber bufb[D2N(DECBUFFER+1)]; decNumber *allocbufb=nullptr; /* -> allocated bufb, iff allocated */ decNumber *a, *b; /* temporary pointers */ #if DECCHECK if (decCheckOperands(res, lhs, rhs, set)) return res; #endif do { /* protect allocated storage */ /* if either is negative, take a copy and absolute */ if (decNumberIsNegative(lhs)) { /* lhs<0 */ a=bufa; needbytes=sizeof(decNumber)+(D2U(lhs->digits)-1)*sizeof(Unit); if (needbytes>sizeof(bufa)) { /* need malloc space */ allocbufa=(decNumber *)malloc(needbytes); if (allocbufa==nullptr) { /* hopeless -- abandon */ status|=DEC_Insufficient_storage; break;} a=allocbufa; /* use the allocated space */ } uprv_decNumberCopy(a, lhs); /* copy content */ a->bits&=~DECNEG; /* .. 
and clear the sign */ lhs=a; /* use copy from here on */ } if (decNumberIsNegative(rhs)) { /* rhs<0 */ b=bufb; needbytes=sizeof(decNumber)+(D2U(rhs->digits)-1)*sizeof(Unit); if (needbytes>sizeof(bufb)) { /* need malloc space */ allocbufb=(decNumber *)malloc(needbytes); if (allocbufb==nullptr) { /* hopeless -- abandon */ status|=DEC_Insufficient_storage; break;} b=allocbufb; /* use the allocated space */ } uprv_decNumberCopy(b, rhs); /* copy content */ b->bits&=~DECNEG; /* .. and clear the sign */ rhs=b; /* use copy from here on */ } decCompareOp(res, lhs, rhs, set, COMPTOTAL, &status); } while(0); /* end protected */ if (allocbufa!=nullptr) free(allocbufa); /* drop any storage used */ if (allocbufb!=nullptr) free(allocbufb); /* .. */ if (status!=0) decStatus(res, status, set); return res; } /* decNumberCompareTotalMag */ /* ------------------------------------------------------------------ */ /* decNumberDivide -- divide one number by another */ /* */ /* This computes C = A / B */ /* */ /* res is C, the result. C may be A and/or B (e.g., X=X/X) */ /* lhs is A */ /* rhs is B */ /* set is the context */ /* */ /* C must have space for set->digits digits. */ /* ------------------------------------------------------------------ */ U_CAPI decNumber * U_EXPORT2 uprv_decNumberDivide(decNumber *res, const decNumber *lhs, const decNumber *rhs, decContext *set) { uInt status=0; /* accumulator */ decDivideOp(res, lhs, rhs, set, DIVIDE, &status); if (status!=0) decStatus(res, status, set); #if DECCHECK decCheckInexact(res, set); #endif return res; } /* decNumberDivide */ /* ------------------------------------------------------------------ */ /* decNumberDivideInteger -- divide and return integer quotient */ /* */ /* This computes C = A # B, where # is the integer divide operator */ /* */ /* res is C, the result. C may be A and/or B (e.g., X=X#X) */ /* lhs is A */ /* rhs is B */ /* set is the context */ /* */ /* C must have space for set->digits digits. 
*/ /* ------------------------------------------------------------------ */ U_CAPI decNumber * U_EXPORT2 uprv_decNumberDivideInteger(decNumber *res, const decNumber *lhs, const decNumber *rhs, decContext *set) { uInt status=0; /* accumulator */ decDivideOp(res, lhs, rhs, set, DIVIDEINT, &status); if (status!=0) decStatus(res, status, set); return res; } /* decNumberDivideInteger */ /* ------------------------------------------------------------------ */ /* decNumberExp -- exponentiation */ /* */ /* This computes C = exp(A) */ /* */ /* res is C, the result. C may be A */ /* rhs is A */ /* set is the context; note that rounding mode has no effect */ /* */ /* C must have space for set->digits digits. */ /* */ /* Mathematical function restrictions apply (see above); a NaN is */ /* returned with Invalid_operation if a restriction is violated. */ /* */ /* Finite results will always be full precision and Inexact, except */ /* when A is a zero or -Infinity (giving 1 or 0 respectively). */ /* */ /* An Inexact result is rounded using DEC_ROUND_HALF_EVEN; it will */ /* almost always be correctly rounded, but may be up to 1 ulp in */ /* error in rare cases. */ /* ------------------------------------------------------------------ */ /* This is a wrapper for decExpOp which can handle the slightly wider */ /* (double) range needed by Ln (which has to be able to calculate */ /* exp(-a) where a can be the tiniest number (Ntiny). */ /* ------------------------------------------------------------------ */ U_CAPI decNumber * U_EXPORT2 uprv_decNumberExp(decNumber *res, const decNumber *rhs, decContext *set) { uInt status=0; /* accumulator */ #if DECSUBSET decNumber *allocrhs=nullptr; /* non-nullptr if rounded rhs allocated */ #endif #if DECCHECK if (decCheckOperands(res, DECUNUSED, rhs, set)) return res; #endif /* Check restrictions; these restrictions ensure that if h=8 (see */ /* decExpOp) then the result will either overflow or underflow to 0. 
*/ /* Other math functions restrict the input range, too, for inverses. */ /* If not violated then carry out the operation. */ if (!decCheckMath(rhs, set, &status)) do { /* protect allocation */ #if DECSUBSET if (!set->extended) { /* reduce operand and set lostDigits status, as needed */ if (rhs->digits>set->digits) { allocrhs=decRoundOperand(rhs, set, &status); if (allocrhs==nullptr) break; rhs=allocrhs; } } #endif decExpOp(res, rhs, set, &status); } while(0); /* end protected */ #if DECSUBSET if (allocrhs !=nullptr) free(allocrhs); /* drop any storage used */ #endif /* apply significant status */ if (status!=0) decStatus(res, status, set); #if DECCHECK decCheckInexact(res, set); #endif return res; } /* decNumberExp */ /* ------------------------------------------------------------------ */ /* decNumberFMA -- fused multiply add */ /* */ /* This computes D = (A * B) + C with only one rounding */ /* */ /* res is D, the result. D may be A or B or C (e.g., X=FMA(X,X,X)) */ /* lhs is A */ /* rhs is B */ /* fhs is C [far hand side] */ /* set is the context */ /* */ /* Mathematical function restrictions apply (see above); a NaN is */ /* returned with Invalid_operation if a restriction is violated. */ /* */ /* C must have space for set->digits digits. 
*/
/* ------------------------------------------------------------------ */
/* The product lhs*rhs is formed exactly in a working context whose    */
/* precision is lhs->digits+rhs->digits and whose exponent range is    */
/* DEC_MIN_EMIN..DEC_MAX_EMAX; fhs is then added under the caller's    */
/* context, so the only rounding happens in that final decAddOp.       */
/* The product lives in bufa, or in malloc'd storage when bufa is too  */
/* small (DEC_Insufficient_storage if that allocation fails).          */
U_CAPI decNumber * U_EXPORT2 uprv_decNumberFMA(decNumber *res, const decNumber *lhs,
                              const decNumber *rhs, const decNumber *fhs,
                              decContext *set) {
  uInt status=0;                   /* accumulator */
  decContext dcmul;                /* context for the multiplication */
  uInt needbytes;                  /* for space calculations */
  decNumber bufa[D2N(DECBUFFER*2+1)];
  decNumber *allocbufa=nullptr;    /* -> allocated bufa, iff allocated */
  decNumber *acc;                  /* accumulator pointer */
  decNumber dzero;                 /* work */

  #if DECCHECK
  if (decCheckOperands(res, lhs, rhs, set)) return res;
  if (decCheckOperands(res, fhs, DECUNUSED, set)) return res;
  #endif

  do {                                  /* protect allocated storage */
    #if DECSUBSET
    if (!set->extended) {               /* [undefined if subset] */
      status|=DEC_Invalid_operation;
      break;}
    #endif
    /* Check math restrictions [these ensure no overflow or underflow] */
    /* Special operands skip the check; any failure abandons the work  */
    if ((!decNumberIsSpecial(lhs) && decCheckMath(lhs, set, &status))
     || (!decNumberIsSpecial(rhs) && decCheckMath(rhs, set, &status))
     || (!decNumberIsSpecial(fhs) && decCheckMath(fhs, set, &status))) break;
    /* set up context for multiply */
    dcmul=*set;
    dcmul.digits=lhs->digits+rhs->digits; /* just enough */
    /* [The above may be an over-estimate for subset arithmetic, but that's OK] */
    dcmul.emax=DEC_MAX_EMAX;            /* effectively unbounded .. */
    dcmul.emin=DEC_MIN_EMIN;            /* [thanks to Math restrictions] */
    /* set up decNumber space to receive the result of the multiply */
    acc=bufa;                           /* may fit */
    needbytes=sizeof(decNumber)+(D2U(dcmul.digits)-1)*sizeof(Unit);
    if (needbytes>sizeof(bufa)) {       /* need malloc space */
      allocbufa=(decNumber *)malloc(needbytes);
      if (allocbufa==nullptr) {         /* hopeless -- abandon */
        status|=DEC_Insufficient_storage;
        break;}
      acc=allocbufa;                    /* use the allocated space */
      }
    /* multiply with extended range and necessary precision */
    /*printf("emin=%ld\n", dcmul.emin); */
    decMultiplyOp(acc, lhs, rhs, &dcmul, &status);
    /* Only Invalid operation (from sNaN or Inf * 0) is possible in */
    /* status; if either is seen then ignore fhs (in case it is */
    /* another sNaN) and set acc to NaN unless we had an sNaN */
    /* [decMultiplyOp leaves that to caller] */
    /* Note sNaN has to go through addOp to shorten payload if */
    /* necessary */
    if ((status&DEC_Invalid_operation)!=0) {
      if (!(status&DEC_sNaN)) {         /* no sNaN seen: a true invalid */
        uprv_decNumberZero(res);        /* acc not yet set */
        res->bits=DECNAN;               /* result is a quiet NaN */
        break;
        }
      uprv_decNumberZero(&dzero);       /* make 0 (any non-NaN would do) */
      fhs=&dzero;                       /* use that */
      }
    #if DECCHECK
     else { /* multiply was OK */
      if (status!=0) printf("Status=%08lx after FMA multiply\n", (LI)status);
      }
    #endif
    /* add the third operand and result -> res, and all is done */
    decAddOp(res, acc, fhs, set, 0, &status);
    } while(0);                         /* end protected */

  if (allocbufa!=nullptr) free(allocbufa); /* drop any storage used */
  if (status!=0) decStatus(res, status, set);
  #if DECCHECK
  decCheckInexact(res, set);
  #endif
  return res;
  } /* decNumberFMA */

/* ------------------------------------------------------------------ */
/* decNumberInvert -- invert a Number, digitwise */
/* */
/* This computes C = ~A */
/* */
/* res is C, the result. C may be A (e.g., X=~X) */
/* rhs is A */
/* set is the context (used for result length and error report) */
/* */
/* C must have space for set->digits digits.
*/ /* */ /* Logical function restrictions apply (see above); a NaN is */ /* returned with Invalid_operation if a restriction is violated. */ /* ------------------------------------------------------------------ */ U_CAPI decNumber * U_EXPORT2 uprv_decNumberInvert(decNumber *res, const decNumber *rhs, decContext *set) { const Unit *ua, *msua; /* -> operand and its msu */ Unit *uc, *msuc; /* -> result and its msu */ Int msudigs; /* digits in res msu */ #if DECCHECK if (decCheckOperands(res, DECUNUSED, rhs, set)) return res; #endif if (rhs->exponent!=0 || decNumberIsSpecial(rhs) || decNumberIsNegative(rhs)) { decStatus(res, DEC_Invalid_operation, set); return res; } /* operand is valid */ ua=rhs->lsu; /* bottom-up */ uc=res->lsu; /* .. */ msua=ua+D2U(rhs->digits)-1; /* -> msu of rhs */ msuc=uc+D2U(set->digits)-1; /* -> msu of result */ msudigs=MSUDIGITS(set->digits); /* [faster than remainder] */ for (; uc<=msuc; ua++, uc++) { /* Unit loop */ Unit a; /* extract unit */ Int i, j; /* work */ if (ua>msua) a=0; else a=*ua; *uc=0; /* can now write back */ /* always need to examine all bits in rhs */ /* This loop could be unrolled and/or use BIN2BCD tables */ for (i=0; i1) { decStatus(res, DEC_Invalid_operation, set); return res; } if (uc==msuc && i==msudigs-1) break; /* just did final digit */ } /* each digit */ } /* each unit */ /* [here uc-1 is the msu of the result] */ res->digits=decGetDigits(res->lsu, static_cast(uc - res->lsu)); res->exponent=0; /* integer */ res->bits=0; /* sign=0 */ return res; /* [no status to set] */ } /* decNumberInvert */ /* ------------------------------------------------------------------ */ /* decNumberLn -- natural logarithm */ /* */ /* This computes C = ln(A) */ /* */ /* res is C, the result. C may be A */ /* rhs is A */ /* set is the context; note that rounding mode has no effect */ /* */ /* C must have space for set->digits digits. 
*/ /* */ /* Notable cases: */ /* A<0 -> Invalid */ /* A=0 -> -Infinity (Exact) */ /* A=+Infinity -> +Infinity (Exact) */ /* A=1 exactly -> 0 (Exact) */ /* */ /* Mathematical function restrictions apply (see above); a NaN is */ /* returned with Invalid_operation if a restriction is violated. */ /* */ /* An Inexact result is rounded using DEC_ROUND_HALF_EVEN; it will */ /* almost always be correctly rounded, but may be up to 1 ulp in */ /* error in rare cases. */ /* ------------------------------------------------------------------ */ /* This is a wrapper for decLnOp which can handle the slightly wider */ /* (+11) range needed by Ln, Log10, etc. (which may have to be able */ /* to calculate at p+e+2). */ /* ------------------------------------------------------------------ */ U_CAPI decNumber * U_EXPORT2 uprv_decNumberLn(decNumber *res, const decNumber *rhs, decContext *set) { uInt status=0; /* accumulator */ #if DECSUBSET decNumber *allocrhs=nullptr; /* non-nullptr if rounded rhs allocated */ #endif #if DECCHECK if (decCheckOperands(res, DECUNUSED, rhs, set)) return res; #endif /* Check restrictions; this is a math function; if not violated */ /* then carry out the operation. 
*/ if (!decCheckMath(rhs, set, &status)) do { /* protect allocation */ #if DECSUBSET if (!set->extended) { /* reduce operand and set lostDigits status, as needed */ if (rhs->digits>set->digits) { allocrhs=decRoundOperand(rhs, set, &status); if (allocrhs==nullptr) break; rhs=allocrhs; } /* special check in subset for rhs=0 */ if (ISZERO(rhs)) { /* +/- zeros -> error */ status|=DEC_Invalid_operation; break;} } /* extended=0 */ #endif decLnOp(res, rhs, set, &status); } while(0); /* end protected */ #if DECSUBSET if (allocrhs !=nullptr) free(allocrhs); /* drop any storage used */ #endif /* apply significant status */ if (status!=0) decStatus(res, status, set); #if DECCHECK decCheckInexact(res, set); #endif return res; } /* decNumberLn */ /* ------------------------------------------------------------------ */ /* decNumberLogB - get adjusted exponent, by 754 rules */ /* */ /* This computes C = adjustedexponent(A) */ /* */ /* res is C, the result. C may be A */ /* rhs is A */ /* set is the context, used only for digits and status */ /* */ /* C must have space for 10 digits (A might have 10**9 digits and */ /* an exponent of +999999999, or one digit and an exponent of */ /* -1999999999). */ /* */ /* This returns the adjusted exponent of A after (in theory) padding */ /* with zeros on the right to set->digits digits while keeping the */ /* same value. The exponent is not limited by emin/emax. 
*/ /* */ /* Notable cases: */ /* A<0 -> Use |A| */ /* A=0 -> -Infinity (Division by zero) */ /* A=Infinite -> +Infinity (Exact) */ /* A=1 exactly -> 0 (Exact) */ /* NaNs are propagated as usual */ /* ------------------------------------------------------------------ */ U_CAPI decNumber * U_EXPORT2 uprv_decNumberLogB(decNumber *res, const decNumber *rhs, decContext *set) { uInt status=0; /* accumulator */ #if DECCHECK if (decCheckOperands(res, DECUNUSED, rhs, set)) return res; #endif /* NaNs as usual; Infinities return +Infinity; 0->oops */ if (decNumberIsNaN(rhs)) decNaNs(res, rhs, nullptr, set, &status); else if (decNumberIsInfinite(rhs)) uprv_decNumberCopyAbs(res, rhs); else if (decNumberIsZero(rhs)) { uprv_decNumberZero(res); /* prepare for Infinity */ res->bits=DECNEG|DECINF; /* -Infinity */ status|=DEC_Division_by_zero; /* as per 754 */ } else { /* finite non-zero */ Int ae=rhs->exponent+rhs->digits-1; /* adjusted exponent */ uprv_decNumberFromInt32(res, ae); /* lay it out */ } if (status!=0) decStatus(res, status, set); return res; } /* decNumberLogB */ /* ------------------------------------------------------------------ */ /* decNumberLog10 -- logarithm in base 10 */ /* */ /* This computes C = log10(A) */ /* */ /* res is C, the result. C may be A */ /* rhs is A */ /* set is the context; note that rounding mode has no effect */ /* */ /* C must have space for set->digits digits. */ /* */ /* Notable cases: */ /* A<0 -> Invalid */ /* A=0 -> -Infinity (Exact) */ /* A=+Infinity -> +Infinity (Exact) */ /* A=10**n (if n is an integer) -> n (Exact) */ /* */ /* Mathematical function restrictions apply (see above); a NaN is */ /* returned with Invalid_operation if a restriction is violated. */ /* */ /* An Inexact result is rounded using DEC_ROUND_HALF_EVEN; it will */ /* almost always be correctly rounded, but may be up to 1 ulp in */ /* error in rare cases. 
*/ /* ------------------------------------------------------------------ */ /* This calculates ln(A)/ln(10) using appropriate precision. For */ /* ln(A) this is the max(p, rhs->digits + t) + 3, where p is the */ /* requested digits and t is the number of digits in the exponent */ /* (maximum 6). For ln(10) it is p + 3; this is often handled by the */ /* fastpath in decLnOp. The final division is done to the requested */ /* precision. */ /* ------------------------------------------------------------------ */ // #if defined(__clang__) || U_GCC_MAJOR_MINOR >= 406 // #pragma GCC diagnostic push // #pragma GCC diagnostic ignored "-Warray-bounds" // #endif U_CAPI decNumber * U_EXPORT2 uprv_decNumberLog10(decNumber *res, const decNumber *rhs, decContext *set) { uInt status=0, ignore=0; /* status accumulators */ uInt needbytes; /* for space calculations */ Int p; /* working precision */ Int t; /* digits in exponent of A */ /* buffers for a and b working decimals */ /* (adjustment calculator, same size) */ decNumber bufa[D2N(DECBUFFER+2)]; decNumber *allocbufa=nullptr; /* -> allocated bufa, iff allocated */ decNumber *a=bufa; /* temporary a */ decNumber bufb[D2N(DECBUFFER+2)]; decNumber *allocbufb=nullptr; /* -> allocated bufb, iff allocated */ decNumber *b=bufb; /* temporary b */ decNumber bufw[D2N(10)]; /* working 2-10 digit number */ decNumber *w=bufw; /* .. */ #if DECSUBSET decNumber *allocrhs=nullptr; /* non-nullptr if rounded rhs allocated */ #endif decContext aset; /* working context */ #if DECCHECK if (decCheckOperands(res, DECUNUSED, rhs, set)) return res; #endif /* Check restrictions; this is a math function; if not violated */ /* then carry out the operation. 
*/ if (!decCheckMath(rhs, set, &status)) do { /* protect malloc */ #if DECSUBSET if (!set->extended) { /* reduce operand and set lostDigits status, as needed */ if (rhs->digits>set->digits) { allocrhs=decRoundOperand(rhs, set, &status); if (allocrhs==nullptr) break; rhs=allocrhs; } /* special check in subset for rhs=0 */ if (ISZERO(rhs)) { /* +/- zeros -> error */ status|=DEC_Invalid_operation; break;} } /* extended=0 */ #endif uprv_decContextDefault(&aset, DEC_INIT_DECIMAL64); /* clean context */ /* handle exact powers of 10; only check if +ve finite */ if (!(rhs->bits&(DECNEG|DECSPECIAL)) && !ISZERO(rhs)) { Int residue=0; /* (no residue) */ uInt copystat=0; /* clean status */ /* round to a single digit... */ aset.digits=1; decCopyFit(w, rhs, &aset, &residue, ©stat); /* copy & shorten */ /* if exact and the digit is 1, rhs is a power of 10 */ if (!(copystat&DEC_Inexact) && w->lsu[0]==1) { /* the exponent, conveniently, is the power of 10; making */ /* this the result needs a little care as it might not fit, */ /* so first convert it into the working number, and then move */ /* to res */ uprv_decNumberFromInt32(w, w->exponent); residue=0; decCopyFit(res, w, set, &residue, &status); /* copy & round */ decFinish(res, set, &residue, &status); /* cleanup/set flags */ break; } /* not a power of 10 */ } /* not a candidate for exact */ /* simplify the information-content calculation to use 'total */ /* number of digits in a, including exponent' as compared to the */ /* requested digits, as increasing this will only rarely cost an */ /* iteration in ln(a) anyway */ t=6; /* it can never be >6 */ /* allocate space when needed... 
*/ p=(rhs->digits+t>set->digits?rhs->digits+t:set->digits)+3; needbytes=sizeof(decNumber)+(D2U(p)-1)*sizeof(Unit); if (needbytes>sizeof(bufa)) { /* need malloc space */ allocbufa=(decNumber *)malloc(needbytes); if (allocbufa==nullptr) { /* hopeless -- abandon */ status|=DEC_Insufficient_storage; break;} a=allocbufa; /* use the allocated space */ } aset.digits=p; /* as calculated */ aset.emax=DEC_MAX_MATH; /* usual bounds */ aset.emin=-DEC_MAX_MATH; /* .. */ aset.clamp=0; /* and no concrete format */ decLnOp(a, rhs, &aset, &status); /* a=ln(rhs) */ /* skip the division if the result so far is infinite, NaN, or */ /* zero, or there was an error; note NaN from sNaN needs copy */ if (status&DEC_NaNs && !(status&DEC_sNaN)) break; if (a->bits&DECSPECIAL || ISZERO(a)) { uprv_decNumberCopy(res, a); /* [will fit] */ break;} /* for ln(10) an extra 3 digits of precision are needed */ p=set->digits+3; needbytes=sizeof(decNumber)+(D2U(p)-1)*sizeof(Unit); if (needbytes>sizeof(bufb)) { /* need malloc space */ allocbufb=(decNumber *)malloc(needbytes); if (allocbufb==nullptr) { /* hopeless -- abandon */ status|=DEC_Insufficient_storage; break;} b=allocbufb; /* use the allocated space */ } uprv_decNumberZero(w); /* set up 10... */ #if DECDPUN==1 w->lsu[1]=1; w->lsu[0]=0; /* .. */ #else w->lsu[0]=10; /* .. */ #endif w->digits=2; /* .. */ aset.digits=p; decLnOp(b, w, &aset, &ignore); /* b=ln(10) */ aset.digits=set->digits; /* for final divide */ decDivideOp(res, a, b, &aset, DIVIDE, &status); /* into result */ } while(0); /* [for break] */ if (allocbufa!=nullptr) free(allocbufa); /* drop any storage used */ if (allocbufb!=nullptr) free(allocbufb); /* .. */ #if DECSUBSET if (allocrhs !=nullptr) free(allocrhs); /* .. 
*/ #endif /* apply significant status */ if (status!=0) decStatus(res, status, set); #if DECCHECK decCheckInexact(res, set); #endif return res; } /* decNumberLog10 */ // #if defined(__clang__) || U_GCC_MAJOR_MINOR >= 406 // #pragma GCC diagnostic pop // #endif /* ------------------------------------------------------------------ */ /* decNumberMax -- compare two Numbers and return the maximum */ /* */ /* This computes C = A ? B, returning the maximum by 754 rules */ /* */ /* res is C, the result. C may be A and/or B (e.g., X=X?X) */ /* lhs is A */ /* rhs is B */ /* set is the context */ /* */ /* C must have space for set->digits digits. */ /* ------------------------------------------------------------------ */ U_CAPI decNumber * U_EXPORT2 uprv_decNumberMax(decNumber *res, const decNumber *lhs, const decNumber *rhs, decContext *set) { uInt status=0; /* accumulator */ decCompareOp(res, lhs, rhs, set, COMPMAX, &status); if (status!=0) decStatus(res, status, set); #if DECCHECK decCheckInexact(res, set); #endif return res; } /* decNumberMax */ /* ------------------------------------------------------------------ */ /* decNumberMaxMag -- compare and return the maximum by magnitude */ /* */ /* This computes C = A ? B, returning the maximum by 754 rules */ /* */ /* res is C, the result. C may be A and/or B (e.g., X=X?X) */ /* lhs is A */ /* rhs is B */ /* set is the context */ /* */ /* C must have space for set->digits digits. 
*/ /* ------------------------------------------------------------------ */ U_CAPI decNumber * U_EXPORT2 uprv_decNumberMaxMag(decNumber *res, const decNumber *lhs, const decNumber *rhs, decContext *set) { uInt status=0; /* accumulator */ decCompareOp(res, lhs, rhs, set, COMPMAXMAG, &status); if (status!=0) decStatus(res, status, set); #if DECCHECK decCheckInexact(res, set); #endif return res; } /* decNumberMaxMag */ /* ------------------------------------------------------------------ */ /* decNumberMin -- compare two Numbers and return the minimum */ /* */ /* This computes C = A ? B, returning the minimum by 754 rules */ /* */ /* res is C, the result. C may be A and/or B (e.g., X=X?X) */ /* lhs is A */ /* rhs is B */ /* set is the context */ /* */ /* C must have space for set->digits digits. */ /* ------------------------------------------------------------------ */ U_CAPI decNumber * U_EXPORT2 uprv_decNumberMin(decNumber *res, const decNumber *lhs, const decNumber *rhs, decContext *set) { uInt status=0; /* accumulator */ decCompareOp(res, lhs, rhs, set, COMPMIN, &status); if (status!=0) decStatus(res, status, set); #if DECCHECK decCheckInexact(res, set); #endif return res; } /* decNumberMin */ /* ------------------------------------------------------------------ */ /* decNumberMinMag -- compare and return the minimum by magnitude */ /* */ /* This computes C = A ? B, returning the minimum by 754 rules */ /* */ /* res is C, the result. C may be A and/or B (e.g., X=X?X) */ /* lhs is A */ /* rhs is B */ /* set is the context */ /* */ /* C must have space for set->digits digits. 
*/ /* ------------------------------------------------------------------ */ U_CAPI decNumber * U_EXPORT2 uprv_decNumberMinMag(decNumber *res, const decNumber *lhs, const decNumber *rhs, decContext *set) { uInt status=0; /* accumulator */ decCompareOp(res, lhs, rhs, set, COMPMINMAG, &status); if (status!=0) decStatus(res, status, set); #if DECCHECK decCheckInexact(res, set); #endif return res; } /* decNumberMinMag */ /* ------------------------------------------------------------------ */ /* decNumberMinus -- prefix minus operator */ /* */ /* This computes C = 0 - A */ /* */ /* res is C, the result. C may be A */ /* rhs is A */ /* set is the context */ /* */ /* See also decNumberCopyNegate for a quiet bitwise version of this. */ /* C must have space for set->digits digits. */ /* ------------------------------------------------------------------ */ /* Simply use AddOp for the subtract, which will do the necessary. */ /* ------------------------------------------------------------------ */ U_CAPI decNumber * U_EXPORT2 uprv_decNumberMinus(decNumber *res, const decNumber *rhs, decContext *set) { decNumber dzero; uInt status=0; /* accumulator */ #if DECCHECK if (decCheckOperands(res, DECUNUSED, rhs, set)) return res; #endif uprv_decNumberZero(&dzero); /* make 0 */ dzero.exponent=rhs->exponent; /* [no coefficient expansion] */ decAddOp(res, &dzero, rhs, set, DECNEG, &status); if (status!=0) decStatus(res, status, set); #if DECCHECK decCheckInexact(res, set); #endif return res; } /* decNumberMinus */ /* ------------------------------------------------------------------ */ /* decNumberNextMinus -- next towards -Infinity */ /* */ /* This computes C = A - infinitesimal, rounded towards -Infinity */ /* */ /* res is C, the result. C may be A */ /* rhs is A */ /* set is the context */ /* */ /* This is a generalization of 754 NextDown. 
*/ /* ------------------------------------------------------------------ */ U_CAPI decNumber * U_EXPORT2 uprv_decNumberNextMinus(decNumber *res, const decNumber *rhs, decContext *set) { decNumber dtiny; /* constant */ decContext workset=*set; /* work */ uInt status=0; /* accumulator */ #if DECCHECK if (decCheckOperands(res, DECUNUSED, rhs, set)) return res; #endif /* +Infinity is the special case */ if ((rhs->bits&(DECINF|DECNEG))==DECINF) { decSetMaxValue(res, set); /* is +ve */ /* there is no status to set */ return res; } uprv_decNumberZero(&dtiny); /* start with 0 */ dtiny.lsu[0]=1; /* make number that is .. */ dtiny.exponent=DEC_MIN_EMIN-1; /* .. smaller than tiniest */ workset.round=DEC_ROUND_FLOOR; decAddOp(res, rhs, &dtiny, &workset, DECNEG, &status); status&=DEC_Invalid_operation|DEC_sNaN; /* only sNaN Invalid please */ if (status!=0) decStatus(res, status, set); return res; } /* decNumberNextMinus */ /* ------------------------------------------------------------------ */ /* decNumberNextPlus -- next towards +Infinity */ /* */ /* This computes C = A + infinitesimal, rounded towards +Infinity */ /* */ /* res is C, the result. C may be A */ /* rhs is A */ /* set is the context */ /* */ /* This is a generalization of 754 NextUp. */ /* ------------------------------------------------------------------ */ U_CAPI decNumber * U_EXPORT2 uprv_decNumberNextPlus(decNumber *res, const decNumber *rhs, decContext *set) { decNumber dtiny; /* constant */ decContext workset=*set; /* work */ uInt status=0; /* accumulator */ #if DECCHECK if (decCheckOperands(res, DECUNUSED, rhs, set)) return res; #endif /* -Infinity is the special case */ if ((rhs->bits&(DECINF|DECNEG))==(DECINF|DECNEG)) { decSetMaxValue(res, set); res->bits=DECNEG; /* negative */ /* there is no status to set */ return res; } uprv_decNumberZero(&dtiny); /* start with 0 */ dtiny.lsu[0]=1; /* make number that is .. */ dtiny.exponent=DEC_MIN_EMIN-1; /* .. 
smaller than tiniest */ workset.round=DEC_ROUND_CEILING; decAddOp(res, rhs, &dtiny, &workset, 0, &status); status&=DEC_Invalid_operation|DEC_sNaN; /* only sNaN Invalid please */ if (status!=0) decStatus(res, status, set); return res; } /* decNumberNextPlus */ /* ------------------------------------------------------------------ */ /* decNumberNextToward -- next towards rhs */ /* */ /* This computes C = A +/- infinitesimal, rounded towards */ /* +/-Infinity in the direction of B, as per 754-1985 nextafter */ /* modified during revision but dropped from 754-2008. */ /* */ /* res is C, the result. C may be A or B. */ /* lhs is A */ /* rhs is B */ /* set is the context */ /* */ /* This is a generalization of 754-1985 NextAfter. */ /* ------------------------------------------------------------------ */ U_CAPI decNumber * U_EXPORT2 uprv_decNumberNextToward(decNumber *res, const decNumber *lhs, const decNumber *rhs, decContext *set) { decNumber dtiny; /* constant */ decContext workset=*set; /* work */ Int result; /* .. */ uInt status=0; /* accumulator */ #if DECCHECK if (decCheckOperands(res, lhs, rhs, set)) return res; #endif if (decNumberIsNaN(lhs) || decNumberIsNaN(rhs)) { decNaNs(res, lhs, rhs, set, &status); } else { /* Is numeric, so no chance of sNaN Invalid, etc. 
*/ result=decCompare(lhs, rhs, 0); /* sign matters */ if (result==BADINT) status|=DEC_Insufficient_storage; /* rare */ else { /* valid compare */ if (result==0) uprv_decNumberCopySign(res, lhs, rhs); /* easy */ else { /* differ: need NextPlus or NextMinus */ uByte sub; /* add or subtract */ if (result<0) { /* lhsbits&(DECINF|DECNEG))==(DECINF|DECNEG)) { decSetMaxValue(res, set); res->bits=DECNEG; /* negative */ return res; /* there is no status to set */ } workset.round=DEC_ROUND_CEILING; sub=0; /* add, please */ } /* plus */ else { /* lhs>rhs, do nextminus */ /* +Infinity is the special case */ if ((lhs->bits&(DECINF|DECNEG))==DECINF) { decSetMaxValue(res, set); return res; /* there is no status to set */ } workset.round=DEC_ROUND_FLOOR; sub=DECNEG; /* subtract, please */ } /* minus */ uprv_decNumberZero(&dtiny); /* start with 0 */ dtiny.lsu[0]=1; /* make number that is .. */ dtiny.exponent=DEC_MIN_EMIN-1; /* .. smaller than tiniest */ decAddOp(res, lhs, &dtiny, &workset, sub, &status); /* + or - */ /* turn off exceptions if the result is a normal number */ /* (including Nmin), otherwise let all status through */ if (uprv_decNumberIsNormal(res, set)) status=0; } /* unequal */ } /* compare OK */ } /* numeric */ if (status!=0) decStatus(res, status, set); return res; } /* decNumberNextToward */ /* ------------------------------------------------------------------ */ /* decNumberOr -- OR two Numbers, digitwise */ /* */ /* This computes C = A | B */ /* */ /* res is C, the result. C may be A and/or B (e.g., X=X|X) */ /* lhs is A */ /* rhs is B */ /* set is the context (used for result length and error report) */ /* */ /* C must have space for set->digits digits. */ /* */ /* Logical function restrictions apply (see above); a NaN is */ /* returned with Invalid_operation if a restriction is violated. 
*/ /* ------------------------------------------------------------------ */ U_CAPI decNumber * U_EXPORT2 uprv_decNumberOr(decNumber *res, const decNumber *lhs, const decNumber *rhs, decContext *set) { const Unit *ua, *ub; /* -> operands */ const Unit *msua, *msub; /* -> operand msus */ Unit *uc, *msuc; /* -> result and its msu */ Int msudigs; /* digits in res msu */ #if DECCHECK if (decCheckOperands(res, lhs, rhs, set)) return res; #endif if (lhs->exponent!=0 || decNumberIsSpecial(lhs) || decNumberIsNegative(lhs) || rhs->exponent!=0 || decNumberIsSpecial(rhs) || decNumberIsNegative(rhs)) { decStatus(res, DEC_Invalid_operation, set); return res; } /* operands are valid */ ua=lhs->lsu; /* bottom-up */ ub=rhs->lsu; /* .. */ uc=res->lsu; /* .. */ msua=ua+D2U(lhs->digits)-1; /* -> msu of lhs */ msub=ub+D2U(rhs->digits)-1; /* -> msu of rhs */ msuc=uc+D2U(set->digits)-1; /* -> msu of result */ msudigs=MSUDIGITS(set->digits); /* [faster than remainder] */ for (; uc<=msuc; ua++, ub++, uc++) { /* Unit loop */ Unit a, b; /* extract units */ if (ua>msua) a=0; else a=*ua; if (ub>msub) b=0; else b=*ub; *uc=0; /* can now write back */ if (a|b) { /* maybe 1 bits to examine */ Int i, j; /* This loop could be unrolled and/or use BIN2BCD tables */ for (i=0; i1) { decStatus(res, DEC_Invalid_operation, set); return res; } if (uc==msuc && i==msudigs-1) break; /* just did final digit */ } /* each digit */ } /* non-zero */ } /* each unit */ /* [here uc-1 is the msu of the result] */ res->digits=decGetDigits(res->lsu, static_cast(uc-res->lsu)); res->exponent=0; /* integer */ res->bits=0; /* sign=0 */ return res; /* [no status to set] */ } /* decNumberOr */ /* ------------------------------------------------------------------ */ /* decNumberPlus -- prefix plus operator */ /* */ /* This computes C = 0 + A */ /* */ /* res is C, the result. C may be A */ /* rhs is A */ /* set is the context */ /* */ /* See also decNumberCopy for a quiet bitwise version of this. 
*/ /* C must have space for set->digits digits. */ /* ------------------------------------------------------------------ */ /* This simply uses AddOp; Add will take fast path after preparing A. */ /* Performance is a concern here, as this routine is often used to */ /* check operands and apply rounding and overflow/underflow testing. */ /* ------------------------------------------------------------------ */ U_CAPI decNumber * U_EXPORT2 uprv_decNumberPlus(decNumber *res, const decNumber *rhs, decContext *set) { decNumber dzero; uInt status=0; /* accumulator */ #if DECCHECK if (decCheckOperands(res, DECUNUSED, rhs, set)) return res; #endif uprv_decNumberZero(&dzero); /* make 0 */ dzero.exponent=rhs->exponent; /* [no coefficient expansion] */ decAddOp(res, &dzero, rhs, set, 0, &status); if (status!=0) decStatus(res, status, set); #if DECCHECK decCheckInexact(res, set); #endif return res; } /* decNumberPlus */ /* ------------------------------------------------------------------ */ /* decNumberMultiply -- multiply two Numbers */ /* */ /* This computes C = A x B */ /* */ /* res is C, the result. C may be A and/or B (e.g., X=X+X) */ /* lhs is A */ /* rhs is B */ /* set is the context */ /* */ /* C must have space for set->digits digits. */ /* ------------------------------------------------------------------ */ U_CAPI decNumber * U_EXPORT2 uprv_decNumberMultiply(decNumber *res, const decNumber *lhs, const decNumber *rhs, decContext *set) { uInt status=0; /* accumulator */ decMultiplyOp(res, lhs, rhs, set, &status); if (status!=0) decStatus(res, status, set); #if DECCHECK decCheckInexact(res, set); #endif return res; } /* decNumberMultiply */ /* ------------------------------------------------------------------ */ /* decNumberPower -- raise a number to a power */ /* */ /* This computes C = A ** B */ /* */ /* res is C, the result. C may be A and/or B (e.g., X=X**X) */ /* lhs is A */ /* rhs is B */ /* set is the context */ /* */ /* C must have space for set->digits digits. 
*/
/*                                                                    */
/* Mathematical function restrictions apply (see above); a NaN is     */
/* returned with Invalid_operation if a restriction is violated.      */
/*                                                                    */
/* However, if 1999999997<=B<=999999999 and B is an integer then the  */
/* restrictions on A and the context are relaxed to the usual bounds, */
/* for compatibility with the earlier (integer power only) version    */
/* of this function.                                                  */
/* NOTE(review): as written the lower bound exceeds the upper bound;  */
/* presumably it is meant to be negative (-1999999997) -- confirm     */
/* against decGetInt.                                                 */
/*                                                                    */
/* When B is an integer, the result may be exact, even if rounded.    */
/*                                                                    */
/* The final result is rounded according to the context; it will      */
/* almost always be correctly rounded, but may be up to 1 ulp in      */
/* error in rare cases.                                               */
/*                                                                    */
/* Outline of the code below:                                         */
/*   1. NaN operands and an infinite rhs are resolved directly.       */
/*   2. decGetInt classifies rhs (integer? odd? usable as an Int?).   */
/*   3. An infinite or zero lhs is resolved directly.                 */
/*   4. Non-integer path: exp(ln(lhs)*rhs) in a widened context.      */
/*   5. Integer path: square-and-multiply over the bits of abs(n),    */
/*      inverting lhs first (extended) or the result last (subset)    */
/*      when rhs is negative.                                         */
/*   6. The accumulator is rounded into res with the caller's set.    */
/* All exits from the do{}while(0) fall through to the code that      */
/* frees any allocated buffers and reports status.                    */
/* ------------------------------------------------------------------ */
U_CAPI decNumber * U_EXPORT2 uprv_decNumberPower(decNumber *res, const decNumber *lhs,
                           const decNumber *rhs, decContext *set) {
  #if DECSUBSET
  decNumber *alloclhs=nullptr;     /* non-nullptr if rounded lhs allocated  */
  decNumber *allocrhs=nullptr;     /* .., rhs  */
  #endif
  decNumber *allocdac=nullptr;     /* -> allocated acc buffer, iff used  */
  decNumber *allocinv=nullptr;     /* -> allocated 1/x buffer, iff used  */
  Int   reqdigits=set->digits;     /* requested DIGITS  */
  Int   n;                         /* rhs in binary  */
  Flag  rhsint=0;                  /* 1 if rhs is an integer  */
  Flag  useint=0;                  /* 1 if can use integer calculation  */
  Flag  isoddint=0;                /* 1 if rhs is an integer and odd  */
  Int   i;                         /* work  */
  #if DECSUBSET
  Int   dropped;                   /* ..  */
  #endif
  uInt  needbytes;                 /* buffer size needed  */
  Flag  seenbit;                   /* seen a bit while powering  */
  Int   residue=0;                 /* rounding residue  */
  uInt  status=0;                  /* accumulators  */
  uByte bits=0;                    /* result sign if errors  */
  decContext aset;                 /* working context  */
  decNumber dnOne;                 /* work value 1...  */
  /* local accumulator buffer [a decNumber, with digits+elength+1 digits]  */
  decNumber dacbuff[D2N(DECBUFFER+9)];
  decNumber *dac=dacbuff;          /* -> result accumulator  */
  /* same again for possible 1/lhs calculation  */
  decNumber invbuff[D2N(DECBUFFER+9)];

  #if DECCHECK
  if (decCheckOperands(res, lhs, rhs, set)) return res;
  #endif

  do {                             /* protect allocated storage  */
    #if DECSUBSET
    if (!set->extended) { /* reduce operands and set status, as needed  */
      if (lhs->digits>reqdigits) {
        alloclhs=decRoundOperand(lhs, set, &status);
        if (alloclhs==nullptr) break;
        lhs=alloclhs;
        }
      if (rhs->digits>reqdigits) {
        allocrhs=decRoundOperand(rhs, set, &status);
        if (allocrhs==nullptr) break;
        rhs=allocrhs;
        }
      }
    #endif
    /* [following code does not require input rounding]  */

    /* handle NaNs and rhs Infinity (lhs infinity is harder)  */
    if (SPECIALARGS) {
      if (decNumberIsNaN(lhs) || decNumberIsNaN(rhs)) { /* NaNs  */
        decNaNs(res, lhs, rhs, set, &status);
        break;}
      if (decNumberIsInfinite(rhs)) {   /* rhs Infinity  */
        Flag rhsneg=rhs->bits&DECNEG;   /* save rhs sign  */
        if (decNumberIsNegative(lhs)    /* lhs<0  */
         && !decNumberIsZero(lhs))      /* ..  */
          status|=DEC_Invalid_operation;
         else {                         /* lhs >=0  */
          uprv_decNumberZero(&dnOne);   /* set up 1  */
          dnOne.lsu[0]=1;
          uprv_decNumberCompare(dac, lhs, &dnOne, set); /* lhs ? 1  */
          uprv_decNumberZero(res);      /* prepare for 0/1/Infinity  */
          if (decNumberIsNegative(dac)) {    /* lhs<1  */
            if (rhsneg) res->bits|=DECINF;   /* +Infinity [else is +0]  */
            }
           else if (dac->lsu[0]==0) {        /* lhs=1  */
            /* 1**Infinity is inexact, so return fully-padded 1.0000  */
            Int shift=set->digits-1;
            *res->lsu=1;                     /* was 0, make int 1  */
            res->digits=decShiftToMost(res->lsu, 1, shift);
            res->exponent=-shift;            /* make 1.0000...  */
            status|=DEC_Inexact|DEC_Rounded; /* deemed inexact  */
            }
           else {                            /* lhs>1  */
            if (!rhsneg) res->bits|=DECINF;  /* +Infinity [else is +0]  */
            }
          } /* lhs>=0  */
        break;}
      /* [lhs infinity drops through]  */
      } /* specials  */

    /* Original rhs may be an integer that fits and is in range  */
    n=decGetInt(rhs);
    if (n!=BADINT) {                    /* it is an integer  */
      rhsint=1;                         /* record the fact for 1**n  */
      isoddint=(Flag)n&1;               /* [works even if big]  */
      if (n!=BIGEVEN && n!=BIGODD)      /* can use integer path?  */
        useint=1;                       /* looks good  */
      }

    if (decNumberIsNegative(lhs)        /* -x ..  */
      && isoddint) bits=DECNEG;         /* .. to an odd power  */

    /* handle LHS infinity  */
    if (decNumberIsInfinite(lhs)) {     /* [NaNs already handled]  */
      uByte rbits=rhs->bits;            /* save  */
      uprv_decNumberZero(res);          /* prepare  */
      if (n==0) *res->lsu=1;            /* [-]Inf**0 => 1  */
       else {
        /* -Inf**nonint -> error  */
        if (!rhsint && decNumberIsNegative(lhs)) {
          status|=DEC_Invalid_operation;     /* -Inf**nonint is error  */
          break;}
        if (!(rbits & DECNEG)) bits|=DECINF; /* was not a **-n  */
        /* [otherwise will be 0 or -0]  */
        res->bits=bits;
        }
      break;}

    /* similarly handle LHS zero  */
    if (decNumberIsZero(lhs)) {
      if (n==0) {                            /* 0**0 => Error  */
        #if DECSUBSET
        if (!set->extended) {                /* [unless subset]  */
          uprv_decNumberZero(res);
          *res->lsu=1;                       /* return 1  */
          break;}
        #endif
        status|=DEC_Invalid_operation;
        }
       else {                                /* 0**x  */
        uByte rbits=rhs->bits;               /* save  */
        if (rbits & DECNEG) {                /* was a 0**(-n)  */
          #if DECSUBSET
          if (!set->extended) {              /* [bad if subset]  */
            status|=DEC_Invalid_operation;
            break;}
          #endif
          bits|=DECINF;
          }
        uprv_decNumberZero(res);             /* prepare  */
        /* [otherwise will be 0 or -0]  */
        res->bits=bits;
        }
      break;}

    /* here both lhs and rhs are finite; rhs==0 is handled in the  */
    /* integer path.  Next handle the non-integer cases  */
    if (!useint) {                      /* non-integral rhs  */
      /* any -ve lhs is bad, as is either operand or context out of  */
      /* bounds  */
      if (decNumberIsNegative(lhs)) {
        status|=DEC_Invalid_operation;
        break;}
      if (decCheckMath(lhs, set, &status)
       || decCheckMath(rhs, set, &status)) break; /* variable status  */

      uprv_decContextDefault(&aset, DEC_INIT_DECIMAL64); /* clean context  */
      aset.emax=DEC_MAX_MATH;           /* usual bounds  */
      aset.emin=-DEC_MAX_MATH;          /* ..  */
      aset.clamp=0;                     /* and no concrete format  */

      /* calculate the result using exp(ln(lhs)*rhs), which can  */
      /* all be done into the accumulator, dac.  The precision needed  */
      /* is enough to contain the full information in the lhs (which  */
      /* is the total digits, including exponent), or the requested  */
      /* precision, if larger, + 4; 6 is used for the exponent  */
      /* maximum length, and this is also used when it is shorter  */
      /* than the requested digits as it greatly reduces the >0.5 ulp  */
      /* cases at little cost (because Ln doubles digits each  */
      /* iteration so a few extra digits rarely causes an extra  */
      /* iteration)  */
      aset.digits=MAXI(lhs->digits, set->digits)+6+4;
      } /* non-integer rhs  */

     else { /* rhs is in-range integer  */
      if (n==0) {                       /* x**0 = 1  */
        /* (0**0 was handled above)  */
        uprv_decNumberZero(res);        /* result=1  */
        *res->lsu=1;                    /* ..  */
        break;}
      /* rhs is a non-zero integer  */
      if (n<0) n=-n;                    /* use abs(n)  */

      aset=*set;                        /* clone the context  */
      aset.round=DEC_ROUND_HALF_EVEN;   /* internally use balanced  */
      /* calculate the working DIGITS  */
      aset.digits=reqdigits+(rhs->digits+rhs->exponent)+2;
      #if DECSUBSET
      if (!set->extended) aset.digits--;     /* use classic precision  */
      #endif
      /* it's an error if this is more than can be handled  */
      if (aset.digits>DECNUMMAXP) {status|=DEC_Invalid_operation; break;}
      } /* integer path  */

    /* aset.digits is the count of digits for the accumulator needed  */
    /* if accumulator is too long for local storage, then allocate  */
    needbytes=sizeof(decNumber)+(D2U(aset.digits)-1)*sizeof(Unit);
    /* [needbytes also used below if 1/lhs needed]  */
    if (needbytes>sizeof(dacbuff)) {
      allocdac=(decNumber *)malloc(needbytes);
      if (allocdac==nullptr) {   /* hopeless -- abandon  */
        status|=DEC_Insufficient_storage;
        break;}
      dac=allocdac;              /* use the allocated space  */
      }
    /* here, aset is set up and accumulator is ready for use  */

    if (!useint) {                           /* non-integral rhs  */
      /* x ** y; special-case x=1 here as it will otherwise always  */
      /* reduce to integer 1; decLnOp has a fastpath which detects  */
      /* the case of x=1  */
      decLnOp(dac, lhs, &aset, &status);     /* dac=ln(lhs)  */
      /* [no error possible, as lhs 0 already handled]  */
      if (ISZERO(dac)) {                     /* x==1, 1.0, etc.  */
        /* need to return fully-padded 1.0000 etc., but rhsint->1  */
        *dac->lsu=1;                         /* was 0, make int 1  */
        if (!rhsint) {                       /* add padding  */
          Int shift=set->digits-1;
          dac->digits=decShiftToMost(dac->lsu, 1, shift);
          dac->exponent=-shift;              /* make 1.0000...  */
          status|=DEC_Inexact|DEC_Rounded;   /* deemed inexact  */
          }
        }
       else {
        decMultiplyOp(dac, dac, rhs, &aset, &status);  /* dac=dac*rhs  */
        decExpOp(dac, dac, &aset, &status);            /* dac=exp(dac)  */
        }
      /* and drop through for final rounding  */
      } /* non-integer rhs  */

     else {                             /* carry on with integer  */
      uprv_decNumberZero(dac);          /* acc=1  */
      *dac->lsu=1;                      /* ..  */

      /* if a negative power the constant 1 is needed, and if not subset  */
      /* invert the lhs now rather than inverting the result later  */
      if (decNumberIsNegative(rhs)) {   /* was a **-n [hence digits>0]  */
        decNumber *inv=invbuff;         /* assume use fixed buffer  */
        uprv_decNumberCopy(&dnOne, dac); /* dnOne=1;  [needed now or later]  */
        #if DECSUBSET
        if (set->extended) {            /* need to calculate 1/lhs  */
        #endif
          /* divide lhs into 1, putting result in dac [dac=1/dac]  */
          decDivideOp(dac, &dnOne, lhs, &aset, DIVIDE, &status);
          /* now locate or allocate space for the inverted lhs  */
          if (needbytes>sizeof(invbuff)) {
            allocinv=(decNumber *)malloc(needbytes);
            if (allocinv==nullptr) {    /* hopeless -- abandon  */
              status|=DEC_Insufficient_storage;
              break;}
            inv=allocinv;               /* use the allocated space  */
            }
          /* [inv now points to big-enough buffer or allocated storage]  */
          uprv_decNumberCopy(inv, dac); /* copy the 1/lhs  */
          uprv_decNumberCopy(dac, &dnOne);   /* restore acc=1  */
          lhs=inv;                      /* .. and go forward with new lhs  */
        #if DECSUBSET
          }
        #endif
        }

      /* Raise-to-the-power loop...  */
      /* n holds abs(rhs); each pass shifts n left one bit and tests  */
      /* its sign, so the bits are consumed from the top downwards  */
      seenbit=0;                   /* set once a 1-bit is encountered  */
      for (i=1;;i++){              /* for each bit [top bit ignored]  */
        /* abandon if had overflow or terminal underflow  */
        if (status & (DEC_Overflow|DEC_Underflow)) { /* interesting?  */
          if (status&DEC_Overflow || ISZERO(dac)) break;
          }
        /* [the following two lines revealed an optimizer bug in a C++  */
        /* compiler, with symptom: 5**3 -> 25, when n=n+n was used]  */
        n=n<<1;                    /* move next bit to testable position  */
        if (n<0) {                 /* top bit is set  */
          seenbit=1;               /* OK, significant bit seen  */
          decMultiplyOp(dac, dac, lhs, &aset, &status); /* dac=dac*x  */
          }
        if (i==31) break;          /* that was the last bit  */
        if (!seenbit) continue;    /* no need to square 1  */
        decMultiplyOp(dac, dac, dac, &aset, &status); /* dac=dac*dac [square]  */
        } /*i*/ /* 32 bits  */

      /* complete internal overflow or underflow processing  */
      if (status & (DEC_Overflow|DEC_Underflow)) {
        #if DECSUBSET
        /* If subset, and power was negative, reverse the kind of -erflow  */
        /* [1/x not yet done]  */
        if (!set->extended && decNumberIsNegative(rhs)) {
          if (status & DEC_Overflow)
            status^=DEC_Overflow | DEC_Underflow | DEC_Subnormal;
           else { /* trickier -- Underflow may or may not be set  */
            status&=~(DEC_Underflow | DEC_Subnormal); /* [one or both]  */
            status|=DEC_Overflow;
            }
          }
        #endif
        dac->bits=(dac->bits & ~DECNEG) | bits; /* force correct sign  */
        /* round subnormals [to set.digits rather than aset.digits]  */
        /* or set overflow result similarly as required  */
        decFinalize(dac, set, &residue, &status);
        uprv_decNumberCopy(res, dac);   /* copy to result (is now OK length)  */
        break;
        }

      #if DECSUBSET
      if (!set->extended &&                  /* subset math  */
          decNumberIsNegative(rhs)) {        /* was a **-n [hence digits>0]  */
        /* so divide result into 1 [dac=1/dac]  */
        decDivideOp(dac, &dnOne, dac, &aset, DIVIDE, &status);
        }
      #endif
      } /* rhs integer path  */

    /* reduce result to the requested length and copy to result  */
    decCopyFit(res, dac, set, &residue, &status);
    decFinish(res, set, &residue, &status);  /* final cleanup  */
    #if DECSUBSET
    if (!set->extended) decTrim(res, set, 0, 1, &dropped); /* trailing zeros  */
    #endif
    } while(0);                         /* end protected  */

  if (allocdac!=nullptr) free(allocdac);   /* drop any storage used  */
  if (allocinv!=nullptr) free(allocinv);   /* ..  */
  #if DECSUBSET
  if (alloclhs!=nullptr) free(alloclhs);   /* ..  */
  if (allocrhs!=nullptr) free(allocrhs);   /* ..  */
  #endif
  if (status!=0) decStatus(res, status, set);
  #if DECCHECK
  decCheckInexact(res, set);
  #endif
  return res;
  } /* decNumberPower  */

/* ------------------------------------------------------------------ */
/* decNumberQuantize -- force exponent to requested value             */
/*                                                                    */
/*   This computes C = op(A, B), where op adjusts the coefficient     */
/*   of C (by rounding or shifting) such that the exponent (-scale)   */
/*   of C has exponent of B.  The numerical value of C will equal A,  */
/*   except for the effects of any rounding that occurred.            */
/*                                                                    */
/*   res is C, the result.  C may be A or B                           */
/*   lhs is A, the number to adjust                                   */
/*   rhs is B, the number with exponent to match                      */
/*   set is the context                                               */
/*                                                                    */
/* C must have space for set->digits digits.                          */
/*                                                                    */
/* Unless there is an error or the result is infinite, the exponent   */
/* after the operation is guaranteed to be equal to that of B.        */
/*                                                                    */
/* The work is delegated to decQuantizeOp, called with 1 as its fifth */
/* argument.                                                          */
/* ------------------------------------------------------------------ */
U_CAPI decNumber * U_EXPORT2 uprv_decNumberQuantize(decNumber *res, const decNumber *lhs,
                              const decNumber *rhs, decContext *set) {
  uInt status=0;                        /* accumulator  */
  decQuantizeOp(res, lhs, rhs, set, 1, &status);
  if (status!=0) decStatus(res, status, set);
  return res;
  } /* decNumberQuantize  */

/* ------------------------------------------------------------------ */
/* decNumberReduce -- remove trailing zeros                           */
/*                                                                    */
/*   This computes C = 0 + A, and normalizes the result               */
/*                                                                    */
/*   res is C, the result.  C may be A                                */
/*   rhs is A                                                         */
/*   set is the context                                               */
/*                                                                    */
/* C must have space for set->digits digits.                          
*/ /* ------------------------------------------------------------------ */ /* Previously known as Normalize */ U_CAPI decNumber * U_EXPORT2 uprv_decNumberNormalize(decNumber *res, const decNumber *rhs, decContext *set) { return uprv_decNumberReduce(res, rhs, set); } /* decNumberNormalize */ U_CAPI decNumber * U_EXPORT2 uprv_decNumberReduce(decNumber *res, const decNumber *rhs, decContext *set) { #if DECSUBSET decNumber *allocrhs=nullptr; /* non-nullptr if rounded rhs allocated */ #endif uInt status=0; /* as usual */ Int residue=0; /* as usual */ Int dropped; /* work */ #if DECCHECK if (decCheckOperands(res, DECUNUSED, rhs, set)) return res; #endif do { /* protect allocated storage */ #if DECSUBSET if (!set->extended) { /* reduce operand and set lostDigits status, as needed */ if (rhs->digits>set->digits) { allocrhs=decRoundOperand(rhs, set, &status); if (allocrhs==nullptr) break; rhs=allocrhs; } } #endif /* [following code does not require input rounding] */ /* Infinities copy through; NaNs need usual treatment */ if (decNumberIsNaN(rhs)) { decNaNs(res, rhs, nullptr, set, &status); break; } /* reduce result to the requested length and copy to result */ decCopyFit(res, rhs, set, &residue, &status); /* copy & round */ decFinish(res, set, &residue, &status); /* cleanup/set flags */ decTrim(res, set, 1, 0, &dropped); /* normalize in place */ /* [may clamp] */ } while(0); /* end protected */ #if DECSUBSET if (allocrhs !=nullptr) free(allocrhs); /* .. */ #endif if (status!=0) decStatus(res, status, set);/* then report status */ return res; } /* decNumberReduce */ /* ------------------------------------------------------------------ */ /* decNumberRescale -- force exponent to requested value */ /* */ /* This computes C = op(A, B), where op adjusts the coefficient */ /* of C (by rounding or shifting) such that the exponent (-scale) */ /* of C has the value B. The numerical value of C will equal A, */ /* except for the effects of any rounding that occurred. 
*/ /* */ /* res is C, the result. C may be A or B */ /* lhs is A, the number to adjust */ /* rhs is B, the requested exponent */ /* set is the context */ /* */ /* C must have space for set->digits digits. */ /* */ /* Unless there is an error or the result is infinite, the exponent */ /* after the operation is guaranteed to be equal to B. */ /* ------------------------------------------------------------------ */ U_CAPI decNumber * U_EXPORT2 uprv_decNumberRescale(decNumber *res, const decNumber *lhs, const decNumber *rhs, decContext *set) { uInt status=0; /* accumulator */ decQuantizeOp(res, lhs, rhs, set, 0, &status); if (status!=0) decStatus(res, status, set); return res; } /* decNumberRescale */ /* ------------------------------------------------------------------ */ /* decNumberRemainder -- divide and return remainder */ /* */ /* This computes C = A % B */ /* */ /* res is C, the result. C may be A and/or B (e.g., X=X%X) */ /* lhs is A */ /* rhs is B */ /* set is the context */ /* */ /* C must have space for set->digits digits. */ /* ------------------------------------------------------------------ */ U_CAPI decNumber * U_EXPORT2 uprv_decNumberRemainder(decNumber *res, const decNumber *lhs, const decNumber *rhs, decContext *set) { uInt status=0; /* accumulator */ decDivideOp(res, lhs, rhs, set, REMAINDER, &status); if (status!=0) decStatus(res, status, set); #if DECCHECK decCheckInexact(res, set); #endif return res; } /* decNumberRemainder */ /* ------------------------------------------------------------------ */ /* decNumberRemainderNear -- divide and return remainder from nearest */ /* */ /* This computes C = A % B, where % is the IEEE remainder operator */ /* */ /* res is C, the result. C may be A and/or B (e.g., X=X%X) */ /* lhs is A */ /* rhs is B */ /* set is the context */ /* */ /* C must have space for set->digits digits. 
*/
/* The work is delegated to decDivideOp with the REMNEAR selector.    */
/* ------------------------------------------------------------------ */
U_CAPI decNumber * U_EXPORT2 uprv_decNumberRemainderNear(decNumber *res, const decNumber *lhs,
                                   const decNumber *rhs, decContext *set) {
  uInt flags=0;                         /* status bits from the divide  */

  decDivideOp(res, lhs, rhs, set, REMNEAR, &flags);
  if (flags) decStatus(res, flags, set);
  #if DECCHECK
  decCheckInexact(res, set);
  #endif
  return res;
  } /* decNumberRemainderNear  */

/* ------------------------------------------------------------------ */
/* decNumberRotate -- rotate the coefficient of a Number left/right   */
/*                                                                    */
/*   This computes C = A rot B  (in base ten and rotating set->digits */
/*   digits).                                                         */
/*                                                                    */
/*   res is C, the result.  C may be A and/or B (e.g., X=XrotX)       */
/*   lhs is A                                                         */
/*   rhs is B, the number of digits to rotate (-ve to right)          */
/*   set is the context                                               */
/*                                                                    */
/* The digits of the coefficient of A are rotated to the left (if B   */
/* is positive) or to the right (if B is negative) without adjusting  */
/* the exponent or the sign of A.  If lhs->digits is less than        */
/* set->digits the coefficient is padded with zeros on the left       */
/* before the rotate.  Any leading zeros in the result are removed    */
/* as usual.                                                          */
/*                                                                    */
/* B must be an integer (q=0) and in the range -set->digits through   */
/* +set->digits.                                                      */
/* C must have space for set->digits digits.                          */
/* NaNs are propagated as usual.  Infinities are unaffected (but      */
/* B must be valid).  No status is set unless B is invalid or an      */
/* operand is an sNaN.                                                
*/
/* ------------------------------------------------------------------ */
/* Implementation outline: once B has been validated the coefficient */
/* is zero-padded to set->digits and rotated in place (partial-unit */
/* shifts, then a triple reversal of whole Units). The final length */
/* is recomputed with decGetDigits because the rotation may leave */
/* leading zeros. */
U_CAPI decNumber * U_EXPORT2 uprv_decNumberRotate(decNumber *res, const decNumber *lhs,
                                                  const decNumber *rhs, decContext *set) {
  uInt status=0;                        /* accumulator */
  Int rotate;                           /* rhs as an Int */

  #if DECCHECK
  if (decCheckOperands(res, lhs, rhs, set)) return res;
  #endif

  /* NaNs propagate as normal */
  if (decNumberIsNaN(lhs) || decNumberIsNaN(rhs))
    decNaNs(res, lhs, rhs, set, &status);
  /* rhs must be an integer */
  else if (decNumberIsInfinite(rhs) || rhs->exponent!=0)
    status=DEC_Invalid_operation;
  else { /* both numeric, rhs is an integer */
    rotate=decGetInt(rhs);                     /* [cannot fail] */
    if (rotate==BADINT                         /* something bad .. */
     || rotate==BIGODD || rotate==BIGEVEN      /* .. very big .. */
     || abs(rotate)>set->digits)               /* .. or out of range */
      status=DEC_Invalid_operation;
    else {                                     /* rhs is OK */
      uprv_decNumberCopy(res, lhs);
      /* convert -ve rotate to equivalent positive rotation */
      if (rotate<0) rotate=set->digits+rotate;
      if (rotate!=0 && rotate!=set->digits     /* zero or full rotation */
       && !decNumberIsInfinite(res)) {         /* lhs was infinite */
        /* left-rotate to do; 0 < rotate < set->digits */
        uInt units, shift;                     /* work */
        uInt msudigits;                        /* digits in result msu */
        Unit *msu=res->lsu+D2U(res->digits)-1;    /* current msu */
        Unit *msumax=res->lsu+D2U(set->digits)-1; /* rotation msu */
        for (msu++; msu<=msumax; msu++) *msu=0;   /* ensure high units=0 */
        res->digits=set->digits;                  /* now full-length */
        msudigits=MSUDIGITS(res->digits);         /* actual digits in msu */

        /* rotation here is done in-place, in three steps */
        /* 1. shift all to least up to one unit to unit-align final */
        /* lsd [any digits shifted out are rotated to the left, */
        /* abutted to the original msd (which may require split)] */
        /* */
        /* [if there are no whole units left to rotate, the */
        /* rotation is now complete] */
        /* */
        /* 2. shift to least, from below the split point only, so that */
        /* the final msd is in the right place in its Unit [any */
        /* digits shifted out will fit exactly in the current msu, */
        /* left aligned, no split required] */
        /* */
        /* 3. rotate all the units by reversing left part, right */
        /* part, and then whole */
        /* */
        /* example: rotate right 8 digits (2 units + 2), DECDPUN=3. */
        /* */
        /* start: 00a bcd efg hij klm npq */
        /* */
        /* 1a 000 0ab cde fgh|ijk lmn [pq saved] */
        /* 1b 00p qab cde fgh|ijk lmn */
        /* */
        /* 2a 00p qab cde fgh|00i jkl [mn saved] */
        /* 2b mnp qab cde fgh|00i jkl */
        /* */
        /* 3a fgh cde qab mnp|00i jkl */
        /* 3b fgh cde qab mnp|jkl 00i */
        /* 3c 00i jkl mnp qab cde fgh */

        /* Step 1: amount to shift is the partial right-rotate count */
        rotate=set->digits-rotate;             /* make it right-rotate */
        units=rotate/DECDPUN;                  /* whole units to rotate */
        shift=rotate%DECDPUN;                  /* left-over digits count */
        if (shift>0) {                         /* not an exact number of units */
          uInt save=res->lsu[0]%powers[shift]; /* save low digit(s) */
          decShiftToLeast(res->lsu, D2U(res->digits), shift);
          if (shift>msudigits) {               /* msumax-1 needs >0 digits */
            uInt rem=save%powers[shift-msudigits];/* split save */
            *msumax=(Unit)(save/powers[shift-msudigits]); /* and insert */
            *(msumax-1)=*(msumax-1)
                       +(Unit)(rem*powers[DECDPUN-(shift-msudigits)]); /* .. */
          }
          else {                               /* all fits in msumax */
            *msumax=*msumax+(Unit)(save*powers[msudigits-shift]); /* [maybe *1] */
          }
        } /* digits shift needed */

        /* If whole units to rotate... */
        if (units>0) {                         /* some to do */
          /* Step 2: the units to touch are the whole ones in rotate, */
          /* if any, and the shift is DECDPUN-msudigits (which may be */
          /* 0, again) */
          shift=DECDPUN-msudigits;
          if (shift>0) {                       /* not an exact number of units */
            uInt save=res->lsu[0]%powers[shift]; /* save low digit(s) */
            decShiftToLeast(res->lsu, units, shift);
            *msumax=*msumax+(Unit)(save*powers[msudigits]);
          } /* partial shift needed */

          /* Step 3: rotate the units array using triple reverse */
          /* (reversing is easy and fast) */
          decReverse(res->lsu+units, msumax);     /* left part */
          decReverse(res->lsu, res->lsu+units-1); /* right part */
          decReverse(res->lsu, msumax);           /* whole */
        } /* whole units to rotate */
        /* the rotation may have left an undetermined number of zeros */
        /* on the left, so true length needs to be calculated */
        /* [the pointer difference is narrowed explicitly to the 32-bit */
        /* length argument] */
        res->digits=decGetDigits(res->lsu, static_cast<int32_t>(msumax-res->lsu+1));
      } /* rotate needed */
    } /* rhs OK */
  } /* numerics */
  if (status!=0) decStatus(res, status, set);
  return res;
} /* decNumberRotate */

/* ------------------------------------------------------------------ */
/* decNumberSameQuantum -- test for equal exponents */
/* */
/* res is the result number, which will contain either 0 or 1 */
/* lhs is a number to test */
/* rhs is the second (usually a pattern) */
/* */
/* No errors are possible and no context is needed.
*/
/* ------------------------------------------------------------------ */
U_CAPI decNumber * U_EXPORT2 uprv_decNumberSameQuantum(decNumber *res, const decNumber *lhs,
                                                       const decNumber *rhs) {
  Unit ret=0;                           /* return value */

  #if DECCHECK
  if (decCheckOperands(res, lhs, rhs, DECUNCONT)) return res;
  #endif

  if (SPECIALARGS) {
    /* two NaNs, or two infinities, count as the same quantum */
    if (decNumberIsNaN(lhs) && decNumberIsNaN(rhs)) ret=1;
    else if (decNumberIsInfinite(lhs) && decNumberIsInfinite(rhs)) ret=1;
    /* [anything else with a special gives 0] */
  }
  else if (lhs->exponent==rhs->exponent) ret=1;

  /* res may alias lhs or rhs, so it is only written once the test is done */
  uprv_decNumberZero(res);              /* OK to overwrite an operand now */
  *res->lsu=ret;
  return res;
} /* decNumberSameQuantum */

/* ------------------------------------------------------------------ */
/* decNumberScaleB -- multiply by a power of 10 */
/* */
/* This computes C = A x 10**B where B is an integer (q=0) with */
/* maximum magnitude 2*(emax+digits) */
/* */
/* res is C, the result. C may be A or B */
/* lhs is A, the number to adjust */
/* rhs is B, the requested power of ten to use */
/* set is the context */
/* */
/* C must have space for set->digits digits. */
/* */
/* The result may underflow or overflow. */
/* ------------------------------------------------------------------ */
U_CAPI decNumber * U_EXPORT2 uprv_decNumberScaleB(decNumber *res, const decNumber *lhs,
                                                  const decNumber *rhs, decContext *set) {
  Int reqexp;                           /* requested exponent change [B] */
  uInt status=0;                        /* accumulator */
  Int residue;                          /* work */

  #if DECCHECK
  if (decCheckOperands(res, lhs, rhs, set)) return res;
  #endif

  /* Handle special values except lhs infinite */
  if (decNumberIsNaN(lhs) || decNumberIsNaN(rhs))
    decNaNs(res, lhs, rhs, set, &status);
  /* rhs must be an integer */
  else if (decNumberIsInfinite(rhs) || rhs->exponent!=0)
    status=DEC_Invalid_operation;
  else {
    /* lhs is a number; rhs is a finite with q==0 */
    reqexp=decGetInt(rhs);                     /* [cannot fail] */
    if (reqexp==BADINT                         /* something bad .. */
     || reqexp==BIGODD || reqexp==BIGEVEN      /* .. very big .. */
     || abs(reqexp)>(2*(set->digits+set->emax))) /* .. or out of range */
      status=DEC_Invalid_operation;
    else {                                     /* rhs is OK */
      uprv_decNumberCopy(res, lhs);            /* all done if infinite lhs */
      if (!decNumberIsInfinite(res)) {         /* prepare to scale */
        res->exponent+=reqexp;                 /* adjust the exponent */
        residue=0;
        decFinalize(res, set, &residue, &status); /* .. and check */
      } /* finite LHS */
    } /* rhs OK */
  } /* rhs finite */
  if (status!=0) decStatus(res, status, set);
  return res;
} /* decNumberScaleB */

/* ------------------------------------------------------------------ */
/* decNumberShift -- shift the coefficient of a Number left or right */
/* */
/* This computes C = A << B or C = A >> -B (in base ten). */
/* */
/* res is C, the result. C may be A and/or B (e.g., X=X<<X) */
/* lhs is A */
/* rhs is B, the number of digits to shift (-ve to right) */
/* set is the context */
/* */
/* The digits of the coefficient of A are shifted to the left (if B */
/* is positive) or to the right (if B is negative) without adjusting */
/* the exponent or the sign of A. */
/* */
/* B must be an integer (q=0) and in the range -set->digits through */
/* +set->digits. */
/* C must have space for set->digits digits. */
/* NaNs are propagated as usual. Infinities are unaffected (but */
/* B must be valid). No status is set unless B is invalid or an */
/* operand is an sNaN. */
/* ------------------------------------------------------------------ */
U_CAPI decNumber * U_EXPORT2 uprv_decNumberShift(decNumber *res, const decNumber *lhs,
                                                 const decNumber *rhs, decContext *set) {
  uInt status=0;                        /* accumulator */
  Int shift;                            /* rhs as an Int */

  #if DECCHECK
  if (decCheckOperands(res, lhs, rhs, set)) return res;
  #endif

  /* NaNs propagate as normal */
  if (decNumberIsNaN(lhs) || decNumberIsNaN(rhs))
    decNaNs(res, lhs, rhs, set, &status);
  /* rhs must be an integer */
  else if (decNumberIsInfinite(rhs) || rhs->exponent!=0)
    status=DEC_Invalid_operation;
  else { /* both numeric, rhs is an integer */
    shift=decGetInt(rhs);                      /* [cannot fail] */
    if (shift==BADINT                          /* something bad .. */
     || shift==BIGODD || shift==BIGEVEN        /* .. very big .. */
     || abs(shift)>set->digits)                /* .. or out of range */
      status=DEC_Invalid_operation;
    else {                                     /* rhs is OK */
      uprv_decNumberCopy(res, lhs);
      if (shift!=0 && !decNumberIsInfinite(res)) { /* something to do */
        if (shift>0) {                         /* to left */
          if (shift==set->digits) {            /* removing all */
            *res->lsu=0;                       /* so place 0 */
            res->digits=1;                     /* .. */
          }
          else {                               /* */
            /* first remove leading digits if necessary */
            if (res->digits+shift>set->digits) {
              decDecap(res, res->digits+shift-set->digits);
              /* that updated res->digits; may have gone to 1 (for a */
              /* single digit or for zero */
            }
            if (res->digits>1 || *res->lsu)    /* if non-zero.. */
              res->digits=decShiftToMost(res->lsu, res->digits, shift);
          } /* partial left */
        } /* left */
        else {                                 /* to right */
          if (-shift>=res->digits) {           /* discarding all */
            *res->lsu=0;                       /* so place 0 */
            res->digits=1;                     /* .. */
          }
          else {
            decShiftToLeast(res->lsu, D2U(res->digits), -shift);
            res->digits-=(-shift);
          }
        } /* to right */
      } /* non-0 non-Inf shift */
    } /* rhs OK */
  } /* numerics */
  if (status!=0) decStatus(res, status, set);
  return res;
} /* decNumberShift */

/* ------------------------------------------------------------------ */
/* decNumberSquareRoot -- square root operator */
/* */
/* This computes C = squareroot(A) */
/* */
/* res is C, the result. C may be A */
/* rhs is A */
/* set is the context; note that rounding mode has no effect */
/* */
/* C must have space for set->digits digits. */
/* ------------------------------------------------------------------ */
/* This uses the following varying-precision algorithm in: */
/* */
/* Properly Rounded Variable Precision Square Root, T. E. Hull and */
/* A. Abrham, ACM Transactions on Mathematical Software, Vol 11 #3, */
/* pp229-237, ACM, September 1985. */
/* */
/* The square-root is calculated using Newton's method, after which */
/* a check is made to ensure the result is correctly rounded. */
/* */
/* % [Reformatted original Numerical Turing source code follows.]
*/ /* function sqrt(x : real) : real */ /* % sqrt(x) returns the properly rounded approximation to the square */ /* % root of x, in the precision of the calling environment, or it */ /* % fails if x < 0. */ /* % t e hull and a abrham, august, 1984 */ /* if x <= 0 then */ /* if x < 0 then */ /* assert false */ /* else */ /* result 0 */ /* end if */ /* end if */ /* var f := setexp(x, 0) % fraction part of x [0.1 <= x < 1] */ /* var e := getexp(x) % exponent part of x */ /* var approx : real */ /* if e mod 2 = 0 then */ /* approx := .259 + .819 * f % approx to root of f */ /* else */ /* f := f/l0 % adjustments */ /* e := e + 1 % for odd */ /* approx := .0819 + 2.59 * f % exponent */ /* end if */ /* */ /* var p:= 3 */ /* const maxp := currentprecision + 2 */ /* loop */ /* p := min(2*p - 2, maxp) % p = 4,6,10, . . . , maxp */ /* precision p */ /* approx := .5 * (approx + f/approx) */ /* exit when p = maxp */ /* end loop */ /* */ /* % approx is now within 1 ulp of the properly rounded square root */ /* % of f; to ensure proper rounding, compare squares of (approx - */ /* % l/2 ulp) and (approx + l/2 ulp) with f. 
*/ /* p := currentprecision */ /* begin */ /* precision p + 2 */ /* const approxsubhalf := approx - setexp(.5, -p) */ /* if mulru(approxsubhalf, approxsubhalf) > f then */ /* approx := approx - setexp(.l, -p + 1) */ /* else */ /* const approxaddhalf := approx + setexp(.5, -p) */ /* if mulrd(approxaddhalf, approxaddhalf) < f then */ /* approx := approx + setexp(.l, -p + 1) */ /* end if */ /* end if */ /* end */ /* result setexp(approx, e div 2) % fix exponent */ /* end sqrt */ /* ------------------------------------------------------------------ */ // #if defined(__clang__) || U_GCC_MAJOR_MINOR >= 406 // #pragma GCC diagnostic push // #pragma GCC diagnostic ignored "-Warray-bounds" // #endif U_CAPI decNumber * U_EXPORT2 uprv_decNumberSquareRoot(decNumber *res, const decNumber *rhs, decContext *set) { decContext workset, approxset; /* work contexts */ decNumber dzero; /* used for constant zero */ Int maxp; /* largest working precision */ Int workp; /* working precision */ Int residue=0; /* rounding residue */ uInt status=0, ignore=0; /* status accumulators */ uInt rstatus; /* .. */ Int exp; /* working exponent */ Int ideal; /* ideal (preferred) exponent */ Int needbytes; /* work */ Int dropped; /* .. 
*/ #if DECSUBSET decNumber *allocrhs=nullptr; /* non-nullptr if rounded rhs allocated */ #endif /* buffer for f [needs +1 in case DECBUFFER 0] */ decNumber buff[D2N(DECBUFFER+1)]; /* buffer for a [needs +2 to match likely maxp] */ decNumber bufa[D2N(DECBUFFER+2)]; /* buffer for temporary, b [must be same size as a] */ decNumber bufb[D2N(DECBUFFER+2)]; decNumber *allocbuff=nullptr; /* -> allocated buff, iff allocated */ decNumber *allocbufa=nullptr; /* -> allocated bufa, iff allocated */ decNumber *allocbufb=nullptr; /* -> allocated bufb, iff allocated */ decNumber *f=buff; /* reduced fraction */ decNumber *a=bufa; /* approximation to result */ decNumber *b=bufb; /* intermediate result */ /* buffer for temporary variable, up to 3 digits */ decNumber buft[D2N(3)]; decNumber *t=buft; /* up-to-3-digit constant or work */ #if DECCHECK if (decCheckOperands(res, DECUNUSED, rhs, set)) return res; #endif do { /* protect allocated storage */ #if DECSUBSET if (!set->extended) { /* reduce operand and set lostDigits status, as needed */ if (rhs->digits>set->digits) { allocrhs=decRoundOperand(rhs, set, &status); if (allocrhs==nullptr) break; /* [Note: 'f' allocation below could reuse this buffer if */ /* used, but as this is rare they are kept separate for clarity.] */ rhs=allocrhs; } } #endif /* [following code does not require input rounding] */ /* handle infinities and NaNs */ if (SPECIALARG) { if (decNumberIsInfinite(rhs)) { /* an infinity */ if (decNumberIsNegative(rhs)) status|=DEC_Invalid_operation; else uprv_decNumberCopy(res, rhs); /* +Infinity */ } else decNaNs(res, rhs, nullptr, set, &status); /* a NaN */ break; } /* calculate the ideal (preferred) exponent [floor(exp/2)] */ /* [It would be nicer to write: ideal=rhs->exponent>>1, but this */ /* generates a compiler warning. Generated code is the same.] 
*/ ideal=(rhs->exponent&~1)/2; /* target */ /* handle zeros */ if (ISZERO(rhs)) { uprv_decNumberCopy(res, rhs); /* could be 0 or -0 */ res->exponent=ideal; /* use the ideal [safe] */ /* use decFinish to clamp any out-of-range exponent, etc. */ decFinish(res, set, &residue, &status); break; } /* any other -x is an oops */ if (decNumberIsNegative(rhs)) { status|=DEC_Invalid_operation; break; } /* space is needed for three working variables */ /* f -- the same precision as the RHS, reduced to 0.01->0.99... */ /* a -- Hull's approximation -- precision, when assigned, is */ /* currentprecision+1 or the input argument precision, */ /* whichever is larger (+2 for use as temporary) */ /* b -- intermediate temporary result (same size as a) */ /* if any is too long for local storage, then allocate */ workp=MAXI(set->digits+1, rhs->digits); /* actual rounding precision */ workp=MAXI(workp, 7); /* at least 7 for low cases */ maxp=workp+2; /* largest working precision */ needbytes=sizeof(decNumber)+(D2U(rhs->digits)-1)*sizeof(Unit); if (needbytes>(Int)sizeof(buff)) { allocbuff=(decNumber *)malloc(needbytes); if (allocbuff==nullptr) { /* hopeless -- abandon */ status|=DEC_Insufficient_storage; break;} f=allocbuff; /* use the allocated space */ } /* a and b both need to be able to hold a maxp-length number */ needbytes=sizeof(decNumber)+(D2U(maxp)-1)*sizeof(Unit); if (needbytes>(Int)sizeof(bufa)) { /* [same applies to b] */ allocbufa=(decNumber *)malloc(needbytes); allocbufb=(decNumber *)malloc(needbytes); if (allocbufa==nullptr || allocbufb==nullptr) { /* hopeless */ status|=DEC_Insufficient_storage; break;} a=allocbufa; /* use the allocated spaces */ b=allocbufb; /* .. 
*/ } /* copy rhs -> f, save exponent, and reduce so 0.1 <= f < 1 */ uprv_decNumberCopy(f, rhs); exp=f->exponent+f->digits; /* adjusted to Hull rules */ f->exponent=-(f->digits); /* to range */ /* set up working context */ uprv_decContextDefault(&workset, DEC_INIT_DECIMAL64); workset.emax=DEC_MAX_EMAX; workset.emin=DEC_MIN_EMIN; /* [Until further notice, no error is possible and status bits */ /* (Rounded, etc.) should be ignored, not accumulated.] */ /* Calculate initial approximation, and allow for odd exponent */ workset.digits=workp; /* p for initial calculation */ t->bits=0; t->digits=3; a->bits=0; a->digits=3; if ((exp & 1)==0) { /* even exponent */ /* Set t=0.259, a=0.819 */ t->exponent=-3; a->exponent=-3; #if DECDPUN>=3 t->lsu[0]=259; a->lsu[0]=819; #elif DECDPUN==2 t->lsu[0]=59; t->lsu[1]=2; a->lsu[0]=19; a->lsu[1]=8; #else t->lsu[0]=9; t->lsu[1]=5; t->lsu[2]=2; a->lsu[0]=9; a->lsu[1]=1; a->lsu[2]=8; #endif } else { /* odd exponent */ /* Set t=0.0819, a=2.59 */ f->exponent--; /* f=f/10 */ exp++; /* e=e+1 */ t->exponent=-4; a->exponent=-2; #if DECDPUN>=3 t->lsu[0]=819; a->lsu[0]=259; #elif DECDPUN==2 t->lsu[0]=19; t->lsu[1]=8; a->lsu[0]=59; a->lsu[1]=2; #else t->lsu[0]=9; t->lsu[1]=1; t->lsu[2]=8; a->lsu[0]=9; a->lsu[1]=5; a->lsu[2]=2; #endif } decMultiplyOp(a, a, f, &workset, &ignore); /* a=a*f */ decAddOp(a, a, t, &workset, 0, &ignore); /* ..+t */ /* [a is now the initial approximation for sqrt(f), calculated with */ /* currentprecision, which is also a's precision.] */ /* the main calculation loop */ uprv_decNumberZero(&dzero); /* make 0 */ uprv_decNumberZero(t); /* set t = 0.5 */ t->lsu[0]=5; /* .. */ t->exponent=-1; /* .. */ workset.digits=3; /* initial p */ for (; workset.digitsexponent+=exp/2; /* set correct exponent */ rstatus=0; /* clear status */ residue=0; /* .. 
and accumulator */ decCopyFit(a, a, &approxset, &residue, &rstatus); /* reduce (if needed) */ decFinish(a, &approxset, &residue, &rstatus); /* clean and finalize */ /* Overflow was possible if the input exponent was out-of-range, */ /* in which case quit */ if (rstatus&DEC_Overflow) { status=rstatus; /* use the status as-is */ uprv_decNumberCopy(res, a); /* copy to result */ break; } /* Preserve status except Inexact/Rounded */ status|=(rstatus & ~(DEC_Rounded|DEC_Inexact)); /* Carry out the Hull correction */ a->exponent-=exp/2; /* back to 0.1->1 */ /* a is now at final precision and within 1 ulp of the properly */ /* rounded square root of f; to ensure proper rounding, compare */ /* squares of (a - l/2 ulp) and (a + l/2 ulp) with f. */ /* Here workset.digits=maxp and t=0.5, and a->digits determines */ /* the ulp */ workset.digits--; /* maxp-1 is OK now */ t->exponent=-a->digits-1; /* make 0.5 ulp */ decAddOp(b, a, t, &workset, DECNEG, &ignore); /* b = a - 0.5 ulp */ workset.round=DEC_ROUND_UP; decMultiplyOp(b, b, b, &workset, &ignore); /* b = mulru(b, b) */ decCompareOp(b, f, b, &workset, COMPARE, &ignore); /* b ? f, reversed */ if (decNumberIsNegative(b)) { /* f < b [i.e., b > f] */ /* this is the more common adjustment, though both are rare */ t->exponent++; /* make 1.0 ulp */ t->lsu[0]=1; /* .. */ decAddOp(a, a, t, &workset, DECNEG, &ignore); /* a = a - 1 ulp */ /* assign to approx [round to length] */ approxset.emin-=exp/2; /* adjust to match a */ approxset.emax-=exp/2; decAddOp(a, &dzero, a, &approxset, 0, &ignore); } else { decAddOp(b, a, t, &workset, 0, &ignore); /* b = a + 0.5 ulp */ workset.round=DEC_ROUND_DOWN; decMultiplyOp(b, b, b, &workset, &ignore); /* b = mulrd(b, b) */ decCompareOp(b, b, f, &workset, COMPARE, &ignore); /* b ? f */ if (decNumberIsNegative(b)) { /* b < f */ t->exponent++; /* make 1.0 ulp */ t->lsu[0]=1; /* .. 
*/ decAddOp(a, a, t, &workset, 0, &ignore); /* a = a + 1 ulp */ /* assign to approx [round to length] */ approxset.emin-=exp/2; /* adjust to match a */ approxset.emax-=exp/2; decAddOp(a, &dzero, a, &approxset, 0, &ignore); } } /* [no errors are possible in the above, and rounding/inexact during */ /* estimation are irrelevant, so status was not accumulated] */ /* Here, 0.1 <= a < 1 (still), so adjust back */ a->exponent+=exp/2; /* set correct exponent */ /* count droppable zeros [after any subnormal rounding] by */ /* trimming a copy */ uprv_decNumberCopy(b, a); decTrim(b, set, 1, 1, &dropped); /* [drops trailing zeros] */ /* Set Inexact and Rounded. The answer can only be exact if */ /* it is short enough so that squaring it could fit in workp */ /* digits, so this is the only (relatively rare) condition that */ /* a careful check is needed */ if (b->digits*2-1 > workp) { /* cannot fit */ status|=DEC_Inexact|DEC_Rounded; } else { /* could be exact/unrounded */ uInt mstatus=0; /* local status */ decMultiplyOp(b, b, b, &workset, &mstatus); /* try the multiply */ if (mstatus&DEC_Overflow) { /* result just won't fit */ status|=DEC_Inexact|DEC_Rounded; } else { /* plausible */ decCompareOp(t, b, rhs, &workset, COMPARE, &mstatus); /* b ? rhs */ if (!ISZERO(t)) status|=DEC_Inexact|DEC_Rounded; /* not equal */ else { /* is Exact */ /* here, dropped is the count of trailing zeros in 'a' */ /* use closest exponent to ideal... 
*/ Int todrop=ideal-a->exponent; /* most that can be dropped */ if (todrop<0) status|=DEC_Rounded; /* ideally would add 0s */ else { /* unrounded */ /* there are some to drop, but emax may not allow all */ Int maxexp=set->emax-set->digits+1; Int maxdrop=maxexp-a->exponent; if (todrop>maxdrop && set->clamp) { /* apply clamping */ todrop=maxdrop; status|=DEC_Clamped; } if (dropped0) { /* have some to drop */ decShiftToLeast(a->lsu, D2U(a->digits), todrop); a->exponent+=todrop; /* maintain numerical value */ a->digits-=todrop; /* new length */ } } } } } /* double-check Underflow, as perhaps the result could not have */ /* been subnormal (initial argument too big), or it is now Exact */ if (status&DEC_Underflow) { Int ae=rhs->exponent+rhs->digits-1; /* adjusted exponent */ /* check if truly subnormal */ #if DECEXTFLAG /* DEC_Subnormal too */ if (ae>=set->emin*2) status&=~(DEC_Subnormal|DEC_Underflow); #else if (ae>=set->emin*2) status&=~DEC_Underflow; #endif /* check if truly inexact */ if (!(status&DEC_Inexact)) status&=~DEC_Underflow; } uprv_decNumberCopy(res, a); /* a is now the result */ } while(0); /* end protected */ if (allocbuff!=nullptr) free(allocbuff); /* drop any storage used */ if (allocbufa!=nullptr) free(allocbufa); /* .. */ if (allocbufb!=nullptr) free(allocbufb); /* .. */ #if DECSUBSET if (allocrhs !=nullptr) free(allocrhs); /* .. */ #endif if (status!=0) decStatus(res, status, set);/* then report status */ #if DECCHECK decCheckInexact(res, set); #endif return res; } /* decNumberSquareRoot */ // #if defined(__clang__) || U_GCC_MAJOR_MINOR >= 406 // #pragma GCC diagnostic pop // #endif /* ------------------------------------------------------------------ */ /* decNumberSubtract -- subtract two Numbers */ /* */ /* This computes C = A - B */ /* */ /* res is C, the result. C may be A and/or B (e.g., X=X-X) */ /* lhs is A */ /* rhs is B */ /* set is the context */ /* */ /* C must have space for set->digits digits. 
*/
/* ------------------------------------------------------------------ */
U_CAPI decNumber * U_EXPORT2 uprv_decNumberSubtract(decNumber *res, const decNumber *lhs,
                                                    const decNumber *rhs, decContext *set) {
  uInt raised = 0;                      /* status bits collected from the worker */

  /* the shared add worker is called with DECNEG to perform A - B */
  decAddOp(res, lhs, rhs, set, DECNEG, &raised);
  if (raised) {
    decStatus(res, raised, set);
  }
  #if DECCHECK
  decCheckInexact(res, set);
  #endif
  return res;
} /* decNumberSubtract */

/* ------------------------------------------------------------------ */
/* decNumberToIntegralExact -- round-to-integral-value with InExact */
/* decNumberToIntegralValue -- round-to-integral-value */
/* */
/* res is the result */
/* rhs is input number */
/* set is the context */
/* */
/* res must have space for any value of rhs. */
/* */
/* This implements the IEEE special operators and therefore treats */
/* special values as valid. For finite numbers it returns */
/* rescale(rhs, 0) if rhs->exponent is <0. */
/* Otherwise the result is rhs (so no error is possible, except for */
/* sNaN). */
/* */
/* The context is used for rounding mode and status after sNaN, but */
/* the digits setting is ignored. The Exact version will signal */
/* Inexact if the result differs numerically from rhs; the other */
/* never signals Inexact.
*/ /* ------------------------------------------------------------------ */ U_CAPI decNumber * U_EXPORT2 uprv_decNumberToIntegralExact(decNumber *res, const decNumber *rhs, decContext *set) { decNumber dn; decContext workset; /* working context */ uInt status=0; /* accumulator */ #if DECCHECK if (decCheckOperands(res, DECUNUSED, rhs, set)) return res; #endif /* handle infinities and NaNs */ if (SPECIALARG) { if (decNumberIsInfinite(rhs)) uprv_decNumberCopy(res, rhs); /* an Infinity */ else decNaNs(res, rhs, nullptr, set, &status); /* a NaN */ } else { /* finite */ /* have a finite number; no error possible (res must be big enough) */ if (rhs->exponent>=0) return uprv_decNumberCopy(res, rhs); /* that was easy, but if negative exponent there is work to do... */ workset=*set; /* clone rounding, etc. */ workset.digits=rhs->digits; /* no length rounding */ workset.traps=0; /* no traps */ uprv_decNumberZero(&dn); /* make a number with exponent 0 */ uprv_decNumberQuantize(res, rhs, &dn, &workset); status|=workset.status; } if (status!=0) decStatus(res, status, set); return res; } /* decNumberToIntegralExact */ U_CAPI decNumber * U_EXPORT2 uprv_decNumberToIntegralValue(decNumber *res, const decNumber *rhs, decContext *set) { decContext workset=*set; /* working context */ workset.traps=0; /* no traps */ uprv_decNumberToIntegralExact(res, rhs, &workset); /* this never affects set, except for sNaNs; NaN will have been set */ /* or propagated already, so no need to call decStatus */ set->status|=workset.status&DEC_Invalid_operation; return res; } /* decNumberToIntegralValue */ /* ------------------------------------------------------------------ */ /* decNumberXor -- XOR two Numbers, digitwise */ /* */ /* This computes C = A ^ B */ /* */ /* res is C, the result. C may be A and/or B (e.g., X=X^X) */ /* lhs is A */ /* rhs is B */ /* set is the context (used for result length and error report) */ /* */ /* C must have space for set->digits digits. 
*/ /* */ /* Logical function restrictions apply (see above); a NaN is */ /* returned with Invalid_operation if a restriction is violated. */ /* ------------------------------------------------------------------ */ U_CAPI decNumber * U_EXPORT2 uprv_decNumberXor(decNumber *res, const decNumber *lhs, const decNumber *rhs, decContext *set) { const Unit *ua, *ub; /* -> operands */ const Unit *msua, *msub; /* -> operand msus */ Unit *uc, *msuc; /* -> result and its msu */ Int msudigs; /* digits in res msu */ #if DECCHECK if (decCheckOperands(res, lhs, rhs, set)) return res; #endif if (lhs->exponent!=0 || decNumberIsSpecial(lhs) || decNumberIsNegative(lhs) || rhs->exponent!=0 || decNumberIsSpecial(rhs) || decNumberIsNegative(rhs)) { decStatus(res, DEC_Invalid_operation, set); return res; } /* operands are valid */ ua=lhs->lsu; /* bottom-up */ ub=rhs->lsu; /* .. */ uc=res->lsu; /* .. */ msua=ua+D2U(lhs->digits)-1; /* -> msu of lhs */ msub=ub+D2U(rhs->digits)-1; /* -> msu of rhs */ msuc=uc+D2U(set->digits)-1; /* -> msu of result */ msudigs=MSUDIGITS(set->digits); /* [faster than remainder] */ for (; uc<=msuc; ua++, ub++, uc++) { /* Unit loop */ Unit a, b; /* extract units */ if (ua>msua) a=0; else a=*ua; if (ub>msub) b=0; else b=*ub; *uc=0; /* can now write back */ if (a|b) { /* maybe 1 bits to examine */ Int i, j; /* This loop could be unrolled and/or use BIN2BCD tables */ for (i=0; i1) { decStatus(res, DEC_Invalid_operation, set); return res; } if (uc==msuc && i==msudigs-1) break; /* just did final digit */ } /* each digit */ } /* non-zero */ } /* each unit */ /* [here uc-1 is the msu of the result] */ res->digits=decGetDigits(res->lsu, static_cast(uc-res->lsu)); res->exponent=0; /* integer */ res->bits=0; /* sign=0 */ return res; /* [no status to set] */ } /* decNumberXor */ /* ================================================================== */ /* Utility routines */ /* ================================================================== */ /* 
------------------------------------------------------------------ */ /* decNumberClass -- return the decClass of a decNumber */ /* dn -- the decNumber to test */ /* set -- the context to use for Emin */ /* returns the decClass enum */ /* ------------------------------------------------------------------ */ enum decClass uprv_decNumberClass(const decNumber *dn, decContext *set) { if (decNumberIsSpecial(dn)) { if (decNumberIsQNaN(dn)) return DEC_CLASS_QNAN; if (decNumberIsSNaN(dn)) return DEC_CLASS_SNAN; /* must be an infinity */ if (decNumberIsNegative(dn)) return DEC_CLASS_NEG_INF; return DEC_CLASS_POS_INF; } /* is finite */ if (uprv_decNumberIsNormal(dn, set)) { /* most common */ if (decNumberIsNegative(dn)) return DEC_CLASS_NEG_NORMAL; return DEC_CLASS_POS_NORMAL; } /* is subnormal or zero */ if (decNumberIsZero(dn)) { /* most common */ if (decNumberIsNegative(dn)) return DEC_CLASS_NEG_ZERO; return DEC_CLASS_POS_ZERO; } if (decNumberIsNegative(dn)) return DEC_CLASS_NEG_SUBNORMAL; return DEC_CLASS_POS_SUBNORMAL; } /* decNumberClass */ /* ------------------------------------------------------------------ */ /* decNumberClassToString -- convert decClass to a string */ /* */ /* eclass is a valid decClass */ /* returns a constant string describing the class (max 13+1 chars) */ /* ------------------------------------------------------------------ */ const char *uprv_decNumberClassToString(enum decClass eclass) { if (eclass==DEC_CLASS_POS_NORMAL) return DEC_ClassString_PN; if (eclass==DEC_CLASS_NEG_NORMAL) return DEC_ClassString_NN; if (eclass==DEC_CLASS_POS_ZERO) return DEC_ClassString_PZ; if (eclass==DEC_CLASS_NEG_ZERO) return DEC_ClassString_NZ; if (eclass==DEC_CLASS_POS_SUBNORMAL) return DEC_ClassString_PS; if (eclass==DEC_CLASS_NEG_SUBNORMAL) return DEC_ClassString_NS; if (eclass==DEC_CLASS_POS_INF) return DEC_ClassString_PI; if (eclass==DEC_CLASS_NEG_INF) return DEC_ClassString_NI; if (eclass==DEC_CLASS_QNAN) return DEC_ClassString_QN; if (eclass==DEC_CLASS_SNAN) 
return DEC_ClassString_SN; return DEC_ClassString_UN; /* Unknown */ } /* decNumberClassToString */ /* ------------------------------------------------------------------ */ /* decNumberCopy -- copy a number */ /* */ /* dest is the target decNumber */ /* src is the source decNumber */ /* returns dest */ /* */ /* (dest==src is allowed and is a no-op) */ /* All fields are updated as required. This is a utility operation, */ /* so special values are unchanged and no error is possible. */ /* ------------------------------------------------------------------ */ U_CAPI decNumber * U_EXPORT2 uprv_decNumberCopy(decNumber *dest, const decNumber *src) { #if DECCHECK if (src==nullptr) return uprv_decNumberZero(dest); #endif if (dest==src) return dest; /* no copy required */ /* Use explicit assignments here as structure assignment could copy */ /* more than just the lsu (for small DECDPUN). This would not affect */ /* the value of the results, but could disturb test harness spill */ /* checking. */ dest->bits=src->bits; dest->exponent=src->exponent; dest->digits=src->digits; dest->lsu[0]=src->lsu[0]; if (src->digits>DECDPUN) { /* more Units to come */ const Unit *smsup, *s; /* work */ Unit *d; /* .. */ /* memcpy for the remaining Units would be safe as they cannot */ /* overlap. However, this explicit loop is faster in short cases. */ d=dest->lsu+1; /* -> first destination */ smsup=src->lsu+D2U(src->digits); /* -> source msu+1 */ for (s=src->lsu+1; sdigits digits. */ /* No exception or error can occur; this is a quiet bitwise operation.*/ /* See also decNumberAbs for a checking version of this. 
*/ /* ------------------------------------------------------------------ */ U_CAPI decNumber * U_EXPORT2 uprv_decNumberCopyAbs(decNumber *res, const decNumber *rhs) { #if DECCHECK if (decCheckOperands(res, DECUNUSED, rhs, DECUNCONT)) return res; #endif uprv_decNumberCopy(res, rhs); res->bits&=~DECNEG; /* turn off sign */ return res; } /* decNumberCopyAbs */ /* ------------------------------------------------------------------ */ /* decNumberCopyNegate -- quiet negate value operator */ /* */ /* This sets C = negate(A) */ /* */ /* res is C, the result. C may be A */ /* rhs is A */ /* */ /* C must have space for set->digits digits. */ /* No exception or error can occur; this is a quiet bitwise operation.*/ /* See also decNumberMinus for a checking version of this. */ /* ------------------------------------------------------------------ */ U_CAPI decNumber * U_EXPORT2 uprv_decNumberCopyNegate(decNumber *res, const decNumber *rhs) { #if DECCHECK if (decCheckOperands(res, DECUNUSED, rhs, DECUNCONT)) return res; #endif uprv_decNumberCopy(res, rhs); res->bits^=DECNEG; /* invert the sign */ return res; } /* decNumberCopyNegate */ /* ------------------------------------------------------------------ */ /* decNumberCopySign -- quiet copy and set sign operator */ /* */ /* This sets C = A with the sign of B */ /* */ /* res is C, the result. C may be A */ /* lhs is A */ /* rhs is B */ /* */ /* C must have space for set->digits digits. 
*/ /* No exception or error can occur; this is a quiet bitwise operation.*/ /* ------------------------------------------------------------------ */ U_CAPI decNumber * U_EXPORT2 uprv_decNumberCopySign(decNumber *res, const decNumber *lhs, const decNumber *rhs) { uByte sign; /* rhs sign */ #if DECCHECK if (decCheckOperands(res, DECUNUSED, rhs, DECUNCONT)) return res; #endif sign=rhs->bits & DECNEG; /* save sign bit */ uprv_decNumberCopy(res, lhs); res->bits&=~DECNEG; /* clear the sign */ res->bits|=sign; /* set from rhs */ return res; } /* decNumberCopySign */ /* ------------------------------------------------------------------ */ /* decNumberGetBCD -- get the coefficient in BCD8 */ /* dn is the source decNumber */ /* bcd is the uInt array that will receive dn->digits BCD bytes, */ /* most-significant at offset 0 */ /* returns bcd */ /* */ /* bcd must have at least dn->digits bytes. No error is possible; if */ /* dn is a NaN or Infinite, digits must be 1 and the coefficient 0. */ /* ------------------------------------------------------------------ */ U_CAPI uByte * U_EXPORT2 uprv_decNumberGetBCD(const decNumber *dn, uByte *bcd) { uByte *ub=bcd+dn->digits-1; /* -> lsd */ const Unit *up=dn->lsu; /* Unit pointer, -> lsu */ #if DECDPUN==1 /* trivial simple copy */ for (; ub>=bcd; ub--, up++) *ub=*up; #else /* chopping needed */ uInt u=*up; /* work */ uInt cut=DECDPUN; /* downcounter through unit */ for (; ub>=bcd; ub--) { *ub=(uByte)(u%10); /* [*6554 trick inhibits, here] */ u=u/10; cut--; if (cut>0) continue; /* more in this unit */ up++; u=*up; cut=DECDPUN; } #endif return bcd; } /* decNumberGetBCD */ /* ------------------------------------------------------------------ */ /* decNumberSetBCD -- set (replace) the coefficient from BCD8 */ /* dn is the target decNumber */ /* bcd is the uInt array that will source n BCD bytes, most- */ /* significant at offset 0 */ /* n is the number of digits in the source BCD array (bcd) */ /* returns dn */ /* */ /* dn must have space 
for at least n digits. No error is possible; */ /* if dn is a NaN, or Infinite, or is to become a zero, n must be 1 */ /* and bcd[0] zero. */ /* ------------------------------------------------------------------ */ U_CAPI decNumber * U_EXPORT2 uprv_decNumberSetBCD(decNumber *dn, const uByte *bcd, uInt n) { Unit *up=dn->lsu+D2U(dn->digits)-1; /* -> msu [target pointer] */ const uByte *ub=bcd; /* -> source msd */ #if DECDPUN==1 /* trivial simple copy */ for (; ub=dn->lsu; up--) { /* each Unit from msu */ *up=0; /* will take <=DECDPUN digits */ for (; cut>0; ub++, cut--) *up=X10(*up)+*ub; cut=DECDPUN; /* next Unit has all digits */ } #endif dn->digits=n; /* set digit count */ return dn; } /* decNumberSetBCD */ /* ------------------------------------------------------------------ */ /* decNumberIsNormal -- test normality of a decNumber */ /* dn is the decNumber to test */ /* set is the context to use for Emin */ /* returns 1 if |dn| is finite and >=Nmin, 0 otherwise */ /* ------------------------------------------------------------------ */ Int uprv_decNumberIsNormal(const decNumber *dn, decContext *set) { Int ae; /* adjusted exponent */ #if DECCHECK if (decCheckOperands(DECUNRESU, DECUNUSED, dn, set)) return 0; #endif if (decNumberIsSpecial(dn)) return 0; /* not finite */ if (decNumberIsZero(dn)) return 0; /* not non-zero */ ae=dn->exponent+dn->digits-1; /* adjusted exponent */ if (aeemin) return 0; /* is subnormal */ return 1; } /* decNumberIsNormal */ /* ------------------------------------------------------------------ */ /* decNumberIsSubnormal -- test subnormality of a decNumber */ /* dn is the decNumber to test */ /* set is the context to use for Emin */ /* returns 1 if |dn| is finite, non-zero, and exponent+dn->digits-1; /* adjusted exponent */ if (aeemin) return 1; /* is subnormal */ return 0; } /* decNumberIsSubnormal */ /* ------------------------------------------------------------------ */ /* decNumberTrim -- remove insignificant zeros */ /* */ /* dn is 
the number to trim                                           */
/*   returns dn                                                       */
/*                                                                    */
/* All fields are updated as required.  This is a utility operation,  */
/* so special values are unchanged and no error is possible.  The     */
/* zeros are removed unconditionally.                                 */
/* ------------------------------------------------------------------ */
U_CAPI decNumber * U_EXPORT2 uprv_decNumberTrim(decNumber *dn) {
  decContext workset;                   /* throwaway context for decTrim  */
  Int removed;                          /* count reported by decTrim; unused  */

  #if DECCHECK
  if (decCheckOperands(DECUNRESU, DECUNUSED, dn, DECUNCONT)) return dn;
  #endif

  /* a base-initialized context has clamp=0  */
  uprv_decContextDefault(&workset, DEC_INIT_BASE);
  return decTrim(dn, &workset, 0, 1, &removed);
  } /* decNumberTrim  */

/* ------------------------------------------------------------------ */
/* decNumberVersion -- return the name and version of this module     */
/*                                                                    */
/* No error is possible.                                              */
/* ------------------------------------------------------------------ */
const char * uprv_decNumberVersion() {
  return DECVERSION;                    /* compile-time constant string  */
  } /* decNumberVersion  */

/* ------------------------------------------------------------------ */
/* decNumberZero -- set a number to 0                                 */
/*                                                                    */
/*   dn is the number to set, with space for one digit                */
/*   returns dn                                                       */
/*                                                                    */
/* No error is possible.                                              */
/* ------------------------------------------------------------------ */
/* Memset is not used as it is much slower in some environments.
*/ U_CAPI decNumber * U_EXPORT2 uprv_decNumberZero(decNumber *dn) { #if DECCHECK if (decCheckOperands(dn, DECUNUSED, DECUNUSED, DECUNCONT)) return dn; #endif dn->bits=0; dn->exponent=0; dn->digits=1; dn->lsu[0]=0; return dn; } /* decNumberZero */ /* ================================================================== */ /* Local routines */ /* ================================================================== */ /* ------------------------------------------------------------------ */ /* decToString -- lay out a number into a string */ /* */ /* dn is the number to lay out */ /* string is where to lay out the number */ /* eng is 1 if Engineering, 0 if Scientific */ /* */ /* string must be at least dn->digits+14 characters long */ /* No error is possible. */ /* */ /* Note that this routine can generate a -0 or 0.000. These are */ /* never generated in subset to-number or arithmetic, but can occur */ /* in non-subset arithmetic (e.g., -1*0 or 1.234-1.234). */ /* ------------------------------------------------------------------ */ /* If DECCHECK is enabled the string "?" is returned if a number is */ /* invalid. */ static void decToString(const decNumber *dn, char *string, Flag eng) { Int exp=dn->exponent; /* local copy */ Int e; /* E-part value */ Int pre; /* digits before the '.' 
*/ Int cut; /* for counting digits in a Unit */ char *c=string; /* work [output pointer] */ const Unit *up=dn->lsu+D2U(dn->digits)-1; /* -> msu [input pointer] */ uInt u, pow; /* work */ #if DECCHECK if (decCheckOperands(DECUNRESU, dn, DECUNUSED, DECUNCONT)) { strcpy(string, "?"); return;} #endif if (decNumberIsNegative(dn)) { /* Negatives get a minus */ *c='-'; c++; } if (dn->bits&DECSPECIAL) { /* Is a special value */ if (decNumberIsInfinite(dn)) { strcpy(c, "Inf"); strcpy(c+3, "inity"); return;} /* a NaN */ if (dn->bits&DECSNAN) { /* signalling NaN */ *c='s'; c++; } strcpy(c, "NaN"); c+=3; /* step past */ /* if not a clean non-zero coefficient, that's all there is in a */ /* NaN string */ if (exp!=0 || (*dn->lsu==0 && dn->digits==1)) return; /* [drop through to add integer] */ } /* calculate how many digits in msu, and hence first cut */ cut=MSUDIGITS(dn->digits); /* [faster than remainder] */ cut--; /* power of ten for digit */ if (exp==0) { /* simple integer [common fastpath] */ for (;up>=dn->lsu; up--) { /* each Unit from msu */ u=*up; /* contains DECDPUN digits to lay out */ for (; cut>=0; c++, cut--) TODIGIT(u, cut, c, pow); cut=DECDPUN-1; /* next Unit has all digits */ } *c='\0'; /* terminate the string */ return;} /* non-0 exponent -- assume plain form */ pre=dn->digits+exp; /* digits before '.' */ e=0; /* no E */ if ((exp>0) || (pre<-5)) { /* need exponential form */ e=exp+dn->digits-1; /* calculate E value */ pre=1; /* assume one digit before '.' */ if (eng && (e!=0)) { /* engineering: may need to adjust */ Int adj; /* adjustment */ /* The C remainder operator is undefined for negative numbers, so */ /* a positive remainder calculation must be used here */ if (e<0) { adj=(-e)%3; if (adj!=0) adj=3-adj; } else { /* e>0 */ adj=e%3; } e=e-adj; /* if dealing with zero still produce an exponent which is a */ /* multiple of three, as expected, but there will only be the */ /* one zero before the E, still. Otherwise note the padding. 
*/ if (!ISZERO(dn)) pre+=adj; else { /* is zero */ if (adj!=0) { /* 0.00Esnn needed */ e=e+3; pre=-(2-adj); } } /* zero */ } /* eng */ } /* need exponent */ /* lay out the digits of the coefficient, adding 0s and . as needed */ u=*up; if (pre>0) { /* xxx.xxx or xx00 (engineering) form */ Int n=pre; for (; pre>0; pre--, c++, cut--) { if (cut<0) { /* need new Unit */ if (up==dn->lsu) break; /* out of input digits (pre>digits) */ up--; cut=DECDPUN-1; u=*up; } TODIGIT(u, cut, c, pow); } if (ndigits) { /* more to come, after '.' */ *c='.'; c++; for (;; c++, cut--) { if (cut<0) { /* need new Unit */ if (up==dn->lsu) break; /* out of input digits */ up--; cut=DECDPUN-1; u=*up; } TODIGIT(u, cut, c, pow); } } else for (; pre>0; pre--, c++) *c='0'; /* 0 padding (for engineering) needed */ } else { /* 0.xxx or 0.000xxx form */ *c='0'; c++; *c='.'; c++; for (; pre<0; pre++, c++) *c='0'; /* add any 0's after '.' */ for (; ; c++, cut--) { if (cut<0) { /* need new Unit */ if (up==dn->lsu) break; /* out of input digits */ up--; cut=DECDPUN-1; u=*up; } TODIGIT(u, cut, c, pow); } } /* Finally add the E-part, if needed. It will never be 0, has a base maximum and minimum of +999999999 through -999999999, but could range down to -1999999998 for abnormal numbers */ if (e!=0) { Flag had=0; /* 1=had non-zero */ *c='E'; c++; *c='+'; c++; /* assume positive */ u=e; /* .. */ if (e<0) { *(c-1)='-'; /* oops, need - */ u=-e; /* uInt, please */ } /* lay out the exponent [_itoa or equivalent is not ANSI C] */ for (cut=9; cut>=0; cut--) { TODIGIT(u, cut, c, pow); if (*c=='0' && !had) continue; /* skip leading zeros */ had=1; /* had non-0 */ c++; /* step for next */ } /* cut */ } *c='\0'; /* terminate the string (all paths) */ return; } /* decToString */ /* ------------------------------------------------------------------ */ /* decAddOp -- add/subtract operation */ /* */ /* This computes C = A + B */ /* */ /* res is C, the result. 
C may be A and/or B (e.g., X=X+X) */ /* lhs is A */ /* rhs is B */ /* set is the context */ /* negate is DECNEG if rhs should be negated, or 0 otherwise */ /* status accumulates status for the caller */ /* */ /* C must have space for set->digits digits. */ /* Inexact in status must be 0 for correct Exact zero sign in result */ /* ------------------------------------------------------------------ */ /* If possible, the coefficient is calculated directly into C. */ /* However, if: */ /* -- a digits+1 calculation is needed because the numbers are */ /* unaligned and span more than set->digits digits */ /* -- a carry to digits+1 digits looks possible */ /* -- C is the same as A or B, and the result would destructively */ /* overlap the A or B coefficient */ /* then the result must be calculated into a temporary buffer. In */ /* this case a local (stack) buffer is used if possible, and only if */ /* too long for that does malloc become the final resort. */ /* */ /* Misalignment is handled as follows: */ /* Apad: (AExp>BExp) Swap operands and proceed as for BExp>AExp. */ /* BPad: Apply the padding by a combination of shifting (whole */ /* units) and multiplication (part units). */ /* */ /* Addition, especially x=x+1, is speed-critical. */ /* The static buffer is larger than might be expected to allow for */ /* calls from higher-level functions (notable exp). 
*/ /* ------------------------------------------------------------------ */ static decNumber * decAddOp(decNumber *res, const decNumber *lhs, const decNumber *rhs, decContext *set, uByte negate, uInt *status) { #if DECSUBSET decNumber *alloclhs=nullptr; /* non-nullptr if rounded lhs allocated */ decNumber *allocrhs=nullptr; /* .., rhs */ #endif Int rhsshift; /* working shift (in Units) */ Int maxdigits; /* longest logical length */ Int mult; /* multiplier */ Int residue; /* rounding accumulator */ uByte bits; /* result bits */ Flag diffsign; /* non-0 if arguments have different sign */ Unit *acc; /* accumulator for result */ Unit accbuff[SD2U(DECBUFFER*2+20)]; /* local buffer [*2+20 reduces many */ /* allocations when called from */ /* other operations, notable exp] */ Unit *allocacc=nullptr; /* -> allocated acc buffer, iff allocated */ Int reqdigits=set->digits; /* local copy; requested DIGITS */ Int padding; /* work */ #if DECCHECK if (decCheckOperands(res, lhs, rhs, set)) return res; #endif do { /* protect allocated storage */ #if DECSUBSET if (!set->extended) { /* reduce operands and set lostDigits status, as needed */ if (lhs->digits>reqdigits) { alloclhs=decRoundOperand(lhs, set, status); if (alloclhs==nullptr) break; lhs=alloclhs; } if (rhs->digits>reqdigits) { allocrhs=decRoundOperand(rhs, set, status); if (allocrhs==nullptr) break; rhs=allocrhs; } } #endif /* [following code does not require input rounding] */ /* note whether signs differ [used all paths] */ diffsign=(Flag)((lhs->bits^rhs->bits^negate)&DECNEG); /* handle infinities and NaNs */ if (SPECIALARGS) { /* a special bit set */ if (SPECIALARGS & (DECSNAN | DECNAN)) /* a NaN */ decNaNs(res, lhs, rhs, set, status); else { /* one or two infinities */ if (decNumberIsInfinite(lhs)) { /* LHS is infinity */ /* two infinities with different signs is invalid */ if (decNumberIsInfinite(rhs) && diffsign) { *status|=DEC_Invalid_operation; break; } bits=lhs->bits & DECNEG; /* get sign from LHS */ } else 
bits=(rhs->bits^negate) & DECNEG;/* RHS must be Infinity */ bits|=DECINF; uprv_decNumberZero(res); res->bits=bits; /* set +/- infinity */ } /* an infinity */ break; } /* Quick exit for add 0s; return the non-0, modified as need be */ if (ISZERO(lhs)) { Int adjust; /* work */ Int lexp=lhs->exponent; /* save in case LHS==RES */ bits=lhs->bits; /* .. */ residue=0; /* clear accumulator */ decCopyFit(res, rhs, set, &residue, status); /* copy (as needed) */ res->bits^=negate; /* flip if rhs was negated */ #if DECSUBSET if (set->extended) { /* exponents on zeros count */ #endif /* exponent will be the lower of the two */ adjust=lexp-res->exponent; /* adjustment needed [if -ve] */ if (ISZERO(res)) { /* both 0: special IEEE 754 rules */ if (adjust<0) res->exponent=lexp; /* set exponent */ /* 0-0 gives +0 unless rounding to -infinity, and -0-0 gives -0 */ if (diffsign) { if (set->round!=DEC_ROUND_FLOOR) res->bits=0; else res->bits=DECNEG; /* preserve 0 sign */ } } else { /* non-0 res */ if (adjust<0) { /* 0-padding needed */ if ((res->digits-adjust)>set->digits) { adjust=res->digits-set->digits; /* to fit exactly */ *status|=DEC_Rounded; /* [but exact] */ } res->digits=decShiftToMost(res->lsu, res->digits, -adjust); res->exponent+=adjust; /* set the exponent. 
*/ } } /* non-0 res */ #if DECSUBSET } /* extended */ #endif decFinish(res, set, &residue, status); /* clean and finalize */ break;} if (ISZERO(rhs)) { /* [lhs is non-zero] */ Int adjust; /* work */ Int rexp=rhs->exponent; /* save in case RHS==RES */ bits=rhs->bits; /* be clean */ residue=0; /* clear accumulator */ decCopyFit(res, lhs, set, &residue, status); /* copy (as needed) */ #if DECSUBSET if (set->extended) { /* exponents on zeros count */ #endif /* exponent will be the lower of the two */ /* [0-0 case handled above] */ adjust=rexp-res->exponent; /* adjustment needed [if -ve] */ if (adjust<0) { /* 0-padding needed */ if ((res->digits-adjust)>set->digits) { adjust=res->digits-set->digits; /* to fit exactly */ *status|=DEC_Rounded; /* [but exact] */ } res->digits=decShiftToMost(res->lsu, res->digits, -adjust); res->exponent+=adjust; /* set the exponent. */ } #if DECSUBSET } /* extended */ #endif decFinish(res, set, &residue, status); /* clean and finalize */ break;} /* [NB: both fastpath and mainpath code below assume these cases */ /* (notably 0-0) have already been handled] */ /* calculate the padding needed to align the operands */ padding=rhs->exponent-lhs->exponent; /* Fastpath cases where the numbers are aligned and normal, the RHS */ /* is all in one unit, no operand rounding is needed, and no carry, */ /* lengthening, or borrow is needed */ if (padding==0 && rhs->digits<=DECDPUN && rhs->exponent>=set->emin /* [some normals drop through] */ && rhs->exponent<=set->emax-set->digits+1 /* [could clamp] */ && rhs->digits<=reqdigits && lhs->digits<=reqdigits) { Int partial=*lhs->lsu; if (!diffsign) { /* adding */ partial+=*rhs->lsu; if ((partial<=DECDPUNMAX) /* result fits in unit */ && (lhs->digits>=DECDPUN || /* .. and no digits-count change */ partial<(Int)powers[lhs->digits])) { /* .. 
*/ if (res!=lhs) uprv_decNumberCopy(res, lhs); /* not in place */ *res->lsu=(Unit)partial; /* [copy could have overwritten RHS] */ break; } /* else drop out for careful add */ } else { /* signs differ */ partial-=*rhs->lsu; if (partial>0) { /* no borrow needed, and non-0 result */ if (res!=lhs) uprv_decNumberCopy(res, lhs); /* not in place */ *res->lsu=(Unit)partial; /* this could have reduced digits [but result>0] */ res->digits=decGetDigits(res->lsu, D2U(res->digits)); break; } /* else drop out for careful subtract */ } } /* Now align (pad) the lhs or rhs so they can be added or */ /* subtracted, as necessary. If one number is much larger than */ /* the other (that is, if in plain form there is a least one */ /* digit between the lowest digit of one and the highest of the */ /* other) padding with up to DIGITS-1 trailing zeros may be */ /* needed; then apply rounding (as exotic rounding modes may be */ /* affected by the residue). */ rhsshift=0; /* rhs shift to left (padding) in Units */ bits=lhs->bits; /* assume sign is that of LHS */ mult=1; /* likely multiplier */ /* [if padding==0 the operands are aligned; no padding is needed] */ if (padding!=0) { /* some padding needed; always pad the RHS, as any required */ /* padding can then be effected by a simple combination of */ /* shifts and a multiply */ Flag swapped=0; if (padding<0) { /* LHS needs the padding */ const decNumber *t; padding=-padding; /* will be +ve */ bits=(uByte)(rhs->bits^negate); /* assumed sign is now that of RHS */ t=lhs; lhs=rhs; rhs=t; swapped=1; } /* If, after pad, rhs would be longer than lhs by digits+1 or */ /* more then lhs cannot affect the answer, except as a residue, */ /* so only need to pad up to a length of DIGITS+1. */ if (rhs->digits+padding > lhs->digits+reqdigits+1) { /* The RHS is sufficient */ /* for residue use the relative sign indication... 
*/ Int shift=reqdigits-rhs->digits; /* left shift needed */ residue=1; /* residue for rounding */ if (diffsign) residue=-residue; /* signs differ */ /* copy, shortening if necessary */ decCopyFit(res, rhs, set, &residue, status); /* if it was already shorter, then need to pad with zeros */ if (shift>0) { res->digits=decShiftToMost(res->lsu, res->digits, shift); res->exponent-=shift; /* adjust the exponent. */ } /* flip the result sign if unswapped and rhs was negated */ if (!swapped) res->bits^=negate; decFinish(res, set, &residue, status); /* done */ break;} /* LHS digits may affect result */ rhsshift=D2U(padding+1)-1; /* this much by Unit shift .. */ mult=powers[padding-(rhsshift*DECDPUN)]; /* .. this by multiplication */ } /* padding needed */ if (diffsign) mult=-mult; /* signs differ */ /* determine the longer operand */ maxdigits=rhs->digits+padding; /* virtual length of RHS */ if (lhs->digits>maxdigits) maxdigits=lhs->digits; /* Decide on the result buffer to use; if possible place directly */ /* into result. */ acc=res->lsu; /* assume add direct to result */ /* If destructive overlap, or the number is too long, or a carry or */ /* borrow to DIGITS+1 might be possible, a buffer must be used. */ /* [Might be worth more sophisticated tests when maxdigits==reqdigits] */ if ((maxdigits>=reqdigits) /* is, or could be, too large */ || (res==rhs && rhsshift>0)) { /* destructive overlap */ /* buffer needed, choose it; units for maxdigits digits will be */ /* needed, +1 Unit for carry or borrow */ Int need=D2U(maxdigits)+1; acc=accbuff; /* assume use local buffer */ if (need*sizeof(Unit)>sizeof(accbuff)) { /* printf("malloc add %ld %ld\n", need, sizeof(accbuff)); */ allocacc=(Unit *)malloc(need*sizeof(Unit)); if (allocacc==nullptr) { /* hopeless -- abandon */ *status|=DEC_Insufficient_storage; break;} acc=allocacc; } } res->bits=(uByte)(bits&DECNEG); /* it's now safe to overwrite.. */ res->exponent=lhs->exponent; /* .. 
operands (even if aliased) */ #if DECTRACE decDumpAr('A', lhs->lsu, D2U(lhs->digits)); decDumpAr('B', rhs->lsu, D2U(rhs->digits)); printf(" :h: %ld %ld\n", rhsshift, mult); #endif /* add [A+B*m] or subtract [A+B*(-m)] */ U_ASSERT(rhs->digits > 0); U_ASSERT(lhs->digits > 0); res->digits=decUnitAddSub(lhs->lsu, D2U(lhs->digits), rhs->lsu, D2U(rhs->digits), rhsshift, acc, mult) *DECDPUN; /* [units -> digits] */ if (res->digits<0) { /* borrowed... */ res->digits=-res->digits; res->bits^=DECNEG; /* flip the sign */ } #if DECTRACE decDumpAr('+', acc, D2U(res->digits)); #endif /* If a buffer was used the result must be copied back, possibly */ /* shortening. (If no buffer was used then the result must have */ /* fit, so can't need rounding and residue must be 0.) */ residue=0; /* clear accumulator */ if (acc!=res->lsu) { #if DECSUBSET if (set->extended) { /* round from first significant digit */ #endif /* remove leading zeros that were added due to rounding up to */ /* integral Units -- before the test for rounding. */ if (res->digits>reqdigits) res->digits=decGetDigits(acc, D2U(res->digits)); decSetCoeff(res, set, acc, res->digits, &residue, status); #if DECSUBSET } else { /* subset arithmetic rounds from original significant digit */ /* May have an underestimate. This only occurs when both */ /* numbers fit in DECDPUN digits and are padding with a */ /* negative multiple (-10, -100...) and the top digit(s) become */ /* 0. (This only matters when using X3.274 rules where the */ /* leading zero could be included in the rounding.) */ if (res->digitsdigits))=0; /* ensure leading 0 is there */ res->digits=maxdigits; } else { /* remove leading zeros that added due to rounding up to */ /* integral Units (but only those in excess of the original */ /* maxdigits length, unless extended) before test for rounding. 
*/ if (res->digits>reqdigits) { res->digits=decGetDigits(acc, D2U(res->digits)); if (res->digitsdigits=maxdigits; } } decSetCoeff(res, set, acc, res->digits, &residue, status); /* Now apply rounding if needed before removing leading zeros. */ /* This is safe because subnormals are not a possibility */ if (residue!=0) { decApplyRound(res, set, residue, status); residue=0; /* did what needed to be done */ } } /* subset */ #endif } /* used buffer */ /* strip leading zeros [these were left on in case of subset subtract] */ res->digits=decGetDigits(res->lsu, D2U(res->digits)); /* apply checks and rounding */ decFinish(res, set, &residue, status); /* "When the sum of two operands with opposite signs is exactly */ /* zero, the sign of that sum shall be '+' in all rounding modes */ /* except round toward -Infinity, in which mode that sign shall be */ /* '-'." [Subset zeros also never have '-', set by decFinish.] */ if (ISZERO(res) && diffsign #if DECSUBSET && set->extended #endif && (*status&DEC_Inexact)==0) { if (set->round==DEC_ROUND_FLOOR) res->bits|=DECNEG; /* sign - */ else res->bits&=~DECNEG; /* sign + */ } } while(0); /* end protected */ if (allocacc!=nullptr) free(allocacc); /* drop any storage used */ #if DECSUBSET if (allocrhs!=nullptr) free(allocrhs); /* .. */ if (alloclhs!=nullptr) free(alloclhs); /* .. */ #endif return res; } /* decAddOp */ /* ------------------------------------------------------------------ */ /* decDivideOp -- division operation */ /* */ /* This routine performs the calculations for all four division */ /* operators (divide, divideInteger, remainder, remainderNear). */ /* */ /* C=A op B */ /* */ /* res is C, the result. C may be A and/or B (e.g., X=X/X) */ /* lhs is A */ /* rhs is B */ /* set is the context */ /* op is DIVIDE, DIVIDEINT, REMAINDER, or REMNEAR respectively. */ /* status is the usual accumulator */ /* */ /* C must have space for set->digits digits. 
*/ /* */ /* ------------------------------------------------------------------ */ /* The underlying algorithm of this routine is the same as in the */ /* 1981 S/370 implementation, that is, non-restoring long division */ /* with bi-unit (rather than bi-digit) estimation for each unit */ /* multiplier. In this pseudocode overview, complications for the */ /* Remainder operators and division residues for exact rounding are */ /* omitted for clarity. */ /* */ /* Prepare operands and handle special values */ /* Test for x/0 and then 0/x */ /* Exp =Exp1 - Exp2 */ /* Exp =Exp +len(var1) -len(var2) */ /* Sign=Sign1 * Sign2 */ /* Pad accumulator (Var1) to double-length with 0's (pad1) */ /* Pad Var2 to same length as Var1 */ /* msu2pair/plus=1st 2 or 1 units of var2, +1 to allow for round */ /* have=0 */ /* Do until (have=digits+1 OR residue=0) */ /* if exp<0 then if integer divide/residue then leave */ /* this_unit=0 */ /* Do forever */ /* compare numbers */ /* if <0 then leave inner_loop */ /* if =0 then (* quick exit without subtract *) do */ /* this_unit=this_unit+1; output this_unit */ /* leave outer_loop; end */ /* Compare lengths of numbers (mantissae): */ /* If same then tops2=msu2pair -- {units 1&2 of var2} */ /* else tops2=msu2plus -- {0, unit 1 of var2} */ /* tops1=first_unit_of_Var1*10**DECDPUN +second_unit_of_var1 */ /* mult=tops1/tops2 -- Good and safe guess at divisor */ /* if mult=0 then mult=1 */ /* this_unit=this_unit+mult */ /* subtract */ /* end inner_loop */ /* if have\=0 | this_unit\=0 then do */ /* output this_unit */ /* have=have+1; end */ /* var2=var2/10 */ /* exp=exp-1 */ /* end outer_loop */ /* exp=exp+1 -- set the proper exponent */ /* if have=0 then generate answer=0 */ /* Return (Result is defined by Var1) */ /* */ /* ------------------------------------------------------------------ */ /* Two working buffers are needed during the division; one (digits+ */ /* 1) to accumulate the result, and the other (up to 2*digits+1) for */ /* long 
subtractions. These are acc and var1 respectively. */ /* var1 is a copy of the lhs coefficient, var2 is the rhs coefficient.*/ /* The static buffers may be larger than might be expected to allow */ /* for calls from higher-level functions (notable exp). */ /* ------------------------------------------------------------------ */ static decNumber * decDivideOp(decNumber *res, const decNumber *lhs, const decNumber *rhs, decContext *set, Flag op, uInt *status) { #if DECSUBSET decNumber *alloclhs=nullptr; /* non-nullptr if rounded lhs allocated */ decNumber *allocrhs=nullptr; /* .., rhs */ #endif Unit accbuff[SD2U(DECBUFFER+DECDPUN+10)]; /* local buffer */ Unit *acc=accbuff; /* -> accumulator array for result */ Unit *allocacc=nullptr; /* -> allocated buffer, iff allocated */ Unit *accnext; /* -> where next digit will go */ Int acclength; /* length of acc needed [Units] */ Int accunits; /* count of units accumulated */ Int accdigits; /* count of digits accumulated */ Unit varbuff[SD2U(DECBUFFER*2+DECDPUN)]; /* buffer for var1 */ Unit *var1=varbuff; /* -> var1 array for long subtraction */ Unit *varalloc=nullptr; /* -> allocated buffer, iff used */ Unit *msu1; /* -> msu of var1 */ const Unit *var2; /* -> var2 array */ const Unit *msu2; /* -> msu of var2 */ Int msu2plus; /* msu2 plus one [does not vary] */ eInt msu2pair; /* msu2 pair plus one [does not vary] */ Int var1units, var2units; /* actual lengths */ Int var2ulen; /* logical length (units) */ Int var1initpad=0; /* var1 initial padding (digits) */ Int maxdigits; /* longest LHS or required acc length */ Int mult; /* multiplier for subtraction */ Unit thisunit; /* current unit being accumulated */ Int residue; /* for rounding */ Int reqdigits=set->digits; /* requested DIGITS */ Int exponent; /* working exponent */ Int maxexponent=0; /* DIVIDE maximum exponent if unrounded */ uByte bits; /* working sign */ Unit *target; /* work */ const Unit *source; /* .. */ uInt const *pow; /* .. */ Int shift, cut; /* .. 
*/ #if DECSUBSET Int dropped; /* work */ #endif #if DECCHECK if (decCheckOperands(res, lhs, rhs, set)) return res; #endif do { /* protect allocated storage */ #if DECSUBSET if (!set->extended) { /* reduce operands and set lostDigits status, as needed */ if (lhs->digits>reqdigits) { alloclhs=decRoundOperand(lhs, set, status); if (alloclhs==nullptr) break; lhs=alloclhs; } if (rhs->digits>reqdigits) { allocrhs=decRoundOperand(rhs, set, status); if (allocrhs==nullptr) break; rhs=allocrhs; } } #endif /* [following code does not require input rounding] */ bits=(lhs->bits^rhs->bits)&DECNEG; /* assumed sign for divisions */ /* handle infinities and NaNs */ if (SPECIALARGS) { /* a special bit set */ if (SPECIALARGS & (DECSNAN | DECNAN)) { /* one or two NaNs */ decNaNs(res, lhs, rhs, set, status); break; } /* one or two infinities */ if (decNumberIsInfinite(lhs)) { /* LHS (dividend) is infinite */ if (decNumberIsInfinite(rhs) || /* two infinities are invalid .. */ op & (REMAINDER | REMNEAR)) { /* as is remainder of infinity */ *status|=DEC_Invalid_operation; break; } /* [Note that infinity/0 raises no exceptions] */ uprv_decNumberZero(res); res->bits=bits|DECINF; /* set +/- infinity */ break; } else { /* RHS (divisor) is infinite */ residue=0; if (op&(REMAINDER|REMNEAR)) { /* result is [finished clone of] lhs */ decCopyFit(res, lhs, set, &residue, status); } else { /* a division */ uprv_decNumberZero(res); res->bits=bits; /* set +/- zero */ /* for DIVIDEINT the exponent is always 0. 
For DIVIDE, result */ /* is a 0 with infinitely negative exponent, clamped to minimum */ if (op&DIVIDE) { res->exponent=set->emin-set->digits+1; *status|=DEC_Clamped; } } decFinish(res, set, &residue, status); break; } } /* handle 0 rhs (x/0) */ if (ISZERO(rhs)) { /* x/0 is always exceptional */ if (ISZERO(lhs)) { uprv_decNumberZero(res); /* [after lhs test] */ *status|=DEC_Division_undefined;/* 0/0 will become NaN */ } else { uprv_decNumberZero(res); if (op&(REMAINDER|REMNEAR)) *status|=DEC_Invalid_operation; else { *status|=DEC_Division_by_zero; /* x/0 */ res->bits=bits|DECINF; /* .. is +/- Infinity */ } } break;} /* handle 0 lhs (0/x) */ if (ISZERO(lhs)) { /* 0/x [x!=0] */ #if DECSUBSET if (!set->extended) uprv_decNumberZero(res); else { #endif if (op&DIVIDE) { residue=0; exponent=lhs->exponent-rhs->exponent; /* ideal exponent */ uprv_decNumberCopy(res, lhs); /* [zeros always fit] */ res->bits=bits; /* sign as computed */ res->exponent=exponent; /* exponent, too */ decFinalize(res, set, &residue, status); /* check exponent */ } else if (op&DIVIDEINT) { uprv_decNumberZero(res); /* integer 0 */ res->bits=bits; /* sign as computed */ } else { /* a remainder */ exponent=rhs->exponent; /* [save in case overwrite] */ uprv_decNumberCopy(res, lhs); /* [zeros always fit] */ if (exponentexponent) res->exponent=exponent; /* use lower */ } #if DECSUBSET } #endif break;} /* Precalculate exponent. This starts off adjusted (and hence fits */ /* in 31 bits) and becomes the usual unadjusted exponent as the */ /* division proceeds. The order of evaluation is important, here, */ /* to avoid wrap. 
*/ exponent=(lhs->exponent+lhs->digits)-(rhs->exponent+rhs->digits); /* If the working exponent is -ve, then some quick exits are */ /* possible because the quotient is known to be <1 */ /* [for REMNEAR, it needs to be < -1, as -0.5 could need work] */ if (exponent<0 && !(op==DIVIDE)) { if (op&DIVIDEINT) { uprv_decNumberZero(res); /* integer part is 0 */ #if DECSUBSET if (set->extended) #endif res->bits=bits; /* set +/- zero */ break;} /* fastpath remainders so long as the lhs has the smaller */ /* (or equal) exponent */ if (lhs->exponent<=rhs->exponent) { if (op&REMAINDER || exponent<-1) { /* It is REMAINDER or safe REMNEAR; result is [finished */ /* clone of] lhs (r = x - 0*y) */ residue=0; decCopyFit(res, lhs, set, &residue, status); decFinish(res, set, &residue, status); break; } /* [unsafe REMNEAR drops through] */ } } /* fastpaths */ /* Long (slow) division is needed; roll up the sleeves... */ /* The accumulator will hold the quotient of the division. */ /* If it needs to be too long for stack storage, then allocate. */ acclength=D2U(reqdigits+DECDPUN); /* in Units */ if (acclength*sizeof(Unit)>sizeof(accbuff)) { /* printf("malloc dvacc %ld units\n", acclength); */ allocacc=(Unit *)malloc(acclength*sizeof(Unit)); if (allocacc==nullptr) { /* hopeless -- abandon */ *status|=DEC_Insufficient_storage; break;} acc=allocacc; /* use the allocated space */ } /* var1 is the padded LHS ready for subtractions. */ /* If it needs to be too long for stack storage, then allocate. 
*/ /* The maximum units needed for var1 (long subtraction) is: */ /* Enough for */ /* (rhs->digits+reqdigits-1) -- to allow full slide to right */ /* or (lhs->digits) -- to allow for long lhs */ /* whichever is larger */ /* +1 -- for rounding of slide to right */ /* +1 -- for leading 0s */ /* +1 -- for pre-adjust if a remainder or DIVIDEINT */ /* [Note: unused units do not participate in decUnitAddSub data] */ maxdigits=rhs->digits+reqdigits-1; if (lhs->digits>maxdigits) maxdigits=lhs->digits; var1units=D2U(maxdigits)+2; /* allocate a guard unit above msu1 for REMAINDERNEAR */ if (!(op&DIVIDE)) var1units++; if ((var1units+1)*sizeof(Unit)>sizeof(varbuff)) { /* printf("malloc dvvar %ld units\n", var1units+1); */ varalloc=(Unit *)malloc((var1units+1)*sizeof(Unit)); if (varalloc==nullptr) { /* hopeless -- abandon */ *status|=DEC_Insufficient_storage; break;} var1=varalloc; /* use the allocated space */ } /* Extend the lhs and rhs to full long subtraction length. The lhs */ /* is truly extended into the var1 buffer, with 0 padding, so a */ /* subtract in place is always possible. The rhs (var2) has */ /* virtual padding (implemented by decUnitAddSub). */ /* One guard unit was allocated above msu1 for rem=rem+rem in */ /* REMAINDERNEAR. */ msu1=var1+var1units-1; /* msu of var1 */ source=lhs->lsu+D2U(lhs->digits)-1; /* msu of input array */ for (target=msu1; source>=lhs->lsu; source--, target--) *target=*source; for (; target>=var1; target--) *target=0; /* rhs (var2) is left-aligned with var1 at the start */ var2ulen=var1units; /* rhs logical length (units) */ var2units=D2U(rhs->digits); /* rhs actual length (units) */ var2=rhs->lsu; /* -> rhs array */ msu2=var2+var2units-1; /* -> msu of var2 [never changes] */ /* now set up the variables which will be used for estimating the */ /* multiplication factor. If these variables are not exact, add */ /* 1 to make sure that the multiplier is never overestimated. */ msu2plus=*msu2; /* it's value .. 
*/ if (var2units>1) msu2plus++; /* .. +1 if any more */ msu2pair=(eInt)*msu2*(DECDPUNMAX+1);/* top two pair .. */ if (var2units>1) { /* .. [else treat 2nd as 0] */ msu2pair+=*(msu2-1); /* .. */ if (var2units>2) msu2pair++; /* .. +1 if any more */ } /* The calculation is working in units, which may have leading zeros, */ /* but the exponent was calculated on the assumption that they are */ /* both left-aligned. Adjust the exponent to compensate: add the */ /* number of leading zeros in var1 msu and subtract those in var2 msu. */ /* [This is actually done by counting the digits and negating, as */ /* lead1=DECDPUN-digits1, and similarly for lead2.] */ for (pow=&powers[1]; *msu1>=*pow; pow++) exponent--; for (pow=&powers[1]; *msu2>=*pow; pow++) exponent++; /* Now, if doing an integer divide or remainder, ensure that */ /* the result will be Unit-aligned. To do this, shift the var1 */ /* accumulator towards least if need be. (It's much easier to */ /* do this now than to reassemble the residue afterwards, if */ /* doing a remainder.) Also ensure the exponent is not negative. */ if (!(op&DIVIDE)) { Unit *u; /* work */ /* save the initial 'false' padding of var1, in digits */ var1initpad=(var1units-D2U(lhs->digits))*DECDPUN; /* Determine the shift to do. */ if (exponent<0) cut=-exponent; else cut=DECDPUN-exponent%DECDPUN; decShiftToLeast(var1, var1units, cut); exponent+=cut; /* maintain numerical value */ var1initpad-=cut; /* .. 
and reduce padding */ /* clean any most-significant units which were just emptied */ for (u=msu1; cut>=DECDPUN; cut-=DECDPUN, u--) *u=0; } /* align */ else { /* is DIVIDE */ maxexponent=lhs->exponent-rhs->exponent; /* save */ /* optimization: if the first iteration will just produce 0, */ /* preadjust to skip it [valid for DIVIDE only] */ if (*msu1<*msu2) { var2ulen--; /* shift down */ exponent-=DECDPUN; /* update the exponent */ } } /* ---- start the long-division loops ------------------------------ */ accunits=0; /* no units accumulated yet */ accdigits=0; /* .. or digits */ accnext=acc+acclength-1; /* -> msu of acc [NB: allows digits+1] */ for (;;) { /* outer forever loop */ thisunit=0; /* current unit assumed 0 */ /* find the next unit */ for (;;) { /* inner forever loop */ /* strip leading zero units [from either pre-adjust or from */ /* subtract last time around]. Leave at least one unit. */ for (; *msu1==0 && msu1>var1; msu1--) var1units--; if (var1units msu */ for (pv1=msu1; ; pv1--, pv2--) { /* v1=*pv1 -- always OK */ v2=0; /* assume in padding */ if (pv2>=var2) v2=*pv2; /* in range */ if (*pv1!=v2) break; /* no longer the same */ if (pv1==var1) break; /* done; leave pv1 as is */ } /* here when all inspected or a difference seen */ if (*pv1v2. Prepare for real subtraction; the lengths are equal */ /* Estimate the multiplier (there's always a msu1-1)... */ /* Bring in two units of var2 to provide a good estimate. */ mult=(Int)(((eInt)*msu1*(DECDPUNMAX+1)+*(msu1-1))/msu2pair); } /* lengths the same */ else { /* var1units > var2ulen, so subtraction is safe */ /* The var2 msu is one unit towards the lsu of the var1 msu, */ /* so only one unit for var2 can be used. 
*/ mult=(Int)(((eInt)*msu1*(DECDPUNMAX+1)+*(msu1-1))/msu2plus); } if (mult==0) mult=1; /* must always be at least 1 */ /* subtraction needed; var1 is > var2 */ thisunit=(Unit)(thisunit+mult); /* accumulate */ /* subtract var1-var2, into var1; only the overlap needs */ /* processing, as this is an in-place calculation */ shift=var2ulen-var2units; #if DECTRACE decDumpAr('1', &var1[shift], var1units-shift); decDumpAr('2', var2, var2units); printf("m=%ld\n", -mult); #endif decUnitAddSub(&var1[shift], var1units-shift, var2, var2units, 0, &var1[shift], -mult); #if DECTRACE decDumpAr('#', &var1[shift], var1units-shift); #endif /* var1 now probably has leading zeros; these are removed at the */ /* top of the inner loop. */ } /* inner loop */ /* The next unit has been calculated in full; unless it's a */ /* leading zero, add to acc */ if (accunits!=0 || thisunit!=0) { /* is first or non-zero */ *accnext=thisunit; /* store in accumulator */ /* account exactly for the new digits */ if (accunits==0) { accdigits++; /* at least one */ for (pow=&powers[1]; thisunit>=*pow; pow++) accdigits++; } else accdigits+=DECDPUN; accunits++; /* update count */ accnext--; /* ready for next */ if (accdigits>reqdigits) break; /* have enough digits */ } /* if the residue is zero, the operation is done (unless divide */ /* or divideInteger and still not enough digits yet) */ if (*var1==0 && var1units==1) { /* residue is 0 */ if (op&(REMAINDER|REMNEAR)) break; if ((op&DIVIDE) && (exponent<=maxexponent)) break; /* [drop through if divideInteger] */ } /* also done enough if calculating remainder or integer */ /* divide and just did the last ('units') unit */ if (exponent==0 && !(op&DIVIDE)) break; /* to get here, var1 is less than var2, so divide var2 by the per- */ /* Unit power of ten and go for the next digit */ var2ulen--; /* shift down */ exponent-=DECDPUN; /* update the exponent */ } /* outer loop */ /* ---- division is complete --------------------------------------- */ /* here: acc has at 
least reqdigits+1 of good results (or fewer */ /* if early stop), starting at accnext+1 (its lsu) */ /* var1 has any residue at the stopping point */ /* accunits is the number of digits collected in acc */ if (accunits==0) { /* acc is 0 */ accunits=1; /* show have a unit .. */ accdigits=1; /* .. */ *accnext=0; /* .. whose value is 0 */ } else accnext++; /* back to last placed */ /* accnext now -> lowest unit of result */ residue=0; /* assume no residue */ if (op&DIVIDE) { /* record the presence of any residue, for rounding */ if (*var1!=0 || var1units>1) residue=1; else { /* no residue */ /* Had an exact division; clean up spurious trailing 0s. */ /* There will be at most DECDPUN-1, from the final multiply, */ /* and then only if the result is non-0 (and even) and the */ /* exponent is 'loose'. */ #if DECDPUN>1 Unit lsu=*accnext; if (!(lsu&0x01) && (lsu!=0)) { /* count the trailing zeros */ Int drop=0; for (;; drop++) { /* [will terminate because lsu!=0] */ if (exponent>=maxexponent) break; /* don't chop real 0s */ #if DECDPUN<=4 if ((lsu-QUOT10(lsu, drop+1) *powers[drop+1])!=0) break; /* found non-0 digit */ #else if (lsu%powers[drop+1]!=0) break; /* found non-0 digit */ #endif exponent++; } if (drop>0) { accunits=decShiftToLeast(accnext, accunits, drop); accdigits=decGetDigits(accnext, accunits); accunits=D2U(accdigits); /* [exponent was adjusted in the loop] */ } } /* neither odd nor 0 */ #endif } /* exact divide */ } /* divide */ else /* op!=DIVIDE */ { /* check for coefficient overflow */ if (accdigits+exponent>reqdigits) { *status|=DEC_Division_impossible; break; } if (op & (REMAINDER|REMNEAR)) { /* [Here, the exponent will be 0, because var1 was adjusted */ /* appropriately.] */ Int postshift; /* work */ Flag wasodd=0; /* integer was odd */ Unit *quotlsu; /* for save */ Int quotdigits; /* .. 
*/ bits=lhs->bits; /* remainder sign is always as lhs */ /* Fastpath when residue is truly 0 is worthwhile [and */ /* simplifies the code below] */ if (*var1==0 && var1units==1) { /* residue is 0 */ Int exp=lhs->exponent; /* save min(exponents) */ if (rhs->exponentexponent; uprv_decNumberZero(res); /* 0 coefficient */ #if DECSUBSET if (set->extended) #endif res->exponent=exp; /* .. with proper exponent */ res->bits=(uByte)(bits&DECNEG); /* [cleaned] */ decFinish(res, set, &residue, status); /* might clamp */ break; } /* note if the quotient was odd */ if (*accnext & 0x01) wasodd=1; /* acc is odd */ quotlsu=accnext; /* save in case need to reinspect */ quotdigits=accdigits; /* .. */ /* treat the residue, in var1, as the value to return, via acc */ /* calculate the unused zero digits. This is the smaller of: */ /* var1 initial padding (saved above) */ /* var2 residual padding, which happens to be given by: */ postshift=var1initpad+exponent-lhs->exponent+rhs->exponent; /* [the 'exponent' term accounts for the shifts during divide] */ if (var1initpadexponent; /* exponent is smaller of lhs & rhs */ if (rhs->exponentexponent; /* Now correct the result if doing remainderNear; if it */ /* (looking just at coefficients) is > rhs/2, or == rhs/2 and */ /* the integer was odd then the result should be rem-rhs. */ if (op&REMNEAR) { Int compare, tarunits; /* work */ Unit *up; /* .. */ /* calculate remainder*2 into the var1 buffer (which has */ /* 'headroom' of an extra unit and hence enough space) */ /* [a dedicated 'double' loop would be faster, here] */ tarunits=decUnitAddSub(accnext, accunits, accnext, accunits, 0, accnext, 1); /* decDumpAr('r', accnext, tarunits); */ /* Here, accnext (var1) holds tarunits Units with twice the */ /* remainder's coefficient, which must now be compared to the */ /* RHS. The remainder's exponent may be smaller than the RHS's. 
*/ compare=decUnitCompare(accnext, tarunits, rhs->lsu, D2U(rhs->digits), rhs->exponent-exponent); if (compare==BADINT) { /* deep trouble */ *status|=DEC_Insufficient_storage; break;} /* now restore the remainder by dividing by two; the lsu */ /* is known to be even. */ for (up=accnext; up0 || (compare==0 && wasodd)) { /* adjustment needed */ Int exp, expunits, exprem; /* work */ /* This is effectively causing round-up of the quotient, */ /* so if it was the rare case where it was full and all */ /* nines, it would overflow and hence division-impossible */ /* should be raised */ Flag allnines=0; /* 1 if quotient all nines */ if (quotdigits==reqdigits) { /* could be borderline */ for (up=quotlsu; ; up++) { if (quotdigits>DECDPUN) { if (*up!=DECDPUNMAX) break;/* non-nines */ } else { /* this is the last Unit */ if (*up==powers[quotdigits]-1) allnines=1; break; } quotdigits-=DECDPUN; /* checked those digits */ } /* up */ } /* borderline check */ if (allnines) { *status|=DEC_Division_impossible; break;} /* rem-rhs is needed; the sign will invert. Again, var1 */ /* can safely be used for the working Units array. */ exp=rhs->exponent-exponent; /* RHS padding needed */ /* Calculate units and remainder from exponent. */ expunits=exp/DECDPUN; exprem=exp%DECDPUN; /* subtract [A+B*(-m)]; the result will always be negative */ accunits=-decUnitAddSub(accnext, accunits, rhs->lsu, D2U(rhs->digits), expunits, accnext, -(Int)powers[exprem]); accdigits=decGetDigits(accnext, accunits); /* count digits exactly */ accunits=D2U(accdigits); /* and recalculate the units for copy */ /* [exponent is as for original remainder] */ bits^=DECNEG; /* flip the sign */ } } /* REMNEAR */ } /* REMAINDER or REMNEAR */ } /* not DIVIDE */ /* Set exponent and bits */ res->exponent=exponent; res->bits=(uByte)(bits&DECNEG); /* [cleaned] */ /* Now the coefficient. 
*/ decSetCoeff(res, set, accnext, accdigits, &residue, status); decFinish(res, set, &residue, status); /* final cleanup */ #if DECSUBSET /* If a divide then strip trailing zeros if subset [after round] */ if (!set->extended && (op==DIVIDE)) decTrim(res, set, 0, 1, &dropped); #endif } while(0); /* end protected */ if (varalloc!=nullptr) free(varalloc); /* drop any storage used */ if (allocacc!=nullptr) free(allocacc); /* .. */ #if DECSUBSET if (allocrhs!=nullptr) free(allocrhs); /* .. */ if (alloclhs!=nullptr) free(alloclhs); /* .. */ #endif return res; } /* decDivideOp */ /* ------------------------------------------------------------------ */ /* decMultiplyOp -- multiplication operation */ /* */ /* This routine performs the multiplication C=A x B. */ /* */ /* res is C, the result. C may be A and/or B (e.g., X=X*X) */ /* lhs is A */ /* rhs is B */ /* set is the context */ /* status is the usual accumulator */ /* */ /* C must have space for set->digits digits. */ /* */ /* ------------------------------------------------------------------ */ /* 'Classic' multiplication is used rather than Karatsuba, as the */ /* latter would give only a minor improvement for the short numbers */ /* expected to be handled most (and uses much more memory). */ /* */ /* There are two major paths here: the general-purpose ('old code') */ /* path which handles all DECDPUN values, and a fastpath version */ /* which is used if 64-bit ints are available, DECDPUN<=4, and more */ /* than two calls to decUnitAddSub would be made. */ /* */ /* The fastpath version lumps units together into 8-digit or 9-digit */ /* chunks, and also uses a lazy carry strategy to minimise expensive */ /* 64-bit divisions. The chunks are then broken apart again into */ /* units for continuing processing. Despite this overhead, the */ /* fastpath can speed up some 16-digit operations by 10x (and much */ /* more for higher-precision calculations). 
*/ /* */ /* A buffer always has to be used for the accumulator; in the */ /* fastpath, buffers are also always needed for the chunked copies of */ /* of the operand coefficients. */ /* Static buffers are larger than needed just for multiply, to allow */ /* for calls from other operations (notably exp). */ /* ------------------------------------------------------------------ */ #define FASTMUL (DECUSE64 && DECDPUN<5) static decNumber * decMultiplyOp(decNumber *res, const decNumber *lhs, const decNumber *rhs, decContext *set, uInt *status) { Int accunits; /* Units of accumulator in use */ Int exponent; /* work */ Int residue=0; /* rounding residue */ uByte bits; /* result sign */ Unit *acc; /* -> accumulator Unit array */ Int needbytes; /* size calculator */ void *allocacc=nullptr; /* -> allocated accumulator, iff allocated */ Unit accbuff[SD2U(DECBUFFER*4+1)]; /* buffer (+1 for DECBUFFER==0, */ /* *4 for calls from other operations) */ const Unit *mer, *mermsup; /* work */ Int madlength; /* Units in multiplicand */ Int shift; /* Units to shift multiplicand by */ #if FASTMUL /* if DECDPUN is 1 or 3 work in base 10**9, otherwise */ /* (DECDPUN is 2 or 4) then work in base 10**8 */ #if DECDPUN & 1 /* odd */ #define FASTBASE 1000000000 /* base */ #define FASTDIGS 9 /* digits in base */ #define FASTLAZY 18 /* carry resolution point [1->18] */ #else #define FASTBASE 100000000 #define FASTDIGS 8 #define FASTLAZY 1844 /* carry resolution point [1->1844] */ #endif /* three buffers are used, two for chunked copies of the operands */ /* (base 10**8 or base 10**9) and one base 2**64 accumulator with */ /* lazy carry evaluation */ uInt zlhibuff[(DECBUFFER*2+1)/8+1]; /* buffer (+1 for DECBUFFER==0) */ uInt *zlhi=zlhibuff; /* -> lhs array */ uInt *alloclhi=nullptr; /* -> allocated buffer, iff allocated */ uInt zrhibuff[(DECBUFFER*2+1)/8+1]; /* buffer (+1 for DECBUFFER==0) */ uInt *zrhi=zrhibuff; /* -> rhs array */ uInt *allocrhi=nullptr; /* -> allocated buffer, iff allocated */ 
uLong zaccbuff[(DECBUFFER*2+1)/4+2]; /* buffer (+1 for DECBUFFER==0) */ /* [allocacc is shared for both paths, as only one will run] */ uLong *zacc=zaccbuff; /* -> accumulator array for exact result */ #if DECDPUN==1 Int zoff; /* accumulator offset */ #endif uInt *lip, *rip; /* item pointers */ uInt *lmsi, *rmsi; /* most significant items */ Int ilhs, irhs, iacc; /* item counts in the arrays */ Int lazy; /* lazy carry counter */ uLong lcarry; /* uLong carry */ uInt carry; /* carry (NB not uLong) */ Int count; /* work */ const Unit *cup; /* .. */ Unit *up; /* .. */ uLong *lp; /* .. */ Int p; /* .. */ #endif #if DECSUBSET decNumber *alloclhs=nullptr; /* -> allocated buffer, iff allocated */ decNumber *allocrhs=nullptr; /* -> allocated buffer, iff allocated */ #endif #if DECCHECK if (decCheckOperands(res, lhs, rhs, set)) return res; #endif /* precalculate result sign */ bits=(uByte)((lhs->bits^rhs->bits)&DECNEG); /* handle infinities and NaNs */ if (SPECIALARGS) { /* a special bit set */ if (SPECIALARGS & (DECSNAN | DECNAN)) { /* one or two NaNs */ decNaNs(res, lhs, rhs, set, status); return res;} /* one or two infinities; Infinity * 0 is invalid */ if (((lhs->bits & DECINF)==0 && ISZERO(lhs)) ||((rhs->bits & DECINF)==0 && ISZERO(rhs))) { *status|=DEC_Invalid_operation; return res;} uprv_decNumberZero(res); res->bits=bits|DECINF; /* infinity */ return res;} /* For best speed, as in DMSRCN [the original Rexx numerics */ /* module], use the shorter number as the multiplier (rhs) and */ /* the longer as the multiplicand (lhs) to minimise the number of */ /* adds (partial products) */ if (lhs->digitsdigits) { /* swap... 
*/ const decNumber *hold=lhs; lhs=rhs; rhs=hold; } do { /* protect allocated storage */ #if DECSUBSET if (!set->extended) { /* reduce operands and set lostDigits status, as needed */ if (lhs->digits>set->digits) { alloclhs=decRoundOperand(lhs, set, status); if (alloclhs==nullptr) break; lhs=alloclhs; } if (rhs->digits>set->digits) { allocrhs=decRoundOperand(rhs, set, status); if (allocrhs==nullptr) break; rhs=allocrhs; } } #endif /* [following code does not require input rounding] */ #if FASTMUL /* fastpath can be used */ /* use the fast path if there are enough digits in the shorter */ /* operand to make the setup and takedown worthwhile */ #define NEEDTWO (DECDPUN*2) /* within two decUnitAddSub calls */ if (rhs->digits>NEEDTWO) { /* use fastpath... */ /* calculate the number of elements in each array */ ilhs=(lhs->digits+FASTDIGS-1)/FASTDIGS; /* [ceiling] */ irhs=(rhs->digits+FASTDIGS-1)/FASTDIGS; /* .. */ iacc=ilhs+irhs; /* allocate buffers if required, as usual */ needbytes=ilhs*sizeof(uInt); if (needbytes>(Int)sizeof(zlhibuff)) { alloclhi=(uInt *)malloc(needbytes); zlhi=alloclhi;} needbytes=irhs*sizeof(uInt); if (needbytes>(Int)sizeof(zrhibuff)) { allocrhi=(uInt *)malloc(needbytes); zrhi=allocrhi;} /* Allocating the accumulator space needs a special case when */ /* DECDPUN=1 because when converting the accumulator to Units */ /* after the multiplication each 8-byte item becomes 9 1-byte */ /* units. Therefore iacc extra bytes are needed at the front */ /* (rounded up to a multiple of 8 bytes), and the uLong */ /* accumulator starts offset the appropriate number of units */ /* to the right to avoid overwrite during the unchunking. */ /* Make sure no signed int overflow below. This is always true */ /* if the given numbers have less digits than DEC_MAX_DIGITS. 
*/ U_ASSERT((uint32_t)iacc <= INT32_MAX/sizeof(uLong)); needbytes=iacc*sizeof(uLong); #if DECDPUN==1 zoff=(iacc+7)/8; /* items to offset by */ needbytes+=zoff*8; #endif if (needbytes>(Int)sizeof(zaccbuff)) { allocacc=(uLong *)malloc(needbytes); zacc=(uLong *)allocacc;} if (zlhi==nullptr||zrhi==nullptr||zacc==nullptr) { *status|=DEC_Insufficient_storage; break;} acc=(Unit *)zacc; /* -> target Unit array */ #if DECDPUN==1 zacc+=zoff; /* start uLong accumulator to right */ #endif /* assemble the chunked copies of the left and right sides */ for (count=lhs->digits, cup=lhs->lsu, lip=zlhi; count>0; lip++) for (p=0, *lip=0; p0; p+=DECDPUN, cup++, count-=DECDPUN) *lip+=*cup*powers[p]; lmsi=lip-1; /* save -> msi */ for (count=rhs->digits, cup=rhs->lsu, rip=zrhi; count>0; rip++) for (p=0, *rip=0; p0; p+=DECDPUN, cup++, count-=DECDPUN) *rip+=*cup*powers[p]; rmsi=rip-1; /* save -> msi */ /* zero the accumulator */ for (lp=zacc; lp0 && rip!=rmsi) continue; lazy=FASTLAZY; /* reset delay count */ /* spin up the accumulator resolving overflows */ for (lp=zacc; lp(up-acc); /* count of units */ } else { /* here to use units directly, without chunking ['old code'] */ #endif /* if accumulator will be too long for local storage, then allocate */ acc=accbuff; /* -> assume buffer for accumulator */ needbytes=(D2U(lhs->digits)+D2U(rhs->digits))*sizeof(Unit); if (needbytes>(Int)sizeof(accbuff)) { allocacc=(Unit *)malloc(needbytes); if (allocacc==nullptr) {*status|=DEC_Insufficient_storage; break;} acc=(Unit *)allocacc; /* use the allocated space */ } /* Now the main long multiplication loop */ /* Unlike the equivalent in the IBM Java implementation, there */ /* is no advantage in calculating from msu to lsu. So, do it */ /* by the book, as it were. */ /* Each iteration calculates ACC=ACC+MULTAND*MULT */ accunits=1; /* accumulator starts at '0' */ *acc=0; /* .. 
(lsu=0) */ shift=0; /* no multiplicand shift at first */ madlength=D2U(lhs->digits); /* this won't change */ mermsup=rhs->lsu+D2U(rhs->digits); /* -> msu+1 of multiplier */ for (mer=rhs->lsu; merlsu, madlength, 0, &acc[shift], *mer) + shift; else { /* extend acc with a 0; it will be used shortly */ *(acc+accunits)=0; /* [this avoids length of <=0 later] */ accunits++; } /* multiply multiplicand by 10**DECDPUN for next Unit to left */ shift++; /* add this for 'logical length' */ } /* n */ #if FASTMUL } /* unchunked units */ #endif /* common end-path */ #if DECTRACE decDumpAr('*', acc, accunits); /* Show exact result */ #endif /* acc now contains the exact result of the multiplication, */ /* possibly with a leading zero unit; build the decNumber from */ /* it, noting if any residue */ res->bits=bits; /* set sign */ res->digits=decGetDigits(acc, accunits); /* count digits exactly */ /* There can be a 31-bit wrap in calculating the exponent. */ /* This can only happen if both input exponents are negative and */ /* both their magnitudes are large. If there was a wrap, set a */ /* safe very negative exponent, from which decFinalize() will */ /* raise a hard underflow shortly. */ exponent=lhs->exponent+rhs->exponent; /* calculate exponent */ if (lhs->exponent<0 && rhs->exponent<0 && exponent>0) exponent=-2*DECNUMMAXE; /* force underflow */ res->exponent=exponent; /* OK to overwrite now */ /* Set the coefficient. If any rounding, residue records */ decSetCoeff(res, set, acc, res->digits, &residue, status); decFinish(res, set, &residue, status); /* final cleanup */ } while(0); /* end protected */ if (allocacc!=nullptr) free(allocacc); /* drop any storage used */ #if DECSUBSET if (allocrhs!=nullptr) free(allocrhs); /* .. */ if (alloclhs!=nullptr) free(alloclhs); /* .. */ #endif #if FASTMUL if (allocrhi!=nullptr) free(allocrhi); /* .. */ if (alloclhi!=nullptr) free(alloclhi); /* .. 
*/ #endif return res; } /* decMultiplyOp */ /* ------------------------------------------------------------------ */ /* decExpOp -- effect exponentiation */ /* */ /* This computes C = exp(A) */ /* */ /* res is C, the result. C may be A */ /* rhs is A */ /* set is the context; note that rounding mode has no effect */ /* */ /* C must have space for set->digits digits. status is updated but */ /* not set. */ /* */ /* Restrictions: */ /* */ /* digits, emax, and -emin in the context must be less than */ /* 2*DEC_MAX_MATH (1999998), and the rhs must be within these */ /* bounds or a zero. This is an internal routine, so these */ /* restrictions are contractual and not enforced. */ /* */ /* A finite result is rounded using DEC_ROUND_HALF_EVEN; it will */ /* almost always be correctly rounded, but may be up to 1 ulp in */ /* error in rare cases. */ /* */ /* Finite results will always be full precision and Inexact, except */ /* when A is a zero or -Infinity (giving 1 or 0 respectively). */ /* ------------------------------------------------------------------ */ /* This approach used here is similar to the algorithm described in */ /* */ /* Variable Precision Exponential Function, T. E. Hull and */ /* A. Abrham, ACM Transactions on Mathematical Software, Vol 12 #2, */ /* pp79-91, ACM, June 1986. */ /* */ /* with the main difference being that the iterations in the series */ /* evaluation are terminated dynamically (which does not require the */ /* extra variable-precision variables which are expensive in this */ /* context). */ /* */ /* The error analysis in Hull & Abrham's paper applies except for the */ /* round-off error accumulation during the series evaluation. This */ /* code does not precalculate the number of iterations and so cannot */ /* use Horner's scheme. 
Instead, the accumulation is done at double- */
/* precision, which ensures that the additions of the terms are exact */
/* and do not accumulate round-off (and any round-off errors in the */
/* terms themselves move 'to the right' faster than they can */
/* accumulate). This code also extends the calculation by allowing, */
/* in the spirit of other decNumber operators, the input to be more */
/* precise than the result (the precision used is based on the more */
/* precise of the input or requested result). */
/* */
/* Implementation notes: */
/* */
/* 1. This is separated out as decExpOp so it can be called from */
/* other Mathematical functions (notably Ln) with a wider range */
/* than normal. In particular, it can handle the slightly wider */
/* (double) range needed by Ln (which has to be able to calculate */
/* exp(-x) where x can be the tiniest number (Ntiny)). */
/* */
/* 2. Normalizing x to be <=0.1 (instead of <=1) reduces loop */
/* iterations by approximately a third with additional (although */
/* diminishing) returns as the range is reduced to even smaller */
/* fractions. However, h (the power of 10 used to correct the */
/* result at the end, see below) must be kept <=8 as otherwise */
/* the final result cannot be computed. Hence the leverage is a */
/* sliding value (8-h), where potentially the range is reduced */
/* more for smaller values. */
/* */
/* The leverage that can be applied in this way is severely */
/* limited by the cost of the raise-to-the-power at the end, */
/* which dominates when the number of iterations is small (less */
/* than ten) or when rhs is short. As an example, the adjustment */
/* x**10,000,000 needs 31 multiplications, all but one full-width. */
/* */
/* 3. The restrictions (especially precision) could be raised with */
/* care, but the full decNumber range seems very hard within the */
/* 32-bit limits. */
/* */
/* 4. 
The working precisions for the static buffers are twice the */ /* obvious size to allow for calls from decNumberPower. */ /* ------------------------------------------------------------------ */ decNumber * decExpOp(decNumber *res, const decNumber *rhs, decContext *set, uInt *status) { uInt ignore=0; /* working status */ Int h; /* adjusted exponent for 0.xxxx */ Int p; /* working precision */ Int residue; /* rounding residue */ uInt needbytes; /* for space calculations */ const decNumber *x=rhs; /* (may point to safe copy later) */ decContext aset, tset, dset; /* working contexts */ Int comp; /* work */ /* the argument is often copied to normalize it, so (unusually) it */ /* is treated like other buffers, using DECBUFFER, +1 in case */ /* DECBUFFER is 0 */ decNumber bufr[D2N(DECBUFFER*2+1)]; decNumber *allocrhs=nullptr; /* non-nullptr if rhs buffer allocated */ /* the working precision will be no more than set->digits+8+1 */ /* so for on-stack buffers DECBUFFER+9 is used, +1 in case DECBUFFER */ /* is 0 (and twice that for the accumulator) */ /* buffer for t, term (working precision plus) */ decNumber buft[D2N(DECBUFFER*2+9+1)]; decNumber *allocbuft=nullptr; /* -> allocated buft, iff allocated */ decNumber *t=buft; /* term */ /* buffer for a, accumulator (working precision * 2), at least 9 */ decNumber bufa[D2N(DECBUFFER*4+18+1)]; decNumber *allocbufa=nullptr; /* -> allocated bufa, iff allocated */ decNumber *a=bufa; /* accumulator */ /* decNumber for the divisor term; this needs at most 9 digits */ /* and so can be fixed size [16 so can use standard context] */ decNumber bufd[D2N(16)]; decNumber *d=bufd; /* divisor */ decNumber numone; /* constant 1 */ #if DECCHECK Int iterations=0; /* for later sanity check */ if (decCheckOperands(res, DECUNUSED, rhs, set)) return res; #endif do { /* protect allocated storage */ if (SPECIALARG) { /* handle infinities and NaNs */ if (decNumberIsInfinite(rhs)) { /* an infinity */ if (decNumberIsNegative(rhs)) /* -Infinity -> +0 */ 
uprv_decNumberZero(res); else uprv_decNumberCopy(res, rhs); /* +Infinity -> self */ } else decNaNs(res, rhs, nullptr, set, status); /* a NaN */ break;} if (ISZERO(rhs)) { /* zeros -> exact 1 */ uprv_decNumberZero(res); /* make clean 1 */ *res->lsu=1; /* .. */ break;} /* [no status to set] */ /* e**x when 0 < x < 0.66 is < 1+3x/2, hence can fast-path */ /* positive and negative tiny cases which will result in inexact */ /* 1. This also allows the later add-accumulate to always be */ /* exact (because its length will never be more than twice the */ /* working precision). */ /* The comparator (tiny) needs just one digit, so use the */ /* decNumber d for it (reused as the divisor, etc., below); its */ /* exponent is such that if x is positive it will have */ /* set->digits-1 zeros between the decimal point and the digit, */ /* which is 4, and if x is negative one more zero there as the */ /* more precise result will be of the form 0.9999999 rather than */ /* 1.0000001. Hence, tiny will be 0.0000004 if digits=7 and x>0 */ /* or 0.00000004 if digits=7 and x<0. If RHS not larger than */ /* this then the result will be 1.000000 */ uprv_decNumberZero(d); /* clean */ *d->lsu=4; /* set 4 .. */ d->exponent=-set->digits; /* * 10**(-d) */ if (decNumberIsNegative(rhs)) d->exponent--; /* negative case */ comp=decCompare(d, rhs, 1); /* signless compare */ if (comp==BADINT) { *status|=DEC_Insufficient_storage; break;} if (comp>=0) { /* rhs < d */ Int shift=set->digits-1; uprv_decNumberZero(res); /* set 1 */ *res->lsu=1; /* .. */ res->digits=decShiftToMost(res->lsu, 1, shift); res->exponent=-shift; /* make 1.0000... */ *status|=DEC_Inexact | DEC_Rounded; /* .. inexactly */ break;} /* tiny */ /* set up the context to be used for calculating a, as this is */ /* used on both paths below */ uprv_decContextDefault(&aset, DEC_INIT_DECIMAL64); /* accumulator bounds are as requested (could underflow) */ aset.emax=set->emax; /* usual bounds */ aset.emin=set->emin; /* .. 
*/ aset.clamp=0; /* and no concrete format */ /* calculate the adjusted (Hull & Abrham) exponent (where the */ /* decimal point is just to the left of the coefficient msd) */ h=rhs->exponent+rhs->digits; /* if h>8 then 10**h cannot be calculated safely; however, when */ /* h=8 then exp(|rhs|) will be at least exp(1E+7) which is at */ /* least 6.59E+4342944, so (due to the restriction on Emax/Emin) */ /* overflow (or underflow to 0) is guaranteed -- so this case can */ /* be handled by simply forcing the appropriate excess */ if (h>8) { /* overflow/underflow */ /* set up here so Power call below will over or underflow to */ /* zero; set accumulator to either 2 or 0.02 */ /* [stack buffer for a is always big enough for this] */ uprv_decNumberZero(a); *a->lsu=2; /* not 1 but < exp(1) */ if (decNumberIsNegative(rhs)) a->exponent=-2; /* make 0.02 */ h=8; /* clamp so 10**h computable */ p=9; /* set a working precision */ } else { /* h<=8 */ Int maxlever=(rhs->digits>8?1:0); /* [could/should increase this for precisions >40 or so, too] */ /* if h is 8, cannot normalize to a lower upper limit because */ /* the final result will not be computable (see notes above), */ /* but leverage can be applied whenever h is less than 8. */ /* Apply as much as possible, up to a MAXLEVER digits, which */ /* sets the tradeoff against the cost of the later a**(10**h). */ /* As h is increased, the working precision below also */ /* increases to compensate for the "constant digits at the */ /* front" effect. 
*/ Int lever=MINI(8-h, maxlever); /* leverage attainable */ Int use=-rhs->digits-lever; /* exponent to use for RHS */ h+=lever; /* apply leverage selected */ if (h<0) { /* clamp */ use+=h; /* [may end up subnormal] */ h=0; } /* Take a copy of RHS if it needs normalization (true whenever x>=1) */ if (rhs->exponent!=use) { decNumber *newrhs=bufr; /* assume will fit on stack */ needbytes=sizeof(decNumber)+(D2U(rhs->digits)-1)*sizeof(Unit); if (needbytes>sizeof(bufr)) { /* need malloc space */ allocrhs=(decNumber *)malloc(needbytes); if (allocrhs==nullptr) { /* hopeless -- abandon */ *status|=DEC_Insufficient_storage; break;} newrhs=allocrhs; /* use the allocated space */ } uprv_decNumberCopy(newrhs, rhs); /* copy to safe space */ newrhs->exponent=use; /* normalize; now <1 */ x=newrhs; /* ready for use */ /* decNumberShow(x); */ } /* Now use the usual power series to evaluate exp(x). The */ /* series starts as 1 + x + x^2/2 ... so prime ready for the */ /* third term by setting the term variable t=x, the accumulator */ /* a=1, and the divisor d=2. */ /* First determine the working precision. From Hull & Abrham */ /* this is set->digits+h+2. However, if x is 'over-precise' we */ /* need to allow for all its digits to potentially participate */ /* (consider an x where all the excess digits are 9s) so in */ /* this case use x->digits+h+2 */ p=MAXI(x->digits, set->digits)+h+2; /* [h<=8] */ /* a and t are variable precision, and depend on p, so space */ /* must be allocated for them if necessary */ /* the accumulator needs to be able to hold 2p digits so that */ /* the additions on the second and subsequent iterations are */ /* sufficiently exact. 
*/ needbytes=sizeof(decNumber)+(D2U(p*2)-1)*sizeof(Unit); if (needbytes>sizeof(bufa)) { /* need malloc space */ allocbufa=(decNumber *)malloc(needbytes); if (allocbufa==nullptr) { /* hopeless -- abandon */ *status|=DEC_Insufficient_storage; break;} a=allocbufa; /* use the allocated space */ } /* the term needs to be able to hold p digits (which is */ /* guaranteed to be larger than x->digits, so the initial copy */ /* is safe); it may also be used for the raise-to-power */ /* calculation below, which needs an extra two digits */ needbytes=sizeof(decNumber)+(D2U(p+2)-1)*sizeof(Unit); if (needbytes>sizeof(buft)) { /* need malloc space */ allocbuft=(decNumber *)malloc(needbytes); if (allocbuft==nullptr) { /* hopeless -- abandon */ *status|=DEC_Insufficient_storage; break;} t=allocbuft; /* use the allocated space */ } uprv_decNumberCopy(t, x); /* term=x */ uprv_decNumberZero(a); *a->lsu=1; /* accumulator=1 */ uprv_decNumberZero(d); *d->lsu=2; /* divisor=2 */ uprv_decNumberZero(&numone); *numone.lsu=1; /* constant 1 for increment */ /* set up the contexts for calculating a, t, and d */ uprv_decContextDefault(&tset, DEC_INIT_DECIMAL64); dset=tset; /* accumulator bounds are set above, set precision now */ aset.digits=p*2; /* double */ /* term bounds avoid any underflow or overflow */ tset.digits=p; tset.emin=DEC_MIN_EMIN; /* [emax is plenty] */ /* [dset.digits=16, etc., are sufficient] */ /* finally ready to roll */ for (;;) { #if DECCHECK iterations++; #endif /* only the status from the accumulation is interesting */ /* [but it should remain unchanged after first add] */ decAddOp(a, a, t, &aset, 0, status); /* a=a+t */ decMultiplyOp(t, t, x, &tset, &ignore); /* t=t*x */ decDivideOp(t, t, d, &tset, DIVIDE, &ignore); /* t=t/d */ /* the iteration ends when the term cannot affect the result, */ /* if rounded to p digits, which is when its value is smaller */ /* than the accumulator by p+1 digits. There must also be */ /* full precision in a. 
*/ if (((a->digits+a->exponent)>=(t->digits+t->exponent+p+1)) && (a->digits>=p)) break; decAddOp(d, d, &numone, &dset, 0, &ignore); /* d=d+1 */ } /* iterate */ #if DECCHECK /* just a sanity check; comment out test to show always */ if (iterations>p+3) printf("Exp iterations=%ld, status=%08lx, p=%ld, d=%ld\n", (LI)iterations, (LI)*status, (LI)p, (LI)x->digits); #endif } /* h<=8 */ /* apply postconditioning: a=a**(10**h) -- this is calculated */ /* at a slightly higher precision than Hull & Abrham suggest */ if (h>0) { Int seenbit=0; /* set once a 1-bit is seen */ Int i; /* counter */ Int n=powers[h]; /* always positive */ aset.digits=p+2; /* sufficient precision */ /* avoid the overhead and many extra digits of decNumberPower */ /* as all that is needed is the short 'multipliers' loop; here */ /* accumulate the answer into t */ uprv_decNumberZero(t); *t->lsu=1; /* acc=1 */ for (i=1;;i++){ /* for each bit [top bit ignored] */ /* abandon if have had overflow or terminal underflow */ if (*status & (DEC_Overflow|DEC_Underflow)) { /* interesting? */ if (*status&DEC_Overflow || ISZERO(t)) break;} n=n<<1; /* move next bit to testable position */ if (n<0) { /* top bit is set */ seenbit=1; /* OK, have a significant bit */ decMultiplyOp(t, t, a, &aset, status); /* acc=acc*x */ } if (i==31) break; /* that was the last bit */ if (!seenbit) continue; /* no need to square 1 */ decMultiplyOp(t, t, t, &aset, status); /* acc=acc*acc [square] */ } /*i*/ /* 32 bits */ /* decNumberShow(t); */ a=t; /* and carry on using t instead of a */ } /* Copy and round the result to res */ residue=1; /* indicate dirt to right .. */ if (ISZERO(a)) residue=0; /* .. 
unless underflowed to 0 */ aset.digits=set->digits; /* [use default rounding] */ decCopyFit(res, a, &aset, &residue, status); /* copy & shorten */ decFinish(res, set, &residue, status); /* cleanup/set flags */ } while(0); /* end protected */ if (allocrhs !=nullptr) free(allocrhs); /* drop any storage used */ if (allocbufa!=nullptr) free(allocbufa); /* .. */ if (allocbuft!=nullptr) free(allocbuft); /* .. */ /* [status is handled by caller] */ return res; } /* decExpOp */ /* ------------------------------------------------------------------ */ /* Initial-estimate natural logarithm table */ /* */ /* LNnn -- 90-entry 16-bit table for values from .10 through .99. */ /* The result is a 4-digit encode of the coefficient (c=the */ /* top 14 bits encoding 0-9999) and a 2-digit encode of the */ /* exponent (e=the bottom 2 bits encoding 0-3) */ /* */ /* The resulting value is given by: */ /* */ /* v = -c * 10**(-e-3) */ /* */ /* where e and c are extracted from entry k = LNnn[x-10] */ /* where x is truncated (NB) into the range 10 through 99, */ /* and then c = k>>2 and e = k&3. */ /* ------------------------------------------------------------------ */ static const uShort LNnn[90]={9016, 8652, 8316, 8008, 7724, 7456, 7208, 6972, 6748, 6540, 6340, 6148, 5968, 5792, 5628, 5464, 5312, 5164, 5020, 4884, 4748, 4620, 4496, 4376, 4256, 4144, 4032, 39233, 38181, 37157, 36157, 35181, 34229, 33297, 32389, 31501, 30629, 29777, 28945, 28129, 27329, 26545, 25777, 25021, 24281, 23553, 22837, 22137, 21445, 20769, 20101, 19445, 18801, 18165, 17541, 16925, 16321, 15721, 15133, 14553, 13985, 13421, 12865, 12317, 11777, 11241, 10717, 10197, 9685, 9177, 8677, 8185, 7697, 7213, 6737, 6269, 5801, 5341, 4889, 4437, 39930, 35534, 31186, 26886, 22630, 18418, 14254, 10130, 6046, 20055}; /* ------------------------------------------------------------------ */ /* decLnOp -- effect natural logarithm */ /* */ /* This computes C = ln(A) */ /* */ /* res is C, the result. 
C may be A */ /* rhs is A */ /* set is the context; note that rounding mode has no effect */ /* */ /* C must have space for set->digits digits. */ /* */ /* Notable cases: */ /* A<0 -> Invalid */ /* A=0 -> -Infinity (Exact) */ /* A=+Infinity -> +Infinity (Exact) */ /* A=1 exactly -> 0 (Exact) */ /* */ /* Restrictions (as for Exp): */ /* */ /* digits, emax, and -emin in the context must be less than */ /* DEC_MAX_MATH+11 (1000010), and the rhs must be within these */ /* bounds or a zero. This is an internal routine, so these */ /* restrictions are contractual and not enforced. */ /* */ /* A finite result is rounded using DEC_ROUND_HALF_EVEN; it will */ /* almost always be correctly rounded, but may be up to 1 ulp in */ /* error in rare cases. */ /* ------------------------------------------------------------------ */ /* The result is calculated using Newton's method, with each */ /* iteration calculating a' = a + x * exp(-a) - 1. See, for example, */ /* Epperson 1989. */ /* */ /* The iteration ends when the adjustment x*exp(-a)-1 is tiny enough. */ /* This has to be calculated at the sum of the precision of x and the */ /* working precision. */ /* */ /* Implementation notes: */ /* */ /* 1. This is separated out as decLnOp so it can be called from */ /* other Mathematical functions (e.g., Log 10) with a wider range */ /* than normal. In particular, it can handle the slightly wider */ /* (+9+2) range needed by a power function. */ /* */ /* 2. The speed of this function is about 10x slower than exp, as */ /* it typically needs 4-6 iterations for short numbers, and the */ /* extra precision needed adds a squaring effect, twice. */ /* */ /* 3. Fastpaths are included for ln(10) and ln(2), up to length 40, */ /* as these are common requests. ln(10) is used by log10(x). */ /* */ /* 4. An iteration might be saved by widening the LNnn table, and */ /* would certainly save at least one if it were made ten times */ /* bigger, too (for truncated fractions 0.100 through 0.999). 
*/ /* However, for most practical evaluations, at least four or five */ /* iterations will be needed -- so this would only speed up by */ /* 20-25% and that probably does not justify increasing the table */ /* size. */ /* */ /* 5. The static buffers are larger than might be expected to allow */ /* for calls from decNumberPower. */ /* ------------------------------------------------------------------ */ // #if defined(__clang__) || U_GCC_MAJOR_MINOR >= 406 // #pragma GCC diagnostic push // #pragma GCC diagnostic ignored "-Warray-bounds" // #endif decNumber * decLnOp(decNumber *res, const decNumber *rhs, decContext *set, uInt *status) { uInt ignore=0; /* working status accumulator */ uInt needbytes; /* for space calculations */ Int residue; /* rounding residue */ Int r; /* rhs=f*10**r [see below] */ Int p; /* working precision */ Int pp; /* precision for iteration */ Int t; /* work */ /* buffers for a (accumulator, typically precision+2) and b */ /* (adjustment calculator, same size) */ decNumber bufa[D2N(DECBUFFER+12)]; decNumber *allocbufa=nullptr; /* -> allocated bufa, iff allocated */ decNumber *a=bufa; /* accumulator/work */ decNumber bufb[D2N(DECBUFFER*2+2)]; decNumber *allocbufb=nullptr; /* -> allocated bufa, iff allocated */ decNumber *b=bufb; /* adjustment/work */ decNumber numone; /* constant 1 */ decNumber cmp; /* work */ decContext aset, bset; /* working contexts */ #if DECCHECK Int iterations=0; /* for later sanity check */ if (decCheckOperands(res, DECUNUSED, rhs, set)) return res; #endif do { /* protect allocated storage */ if (SPECIALARG) { /* handle infinities and NaNs */ if (decNumberIsInfinite(rhs)) { /* an infinity */ if (decNumberIsNegative(rhs)) /* -Infinity -> error */ *status|=DEC_Invalid_operation; else uprv_decNumberCopy(res, rhs); /* +Infinity -> self */ } else decNaNs(res, rhs, nullptr, set, status); /* a NaN */ break;} if (ISZERO(rhs)) { /* +/- zeros -> -Infinity */ uprv_decNumberZero(res); /* make clean */ res->bits=DECINF|DECNEG; /* set 
- infinity */ break;} /* [no status to set] */ /* Non-zero negatives are bad... */ if (decNumberIsNegative(rhs)) { /* -x -> error */ *status|=DEC_Invalid_operation; break;} /* Here, rhs is positive, finite, and in range */ /* lookaside fastpath code for ln(2) and ln(10) at common lengths */ if (rhs->exponent==0 && set->digits<=40) { #if DECDPUN==1 if (rhs->lsu[0]==0 && rhs->lsu[1]==1 && rhs->digits==2) { /* ln(10) */ #else if (rhs->lsu[0]==10 && rhs->digits==2) { /* ln(10) */ #endif aset=*set; aset.round=DEC_ROUND_HALF_EVEN; #define LN10 "2.302585092994045684017991454684364207601" uprv_decNumberFromString(res, LN10, &aset); *status|=(DEC_Inexact | DEC_Rounded); /* is inexact */ break;} if (rhs->lsu[0]==2 && rhs->digits==1) { /* ln(2) */ aset=*set; aset.round=DEC_ROUND_HALF_EVEN; #define LN2 "0.6931471805599453094172321214581765680755" uprv_decNumberFromString(res, LN2, &aset); *status|=(DEC_Inexact | DEC_Rounded); break;} } /* integer and short */ /* Determine the working precision. This is normally the */ /* requested precision + 2, with a minimum of 9. However, if */ /* the rhs is 'over-precise' then allow for all its digits to */ /* potentially participate (consider an rhs where all the excess */ /* digits are 9s) so in this case use rhs->digits+2. */ p=MAXI(rhs->digits, MAXI(set->digits, 7))+2; /* Allocate space for the accumulator and the high-precision */ /* adjustment calculator, if necessary. The accumulator must */ /* be able to hold p digits, and the adjustment up to */ /* rhs->digits+p digits. They are also made big enough for 16 */ /* digits so that they can be used for calculating the initial */ /* estimate. 
*/ needbytes=sizeof(decNumber)+(D2U(MAXI(p,16))-1)*sizeof(Unit); if (needbytes>sizeof(bufa)) { /* need malloc space */ allocbufa=(decNumber *)malloc(needbytes); if (allocbufa==nullptr) { /* hopeless -- abandon */ *status|=DEC_Insufficient_storage; break;} a=allocbufa; /* use the allocated space */ } pp=p+rhs->digits; needbytes=sizeof(decNumber)+(D2U(MAXI(pp,16))-1)*sizeof(Unit); if (needbytes>sizeof(bufb)) { /* need malloc space */ allocbufb=(decNumber *)malloc(needbytes); if (allocbufb==nullptr) { /* hopeless -- abandon */ *status|=DEC_Insufficient_storage; break;} b=allocbufb; /* use the allocated space */ } /* Prepare an initial estimate in acc. Calculate this by */ /* considering the coefficient of x to be a normalized fraction, */ /* f, with the decimal point at far left and multiplied by */ /* 10**r. Then, rhs=f*10**r and 0.1<=f<1, and */ /* ln(x) = ln(f) + ln(10)*r */ /* Get the initial estimate for ln(f) from a small lookup */ /* table (see above) indexed by the first two digits of f, */ /* truncated. */ uprv_decContextDefault(&aset, DEC_INIT_DECIMAL64); /* 16-digit extended */ r=rhs->exponent+rhs->digits; /* 'normalised' exponent */ uprv_decNumberFromInt32(a, r); /* a=r */ uprv_decNumberFromInt32(b, 2302585); /* b=ln(10) (2.302585) */ b->exponent=-6; /* .. 
*/ decMultiplyOp(a, a, b, &aset, &ignore); /* a=a*b */ /* now get top two digits of rhs into b by simple truncate and */ /* force to integer */ residue=0; /* (no residue) */ aset.digits=2; aset.round=DEC_ROUND_DOWN; decCopyFit(b, rhs, &aset, &residue, &ignore); /* copy & shorten */ b->exponent=0; /* make integer */ t=decGetInt(b); /* [cannot fail] */ if (t<10) t=X10(t); /* adjust single-digit b */ t=LNnn[t-10]; /* look up ln(b) */ uprv_decNumberFromInt32(b, t>>2); /* b=ln(b) coefficient */ b->exponent=-(t&3)-3; /* set exponent */ b->bits=DECNEG; /* ln(0.10)->ln(0.99) always -ve */ aset.digits=16; aset.round=DEC_ROUND_HALF_EVEN; /* restore */ decAddOp(a, a, b, &aset, 0, &ignore); /* acc=a+b */ /* the initial estimate is now in a, with up to 4 digits correct. */ /* When rhs is at or near Nmax the estimate will be low, so we */ /* will approach it from below, avoiding overflow when calling exp. */ uprv_decNumberZero(&numone); *numone.lsu=1; /* constant 1 for adjustment */ /* accumulator bounds are as requested (could underflow, but */ /* cannot overflow) */ aset.emax=set->emax; aset.emin=set->emin; aset.clamp=0; /* no concrete format */ /* set up a context to be used for the multiply and subtract */ bset=aset; bset.emax=DEC_MAX_MATH*2; /* use double bounds for the */ bset.emin=-DEC_MAX_MATH*2; /* adjustment calculation */ /* [see decExpOp call below] */ /* for each iteration double the number of digits to calculate, */ /* up to a maximum of p */ pp=9; /* initial precision */ /* [initially 9 as then the sequence starts 7+2, 16+2, and */ /* 34+2, which is ideal for standard-sized numbers] */ aset.digits=pp; /* working context */ bset.digits=pp+rhs->digits; /* wider context */ for (;;) { /* iterate */ #if DECCHECK iterations++; if (iterations>24) break; /* consider 9 * 2**24 */ #endif /* calculate the adjustment (exp(-a)*x-1) into b. This is a */ /* catastrophic subtraction but it really is the difference */ /* from 1 that is of interest. 
*/ /* Use the internal entry point to Exp as it allows the double */ /* range for calculating exp(-a) when a is the tiniest subnormal. */ a->bits^=DECNEG; /* make -a */ decExpOp(b, a, &bset, &ignore); /* b=exp(-a) */ a->bits^=DECNEG; /* restore sign of a */ /* now multiply by rhs and subtract 1, at the wider precision */ decMultiplyOp(b, b, rhs, &bset, &ignore); /* b=b*rhs */ decAddOp(b, b, &numone, &bset, DECNEG, &ignore); /* b=b-1 */ /* the iteration ends when the adjustment cannot affect the */ /* result by >=0.5 ulp (at the requested digits), which */ /* is when its value is smaller than the accumulator by */ /* set->digits+1 digits (or it is zero) -- this is a looser */ /* requirement than for Exp because all that happens to the */ /* accumulator after this is the final rounding (but note that */ /* there must also be full precision in a, or a=0). */ if (decNumberIsZero(b) || (a->digits+a->exponent)>=(b->digits+b->exponent+set->digits+1)) { if (a->digits==p) break; if (decNumberIsZero(a)) { decCompareOp(&cmp, rhs, &numone, &aset, COMPARE, &ignore); /* rhs=1 ? */ if (cmp.lsu[0]==0) a->exponent=0; /* yes, exact 0 */ else *status|=(DEC_Inexact | DEC_Rounded); /* no, inexact */ break; } /* force padding if adjustment has gone to 0 before full length */ if (decNumberIsZero(b)) b->exponent=a->exponent-p; } /* not done yet ... 
*/ decAddOp(a, a, b, &aset, 0, &ignore); /* a=a+b for next estimate */ if (pp==p) continue; /* precision is at maximum */ /* lengthen the next calculation */ pp=pp*2; /* double precision */ if (pp>p) pp=p; /* clamp to maximum */ aset.digits=pp; /* working context */ bset.digits=pp+rhs->digits; /* wider context */ } /* Newton's iteration */ #if DECCHECK /* just a sanity check; remove the test to show always */ if (iterations>24) printf("Ln iterations=%ld, status=%08lx, p=%ld, d=%ld\n", (LI)iterations, (LI)*status, (LI)p, (LI)rhs->digits); #endif /* Copy and round the result to res */ residue=1; /* indicate dirt to right */ if (ISZERO(a)) residue=0; /* .. unless underflowed to 0 */ aset.digits=set->digits; /* [use default rounding] */ decCopyFit(res, a, &aset, &residue, status); /* copy & shorten */ decFinish(res, set, &residue, status); /* cleanup/set flags */ } while(0); /* end protected */ if (allocbufa!=nullptr) free(allocbufa); /* drop any storage used */ if (allocbufb!=nullptr) free(allocbufb); /* .. */ /* [status is handled by caller] */ return res; } /* decLnOp */ // #if defined(__clang__) || U_GCC_MAJOR_MINOR >= 406 // #pragma GCC diagnostic pop // #endif /* ------------------------------------------------------------------ */ /* decQuantizeOp -- force exponent to requested value */ /* */ /* This computes C = op(A, B), where op adjusts the coefficient */ /* of C (by rounding or shifting) such that the exponent (-scale) */ /* of C has the value B or matches the exponent of B. */ /* The numerical value of C will equal A, except for the effects of */ /* any rounding that occurred. */ /* */ /* res is C, the result. C may be A or B */ /* lhs is A, the number to adjust */ /* rhs is B, the requested exponent */ /* set is the context */ /* quant is 1 for quantize or 0 for rescale */ /* status is the status accumulator (this can be called without */ /* risk of control loss) */ /* */ /* C must have space for set->digits digits. 
*/ /* */ /* Unless there is an error or the result is infinite, the exponent */ /* after the operation is guaranteed to be that requested. */ /* ------------------------------------------------------------------ */ static decNumber * decQuantizeOp(decNumber *res, const decNumber *lhs, const decNumber *rhs, decContext *set, Flag quant, uInt *status) { #if DECSUBSET decNumber *alloclhs=nullptr; /* non-nullptr if rounded lhs allocated */ decNumber *allocrhs=nullptr; /* .., rhs */ #endif const decNumber *inrhs=rhs; /* save original rhs */ Int reqdigits=set->digits; /* requested DIGITS */ Int reqexp; /* requested exponent [-scale] */ Int residue=0; /* rounding residue */ Int etiny=set->emin-(reqdigits-1); #if DECCHECK if (decCheckOperands(res, lhs, rhs, set)) return res; #endif do { /* protect allocated storage */ #if DECSUBSET if (!set->extended) { /* reduce operands and set lostDigits status, as needed */ if (lhs->digits>reqdigits) { alloclhs=decRoundOperand(lhs, set, status); if (alloclhs==nullptr) break; lhs=alloclhs; } if (rhs->digits>reqdigits) { /* [this only checks lostDigits] */ allocrhs=decRoundOperand(rhs, set, status); if (allocrhs==nullptr) break; rhs=allocrhs; } } #endif /* [following code does not require input rounding] */ /* Handle special values */ if (SPECIALARGS) { /* NaNs get usual processing */ if (SPECIALARGS & (DECSNAN | DECNAN)) decNaNs(res, lhs, rhs, set, status); /* one infinity but not both is bad */ else if ((lhs->bits ^ rhs->bits) & DECINF) *status|=DEC_Invalid_operation; /* both infinity: return lhs */ else uprv_decNumberCopy(res, lhs); /* [nop if in place] */ break; } /* set requested exponent */ if (quant) reqexp=inrhs->exponent; /* quantize -- match exponents */ else { /* rescale -- use value of rhs */ /* Original rhs must be an integer that fits and is in range, */ /* which could be from -1999999997 to +999999999, thanks to */ /* subnormals */ reqexp=decGetInt(inrhs); /* [cannot fail] */ } #if DECSUBSET if (!set->extended) 
etiny=set->emin; /* no subnormals */ #endif if (reqexp==BADINT /* bad (rescale only) or .. */ || reqexp==BIGODD || reqexp==BIGEVEN /* very big (ditto) or .. */ || (reqexpset->emax)) { /* > emax */ *status|=DEC_Invalid_operation; break;} /* the RHS has been processed, so it can be overwritten now if necessary */ if (ISZERO(lhs)) { /* zero coefficient unchanged */ uprv_decNumberCopy(res, lhs); /* [nop if in place] */ res->exponent=reqexp; /* .. just set exponent */ #if DECSUBSET if (!set->extended) res->bits=0; /* subset specification; no -0 */ #endif } else { /* non-zero lhs */ Int adjust=reqexp-lhs->exponent; /* digit adjustment needed */ /* if adjusted coefficient will definitely not fit, give up now */ if ((lhs->digits-adjust)>reqdigits) { *status|=DEC_Invalid_operation; break; } if (adjust>0) { /* increasing exponent */ /* this will decrease the length of the coefficient by adjust */ /* digits, and must round as it does so */ decContext workset; /* work */ workset=*set; /* clone rounding, etc. */ workset.digits=lhs->digits-adjust; /* set requested length */ /* [note that the latter can be <1, here] */ decCopyFit(res, lhs, &workset, &residue, status); /* fit to result */ decApplyRound(res, &workset, residue, status); /* .. and round */ residue=0; /* [used] */ /* If just rounded a 999s case, exponent will be off by one; */ /* adjust back (after checking space), if so. */ if (res->exponent>reqexp) { /* re-check needed, e.g., for quantize(0.9999, 0.001) under */ /* set->digits==3 */ if (res->digits==reqdigits) { /* cannot shift by 1 */ *status&=~(DEC_Inexact | DEC_Rounded); /* [clean these] */ *status|=DEC_Invalid_operation; break; } res->digits=decShiftToMost(res->lsu, res->digits, 1); /* shift */ res->exponent--; /* (re)adjust the exponent. 
*/ } #if DECSUBSET if (ISZERO(res) && !set->extended) res->bits=0; /* subset; no -0 */ #endif } /* increase */ else /* adjust<=0 */ { /* decreasing or = exponent */ /* this will increase the length of the coefficient by -adjust */ /* digits, by adding zero or more trailing zeros; this is */ /* already checked for fit, above */ uprv_decNumberCopy(res, lhs); /* [it will fit] */ /* if padding needed (adjust<0), add it now... */ if (adjust<0) { res->digits=decShiftToMost(res->lsu, res->digits, -adjust); res->exponent+=adjust; /* adjust the exponent */ } } /* decrease */ } /* non-zero */ /* Check for overflow [do not use Finalize in this case, as an */ /* overflow here is a "don't fit" situation] */ if (res->exponent>set->emax-res->digits+1) { /* too big */ *status|=DEC_Invalid_operation; break; } else { decFinalize(res, set, &residue, status); /* set subnormal flags */ *status&=~DEC_Underflow; /* suppress Underflow [as per 754] */ } } while(0); /* end protected */ #if DECSUBSET if (allocrhs!=nullptr) free(allocrhs); /* drop any storage used */ if (alloclhs!=nullptr) free(alloclhs); /* .. */ #endif return res; } /* decQuantizeOp */ /* ------------------------------------------------------------------ */ /* decCompareOp -- compare, min, or max two Numbers */ /* */ /* This computes C = A ? B and carries out one of four operations: */ /* COMPARE -- returns the signum (as a number) giving the */ /* result of a comparison unless one or both */ /* operands is a NaN (in which case a NaN results) */ /* COMPSIG -- as COMPARE except that a quiet NaN raises */ /* Invalid operation. */ /* COMPMAX -- returns the larger of the operands, using the */ /* 754 maxnum operation */ /* COMPMAXMAG -- ditto, comparing absolute values */ /* COMPMIN -- the 754 minnum operation */ /* COMPMINMAG -- ditto, comparing absolute values */ /* COMTOTAL -- returns the signum (as a number) giving the */ /* result of a comparison using 754 total ordering */ /* */ /* res is C, the result. 
C may be A and/or B (e.g., X=X?X) */ /* lhs is A */ /* rhs is B */ /* set is the context */ /* op is the operation flag */ /* status is the usual accumulator */ /* */ /* C must have space for one digit for COMPARE or set->digits for */ /* COMPMAX, COMPMIN, COMPMAXMAG, or COMPMINMAG. */ /* ------------------------------------------------------------------ */ /* The emphasis here is on speed for common cases, and avoiding */ /* coefficient comparison if possible. */ /* ------------------------------------------------------------------ */ static decNumber * decCompareOp(decNumber *res, const decNumber *lhs, const decNumber *rhs, decContext *set, Flag op, uInt *status) { #if DECSUBSET decNumber *alloclhs=nullptr; /* non-nullptr if rounded lhs allocated */ decNumber *allocrhs=nullptr; /* .., rhs */ #endif Int result=0; /* default result value */ uByte merged; /* work */ #if DECCHECK if (decCheckOperands(res, lhs, rhs, set)) return res; #endif do { /* protect allocated storage */ #if DECSUBSET if (!set->extended) { /* reduce operands and set lostDigits status, as needed */ if (lhs->digits>set->digits) { alloclhs=decRoundOperand(lhs, set, status); if (alloclhs==nullptr) {result=BADINT; break;} lhs=alloclhs; } if (rhs->digits>set->digits) { allocrhs=decRoundOperand(rhs, set, status); if (allocrhs==nullptr) {result=BADINT; break;} rhs=allocrhs; } } #endif /* [following code does not require input rounding] */ /* If total ordering then handle differing signs 'up front' */ if (op==COMPTOTAL) { /* total ordering */ if (decNumberIsNegative(lhs) && !decNumberIsNegative(rhs)) { result=-1; break; } if (!decNumberIsNegative(lhs) && decNumberIsNegative(rhs)) { result=+1; break; } } /* handle NaNs specially; let infinities drop through */ /* This assumes sNaN (even just one) leads to NaN. 
*/ merged=(lhs->bits | rhs->bits) & (DECSNAN | DECNAN); if (merged) { /* a NaN bit set */ if (op==COMPARE); /* result will be NaN */ else if (op==COMPSIG) /* treat qNaN as sNaN */ *status|=DEC_Invalid_operation | DEC_sNaN; else if (op==COMPTOTAL) { /* total ordering, always finite */ /* signs are known to be the same; compute the ordering here */ /* as if the signs are both positive, then invert for negatives */ if (!decNumberIsNaN(lhs)) result=-1; else if (!decNumberIsNaN(rhs)) result=+1; /* here if both NaNs */ else if (decNumberIsSNaN(lhs) && decNumberIsQNaN(rhs)) result=-1; else if (decNumberIsQNaN(lhs) && decNumberIsSNaN(rhs)) result=+1; else { /* both NaN or both sNaN */ /* now it just depends on the payload */ result=decUnitCompare(lhs->lsu, D2U(lhs->digits), rhs->lsu, D2U(rhs->digits), 0); /* [Error not possible, as these are 'aligned'] */ } /* both same NaNs */ if (decNumberIsNegative(lhs)) result=-result; break; } /* total order */ else if (merged & DECSNAN); /* sNaN -> qNaN */ else { /* here if MIN or MAX and one or two quiet NaNs */ /* min or max -- 754 rules ignore single NaN */ if (!decNumberIsNaN(lhs) || !decNumberIsNaN(rhs)) { /* just one NaN; force choice to be the non-NaN operand */ op=COMPMAX; if (lhs->bits & DECNAN) result=-1; /* pick rhs */ else result=+1; /* pick lhs */ break; } } /* max or min */ op=COMPNAN; /* use special path */ decNaNs(res, lhs, rhs, set, status); /* propagate NaN */ break; } /* have numbers */ if (op==COMPMAXMAG || op==COMPMINMAG) result=decCompare(lhs, rhs, 1); else result=decCompare(lhs, rhs, 0); /* sign matters */ } while(0); /* end protected */ if (result==BADINT) *status|=DEC_Insufficient_storage; /* rare */ else { if (op==COMPARE || op==COMPSIG ||op==COMPTOTAL) { /* returning signum */ if (op==COMPTOTAL && result==0) { /* operands are numerically equal or same NaN (and same sign, */ /* tested first); if identical, leave result 0 */ if (lhs->exponent!=rhs->exponent) { if (lhs->exponentexponent) result=-1; else 
result=+1; if (decNumberIsNegative(lhs)) result=-result; } /* lexp!=rexp */ } /* total-order by exponent */ uprv_decNumberZero(res); /* [always a valid result] */ if (result!=0) { /* must be -1 or +1 */ *res->lsu=1; if (result<0) res->bits=DECNEG; } } else if (op==COMPNAN); /* special, drop through */ else { /* MAX or MIN, non-NaN result */ Int residue=0; /* rounding accumulator */ /* choose the operand for the result */ const decNumber *choice; if (result==0) { /* operands are numerically equal */ /* choose according to sign then exponent (see 754) */ uByte slhs=(lhs->bits & DECNEG); uByte srhs=(rhs->bits & DECNEG); #if DECSUBSET if (!set->extended) { /* subset: force left-hand */ op=COMPMAX; result=+1; } else #endif if (slhs!=srhs) { /* signs differ */ if (slhs) result=-1; /* rhs is max */ else result=+1; /* lhs is max */ } else if (slhs && srhs) { /* both negative */ if (lhs->exponentexponent) result=+1; else result=-1; /* [if equal, use lhs, technically identical] */ } else { /* both positive */ if (lhs->exponent>rhs->exponent) result=+1; else result=-1; /* [ditto] */ } } /* numerically equal */ /* here result will be non-0; reverse if looking for MIN */ if (op==COMPMIN || op==COMPMINMAG) result=-result; choice=(result>0 ? lhs : rhs); /* choose */ /* copy chosen to result, rounding if need be */ decCopyFit(res, choice, set, &residue, status); decFinish(res, set, &residue, status); } } #if DECSUBSET if (allocrhs!=nullptr) free(allocrhs); /* free any storage used */ if (alloclhs!=nullptr) free(alloclhs); /* .. */ #endif return res; } /* decCompareOp */ /* ------------------------------------------------------------------ */ /* decCompare -- compare two decNumbers by numerical value */ /* */ /* This routine compares A ? B without altering them. 
*/
/*                                                                    */
/*   Arg1 is A, a decNumber which is not a NaN                        */
/*   Arg2 is B, a decNumber which is not a NaN                        */
/*   Arg3 is 1 for a sign-independent compare, 0 otherwise            */
/*                                                                    */
/*   returns -1, 0, or 1 for A<B, A==B, or A>B, or BADINT if failure  */
/*   (the only possible failure is an allocation error)               */
/* ------------------------------------------------------------------ */
static Int decCompare(const decNumber *lhs, const decNumber *rhs,
                      Flag abs_c) {
  /* Start from signum(lhs): 1 unless lhs is zero (sign applied below). */
  Int signum=1;
  if (ISZERO(lhs)) signum=0;

  if (abs_c) {
    /* Magnitude-only compare: a zero operand decides it immediately. */
    if (ISZERO(rhs)) return signum;     /* 1 if lhs non-zero, 0 if both 0 */
    if (signum==0) return -1;           /* lhs is 0, rhs is not */
    /* both non-zero here, signum==1 */
  }
  else {
    /* Signed compare: differing signums decide it immediately. */
    Int rsignum=1;
    if (signum && decNumberIsNegative(lhs)) signum=-1;
    if (ISZERO(rhs)) rsignum=0;
    else if (decNumberIsNegative(rhs)) rsignum=-1;

    if (signum>rsignum) return +1;      /* L > R */
    if (signum<rsignum) return -1;      /* L < R */
    if (signum==0) return 0;            /* both are zero */
  }

  /* From here both operands are non-zero with equal signums. */
  if ((lhs->bits | rhs->bits) & DECINF) {
    /* At least one infinity. */
    if (decNumberIsInfinite(rhs)) {
      if (decNumberIsInfinite(lhs)) return 0;   /* both infinite */
      return -signum;                           /* only rhs infinite */
    }
    return signum;                              /* only lhs infinite */
  }

  /* Compare coefficients.  decUnitCompare is called with a non-negative */
  /* exponent difference, so the operand with the smaller exponent goes  */
  /* first and the result sign is flipped when the operands are swapped. */
  const decNumber *first=lhs;
  const decNumber *second=rhs;
  if (lhs->exponent>rhs->exponent) {
    first=rhs;
    second=lhs;
    signum=-signum;
  }
  Int outcome=decUnitCompare(first->lsu, D2U(first->digits),
                             second->lsu, D2U(second->digits),
                             second->exponent-first->exponent);
  if (outcome==BADINT) return outcome;          /* allocation failure */
  return outcome*signum;
} /* decCompare */

/* ------------------------------------------------------------------ */
/* decUnitCompare -- compare two >=0 integers in Unit arrays          */
/*                                                                    */
/*   This routine compares A ? 
B*10**E where A and B are unit arrays */ /* A is a plain integer */ /* B has an exponent of E (which must be non-negative) */ /* */ /* Arg1 is A first Unit (lsu) */ /* Arg2 is A length in Units */ /* Arg3 is B first Unit (lsu) */ /* Arg4 is B length in Units */ /* Arg5 is E (0 if the units are aligned) */ /* */ /* returns -1, 0, or 1 for AB, or BADINT if failure */ /* (the only possible failure is an allocation error, which can */ /* only occur if E!=0) */ /* ------------------------------------------------------------------ */ static Int decUnitCompare(const Unit *a, Int alength, const Unit *b, Int blength, Int exp) { Unit *acc; /* accumulator for result */ Unit accbuff[SD2U(DECBUFFER*2+1)]; /* local buffer */ Unit *allocacc=nullptr; /* -> allocated acc buffer, iff allocated */ Int accunits, need; /* units in use or needed for acc */ const Unit *l, *r, *u; /* work */ Int expunits, exprem, result; /* .. */ if (exp==0) { /* aligned; fastpath */ if (alength>blength) return 1; if (alength=a; l--, r--) { if (*l>*r) return 1; if (*l<*r) return -1; } return 0; /* all units match */ } /* aligned */ /* Unaligned. If one is >1 unit longer than the other, padded */ /* approximately, then can return easily */ if (alength>blength+(Int)D2U(exp)) return 1; if (alength+1sizeof(accbuff)) { allocacc=(Unit *)malloc(need*sizeof(Unit)); if (allocacc==nullptr) return BADINT; /* hopeless -- abandon */ acc=allocacc; } /* Calculate units and remainder from exponent. 
*/ expunits=exp/DECDPUN; exprem=exp%DECDPUN; /* subtract [A+B*(-m)] */ accunits=decUnitAddSub(a, alength, b, blength, expunits, acc, -(Int)powers[exprem]); /* [UnitAddSub result may have leading zeros, even on zero] */ if (accunits<0) result=-1; /* negative result */ else { /* non-negative result */ /* check units of the result before freeing any storage */ for (u=acc; u=0 integers in Unit arrays */ /* */ /* This routine performs the calculation: */ /* */ /* C=A+(B*M) */ /* */ /* Where M is in the range -DECDPUNMAX through +DECDPUNMAX. */ /* */ /* A may be shorter or longer than B. */ /* */ /* Leading zeros are not removed after a calculation. The result is */ /* either the same length as the longer of A and B (adding any */ /* shift), or one Unit longer than that (if a Unit carry occurred). */ /* */ /* A and B content are not altered unless C is also A or B. */ /* C may be the same array as A or B, but only if no zero padding is */ /* requested (that is, C may be B only if bshift==0). */ /* C is filled from the lsu; only those units necessary to complete */ /* the calculation are referenced. */ /* */ /* Arg1 is A first Unit (lsu) */ /* Arg2 is A length in Units */ /* Arg3 is B first Unit (lsu) */ /* Arg4 is B length in Units */ /* Arg5 is B shift in Units (>=0; pads with 0 units if positive) */ /* Arg6 is C first Unit (lsu) */ /* Arg7 is M, the multiplier */ /* */ /* returns the count of Units written to C, which will be non-zero */ /* and negated if the result is negative. That is, the sign of the */ /* returned Int is the sign of the result (positive for zero) and */ /* the absolute value of the Int is the count of Units. */ /* */ /* It is the caller's responsibility to make sure that C size is */ /* safe, allowing space if necessary for a one-Unit carry. */ /* */ /* This routine is severely performance-critical; *any* change here */ /* must be measured (timed) to assure no performance degradation. 
*/ /* In particular, trickery here tends to be counter-productive, as */ /* increased complexity of code hurts register optimizations on */ /* register-poor architectures. Avoiding divisions is nearly */ /* always a Good Idea, however. */ /* */ /* Special thanks to Rick McGuire (IBM Cambridge, MA) and Dave Clark */ /* (IBM Warwick, UK) for some of the ideas used in this routine. */ /* ------------------------------------------------------------------ */ static Int decUnitAddSub(const Unit *a, Int alength, const Unit *b, Int blength, Int bshift, Unit *c, Int m) { const Unit *alsu=a; /* A lsu [need to remember it] */ Unit *clsu=c; /* C ditto */ Unit *minC; /* low water mark for C */ Unit *maxC; /* high water mark for C */ eInt carry=0; /* carry integer (could be Long) */ Int add; /* work */ #if DECDPUN<=4 /* myriadal, millenary, etc. */ Int est; /* estimated quotient */ #endif #if DECTRACE if (alength<1 || blength<1) printf("decUnitAddSub: alen blen m %ld %ld [%ld]\n", alength, blength, m); #endif maxC=c+alength; /* A is usually the longer */ minC=c+blength; /* .. and B the shorter */ if (bshift!=0) { /* B is shifted; low As copy across */ minC+=bshift; /* if in place [common], skip copy unless there's a gap [rare] */ if (a==c && bshift<=alength) { c+=bshift; a+=bshift; } else for (; cmaxC) { /* swap */ Unit *hold=minC; minC=maxC; maxC=hold; } /* For speed, do the addition as two loops; the first where both A */ /* and B contribute, and the second (if necessary) where only one or */ /* other of the numbers contribute. */ /* Carry handling is the same (i.e., duplicated) in each case. 
*/ for (; c=0) { est=(((ueInt)carry>>11)*53687)>>18; *c=(Unit)(carry-est*(DECDPUNMAX+1)); /* remainder */ carry=est; /* likely quotient [89%] */ if (*c>11)*53687)>>18; *c=(Unit)(carry-est*(DECDPUNMAX+1)); carry=est-(DECDPUNMAX+1); /* correctly negative */ if (*c=0) { est=(((ueInt)carry>>3)*16777)>>21; *c=(Unit)(carry-est*(DECDPUNMAX+1)); /* remainder */ carry=est; /* likely quotient [99%] */ if (*c>3)*16777)>>21; *c=(Unit)(carry-est*(DECDPUNMAX+1)); carry=est-(DECDPUNMAX+1); /* correctly negative */ if (*c=0) { est=QUOT10(carry, DECDPUN); *c=(Unit)(carry-est*(DECDPUNMAX+1)); /* remainder */ carry=est; /* quotient */ continue; } /* negative case */ carry=carry+(eInt)(DECDPUNMAX+1)*(DECDPUNMAX+1); /* make positive */ est=QUOT10(carry, DECDPUN); *c=(Unit)(carry-est*(DECDPUNMAX+1)); carry=est-(DECDPUNMAX+1); /* correctly negative */ #else /* remainder operator is undefined if negative, so must test */ if ((ueInt)carry<(DECDPUNMAX+1)*2) { /* fastpath carry +1 */ *c=(Unit)(carry-(DECDPUNMAX+1)); /* [helps additions] */ carry=1; continue; } if (carry>=0) { *c=(Unit)(carry%(DECDPUNMAX+1)); carry=carry/(DECDPUNMAX+1); continue; } /* negative case */ carry=carry+(eInt)(DECDPUNMAX+1)*(DECDPUNMAX+1); /* make positive */ *c=(Unit)(carry%(DECDPUNMAX+1)); carry=carry/(DECDPUNMAX+1)-(DECDPUNMAX+1); #endif } /* c */ /* now may have one or other to complete */ /* [pretest to avoid loop setup/shutdown] */ if (cDECDPUNMAX */ #if DECDPUN==4 /* use divide-by-multiply */ if (carry>=0) { est=(((ueInt)carry>>11)*53687)>>18; *c=(Unit)(carry-est*(DECDPUNMAX+1)); /* remainder */ carry=est; /* likely quotient [79.7%] */ if (*c>11)*53687)>>18; *c=(Unit)(carry-est*(DECDPUNMAX+1)); carry=est-(DECDPUNMAX+1); /* correctly negative */ if (*c=0) { est=(((ueInt)carry>>3)*16777)>>21; *c=(Unit)(carry-est*(DECDPUNMAX+1)); /* remainder */ carry=est; /* likely quotient [99%] */ if (*c>3)*16777)>>21; *c=(Unit)(carry-est*(DECDPUNMAX+1)); carry=est-(DECDPUNMAX+1); /* correctly negative */ if (*c=0) { 
est=QUOT10(carry, DECDPUN); *c=(Unit)(carry-est*(DECDPUNMAX+1)); /* remainder */ carry=est; /* quotient */ continue; } /* negative case */ carry=carry+(eInt)(DECDPUNMAX+1)*(DECDPUNMAX+1); /* make positive */ est=QUOT10(carry, DECDPUN); *c=(Unit)(carry-est*(DECDPUNMAX+1)); carry=est-(DECDPUNMAX+1); /* correctly negative */ #else if ((ueInt)carry<(DECDPUNMAX+1)*2){ /* fastpath carry 1 */ *c=(Unit)(carry-(DECDPUNMAX+1)); carry=1; continue; } /* remainder operator is undefined if negative, so must test */ if (carry>=0) { *c=(Unit)(carry%(DECDPUNMAX+1)); carry=carry/(DECDPUNMAX+1); continue; } /* negative case */ carry=carry+(eInt)(DECDPUNMAX+1)*(DECDPUNMAX+1); /* make positive */ *c=(Unit)(carry%(DECDPUNMAX+1)); carry=carry/(DECDPUNMAX+1)-(DECDPUNMAX+1); #endif } /* c */ /* OK, all A and B processed; might still have carry or borrow */ /* return number of Units in the result, negated if a borrow */ if (carry==0) return static_cast(c-clsu); /* no carry, so no more to do */ if (carry>0) { /* positive carry */ *c=(Unit)carry; /* place as new unit */ c++; /* .. */ return static_cast(c-clsu); } /* -ve carry: it's a borrow; complement needed */ add=1; /* temporary carry... */ for (c=clsu; c(clsu-c); /* -ve result indicates borrowed */ } /* decUnitAddSub */ /* ------------------------------------------------------------------ */ /* decTrim -- trim trailing zeros or normalize */ /* */ /* dn is the number to trim or normalize */ /* set is the context to use to check for clamp */ /* all is 1 to remove all trailing zeros, 0 for just fraction ones */ /* noclamp is 1 to unconditional (unclamped) trim */ /* dropped returns the number of discarded trailing zeros */ /* returns dn */ /* */ /* If clamp is set in the context then the number of zeros trimmed */ /* may be limited if the exponent is high. */ /* All fields are updated as required. This is a utility operation, */ /* so special values are unchanged and no error is possible. 
*/ /* ------------------------------------------------------------------ */ static decNumber * decTrim(decNumber *dn, decContext *set, Flag all, Flag noclamp, Int *dropped) { Int d, exp; /* work */ uInt cut; /* .. */ Unit *up; /* -> current Unit */ #if DECCHECK if (decCheckOperands(dn, DECUNUSED, DECUNUSED, DECUNCONT)) return dn; #endif *dropped=0; /* assume no zeros dropped */ if ((dn->bits & DECSPECIAL) /* fast exit if special .. */ || (*dn->lsu & 0x01)) return dn; /* .. or odd */ if (ISZERO(dn)) { /* .. or 0 */ dn->exponent=0; /* (sign is preserved) */ return dn; } /* have a finite number which is even */ exp=dn->exponent; cut=1; /* digit (1-DECDPUN) in Unit */ up=dn->lsu; /* -> current Unit */ for (d=0; ddigits-1; d++) { /* [don't strip the final digit] */ /* slice by powers */ #if DECDPUN<=4 uInt quot=QUOT10(*up, cut); if ((*up-quot*powers[cut])!=0) break; /* found non-0 digit */ #else if (*up%powers[cut]!=0) break; /* found non-0 digit */ #endif /* have a trailing 0 */ if (!all) { /* trimming */ /* [if exp>0 then all trailing 0s are significant for trim] */ if (exp<=0) { /* if digit might be significant */ if (exp==0) break; /* then quit */ exp++; /* next digit might be significant */ } } cut++; /* next power */ if (cut>DECDPUN) { /* need new Unit */ up++; cut=1; } } /* d */ if (d==0) return dn; /* none to drop */ /* may need to limit drop if clamping */ if (set->clamp && !noclamp) { Int maxd=set->emax-set->digits+1-dn->exponent; if (maxd<=0) return dn; /* nothing possible */ if (d>maxd) d=maxd; } /* effect the drop */ decShiftToLeast(dn->lsu, D2U(dn->digits), d); dn->exponent+=d; /* maintain numerical value */ dn->digits-=d; /* new length */ *dropped=d; /* report the count */ return dn; } /* decTrim */ /* ------------------------------------------------------------------ */ /* decReverse -- reverse a Unit array in place */ /* */ /* ulo is the start of the array */ /* uhi is the end of the array (highest Unit to include) */ /* */ /* The units ulo through uhi 
are reversed in place (if the number */ /* of units is odd, the middle one is untouched). Note that the */ /* digit(s) in each unit are unaffected. */ /* ------------------------------------------------------------------ */ static void decReverse(Unit *ulo, Unit *uhi) { Unit temp; for (; ulo=uar; source--, target--) *target=*source; } else { first=uar+D2U(digits+shift)-1; /* where msu of source will end up */ for (; source>=uar; source--, target--) { /* split the source Unit and accumulate remainder for next */ #if DECDPUN<=4 uInt quot=QUOT10(*source, cut); uInt rem=*source-quot*powers[cut]; next+=quot; #else uInt rem=*source%powers[cut]; next+=*source/powers[cut]; #endif if (target<=first) *target=(Unit)next; /* write to target iff valid */ next=rem*powers[DECDPUN-cut]; /* save remainder for next Unit */ } } /* shift-move */ /* propagate any partial unit to one below and clear the rest */ for (; target>=uar; target--) { *target=(Unit)next; next=0; } return digits+shift; } /* decShiftToMost */ /* ------------------------------------------------------------------ */ /* decShiftToLeast -- shift digits in array towards least significant */ /* */ /* uar is the array */ /* units is length of the array, in units */ /* shift is the number of digits to remove from the lsu end; it */ /* must be zero or positive and <= than units*DECDPUN. */ /* */ /* returns the new length of the integer in the array, in units */ /* */ /* Removed digits are discarded (lost). Units not required to hold */ /* the final result are unchanged. 
*/ /* ------------------------------------------------------------------ */ static Int decShiftToLeast(Unit *uar, Int units, Int shift) { Unit *target, *up; /* work */ Int cut, count; /* work */ Int quot, rem; /* for division */ if (shift==0) return units; /* [fastpath] nothing to do */ if (shift==units*DECDPUN) { /* [fastpath] little to do */ *uar=0; /* all digits cleared gives zero */ return 1; /* leaves just the one */ } target=uar; /* both paths */ cut=MSUDIGITS(shift); if (cut==DECDPUN) { /* unit-boundary case; easy */ up=uar+D2U(shift); for (; up(target-uar); } /* messier */ up=uar+D2U(shift-cut); /* source; correct to whole Units */ count=units*DECDPUN-shift; /* the maximum new length */ #if DECDPUN<=4 quot=QUOT10(*up, cut); #else quot=*up/powers[cut]; #endif for (; ; target++) { *target=(Unit)quot; count-=(DECDPUN-cut); if (count<=0) break; up++; quot=*up; #if DECDPUN<=4 quot=QUOT10(quot, cut); rem=*up-quot*powers[cut]; #else rem=quot%powers[cut]; quot=quot/powers[cut]; #endif *target=(Unit)(*target+rem*powers[DECDPUN-cut]); count-=cut; if (count<=0) break; } return static_cast(target-uar+1); } /* decShiftToLeast */ #if DECSUBSET /* ------------------------------------------------------------------ */ /* decRoundOperand -- round an operand [used for subset only] */ /* */ /* dn is the number to round (dn->digits is > set->digits) */ /* set is the relevant context */ /* status is the status accumulator */ /* */ /* returns an allocated decNumber with the rounded result. */ /* */ /* lostDigits and other status may be set by this. */ /* */ /* Since the input is an operand, it must not be modified. */ /* Instead, return an allocated decNumber, rounded as required. */ /* It is the caller's responsibility to free the allocated storage. */ /* */ /* If no storage is available then the result cannot be used, so nullptr */ /* is returned. 
*/ /* ------------------------------------------------------------------ */ static decNumber *decRoundOperand(const decNumber *dn, decContext *set, uInt *status) { decNumber *res; /* result structure */ uInt newstatus=0; /* status from round */ Int residue=0; /* rounding accumulator */ /* Allocate storage for the returned decNumber, big enough for the */ /* length specified by the context */ res=(decNumber *)malloc(sizeof(decNumber) +(D2U(set->digits)-1)*sizeof(Unit)); if (res==nullptr) { *status|=DEC_Insufficient_storage; return nullptr; } decCopyFit(res, dn, set, &residue, &newstatus); decApplyRound(res, set, residue, &newstatus); /* If that set Inexact then "lost digits" is raised... */ if (newstatus & DEC_Inexact) newstatus|=DEC_Lost_digits; *status|=newstatus; return res; } /* decRoundOperand */ #endif /* ------------------------------------------------------------------ */ /* decCopyFit -- copy a number, truncating the coefficient if needed */ /* */ /* dest is the target decNumber */ /* src is the source decNumber */ /* set is the context [used for length (digits) and rounding mode] */ /* residue is the residue accumulator */ /* status contains the current status to be updated */ /* */ /* (dest==src is allowed and will be a no-op if fits) */ /* All fields are updated as required. */ /* ------------------------------------------------------------------ */ static void decCopyFit(decNumber *dest, const decNumber *src, decContext *set, Int *residue, uInt *status) { dest->bits=src->bits; dest->exponent=src->exponent; decSetCoeff(dest, set, src->lsu, src->digits, residue, status); } /* decCopyFit */ /* ------------------------------------------------------------------ */ /* decSetCoeff -- set the coefficient of a number */ /* */ /* dn is the number whose coefficient array is to be set. 
*/ /* It must have space for set->digits digits */ /* set is the context [for size] */ /* lsu -> lsu of the source coefficient [may be dn->lsu] */ /* len is digits in the source coefficient [may be dn->digits] */ /* residue is the residue accumulator. This has values as in */ /* decApplyRound, and will be unchanged unless the */ /* target size is less than len. In this case, the */ /* coefficient is truncated and the residue is updated to */ /* reflect the previous residue and the dropped digits. */ /* status is the status accumulator, as usual */ /* */ /* The coefficient may already be in the number, or it can be an */ /* external intermediate array. If it is in the number, lsu must == */ /* dn->lsu and len must == dn->digits. */ /* */ /* Note that the coefficient length (len) may be < set->digits, and */ /* in this case this merely copies the coefficient (or is a no-op */ /* if dn->lsu==lsu). */ /* */ /* Note also that (only internally, from decQuantizeOp and */ /* decSetSubnormal) the value of set->digits may be less than one, */ /* indicating a round to left. This routine handles that case */ /* correctly; caller ensures space. */ /* */ /* dn->digits, dn->lsu (and as required), and dn->exponent are */ /* updated as necessary. dn->bits (sign) is unchanged. */ /* */ /* DEC_Rounded status is set if any digits are discarded. 
*/ /* DEC_Inexact status is set if any non-zero digits are discarded, or */ /* incoming residue was non-0 (implies rounded) */ /* ------------------------------------------------------------------ */ /* mapping array: maps 0-9 to canonical residues, so that a residue */ /* can be adjusted in the range [-1, +1] and achieve correct rounding */ /* 0 1 2 3 4 5 6 7 8 9 */ static const uByte resmap[10]={0, 3, 3, 3, 3, 5, 7, 7, 7, 7}; static void decSetCoeff(decNumber *dn, decContext *set, const Unit *lsu, Int len, Int *residue, uInt *status) { Int discard; /* number of digits to discard */ uInt cut; /* cut point in Unit */ const Unit *up; /* work */ Unit *target; /* .. */ Int count; /* .. */ #if DECDPUN<=4 uInt temp; /* .. */ #endif discard=len-set->digits; /* digits to discard */ if (discard<=0) { /* no digits are being discarded */ if (dn->lsu!=lsu) { /* copy needed */ /* copy the coefficient array to the result number; no shift needed */ count=len; /* avoids D2U */ up=lsu; for (target=dn->lsu; count>0; target++, up++, count-=DECDPUN) *target=*up; dn->digits=len; /* set the new length */ } /* dn->exponent and residue are unchanged, record any inexactitude */ if (*residue!=0) *status|=(DEC_Inexact | DEC_Rounded); return; } /* some digits must be discarded ... */ dn->exponent+=discard; /* maintain numerical value */ *status|=DEC_Rounded; /* accumulate Rounded status */ if (*residue>1) *residue=1; /* previous residue now to right, so reduce */ if (discard>len) { /* everything, +1, is being discarded */ /* guard digit is 0 */ /* residue is all the number [NB could be all 0s] */ if (*residue<=0) { /* not already positive */ count=len; /* avoids D2U */ for (up=lsu; count>0; up++, count-=DECDPUN) if (*up!=0) { /* found non-0 */ *residue=1; break; /* no need to check any others */ } } if (*residue!=0) *status|=DEC_Inexact; /* record inexactitude */ *dn->lsu=0; /* coefficient will now be 0 */ dn->digits=1; /* .. 
*/ return; } /* total discard */ /* partial discard [most common case] */ /* here, at least the first (most significant) discarded digit exists */ /* spin up the number, noting residue during the spin, until get to */ /* the Unit with the first discarded digit. When reach it, extract */ /* it and remember its position */ count=0; for (up=lsu;; up++) { count+=DECDPUN; if (count>=discard) break; /* full ones all checked */ if (*up!=0) *residue=1; } /* up */ /* here up -> Unit with first discarded digit */ cut=discard-(count-DECDPUN)-1; if (cut==DECDPUN-1) { /* unit-boundary case (fast) */ Unit half=(Unit)powers[DECDPUN]>>1; /* set residue directly */ if (*up>=half) { if (*up>half) *residue=7; else *residue+=5; /* add sticky bit */ } else { /* digits<=0) { /* special for Quantize/Subnormal :-( */ *dn->lsu=0; /* .. result is 0 */ dn->digits=1; /* .. */ } else { /* shift to least */ count=set->digits; /* now digits to end up with */ dn->digits=count; /* set the new length */ up++; /* move to next */ /* on unit boundary, so shift-down copy loop is simple */ for (target=dn->lsu; count>0; target++, up++, count-=DECDPUN) *target=*up; } } /* unit-boundary case */ else { /* discard digit is in low digit(s), and not top digit */ uInt discard1; /* first discarded digit */ uInt quot, rem; /* for divisions */ if (cut==0) quot=*up; /* is at bottom of unit */ else /* cut>0 */ { /* it's not at bottom of unit */ #if DECDPUN<=4 U_ASSERT(/* cut >= 0 &&*/ cut <= 4); quot=QUOT10(*up, cut); rem=*up-quot*powers[cut]; #else rem=*up%powers[cut]; quot=*up/powers[cut]; #endif if (rem!=0) *residue=1; } /* discard digit is now at bottom of quot */ #if DECDPUN<=4 temp=(quot*6554)>>16; /* fast /10 */ /* Vowels algorithm here not a win (9 instructions) */ discard1=quot-X10(temp); quot=temp; #else discard1=quot%10; quot=quot/10; #endif /* here, discard1 is the guard digit, and residue is everything */ /* else [use mapping array to accumulate residue safely] */ *residue+=resmap[discard1]; cut++; /* 
update cut */ /* here: up -> Unit of the array with bottom digit */ /* cut is the division point for each Unit */ /* quot holds the uncut high-order digits for the current unit */ if (set->digits<=0) { /* special for Quantize/Subnormal :-( */ *dn->lsu=0; /* .. result is 0 */ dn->digits=1; /* .. */ } else { /* shift to least needed */ count=set->digits; /* now digits to end up with */ dn->digits=count; /* set the new length */ /* shift-copy the coefficient array to the result number */ for (target=dn->lsu; ; target++) { *target=(Unit)quot; count-=(DECDPUN-cut); if (count<=0) break; up++; quot=*up; #if DECDPUN<=4 quot=QUOT10(quot, cut); rem=*up-quot*powers[cut]; #else rem=quot%powers[cut]; quot=quot/powers[cut]; #endif *target=(Unit)(*target+rem*powers[DECDPUN-cut]); count-=cut; if (count<=0) break; } /* shift-copy loop */ } /* shift to least */ } /* not unit boundary */ if (*residue!=0) *status|=DEC_Inexact; /* record inexactitude */ return; } /* decSetCoeff */ /* ------------------------------------------------------------------ */ /* decApplyRound -- apply pending rounding to a number */ /* */ /* dn is the number, with space for set->digits digits */ /* set is the context [for size and rounding mode] */ /* residue indicates pending rounding, being any accumulated */ /* guard and sticky information. It may be: */ /* 6-9: rounding digit is >5 */ /* 5: rounding digit is exactly half-way */ /* 1-4: rounding digit is <5 and >0 */ /* 0: the coefficient is exact */ /* -1: as 1, but the hidden digits are subtractive, that */ /* is, of the opposite sign to dn. In this case the */ /* coefficient must be non-0. This case occurs when */ /* subtracting a small number (which can be reduced to */ /* a sticky bit); see decAddOp. */ /* status is the status accumulator, as usual */ /* */ /* This routine applies rounding while keeping the length of the */ /* coefficient constant. 
The exponent and status are unchanged */ /* except if: */ /* */ /* -- the coefficient was increased and is all nines (in which */ /* case Overflow could occur, and is handled directly here so */ /* the caller does not need to re-test for overflow) */ /* */ /* -- the coefficient was decreased and becomes all nines (in which */ /* case Underflow could occur, and is also handled directly). */ /* */ /* All fields in dn are updated as required. */ /* */ /* ------------------------------------------------------------------ */ static void decApplyRound(decNumber *dn, decContext *set, Int residue, uInt *status) { Int bump; /* 1 if coefficient needs to be incremented */ /* -1 if coefficient needs to be decremented */ if (residue==0) return; /* nothing to apply */ bump=0; /* assume a smooth ride */ /* now decide whether, and how, to round, depending on mode */ switch (set->round) { case DEC_ROUND_05UP: { /* round zero or five up (for reround) */ /* This is the same as DEC_ROUND_DOWN unless there is a */ /* positive residue and the lsd of dn is 0 or 5, in which case */ /* it is bumped; when residue is <0, the number is therefore */ /* bumped down unless the final digit was 1 or 6 (in which */ /* case it is bumped down and then up -- a no-op) */ Int lsd5=*dn->lsu%5; /* get lsd and quintate */ if (residue<0 && lsd5!=1) bump=-1; else if (residue>0 && lsd5==0) bump=1; /* [bump==1 could be applied directly; use common path for clarity] */ break;} /* r-05 */ case DEC_ROUND_DOWN: { /* no change, except if negative residue */ if (residue<0) bump=-1; break;} /* r-d */ case DEC_ROUND_HALF_DOWN: { if (residue>5) bump=1; break;} /* r-h-d */ case DEC_ROUND_HALF_EVEN: { if (residue>5) bump=1; /* >0.5 goes up */ else if (residue==5) { /* exactly 0.5000... 
*/ /* 0.5 goes up iff [new] lsd is odd */ if (*dn->lsu & 0x01) bump=1; } break;} /* r-h-e */ case DEC_ROUND_HALF_UP: { if (residue>=5) bump=1; break;} /* r-h-u */ case DEC_ROUND_UP: { if (residue>0) bump=1; break;} /* r-u */ case DEC_ROUND_CEILING: { /* same as _UP for positive numbers, and as _DOWN for negatives */ /* [negative residue cannot occur on 0] */ if (decNumberIsNegative(dn)) { if (residue<0) bump=-1; } else { if (residue>0) bump=1; } break;} /* r-c */ case DEC_ROUND_FLOOR: { /* same as _UP for negative numbers, and as _DOWN for positive */ /* [negative residue cannot occur on 0] */ if (!decNumberIsNegative(dn)) { if (residue<0) bump=-1; } else { if (residue>0) bump=1; } break;} /* r-f */ default: { /* e.g., DEC_ROUND_MAX */ *status|=DEC_Invalid_context; #if DECTRACE || (DECCHECK && DECVERB) printf("Unknown rounding mode: %d\n", set->round); #endif break;} } /* switch */ /* now bump the number, up or down, if need be */ if (bump==0) return; /* no action required */ /* Simply use decUnitAddSub unless bumping up and the number is */ /* all nines. In this special case set to 100... explicitly */ /* and adjust the exponent by one (as otherwise could overflow */ /* the array) */ /* Similarly handle all-nines result if bumping down. */ if (bump>0) { Unit *up; /* work */ uInt count=dn->digits; /* digits to be checked */ for (up=dn->lsu; ; up++) { if (count<=DECDPUN) { /* this is the last Unit (the msu) */ if (*up!=powers[count]-1) break; /* not still 9s */ /* here if it, too, is all nines */ *up=(Unit)powers[count-1]; /* here 999 -> 100 etc. */ for (up=up-1; up>=dn->lsu; up--) *up=0; /* others all to 0 */ dn->exponent++; /* and bump exponent */ /* [which, very rarely, could cause Overflow...] 
*/ if ((dn->exponent+dn->digits)>set->emax+1) { decSetOverflow(dn, set, status); } return; /* done */ } /* a full unit to check, with more to come */ if (*up!=DECDPUNMAX) break; /* not still 9s */ count-=DECDPUN; } /* up */ } /* bump>0 */ else { /* -1 */ /* here checking for a pre-bump of 1000... (leading 1, all */ /* other digits zero) */ Unit *up, *sup; /* work */ uInt count=dn->digits; /* digits to be checked */ for (up=dn->lsu; ; up++) { if (count<=DECDPUN) { /* this is the last Unit (the msu) */ if (*up!=powers[count-1]) break; /* not 100.. */ /* here if have the 1000... case */ sup=up; /* save msu pointer */ *up=(Unit)powers[count]-1; /* here 100 in msu -> 999 */ /* others all to all-nines, too */ for (up=up-1; up>=dn->lsu; up--) *up=(Unit)powers[DECDPUN]-1; dn->exponent--; /* and bump exponent */ /* iff the number was at the subnormal boundary (exponent=etiny) */ /* then the exponent is now out of range, so it will in fact get */ /* clamped to etiny and the final 9 dropped. */ /* printf(">> emin=%d exp=%d sdig=%d\n", set->emin, */ /* dn->exponent, set->digits); */ if (dn->exponent+1==set->emin-set->digits+1) { if (count==1 && dn->digits==1) *sup=0; /* here 9 -> 0[.9] */ else { *sup=(Unit)powers[count-1]-1; /* here 999.. in msu -> 99.. */ dn->digits--; } dn->exponent++; *status|=DEC_Underflow | DEC_Subnormal | DEC_Inexact | DEC_Rounded; } return; /* done */ } /* a full unit to check, with more to come */ if (*up!=0) break; /* not still 0s */ count-=DECDPUN; } /* up */ } /* bump<0 */ /* Actual bump needed. Do it. */ decUnitAddSub(dn->lsu, D2U(dn->digits), uarrone, 1, 0, dn->lsu, bump); } /* decApplyRound */ #if DECSUBSET /* ------------------------------------------------------------------ */ /* decFinish -- finish processing a number */ /* */ /* dn is the number */ /* set is the context */ /* residue is the rounding accumulator (as in decApplyRound) */ /* status is the accumulator */ /* */ /* This finishes off the current number by: */ /* 1. 
If not extended: */ /* a. Converting a zero result to clean '0' */ /* b. Reducing positive exponents to 0, if would fit in digits */ /* 2. Checking for overflow and subnormals (always) */ /* Note this is just Finalize when no subset arithmetic. */ /* All fields are updated as required. */ /* ------------------------------------------------------------------ */ static void decFinish(decNumber *dn, decContext *set, Int *residue, uInt *status) { if (!set->extended) { if ISZERO(dn) { /* value is zero */ dn->exponent=0; /* clean exponent .. */ dn->bits=0; /* .. and sign */ return; /* no error possible */ } if (dn->exponent>=0) { /* non-negative exponent */ /* >0; reduce to integer if possible */ if (set->digits >= (dn->exponent+dn->digits)) { dn->digits=decShiftToMost(dn->lsu, dn->digits, dn->exponent); dn->exponent=0; } } } /* !extended */ decFinalize(dn, set, residue, status); } /* decFinish */ #endif /* ------------------------------------------------------------------ */ /* decFinalize -- final check, clamp, and round of a number */ /* */ /* dn is the number */ /* set is the context */ /* residue is the rounding accumulator (as in decApplyRound) */ /* status is the status accumulator */ /* */ /* This finishes off the current number by checking for subnormal */ /* results, applying any pending rounding, checking for overflow, */ /* and applying any clamping. */ /* Underflow and overflow conditions are raised as appropriate. */ /* All fields are updated as required. */ /* ------------------------------------------------------------------ */ static void decFinalize(decNumber *dn, decContext *set, Int *residue, uInt *status) { Int shift; /* shift needed if clamping */ Int tinyexp=set->emin-dn->digits+1; /* precalculate subnormal boundary */ /* Must be careful, here, when checking the exponent as the */ /* adjusted exponent could overflow 31 bits [because it may already */ /* be up to twice the expected]. */ /* First test for subnormal. 
This must be done before any final */ /* round as the result could be rounded to Nmin or 0. */ if (dn->exponent<=tinyexp) { /* prefilter */ Int comp; decNumber nmin; /* A very nasty case here is dn == Nmin and residue<0 */ if (dn->exponentemin; comp=decCompare(dn, &nmin, 1); /* (signless compare) */ if (comp==BADINT) { /* oops */ *status|=DEC_Insufficient_storage; /* abandon... */ return; } if (*residue<0 && comp==0) { /* neg residue and dn==Nmin */ decApplyRound(dn, set, *residue, status); /* might force down */ decSetSubnormal(dn, set, residue, status); return; } } /* now apply any pending round (this could raise overflow). */ if (*residue!=0) decApplyRound(dn, set, *residue, status); /* Check for overflow [redundant in the 'rare' case] or clamp */ if (dn->exponent<=set->emax-set->digits+1) return; /* neither needed */ /* here when might have an overflow or clamp to do */ if (dn->exponent>set->emax-dn->digits+1) { /* too big */ decSetOverflow(dn, set, status); return; } /* here when the result is normal but in clamp range */ if (!set->clamp) return; /* here when need to apply the IEEE exponent clamp (fold-down) */ shift=dn->exponent-(set->emax-set->digits+1); /* shift coefficient (if non-zero) */ if (!ISZERO(dn)) { dn->digits=decShiftToMost(dn->lsu, dn->digits, shift); } dn->exponent-=shift; /* adjust the exponent to match */ *status|=DEC_Clamped; /* and record the dirty deed */ return; } /* decFinalize */ /* ------------------------------------------------------------------ */ /* decSetOverflow -- set number to proper overflow value */ /* */ /* dn is the number (used for sign [only] and result) */ /* set is the context [used for the rounding mode, etc.] */ /* status contains the current status to be updated */ /* */ /* This sets the sign of a number and sets its value to either */ /* Infinity or the maximum finite value, depending on the sign of */ /* dn and the rounding mode, following IEEE 754 rules. 
*/ /* ------------------------------------------------------------------ */ static void decSetOverflow(decNumber *dn, decContext *set, uInt *status) { Flag needmax=0; /* result is maximum finite value */ uByte sign=dn->bits&DECNEG; /* clean and save sign bit */ if (ISZERO(dn)) { /* zero does not overflow magnitude */ Int emax=set->emax; /* limit value */ if (set->clamp) emax-=set->digits-1; /* lower if clamping */ if (dn->exponent>emax) { /* clamp required */ dn->exponent=emax; *status|=DEC_Clamped; } return; } uprv_decNumberZero(dn); switch (set->round) { case DEC_ROUND_DOWN: { needmax=1; /* never Infinity */ break;} /* r-d */ case DEC_ROUND_05UP: { needmax=1; /* never Infinity */ break;} /* r-05 */ case DEC_ROUND_CEILING: { if (sign) needmax=1; /* Infinity if non-negative */ break;} /* r-c */ case DEC_ROUND_FLOOR: { if (!sign) needmax=1; /* Infinity if negative */ break;} /* r-f */ default: break; /* Infinity in all other cases */ } if (needmax) { decSetMaxValue(dn, set); dn->bits=sign; /* set sign */ } else dn->bits=sign|DECINF; /* Value is +/-Infinity */ *status|=DEC_Overflow | DEC_Inexact | DEC_Rounded; } /* decSetOverflow */ /* ------------------------------------------------------------------ */ /* decSetMaxValue -- set number to +Nmax (maximum normal value) */ /* */ /* dn is the number to set */ /* set is the context [used for digits and emax] */ /* */ /* This sets the number to the maximum positive value. 
*/ /* ------------------------------------------------------------------ */ static void decSetMaxValue(decNumber *dn, decContext *set) { Unit *up; /* work */ Int count=set->digits; /* nines to add */ dn->digits=count; /* fill in all nines to set maximum value */ for (up=dn->lsu; ; up++) { if (count>DECDPUN) *up=DECDPUNMAX; /* unit full o'nines */ else { /* this is the msu */ *up=(Unit)(powers[count]-1); break; } count-=DECDPUN; /* filled those digits */ } /* up */ dn->bits=0; /* + sign */ dn->exponent=set->emax-set->digits+1; } /* decSetMaxValue */ /* ------------------------------------------------------------------ */ /* decSetSubnormal -- process value whose exponent is extended) { uprv_decNumberZero(dn); /* always full overflow */ *status|=DEC_Underflow | DEC_Subnormal | DEC_Inexact | DEC_Rounded; return; } #endif /* Full arithmetic -- allow subnormals, rounded to minimum exponent */ /* (Etiny) if needed */ etiny=set->emin-(set->digits-1); /* smallest allowed exponent */ if ISZERO(dn) { /* value is zero */ /* residue can never be non-zero here */ #if DECCHECK if (*residue!=0) { printf("++ Subnormal 0 residue %ld\n", (LI)*residue); *status|=DEC_Invalid_operation; } #endif if (dn->exponentexponent=etiny; *status|=DEC_Clamped; } return; } *status|=DEC_Subnormal; /* have a non-zero subnormal */ adjust=etiny-dn->exponent; /* calculate digits to remove */ if (adjust<=0) { /* not out of range; unrounded */ /* residue can never be non-zero here, except in the Nmin-residue */ /* case (which is a subnormal result), so can take fast-path here */ /* it may already be inexact (from setting the coefficient) */ if (*status&DEC_Inexact) *status|=DEC_Underflow; return; } /* adjust>0, so need to rescale the result so exponent becomes Etiny */ /* [this code is similar to that in rescale] */ workset=*set; /* clone rounding, etc. 
*/ workset.digits=dn->digits-adjust; /* set requested length */ workset.emin-=adjust; /* and adjust emin to match */ /* [note that the latter can be <1, here, similar to Rescale case] */ decSetCoeff(dn, &workset, dn->lsu, dn->digits, residue, status); decApplyRound(dn, &workset, *residue, status); /* Use 754 default rule: Underflow is set iff Inexact */ /* [independent of whether trapped] */ if (*status&DEC_Inexact) *status|=DEC_Underflow; /* if rounded up a 999s case, exponent will be off by one; adjust */ /* back if so [it will fit, because it was shortened earlier] */ if (dn->exponent>etiny) { dn->digits=decShiftToMost(dn->lsu, dn->digits, 1); dn->exponent--; /* (re)adjust the exponent. */ } /* if rounded to zero, it is by definition clamped... */ if (ISZERO(dn)) *status|=DEC_Clamped; } /* decSetSubnormal */ /* ------------------------------------------------------------------ */ /* decCheckMath - check entry conditions for a math function */ /* */ /* This checks the context and the operand */ /* */ /* rhs is the operand to check */ /* set is the context to check */ /* status is unchanged if both are good */ /* */ /* returns non-zero if status is changed, 0 otherwise */ /* */ /* Restrictions enforced: */ /* */ /* digits, emax, and -emin in the context must be less than */ /* DEC_MAX_MATH (999999), and A must be within these bounds if */ /* non-zero. Invalid_operation is set in the status if a */ /* restriction is violated. 
*/ /* ------------------------------------------------------------------ */ static uInt decCheckMath(const decNumber *rhs, decContext *set, uInt *status) { uInt save=*status; /* record */ if (set->digits>DEC_MAX_MATH || set->emax>DEC_MAX_MATH || -set->emin>DEC_MAX_MATH) *status|=DEC_Invalid_context; else if ((rhs->digits>DEC_MAX_MATH || rhs->exponent+rhs->digits>DEC_MAX_MATH+1 || rhs->exponent+rhs->digits<2*(1-DEC_MAX_MATH)) && !ISZERO(rhs)) *status|=DEC_Invalid_operation; return (*status!=save); } /* decCheckMath */ /* ------------------------------------------------------------------ */ /* decGetInt -- get integer from a number */ /* */ /* dn is the number [which will not be altered] */ /* */ /* returns one of: */ /* BADINT if there is a non-zero fraction */ /* the converted integer */ /* BIGEVEN if the integer is even and magnitude > 2*10**9 */ /* BIGODD if the integer is odd and magnitude > 2*10**9 */ /* */ /* This checks and gets a whole number from the input decNumber. */ /* The sign can be determined from dn by the caller when BIGEVEN or */ /* BIGODD is returned. 
*/ /* ------------------------------------------------------------------ */ static Int decGetInt(const decNumber *dn) { Int theInt; /* result accumulator */ const Unit *up; /* work */ Int got; /* digits (real or not) processed */ Int ilength=dn->digits+dn->exponent; /* integral length */ Flag neg=decNumberIsNegative(dn); /* 1 if -ve */ /* The number must be an integer that fits in 10 digits */ /* Assert, here, that 10 is enough for any rescale Etiny */ #if DEC_MAX_EMAX > 999999999 #error GetInt may need updating [for Emax] #endif #if DEC_MIN_EMIN < -999999999 #error GetInt may need updating [for Emin] #endif if (ISZERO(dn)) return 0; /* zeros are OK, with any exponent */ up=dn->lsu; /* ready for lsu */ theInt=0; /* ready to accumulate */ if (dn->exponent>=0) { /* relatively easy */ /* no fractional part [usual]; allow for positive exponent */ got=dn->exponent; } else { /* -ve exponent; some fractional part to check and discard */ Int count=-dn->exponent; /* digits to discard */ /* spin up whole units until reach the Unit with the unit digit */ for (; count>=DECDPUN; up++) { if (*up!=0) return BADINT; /* non-zero Unit to discard */ count-=DECDPUN; } if (count==0) got=0; /* [a multiple of DECDPUN] */ else { /* [not multiple of DECDPUN] */ Int rem; /* work */ /* slice off fraction digits and check for non-zero */ #if DECDPUN<=4 theInt=QUOT10(*up, count); rem=*up-theInt*powers[count]; #else rem=*up%powers[count]; /* slice off discards */ theInt=*up/powers[count]; #endif if (rem!=0) return BADINT; /* non-zero fraction */ /* it looks good */ got=DECDPUN-count; /* number of digits so far */ up++; /* ready for next */ } } /* now it's known there's no fractional part */ /* tricky code now, to accumulate up to 9.3 digits */ if (got==0) {theInt=*up; got+=DECDPUN; up++;} /* ensure lsu is there */ if (ilength<11) { Int save=theInt; /* collect any remaining unit(s) */ for (; got1999999997) ilength=11; else if (!neg && theInt>999999999) ilength=11; if (ilength==11) theInt=save; 
/* restore correct low bit */ } } if (ilength>10) { /* too big */ if (theInt&1) return BIGODD; /* bottom bit 1 */ return BIGEVEN; /* bottom bit 0 */ } if (neg) theInt=-theInt; /* apply sign */ return theInt; } /* decGetInt */ /* ------------------------------------------------------------------ */ /* decDecap -- decapitate the coefficient of a number */ /* */ /* dn is the number to be decapitated */ /* drop is the number of digits to be removed from the left of dn; */ /* this must be <= dn->digits (if equal, the coefficient is */ /* set to 0) */ /* */ /* Returns dn; dn->digits will be <= the initial digits less drop */ /* (after removing drop digits there may be leading zero digits */ /* which will also be removed). Only dn->lsu and dn->digits change. */ /* ------------------------------------------------------------------ */ static decNumber *decDecap(decNumber *dn, Int drop) { Unit *msu; /* -> target cut point */ Int cut; /* work */ if (drop>=dn->digits) { /* losing the whole thing */ #if DECCHECK if (drop>dn->digits) printf("decDecap called with drop>digits [%ld>%ld]\n", (LI)drop, (LI)dn->digits); #endif dn->lsu[0]=0; dn->digits=1; return dn; } msu=dn->lsu+D2U(dn->digits-drop)-1; /* -> likely msu */ cut=MSUDIGITS(dn->digits-drop); /* digits to be in use in msu */ if (cut!=DECDPUN) *msu%=powers[cut]; /* clear left digits */ /* that may have left leading zero digits, so do a proper count... 
*/ dn->digits=decGetDigits(dn->lsu, static_cast(msu-dn->lsu+1)); return dn; } /* decDecap */ /* ------------------------------------------------------------------ */ /* decBiStr -- compare string with pairwise options */ /* */ /* targ is the string to compare */ /* str1 is one of the strings to compare against (length may be 0) */ /* str2 is the other; it must be the same length as str1 */ /* */ /* returns 1 if strings compare equal, (that is, it is the same */ /* length as str1 and str2, and each character of targ is in either */ /* str1 or str2 in the corresponding position), or 0 otherwise */ /* */ /* This is used for generic caseless compare, including the awkward */ /* case of the Turkish dotted and dotless Is. Use as (for example): */ /* if (decBiStr(test, "mike", "MIKE")) ... */ /* ------------------------------------------------------------------ */ static Flag decBiStr(const char *targ, const char *str1, const char *str2) { for (;;targ++, str1++, str2++) { if (*targ!=*str1 && *targ!=*str2) return 0; /* *targ has a match in one (or both, if terminator) */ if (*targ=='\0') break; } /* forever */ return 1; } /* decBiStr */ /* ------------------------------------------------------------------ */ /* decNaNs -- handle NaN operand or operands */ /* */ /* res is the result number */ /* lhs is the first operand */ /* rhs is the second operand, or nullptr if none */ /* context is used to limit payload length */ /* status contains the current status */ /* returns res in case convenient */ /* */ /* Called when one or both operands is a NaN, and propagates the */ /* appropriate result to res. When an sNaN is found, it is changed */ /* to a qNaN and Invalid operation is set. 
*/ /* ------------------------------------------------------------------ */ static decNumber * decNaNs(decNumber *res, const decNumber *lhs, const decNumber *rhs, decContext *set, uInt *status) { /* This decision tree ends up with LHS being the source pointer, */ /* and status updated if need be */ if (lhs->bits & DECSNAN) *status|=DEC_Invalid_operation | DEC_sNaN; else if (rhs==nullptr); else if (rhs->bits & DECSNAN) { lhs=rhs; *status|=DEC_Invalid_operation | DEC_sNaN; } else if (lhs->bits & DECNAN); else lhs=rhs; /* propagate the payload */ if (lhs->digits<=set->digits) uprv_decNumberCopy(res, lhs); /* easy */ else { /* too long */ const Unit *ul; Unit *ur, *uresp1; /* copy safe number of units, then decapitate */ res->bits=lhs->bits; /* need sign etc. */ uresp1=res->lsu+D2U(set->digits); for (ur=res->lsu, ul=lhs->lsu; urdigits=D2U(set->digits)*DECDPUN; /* maybe still too long */ if (res->digits>set->digits) decDecap(res, res->digits-set->digits); } res->bits&=~DECSNAN; /* convert any sNaN to NaN, while */ res->bits|=DECNAN; /* .. preserving sign */ res->exponent=0; /* clean exponent */ /* [coefficient was copied/decapitated] */ return res; } /* decNaNs */ /* ------------------------------------------------------------------ */ /* decStatus -- apply non-zero status */ /* */ /* dn is the number to set if error */ /* status contains the current status (not yet in context) */ /* set is the context */ /* */ /* If the status is an error status, the number is set to a NaN, */ /* unless the error was an overflow, divide-by-zero, or underflow, */ /* in which case the number will have already been set. */ /* */ /* The context status is then updated with the new status. Note that */ /* this may raise a signal, so control may never return from this */ /* routine (hence resources must be recovered before it is called). 
*/ /* ------------------------------------------------------------------ */ static void decStatus(decNumber *dn, uInt status, decContext *set) { if (status & DEC_NaNs) { /* error status -> NaN */ /* if cause was an sNaN, clear and propagate [NaN is already set up] */ if (status & DEC_sNaN) status&=~DEC_sNaN; else { uprv_decNumberZero(dn); /* other error: clean throughout */ dn->bits=DECNAN; /* and make a quiet NaN */ } } uprv_decContextSetStatus(set, status); /* [may not return] */ return; } /* decStatus */ /* ------------------------------------------------------------------ */ /* decGetDigits -- count digits in a Units array */ /* */ /* uar is the Unit array holding the number (this is often an */ /* accumulator of some sort) */ /* len is the length of the array in units [>=1] */ /* */ /* returns the number of (significant) digits in the array */ /* */ /* All leading zeros are excluded, except the last if the array has */ /* only zero Units. */ /* ------------------------------------------------------------------ */ /* This may be called twice during some operations. */ static Int decGetDigits(Unit *uar, Int len) { Unit *up=uar+(len-1); /* -> msu */ Int digits=(len-1)*DECDPUN+1; /* possible digits excluding msu */ #if DECDPUN>4 uInt const *pow; /* work */ #endif /* (at least 1 in final msu) */ #if DECCHECK if (len<1) printf("decGetDigits called with len<1 [%ld]\n", (LI)len); #endif for (; up>=uar; up--) { if (*up==0) { /* unit is all 0s */ if (digits==1) break; /* a zero has one digit */ digits-=DECDPUN; /* adjust for 0 unit */ continue;} /* found the first (most significant) non-zero Unit */ #if DECDPUN>1 /* not done yet */ if (*up<10) break; /* is 1-9 */ digits++; #if DECDPUN>2 /* not done yet */ if (*up<100) break; /* is 10-99 */ digits++; #if DECDPUN>3 /* not done yet */ if (*up<1000) break; /* is 100-999 */ digits++; #if DECDPUN>4 /* count the rest ... 
*/ for (pow=&powers[4]; *up>=*pow; pow++) digits++; #endif #endif #endif #endif break; } /* up */ return digits; } /* decGetDigits */ #if DECTRACE | DECCHECK /* ------------------------------------------------------------------ */ /* decNumberShow -- display a number [debug aid] */ /* dn is the number to show */ /* */ /* Shows: sign, exponent, coefficient (msu first), digits */ /* or: sign, special-value */ /* ------------------------------------------------------------------ */ /* this is public so other modules can use it */ void uprv_decNumberShow(const decNumber *dn) { const Unit *up; /* work */ uInt u, d; /* .. */ Int cut; /* .. */ char isign='+'; /* main sign */ if (dn==nullptr) { printf("nullptr\n"); return;} if (decNumberIsNegative(dn)) isign='-'; printf(" >> %c ", isign); if (dn->bits&DECSPECIAL) { /* Is a special value */ if (decNumberIsInfinite(dn)) printf("Infinity"); else { /* a NaN */ if (dn->bits&DECSNAN) printf("sNaN"); /* signalling NaN */ else printf("NaN"); } /* if coefficient and exponent are 0, no more to do */ if (dn->exponent==0 && dn->digits==1 && *dn->lsu==0) { printf("\n"); return;} /* drop through to report other information */ printf(" "); } /* now carefully display the coefficient */ up=dn->lsu+D2U(dn->digits)-1; /* msu */ printf("%ld", (LI)*up); for (up=up-1; up>=dn->lsu; up--) { u=*up; printf(":"); for (cut=DECDPUN-1; cut>=0; cut--) { d=u/powers[cut]; u-=d*powers[cut]; printf("%ld", (LI)d); } /* cut */ } /* up */ if (dn->exponent!=0) { char esign='+'; if (dn->exponent<0) esign='-'; printf(" E%c%ld", esign, (LI)abs(dn->exponent)); } printf(" [%ld]\n", (LI)dn->digits); } /* decNumberShow */ #endif #if DECTRACE || DECCHECK /* ------------------------------------------------------------------ */ /* decDumpAr -- display a unit array [debug/check aid] */ /* name is a single-character tag name */ /* ar is the array to display */ /* len is the length of the array in Units */ /* 
------------------------------------------------------------------ */ static void decDumpAr(char name, const Unit *ar, Int len) { Int i; const char *spec; #if DECDPUN==9 spec="%09d "; #elif DECDPUN==8 spec="%08d "; #elif DECDPUN==7 spec="%07d "; #elif DECDPUN==6 spec="%06d "; #elif DECDPUN==5 spec="%05d "; #elif DECDPUN==4 spec="%04d "; #elif DECDPUN==3 spec="%03d "; #elif DECDPUN==2 spec="%02d "; #else spec="%d "; #endif printf(" :%c: ", name); for (i=len-1; i>=0; i--) { if (i==len-1) printf("%ld ", (LI)ar[i]); else printf(spec, ar[i]); } printf("\n"); return;} #endif #if DECCHECK /* ------------------------------------------------------------------ */ /* decCheckOperands -- check operand(s) to a routine */ /* res is the result structure (not checked; it will be set to */ /* quiet NaN if error found (and it is not nullptr)) */ /* lhs is the first operand (may be DECUNRESU) */ /* rhs is the second (may be DECUNUSED) */ /* set is the context (may be DECUNCONT) */ /* returns 0 if both operands, and the context are clean, or 1 */ /* otherwise (in which case the context will show an error, */ /* unless nullptr). Note that res is not cleaned; caller should */ /* handle this so res=nullptr case is safe. */ /* The caller is expected to abandon immediately if 1 is returned. 
*/ /* ------------------------------------------------------------------ */ static Flag decCheckOperands(decNumber *res, const decNumber *lhs, const decNumber *rhs, decContext *set) { Flag bad=0; if (set==nullptr) { /* oops; hopeless */ #if DECTRACE || DECVERB printf("Reference to context is nullptr.\n"); #endif bad=1; return 1;} else if (set!=DECUNCONT && (set->digits<1 || set->round>=DEC_ROUND_MAX)) { bad=1; #if DECTRACE || DECVERB printf("Bad context [digits=%ld round=%ld].\n", (LI)set->digits, (LI)set->round); #endif } else { if (res==nullptr) { bad=1; #if DECTRACE /* this one not DECVERB as standard tests include nullptr */ printf("Reference to result is nullptr.\n"); #endif } if (!bad && lhs!=DECUNUSED) bad=(decCheckNumber(lhs)); if (!bad && rhs!=DECUNUSED) bad=(decCheckNumber(rhs)); } if (bad) { if (set!=DECUNCONT) uprv_decContextSetStatus(set, DEC_Invalid_operation); if (res!=DECUNRESU && res!=nullptr) { uprv_decNumberZero(res); res->bits=DECNAN; /* qNaN */ } } return bad; } /* decCheckOperands */ /* ------------------------------------------------------------------ */ /* decCheckNumber -- check a number */ /* dn is the number to check */ /* returns 0 if the number is clean, or 1 otherwise */ /* */ /* The number is considered valid if it could be a result from some */ /* operation in some valid context. */ /* ------------------------------------------------------------------ */ static Flag decCheckNumber(const decNumber *dn) { const Unit *up; /* work */ uInt maxuint; /* .. */ Int ae, d, digits; /* .. */ Int emin, emax; /* .. 
*/ if (dn==nullptr) { /* hopeless */ #if DECTRACE /* this one not DECVERB as standard tests include nullptr */ printf("Reference to decNumber is nullptr.\n"); #endif return 1;} /* check special values */ if (dn->bits & DECSPECIAL) { if (dn->exponent!=0) { #if DECTRACE || DECVERB printf("Exponent %ld (not 0) for a special value [%02x].\n", (LI)dn->exponent, dn->bits); #endif return 1;} /* 2003.09.08: NaNs may now have coefficients, so next tests Inf only */ if (decNumberIsInfinite(dn)) { if (dn->digits!=1) { #if DECTRACE || DECVERB printf("Digits %ld (not 1) for an infinity.\n", (LI)dn->digits); #endif return 1;} if (*dn->lsu!=0) { #if DECTRACE || DECVERB printf("LSU %ld (not 0) for an infinity.\n", (LI)*dn->lsu); #endif decDumpAr('I', dn->lsu, D2U(dn->digits)); return 1;} } /* Inf */ /* 2002.12.26: negative NaNs can now appear through proposed IEEE */ /* concrete formats (decimal64, etc.). */ return 0; } /* check the coefficient */ if (dn->digits<1 || dn->digits>DECNUMMAXP) { #if DECTRACE || DECVERB printf("Digits %ld in number.\n", (LI)dn->digits); #endif return 1;} d=dn->digits; for (up=dn->lsu; d>0; up++) { if (d>DECDPUN) maxuint=DECDPUNMAX; else { /* reached the msu */ maxuint=powers[d]-1; if (dn->digits>1 && *upmaxuint) { #if DECTRACE || DECVERB printf("Bad Unit [%08lx] in %ld-digit number at offset %ld [maxuint %ld].\n", (LI)*up, (LI)dn->digits, (LI)(up-dn->lsu), (LI)maxuint); #endif return 1;} d-=DECDPUN; } /* check the exponent. Note that input operands can have exponents */ /* which are out of the set->emin/set->emax and set->digits range */ /* (just as they can have more digits than set->digits). 
*/ ae=dn->exponent+dn->digits-1; /* adjusted exponent */ emax=DECNUMMAXE; emin=DECNUMMINE; digits=DECNUMMAXP; if (ae+emax) { #if DECTRACE || DECVERB printf("Adjusted exponent overflow [%ld].\n", (LI)ae); uprv_decNumberShow(dn); #endif return 1;} return 0; /* it's OK */ } /* decCheckNumber */ /* ------------------------------------------------------------------ */ /* decCheckInexact -- check a normal finite inexact result has digits */ /* dn is the number to check */ /* set is the context (for status and precision) */ /* sets Invalid operation, etc., if some digits are missing */ /* [this check is not made for DECSUBSET compilation or when */ /* subnormal is not set] */ /* ------------------------------------------------------------------ */ static void decCheckInexact(const decNumber *dn, decContext *set) { #if !DECSUBSET && DECEXTFLAG if ((set->status & (DEC_Inexact|DEC_Subnormal))==DEC_Inexact && (set->digits!=dn->digits) && !(dn->bits & DECSPECIAL)) { #if DECTRACE || DECVERB printf("Insufficient digits [%ld] on normal Inexact result.\n", (LI)dn->digits); uprv_decNumberShow(dn); #endif uprv_decContextSetStatus(set, DEC_Invalid_operation); } #else /* next is a noop for quiet compiler */ if (dn!=nullptr && dn->digits==0) set->status|=DEC_Invalid_operation; #endif return; } /* decCheckInexact */ #endif #if DECALLOC #undef malloc #undef free /* ------------------------------------------------------------------ */ /* decMalloc -- accountable allocation routine */ /* n is the number of bytes to allocate */ /* */ /* Semantics is the same as the stdlib malloc routine, but bytes */ /* allocated are accounted for globally, and corruption fences are */ /* added before and after the 'actual' storage. 
*/ /* ------------------------------------------------------------------ */ /* This routine allocates storage with an extra twelve bytes; 8 are */ /* at the start and hold: */ /* 0-3 the original length requested */ /* 4-7 buffer corruption detection fence (DECFENCE, x4) */ /* The 4 bytes at the end also hold a corruption fence (DECFENCE, x4) */ /* ------------------------------------------------------------------ */ static void *decMalloc(size_t n) { uInt size=n+12; /* true size */ void *alloc; /* -> allocated storage */ uByte *b, *b0; /* work */ uInt uiwork; /* for macros */ alloc=malloc(size); /* -> allocated storage */ if (alloc==nullptr) return nullptr; /* out of strorage */ b0=(uByte *)alloc; /* as bytes */ decAllocBytes+=n; /* account for storage */ UBFROMUI(alloc, n); /* save n */ /* printf(" alloc ++ dAB: %ld (%ld)\n", (LI)decAllocBytes, (LI)n); */ for (b=b0+4; b play area */ } /* decMalloc */ /* ------------------------------------------------------------------ */ /* decFree -- accountable free routine */ /* alloc is the storage to free */ /* */ /* Semantics is the same as the stdlib malloc routine, except that */ /* the global storage accounting is updated and the fences are */ /* checked to ensure that no routine has written 'out of bounds'. */ /* ------------------------------------------------------------------ */ /* This routine first checks that the fences have not been corrupted. */ /* It then frees the storage using the 'truw' storage address (that */ /* is, offset by 8). 
*/ /* ------------------------------------------------------------------ */ static void decFree(void *alloc) { uInt n; /* original length */ uByte *b, *b0; /* work */ uInt uiwork; /* for macros */ if (alloc==nullptr) return; /* allowed; it's a nop */ b0=(uByte *)alloc; /* as bytes */ b0-=8; /* -> true start of storage */ n=UBTOUI(b0); /* lift length */ for (b=b0+4; b= endpoints.end - endpoints.start) { UPRV_UNREACHABLE_EXIT; } return pattern.charAt(endpoints.start + index); } int32_t ParsedPatternInfo::length(int32_t flags) const { return getLengthFromEndpoints(getEndpoints(flags)); } int32_t ParsedPatternInfo::getLengthFromEndpoints(const Endpoints& endpoints) { return endpoints.end - endpoints.start; } UnicodeString ParsedPatternInfo::getString(int32_t flags) const { const Endpoints& endpoints = getEndpoints(flags); if (endpoints.start == endpoints.end) { return UnicodeString(); } // Create a new UnicodeString return UnicodeString(pattern, endpoints.start, endpoints.end - endpoints.start); } const Endpoints& ParsedPatternInfo::getEndpoints(int32_t flags) const { bool prefix = (flags & AFFIX_PREFIX) != 0; bool isNegative = (flags & AFFIX_NEGATIVE_SUBPATTERN) != 0; bool padding = (flags & AFFIX_PADDING) != 0; if (isNegative && padding) { return negative.paddingEndpoints; } else if (padding) { return positive.paddingEndpoints; } else if (prefix && isNegative) { return negative.prefixEndpoints; } else if (prefix) { return positive.prefixEndpoints; } else if (isNegative) { return negative.suffixEndpoints; } else { return positive.suffixEndpoints; } } bool ParsedPatternInfo::positiveHasPlusSign() const { return positive.hasPlusSign; } bool ParsedPatternInfo::hasNegativeSubpattern() const { return fHasNegativeSubpattern; } bool ParsedPatternInfo::negativeHasMinusSign() const { return negative.hasMinusSign; } bool ParsedPatternInfo::hasCurrencySign() const { return positive.hasCurrencySign || (fHasNegativeSubpattern && negative.hasCurrencySign); } bool 
ParsedPatternInfo::containsSymbolType(AffixPatternType type, UErrorCode& status) const { return AffixUtils::containsType(pattern, type, status); } bool ParsedPatternInfo::hasBody() const { return positive.integerTotal > 0; } bool ParsedPatternInfo::currencyAsDecimal() const { return positive.hasCurrencyDecimal; } ///////////////////////////////////////////////////// /// BEGIN RECURSIVE DESCENT PARSER IMPLEMENTATION /// ///////////////////////////////////////////////////// UChar32 ParsedPatternInfo::ParserState::peek() { if (offset == pattern.length()) { return -1; } else { return pattern.char32At(offset); } } UChar32 ParsedPatternInfo::ParserState::peek2() { if (offset == pattern.length()) { return -1; } int32_t cp1 = pattern.char32At(offset); int32_t offset2 = offset + U16_LENGTH(cp1); if (offset2 == pattern.length()) { return -1; } return pattern.char32At(offset2); } UChar32 ParsedPatternInfo::ParserState::next() { int32_t codePoint = peek(); offset += U16_LENGTH(codePoint); return codePoint; } void ParsedPatternInfo::consumePattern(const UnicodeString& patternString, UErrorCode& status) { if (U_FAILURE(status)) { return; } this->pattern = patternString; // This class is not intended for writing twice! // Use move assignment to overwrite instead. U_ASSERT(state.offset == 0); // pattern := subpattern (';' subpattern)? currentSubpattern = &positive; consumeSubpattern(status); if (U_FAILURE(status)) { return; } if (state.peek() == u';') { state.next(); // consume the ';' // Don't consume the negative subpattern if it is empty (trailing ';') if (state.peek() != -1) { fHasNegativeSubpattern = true; currentSubpattern = &negative; consumeSubpattern(status); if (U_FAILURE(status)) { return; } } } if (state.peek() != -1) { state.toParseException(u"Found unquoted special character"); status = U_UNQUOTED_SPECIAL; } } void ParsedPatternInfo::consumeSubpattern(UErrorCode& status) { // subpattern := literals? number exponent? literals? 
consumePadding(PadPosition::UNUM_PAD_BEFORE_PREFIX, status); if (U_FAILURE(status)) { return; } consumeAffix(currentSubpattern->prefixEndpoints, status); if (U_FAILURE(status)) { return; } consumePadding(PadPosition::UNUM_PAD_AFTER_PREFIX, status); if (U_FAILURE(status)) { return; } consumeFormat(status); if (U_FAILURE(status)) { return; } consumeExponent(status); if (U_FAILURE(status)) { return; } consumePadding(PadPosition::UNUM_PAD_BEFORE_SUFFIX, status); if (U_FAILURE(status)) { return; } consumeAffix(currentSubpattern->suffixEndpoints, status); if (U_FAILURE(status)) { return; } consumePadding(PadPosition::UNUM_PAD_AFTER_SUFFIX, status); if (U_FAILURE(status)) { return; } } void ParsedPatternInfo::consumePadding(PadPosition paddingLocation, UErrorCode& status) { if (state.peek() != u'*') { return; } if (currentSubpattern->hasPadding) { state.toParseException(u"Cannot have multiple pad specifiers"); status = U_MULTIPLE_PAD_SPECIFIERS; return; } currentSubpattern->paddingLocation = paddingLocation; currentSubpattern->hasPadding = true; state.next(); // consume the '*' currentSubpattern->paddingEndpoints.start = state.offset; consumeLiteral(status); currentSubpattern->paddingEndpoints.end = state.offset; } void ParsedPatternInfo::consumeAffix(Endpoints& endpoints, UErrorCode& status) { // literals := { literal } endpoints.start = state.offset; while (true) { switch (state.peek()) { case u'#': case u'@': case u';': case u'*': case u'.': case u',': case u'0': case u'1': case u'2': case u'3': case u'4': case u'5': case u'6': case u'7': case u'8': case u'9': case -1: // Characters that cannot appear unquoted in a literal // break outer; goto after_outer; case u'%': currentSubpattern->hasPercentSign = true; break; case u'‰': currentSubpattern->hasPerMilleSign = true; break; case u'¤': currentSubpattern->hasCurrencySign = true; break; case u'-': currentSubpattern->hasMinusSign = true; break; case u'+': currentSubpattern->hasPlusSign = true; break; default: break; } 
consumeLiteral(status); if (U_FAILURE(status)) { return; } } after_outer: endpoints.end = state.offset; } void ParsedPatternInfo::consumeLiteral(UErrorCode& status) { if (state.peek() == -1) { state.toParseException(u"Expected unquoted literal but found EOL"); status = U_PATTERN_SYNTAX_ERROR; return; } else if (state.peek() == u'\'') { state.next(); // consume the starting quote while (state.peek() != u'\'') { if (state.peek() == -1) { state.toParseException(u"Expected quoted literal but found EOL"); status = U_PATTERN_SYNTAX_ERROR; return; } else { state.next(); // consume a quoted character } } state.next(); // consume the ending quote } else { // consume a non-quoted literal character state.next(); } } void ParsedPatternInfo::consumeFormat(UErrorCode& status) { consumeIntegerFormat(status); if (U_FAILURE(status)) { return; } if (state.peek() == u'.') { state.next(); // consume the decimal point currentSubpattern->hasDecimal = true; currentSubpattern->widthExceptAffixes += 1; consumeFractionFormat(status); if (U_FAILURE(status)) { return; } } else if (state.peek() == u'¤') { // Check if currency is a decimal separator switch (state.peek2()) { case u'#': case u'0': case u'1': case u'2': case u'3': case u'4': case u'5': case u'6': case u'7': case u'8': case u'9': break; default: // Currency symbol followed by a non-numeric character; // treat as a normal affix. return; } // Currency symbol is followed by a numeric character; // treat as a decimal separator. 
currentSubpattern->hasCurrencySign = true; currentSubpattern->hasCurrencyDecimal = true; currentSubpattern->hasDecimal = true; currentSubpattern->widthExceptAffixes += 1; state.next(); // consume the symbol consumeFractionFormat(status); if (U_FAILURE(status)) { return; } } } void ParsedPatternInfo::consumeIntegerFormat(UErrorCode& status) { // Convenience reference: ParsedSubpatternInfo& result = *currentSubpattern; while (true) { switch (state.peek()) { case u',': result.widthExceptAffixes += 1; result.groupingSizes <<= 16; break; case u'#': if (result.integerNumerals > 0) { state.toParseException(u"# cannot follow 0 before decimal point"); status = U_UNEXPECTED_TOKEN; return; } result.widthExceptAffixes += 1; result.groupingSizes += 1; if (result.integerAtSigns > 0) { result.integerTrailingHashSigns += 1; } else { result.integerLeadingHashSigns += 1; } result.integerTotal += 1; break; case u'@': if (result.integerNumerals > 0) { state.toParseException(u"Cannot mix 0 and @"); status = U_UNEXPECTED_TOKEN; return; } if (result.integerTrailingHashSigns > 0) { state.toParseException(u"Cannot nest # inside of a run of @"); status = U_UNEXPECTED_TOKEN; return; } result.widthExceptAffixes += 1; result.groupingSizes += 1; result.integerAtSigns += 1; result.integerTotal += 1; break; case u'0': case u'1': case u'2': case u'3': case u'4': case u'5': case u'6': case u'7': case u'8': case u'9': if (result.integerAtSigns > 0) { state.toParseException(u"Cannot mix @ and 0"); status = U_UNEXPECTED_TOKEN; return; } result.widthExceptAffixes += 1; result.groupingSizes += 1; result.integerNumerals += 1; result.integerTotal += 1; if (!result.rounding.isZeroish() || state.peek() != u'0') { result.rounding.appendDigit(static_cast(state.peek() - u'0'), 0, true); } break; default: goto after_outer; } state.next(); // consume the symbol } after_outer: // Disallow patterns with a trailing ',' or with two ',' next to each other auto grouping1 = static_cast (result.groupingSizes & 0xffff); 
auto grouping2 = static_cast ((result.groupingSizes >> 16) & 0xffff); auto grouping3 = static_cast ((result.groupingSizes >> 32) & 0xffff); if (grouping1 == 0 && grouping2 != -1) { state.toParseException(u"Trailing grouping separator is invalid"); status = U_UNEXPECTED_TOKEN; return; } if (grouping2 == 0 && grouping3 != -1) { state.toParseException(u"Grouping width of zero is invalid"); status = U_PATTERN_SYNTAX_ERROR; return; } } void ParsedPatternInfo::consumeFractionFormat(UErrorCode& status) { // Convenience reference: ParsedSubpatternInfo& result = *currentSubpattern; int32_t zeroCounter = 0; while (true) { switch (state.peek()) { case u'#': result.widthExceptAffixes += 1; result.fractionHashSigns += 1; result.fractionTotal += 1; zeroCounter++; break; case u'0': case u'1': case u'2': case u'3': case u'4': case u'5': case u'6': case u'7': case u'8': case u'9': if (result.fractionHashSigns > 0) { state.toParseException(u"0 cannot follow # after decimal point"); status = U_UNEXPECTED_TOKEN; return; } result.widthExceptAffixes += 1; result.fractionNumerals += 1; result.fractionTotal += 1; if (state.peek() == u'0') { zeroCounter++; } else { result.rounding .appendDigit(static_cast(state.peek() - u'0'), zeroCounter, false); zeroCounter = 0; } break; default: return; } state.next(); // consume the symbol } } void ParsedPatternInfo::consumeExponent(UErrorCode& status) { // Convenience reference: ParsedSubpatternInfo& result = *currentSubpattern; if (state.peek() != u'E') { return; } if ((result.groupingSizes & 0xffff0000L) != 0xffff0000L) { state.toParseException(u"Cannot have grouping separator in scientific notation"); status = U_MALFORMED_EXPONENTIAL_PATTERN; return; } state.next(); // consume the E result.widthExceptAffixes++; if (state.peek() == u'+') { state.next(); // consume the + result.exponentHasPlusSign = true; result.widthExceptAffixes++; } while (state.peek() == u'0') { state.next(); // consume the 0 result.exponentZeros += 1; 
result.widthExceptAffixes++; } } /////////////////////////////////////////////////// /// END RECURSIVE DESCENT PARSER IMPLEMENTATION /// /////////////////////////////////////////////////// void PatternParser::parseToExistingPropertiesImpl(const UnicodeString& pattern, DecimalFormatProperties& properties, IgnoreRounding ignoreRounding, UErrorCode& status) { if (pattern.length() == 0) { // Backwards compatibility requires that we reset to the default values. // TODO: Only overwrite the properties that "saveToProperties" normally touches? properties.clear(); return; } ParsedPatternInfo patternInfo; parseToPatternInfo(pattern, patternInfo, status); if (U_FAILURE(status)) { return; } patternInfoToProperties(properties, patternInfo, ignoreRounding, status); } void PatternParser::patternInfoToProperties(DecimalFormatProperties& properties, ParsedPatternInfo& patternInfo, IgnoreRounding _ignoreRounding, UErrorCode& status) { // Translate from PatternParseResult to Properties. // Note that most data from "negative" is ignored per the specification of DecimalFormat. 
const ParsedSubpatternInfo& positive = patternInfo.positive; bool ignoreRounding; if (_ignoreRounding == IGNORE_ROUNDING_NEVER) { ignoreRounding = false; } else if (_ignoreRounding == IGNORE_ROUNDING_IF_CURRENCY) { ignoreRounding = positive.hasCurrencySign; } else { U_ASSERT(_ignoreRounding == IGNORE_ROUNDING_ALWAYS); ignoreRounding = true; } // Grouping settings auto grouping1 = static_cast (positive.groupingSizes & 0xffff); auto grouping2 = static_cast ((positive.groupingSizes >> 16) & 0xffff); auto grouping3 = static_cast ((positive.groupingSizes >> 32) & 0xffff); if (grouping2 != -1) { properties.groupingSize = grouping1; properties.groupingUsed = true; } else { properties.groupingSize = -1; properties.groupingUsed = false; } if (grouping3 != -1) { properties.secondaryGroupingSize = grouping2; } else { properties.secondaryGroupingSize = -1; } // For backwards compatibility, require that the pattern emit at least one min digit. int minInt, minFrac; if (positive.integerTotal == 0 && positive.fractionTotal > 0) { // patterns like ".##" minInt = 0; minFrac = uprv_max(1, positive.fractionNumerals); } else if (positive.integerNumerals == 0 && positive.fractionNumerals == 0) { // patterns like "#.##" minInt = 1; minFrac = 0; } else { minInt = positive.integerNumerals; minFrac = positive.fractionNumerals; } // Rounding settings // Don't set basic rounding when there is a currency sign; defer to CurrencyUsage if (positive.integerAtSigns > 0) { properties.minimumFractionDigits = -1; properties.maximumFractionDigits = -1; properties.roundingIncrement = 0.0; properties.minimumSignificantDigits = positive.integerAtSigns; properties.maximumSignificantDigits = positive.integerAtSigns + positive.integerTrailingHashSigns; } else if (!positive.rounding.isZeroish()) { if (!ignoreRounding) { properties.minimumFractionDigits = minFrac; properties.maximumFractionDigits = positive.fractionTotal; properties.roundingIncrement = positive.rounding.toDouble(); } else { 
properties.minimumFractionDigits = -1; properties.maximumFractionDigits = -1; properties.roundingIncrement = 0.0; } properties.minimumSignificantDigits = -1; properties.maximumSignificantDigits = -1; } else { if (!ignoreRounding) { properties.minimumFractionDigits = minFrac; properties.maximumFractionDigits = positive.fractionTotal; properties.roundingIncrement = 0.0; } else { properties.minimumFractionDigits = -1; properties.maximumFractionDigits = -1; properties.roundingIncrement = 0.0; } properties.minimumSignificantDigits = -1; properties.maximumSignificantDigits = -1; } // If the pattern ends with a '.' then force the decimal point. if (positive.hasDecimal && positive.fractionTotal == 0) { properties.decimalSeparatorAlwaysShown = true; } else { properties.decimalSeparatorAlwaysShown = false; } // Persist the currency as decimal separator properties.currencyAsDecimal = positive.hasCurrencyDecimal; // Scientific notation settings if (positive.exponentZeros > 0) { properties.exponentSignAlwaysShown = positive.exponentHasPlusSign; properties.minimumExponentDigits = positive.exponentZeros; if (positive.integerAtSigns == 0) { // patterns without '@' can define max integer digits, used for engineering notation properties.minimumIntegerDigits = positive.integerNumerals; properties.maximumIntegerDigits = positive.integerTotal; } else { // patterns with '@' cannot define max integer digits properties.minimumIntegerDigits = 1; properties.maximumIntegerDigits = -1; } } else { properties.exponentSignAlwaysShown = false; properties.minimumExponentDigits = -1; properties.minimumIntegerDigits = minInt; properties.maximumIntegerDigits = -1; } // Compute the affix patterns (required for both padding and affixes) UnicodeString posPrefix = patternInfo.getString(AffixPatternProvider::AFFIX_PREFIX); UnicodeString posSuffix = patternInfo.getString(0); // Padding settings if (positive.hasPadding) { // The width of the positive prefix and suffix templates are included in the padding 
int paddingWidth = positive.widthExceptAffixes + AffixUtils::estimateLength(posPrefix, status) + AffixUtils::estimateLength(posSuffix, status); properties.formatWidth = paddingWidth; UnicodeString rawPaddingString = patternInfo.getString(AffixPatternProvider::AFFIX_PADDING); if (rawPaddingString.length() == 1) { properties.padString = rawPaddingString; } else if (rawPaddingString.length() == 2) { if (rawPaddingString.charAt(0) == u'\'') { properties.padString.setTo(u"'", -1); } else { properties.padString = rawPaddingString; } } else { properties.padString = UnicodeString(rawPaddingString, 1, rawPaddingString.length() - 2); } properties.padPosition = positive.paddingLocation; } else { properties.formatWidth = -1; properties.padString.setToBogus(); properties.padPosition.nullify(); } // Set the affixes // Always call the setter, even if the prefixes are empty, especially in the case of the // negative prefix pattern, to prevent default values from overriding the pattern. properties.positivePrefixPattern = posPrefix; properties.positiveSuffixPattern = posSuffix; if (patternInfo.fHasNegativeSubpattern) { properties.negativePrefixPattern = patternInfo.getString( AffixPatternProvider::AFFIX_NEGATIVE_SUBPATTERN | AffixPatternProvider::AFFIX_PREFIX); properties.negativeSuffixPattern = patternInfo.getString( AffixPatternProvider::AFFIX_NEGATIVE_SUBPATTERN); } else { properties.negativePrefixPattern.setToBogus(); properties.negativeSuffixPattern.setToBogus(); } // Set the magnitude multiplier if (positive.hasPercentSign) { properties.magnitudeMultiplier = 2; } else if (positive.hasPerMilleSign) { properties.magnitudeMultiplier = 3; } else { properties.magnitudeMultiplier = 0; } } /////////////////////////////////////////////////////////////////// /// End PatternStringParser.java; begin PatternStringUtils.java /// /////////////////////////////////////////////////////////////////// // Determine whether a given roundingIncrement should be ignored for formatting // based on the 
current maxFrac value (maximum fraction digits). For example a // roundingIncrement of 0.01 should be ignored if maxFrac is 1, but not if maxFrac // is 2 or more. Note that roundingIncrements are rounded in significance, so // a roundingIncrement of 0.006 is treated like 0.01 for this determination, i.e. // it should not be ignored if maxFrac is 2 or more (but a roundingIncrement of // 0.005 is treated like 0.001 for significance). This is the reason for the // initial doubling below. // roundIncr must be non-zero. bool PatternStringUtils::ignoreRoundingIncrement(double roundIncr, int32_t maxFrac) { if (maxFrac < 0) { return false; } int32_t frac = 0; roundIncr *= 2.0; for (frac = 0; frac <= maxFrac && roundIncr <= 1.0; frac++, roundIncr *= 10.0); return (frac > maxFrac); } UnicodeString PatternStringUtils::propertiesToPatternString(const DecimalFormatProperties& properties, UErrorCode& status) { UnicodeString sb; // Convenience references // The uprv_min() calls prevent DoS int32_t dosMax = 100; int32_t grouping1 = uprv_max(0, uprv_min(properties.groupingSize, dosMax)); int32_t grouping2 = uprv_max(0, uprv_min(properties.secondaryGroupingSize, dosMax)); bool useGrouping = properties.groupingUsed; int32_t paddingWidth = uprv_min(properties.formatWidth, dosMax); NullableValue paddingLocation = properties.padPosition; UnicodeString paddingString = properties.padString; int32_t minInt = uprv_max(0, uprv_min(properties.minimumIntegerDigits, dosMax)); int32_t maxInt = uprv_min(properties.maximumIntegerDigits, dosMax); int32_t minFrac = uprv_max(0, uprv_min(properties.minimumFractionDigits, dosMax)); int32_t maxFrac = uprv_min(properties.maximumFractionDigits, dosMax); int32_t minSig = uprv_min(properties.minimumSignificantDigits, dosMax); int32_t maxSig = uprv_min(properties.maximumSignificantDigits, dosMax); bool alwaysShowDecimal = properties.decimalSeparatorAlwaysShown; int32_t exponentDigits = uprv_min(properties.minimumExponentDigits, dosMax); bool 
exponentShowPlusSign = properties.exponentSignAlwaysShown; AutoAffixPatternProvider affixProvider(properties, status); // Prefixes sb.append(affixProvider.get().getString(AffixPatternProvider::AFFIX_POS_PREFIX)); int32_t afterPrefixPos = sb.length(); // Figure out the grouping sizes. if (!useGrouping) { grouping1 = 0; grouping2 = 0; } else if (grouping1 == grouping2) { grouping1 = 0; } int32_t groupingLength = grouping1 + grouping2 + 1; // Figure out the digits we need to put in the pattern. double increment = properties.roundingIncrement; UnicodeString digitsString; int32_t digitsStringScale = 0; if (maxSig != uprv_min(dosMax, -1)) { // Significant Digits. while (digitsString.length() < minSig) { digitsString.append(u'@'); } while (digitsString.length() < maxSig) { digitsString.append(u'#'); } } else if (increment != 0.0 && !ignoreRoundingIncrement(increment,maxFrac)) { // Rounding Increment. DecimalQuantity incrementQuantity; incrementQuantity.setToDouble(increment); incrementQuantity.roundToInfinity(); digitsStringScale = incrementQuantity.getLowerDisplayMagnitude(); incrementQuantity.adjustMagnitude(-digitsStringScale); incrementQuantity.setMinInteger(minInt - digitsStringScale); UnicodeString str = incrementQuantity.toPlainString(); if (str.charAt(0) == u'-') { // TODO: Unsupported operation exception or fail silently? digitsString.append(str, 1, str.length() - 1); } else { digitsString.append(str); } } while (digitsString.length() + digitsStringScale < minInt) { digitsString.insert(0, u'0'); } while (-digitsStringScale < minFrac) { digitsString.append(u'0'); digitsStringScale--; } // Write the digits to the string builder int32_t m0 = uprv_max(groupingLength, digitsString.length() + digitsStringScale); m0 = (maxInt != dosMax) ? uprv_max(maxInt, m0) - 1 : m0 - 1; int32_t mN = (maxFrac != dosMax) ? 
uprv_min(-maxFrac, digitsStringScale) : digitsStringScale; for (int32_t magnitude = m0; magnitude >= mN; magnitude--) { int32_t di = digitsString.length() + digitsStringScale - magnitude - 1; if (di < 0 || di >= digitsString.length()) { sb.append(u'#'); } else { sb.append(digitsString.charAt(di)); } // Decimal separator if (magnitude == 0 && (alwaysShowDecimal || mN < 0)) { if (properties.currencyAsDecimal) { sb.append(u'¤'); } else { sb.append(u'.'); } } if (!useGrouping) { continue; } // Least-significant grouping separator if (magnitude > 0 && magnitude == grouping1) { sb.append(u','); } // All other grouping separators if (magnitude > grouping1 && grouping2 > 0 && (magnitude - grouping1) % grouping2 == 0) { sb.append(u','); } } // Exponential notation if (exponentDigits != uprv_min(dosMax, -1)) { sb.append(u'E'); if (exponentShowPlusSign) { sb.append(u'+'); } for (int32_t i = 0; i < exponentDigits; i++) { sb.append(u'0'); } } // Suffixes int32_t beforeSuffixPos = sb.length(); sb.append(affixProvider.get().getString(AffixPatternProvider::AFFIX_POS_SUFFIX)); // Resolve Padding if (paddingWidth > 0 && !paddingLocation.isNull()) { while (paddingWidth - sb.length() > 0) { sb.insert(afterPrefixPos, u'#'); beforeSuffixPos++; } int32_t addedLength; switch (paddingLocation.get(status)) { case PadPosition::UNUM_PAD_BEFORE_PREFIX: addedLength = escapePaddingString(paddingString, sb, 0, status); sb.insert(0, u'*'); afterPrefixPos += addedLength + 1; beforeSuffixPos += addedLength + 1; break; case PadPosition::UNUM_PAD_AFTER_PREFIX: addedLength = escapePaddingString(paddingString, sb, afterPrefixPos, status); sb.insert(afterPrefixPos, u'*'); afterPrefixPos += addedLength + 1; beforeSuffixPos += addedLength + 1; break; case PadPosition::UNUM_PAD_BEFORE_SUFFIX: escapePaddingString(paddingString, sb, beforeSuffixPos, status); sb.insert(beforeSuffixPos, u'*'); break; case PadPosition::UNUM_PAD_AFTER_SUFFIX: sb.append(u'*'); escapePaddingString(paddingString, sb, sb.length(), 
status); break; } if (U_FAILURE(status)) { return sb; } } // Negative affixes // Ignore if the negative prefix pattern is "-" and the negative suffix is empty if (affixProvider.get().hasNegativeSubpattern()) { sb.append(u';'); sb.append(affixProvider.get().getString(AffixPatternProvider::AFFIX_NEG_PREFIX)); // Copy the positive digit format into the negative. // This is optional; the pattern is the same as if '#' were appended here instead. // NOTE: It is not safe to append the UnicodeString to itself, so we need to copy. // See https://unicode-org.atlassian.net/browse/ICU-13707 UnicodeString copy(sb); sb.append(copy, afterPrefixPos, beforeSuffixPos - afterPrefixPos); sb.append(affixProvider.get().getString(AffixPatternProvider::AFFIX_NEG_SUFFIX)); } return sb; } int PatternStringUtils::escapePaddingString(UnicodeString input, UnicodeString& output, int startIndex, UErrorCode& status) { (void) status; if (input.length() == 0) { input.setTo(kFallbackPaddingString, -1); } int startLength = output.length(); if (input.length() == 1) { if (input.compare(u"'", -1) == 0) { output.insert(startIndex, u"''", -1); } else { output.insert(startIndex, input); } } else { output.insert(startIndex, u'\''); int offset = 1; for (int i = 0; i < input.length(); i++) { // it's okay to deal in chars here because the quote mark is the only interesting thing. char16_t ch = input.charAt(i); if (ch == u'\'') { output.insert(startIndex + offset, u"''", -1); offset += 2; } else { output.insert(startIndex + offset, ch); offset += 1; } } output.insert(startIndex + offset, u'\''); } return output.length() - startLength; } UnicodeString PatternStringUtils::convertLocalized(const UnicodeString& input, const DecimalFormatSymbols& symbols, bool toLocalized, UErrorCode& status) { // Construct a table of strings to be converted between localized and standard. static constexpr int32_t LEN = 21; UnicodeString table[LEN][2]; int standIdx = toLocalized ? 0 : 1; int localIdx = toLocalized ? 
1 : 0; // TODO: Add approximately sign here? table[0][standIdx] = u"%"; table[0][localIdx] = symbols.getConstSymbol(DecimalFormatSymbols::kPercentSymbol); table[1][standIdx] = u"‰"; table[1][localIdx] = symbols.getConstSymbol(DecimalFormatSymbols::kPerMillSymbol); table[2][standIdx] = u"."; table[2][localIdx] = symbols.getConstSymbol(DecimalFormatSymbols::kDecimalSeparatorSymbol); table[3][standIdx] = u","; table[3][localIdx] = symbols.getConstSymbol(DecimalFormatSymbols::kGroupingSeparatorSymbol); table[4][standIdx] = u"-"; table[4][localIdx] = symbols.getConstSymbol(DecimalFormatSymbols::kMinusSignSymbol); table[5][standIdx] = u"+"; table[5][localIdx] = symbols.getConstSymbol(DecimalFormatSymbols::kPlusSignSymbol); table[6][standIdx] = u";"; table[6][localIdx] = symbols.getConstSymbol(DecimalFormatSymbols::kPatternSeparatorSymbol); table[7][standIdx] = u"@"; table[7][localIdx] = symbols.getConstSymbol(DecimalFormatSymbols::kSignificantDigitSymbol); table[8][standIdx] = u"E"; table[8][localIdx] = symbols.getConstSymbol(DecimalFormatSymbols::kExponentialSymbol); table[9][standIdx] = u"*"; table[9][localIdx] = symbols.getConstSymbol(DecimalFormatSymbols::kPadEscapeSymbol); table[10][standIdx] = u"#"; table[10][localIdx] = symbols.getConstSymbol(DecimalFormatSymbols::kDigitSymbol); for (int i = 0; i < 10; i++) { table[11 + i][standIdx] = u'0' + i; table[11 + i][localIdx] = symbols.getConstDigitSymbol(i); } // Special case: quotes are NOT allowed to be in any localIdx strings. // Substitute them with '’' instead. for (int32_t i = 0; i < LEN; i++) { table[i][localIdx].findAndReplace(u'\'', u'’'); } // Iterate through the string and convert. 
// State table: // 0 => base state // 1 => first char inside a quoted sequence in input and output string // 2 => inside a quoted sequence in input and output string // 3 => first char after a close quote in input string; // close quote still needs to be written to output string // 4 => base state in input string; inside quoted sequence in output string // 5 => first char inside a quoted sequence in input string; // inside quoted sequence in output string UnicodeString result; int state = 0; for (int offset = 0; offset < input.length(); offset++) { char16_t ch = input.charAt(offset); // Handle a quote character (state shift) if (ch == u'\'') { if (state == 0) { result.append(u'\''); state = 1; continue; } else if (state == 1) { result.append(u'\''); state = 0; continue; } else if (state == 2) { state = 3; continue; } else if (state == 3) { result.append(u'\''); result.append(u'\''); state = 1; continue; } else if (state == 4) { state = 5; continue; } else { U_ASSERT(state == 5); result.append(u'\''); result.append(u'\''); state = 4; continue; } } if (state == 0 || state == 3 || state == 4) { for (auto& pair : table) { // Perform a greedy match on this symbol string UnicodeString temp = input.tempSubString(offset, pair[0].length()); if (temp == pair[0]) { // Skip ahead past this region for the next iteration offset += pair[0].length() - 1; if (state == 3 || state == 4) { result.append(u'\''); state = 0; } result.append(pair[1]); goto continue_outer; } } // No replacement found. Check if a special quote is necessary for (auto& pair : table) { UnicodeString temp = input.tempSubString(offset, pair[1].length()); if (temp == pair[1]) { if (state == 0) { result.append(u'\''); state = 4; } result.append(ch); goto continue_outer; } } // Still nothing. Copy the char verbatim. 
(Add a close quote if necessary) if (state == 3 || state == 4) { result.append(u'\''); state = 0; } result.append(ch); } else { U_ASSERT(state == 1 || state == 2 || state == 5); result.append(ch); state = 2; } continue_outer:; } // Resolve final quotes if (state == 3 || state == 4) { result.append(u'\''); state = 0; } if (state != 0) { // Malformed localized pattern: unterminated quote status = U_PATTERN_SYNTAX_ERROR; } return result; } void PatternStringUtils::patternInfoToStringBuilder(const AffixPatternProvider& patternInfo, bool isPrefix, PatternSignType patternSignType, bool approximately, StandardPlural::Form plural, bool perMilleReplacesPercent, bool dropCurrencySymbols, UnicodeString& output) { // Should the output render '+' where '-' would normally appear in the pattern? bool plusReplacesMinusSign = (patternSignType == PATTERN_SIGN_TYPE_POS_SIGN) && !patternInfo.positiveHasPlusSign(); // Should we use the affix from the negative subpattern? // (If not, we will use the positive subpattern.) bool useNegativeAffixPattern = patternInfo.hasNegativeSubpattern() && (patternSignType == PATTERN_SIGN_TYPE_NEG || (patternInfo.negativeHasMinusSign() && (plusReplacesMinusSign || approximately))); // Resolve the flags for the affix pattern. int flags = 0; if (useNegativeAffixPattern) { flags |= AffixPatternProvider::AFFIX_NEGATIVE_SUBPATTERN; } if (isPrefix) { flags |= AffixPatternProvider::AFFIX_PREFIX; } if (plural != StandardPlural::Form::COUNT) { U_ASSERT(plural == (AffixPatternProvider::AFFIX_PLURAL_MASK & plural)); flags |= plural; } // Should we prepend a sign to the pattern? bool prependSign; if (!isPrefix || useNegativeAffixPattern) { prependSign = false; } else if (patternSignType == PATTERN_SIGN_TYPE_NEG) { prependSign = true; } else { prependSign = plusReplacesMinusSign || approximately; } // What symbols should take the place of the sign placeholder? 
const char16_t* signSymbols = u"-"; if (approximately) { if (plusReplacesMinusSign) { signSymbols = u"~+"; } else if (patternSignType == PATTERN_SIGN_TYPE_NEG) { signSymbols = u"~-"; } else { signSymbols = u"~"; } } else if (plusReplacesMinusSign) { signSymbols = u"+"; } // Compute the number of tokens in the affix pattern (signSymbols is considered one token). int length = patternInfo.length(flags) + (prependSign ? 1 : 0); // Finally, set the result into the StringBuilder. output.remove(); for (int index = 0; index < length; index++) { char16_t candidate; if (prependSign && index == 0) { candidate = u'-'; } else if (prependSign) { candidate = patternInfo.charAt(flags, index - 1); } else { candidate = patternInfo.charAt(flags, index); } if (candidate == u'-') { if (u_strlen(signSymbols) == 1) { candidate = signSymbols[0]; } else { output.append(signSymbols[0]); candidate = signSymbols[1]; } } if (perMilleReplacesPercent && candidate == u'%') { candidate = u'‰'; } if (dropCurrencySymbols && candidate == u'\u00A4') { continue; } output.append(candidate); } } PatternSignType PatternStringUtils::resolveSignDisplay(UNumberSignDisplay signDisplay, Signum signum) { switch (signDisplay) { case UNUM_SIGN_AUTO: case UNUM_SIGN_ACCOUNTING: switch (signum) { case SIGNUM_NEG: case SIGNUM_NEG_ZERO: return PATTERN_SIGN_TYPE_NEG; case SIGNUM_POS_ZERO: case SIGNUM_POS: return PATTERN_SIGN_TYPE_POS; default: break; } break; case UNUM_SIGN_ALWAYS: case UNUM_SIGN_ACCOUNTING_ALWAYS: switch (signum) { case SIGNUM_NEG: case SIGNUM_NEG_ZERO: return PATTERN_SIGN_TYPE_NEG; case SIGNUM_POS_ZERO: case SIGNUM_POS: return PATTERN_SIGN_TYPE_POS_SIGN; default: break; } break; case UNUM_SIGN_EXCEPT_ZERO: case UNUM_SIGN_ACCOUNTING_EXCEPT_ZERO: switch (signum) { case SIGNUM_NEG: return PATTERN_SIGN_TYPE_NEG; case SIGNUM_NEG_ZERO: case SIGNUM_POS_ZERO: return PATTERN_SIGN_TYPE_POS; case SIGNUM_POS: return PATTERN_SIGN_TYPE_POS_SIGN; default: break; } break; case UNUM_SIGN_NEGATIVE: case 
UNUM_SIGN_ACCOUNTING_NEGATIVE: switch (signum) { case SIGNUM_NEG: return PATTERN_SIGN_TYPE_NEG; case SIGNUM_NEG_ZERO: case SIGNUM_POS_ZERO: case SIGNUM_POS: return PATTERN_SIGN_TYPE_POS; default: break; } break; case UNUM_SIGN_NEVER: return PATTERN_SIGN_TYPE_POS; default: break; } UPRV_UNREACHABLE_EXIT; return PATTERN_SIGN_TYPE_POS; } #endif /* #if !UCONFIG_NO_FORMATTING */ stringi/src/icu74/i18n/smpdtfst.cpp0000644000176200001440000001017714700200761016605 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ******************************************************************************* * Copyright (C) 2009-2013, International Business Machines Corporation and * * others. All Rights Reserved. * ******************************************************************************* * * This file contains the class SimpleDateFormatStaticSets * * SimpleDateFormatStaticSets holds the UnicodeSets that are needed for lenient * parsing of literal characters in date/time strings. 
******************************************************************************** */ #include "unicode/utypes.h" #if !UCONFIG_NO_FORMATTING #include "unicode/uniset.h" #include "unicode/udat.h" #include "cmemory.h" #include "uassert.h" #include "ucln_in.h" #include "umutex.h" #include "smpdtfst.h" U_NAMESPACE_BEGIN SimpleDateFormatStaticSets *gStaticSets = nullptr; UInitOnce gSimpleDateFormatStaticSetsInitOnce {}; SimpleDateFormatStaticSets::SimpleDateFormatStaticSets(UErrorCode &status) : fDateIgnorables(nullptr), fTimeIgnorables(nullptr), fOtherIgnorables(nullptr) { fDateIgnorables = new UnicodeSet(UNICODE_STRING("[-,./[:whitespace:]]", 20), status); fTimeIgnorables = new UnicodeSet(UNICODE_STRING("[-.:[:whitespace:]]", 19), status); fOtherIgnorables = new UnicodeSet(UNICODE_STRING("[:whitespace:]", 14), status); // Check for null pointers if (fDateIgnorables == nullptr || fTimeIgnorables == nullptr || fOtherIgnorables == nullptr) { goto ExitConstrDeleteAll; } // Freeze all the sets fDateIgnorables->freeze(); fTimeIgnorables->freeze(); fOtherIgnorables->freeze(); return; // If we reached this point, everything is fine so just exit ExitConstrDeleteAll: // Remove all sets and return error delete fDateIgnorables; fDateIgnorables = nullptr; delete fTimeIgnorables; fTimeIgnorables = nullptr; delete fOtherIgnorables; fOtherIgnorables = nullptr; status = U_MEMORY_ALLOCATION_ERROR; } SimpleDateFormatStaticSets::~SimpleDateFormatStaticSets() { delete fDateIgnorables; fDateIgnorables = nullptr; delete fTimeIgnorables; fTimeIgnorables = nullptr; delete fOtherIgnorables; fOtherIgnorables = nullptr; } //------------------------------------------------------------------------------ // // smpdtfmt_cleanup Memory cleanup function, free/delete all // cached memory. Called by ICU's u_cleanup() function. 
// //------------------------------------------------------------------------------ UBool SimpleDateFormatStaticSets::cleanup() { delete gStaticSets; gStaticSets = nullptr; gSimpleDateFormatStaticSetsInitOnce.reset(); return true; } U_CDECL_BEGIN static UBool U_CALLCONV smpdtfmt_cleanup() { return SimpleDateFormatStaticSets::cleanup(); } static void U_CALLCONV smpdtfmt_initSets(UErrorCode &status) { ucln_i18n_registerCleanup(UCLN_I18N_SMPDTFMT, smpdtfmt_cleanup); U_ASSERT(gStaticSets == nullptr); gStaticSets = new SimpleDateFormatStaticSets(status); if (gStaticSets == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; return; } } U_CDECL_END UnicodeSet *SimpleDateFormatStaticSets::getIgnorables(UDateFormatField fieldIndex) { UErrorCode status = U_ZERO_ERROR; umtx_initOnce(gSimpleDateFormatStaticSetsInitOnce, &smpdtfmt_initSets, status); if (U_FAILURE(status)) { return nullptr; } switch (fieldIndex) { case UDAT_YEAR_FIELD: case UDAT_MONTH_FIELD: case UDAT_DATE_FIELD: case UDAT_STANDALONE_DAY_FIELD: case UDAT_STANDALONE_MONTH_FIELD: return gStaticSets->fDateIgnorables; case UDAT_HOUR_OF_DAY1_FIELD: case UDAT_HOUR_OF_DAY0_FIELD: case UDAT_MINUTE_FIELD: case UDAT_SECOND_FIELD: case UDAT_HOUR1_FIELD: case UDAT_HOUR0_FIELD: return gStaticSets->fTimeIgnorables; default: return gStaticSets->fOtherIgnorables; } } U_NAMESPACE_END #endif // #if !UCONFIG_NO_FORMATTING stringi/src/icu74/i18n/toupptrn.h0000644000176200001440000000341114700200761016272 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ********************************************************************** * Copyright (C) 2001-2007, International Business Machines * Corporation and others. All Rights Reserved. ********************************************************************** * Date Name Description * 05/24/01 aliu Creation. 
/**
 * A transliterator that performs locale-sensitive toUpper()
 * case mapping.
 * @author Alan Liu
 */
class UppercaseTransliterator : public CaseMapTransliterator {

 public:

    /**
     * Constructs a transliterator.
     * This constructor takes no arguments.
     */
    UppercaseTransliterator();

    /**
     * Destructor.
     */
    virtual ~UppercaseTransliterator();

    /**
     * Copy constructor.
     */
    UppercaseTransliterator(const UppercaseTransliterator&);

    /**
     * Transliterator API.
     * @return a copy of the object.
     */
    virtual UppercaseTransliterator* clone() const override;

    /**
     * ICU "poor man's RTTI", returns a UClassID for the actual class.
     */
    virtual UClassID getDynamicClassID() const override;

    /**
     * ICU "poor man's RTTI", returns a UClassID for this class.
     */
    U_I18N_API static UClassID U_EXPORT2 getStaticClassID();

private:
    /**
     * Assignment operator.
     * Declared private, so it is not accessible from outside the class.
     */
    UppercaseTransliterator& operator=(const UppercaseTransliterator&);
};
#define UNISTR_FROM_STRING_EXPLICIT #include "number_mapper.h" #include "number_patternstring.h" #include "unicode/errorcode.h" #include "number_utils.h" using namespace icu; using namespace icu::number; using namespace icu::number::impl; UnlocalizedNumberFormatter NumberPropertyMapper::create(const DecimalFormatProperties& properties, const DecimalFormatSymbols& symbols, DecimalFormatWarehouse& warehouse, UErrorCode& status) { return NumberFormatter::with().macros(oldToNew(properties, symbols, warehouse, nullptr, status)); } UnlocalizedNumberFormatter NumberPropertyMapper::create(const DecimalFormatProperties& properties, const DecimalFormatSymbols& symbols, DecimalFormatWarehouse& warehouse, DecimalFormatProperties& exportedProperties, UErrorCode& status) { return NumberFormatter::with().macros( oldToNew( properties, symbols, warehouse, &exportedProperties, status)); } MacroProps NumberPropertyMapper::oldToNew(const DecimalFormatProperties& properties, const DecimalFormatSymbols& symbols, DecimalFormatWarehouse& warehouse, DecimalFormatProperties* exportedProperties, UErrorCode& status) { MacroProps macros; Locale locale = symbols.getLocale(); ///////////// // SYMBOLS // ///////////// macros.symbols.setTo(symbols); ////////////////// // PLURAL RULES // ////////////////// if (!properties.currencyPluralInfo.fPtr.isNull()) { macros.rules = properties.currencyPluralInfo.fPtr->getPluralRules(); } ///////////// // AFFIXES // ///////////// warehouse.affixProvider.setTo(properties, status); macros.affixProvider = &warehouse.affixProvider.get(); /////////// // UNITS // /////////// bool useCurrency = ( !properties.currency.isNull() || !properties.currencyPluralInfo.fPtr.isNull() || !properties.currencyUsage.isNull() || warehouse.affixProvider.get().hasCurrencySign()); CurrencyUnit currency = resolveCurrency(properties, locale, status); UCurrencyUsage currencyUsage = properties.currencyUsage.getOrDefault(UCURR_USAGE_STANDARD); if (useCurrency) { // NOTE: Slicing is OK. 
macros.unit = currency; // NOLINT } /////////////////////// // ROUNDING STRATEGY // /////////////////////// int32_t maxInt = properties.maximumIntegerDigits; int32_t minInt = properties.minimumIntegerDigits; int32_t maxFrac = properties.maximumFractionDigits; int32_t minFrac = properties.minimumFractionDigits; int32_t minSig = properties.minimumSignificantDigits; int32_t maxSig = properties.maximumSignificantDigits; double roundingIncrement = properties.roundingIncrement; // Not assigning directly to macros.roundingMode here: we change // roundingMode if and when we also change macros.precision. RoundingMode roundingMode = properties.roundingMode.getOrDefault(UNUM_ROUND_HALFEVEN); bool explicitMinMaxFrac = minFrac != -1 || maxFrac != -1; bool explicitMinMaxSig = minSig != -1 || maxSig != -1; // Resolve min/max frac for currencies, required for the validation logic and for when minFrac or // maxFrac was // set (but not both) on a currency instance. // NOTE: Increments are handled in "Precision.constructCurrency()". if (useCurrency && (minFrac == -1 || maxFrac == -1)) { int32_t digits = ucurr_getDefaultFractionDigitsForUsage( currency.getISOCurrency(), currencyUsage, &status); if (minFrac == -1 && maxFrac == -1) { minFrac = digits; maxFrac = digits; } else if (minFrac == -1) { minFrac = std::min(maxFrac, digits); } else /* if (maxFrac == -1) */ { maxFrac = std::max(minFrac, digits); } } // Validate min/max int/frac. // For backwards compatibility, minimum overrides maximum if the two conflict. if (minInt == 0 && maxFrac != 0) { minFrac = (minFrac < 0 || (minFrac == 0 && maxInt == 0)) ? 1 : minFrac; maxFrac = maxFrac < 0 ? -1 : maxFrac < minFrac ? minFrac : maxFrac; minInt = 0; maxInt = maxInt < 0 ? -1 : maxInt > kMaxIntFracSig ? -1 : maxInt; } else { // Force a digit before the decimal point. minFrac = minFrac < 0 ? 0 : minFrac; maxFrac = maxFrac < 0 ? -1 : maxFrac < minFrac ? minFrac : maxFrac; minInt = minInt <= 0 ? 1 : minInt > kMaxIntFracSig ? 
1 : minInt; maxInt = maxInt < 0 ? -1 : maxInt < minInt ? minInt : maxInt > kMaxIntFracSig ? -1 : maxInt; } Precision precision; if (!properties.currencyUsage.isNull()) { precision = Precision::constructCurrency(currencyUsage).withCurrency(currency); } else if (roundingIncrement != 0.0) { if (PatternStringUtils::ignoreRoundingIncrement(roundingIncrement, maxFrac)) { precision = Precision::constructFraction(minFrac, maxFrac); } else { // Convert the double increment to an integer increment precision = Precision::increment(roundingIncrement).withMinFraction(minFrac); } } else if (explicitMinMaxSig) { minSig = minSig < 1 ? 1 : minSig > kMaxIntFracSig ? kMaxIntFracSig : minSig; maxSig = maxSig < 0 ? kMaxIntFracSig : maxSig < minSig ? minSig : maxSig > kMaxIntFracSig ? kMaxIntFracSig : maxSig; precision = Precision::constructSignificant(minSig, maxSig); } else if (explicitMinMaxFrac) { precision = Precision::constructFraction(minFrac, maxFrac); } else if (useCurrency) { precision = Precision::constructCurrency(currencyUsage); } if (!precision.isBogus()) { macros.roundingMode = roundingMode; macros.precision = precision; } /////////////////// // INTEGER WIDTH // /////////////////// macros.integerWidth = IntegerWidth( static_cast(minInt), static_cast(maxInt), properties.formatFailIfMoreThanMaxDigits); /////////////////////// // GROUPING STRATEGY // /////////////////////// macros.grouper = Grouper::forProperties(properties); ///////////// // PADDING // ///////////// if (properties.formatWidth > 0) { macros.padder = Padder::forProperties(properties); } /////////////////////////////// // DECIMAL MARK ALWAYS SHOWN // /////////////////////////////// macros.decimal = properties.decimalSeparatorAlwaysShown ? UNUM_DECIMAL_SEPARATOR_ALWAYS : UNUM_DECIMAL_SEPARATOR_AUTO; /////////////////////// // SIGN ALWAYS SHOWN // /////////////////////// macros.sign = properties.signAlwaysShown ? 
UNUM_SIGN_ALWAYS : UNUM_SIGN_AUTO; ///////////////////////// // SCIENTIFIC NOTATION // ///////////////////////// if (properties.minimumExponentDigits != -1) { // Scientific notation is required. // This whole section feels like a hack, but it is needed for regression tests. // The mapping from property bag to scientific notation is nontrivial due to LDML rules. if (maxInt > 8) { // But #13110: The maximum of 8 digits has unknown origins and is not in the spec. // If maxInt is greater than 8, it is set to minInt, even if minInt is greater than 8. maxInt = minInt; macros.integerWidth = IntegerWidth::zeroFillTo(minInt).truncateAt(maxInt); } else if (maxInt > minInt && minInt > 1) { // Bug #13289: if maxInt > minInt > 1, then minInt should be 1. minInt = 1; macros.integerWidth = IntegerWidth::zeroFillTo(minInt).truncateAt(maxInt); } int engineering = maxInt < 0 ? -1 : maxInt; macros.notation = ScientificNotation( // Engineering interval: static_cast(engineering), // Enforce minimum integer digits (for patterns like "000.00E0"): (engineering == minInt), // Minimum exponent digits: static_cast(properties.minimumExponentDigits), // Exponent sign always shown: properties.exponentSignAlwaysShown ? UNUM_SIGN_ALWAYS : UNUM_SIGN_AUTO); // Scientific notation also involves overriding the rounding mode. // TODO: Overriding here is a bit of a hack. Should this logic go earlier? if (macros.precision.fType == Precision::PrecisionType::RND_FRACTION) { // For the purposes of rounding, get the original min/max int/frac, since the local // variables have been manipulated for display purposes. int maxInt_ = properties.maximumIntegerDigits; int minInt_ = properties.minimumIntegerDigits; int minFrac_ = properties.minimumFractionDigits; int maxFrac_ = properties.maximumFractionDigits; if (minInt_ == 0 && maxFrac_ == 0) { // Patterns like "#E0" and "##E0", which mean no rounding! 
macros.precision = Precision::unlimited(); } else if (minInt_ == 0 && minFrac_ == 0) { // Patterns like "#.##E0" (no zeros in the mantissa), which mean round to maxFrac+1 macros.precision = Precision::constructSignificant(1, maxFrac_ + 1); } else { int maxSig_ = minInt_ + maxFrac_; // Bug #20058: if maxInt_ > minInt_ > 1, then minInt_ should be 1. if (maxInt_ > minInt_ && minInt_ > 1) { minInt_ = 1; } int minSig_ = minInt_ + minFrac_; // To avoid regression, maxSig is not reset when minInt_ set to 1. // TODO: Reset maxSig_ = 1 + minFrac_ to follow the spec. macros.precision = Precision::constructSignificant(minSig_, maxSig_); } macros.roundingMode = roundingMode; } } ////////////////////// // COMPACT NOTATION // ////////////////////// if (!properties.compactStyle.isNull()) { if (properties.compactStyle.getNoError() == UNumberCompactStyle::UNUM_LONG) { macros.notation = Notation::compactLong(); } else { macros.notation = Notation::compactShort(); } } ///////////////// // MULTIPLIERS // ///////////////// macros.scale = scaleFromProperties(properties); ////////////////////// // PROPERTY EXPORTS // ////////////////////// if (exportedProperties != nullptr) { exportedProperties->currency = currency; exportedProperties->roundingMode = roundingMode; exportedProperties->minimumIntegerDigits = minInt; exportedProperties->maximumIntegerDigits = maxInt == -1 ? 
INT32_MAX : maxInt; Precision rounding_; if (precision.fType == Precision::PrecisionType::RND_CURRENCY) { rounding_ = precision.withCurrency(currency, status); } else { rounding_ = precision; } int minFrac_ = minFrac; int maxFrac_ = maxFrac; int minSig_ = minSig; int maxSig_ = maxSig; double increment_ = 0.0; if (rounding_.fType == Precision::PrecisionType::RND_FRACTION) { minFrac_ = rounding_.fUnion.fracSig.fMinFrac; maxFrac_ = rounding_.fUnion.fracSig.fMaxFrac; } else if (rounding_.fType == Precision::PrecisionType::RND_INCREMENT || rounding_.fType == Precision::PrecisionType::RND_INCREMENT_ONE || rounding_.fType == Precision::PrecisionType::RND_INCREMENT_FIVE) { minFrac_ = rounding_.fUnion.increment.fMinFrac; // If incrementRounding is used, maxFrac is set equal to minFrac maxFrac_ = rounding_.fUnion.increment.fMinFrac; // Convert the integer increment to a double DecimalQuantity dq; dq.setToLong(rounding_.fUnion.increment.fIncrement); dq.adjustMagnitude(rounding_.fUnion.increment.fIncrementMagnitude); increment_ = dq.toDouble(); } else if (rounding_.fType == Precision::PrecisionType::RND_SIGNIFICANT) { minSig_ = rounding_.fUnion.fracSig.fMinSig; maxSig_ = rounding_.fUnion.fracSig.fMaxSig; } exportedProperties->minimumFractionDigits = minFrac_; exportedProperties->maximumFractionDigits = maxFrac_; exportedProperties->minimumSignificantDigits = minSig_; exportedProperties->maximumSignificantDigits = maxSig_; exportedProperties->roundingIncrement = increment_; } return macros; } void PropertiesAffixPatternProvider::setTo(const DecimalFormatProperties& properties, UErrorCode& status) { fBogus = false; // There are two ways to set affixes in DecimalFormat: via the pattern string (applyPattern), and via the // explicit setters (setPositivePrefix and friends). The way to resolve the settings is as follows: // // 1) If the explicit setting is present for the field, use it. // 2) Otherwise, follows UTS 35 rules based on the pattern string. 
// // Importantly, the explicit setters affect only the one field they override. If you set the positive // prefix, that should not affect the negative prefix. // Convenience: Extract the properties into local variables. // Variables are named with three chars: [p/n][p/s][o/p] // [p/n] => p for positive, n for negative // [p/s] => p for prefix, s for suffix // [o/p] => o for escaped custom override string, p for pattern string UnicodeString ppo = AffixUtils::escape(properties.positivePrefix); UnicodeString pso = AffixUtils::escape(properties.positiveSuffix); UnicodeString npo = AffixUtils::escape(properties.negativePrefix); UnicodeString nso = AffixUtils::escape(properties.negativeSuffix); const UnicodeString& ppp = properties.positivePrefixPattern; const UnicodeString& psp = properties.positiveSuffixPattern; const UnicodeString& npp = properties.negativePrefixPattern; const UnicodeString& nsp = properties.negativeSuffixPattern; if (!properties.positivePrefix.isBogus()) { posPrefix = ppo; } else if (!ppp.isBogus()) { posPrefix = ppp; } else { // UTS 35: Default positive prefix is empty string. posPrefix = u""; } if (!properties.positiveSuffix.isBogus()) { posSuffix = pso; } else if (!psp.isBogus()) { posSuffix = psp; } else { // UTS 35: Default positive suffix is empty string. posSuffix = u""; } if (!properties.negativePrefix.isBogus()) { negPrefix = npo; } else if (!npp.isBogus()) { negPrefix = npp; } else { // UTS 35: Default negative prefix is "-" with positive prefix. // Important: We prepend the "-" to the pattern, not the override! negPrefix = ppp.isBogus() ? u"-" : u"-" + ppp; } if (!properties.negativeSuffix.isBogus()) { negSuffix = nso; } else if (!nsp.isBogus()) { negSuffix = nsp; } else { // UTS 35: Default negative prefix is the positive prefix. negSuffix = psp.isBogus() ? u"" : psp; } // For declaring if this is a currency pattern, we need to look at the // original pattern, not at any user-specified overrides. 
isCurrencyPattern = ( AffixUtils::hasCurrencySymbols(ppp, status) || AffixUtils::hasCurrencySymbols(psp, status) || AffixUtils::hasCurrencySymbols(npp, status) || AffixUtils::hasCurrencySymbols(nsp, status) || properties.currencyAsDecimal); fCurrencyAsDecimal = properties.currencyAsDecimal; } char16_t PropertiesAffixPatternProvider::charAt(int flags, int i) const { return getStringInternal(flags).charAt(i); } int PropertiesAffixPatternProvider::length(int flags) const { return getStringInternal(flags).length(); } UnicodeString PropertiesAffixPatternProvider::getString(int32_t flags) const { return getStringInternal(flags); } const UnicodeString& PropertiesAffixPatternProvider::getStringInternal(int32_t flags) const { bool prefix = (flags & AFFIX_PREFIX) != 0; bool negative = (flags & AFFIX_NEGATIVE_SUBPATTERN) != 0; if (prefix && negative) { return negPrefix; } else if (prefix) { return posPrefix; } else if (negative) { return negSuffix; } else { return posSuffix; } } bool PropertiesAffixPatternProvider::positiveHasPlusSign() const { // TODO: Change the internal APIs to propagate out the error? 
ErrorCode localStatus; return AffixUtils::containsType(posPrefix, TYPE_PLUS_SIGN, localStatus) || AffixUtils::containsType(posSuffix, TYPE_PLUS_SIGN, localStatus); } bool PropertiesAffixPatternProvider::hasNegativeSubpattern() const { return ( (negSuffix != posSuffix) || negPrefix.tempSubString(1) != posPrefix || negPrefix.charAt(0) != u'-' ); } bool PropertiesAffixPatternProvider::negativeHasMinusSign() const { ErrorCode localStatus; return AffixUtils::containsType(negPrefix, TYPE_MINUS_SIGN, localStatus) || AffixUtils::containsType(negSuffix, TYPE_MINUS_SIGN, localStatus); } bool PropertiesAffixPatternProvider::hasCurrencySign() const { return isCurrencyPattern; } bool PropertiesAffixPatternProvider::containsSymbolType(AffixPatternType type, UErrorCode& status) const { return AffixUtils::containsType(posPrefix, type, status) || AffixUtils::containsType(posSuffix, type, status) || AffixUtils::containsType(negPrefix, type, status) || AffixUtils::containsType(negSuffix, type, status); } bool PropertiesAffixPatternProvider::hasBody() const { return true; } bool PropertiesAffixPatternProvider::currencyAsDecimal() const { return fCurrencyAsDecimal; } void CurrencyPluralInfoAffixProvider::setTo(const CurrencyPluralInfo& cpi, const DecimalFormatProperties& properties, UErrorCode& status) { // We need to use a PropertiesAffixPatternProvider, not the simpler version ParsedPatternInfo, // because user-specified affix overrides still need to work. 
fBogus = false; DecimalFormatProperties pluralProperties(properties); for (int32_t plural = 0; plural < StandardPlural::COUNT; plural++) { const char* keyword = StandardPlural::getKeyword(static_cast(plural)); UnicodeString patternString; patternString = cpi.getCurrencyPluralPattern(keyword, patternString); PatternParser::parseToExistingProperties( patternString, pluralProperties, IGNORE_ROUNDING_NEVER, status); affixesByPlural[plural].setTo(pluralProperties, status); } } char16_t CurrencyPluralInfoAffixProvider::charAt(int32_t flags, int32_t i) const { int32_t pluralOrdinal = (flags & AFFIX_PLURAL_MASK); return affixesByPlural[pluralOrdinal].charAt(flags, i); } int32_t CurrencyPluralInfoAffixProvider::length(int32_t flags) const { int32_t pluralOrdinal = (flags & AFFIX_PLURAL_MASK); return affixesByPlural[pluralOrdinal].length(flags); } UnicodeString CurrencyPluralInfoAffixProvider::getString(int32_t flags) const { int32_t pluralOrdinal = (flags & AFFIX_PLURAL_MASK); return affixesByPlural[pluralOrdinal].getString(flags); } bool CurrencyPluralInfoAffixProvider::positiveHasPlusSign() const { return affixesByPlural[StandardPlural::OTHER].positiveHasPlusSign(); } bool CurrencyPluralInfoAffixProvider::hasNegativeSubpattern() const { return affixesByPlural[StandardPlural::OTHER].hasNegativeSubpattern(); } bool CurrencyPluralInfoAffixProvider::negativeHasMinusSign() const { return affixesByPlural[StandardPlural::OTHER].negativeHasMinusSign(); } bool CurrencyPluralInfoAffixProvider::hasCurrencySign() const { return affixesByPlural[StandardPlural::OTHER].hasCurrencySign(); } bool CurrencyPluralInfoAffixProvider::containsSymbolType(AffixPatternType type, UErrorCode& status) const { return affixesByPlural[StandardPlural::OTHER].containsSymbolType(type, status); } bool CurrencyPluralInfoAffixProvider::hasBody() const { return affixesByPlural[StandardPlural::OTHER].hasBody(); } bool CurrencyPluralInfoAffixProvider::currencyAsDecimal() const { return 
affixesByPlural[StandardPlural::OTHER].currencyAsDecimal(); } #endif /* #if !UCONFIG_NO_FORMATTING */ stringi/src/icu74/i18n/units_router.h0000644000176200001440000001450414700200761017146 0ustar liggesusers// © 2020 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html #include "unicode/utypes.h" #if !UCONFIG_NO_FORMATTING #ifndef __UNITS_ROUTER_H__ #define __UNITS_ROUTER_H__ #include #include "cmemory.h" #include "measunit_impl.h" #include "unicode/locid.h" #include "unicode/measunit.h" #include "unicode/stringpiece.h" #include "unicode/uobject.h" #include "units_complexconverter.h" #include "units_data.h" U_NAMESPACE_BEGIN // Forward declarations class Measure; namespace number { class Precision; } namespace units { struct RouteResult : UMemory { // A list of measures: a single measure for single units, multiple measures // for mixed units. MaybeStackVector measures; // The output unit for this RouteResult. This may be a MIXED unit - for // example: "yard-and-foot-and-inch", for which `measures` will have three // elements. MeasureUnitImpl outputUnit; RouteResult(MaybeStackVector measures, MeasureUnitImpl outputUnit) : measures(std::move(measures)), outputUnit(std::move(outputUnit)) {} }; /** * Contains the complex unit converter and the limit which representing the smallest value that the * converter should accept. For example, if the converter is converting to `foot+inch` and the limit * equals 3.0, thus means the converter should not convert to a value less than `3.0 feet`. * * NOTE: * if the limit doest not has a value `i.e. (std::numeric_limits::lowest())`, this mean there * is no limit for the converter. */ struct ConverterPreference : UMemory { ComplexUnitsConverter converter; double limit; UnicodeString precision; // The output unit for this ConverterPreference. This may be a MIXED unit - // for example: "yard-and-foot-and-inch". 
MeasureUnitImpl targetUnit; // In case there is no limit, the limit will be -inf. ConverterPreference(const MeasureUnitImpl &source, const MeasureUnitImpl &complexTarget, UnicodeString precision, const ConversionRates &ratesInfo, UErrorCode &status) : ConverterPreference(source, complexTarget, std::numeric_limits::lowest(), precision, ratesInfo, status) {} ConverterPreference(const MeasureUnitImpl &source, const MeasureUnitImpl &complexTarget, double limit, UnicodeString precision, const ConversionRates &ratesInfo, UErrorCode &status) : converter(source, complexTarget, ratesInfo, status), limit(limit), precision(std::move(precision)), targetUnit(complexTarget.copy(status)) {} }; } // namespace units // Export explicit template instantiations of MaybeStackArray, MemoryPool and // MaybeStackVector. This is required when building DLLs for Windows. (See // datefmt.h, collationiterator.h, erarules.h and others for similar examples.) // // Note: These need to be outside of the units namespace, or Clang will generate // a compile error. #if U_PF_WINDOWS <= U_PLATFORM && U_PLATFORM <= U_PF_CYGWIN template class U_I18N_API MaybeStackArray; template class U_I18N_API MemoryPool; template class U_I18N_API MaybeStackVector; #endif namespace units { /** * `UnitsRouter` responsible for converting from a single unit (such as `meter` or `meter-per-second`) to * one of the complex units based on the limits. * For example: * if the input is `meter` and the output as following * {`foot+inch`, limit: 3.0} * {`inch` , limit: no value (-inf)} * Thus means if the input in `meter` is greater than or equal to `3.0 feet`, the output will be in * `foot+inch`, otherwise, the output will be in `inch`. * * NOTE: * the output units and the their limits MUST BE in order, for example, if the output units, from the * previous example, are the following: * {`inch` , limit: no value (-inf)} * {`foot+inch`, limit: 3.0} * IN THIS CASE THE OUTPUT WILL BE ALWAYS IN `inch`. 
* * NOTE: * the output units and their limits will be extracted from the units preferences database by knowing * the following: * - input unit * - locale * - usage * * DESIGN: * `UnitRouter` uses internally `ComplexUnitConverter` in order to convert the input units to the * desired complex units and to check the limit too. */ class U_I18N_API UnitsRouter { public: UnitsRouter(StringPiece inputUnitIdentifier, const Locale &locale, StringPiece usage, UErrorCode &status); UnitsRouter(const MeasureUnit &inputUnit, const Locale &locale, StringPiece usage, UErrorCode &status); /** * Performs locale and usage sensitive unit conversion. * @param quantity The quantity to convert, expressed in terms of inputUnit. * @param rounder If not null, this RoundingImpl will be used to do rounding * on the converted value. If the rounder lacks an fPrecision, the * rounder will be modified to use the preferred precision for the usage * and locale preference, alternatively with the default precision. * @param status Receives status. */ RouteResult route(double quantity, icu::number::impl::RoundingImpl *rounder, UErrorCode &status) const; /** * Returns the list of possible output units, i.e. the full set of * preferences, for the localized, usage-specific unit preferences. * * The returned pointer should be valid for the lifetime of the * UnitsRouter instance. */ const MaybeStackVector *getOutputUnits() const; private: // List of possible output units. TODO: converterPreferences_ now also has // this data available. Maybe drop outputUnits_ and have getOutputUnits // construct a the list from data in converterPreferences_ instead? 
MaybeStackVector outputUnits_; MaybeStackVector converterPreferences_; static number::Precision parseSkeletonToPrecision(icu::UnicodeString precisionSkeleton, UErrorCode &status); void init(const MeasureUnit &inputUnit, const Locale &locale, StringPiece usage, UErrorCode &status); }; } // namespace units U_NAMESPACE_END #endif //__UNITS_ROUTER_H__ #endif /* #if !UCONFIG_NO_FORMATTING */ stringi/src/icu74/i18n/collationroot.cpp0000644000176200001440000001100614700200761017621 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ******************************************************************************* * Copyright (C) 2012-2014, International Business Machines * Corporation and others. All Rights Reserved. ******************************************************************************* * collationroot.cpp * * created on: 2012dec17 * created by: Markus W. Scherer */ #include "unicode/utypes.h" #if !UCONFIG_NO_COLLATION #include "unicode/coll.h" #include "unicode/udata.h" #include "collation.h" #include "collationdata.h" #include "collationdatareader.h" #include "collationroot.h" #include "collationsettings.h" #include "collationtailoring.h" #include "normalizer2impl.h" #include "ucln_in.h" #include "udatamem.h" #include "umutex.h" #include "umapfile.h" U_NAMESPACE_BEGIN namespace { static const CollationCacheEntry *rootSingleton = nullptr; static UInitOnce initOnce {}; } // namespace U_CDECL_BEGIN static UBool U_CALLCONV uprv_collation_root_cleanup() { SharedObject::clearPtr(rootSingleton); initOnce.reset(); return true; } U_CDECL_END UDataMemory* CollationRoot::loadFromFile(const char* ucadataPath, UErrorCode &errorCode) { UDataMemory dataMemory; UDataMemory *rDataMem = nullptr; if (U_FAILURE(errorCode)) { return nullptr; } if (uprv_mapFile(&dataMemory, ucadataPath, &errorCode)) { if (dataMemory.pHeader->dataHeader.magic1 == 0xda && dataMemory.pHeader->dataHeader.magic2 == 0x27 && 
CollationDataReader::isAcceptable(nullptr, "icu", "ucadata", &dataMemory.pHeader->info)) { rDataMem = UDataMemory_createNewInstance(&errorCode); if (U_FAILURE(errorCode)) { return nullptr; } rDataMem->pHeader = dataMemory.pHeader; rDataMem->mapAddr = dataMemory.mapAddr; rDataMem->map = dataMemory.map; return rDataMem; } errorCode = U_INVALID_FORMAT_ERROR; return nullptr; } errorCode = U_MISSING_RESOURCE_ERROR; return nullptr; } void U_CALLCONV CollationRoot::load(const char* ucadataPath, UErrorCode &errorCode) { if(U_FAILURE(errorCode)) { return; } LocalPointer t(new CollationTailoring(nullptr)); if(t.isNull() || t->isBogus()) { errorCode = U_MEMORY_ALLOCATION_ERROR; return; } t->memory = ucadataPath ? CollationRoot::loadFromFile(ucadataPath, errorCode) : udata_openChoice(U_ICUDATA_NAME U_TREE_SEPARATOR_STRING "coll", "icu", "ucadata", CollationDataReader::isAcceptable, t->version, &errorCode); if(U_FAILURE(errorCode)) { return; } const uint8_t *inBytes = static_cast(udata_getMemory(t->memory)); CollationDataReader::read(nullptr, inBytes, udata_getLength(t->memory), *t, errorCode); if(U_FAILURE(errorCode)) { return; } ucln_i18n_registerCleanup(UCLN_I18N_COLLATION_ROOT, uprv_collation_root_cleanup); CollationCacheEntry *entry = new CollationCacheEntry(Locale::getRoot(), t.getAlias()); if(entry != nullptr) { t.orphan(); // The rootSingleton took ownership of the tailoring. 
entry->addRef(); rootSingleton = entry; } } const CollationCacheEntry * CollationRoot::getRootCacheEntry(UErrorCode &errorCode) { umtx_initOnce(initOnce, CollationRoot::load, static_cast(nullptr), errorCode); if(U_FAILURE(errorCode)) { return nullptr; } return rootSingleton; } const CollationTailoring * CollationRoot::getRoot(UErrorCode &errorCode) { umtx_initOnce(initOnce, CollationRoot::load, static_cast(nullptr), errorCode); if(U_FAILURE(errorCode)) { return nullptr; } return rootSingleton->tailoring; } const CollationData * CollationRoot::getData(UErrorCode &errorCode) { const CollationTailoring *root = getRoot(errorCode); if(U_FAILURE(errorCode)) { return nullptr; } return root->data; } const CollationSettings * CollationRoot::getSettings(UErrorCode &errorCode) { const CollationTailoring *root = getRoot(errorCode); if(U_FAILURE(errorCode)) { return nullptr; } return root->settings; } void CollationRoot::forceLoadFromFile(const char* ucadataPath, UErrorCode &errorCode) { umtx_initOnce(initOnce, CollationRoot::load, ucadataPath, errorCode); } U_NAMESPACE_END #endif // !UCONFIG_NO_COLLATION stringi/src/icu74/i18n/coll.cpp0000644000176200001440000007766714700200761015713 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ****************************************************************************** * Copyright (C) 1996-2014, International Business Machines Corporation and * others. All Rights Reserved. ****************************************************************************** */ /** * File coll.cpp * * Created by: Helena Shih * * Modification History: * * Date Name Description * 2/5/97 aliu Modified createDefault to load collation data from * binary files when possible. Added related methods * createCollationFromFile, chopLocale, createPathName. * 2/11/97 aliu Added methods addToCache, findInCache, which implement * a Collation cache. 
Modified createDefault to look in * cache first, and also to store newly created Collation * objects in the cache. Modified to not use gLocPath. * 2/12/97 aliu Modified to create objects from RuleBasedCollator cache. * Moved cache out of Collation class. * 2/13/97 aliu Moved several methods out of this class and into * RuleBasedCollator, with modifications. Modified * createDefault() to call new RuleBasedCollator(Locale&) * constructor. General clean up and documentation. * 2/20/97 helena Added clone, operator==, operator!=, operator=, and copy * constructor. * 05/06/97 helena Added memory allocation error detection. * 05/08/97 helena Added createInstance(). * 6/20/97 helena Java class name change. * 04/23/99 stephen Removed EDecompositionMode, merged with * Normalizer::EMode * 11/23/9 srl Inlining of some critical functions * 01/29/01 synwee Modified into a C++ wrapper calling C APIs (ucol.h) * 2012-2014 markus Rewritten in C++ again. */ #include "utypeinfo.h" // for 'typeid' to work #include "unicode/utypes.h" #if !UCONFIG_NO_COLLATION #include "unicode/coll.h" #include "unicode/tblcoll.h" #include "collationdata.h" #include "collationroot.h" #include "collationtailoring.h" #include "ucol_imp.h" #include "cstring.h" #include "cmemory.h" #include "umutex.h" #include "servloc.h" #include "uassert.h" #include "ustrenum.h" #include "uresimp.h" #include "ucln_in.h" static icu::Locale* availableLocaleList = nullptr; static int32_t availableLocaleListCount; #if !UCONFIG_NO_SERVICE static icu::ICULocaleService* gService = nullptr; static icu::UInitOnce gServiceInitOnce {}; #endif static icu::UInitOnce gAvailableLocaleListInitOnce {}; /** * Release all static memory held by collator. 
*/ U_CDECL_BEGIN static UBool U_CALLCONV collator_cleanup() { #if !UCONFIG_NO_SERVICE if (gService) { delete gService; gService = nullptr; } gServiceInitOnce.reset(); #endif if (availableLocaleList) { delete []availableLocaleList; availableLocaleList = nullptr; } availableLocaleListCount = 0; gAvailableLocaleListInitOnce.reset(); return true; } U_CDECL_END U_NAMESPACE_BEGIN #if !UCONFIG_NO_SERVICE // ------------------------------------------ // // Registration // //------------------------------------------- CollatorFactory::~CollatorFactory() {} //------------------------------------------- UBool CollatorFactory::visible() const { return true; } //------------------------------------------- UnicodeString& CollatorFactory::getDisplayName(const Locale& objectLocale, const Locale& displayLocale, UnicodeString& result) { return objectLocale.getDisplayName(displayLocale, result); } // ------------------------------------- class ICUCollatorFactory : public ICUResourceBundleFactory { public: ICUCollatorFactory() : ICUResourceBundleFactory(UnicodeString(U_ICUDATA_COLL, -1, US_INV)) { } virtual ~ICUCollatorFactory(); protected: virtual UObject* create(const ICUServiceKey& key, const ICUService* service, UErrorCode& status) const override; }; ICUCollatorFactory::~ICUCollatorFactory() {} UObject* ICUCollatorFactory::create(const ICUServiceKey& key, const ICUService* /* service */, UErrorCode& status) const { if (handlesKey(key, status)) { const LocaleKey& lkey = static_cast(key); Locale loc; // make sure the requested locale is correct // default LocaleFactory uses currentLocale since that's the one vetted by handlesKey // but for ICU rb resources we use the actual one since it will fallback again lkey.canonicalLocale(loc); return Collator::makeInstance(loc, status); } return nullptr; } // ------------------------------------- class ICUCollatorService : public ICULocaleService { public: ICUCollatorService() : ICULocaleService(UNICODE_STRING_SIMPLE("Collator")) { UErrorCode 
status = U_ZERO_ERROR; registerFactory(new ICUCollatorFactory(), status); } virtual ~ICUCollatorService(); virtual UObject* cloneInstance(UObject* instance) const override { return ((Collator*)instance)->clone(); } virtual UObject* handleDefault(const ICUServiceKey& key, UnicodeString* actualID, UErrorCode& status) const override { const LocaleKey* lkey = dynamic_cast(&key); U_ASSERT(lkey != nullptr); if (actualID) { // Ugly Hack Alert! We return an empty actualID to signal // to callers that this is a default object, not a "real" // service-created object. (TODO remove in 3.0) [aliu] actualID->truncate(0); } Locale loc(""); lkey->canonicalLocale(loc); return Collator::makeInstance(loc, status); } virtual UObject* getKey(ICUServiceKey& key, UnicodeString* actualReturn, UErrorCode& status) const override { UnicodeString ar; if (actualReturn == nullptr) { actualReturn = &ar; } return (Collator*)ICULocaleService::getKey(key, actualReturn, status); } virtual UBool isDefault() const override { return countFactories() == 1; } }; ICUCollatorService::~ICUCollatorService() {} // ------------------------------------- static void U_CALLCONV initService() { gService = new ICUCollatorService(); ucln_i18n_registerCleanup(UCLN_I18N_COLLATOR, collator_cleanup); } static ICULocaleService* getService() { umtx_initOnce(gServiceInitOnce, &initService); return gService; } // ------------------------------------- static inline UBool hasService() { UBool retVal = !gServiceInitOnce.isReset() && (getService() != nullptr); return retVal; } #endif /* UCONFIG_NO_SERVICE */ static void U_CALLCONV initAvailableLocaleList(UErrorCode &status) { U_ASSERT(availableLocaleListCount == 0); U_ASSERT(availableLocaleList == nullptr); // for now, there is a hardcoded list, so just walk through that list and set it up. 
UResourceBundle *index = nullptr; StackUResourceBundle installed; int32_t i = 0; index = ures_openDirect(U_ICUDATA_COLL, "res_index", &status); ures_getByKey(index, "InstalledLocales", installed.getAlias(), &status); if(U_SUCCESS(status)) { availableLocaleListCount = ures_getSize(installed.getAlias()); availableLocaleList = new Locale[availableLocaleListCount]; if (availableLocaleList != nullptr) { ures_resetIterator(installed.getAlias()); while(ures_hasNext(installed.getAlias())) { const char *tempKey = nullptr; ures_getNextString(installed.getAlias(), nullptr, &tempKey, &status); availableLocaleList[i++] = Locale(tempKey); } } U_ASSERT(availableLocaleListCount == i); } ures_close(index); ucln_i18n_registerCleanup(UCLN_I18N_COLLATOR, collator_cleanup); } static UBool isAvailableLocaleListInitialized(UErrorCode &status) { umtx_initOnce(gAvailableLocaleListInitOnce, &initAvailableLocaleList, status); return U_SUCCESS(status); } // Collator public methods ----------------------------------------------- namespace { static const struct { const char *name; UColAttribute attr; } collAttributes[] = { { "colStrength", UCOL_STRENGTH }, { "colBackwards", UCOL_FRENCH_COLLATION }, { "colCaseLevel", UCOL_CASE_LEVEL }, { "colCaseFirst", UCOL_CASE_FIRST }, { "colAlternate", UCOL_ALTERNATE_HANDLING }, { "colNormalization", UCOL_NORMALIZATION_MODE }, { "colNumeric", UCOL_NUMERIC_COLLATION } }; static const struct { const char *name; UColAttributeValue value; } collAttributeValues[] = { { "primary", UCOL_PRIMARY }, { "secondary", UCOL_SECONDARY }, { "tertiary", UCOL_TERTIARY }, { "quaternary", UCOL_QUATERNARY }, // Note: Not supporting typo "quarternary" because it was never supported in locale IDs. 
{ "identical", UCOL_IDENTICAL },
    { "no", UCOL_OFF },
    { "yes", UCOL_ON },
    { "shifted", UCOL_SHIFTED },
    { "non-ignorable", UCOL_NON_IGNORABLE },
    { "lower", UCOL_LOWER_FIRST },
    { "upper", UCOL_UPPER_FIRST }
};

// Names of the special reorder groups; entry i belongs to code UCOL_REORDER_CODE_FIRST + i.
static const char *collReorderCodes[UCOL_REORDER_CODE_LIMIT - UCOL_REORDER_CODE_FIRST] = {
    "space", "punct", "symbol", "currency", "digit"
};

/**
 * Looks up a special reorder-group name in collReorderCodes (compared with uprv_stricmp).
 * @param s the name to look up
 * @return UCOL_REORDER_CODE_FIRST plus the index of the matching entry, or -1 if none matches
 */
int32_t getReorderCode(const char *s) {
    for (int32_t i = 0; i < UPRV_LENGTHOF(collReorderCodes); ++i) {
        if (uprv_stricmp(s, collReorderCodes[i]) == 0) {
            return UCOL_REORDER_CODE_FIRST + i;
        }
    }
    // Not supporting "others" = UCOL_REORDER_CODE_OTHERS
    // as a synonym for Zzzz = USCRIPT_UNKNOWN for now:
    // Avoid introducing synonyms/aliases.
    return -1;
}

/**
 * Sets collation attributes according to locale keywords. See
 * http://www.unicode.org/reports/tr35/tr35-collation.html#Collation_Settings
 *
 * Using "alias" keywords and values where defined:
 * http://www.unicode.org/reports/tr35/tr35.html#Old_Locale_Extension_Syntax
 * http://unicode.org/repos/cldr/trunk/common/bcp47/collation.xml
 *
 * Does nothing if errorCode already indicates failure or if the locale name
 * equals its base name. The keywords "colHiraganaQuaternary" and "variableTop"
 * yield U_UNSUPPORTED_ERROR when present; a failure while reading a keyword is
 * reported as U_ILLEGAL_ARGUMENT_ERROR.
 */
void setAttributesFromKeywords(const Locale &loc, Collator &coll, UErrorCode &errorCode) {
    if (U_FAILURE(errorCode)) {
        return;
    }
    if (uprv_strcmp(loc.getName(), loc.getBaseName()) == 0) {
        // No keywords.
        return;
    }
    char value[1024];  // The reordering value could be long.
    // Check for collation keywords that were already deprecated
    // before any were supported in createInstance() (except for "collation").
    int32_t length = loc.getKeywordValue("colHiraganaQuaternary", value, UPRV_LENGTHOF(value), errorCode);
    if (U_FAILURE(errorCode)) {
        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }
    if (length != 0) {
        errorCode = U_UNSUPPORTED_ERROR;
        return;
    }
    length = loc.getKeywordValue("variableTop", value, UPRV_LENGTHOF(value), errorCode);
    if (U_FAILURE(errorCode)) {
        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }
    if (length != 0) {
        errorCode = U_UNSUPPORTED_ERROR;
        return;
    }
    // Parse known collation keywords, ignore others.
if (errorCode == U_STRING_NOT_TERMINATED_WARNING) {
        // Left over from the keyword probes above; clear it before the strict checks below.
        errorCode = U_ZERO_ERROR;
    }
    // For every known attribute keyword present in the locale, map its value
    // through collAttributeValues and apply it to the collator.
    for (int32_t i = 0; i < UPRV_LENGTHOF(collAttributes); ++i) {
        length = loc.getKeywordValue(collAttributes[i].name, value, UPRV_LENGTHOF(value), errorCode);
        if (U_FAILURE(errorCode) || errorCode == U_STRING_NOT_TERMINATED_WARNING) {
            errorCode = U_ILLEGAL_ARGUMENT_ERROR;
            return;
        }
        if (length == 0) { continue; }
        for (int32_t j = 0;; ++j) {
            if (j == UPRV_LENGTHOF(collAttributeValues)) {
                // The value matched none of the known value names.
                errorCode = U_ILLEGAL_ARGUMENT_ERROR;
                return;
            }
            if (uprv_stricmp(value, collAttributeValues[j].name) == 0) {
                coll.setAttribute(collAttributes[i].attr, collAttributeValues[j].value, errorCode);
                break;
            }
        }
    }
    length = loc.getKeywordValue("colReorder", value, UPRV_LENGTHOF(value), errorCode);
    if (U_FAILURE(errorCode) || errorCode == U_STRING_NOT_TERMINATED_WARNING) {
        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }
    if (length != 0) {
        int32_t codes[USCRIPT_CODE_LIMIT + (UCOL_REORDER_CODE_LIMIT - UCOL_REORDER_CODE_FIRST)];
        int32_t codesLength = 0;
        char *scriptName = value;
        // Split the value in place at each '-' and convert every piece to a code.
        for (;;) {
            if (codesLength == UPRV_LENGTHOF(codes)) {
                errorCode = U_ILLEGAL_ARGUMENT_ERROR;
                return;
            }
            char *limit = scriptName;
            char c;
            while ((c = *limit) != 0 && c != '-') { ++limit; }
            // Terminate the current piece; c remembers whether more pieces follow.
            *limit = 0;
            int32_t code;
            if ((limit - scriptName) == 4) {
                // Strict parsing, accept only 4-letter script codes, not long names.
code = u_getPropertyValueEnum(UCHAR_SCRIPT, scriptName);
            } else {
                code = getReorderCode(scriptName);
            }
            if (code < 0) {
                errorCode = U_ILLEGAL_ARGUMENT_ERROR;
                return;
            }
            codes[codesLength++] = code;
            if (c == 0) { break; }
            scriptName = limit + 1;
        }
        coll.setReorderCodes(codes, codesLength, errorCode);
    }
    // "kv" names a special reorder group that is passed to setMaxVariable().
    length = loc.getKeywordValue("kv", value, UPRV_LENGTHOF(value), errorCode);
    if (U_FAILURE(errorCode) || errorCode == U_STRING_NOT_TERMINATED_WARNING) {
        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }
    if (length != 0) {
        int32_t code = getReorderCode(value);
        if (code < 0) {
            errorCode = U_ILLEGAL_ARGUMENT_ERROR;
            return;
        }
        coll.setMaxVariable((UColReorderCode)code, errorCode);
    }
    // Any failure from the setters above is reported uniformly as an illegal argument.
    if (U_FAILURE(errorCode)) {
        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
    }
}

}  // namespace

/** Creates a collator for Locale::getDefault(). */
Collator* U_EXPORT2 Collator::createInstance(UErrorCode& success)
{
    return createInstance(Locale::getDefault(), success);
}

/**
 * Creates a collator for desiredLocale and applies the locale's collation keywords.
 * @param desiredLocale the requested locale; a bogus locale yields U_ILLEGAL_ARGUMENT_ERROR
 * @param status        in/out error code
 * @return the new collator, or a null pointer on failure
 */
Collator* U_EXPORT2 Collator::createInstance(const Locale& desiredLocale,
                                   UErrorCode& status)
{
    if (U_FAILURE(status))
        return 0;
    if (desiredLocale.isBogus()) {
        // Locale constructed from malformed locale ID or language tag.
        status = U_ILLEGAL_ARGUMENT_ERROR;
        return nullptr;
    }

    Collator* coll;
#if !UCONFIG_NO_SERVICE
    if (hasService()) {
        Locale actualLoc;
        coll = (Collator*)gService->get(desiredLocale, &actualLoc, status);
    } else
#endif
    {
        coll = makeInstance(desiredLocale, status);
        // Either returns nullptr with U_FAILURE(status), or non-nullptr with U_SUCCESS(status)
    }
    // The use of *coll in setAttributesFromKeywords can cause the nullptr check to be
    // optimized out of the delete even though setAttributesFromKeywords returns
    // immediately if U_FAILURE(status), so we add a check here.
if (U_FAILURE(status)) {
        return nullptr;
    }
    setAttributesFromKeywords(desiredLocale, *coll, status);
    if (U_FAILURE(status)) {
        delete coll;
        return nullptr;
    }
    return coll;
}


/**
 * Loads the tailoring for desiredLocale and wraps it in a new RuleBasedCollator.
 * @return the new collator; nullptr if loading fails or the allocation fails
 *         (the latter sets U_MEMORY_ALLOCATION_ERROR)
 */
Collator* Collator::makeInstance(const Locale&  desiredLocale, UErrorCode& status) {
    const CollationCacheEntry *entry = CollationLoader::loadTailoring(desiredLocale, status);
    if (U_SUCCESS(status)) {
        Collator *result = new RuleBasedCollator(entry);
        if (result != nullptr) {
            // Both the unified cache's get() and the RBC constructor
            // did addRef(). Undo one of them.
            entry->removeRef();
            return result;
        }
        status = U_MEMORY_ALLOCATION_ERROR;
    }
    if (entry != nullptr) {
        // Undo the addRef() from the cache.get().
        entry->removeRef();
    }
    return nullptr;
}

/** Same as clone(). */
Collator *
Collator::safeClone() const {
    return clone();
}

// implement deprecated, previously abstract method
// (calls the UErrorCode overload with a local error code that is not reported)
Collator::EComparisonResult Collator::compare(const UnicodeString& source,
                                    const UnicodeString& target) const
{
    UErrorCode ec = U_ZERO_ERROR;
    return (EComparisonResult)compare(source, target, ec);
}

// implement deprecated, previously abstract method
// (calls the UErrorCode overload with a local error code that is not reported)
Collator::EComparisonResult Collator::compare(const UnicodeString& source,
                                    const UnicodeString& target,
                                    int32_t length) const
{
    UErrorCode ec = U_ZERO_ERROR;
    return (EComparisonResult)compare(source, target, length, ec);
}

// implement deprecated, previously abstract method
// (calls the UErrorCode overload with a local error code that is not reported)
Collator::EComparisonResult Collator::compare(const char16_t* source, int32_t sourceLength,
                                    const char16_t* target, int32_t targetLength)
                                    const
{
    UErrorCode ec = U_ZERO_ERROR;
    return (EComparisonResult)compare(source, sourceLength, target, targetLength, ec);
}

/**
 * Iterator-based comparison; the base class only sets U_UNSUPPORTED_ERROR
 * (when status was successful) and returns UCOL_EQUAL.
 */
UCollationResult Collator::compare(UCharIterator &/*sIter*/,
                                   UCharIterator &/*tIter*/,
                                   UErrorCode &status) const {
    if(U_SUCCESS(status)) {
        // Not implemented in the base class.
status = U_UNSUPPORTED_ERROR;
    }
    return UCOL_EQUAL;
}

/**
 * Compares two UTF-8 strings by wrapping each in a UCharIterator and calling
 * the iterator-based compare().
 * @return UCOL_EQUAL, without comparing, if status already indicates failure
 */
UCollationResult Collator::compareUTF8(const StringPiece &source,
                                       const StringPiece &target,
                                       UErrorCode &status) const {
    if(U_FAILURE(status)) {
        return UCOL_EQUAL;
    }
    UCharIterator sIter, tIter;
    uiter_setUTF8(&sIter, source.data(), source.length());
    uiter_setUTF8(&tIter, target.data(), target.length());
    return compare(sIter, tIter, status);
}

/** True if compare() yields UCOL_EQUAL; the error code of the comparison is not reported. */
UBool Collator::equals(const UnicodeString& source,
                       const UnicodeString& target) const
{
    UErrorCode ec = U_ZERO_ERROR;
    return (compare(source, target, ec) == UCOL_EQUAL);
}

/** True unless compare() yields UCOL_LESS; the error code of the comparison is not reported. */
UBool Collator::greaterOrEqual(const UnicodeString& source,
                               const UnicodeString& target) const
{
    UErrorCode ec = U_ZERO_ERROR;
    return (compare(source, target, ec) != UCOL_LESS);
}

/** True if compare() yields UCOL_GREATER; the error code of the comparison is not reported. */
UBool Collator::greater(const UnicodeString& source,
                        const UnicodeString& target) const
{
    UErrorCode ec = U_ZERO_ERROR;
    return (compare(source, target, ec) == UCOL_GREATER);
}

// this API ignores registered collators, since it returns an
// array of indefinite lifetime
// (count is set to 0 and nullptr is returned if the list cannot be initialized)
const Locale* U_EXPORT2 Collator::getAvailableLocales(int32_t& count)
{
    UErrorCode status = U_ZERO_ERROR;
    Locale *result = nullptr;
    count = 0;
    if (isAvailableLocaleListInitialized(status))
    {
        result = availableLocaleList;
        count = availableLocaleListCount;
    }
    return result;
}

/**
 * Writes the display name of objectLocale, localized for displayLocale, into name.
 * Asks the collator service when one exists; otherwise asks the locale itself.
 */
UnicodeString& U_EXPORT2 Collator::getDisplayName(const Locale& objectLocale,
                                        const Locale& displayLocale,
                                        UnicodeString& name)
{
#if !UCONFIG_NO_SERVICE
    if (hasService()) {
        UnicodeString locNameStr;
        LocaleUtility::initNameFromLocale(objectLocale, locNameStr);
        return gService->getDisplayName(locNameStr, name, displayLocale);
    }
#endif
    return objectLocale.getDisplayName(displayLocale, name);
}

/** Same as the three-argument overload with Locale::getDefault() as the display locale. */
UnicodeString& U_EXPORT2 Collator::getDisplayName(const Locale& objectLocale,
                                        UnicodeString& name)
{
    return getDisplayName(objectLocale, Locale::getDefault(), name);
}

/* This is useless information */
/*void Collator::getVersion(UVersionInfo versionInfo) const
{
  if (versionInfo!=nullptr)
uprv_memcpy(versionInfo, fVersion, U_MAX_VERSION_LENGTH);
}
*/

// UCollator protected constructor destructor ----------------------------

/**
* Default constructor.
* Constructor is different from the old default Collator constructor.
* The task for determining the default collation strength and normalization mode
* is left to the child class.
*/
Collator::Collator()
: UObject()
{
}

/**
* Constructor.
* Empty constructor, does not handle the arguments.
* This constructor is done for backward compatibility with 1.7 and 1.8.
* The task for handling the argument collation strength and normalization
* mode is left to the child class.
* @param collationStrength collation strength
* @param decompositionMode
* @deprecated 2.4 use the default constructor instead
*/
Collator::Collator(UCollationStrength, UNormalizationMode )
: UObject()
{
}

Collator::~Collator()
{
}

/** Copy constructor; copies only the UObject base. */
Collator::Collator(const Collator &other)
    : UObject(other)
{
}

/** Base-class equality: true when both objects have the same dynamic type (typeid). */
bool Collator::operator==(const Collator& other) const
{
    // Subclasses: Call this method and then add more specific checks.
return typeid(*this) == typeid(other);
}

/** Negation of operator==. */
bool Collator::operator!=(const Collator& other) const
{
    return !operator==(other);
}

/** Thin wrapper over ucol_getBound(); arguments are passed through unchanged. */
int32_t U_EXPORT2 Collator::getBound(const uint8_t       *source,
                           int32_t             sourceLength,
                           UColBoundMode       boundType,
                           uint32_t            noOfLevels,
                           uint8_t             *result,
                           int32_t             resultLength,
                           UErrorCode          &status)
{
    return ucol_getBound(source, sourceLength, boundType, noOfLevels, result, resultLength, &status);
}

/** Base-class implementation: intentionally does nothing. */
void
Collator::setLocales(const Locale& /* requestedLocale */, const Locale& /* validLocale */, const Locale& /*actualLocale*/) {
}

/**
 * Base-class implementation: returns a new set covering 0..0x10FFFF.
 * @return the new set, or nullptr if status already indicates failure
 */
UnicodeSet *Collator::getTailoredSet(UErrorCode &status) const
{
    if(U_FAILURE(status)) {
        return nullptr;
    }
    // everything can be changed
    return new UnicodeSet(0, 0x10FFFF);
}

// -------------------------------------

#if !UCONFIG_NO_SERVICE
/**
 * Registers toAdopt with the collator service under the given locale.
 * @return the registry key from the service, or nullptr if status already indicates failure
 */
URegistryKey U_EXPORT2
Collator::registerInstance(Collator* toAdopt, const Locale& locale, UErrorCode& status)
{
    if (U_SUCCESS(status)) {
        // Set the collator locales while registering so that createInstance()
        // need not guess whether the collator's locales are already set properly
        // (as they are by the data loader).
        toAdopt->setLocales(locale, locale, locale);
        return getService()->registerInstance(toAdopt, locale, status);
    }
    return nullptr;
}

// -------------------------------------

/**
 * Adapts a CollatorFactory to the LocaleKeyFactory interface. Holds the
 * delegate factory and a hashtable of the IDs the delegate supports; both are
 * deleted in the destructor.
 */
class CFactory : public LocaleKeyFactory {
private:
    CollatorFactory* _delegate;
    Hashtable* _ids;

public:
    CFactory(CollatorFactory* delegate, UErrorCode& status)
        : LocaleKeyFactory(delegate->visible() ?
VISIBLE : INVISIBLE) , _delegate(delegate) , _ids(nullptr) { if (U_SUCCESS(status)) { int32_t count = 0; _ids = new Hashtable(status); if (_ids) { const UnicodeString * idlist = _delegate->getSupportedIDs(count, status); for (int i = 0; i < count; ++i) { _ids->put(idlist[i], (void*)this, status); if (U_FAILURE(status)) { delete _ids; _ids = nullptr; return; } } } else { status = U_MEMORY_ALLOCATION_ERROR; } } } virtual ~CFactory(); virtual UObject* create(const ICUServiceKey& key, const ICUService* service, UErrorCode& status) const override; protected: virtual const Hashtable* getSupportedIDs(UErrorCode& status) const override { if (U_SUCCESS(status)) { return _ids; } return nullptr; } virtual UnicodeString& getDisplayName(const UnicodeString& id, const Locale& locale, UnicodeString& result) const override; }; CFactory::~CFactory() { delete _delegate; delete _ids; } UObject* CFactory::create(const ICUServiceKey& key, const ICUService* /* service */, UErrorCode& status) const { if (handlesKey(key, status)) { const LocaleKey* lkey = dynamic_cast(&key); U_ASSERT(lkey != nullptr); Locale validLoc; lkey->currentLocale(validLoc); return _delegate->createCollator(validLoc); } return nullptr; } UnicodeString& CFactory::getDisplayName(const UnicodeString& id, const Locale& locale, UnicodeString& result) const { if ((_coverage & 0x1) == 0) { UErrorCode status = U_ZERO_ERROR; const Hashtable* ids = getSupportedIDs(status); if (ids && (ids->get(id) != nullptr)) { Locale loc; LocaleUtility::initLocaleFromName(id, loc); return _delegate->getDisplayName(loc, locale, result); } } result.setToBogus(); return result; } URegistryKey U_EXPORT2 Collator::registerFactory(CollatorFactory* toAdopt, UErrorCode& status) { if (U_SUCCESS(status)) { CFactory* f = new CFactory(toAdopt, status); if (f) { return getService()->registerFactory(f, status); } status = U_MEMORY_ALLOCATION_ERROR; } return nullptr; } // ------------------------------------- UBool U_EXPORT2 
Collator::unregister(URegistryKey key, UErrorCode& status)
{
    if (U_SUCCESS(status)) {
        if (hasService()) {
            return gService->unregister(key, status);
        }
        // No service exists, so the key cannot have come from a registration.
        status = U_ILLEGAL_ARGUMENT_ERROR;
    }
    return false;
}
#endif /* UCONFIG_NO_SERVICE */

/**
 * String enumeration over the names of the file-static availableLocaleList.
 * The only per-object state is the current index into that list.
 */
class CollationLocaleListEnumeration : public StringEnumeration {
private:
    int32_t index;
public:
    static UClassID U_EXPORT2 getStaticClassID();
    virtual UClassID getDynamicClassID() const override;
public:
    CollationLocaleListEnumeration()
        : index(0)
    {
        // The global variables should already be initialized.
        //isAvailableLocaleListInitialized(status);
    }

    virtual ~CollationLocaleListEnumeration();

    /** Returns a new enumeration positioned at the same index, or nullptr if allocation fails. */
    virtual StringEnumeration * clone() const override
    {
        CollationLocaleListEnumeration *result = new CollationLocaleListEnumeration();
        if (result) {
            result->index = index;
        }
        return result;
    }

    virtual int32_t count(UErrorCode &/*status*/) const override {
        return availableLocaleListCount;
    }

    /**
     * Returns the name of the next locale and advances the index.
     * @param resultLength if non-null, receives the name length (0 at the end)
     * @return the locale name, or nullptr once the list is exhausted
     */
    virtual const char* next(int32_t* resultLength, UErrorCode& /*status*/) override {
        const char* result;
        if(index < availableLocaleListCount) {
            result = availableLocaleList[index++].getName();
            if(resultLength != nullptr) {
                *resultLength = (int32_t)uprv_strlen(result);
            }
        } else {
            if(resultLength != nullptr) {
                *resultLength = 0;
            }
            result = nullptr;
        }
        return result;
    }

    /** UnicodeString variant of next(): passes next()'s result to setChars(). */
    virtual const UnicodeString* snext(UErrorCode& status) override {
        int32_t resultLength = 0;
        const char *s = next(&resultLength, status);
        return setChars(s, resultLength, status);
    }

    virtual void reset(UErrorCode& /*status*/) override {
        index = 0;
    }
};

CollationLocaleListEnumeration::~CollationLocaleListEnumeration() {}

UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CollationLocaleListEnumeration)


// -------------------------------------

/**
 * Returns an enumeration of available locale names: from the collator service
 * when one exists, otherwise over the built-in list (nullptr if that list
 * cannot be initialized).
 */
StringEnumeration* U_EXPORT2
Collator::getAvailableLocales()
{
#if !UCONFIG_NO_SERVICE
    if (hasService()) {
        return getService()->getAvailableLocales();
    }
#endif /* UCONFIG_NO_SERVICE */
    UErrorCode status = U_ZERO_ERROR;
    if (isAvailableLocaleListInitialized(status)) {
        return new
CollationLocaleListEnumeration();
    }
    return nullptr;
}

/** Wraps the UEnumeration from ucol_getKeywords() in a StringEnumeration. */
StringEnumeration* U_EXPORT2
Collator::getKeywords(UErrorCode& status) {
    return UStringEnumeration::fromUEnumeration(
            ucol_getKeywords(&status), status);
}

/** Wraps the UEnumeration from ucol_getKeywordValues() in a StringEnumeration. */
StringEnumeration* U_EXPORT2
Collator::getKeywordValues(const char *keyword, UErrorCode& status) {
    return UStringEnumeration::fromUEnumeration(
            ucol_getKeywordValues(keyword, &status), status);
}

/** Wraps the UEnumeration from ucol_getKeywordValuesForLocale() in a StringEnumeration. */
StringEnumeration* U_EXPORT2
Collator::getKeywordValuesForLocale(const char* key, const Locale& locale,
                                    UBool commonlyUsed, UErrorCode& status) {
    return UStringEnumeration::fromUEnumeration(
            ucol_getKeywordValuesForLocale(
                    key, locale.getName(), commonlyUsed, &status),
            status);
}

/**
 * Returns the locale named by ucol_getFunctionalEquivalent(); on failure the
 * name buffer is emptied first, so the locale is created from "".
 */
Locale U_EXPORT2
Collator::getFunctionalEquivalent(const char* keyword, const Locale& locale,
                                  UBool& isAvailable, UErrorCode& status) {
    // This is a wrapper over ucol_getFunctionalEquivalent
    char loc[ULOC_FULLNAME_CAPACITY];
    /*int32_t len =*/ ucol_getFunctionalEquivalent(loc, sizeof(loc),
                    keyword, locale.getName(), &isAvailable, &status);
    if (U_FAILURE(status)) {
        *loc = 0; // root
    }
    return Locale::createFromName(loc);
}

/** Reads UCOL_STRENGTH through getAttribute(); the error code is not reported. */
Collator::ECollationStrength
Collator::getStrength() const {
    UErrorCode intStatus = U_ZERO_ERROR;
    return (ECollationStrength)getAttribute(UCOL_STRENGTH, intStatus);
}

/** Writes UCOL_STRENGTH through setAttribute(); the error code is not reported. */
void
Collator::setStrength(ECollationStrength newStrength) {
    UErrorCode intStatus = U_ZERO_ERROR;
    setAttribute(UCOL_STRENGTH, (UColAttributeValue)newStrength, intStatus);
}

/** Base-class implementation: sets U_UNSUPPORTED_ERROR unless errorCode already failed. */
Collator &
Collator::setMaxVariable(UColReorderCode /*group*/, UErrorCode &errorCode) {
    if (U_SUCCESS(errorCode)) {
        errorCode = U_UNSUPPORTED_ERROR;
    }
    return *this;
}

/** Base-class implementation: always UCOL_REORDER_CODE_PUNCTUATION. */
UColReorderCode
Collator::getMaxVariable() const {
    return UCOL_REORDER_CODE_PUNCTUATION;
}

/** Base-class implementation: sets U_UNSUPPORTED_ERROR unless status already failed; returns 0. */
int32_t
Collator::getReorderCodes(int32_t* /* dest*/,
                          int32_t /* destCapacity*/,
                          UErrorCode& status) const
{
    if (U_SUCCESS(status)) {
        status = U_UNSUPPORTED_ERROR;
    }
    return 0;
}

/** Base-class implementation: sets U_UNSUPPORTED_ERROR unless status already failed. */
void
Collator::setReorderCodes(const int32_t* /* reorderCodes */,
                          int32_t /* reorderCodesLength */,
                          UErrorCode&
status) { if (U_SUCCESS(status)) { status = U_UNSUPPORTED_ERROR; } } int32_t Collator::getEquivalentReorderCodes(int32_t reorderCode, int32_t *dest, int32_t capacity, UErrorCode &errorCode) { if(U_FAILURE(errorCode)) { return 0; } if(capacity < 0 || (dest == nullptr && capacity > 0)) { errorCode = U_ILLEGAL_ARGUMENT_ERROR; return 0; } const CollationData *baseData = CollationRoot::getData(errorCode); if(U_FAILURE(errorCode)) { return 0; } return baseData->getEquivalentScripts(reorderCode, dest, capacity, errorCode); } int32_t Collator::internalGetShortDefinitionString(const char * /*locale*/, char * /*buffer*/, int32_t /*capacity*/, UErrorCode &status) const { if(U_SUCCESS(status)) { status = U_UNSUPPORTED_ERROR; /* Shouldn't happen, internal function */ } return 0; } UCollationResult Collator::internalCompareUTF8(const char *left, int32_t leftLength, const char *right, int32_t rightLength, UErrorCode &errorCode) const { if(U_FAILURE(errorCode)) { return UCOL_EQUAL; } if((left == nullptr && leftLength != 0) || (right == nullptr && rightLength != 0)) { errorCode = U_ILLEGAL_ARGUMENT_ERROR; return UCOL_EQUAL; } return compareUTF8( StringPiece(left, (leftLength < 0) ? static_cast(uprv_strlen(left)) : leftLength), StringPiece(right, (rightLength < 0) ? static_cast(uprv_strlen(right)) : rightLength), errorCode); } int32_t Collator::internalNextSortKeyPart(UCharIterator * /*iter*/, uint32_t /*state*/[2], uint8_t * /*dest*/, int32_t /*count*/, UErrorCode &errorCode) const { if (U_SUCCESS(errorCode)) { errorCode = U_UNSUPPORTED_ERROR; } return 0; } // UCollator private data members ---------------------------------------- /* This is useless information */ /*const UVersionInfo Collator::fVersion = {1, 1, 0, 0};*/ // ------------------------------------- U_NAMESPACE_END #endif /* #if !UCONFIG_NO_COLLATION */ /* eof */ stringi/src/icu74/i18n/coptccal.cpp0000644000176200001440000001156514700200761016533 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others. 
// License & terms of use: http://www.unicode.org/copyright.html /* ******************************************************************************* * Copyright (C) 2003 - 2013, International Business Machines Corporation and * others. All Rights Reserved. ******************************************************************************* */ #include "unicode/utypes.h" #if !UCONFIG_NO_FORMATTING #include "umutex.h" #include "coptccal.h" #include "cecal.h" #include U_NAMESPACE_BEGIN UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CopticCalendar) static const int32_t COPTIC_JD_EPOCH_OFFSET = 1824665; //------------------------------------------------------------------------- // Constructors... //------------------------------------------------------------------------- CopticCalendar::CopticCalendar(const Locale& aLocale, UErrorCode& success) : CECalendar(aLocale, success) { } CopticCalendar::CopticCalendar (const CopticCalendar& other) : CECalendar(other) { } CopticCalendar::~CopticCalendar() { } CopticCalendar* CopticCalendar::clone() const { return new CopticCalendar(*this); } const char* CopticCalendar::getType() const { return "coptic"; } //------------------------------------------------------------------------- // Calendar framework //------------------------------------------------------------------------- int32_t CopticCalendar::handleGetExtendedYear() { int32_t eyear; if (newerField(UCAL_EXTENDED_YEAR, UCAL_YEAR) == UCAL_EXTENDED_YEAR) { eyear = internalGet(UCAL_EXTENDED_YEAR, 1); // Default to year 1 } else { // The year defaults to the epoch start, the era to CE int32_t era = internalGet(UCAL_ERA, CE); if (era == BCE) { eyear = 1 - internalGet(UCAL_YEAR, 1); // Convert to extended year } else { eyear = internalGet(UCAL_YEAR, 1); // Default to year 1 } } return eyear; } void CopticCalendar::handleComputeFields(int32_t julianDay, UErrorCode &/*status*/) { int32_t eyear, month, day, era, year; jdToCE(julianDay, getJDEpochOffset(), eyear, month, day); if (eyear <= 0) { era = 
BCE;
        year = 1 - eyear;
    } else {
        era = CE;
        year = eyear;
    }

    internalSet(UCAL_EXTENDED_YEAR, eyear);
    internalSet(UCAL_ERA, era);
    internalSet(UCAL_YEAR, year);
    internalSet(UCAL_MONTH, month);
    internalSet(UCAL_ORDINAL_MONTH, month);
    internalSet(UCAL_DATE, day);
    // month is used directly as a multiplier of 30-day months here.
    internalSet(UCAL_DAY_OF_YEAR, (30 * month) + day);
}

// Offset added to the extended year by getRelatedYear() and subtracted by setRelatedYear().
constexpr uint32_t kCopticRelatedYearDiff = 284;

/**
 * Returns the extended year plus kCopticRelatedYearDiff.
 * @return 0 if reading the extended year fails
 */
int32_t CopticCalendar::getRelatedYear(UErrorCode &status) const
{
    int32_t year = get(UCAL_EXTENDED_YEAR, status);
    if (U_FAILURE(status)) {
        return 0;
    }
    return year + kCopticRelatedYearDiff;
}

/** Sets the extended year to year minus kCopticRelatedYearDiff. */
void CopticCalendar::setRelatedYear(int32_t year)
{
    // set extended year
    set(UCAL_EXTENDED_YEAR, year - kCopticRelatedYearDiff);
}

/**
 * The system maintains a static default century start date and Year.  They are
 * initialized the first time they are used.  Once the system default century date
 * and year are set, they do not change.
 */
static UDate           gSystemDefaultCenturyStart       = DBL_MIN;
static int32_t         gSystemDefaultCenturyStartYear   = -1;
static icu::UInitOnce  gSystemDefaultCenturyInit        {};


/**
 * Init-once callback: sets both default-century globals from a Coptic calendar
 * positioned 80 years before Calendar::getNow().
 */
static void U_CALLCONV initializeSystemDefaultCentury() {
    UErrorCode status = U_ZERO_ERROR;
    CopticCalendar calendar(Locale("@calendar=coptic"), status);
    if (U_SUCCESS(status)) {
        calendar.setTime(Calendar::getNow(), status);
        calendar.add(UCAL_YEAR, -80, status);
        gSystemDefaultCenturyStart = calendar.getTime(status);
        gSystemDefaultCenturyStartYear = calendar.get(UCAL_YEAR, status);
    }
    // We have no recourse upon failure unless we want to propagate the failure
    // out.
}

/** Returns the default century start date, computing it on first use. */
UDate
CopticCalendar::defaultCenturyStart() const
{
    // lazy-evaluate systemDefaultCenturyStart
    umtx_initOnce(gSystemDefaultCenturyInit, &initializeSystemDefaultCentury);
    return gSystemDefaultCenturyStart;
}

/** Returns the default century start year, computing it on first use. */
int32_t
CopticCalendar::defaultCenturyStartYear() const
{
    // lazy-evaluate systemDefaultCenturyStartYear
    umtx_initOnce(gSystemDefaultCenturyInit, &initializeSystemDefaultCentury);
    return gSystemDefaultCenturyStartYear;
}


/** Julian-day epoch offset used by this calendar: COPTIC_JD_EPOCH_OFFSET. */
int32_t
CopticCalendar::getJDEpochOffset() const
{
    return COPTIC_JD_EPOCH_OFFSET;
}


#if 0
// We do not want to introduce this API in ICU4C.
// It was accidentally introduced in ICU4J as a public API.

//-------------------------------------------------------------------------
// Calendar system Conversion methods...
//-------------------------------------------------------------------------

int32_t
CopticCalendar::copticToJD(int32_t year, int32_t month, int32_t day)
{
    return CECalendar::ceToJD(year, month, day, COPTIC_JD_EPOCH_OFFSET);
}
#endif

U_NAMESPACE_END

#endif /* #if !UCONFIG_NO_FORMATTING */
//eof stringi/src/icu74/i18n/fmtable.cpp0000644000176200001440000006672714700200761016367 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
* Copyright (C) 1997-2016, International Business Machines Corporation and
* others. All Rights Reserved.
*******************************************************************************
*
* File FMTABLE.CPP
*
* Modification History:
*
*   Date        Name        Description
*   03/25/97    clhuang     Initial Implementation.
******************************************************************************** */ #include "unicode/utypes.h" #if !UCONFIG_NO_FORMATTING #include #include #include "unicode/fmtable.h" #include "unicode/ustring.h" #include "unicode/measure.h" #include "unicode/curramt.h" #include "unicode/uformattable.h" #include "charstr.h" #include "cmemory.h" #include "cstring.h" #include "fmtableimp.h" #include "number_decimalquantity.h" // ***************************************************************************** // class Formattable // ***************************************************************************** U_NAMESPACE_BEGIN UOBJECT_DEFINE_RTTI_IMPLEMENTATION(Formattable) using number::impl::DecimalQuantity; //-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-. // NOTE: As of 3.0, there are limitations to the UObject API. It does // not (yet) support cloning, operator=, nor operator==. To // work around this, I implement some simple inlines here. Later // these can be modified or removed. [alan] // NOTE: These inlines assume that all fObjects are in fact instances // of the Measure class, which is true as of 3.0. [alan] // Return true if *a == *b. static inline UBool objectEquals(const UObject* a, const UObject* b) { // LATER: return *a == *b; return *((const Measure*) a) == *b; } // Return a clone of *a. static inline UObject* objectClone(const UObject* a) { // LATER: return a->clone(); return ((const Measure*) a)->clone(); } // Return true if *a is an instance of Measure. static inline UBool instanceOfMeasure(const UObject* a) { return dynamic_cast(a) != nullptr; } /** * Creates a new Formattable array and copies the values from the specified * original. * @param array the original array * @param count the original array count * @return the new Formattable array. 
*/ static Formattable* createArrayCopy(const Formattable* array, int32_t count) { Formattable *result = new Formattable[count]; if (result != nullptr) { for (int32_t i=0; i INT32_MAX) { status = U_INVALID_FORMAT_ERROR; return INT32_MAX; } else if (fValue.fInt64 < INT32_MIN) { status = U_INVALID_FORMAT_ERROR; return INT32_MIN; } else { return (int32_t)fValue.fInt64; } case Formattable::kDouble: if (fValue.fDouble > INT32_MAX) { status = U_INVALID_FORMAT_ERROR; return INT32_MAX; } else if (fValue.fDouble < INT32_MIN) { status = U_INVALID_FORMAT_ERROR; return INT32_MIN; } else { return (int32_t)fValue.fDouble; // loses fraction } case Formattable::kObject: if (fValue.fObject == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; return 0; } // TODO Later replace this with instanceof call if (instanceOfMeasure(fValue.fObject)) { return ((const Measure*) fValue.fObject)-> getNumber().getLong(status); } U_FALLTHROUGH; default: status = U_INVALID_FORMAT_ERROR; return 0; } } // ------------------------------------- // Maximum int that can be represented exactly in a double. (53 bits) // Larger ints may be rounded to a near-by value as not all are representable. // TODO: move this constant elsewhere, possibly configure it for different // floating point formats, if any non-standard ones are still in use. 
static const int64_t U_DOUBLE_MAX_EXACT_INT = 9007199254740992LL; int64_t Formattable::getInt64(UErrorCode& status) const { if (U_FAILURE(status)) { return 0; } switch (fType) { case Formattable::kLong: case Formattable::kInt64: return fValue.fInt64; case Formattable::kDouble: if (fValue.fDouble > (double)U_INT64_MAX) { status = U_INVALID_FORMAT_ERROR; return U_INT64_MAX; } else if (fValue.fDouble < (double)U_INT64_MIN) { status = U_INVALID_FORMAT_ERROR; return U_INT64_MIN; } else if (fabs(fValue.fDouble) > U_DOUBLE_MAX_EXACT_INT && fDecimalQuantity != nullptr) { if (fDecimalQuantity->fitsInLong(true)) { return fDecimalQuantity->toLong(); } else { // Unexpected status = U_INVALID_FORMAT_ERROR; return fDecimalQuantity->isNegative() ? U_INT64_MIN : U_INT64_MAX; } } else { return (int64_t)fValue.fDouble; } case Formattable::kObject: if (fValue.fObject == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; return 0; } if (instanceOfMeasure(fValue.fObject)) { return ((const Measure*) fValue.fObject)-> getNumber().getInt64(status); } U_FALLTHROUGH; default: status = U_INVALID_FORMAT_ERROR; return 0; } } // ------------------------------------- double Formattable::getDouble(UErrorCode& status) const { if (U_FAILURE(status)) { return 0; } switch (fType) { case Formattable::kLong: case Formattable::kInt64: // loses precision return (double)fValue.fInt64; case Formattable::kDouble: return fValue.fDouble; case Formattable::kObject: if (fValue.fObject == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; return 0; } // TODO Later replace this with instanceof call if (instanceOfMeasure(fValue.fObject)) { return ((const Measure*) fValue.fObject)-> getNumber().getDouble(status); } U_FALLTHROUGH; default: status = U_INVALID_FORMAT_ERROR; return 0; } } const UObject* Formattable::getObject() const { return (fType == kObject) ? fValue.fObject : nullptr; } // ------------------------------------- // Sets the value to a double value d. 
void Formattable::setDouble(double d) { dispose(); fType = kDouble; fValue.fDouble = d; } // ------------------------------------- // Sets the value to a long value l. void Formattable::setLong(int32_t l) { dispose(); fType = kLong; fValue.fInt64 = l; } // ------------------------------------- // Sets the value to an int64 value ll. void Formattable::setInt64(int64_t ll) { dispose(); fType = kInt64; fValue.fInt64 = ll; } // ------------------------------------- // Sets the value to a Date instance d. void Formattable::setDate(UDate d) { dispose(); fType = kDate; fValue.fDate = d; } // ------------------------------------- // Sets the value to a string value stringToCopy. void Formattable::setString(const UnicodeString& stringToCopy) { dispose(); fType = kString; fValue.fString = new UnicodeString(stringToCopy); } // ------------------------------------- // Sets the value to an array of Formattable objects. void Formattable::setArray(const Formattable* array, int32_t count) { dispose(); fType = kArray; fValue.fArrayAndCount.fArray = createArrayCopy(array, count); fValue.fArrayAndCount.fCount = count; } // ------------------------------------- // Adopts the stringToAdopt value. void Formattable::adoptString(UnicodeString* stringToAdopt) { dispose(); fType = kString; fValue.fString = stringToAdopt; } // ------------------------------------- // Adopts the array value and its count. 
void Formattable::adoptArray(Formattable* array, int32_t count) { dispose(); fType = kArray; fValue.fArrayAndCount.fArray = array; fValue.fArrayAndCount.fCount = count; } void Formattable::adoptObject(UObject* objectToAdopt) { dispose(); fType = kObject; fValue.fObject = objectToAdopt; } // ------------------------------------- UnicodeString& Formattable::getString(UnicodeString& result, UErrorCode& status) const { if (fType != kString) { setError(status, U_INVALID_FORMAT_ERROR); result.setToBogus(); } else { if (fValue.fString == nullptr) { setError(status, U_MEMORY_ALLOCATION_ERROR); } else { result = *fValue.fString; } } return result; } // ------------------------------------- const UnicodeString& Formattable::getString(UErrorCode& status) const { if (fType != kString) { setError(status, U_INVALID_FORMAT_ERROR); return *getBogus(); } if (fValue.fString == nullptr) { setError(status, U_MEMORY_ALLOCATION_ERROR); return *getBogus(); } return *fValue.fString; } // ------------------------------------- UnicodeString& Formattable::getString(UErrorCode& status) { if (fType != kString) { setError(status, U_INVALID_FORMAT_ERROR); return *getBogus(); } if (fValue.fString == nullptr) { setError(status, U_MEMORY_ALLOCATION_ERROR); return *getBogus(); } return *fValue.fString; } // ------------------------------------- const Formattable* Formattable::getArray(int32_t& count, UErrorCode& status) const { if (fType != kArray) { setError(status, U_INVALID_FORMAT_ERROR); count = 0; return nullptr; } count = fValue.fArrayAndCount.fCount; return fValue.fArrayAndCount.fArray; } // ------------------------------------- // Gets the bogus string, ensures mondo bogosity. 
UnicodeString* Formattable::getBogus() const { return (UnicodeString*)&fBogus; /* cast away const :-( */ } // -------------------------------------- StringPiece Formattable::getDecimalNumber(UErrorCode &status) { if (U_FAILURE(status)) { return ""; } if (fDecimalStr != nullptr) { return fDecimalStr->toStringPiece(); } CharString *decimalStr = internalGetCharString(status); if(decimalStr == nullptr) { return ""; // getDecimalNumber returns "" for error cases } else { return decimalStr->toStringPiece(); } } CharString *Formattable::internalGetCharString(UErrorCode &status) { if(fDecimalStr == nullptr) { if (fDecimalQuantity == nullptr) { // No decimal number for the formattable yet. Which means the value was // set directly by the user as an int, int64 or double. If the value came // from parsing, or from the user setting a decimal number, fDecimalNum // would already be set. // LocalPointer dq(new DecimalQuantity(), status); if (U_FAILURE(status)) { return nullptr; } populateDecimalQuantity(*dq, status); if (U_FAILURE(status)) { return nullptr; } fDecimalQuantity = dq.orphan(); } fDecimalStr = new CharString(); if (fDecimalStr == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; return nullptr; } // Older ICUs called uprv_decNumberToString here, which is not exactly the same as // DecimalQuantity::toScientificString(). The biggest difference is that uprv_decNumberToString does // not print scientific notation for magnitudes greater than -5 and smaller than some amount (+5?). 
if (fDecimalQuantity->isInfinite()) { fDecimalStr->append("Infinity", status); } else if (fDecimalQuantity->isNaN()) { fDecimalStr->append("NaN", status); } else if (fDecimalQuantity->isZeroish()) { fDecimalStr->append("0", -1, status); } else if (fType==kLong || fType==kInt64 || // use toPlainString for integer types (fDecimalQuantity->getMagnitude() != INT32_MIN && std::abs(fDecimalQuantity->getMagnitude()) < 5)) { fDecimalStr->appendInvariantChars(fDecimalQuantity->toPlainString(), status); } else { fDecimalStr->appendInvariantChars(fDecimalQuantity->toScientificString(), status); } } return fDecimalStr; } void Formattable::populateDecimalQuantity(number::impl::DecimalQuantity& output, UErrorCode& status) const { if (fDecimalQuantity != nullptr) { output = *fDecimalQuantity; return; } switch (fType) { case kDouble: output.setToDouble(this->getDouble()); output.roundToInfinity(); break; case kLong: output.setToInt(this->getLong()); break; case kInt64: output.setToLong(this->getInt64()); break; default: // The formattable's value is not a numeric type. status = U_INVALID_STATE_ERROR; } } // --------------------------------------- void Formattable::adoptDecimalQuantity(DecimalQuantity *dq) { if (fDecimalQuantity != nullptr) { delete fDecimalQuantity; } fDecimalQuantity = dq; if (dq == nullptr) { // allow adoptDigitList(nullptr) to clear return; } // Set the value into the Union of simple type values. // Cannot use the set() functions because they would delete the fDecimalNum value. 
if (fDecimalQuantity->fitsInLong()) { fValue.fInt64 = fDecimalQuantity->toLong(); if (fValue.fInt64 <= INT32_MAX && fValue.fInt64 >= INT32_MIN) { fType = kLong; } else { fType = kInt64; } } else { fType = kDouble; fValue.fDouble = fDecimalQuantity->toDouble(); } } // --------------------------------------- void Formattable::setDecimalNumber(StringPiece numberString, UErrorCode &status) { if (U_FAILURE(status)) { return; } dispose(); auto* dq = new DecimalQuantity(); dq->setToDecNumber(numberString, status); adoptDecimalQuantity(dq); // Note that we do not hang on to the caller's input string. // If we are asked for the string, we will regenerate one from fDecimalQuantity. } #if 0 //---------------------------------------------------- // console I/O //---------------------------------------------------- #ifdef _DEBUG #include using namespace std; #include "unicode/datefmt.h" #include "unistrm.h" class FormattableStreamer /* not : public UObject because all methods are static */ { public: static void streamOut(ostream& stream, const Formattable& obj); private: FormattableStreamer() {} // private - forbid instantiation }; // This is for debugging purposes only. This will send a displayable // form of the Formattable object to the output stream. void FormattableStreamer::streamOut(ostream& stream, const Formattable& obj) { static DateFormat *defDateFormat = 0; UnicodeString buffer; switch(obj.getType()) { case Formattable::kDate : // Creates a DateFormat instance for formatting the // Date instance. if (defDateFormat == 0) { defDateFormat = DateFormat::createInstance(); } defDateFormat->format(obj.getDate(), buffer); stream << buffer; break; case Formattable::kDouble : // Output the double as is. stream << obj.getDouble() << 'D'; break; case Formattable::kLong : // Output the double as is. stream << obj.getLong() << 'L'; break; case Formattable::kString: // Output the double as is. Please see UnicodeString console // I/O routine for more details. 
stream << '"' << obj.getString(buffer) << '"'; break; case Formattable::kArray: int32_t i, count; const Formattable* array; array = obj.getArray(count); stream << '['; // Recursively calling the console I/O routine for each element in the array. for (i=0; itoUFormattable(); if( fmt == nullptr ) { *status = U_MEMORY_ALLOCATION_ERROR; } return fmt; } U_CAPI void U_EXPORT2 ufmt_close(UFormattable *fmt) { Formattable *obj = Formattable::fromUFormattable(fmt); delete obj; } U_CAPI UFormattableType U_EXPORT2 ufmt_getType(const UFormattable *fmt, UErrorCode *status) { if(U_FAILURE(*status)) { return (UFormattableType)UFMT_COUNT; } const Formattable *obj = Formattable::fromUFormattable(fmt); return (UFormattableType)obj->getType(); } U_CAPI UBool U_EXPORT2 ufmt_isNumeric(const UFormattable *fmt) { const Formattable *obj = Formattable::fromUFormattable(fmt); return obj->isNumeric(); } U_CAPI UDate U_EXPORT2 ufmt_getDate(const UFormattable *fmt, UErrorCode *status) { const Formattable *obj = Formattable::fromUFormattable(fmt); return obj->getDate(*status); } U_CAPI double U_EXPORT2 ufmt_getDouble(UFormattable *fmt, UErrorCode *status) { Formattable *obj = Formattable::fromUFormattable(fmt); return obj->getDouble(*status); } U_CAPI int32_t U_EXPORT2 ufmt_getLong(UFormattable *fmt, UErrorCode *status) { Formattable *obj = Formattable::fromUFormattable(fmt); return obj->getLong(*status); } U_CAPI const void *U_EXPORT2 ufmt_getObject(const UFormattable *fmt, UErrorCode *status) { const Formattable *obj = Formattable::fromUFormattable(fmt); const void *ret = obj->getObject(); if( ret==nullptr && (obj->getType() != Formattable::kObject) && U_SUCCESS( *status )) { *status = U_INVALID_FORMAT_ERROR; } return ret; } U_CAPI const char16_t* U_EXPORT2 ufmt_getUChars(UFormattable *fmt, int32_t *len, UErrorCode *status) { Formattable *obj = Formattable::fromUFormattable(fmt); // avoid bogosity by checking the type first. 
if( obj->getType() != Formattable::kString ) { if( U_SUCCESS(*status) ){ *status = U_INVALID_FORMAT_ERROR; } return nullptr; } // This should return a valid string UnicodeString &str = obj->getString(*status); if( U_SUCCESS(*status) && len != nullptr ) { *len = str.length(); } return str.getTerminatedBuffer(); } U_CAPI int32_t U_EXPORT2 ufmt_getArrayLength(const UFormattable* fmt, UErrorCode *status) { const Formattable *obj = Formattable::fromUFormattable(fmt); int32_t count; (void)obj->getArray(count, *status); return count; } U_CAPI UFormattable * U_EXPORT2 ufmt_getArrayItemByIndex(UFormattable* fmt, int32_t n, UErrorCode *status) { Formattable *obj = Formattable::fromUFormattable(fmt); int32_t count; (void)obj->getArray(count, *status); if(U_FAILURE(*status)) { return nullptr; } else if(n<0 || n>=count) { setError(*status, U_INDEX_OUTOFBOUNDS_ERROR); return nullptr; } else { return (*obj)[n].toUFormattable(); // returns non-const Formattable } } U_CAPI const char * U_EXPORT2 ufmt_getDecNumChars(UFormattable *fmt, int32_t *len, UErrorCode *status) { if(U_FAILURE(*status)) { return ""; } Formattable *obj = Formattable::fromUFormattable(fmt); CharString *charString = obj->internalGetCharString(*status); if(U_FAILURE(*status)) { return ""; } if(charString == nullptr) { *status = U_MEMORY_ALLOCATION_ERROR; return ""; } else { if(len!=nullptr) { *len = charString->length(); } return charString->data(); } } U_CAPI int64_t U_EXPORT2 ufmt_getInt64(UFormattable *fmt, UErrorCode *status) { Formattable *obj = Formattable::fromUFormattable(fmt); return obj->getInt64(*status); } #endif /* #if !UCONFIG_NO_FORMATTING */ //eof stringi/src/icu74/i18n/simpletz.cpp0000644000176200001440000013326714700200761016616 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others. 
// License & terms of use: http://www.unicode.org/copyright.html /* ******************************************************************************* * Copyright (C) 1997-2013, International Business Machines Corporation and * others. All Rights Reserved. ******************************************************************************* * * File SIMPLETZ.H * * Modification History: * * Date Name Description * 12/05/96 clhuang Creation. * 04/21/97 aliu Fixed miscellaneous bugs found by inspection and * testing. * 07/29/97 aliu Ported source bodies back from Java version with * numerous feature enhancements and bug fixes. * 08/10/98 stephen JDK 1.2 sync. * 09/17/98 stephen Fixed getOffset() for last hour of year and DST * 12/02/99 aliu Added TimeMode and constructor and setStart/EndRule * methods that take TimeMode. Whitespace cleanup. ******************************************************************************** */ #include "utypeinfo.h" // for 'typeid' to work #include "unicode/utypes.h" #if !UCONFIG_NO_FORMATTING #include "unicode/simpletz.h" #include "unicode/gregocal.h" #include "unicode/smpdtfmt.h" #include "cmemory.h" #include "gregoimp.h" #include "umutex.h" U_NAMESPACE_BEGIN UOBJECT_DEFINE_RTTI_IMPLEMENTATION(SimpleTimeZone) // Use only for decodeStartRule() and decodeEndRule() where the year is not // available. Set February to 29 days to accommodate rules with that date // and day-of-week-on-or-before-that-date mode (DOW_LE_DOM_MODE). // The compareToRule() method adjusts to February 28 in non-leap years. // // For actual getOffset() calculations, use Grego::monthLength() and // Grego::previousMonthLength() which take leap years into account. // We handle leap years assuming always // Gregorian, since we know they didn't have daylight time when // Gregorian calendar started. 
const int8_t SimpleTimeZone::STATICMONTHLENGTH[] = {31,29,31,30,31,30,31,31,30,31,30,31}; static const char16_t DST_STR[] = {0x0028,0x0044,0x0053,0x0054,0x0029,0}; // "(DST)" static const char16_t STD_STR[] = {0x0028,0x0053,0x0054,0x0044,0x0029,0}; // "(STD)" // ***************************************************************************** // class SimpleTimeZone // ***************************************************************************** SimpleTimeZone::SimpleTimeZone(int32_t rawOffsetGMT, const UnicodeString& ID) : BasicTimeZone(ID), startMonth(0), startDay(0), startDayOfWeek(0), startTime(0), startTimeMode(WALL_TIME), endTimeMode(WALL_TIME), endMonth(0), endDay(0), endDayOfWeek(0), endTime(0), startYear(0), rawOffset(rawOffsetGMT), useDaylight(false), startMode(DOM_MODE), endMode(DOM_MODE), dstSavings(U_MILLIS_PER_HOUR) { clearTransitionRules(); } // ------------------------------------- SimpleTimeZone::SimpleTimeZone(int32_t rawOffsetGMT, const UnicodeString& ID, int8_t savingsStartMonth, int8_t savingsStartDay, int8_t savingsStartDayOfWeek, int32_t savingsStartTime, int8_t savingsEndMonth, int8_t savingsEndDay, int8_t savingsEndDayOfWeek, int32_t savingsEndTime, UErrorCode& status) : BasicTimeZone(ID) { clearTransitionRules(); construct(rawOffsetGMT, savingsStartMonth, savingsStartDay, savingsStartDayOfWeek, savingsStartTime, WALL_TIME, savingsEndMonth, savingsEndDay, savingsEndDayOfWeek, savingsEndTime, WALL_TIME, U_MILLIS_PER_HOUR, status); } // ------------------------------------- SimpleTimeZone::SimpleTimeZone(int32_t rawOffsetGMT, const UnicodeString& ID, int8_t savingsStartMonth, int8_t savingsStartDay, int8_t savingsStartDayOfWeek, int32_t savingsStartTime, int8_t savingsEndMonth, int8_t savingsEndDay, int8_t savingsEndDayOfWeek, int32_t savingsEndTime, int32_t savingsDST, UErrorCode& status) : BasicTimeZone(ID) { clearTransitionRules(); construct(rawOffsetGMT, savingsStartMonth, savingsStartDay, savingsStartDayOfWeek, savingsStartTime, WALL_TIME, 
savingsEndMonth, savingsEndDay, savingsEndDayOfWeek, savingsEndTime, WALL_TIME, savingsDST, status); } // ------------------------------------- SimpleTimeZone::SimpleTimeZone(int32_t rawOffsetGMT, const UnicodeString& ID, int8_t savingsStartMonth, int8_t savingsStartDay, int8_t savingsStartDayOfWeek, int32_t savingsStartTime, TimeMode savingsStartTimeMode, int8_t savingsEndMonth, int8_t savingsEndDay, int8_t savingsEndDayOfWeek, int32_t savingsEndTime, TimeMode savingsEndTimeMode, int32_t savingsDST, UErrorCode& status) : BasicTimeZone(ID) { clearTransitionRules(); construct(rawOffsetGMT, savingsStartMonth, savingsStartDay, savingsStartDayOfWeek, savingsStartTime, savingsStartTimeMode, savingsEndMonth, savingsEndDay, savingsEndDayOfWeek, savingsEndTime, savingsEndTimeMode, savingsDST, status); } /** * Internal construction method. */ void SimpleTimeZone::construct(int32_t rawOffsetGMT, int8_t savingsStartMonth, int8_t savingsStartDay, int8_t savingsStartDayOfWeek, int32_t savingsStartTime, TimeMode savingsStartTimeMode, int8_t savingsEndMonth, int8_t savingsEndDay, int8_t savingsEndDayOfWeek, int32_t savingsEndTime, TimeMode savingsEndTimeMode, int32_t savingsDST, UErrorCode& status) { this->rawOffset = rawOffsetGMT; this->startMonth = savingsStartMonth; this->startDay = savingsStartDay; this->startDayOfWeek = savingsStartDayOfWeek; this->startTime = savingsStartTime; this->startTimeMode = savingsStartTimeMode; this->endMonth = savingsEndMonth; this->endDay = savingsEndDay; this->endDayOfWeek = savingsEndDayOfWeek; this->endTime = savingsEndTime; this->endTimeMode = savingsEndTimeMode; this->dstSavings = savingsDST; this->startYear = 0; this->startMode = DOM_MODE; this->endMode = DOM_MODE; decodeRules(status); if (savingsDST == 0) { status = U_ILLEGAL_ARGUMENT_ERROR; } } // ------------------------------------- SimpleTimeZone::~SimpleTimeZone() { deleteTransitionRules(); } // ------------------------------------- // Called by TimeZone::createDefault(), then clone() 
inside a Mutex - be careful. SimpleTimeZone::SimpleTimeZone(const SimpleTimeZone &source) : BasicTimeZone(source) { *this = source; } // ------------------------------------- // Called by TimeZone::createDefault(), then clone() inside a Mutex - be careful. SimpleTimeZone & SimpleTimeZone::operator=(const SimpleTimeZone &right) { if (this != &right) { TimeZone::operator=(right); rawOffset = right.rawOffset; startMonth = right.startMonth; startDay = right.startDay; startDayOfWeek = right.startDayOfWeek; startTime = right.startTime; startTimeMode = right.startTimeMode; startMode = right.startMode; endMonth = right.endMonth; endDay = right.endDay; endDayOfWeek = right.endDayOfWeek; endTime = right.endTime; endTimeMode = right.endTimeMode; endMode = right.endMode; startYear = right.startYear; dstSavings = right.dstSavings; useDaylight = right.useDaylight; clearTransitionRules(); } return *this; } // ------------------------------------- bool SimpleTimeZone::operator==(const TimeZone& that) const { return ((this == &that) || (typeid(*this) == typeid(that) && TimeZone::operator==(that) && hasSameRules(that))); } // ------------------------------------- // Called by TimeZone::createDefault() inside a Mutex - be careful. SimpleTimeZone* SimpleTimeZone::clone() const { return new SimpleTimeZone(*this); } // ------------------------------------- /** * Sets the daylight savings starting year, that is, the year this time zone began * observing its specified daylight savings time rules. The time zone is considered * not to observe daylight savings time prior to that year; SimpleTimeZone doesn't * support historical daylight-savings-time rules. * @param year the daylight savings starting year. */ void SimpleTimeZone::setStartYear(int32_t year) { startYear = year; transitionRulesInitialized = false; } // ------------------------------------- /** * Sets the daylight savings starting rule. 
For example, in the U.S., Daylight Savings * Time starts at the first Sunday in April, at 2 AM in standard time. * Therefore, you can set the start rule by calling: * setStartRule(TimeFields.APRIL, 1, TimeFields.SUNDAY, 2*60*60*1000); * The dayOfWeekInMonth and dayOfWeek parameters together specify how to calculate * the exact starting date. Their exact meaning depends on their respective signs, * allowing various types of rules to be constructed, as follows:

    *
  • If both dayOfWeekInMonth and dayOfWeek are positive, they specify the * day of week in the month (e.g., (2, WEDNESDAY) is the second Wednesday * of the month). *
  • If dayOfWeek is positive and dayOfWeekInMonth is negative, they specify * the day of week in the month counting backward from the end of the month. * (e.g., (-1, MONDAY) is the last Monday in the month) *
  • If dayOfWeek is zero and dayOfWeekInMonth is positive, dayOfWeekInMonth * specifies the day of the month, regardless of what day of the week it is. * (e.g., (10, 0) is the tenth day of the month) *
  • If dayOfWeek is zero and dayOfWeekInMonth is negative, dayOfWeekInMonth * specifies the day of the month counting backward from the end of the * month, regardless of what day of the week it is (e.g., (-2, 0) is the * next-to-last day of the month). *
  • If dayOfWeek is negative and dayOfWeekInMonth is positive, they specify the * first specified day of the week on or after the specified day of the month. * (e.g., (15, -SUNDAY) is the first Sunday after the 15th of the month * [or the 15th itself if the 15th is a Sunday].) *
  • If dayOfWeek and DayOfWeekInMonth are both negative, they specify the * last specified day of the week on or before the specified day of the month. * (e.g., (-20, -TUESDAY) is the last Tuesday before the 20th of the month * [or the 20th itself if the 20th is a Tuesday].)
* @param month the daylight savings starting month. Month is 0-based. * eg, 0 for January. * @param dayOfWeekInMonth the daylight savings starting * day-of-week-in-month. Please see the member description for an example. * @param dayOfWeek the daylight savings starting day-of-week. Please see * the member description for an example. * @param time the daylight savings starting time. Please see the member * description for an example. */ void SimpleTimeZone::setStartRule(int32_t month, int32_t dayOfWeekInMonth, int32_t dayOfWeek, int32_t time, TimeMode mode, UErrorCode& status) { startMonth = (int8_t)month; startDay = (int8_t)dayOfWeekInMonth; startDayOfWeek = (int8_t)dayOfWeek; startTime = time; startTimeMode = mode; decodeStartRule(status); transitionRulesInitialized = false; } // ------------------------------------- void SimpleTimeZone::setStartRule(int32_t month, int32_t dayOfMonth, int32_t time, TimeMode mode, UErrorCode& status) { setStartRule(month, dayOfMonth, 0, time, mode, status); } // ------------------------------------- void SimpleTimeZone::setStartRule(int32_t month, int32_t dayOfMonth, int32_t dayOfWeek, int32_t time, TimeMode mode, UBool after, UErrorCode& status) { setStartRule(month, after ? dayOfMonth : -dayOfMonth, -dayOfWeek, time, mode, status); } // ------------------------------------- /** * Sets the daylight savings ending rule. For example, in the U.S., Daylight * Savings Time ends at the last (-1) Sunday in October, at 2 AM in standard time. * Therefore, you can set the end rule by calling: * setEndRule(TimeFields.OCTOBER, -1, TimeFields.SUNDAY, 2*60*60*1000); * Various other types of rules can be specified by manipulating the dayOfWeek * and dayOfWeekInMonth parameters. For complete details, see the documentation * for setStartRule(). * @param month the daylight savings ending month. Month is 0-based. * eg, 0 for January. * @param dayOfWeekInMonth the daylight savings ending * day-of-week-in-month. 
See setStartRule() for a complete explanation. * @param dayOfWeek the daylight savings ending day-of-week. See setStartRule() * for a complete explanation. * @param time the daylight savings ending time. Please see the member * description for an example. */ void SimpleTimeZone::setEndRule(int32_t month, int32_t dayOfWeekInMonth, int32_t dayOfWeek, int32_t time, TimeMode mode, UErrorCode& status) { endMonth = (int8_t)month; endDay = (int8_t)dayOfWeekInMonth; endDayOfWeek = (int8_t)dayOfWeek; endTime = time; endTimeMode = mode; decodeEndRule(status); transitionRulesInitialized = false; } // ------------------------------------- void SimpleTimeZone::setEndRule(int32_t month, int32_t dayOfMonth, int32_t time, TimeMode mode, UErrorCode& status) { setEndRule(month, dayOfMonth, 0, time, mode, status); } // ------------------------------------- void SimpleTimeZone::setEndRule(int32_t month, int32_t dayOfMonth, int32_t dayOfWeek, int32_t time, TimeMode mode, UBool after, UErrorCode& status) { setEndRule(month, after ? dayOfMonth : -dayOfMonth, -dayOfWeek, time, mode, status); } // ------------------------------------- int32_t SimpleTimeZone::getOffset(uint8_t era, int32_t year, int32_t month, int32_t day, uint8_t dayOfWeek, int32_t millis, UErrorCode& status) const { // Check the month before calling Grego::monthLength(). This // duplicates the test that occurs in the 7-argument getOffset(), // however, this is unavoidable. We don't mind because this method, in // fact, should not be called; internal code should always call the // 7-argument getOffset(), and outside code should use Calendar.get(int // field) with fields ZONE_OFFSET and DST_OFFSET. We can't get rid of // this method because it's public API. 
- liu 8/10/98 if(month < UCAL_JANUARY || month > UCAL_DECEMBER) { status = U_ILLEGAL_ARGUMENT_ERROR; return 0; } return getOffset(era, year, month, day, dayOfWeek, millis, Grego::monthLength(year, month), status); } int32_t SimpleTimeZone::getOffset(uint8_t era, int32_t year, int32_t month, int32_t day, uint8_t dayOfWeek, int32_t millis, int32_t /*monthLength*/, UErrorCode& status) const { // Check the month before calling Grego::monthLength(). This // duplicates a test that occurs in the 9-argument getOffset(), // however, this is unavoidable. We don't mind because this method, in // fact, should not be called; internal code should always call the // 9-argument getOffset(), and outside code should use Calendar.get(int // field) with fields ZONE_OFFSET and DST_OFFSET. We can't get rid of // this method because it's public API. - liu 8/10/98 if (month < UCAL_JANUARY || month > UCAL_DECEMBER) { status = U_ILLEGAL_ARGUMENT_ERROR; return -1; } // We ignore monthLength because it can be derived from year and month. // This is so that February in leap years is calculated correctly. // We keep this argument in this function for backwards compatibility. 
return getOffset(era, year, month, day, dayOfWeek, millis, Grego::monthLength(year, month), Grego::previousMonthLength(year, month), status); } int32_t SimpleTimeZone::getOffset(uint8_t era, int32_t year, int32_t month, int32_t day, uint8_t dayOfWeek, int32_t millis, int32_t monthLength, int32_t prevMonthLength, UErrorCode& status) const { if(U_FAILURE(status)) return 0; if ((era != GregorianCalendar::AD && era != GregorianCalendar::BC) || month < UCAL_JANUARY || month > UCAL_DECEMBER || day < 1 || day > monthLength || dayOfWeek < UCAL_SUNDAY || dayOfWeek > UCAL_SATURDAY || millis < 0 || millis >= U_MILLIS_PER_DAY || monthLength < 28 || monthLength > 31 || prevMonthLength < 28 || prevMonthLength > 31) { status = U_ILLEGAL_ARGUMENT_ERROR; return -1; } int32_t result = rawOffset; // Bail out if we are before the onset of daylight savings time if(!useDaylight || year < startYear || era != GregorianCalendar::AD) return result; // Check for southern hemisphere. We assume that the start and end // month are different. UBool southern = (startMonth > endMonth); // Compare the date to the starting and ending rules.+1 = date>rule, -1 // = date= 0)) { endCompare = compareToRule((int8_t)month, (int8_t)monthLength, (int8_t)prevMonthLength, (int8_t)day, (int8_t)dayOfWeek, millis, endTimeMode == WALL_TIME ? dstSavings : (endTimeMode == UTC_TIME ? -rawOffset : 0), endMode, (int8_t)endMonth, (int8_t)endDayOfWeek, (int8_t)endDay, endTime); } // Check for both the northern and southern hemisphere cases. We // assume that in the northern hemisphere, the start rule is before the // end rule within the calendar year, and vice versa for the southern // hemisphere. 
if ((!southern && (startCompare >= 0 && endCompare < 0)) || (southern && (startCompare >= 0 || endCompare < 0))) result += dstSavings; return result; } void SimpleTimeZone::getOffsetFromLocal(UDate date, UTimeZoneLocalOption nonExistingTimeOpt, UTimeZoneLocalOption duplicatedTimeOpt, int32_t& rawOffsetGMT, int32_t& savingsDST, UErrorCode& status) const { if (U_FAILURE(status)) { return; } rawOffsetGMT = getRawOffset(); int32_t year, month, dom, dow, millis; int32_t day = ClockMath::floorDivide(date, U_MILLIS_PER_DAY, &millis); Grego::dayToFields(day, year, month, dom, dow); savingsDST = getOffset(GregorianCalendar::AD, year, month, dom, (uint8_t) dow, millis, Grego::monthLength(year, month), status) - rawOffsetGMT; if (U_FAILURE(status)) { return; } UBool recalc = false; // Now we need some adjustment if (savingsDST > 0) { if ((nonExistingTimeOpt & kStdDstMask) == kStandard || ((nonExistingTimeOpt & kStdDstMask) != kDaylight && (nonExistingTimeOpt & kFormerLatterMask) != kLatter)) { date -= getDSTSavings(); recalc = true; } } else { if ((duplicatedTimeOpt & kStdDstMask) == kDaylight || ((duplicatedTimeOpt & kStdDstMask) != kStandard && (duplicatedTimeOpt & kFormerLatterMask) == kFormer)) { date -= getDSTSavings(); recalc = true; } } if (recalc) { day = ClockMath::floorDivide(date, U_MILLIS_PER_DAY, &millis); Grego::dayToFields(day, year, month, dom, dow); savingsDST = getOffset(GregorianCalendar::AD, year, month, dom, (uint8_t) dow, millis, Grego::monthLength(year, month), status) - rawOffsetGMT; } } // ------------------------------------- /** * Compare a given date in the year to a rule. Return 1, 0, or -1, depending * on whether the date is after, equal to, or before the rule date. The * millis are compared directly against the ruleMillis, so any * standard-daylight adjustments must be handled by the caller. * * @return 1 if the date is after the rule date, -1 if the date is before * the rule date, or 0 if the date is equal to the rule date. 
*/ int32_t SimpleTimeZone::compareToRule(int8_t month, int8_t monthLen, int8_t prevMonthLen, int8_t dayOfMonth, int8_t dayOfWeek, int32_t millis, int32_t millisDelta, EMode ruleMode, int8_t ruleMonth, int8_t ruleDayOfWeek, int8_t ruleDay, int32_t ruleMillis) { // Make adjustments for startTimeMode and endTimeMode millis += millisDelta; while (millis >= U_MILLIS_PER_DAY) { millis -= U_MILLIS_PER_DAY; ++dayOfMonth; dayOfWeek = (int8_t)(1 + (dayOfWeek % 7)); // dayOfWeek is one-based if (dayOfMonth > monthLen) { dayOfMonth = 1; /* When incrementing the month, it is desirable to overflow * from DECEMBER to DECEMBER+1, since we use the result to * compare against a real month. Wraparound of the value * leads to bug 4173604. */ ++month; } } while (millis < 0) { millis += U_MILLIS_PER_DAY; --dayOfMonth; dayOfWeek = (int8_t)(1 + ((dayOfWeek+5) % 7)); // dayOfWeek is one-based if (dayOfMonth < 1) { dayOfMonth = prevMonthLen; --month; } } // first compare months. If they're different, we don't have to worry about days // and times if (month < ruleMonth) return -1; else if (month > ruleMonth) return 1; // calculate the actual day of month for the rule int32_t ruleDayOfMonth = 0; // Adjust the ruleDay to the monthLen, for non-leap year February 29 rule days. 
if (ruleDay > monthLen) { ruleDay = monthLen; } switch (ruleMode) { // if the mode is day-of-month, the day of month is given case DOM_MODE: ruleDayOfMonth = ruleDay; break; // if the mode is day-of-week-in-month, calculate the day-of-month from it case DOW_IN_MONTH_MODE: // In this case ruleDay is the day-of-week-in-month (this code is using // the dayOfWeek and dayOfMonth parameters to figure out the day-of-week // of the first day of the month, so it's trusting that they're really // consistent with each other) if (ruleDay > 0) ruleDayOfMonth = 1 + (ruleDay - 1) * 7 + (7 + ruleDayOfWeek - (dayOfWeek - dayOfMonth + 1)) % 7; // if ruleDay is negative (we assume it's not zero here), we have to do // the same calculation figuring backward from the last day of the month. else { // (again, this code is trusting that dayOfWeek and dayOfMonth are // consistent with each other here, since we're using them to figure // the day of week of the first of the month) ruleDayOfMonth = monthLen + (ruleDay + 1) * 7 - (7 + (dayOfWeek + monthLen - dayOfMonth) - ruleDayOfWeek) % 7; } break; case DOW_GE_DOM_MODE: ruleDayOfMonth = ruleDay + (49 + ruleDayOfWeek - ruleDay - dayOfWeek + dayOfMonth) % 7; break; case DOW_LE_DOM_MODE: ruleDayOfMonth = ruleDay - (49 - ruleDayOfWeek + ruleDay + dayOfWeek - dayOfMonth) % 7; // Note at this point ruleDayOfMonth may be <1, although it will // be >=1 for well-formed rules. break; } // now that we have a real day-in-month for the rule, we can compare days... 
if (dayOfMonth < ruleDayOfMonth) return -1; else if (dayOfMonth > ruleDayOfMonth) return 1; // ...and if they're equal, we compare times if (millis < ruleMillis) return -1; else if (millis > ruleMillis) return 1; else return 0; } // ------------------------------------- int32_t SimpleTimeZone::getRawOffset() const { return rawOffset; } // ------------------------------------- void SimpleTimeZone::setRawOffset(int32_t offsetMillis) { rawOffset = offsetMillis; transitionRulesInitialized = false; } // ------------------------------------- void SimpleTimeZone::setDSTSavings(int32_t millisSavedDuringDST, UErrorCode& status) { if (millisSavedDuringDST == 0) { status = U_ILLEGAL_ARGUMENT_ERROR; } else { dstSavings = millisSavedDuringDST; } transitionRulesInitialized = false; } // ------------------------------------- int32_t SimpleTimeZone::getDSTSavings() const { return dstSavings; } // ------------------------------------- UBool SimpleTimeZone::useDaylightTime() const { return useDaylight; } // ------------------------------------- /** * Overrides TimeZone * Queries if the given date is in Daylight Savings Time. */ UBool SimpleTimeZone::inDaylightTime(UDate date, UErrorCode& status) const { // This method is wasteful since it creates a new GregorianCalendar and // deletes it each time it is called. However, this is a deprecated method // and provided only for Java compatibility as of 8/6/97 [LIU]. if (U_FAILURE(status)) return false; GregorianCalendar *gc = new GregorianCalendar(*this, status); /* test for nullptr */ if (gc == 0) { status = U_MEMORY_ALLOCATION_ERROR; return false; } gc->setTime(date, status); UBool result = gc->inDaylightTime(status); delete gc; return result; } // ------------------------------------- /** * Return true if this zone has the same rules and offset as another zone. 
* @param other      the TimeZone object to be compared with
 * @return           true if the given zone has the same rules and offset as this one
 */
UBool
SimpleTimeZone::hasSameRules(const TimeZone& other) const
{
    // Identity: a zone always has the same rules as itself.
    if (this == &other) {
        return true;
    }
    // Only an object of exactly the same dynamic type can match.
    if (typeid(*this) != typeid(other)) {
        return false;
    }

    const SimpleTimeZone& that = static_cast<const SimpleTimeZone&>(other);

    if (rawOffset != that.rawOffset || useDaylight != that.useDaylight) {
        return false;
    }
    // Only check rules if using DST
    if (!useDaylight) {
        return true;
    }
    return dstSavings     == that.dstSavings &&
           startMode      == that.startMode &&
           startMonth     == that.startMonth &&
           startDay       == that.startDay &&
           startDayOfWeek == that.startDayOfWeek &&
           startTime      == that.startTime &&
           startTimeMode  == that.startTimeMode &&
           endMode        == that.endMode &&
           endMonth       == that.endMonth &&
           endDay         == that.endDay &&
           endDayOfWeek   == that.endDayOfWeek &&
           endTime        == that.endTime &&
           endTimeMode    == that.endTimeMode &&
           startYear      == that.startYear;
}

// -------------------------------------

//----------------------------------------------------------------------
// Rule representation
//
// We represent the following flavors of rules:
//       5        the fifth of the month
//       lastSun  the last Sunday in the month
//       lastMon  the last Monday in the month
//       Sun>=8   first Sunday on or after the eighth
//       Sun<=25  last Sunday on or before the 25th
// This is further complicated by the fact that we need to remain
// backward compatible with the 1.1 FCS.  Finally, we need to minimize
// API changes.  In order to satisfy these requirements, we support
// three representation systems, and we translate between them.
//
// INTERNAL REPRESENTATION
// This is the format SimpleTimeZone objects take after construction or
// streaming in is complete.  Rules are represented directly, using an
// unencoded format.  We will discuss the start rule only below; the end
// rule is analogous.
//   startMode      Takes on enumerated values DAY_OF_MONTH,
//                  DOW_IN_MONTH, DOW_AFTER_DOM, or DOW_BEFORE_DOM.
//   startDay       The day of the month, or for DOW_IN_MONTH mode, a
//                  value indicating which DOW, such as +1 for first,
//                  +2 for second, -1 for last, etc.
//   startDayOfWeek The day of the week.  Ignored for DAY_OF_MONTH.
//
// ENCODED REPRESENTATION
// This is the format accepted by the constructor and by setStartRule()
// and setEndRule().  It uses various combinations of positive, negative,
// and zero values to encode the different rules.  This representation
// allows us to specify all the different rule flavors without altering
// the API.
//   MODE              startMonth    startDay    startDayOfWeek
//   DOW_IN_MONTH_MODE >=0           !=0         >0
//   DOM_MODE          >=0           >0          ==0
//   DOW_GE_DOM_MODE   >=0           >0          <0
//   DOW_LE_DOM_MODE   >=0           <0          <0
//   (no DST)          don't care    ==0         don't care
//
// STREAMED REPRESENTATION
// We must retain binary compatibility with the 1.1 FCS.  The 1.1 code only
// handles DOW_IN_MONTH_MODE and non-DST mode, the latter indicated by the
// flag useDaylight.  When we stream an object out, we translate into an
// approximate DOW_IN_MONTH_MODE representation so the object can be parsed
// and used by 1.1 code.  Following that, we write out the full
// representation separately so that contemporary code can recognize and
// parse it.  The full representation is written in a "packed" format,
// consisting of a version number, a length, and an array of bytes.  Future
// versions of this class may specify different versions.  If they wish to
// include additional data, they should do so by storing them after the
// packed representation below.
//----------------------------------------------------------------------

/**
 * Given a set of encoded rules in startDay and startDayOfWeek, decode
 * them and set the startMode appropriately.  Do the same for endDay and
 * endDayOfWeek.  Upon entry, the day of week variables may be zero or
 * negative, in order to indicate special modes.  The day of month
 * variables may also be negative.  Upon exit, the mode variables will be
 * set, and the day of week and day of month variables will be positive.
 * This method also recognizes a startDay or endDay of zero as indicating
 * no DST.
 *
 * @param status  passed through to decodeStartRule() and then
 *                decodeEndRule(), in that order.
 */
void
SimpleTimeZone::decodeRules(UErrorCode& status)
{
    decodeStartRule(status);
    decodeEndRule(status);
}

/**
 * Decode the start rule and validate the parameters.  The parameters are
 * expected to be in encoded form, which represents the various rule modes
 * by negating or zeroing certain values.  Representation formats are:
 *

*

 *            DOW_IN_MONTH  DOM    DOW>=DOM  DOW<=DOM  no DST
 *            ------------  -----  --------  --------  ----------
 * month       0..11        same    same      same     don't care
 * day        -5..5         1..31   1..31    -1..-31   0
 * dayOfWeek   1..7         0      -1..-7    -1..-7    don't care
 * time        0..ONEDAY    same    same      same     don't care
 * 
* The range for month does not include UNDECIMBER since this class is * really specific to GregorianCalendar, which does not use that month. * The range for time includes ONEDAY (vs. ending at ONEDAY-1) because the * end rule is an exclusive limit point. That is, the range of times that * are in DST include those >= the start and < the end. For this reason, * it should be possible to specify an end of ONEDAY in order to include the * entire day. Although this is equivalent to time 0 of the following day, * it's not always possible to specify that, for example, on December 31. * While arguably the start range should still be 0..ONEDAY-1, we keep * the start and end ranges the same for consistency. */ void SimpleTimeZone::decodeStartRule(UErrorCode& status) { if(U_FAILURE(status)) return; useDaylight = (UBool)((startDay != 0) && (endDay != 0) ? true : false); if (useDaylight && dstSavings == 0) { dstSavings = U_MILLIS_PER_HOUR; } if (startDay != 0) { if (startMonth < UCAL_JANUARY || startMonth > UCAL_DECEMBER) { status = U_ILLEGAL_ARGUMENT_ERROR; return; } if (startTime < 0 || startTime > U_MILLIS_PER_DAY || startTimeMode < WALL_TIME || startTimeMode > UTC_TIME) { status = U_ILLEGAL_ARGUMENT_ERROR; return; } if (startDayOfWeek == 0) { startMode = DOM_MODE; } else { if (startDayOfWeek > 0) { startMode = DOW_IN_MONTH_MODE; } else { startDayOfWeek = (int8_t)-startDayOfWeek; if (startDay > 0) { startMode = DOW_GE_DOM_MODE; } else { startDay = (int8_t)-startDay; startMode = DOW_LE_DOM_MODE; } } if (startDayOfWeek > UCAL_SATURDAY) { status = U_ILLEGAL_ARGUMENT_ERROR; return; } } if (startMode == DOW_IN_MONTH_MODE) { if (startDay < -5 || startDay > 5) { status = U_ILLEGAL_ARGUMENT_ERROR; return; } } else if (startDay<1 || startDay > STATICMONTHLENGTH[startMonth]) { status = U_ILLEGAL_ARGUMENT_ERROR; return; } } } /** * Decode the end rule and validate the parameters. This method is exactly * analogous to decodeStartRule(). 
* @see decodeStartRule */ void SimpleTimeZone::decodeEndRule(UErrorCode& status) { if(U_FAILURE(status)) return; useDaylight = (UBool)((startDay != 0) && (endDay != 0) ? true : false); if (useDaylight && dstSavings == 0) { dstSavings = U_MILLIS_PER_HOUR; } if (endDay != 0) { if (endMonth < UCAL_JANUARY || endMonth > UCAL_DECEMBER) { status = U_ILLEGAL_ARGUMENT_ERROR; return; } if (endTime < 0 || endTime > U_MILLIS_PER_DAY || endTimeMode < WALL_TIME || endTimeMode > UTC_TIME) { status = U_ILLEGAL_ARGUMENT_ERROR; return; } if (endDayOfWeek == 0) { endMode = DOM_MODE; } else { if (endDayOfWeek > 0) { endMode = DOW_IN_MONTH_MODE; } else { endDayOfWeek = (int8_t)-endDayOfWeek; if (endDay > 0) { endMode = DOW_GE_DOM_MODE; } else { endDay = (int8_t)-endDay; endMode = DOW_LE_DOM_MODE; } } if (endDayOfWeek > UCAL_SATURDAY) { status = U_ILLEGAL_ARGUMENT_ERROR; return; } } if (endMode == DOW_IN_MONTH_MODE) { if (endDay < -5 || endDay > 5) { status = U_ILLEGAL_ARGUMENT_ERROR; return; } } else if (endDay<1 || endDay > STATICMONTHLENGTH[endMonth]) { status = U_ILLEGAL_ARGUMENT_ERROR; return; } } } UBool SimpleTimeZone::getNextTransition(UDate base, UBool inclusive, TimeZoneTransition& result) const { if (!useDaylight) { return false; } UErrorCode status = U_ZERO_ERROR; checkTransitionRules(status); if (U_FAILURE(status)) { return false; } UDate firstTransitionTime = firstTransition->getTime(); if (base < firstTransitionTime || (inclusive && base == firstTransitionTime)) { result = *firstTransition; } UDate stdDate, dstDate; UBool stdAvail = stdRule->getNextStart(base, dstRule->getRawOffset(), dstRule->getDSTSavings(), inclusive, stdDate); UBool dstAvail = dstRule->getNextStart(base, stdRule->getRawOffset(), stdRule->getDSTSavings(), inclusive, dstDate); if (stdAvail && (!dstAvail || stdDate < dstDate)) { result.setTime(stdDate); result.setFrom(*dstRule); result.setTo(*stdRule); return true; } if (dstAvail && (!stdAvail || dstDate < stdDate)) { result.setTime(dstDate); 
result.setFrom(*stdRule); result.setTo(*dstRule); return true; } return false; } UBool SimpleTimeZone::getPreviousTransition(UDate base, UBool inclusive, TimeZoneTransition& result) const { if (!useDaylight) { return false; } UErrorCode status = U_ZERO_ERROR; checkTransitionRules(status); if (U_FAILURE(status)) { return false; } UDate firstTransitionTime = firstTransition->getTime(); if (base < firstTransitionTime || (!inclusive && base == firstTransitionTime)) { return false; } UDate stdDate, dstDate; UBool stdAvail = stdRule->getPreviousStart(base, dstRule->getRawOffset(), dstRule->getDSTSavings(), inclusive, stdDate); UBool dstAvail = dstRule->getPreviousStart(base, stdRule->getRawOffset(), stdRule->getDSTSavings(), inclusive, dstDate); if (stdAvail && (!dstAvail || stdDate > dstDate)) { result.setTime(stdDate); result.setFrom(*dstRule); result.setTo(*stdRule); return true; } if (dstAvail && (!stdAvail || dstDate > stdDate)) { result.setTime(dstDate); result.setFrom(*stdRule); result.setTo(*dstRule); return true; } return false; } void SimpleTimeZone::clearTransitionRules() { initialRule = nullptr; firstTransition = nullptr; stdRule = nullptr; dstRule = nullptr; transitionRulesInitialized = false; } void SimpleTimeZone::deleteTransitionRules() { if (initialRule != nullptr) { delete initialRule; } if (firstTransition != nullptr) { delete firstTransition; } if (stdRule != nullptr) { delete stdRule; } if (dstRule != nullptr) { delete dstRule; } clearTransitionRules(); } /* * Lazy transition rules initializer * * Note On the removal of UMTX_CHECK from checkTransitionRules(): * * It would be faster to have a UInitOnce as part of a SimpleTimeZone object, * which would avoid needing to lock a mutex to check the initialization state. * But we can't easily because simpletz.h is a public header, and including * a UInitOnce as a member of SimpleTimeZone would publicly expose internal ICU headers. 
* * Alternatively we could have a pointer to a UInitOnce in the SimpleTimeZone object, * allocate it in the constructors. This would be a more intrusive change, but doable * if performance turns out to be an issue. */ void SimpleTimeZone::checkTransitionRules(UErrorCode& status) const { if (U_FAILURE(status)) { return; } static UMutex gLock; umtx_lock(&gLock); if (!transitionRulesInitialized) { SimpleTimeZone *ncThis = const_cast(this); ncThis->initTransitionRules(status); } umtx_unlock(&gLock); } void SimpleTimeZone::initTransitionRules(UErrorCode& status) { if (U_FAILURE(status)) { return; } if (transitionRulesInitialized) { return; } deleteTransitionRules(); UnicodeString tzid; getID(tzid); if (useDaylight) { DateTimeRule* dtRule; DateTimeRule::TimeRuleType timeRuleType; UDate firstStdStart, firstDstStart; // Create a TimeZoneRule for daylight saving time timeRuleType = (startTimeMode == STANDARD_TIME) ? DateTimeRule::STANDARD_TIME : ((startTimeMode == UTC_TIME) ? DateTimeRule::UTC_TIME : DateTimeRule::WALL_TIME); switch (startMode) { case DOM_MODE: dtRule = new DateTimeRule(startMonth, startDay, startTime, timeRuleType); break; case DOW_IN_MONTH_MODE: dtRule = new DateTimeRule(startMonth, startDay, startDayOfWeek, startTime, timeRuleType); break; case DOW_GE_DOM_MODE: dtRule = new DateTimeRule(startMonth, startDay, startDayOfWeek, true, startTime, timeRuleType); break; case DOW_LE_DOM_MODE: dtRule = new DateTimeRule(startMonth, startDay, startDayOfWeek, false, startTime, timeRuleType); break; default: status = U_INVALID_STATE_ERROR; return; } // Check for Null pointer if (dtRule == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; return; } // For now, use ID + "(DST)" as the name dstRule = new AnnualTimeZoneRule(tzid+UnicodeString(DST_STR), getRawOffset(), getDSTSavings(), dtRule, startYear, AnnualTimeZoneRule::MAX_YEAR); // Check for Null pointer if (dstRule == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; deleteTransitionRules(); return; } // Calculate the 
first DST start time dstRule->getFirstStart(getRawOffset(), 0, firstDstStart); // Create a TimeZoneRule for standard time timeRuleType = (endTimeMode == STANDARD_TIME) ? DateTimeRule::STANDARD_TIME : ((endTimeMode == UTC_TIME) ? DateTimeRule::UTC_TIME : DateTimeRule::WALL_TIME); switch (endMode) { case DOM_MODE: dtRule = new DateTimeRule(endMonth, endDay, endTime, timeRuleType); break; case DOW_IN_MONTH_MODE: dtRule = new DateTimeRule(endMonth, endDay, endDayOfWeek, endTime, timeRuleType); break; case DOW_GE_DOM_MODE: dtRule = new DateTimeRule(endMonth, endDay, endDayOfWeek, true, endTime, timeRuleType); break; case DOW_LE_DOM_MODE: dtRule = new DateTimeRule(endMonth, endDay, endDayOfWeek, false, endTime, timeRuleType); break; } // Check for Null pointer if (dtRule == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; deleteTransitionRules(); return; } // For now, use ID + "(STD)" as the name stdRule = new AnnualTimeZoneRule(tzid+UnicodeString(STD_STR), getRawOffset(), 0, dtRule, startYear, AnnualTimeZoneRule::MAX_YEAR); //Check for Null pointer if (stdRule == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; deleteTransitionRules(); return; } // Calculate the first STD start time stdRule->getFirstStart(getRawOffset(), dstRule->getDSTSavings(), firstStdStart); // Create a TimeZoneRule for initial time if (firstStdStart < firstDstStart) { initialRule = new InitialTimeZoneRule(tzid+UnicodeString(DST_STR), getRawOffset(), dstRule->getDSTSavings()); if (initialRule == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; deleteTransitionRules(); return; } firstTransition = new TimeZoneTransition(firstStdStart, *initialRule, *stdRule); } else { initialRule = new InitialTimeZoneRule(tzid+UnicodeString(STD_STR), getRawOffset(), 0); if (initialRule == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; deleteTransitionRules(); return; } firstTransition = new TimeZoneTransition(firstDstStart, *initialRule, *dstRule); } if (firstTransition == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; 
deleteTransitionRules(); return; } } else { // Create a TimeZoneRule for initial time initialRule = new InitialTimeZoneRule(tzid, getRawOffset(), 0); // Check for null pointer. if (initialRule == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; deleteTransitionRules(); return; } } transitionRulesInitialized = true; } int32_t SimpleTimeZone::countTransitionRules(UErrorCode& /*status*/) const { return (useDaylight) ? 2 : 0; } void SimpleTimeZone::getTimeZoneRules(const InitialTimeZoneRule*& initial, const TimeZoneRule* trsrules[], int32_t& trscount, UErrorCode& status) const { if (U_FAILURE(status)) { return; } checkTransitionRules(status); if (U_FAILURE(status)) { return; } initial = initialRule; int32_t cnt = 0; if (stdRule != nullptr) { if (cnt < trscount) { trsrules[cnt++] = stdRule; } if (cnt < trscount) { trsrules[cnt++] = dstRule; } } trscount = cnt; } U_NAMESPACE_END #endif /* #if !UCONFIG_NO_FORMATTING */ //eof stringi/src/icu74/i18n/zonemeta.h0000644000176200001440000001270114700200761016223 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ******************************************************************************* * Copyright (C) 2007-2013, International Business Machines Corporation and * * others. All Rights Reserved. * ******************************************************************************* */ #ifndef ZONEMETA_H #define ZONEMETA_H #include "unicode/utypes.h" #if !UCONFIG_NO_FORMATTING #include "unicode/unistr.h" #include "hash.h" U_NAMESPACE_BEGIN struct OlsonToMetaMappingEntry : public UMemory { const char16_t *mzid; // const because it's a reference to a resource bundle string. UDate from; UDate to; }; class UVector; class TimeZone; class U_I18N_API ZoneMeta { public: /** * Return the canonical id for this tzid defined by CLDR, which might be the id itself. * If the given system tzid is not known, U_ILLEGAL_ARGUMENT_ERROR is set in the status. 
* * Note: this internal API supports all known system IDs and "Etc/Unknown" (which is * NOT a system ID). */ static UnicodeString& U_EXPORT2 getCanonicalCLDRID(const UnicodeString &tzid, UnicodeString &systemID, UErrorCode& status); /** * Return the canonical id for this tzid defined by CLDR, which might be the id itself. * This overload method returns a persistent const char16_t*, which is guaranteed to persist * (a pointer to a resource). If the given system tzid is not known, U_ILLEGAL_ARGUMENT_ERROR * is set in the status. * @param tzid Zone ID * @param status Receives the status * @return The canonical ID for the input time zone ID */ static const char16_t* U_EXPORT2 getCanonicalCLDRID(const UnicodeString &tzid, UErrorCode& status); /* * Convenient method returning CLDR canonical ID for the given time zone */ static const char16_t* U_EXPORT2 getCanonicalCLDRID(const TimeZone& tz); /** * Returns primary IANA zone ID for the input zone ID, which might be the id itself. * If the given system tzid is not known, U_ILLEGAL_ARGUMENT_ERROR is set in the status. * * @param tzid Zone ID * @param ianaID Output IANA ID * @param status Receives the status * @return A primary IANA zone ID equivalent to the input zone ID. */ static UnicodeString& U_EXPORT2 getIanaID(const UnicodeString& tzid, UnicodeString& ianaID, UErrorCode& status); /** * Return the canonical country code for this tzid. If we have none, or if the time zone * is not associated with a country, return bogus string. * @param tzid Zone ID * @param country [output] Country code * @param isPrimary [output] true if the zone is the primary zone for the country * @return A reference to the result country */ static UnicodeString& U_EXPORT2 getCanonicalCountry(const UnicodeString &tzid, UnicodeString &country, UBool *isPrimary = nullptr); /** * Returns a CLDR metazone ID for the given Olson tzid and time. 
*/ static UnicodeString& U_EXPORT2 getMetazoneID(const UnicodeString &tzid, UDate date, UnicodeString &result); /** * Returns an Olson ID for the ginve metazone and region */ static UnicodeString& U_EXPORT2 getZoneIdByMetazone(const UnicodeString &mzid, const UnicodeString ®ion, UnicodeString &result); static const UVector* U_EXPORT2 getMetazoneMappings(const UnicodeString &tzid); static const UVector* U_EXPORT2 getAvailableMetazoneIDs(); /** * Returns the pointer to the persistent time zone ID string, or nullptr if the given tzid is not in the * tz database. This method is useful when you maintain persistent zone IDs without duplication. */ static const char16_t* U_EXPORT2 findTimeZoneID(const UnicodeString& tzid); /** * Returns the pointer to the persistent meta zone ID string, or nullptr if the given mzid is not available. * This method is useful when you maintain persistent meta zone IDs without duplication. */ static const char16_t* U_EXPORT2 findMetaZoneID(const UnicodeString& mzid); /** * Creates a custom zone for the offset * @param offset GMT offset in milliseconds * @return A custom TimeZone for the offset with normalized time zone id */ static TimeZone* createCustomTimeZone(int32_t offset); /** * Returns the time zone's short ID (null terminated) for the zone. * For example, "uslax" for zone "America/Los_Angeles". * @param tz the time zone * @return the short ID of the time zone, or null if the short ID is not available. */ static const char16_t* U_EXPORT2 getShortID(const TimeZone& tz); /** * Returns the time zone's short ID (null terminated) for the zone ID. * For example, "uslax" for zone ID "America/Los_Angeles". * @param tz the time zone ID * @return the short ID of the time zone ID, or null if the short ID is not available. */ static const char16_t* U_EXPORT2 getShortID(const UnicodeString& id); private: ZoneMeta() = delete; // Prevent construction. 
static UVector* createMetazoneMappings(const UnicodeString &tzid); static UnicodeString& formatCustomID(uint8_t hour, uint8_t min, uint8_t sec, UBool negative, UnicodeString& id); static const char16_t* getShortIDFromCanonical(const char16_t* canonicalID); }; U_NAMESPACE_END #endif /* #if !UCONFIG_NO_FORMATTING */ #endif // ZONEMETA_H stringi/src/icu74/i18n/numrange_impl.cpp0000644000176200001440000004363114700200761017577 0ustar liggesusers// © 2018 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html #include "unicode/utypes.h" #if !UCONFIG_NO_FORMATTING // Allow implicit conversion from char16_t* to UnicodeString for this file: // Helpful in toString methods and elsewhere. #define UNISTR_FROM_STRING_EXPLICIT #include "unicode/numberrangeformatter.h" #include "numrange_impl.h" #include "patternprops.h" #include "pluralranges.h" #include "uresimp.h" #include "util.h" using namespace icu; using namespace icu::number; using namespace icu::number::impl; namespace { // Helper function for 2-dimensional switch statement constexpr int8_t identity2d(UNumberRangeIdentityFallback a, UNumberRangeIdentityResult b) { return static_cast(a) | (static_cast(b) << 4); } struct NumberRangeData { SimpleFormatter rangePattern; // Note: approximatelyPattern is unused since ICU 69. // SimpleFormatter approximatelyPattern; }; class NumberRangeDataSink : public ResourceSink { public: NumberRangeDataSink(NumberRangeData& data) : fData(data) {} void put(const char* key, ResourceValue& value, UBool /*noFallback*/, UErrorCode& status) override { ResourceTable miscTable = value.getTable(status); if (U_FAILURE(status)) { return; } for (int i = 0; miscTable.getKeyAndValue(i, key, value); i++) { if (uprv_strcmp(key, "range") == 0) { if (hasRangeData()) { continue; // have already seen this pattern } fData.rangePattern = {value.getUnicodeString(status), status}; } /* // Note: approximatelyPattern is unused since ICU 69. 
else if (uprv_strcmp(key, "approximately") == 0) { if (hasApproxData()) { continue; // have already seen this pattern } fData.approximatelyPattern = {value.getUnicodeString(status), status}; } */ } } bool hasRangeData() { return fData.rangePattern.getArgumentLimit() != 0; } /* // Note: approximatelyPattern is unused since ICU 69. bool hasApproxData() { return fData.approximatelyPattern.getArgumentLimit() != 0; } */ bool isComplete() { return hasRangeData() /* && hasApproxData() */; } void fillInDefaults(UErrorCode& status) { if (!hasRangeData()) { fData.rangePattern = {u"{0}–{1}", status}; } /* if (!hasApproxData()) { fData.approximatelyPattern = {u"~{0}", status}; } */ } private: NumberRangeData& fData; }; void getNumberRangeData(const char* localeName, const char* nsName, NumberRangeData& data, UErrorCode& status) { if (U_FAILURE(status)) { return; } LocalUResourceBundlePointer rb(ures_open(nullptr, localeName, &status)); if (U_FAILURE(status)) { return; } NumberRangeDataSink sink(data); CharString dataPath; dataPath.append("NumberElements/", -1, status); dataPath.append(nsName, -1, status); dataPath.append("/miscPatterns", -1, status); if (U_FAILURE(status)) { return; } UErrorCode localStatus = U_ZERO_ERROR; ures_getAllItemsWithFallback(rb.getAlias(), dataPath.data(), sink, localStatus); if (U_FAILURE(localStatus) && localStatus != U_MISSING_RESOURCE_ERROR) { status = localStatus; return; } // Fall back to latn if necessary if (!sink.isComplete()) { ures_getAllItemsWithFallback(rb.getAlias(), "NumberElements/latn/miscPatterns", sink, status); } sink.fillInDefaults(status); } } // namespace NumberRangeFormatterImpl::NumberRangeFormatterImpl(const RangeMacroProps& macros, UErrorCode& status) : formatterImpl1(macros.formatter1.fMacros, status), formatterImpl2(macros.formatter2.fMacros, status), fSameFormatters(macros.singleFormatter), fCollapse(macros.collapse), fIdentityFallback(macros.identityFallback), fApproximatelyFormatter(status) { const char* nsName = 
formatterImpl1.getRawMicroProps().nsName; if (!fSameFormatters && uprv_strcmp(nsName, formatterImpl2.getRawMicroProps().nsName) != 0) { status = U_ILLEGAL_ARGUMENT_ERROR; return; } NumberRangeData data; getNumberRangeData(macros.locale.getName(), nsName, data, status); if (U_FAILURE(status)) { return; } fRangeFormatter = data.rangePattern; if (fSameFormatters && ( fIdentityFallback == UNUM_IDENTITY_FALLBACK_APPROXIMATELY || fIdentityFallback == UNUM_IDENTITY_FALLBACK_APPROXIMATELY_OR_SINGLE_VALUE)) { MacroProps approximatelyMacros(macros.formatter1.fMacros); approximatelyMacros.approximately = true; // Use in-place construction because NumberFormatterImpl has internal self-pointers fApproximatelyFormatter.~NumberFormatterImpl(); new (&fApproximatelyFormatter) NumberFormatterImpl(approximatelyMacros, status); } // TODO: Get locale from PluralRules instead? fPluralRanges = StandardPluralRanges::forLocale(macros.locale, status); if (U_FAILURE(status)) { return; } } void NumberRangeFormatterImpl::format(UFormattedNumberRangeData& data, bool equalBeforeRounding, UErrorCode& status) const { if (U_FAILURE(status)) { return; } MicroProps micros1; MicroProps micros2; formatterImpl1.preProcess(data.quantity1, micros1, status); if (fSameFormatters) { formatterImpl1.preProcess(data.quantity2, micros2, status); } else { formatterImpl2.preProcess(data.quantity2, micros2, status); } if (U_FAILURE(status)) { return; } // If any of the affixes are different, an identity is not possible // and we must use formatRange(). // TODO: Write this as MicroProps operator==() ? // TODO: Avoid the redundancy of these equality operations with the // ones in formatRange? 
if (!micros1.modInner->semanticallyEquivalent(*micros2.modInner) || !micros1.modMiddle->semanticallyEquivalent(*micros2.modMiddle) || !micros1.modOuter->semanticallyEquivalent(*micros2.modOuter)) { formatRange(data, micros1, micros2, status); data.identityResult = UNUM_IDENTITY_RESULT_NOT_EQUAL; return; } // Check for identity if (equalBeforeRounding) { data.identityResult = UNUM_IDENTITY_RESULT_EQUAL_BEFORE_ROUNDING; } else if (data.quantity1 == data.quantity2) { data.identityResult = UNUM_IDENTITY_RESULT_EQUAL_AFTER_ROUNDING; } else { data.identityResult = UNUM_IDENTITY_RESULT_NOT_EQUAL; } switch (identity2d(fIdentityFallback, data.identityResult)) { case identity2d(UNUM_IDENTITY_FALLBACK_RANGE, UNUM_IDENTITY_RESULT_NOT_EQUAL): case identity2d(UNUM_IDENTITY_FALLBACK_RANGE, UNUM_IDENTITY_RESULT_EQUAL_AFTER_ROUNDING): case identity2d(UNUM_IDENTITY_FALLBACK_RANGE, UNUM_IDENTITY_RESULT_EQUAL_BEFORE_ROUNDING): case identity2d(UNUM_IDENTITY_FALLBACK_APPROXIMATELY, UNUM_IDENTITY_RESULT_NOT_EQUAL): case identity2d(UNUM_IDENTITY_FALLBACK_APPROXIMATELY_OR_SINGLE_VALUE, UNUM_IDENTITY_RESULT_NOT_EQUAL): case identity2d(UNUM_IDENTITY_FALLBACK_SINGLE_VALUE, UNUM_IDENTITY_RESULT_NOT_EQUAL): formatRange(data, micros1, micros2, status); break; case identity2d(UNUM_IDENTITY_FALLBACK_APPROXIMATELY, UNUM_IDENTITY_RESULT_EQUAL_AFTER_ROUNDING): case identity2d(UNUM_IDENTITY_FALLBACK_APPROXIMATELY, UNUM_IDENTITY_RESULT_EQUAL_BEFORE_ROUNDING): case identity2d(UNUM_IDENTITY_FALLBACK_APPROXIMATELY_OR_SINGLE_VALUE, UNUM_IDENTITY_RESULT_EQUAL_AFTER_ROUNDING): formatApproximately(data, micros1, micros2, status); break; case identity2d(UNUM_IDENTITY_FALLBACK_APPROXIMATELY_OR_SINGLE_VALUE, UNUM_IDENTITY_RESULT_EQUAL_BEFORE_ROUNDING): case identity2d(UNUM_IDENTITY_FALLBACK_SINGLE_VALUE, UNUM_IDENTITY_RESULT_EQUAL_AFTER_ROUNDING): case identity2d(UNUM_IDENTITY_FALLBACK_SINGLE_VALUE, UNUM_IDENTITY_RESULT_EQUAL_BEFORE_ROUNDING): formatSingleValue(data, micros1, micros2, status); break; default: 
UPRV_UNREACHABLE_EXIT; } }

// Formats a range whose two ends collapsed to a single value.
// When both ends share one formatter, only the first quantity is written and
// wrapped in its affixes; otherwise this falls back to formatRange().
void NumberRangeFormatterImpl::formatSingleValue(UFormattedNumberRangeData& data,
                                                 MicroProps& micros1, MicroProps& micros2,
                                                 UErrorCode& status) const {
    if (U_FAILURE(status)) { return; }
    if (fSameFormatters) {
        int32_t length = NumberFormatterImpl::writeNumber(micros1.simple, data.quantity1, data.getStringRef(), 0, status);
        NumberFormatterImpl::writeAffixes(micros1, data.getStringRef(), 0, length, status);
    } else {
        formatRange(data, micros1, micros2, status);
    }
}

// Formats a range as an "approximately" value.
// When both ends share one formatter, the first quantity is re-processed with
// fApproximatelyFormatter and its inner, middle and outer modifiers are applied
// in that order; otherwise this falls back to formatRange().
void NumberRangeFormatterImpl::formatApproximately (UFormattedNumberRangeData& data,
                                                    MicroProps& micros1, MicroProps& micros2,
                                                    UErrorCode& status) const {
    if (U_FAILURE(status)) { return; }
    if (fSameFormatters) {
        // Re-format using the approximately formatter:
        MicroProps microsAppx;
        data.quantity1.resetExponent();
        fApproximatelyFormatter.preProcess(data.quantity1, microsAppx, status);
        int32_t length = NumberFormatterImpl::writeNumber(microsAppx.simple, data.quantity1, data.getStringRef(), 0, status);
        // Each apply() returns the number of code units it added, so the span
        // covered by the next (more outer) modifier grows accordingly.
        length += microsAppx.modInner->apply(data.getStringRef(), 0, length, status);
        length += microsAppx.modMiddle->apply(data.getStringRef(), 0, length, status);
        microsAppx.modOuter->apply(data.getStringRef(), 0, length, status);
    } else {
        formatRange(data, micros1, micros2, status);
    }
}

// Formats both ends of the range into data's string.
// Steps: (1) decide which modifier layers are shared ("collapsed") between the
// two numbers, (2) lay down the range pattern, (3) optionally pad the infix
// with spaces, (4) write both numbers, (5) apply each modifier layer either
// once around the whole range or separately around each number, and
// (6) record the span of each number.
void NumberRangeFormatterImpl::formatRange(UFormattedNumberRangeData& data,
                                           MicroProps& micros1, MicroProps& micros2,
                                           UErrorCode& status) const {
    if (U_FAILURE(status)) { return; }

    // modInner is always notation (scientific); collapsable in ALL.
    // modOuter is always units; collapsable in ALL, AUTO, and UNIT.
    // modMiddle could be either; collapsable in ALL and sometimes AUTO and UNIT.
    // Never collapse an outer mod but not an inner mod.
    bool collapseOuter, collapseMiddle, collapseInner;
    switch (fCollapse) {
        case UNUM_RANGE_COLLAPSE_ALL:
        case UNUM_RANGE_COLLAPSE_AUTO:
        case UNUM_RANGE_COLLAPSE_UNIT:
        {
            // OUTER MODIFIER
            collapseOuter = micros1.modOuter->semanticallyEquivalent(*micros2.modOuter);

            if (!collapseOuter) {
                // Never collapse inner mods if outer mods are not collapsable
                collapseMiddle = false;
                collapseInner = false;
                break;
            }

            // MIDDLE MODIFIER
            collapseMiddle = micros1.modMiddle->semanticallyEquivalent(*micros2.modMiddle);

            if (!collapseMiddle) {
                // Never collapse inner mods if middle mods are not collapsable
                collapseInner = false;
                break;
            }

            // MIDDLE MODIFIER HEURISTICS
            // (could disable collapsing of the middle modifier)
            // The modifiers are equal by this point, so we can look at just one of them.
            const Modifier* mm = micros1.modMiddle;
            if (fCollapse == UNUM_RANGE_COLLAPSE_UNIT) {
                // Only collapse if the modifier is a unit.
                // TODO: Make a better way to check for a unit?
                // TODO: Handle case where the modifier has both notation and unit (compact currency)?
                if (!mm->containsField({UFIELD_CATEGORY_NUMBER, UNUM_CURRENCY_FIELD})
                        && !mm->containsField({UFIELD_CATEGORY_NUMBER, UNUM_PERCENT_FIELD})) {
                    collapseMiddle = false;
                }
            } else if (fCollapse == UNUM_RANGE_COLLAPSE_AUTO) {
                // Heuristic as of ICU 63: collapse only if the modifier is more than one code point.
                if (mm->getCodePointCount() <= 1) {
                    collapseMiddle = false;
                }
            }

            if (!collapseMiddle || fCollapse != UNUM_RANGE_COLLAPSE_ALL) {
                collapseInner = false;
                break;
            }

            // INNER MODIFIER
            collapseInner = micros1.modInner->semanticallyEquivalent(*micros2.modInner);

            // All done checking for collapsibility.
            break;
        }

        default:
            collapseOuter = false;
            collapseMiddle = false;
            collapseInner = false;
            break;
    }

    FormattedStringBuilder& string = data.getStringRef();
    int32_t lengthPrefix = 0;
    int32_t length1 = 0;
    int32_t lengthInfix = 0;
    int32_t length2 = 0;
    int32_t lengthSuffix = 0;

    // Use #define so that these are evaluated at the call site.
    // The five length variables change as pieces are inserted, so each use of
    // an index macro must see their current values.
    #define UPRV_INDEX_0 (lengthPrefix)
    #define UPRV_INDEX_1 (lengthPrefix + length1)
    #define UPRV_INDEX_2 (lengthPrefix + length1 + lengthInfix)
    #define UPRV_INDEX_3 (lengthPrefix + length1 + lengthInfix + length2)
    #define UPRV_INDEX_4 (lengthPrefix + length1 + lengthInfix + length2 + lengthSuffix)

    int32_t lengthRange = SimpleModifier::formatTwoArgPattern(
        fRangeFormatter, string, 0, &lengthPrefix, &lengthSuffix, kUndefinedField, status);
    if (U_FAILURE(status)) { return; }
    lengthInfix = lengthRange - lengthPrefix - lengthSuffix;
    U_ASSERT(lengthInfix > 0);

    // SPACING HEURISTIC
    // Add spacing unless all modifiers are collapsed.
    // TODO: add API to control this?
    // TODO: Use a data-driven heuristic like currency spacing?
    // TODO: Use Unicode [:whitespace:] instead of PatternProps whitespace? (consider speed implications)
    {
        bool repeatInner = !collapseInner && micros1.modInner->getCodePointCount() > 0;
        bool repeatMiddle = !collapseMiddle && micros1.modMiddle->getCodePointCount() > 0;
        bool repeatOuter = !collapseOuter && micros1.modOuter->getCodePointCount() > 0;
        if (repeatInner || repeatMiddle || repeatOuter) {
            // Add spacing if there is not already spacing
            if (!PatternProps::isWhiteSpace(string.charAt(UPRV_INDEX_1))) {
                lengthInfix += string.insertCodePoint(UPRV_INDEX_1, u'\u0020', kUndefinedField, status);
            }
            if (!PatternProps::isWhiteSpace(string.charAt(UPRV_INDEX_2 - 1))) {
                lengthInfix += string.insertCodePoint(UPRV_INDEX_2, u'\u0020', kUndefinedField, status);
            }
        }
    }

    length1 += NumberFormatterImpl::writeNumber(micros1.simple, data.quantity1, string, UPRV_INDEX_0, status);
    // ICU-21684: Write the second number to a temp string to avoid repeated insert operations
    FormattedStringBuilder tempString;
    NumberFormatterImpl::writeNumber(micros2.simple, data.quantity2, tempString, 0, status);
    length2 += string.insert(UPRV_INDEX_2, tempString, status);

    // TODO: Support padding?

    // For each layer: a collapsed modifier wraps the whole range once, and its
    // added length is split between lengthPrefix and lengthSuffix; otherwise
    // each number gets its own copy and length1/length2 grow instead.
    if (collapseInner) {
        const Modifier& mod = resolveModifierPlurals(*micros1.modInner, *micros2.modInner);
        lengthSuffix += mod.apply(string, UPRV_INDEX_0, UPRV_INDEX_4, status);
        lengthPrefix += mod.getPrefixLength();
        lengthSuffix -= mod.getPrefixLength();
    } else {
        length1 += micros1.modInner->apply(string, UPRV_INDEX_0, UPRV_INDEX_1, status);
        length2 += micros2.modInner->apply(string, UPRV_INDEX_2, UPRV_INDEX_4, status);
    }

    if (collapseMiddle) {
        const Modifier& mod = resolveModifierPlurals(*micros1.modMiddle, *micros2.modMiddle);
        lengthSuffix += mod.apply(string, UPRV_INDEX_0, UPRV_INDEX_4, status);
        lengthPrefix += mod.getPrefixLength();
        lengthSuffix -= mod.getPrefixLength();
    } else {
        length1 += micros1.modMiddle->apply(string, UPRV_INDEX_0, UPRV_INDEX_1, status);
        length2 += micros2.modMiddle->apply(string, UPRV_INDEX_2, UPRV_INDEX_4, status);
    }

    if (collapseOuter) {
        const Modifier& mod = resolveModifierPlurals(*micros1.modOuter, *micros2.modOuter);
        lengthSuffix += mod.apply(string, UPRV_INDEX_0, UPRV_INDEX_4, status);
        lengthPrefix += mod.getPrefixLength();
        lengthSuffix -= mod.getPrefixLength();
    } else {
        length1 += micros1.modOuter->apply(string, UPRV_INDEX_0, UPRV_INDEX_1, status);
        length2 += micros2.modOuter->apply(string, UPRV_INDEX_2, UPRV_INDEX_4, status);
    }

    // Now that all pieces are added, save the span info.
    data.appendSpanInfo(UFIELD_CATEGORY_NUMBER_RANGE_SPAN, 0, UPRV_INDEX_0, length1, status);
    data.appendSpanInfo(UFIELD_CATEGORY_NUMBER_RANGE_SPAN, 1, UPRV_INDEX_2, length2, status);
}

// Picks the modifier to use when one modifier wraps both ends of a range.
// If either modifier reports no owning object (parameters.obj == nullptr),
// `first` is returned unchanged. Otherwise the plural forms of both are
// combined through fPluralRanges and the modifier for the resulting form is
// looked up on the second modifier's owner, using the second's signum.
const Modifier&
NumberRangeFormatterImpl::resolveModifierPlurals(const Modifier& first, const Modifier& second) const {
    Modifier::Parameters parameters;
    first.getParameters(parameters);
    if (parameters.obj == nullptr) {
        // No plural form; return a fallback (e.g., the first)
        return first;
    }
    StandardPlural::Form firstPlural = parameters.plural;

    // `parameters` is reused here; from now on it describes `second`.
    second.getParameters(parameters);
    if (parameters.obj == nullptr) {
        // No plural form; return a fallback (e.g., the first)
        return first;
    }
    StandardPlural::Form secondPlural = parameters.plural;

    // Get the required plural form from data
    StandardPlural::Form resultPlural = fPluralRanges.resolve(firstPlural, secondPlural);

    // Get and return the new Modifier
    const Modifier* mod = parameters.obj->getModifier(parameters.signum, resultPlural);
    U_ASSERT(mod != nullptr);
    return *mod;
}

#endif /* #if !UCONFIG_NO_FORMATTING */
stringi/src/icu74/i18n/wintzimpl.cpp0000644000176200001440000001336714700200761017002 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ******************************************************************************** * Copyright (C) 2009-2013, International Business Machines * Corporation and others. All Rights Reserved.
******************************************************************************** * * File WINTZIMPL.CPP * ******************************************************************************** */ #include "unicode/utypes.h" #if U_PLATFORM_USES_ONLY_WIN32_API && !UCONFIG_NO_FORMATTING #include "wintzimpl.h" #include "unicode/unistr.h" #include "unicode/timezone.h" #include "unicode/basictz.h" #include "putilimp.h" #include "uassert.h" #include "cmemory.h" #ifndef WIN32_LEAN_AND_MEAN # define WIN32_LEAN_AND_MEAN #endif # define VC_EXTRALEAN # define NOUSER # define NOSERVICE # define NOIME # define NOMCX #include U_NAMESPACE_USE static UBool getSystemTimeInformation(TimeZone *tz, SYSTEMTIME &daylightDate, SYSTEMTIME &standardDate, int32_t &bias, int32_t &daylightBias, int32_t &standardBias) { UErrorCode status = U_ZERO_ERROR; UBool result = true; BasicTimeZone *btz = (BasicTimeZone*)tz; // we should check type InitialTimeZoneRule *initial = nullptr; AnnualTimeZoneRule *std = nullptr, *dst = nullptr; btz->getSimpleRulesNear(uprv_getUTCtime(), initial, std, dst, status); if (U_SUCCESS(status)) { if (std == nullptr || dst == nullptr) { bias = -1 * (initial->getRawOffset()/60000); standardBias = 0; daylightBias = 0; // Do not use DST. 
Set 0 to all stadardDate/daylightDate fields standardDate.wYear = standardDate.wMonth = standardDate.wDayOfWeek = standardDate.wDay = standardDate.wHour = standardDate.wMinute = standardDate.wSecond = standardDate.wMilliseconds = 0; daylightDate.wYear = daylightDate.wMonth = daylightDate.wDayOfWeek = daylightDate.wDay = daylightDate.wHour = daylightDate.wMinute = daylightDate.wSecond = daylightDate.wMilliseconds = 0; } else { U_ASSERT(std->getRule()->getDateRuleType() == DateTimeRule::DOW); U_ASSERT(dst->getRule()->getDateRuleType() == DateTimeRule::DOW); bias = -1 * (std->getRawOffset()/60000); standardBias = 0; daylightBias = -1 * (dst->getDSTSavings()/60000); // Always use DOW type rule int32_t hour, min, sec, mil; standardDate.wYear = 0; standardDate.wMonth = static_cast(std->getRule()->getRuleMonth()) + 1; standardDate.wDay = static_cast(std->getRule()->getRuleWeekInMonth()); if (standardDate.wDay < 0) { standardDate.wDay = 5; } standardDate.wDayOfWeek = static_cast(std->getRule()->getRuleDayOfWeek()) - 1; mil = std->getRule()->getRuleMillisInDay(); hour = mil/3600000; mil %= 3600000; min = mil/60000; mil %= 60000; sec = mil/1000; mil %= 1000; standardDate.wHour = static_cast(hour); standardDate.wMinute = static_cast(min); standardDate.wSecond = static_cast(sec); standardDate.wMilliseconds = static_cast(mil); daylightDate.wYear = 0; daylightDate.wMonth = static_cast(dst->getRule()->getRuleMonth()) + 1; daylightDate.wDay = static_cast(dst->getRule()->getRuleWeekInMonth()); if (daylightDate.wDay < 0) { daylightDate.wDay = 5; } daylightDate.wDayOfWeek = static_cast(dst->getRule()->getRuleDayOfWeek()) - 1; mil = dst->getRule()->getRuleMillisInDay(); hour = mil/3600000; mil %= 3600000; min = mil/60000; mil %= 60000; sec = mil/1000; mil %= 1000; daylightDate.wHour = static_cast(hour); daylightDate.wMinute = static_cast(min); daylightDate.wSecond = static_cast(sec); daylightDate.wMilliseconds = static_cast(mil); } } else { result = false; } delete initial; delete 
std; delete dst; return result; } static UBool getWindowsTimeZoneInfo(TIME_ZONE_INFORMATION *zoneInfo, const char16_t *icuid, int32_t length) { UBool result = false; UnicodeString id = UnicodeString(icuid, length); TimeZone *tz = TimeZone::createTimeZone(id); if (tz != nullptr) { int32_t bias; int32_t daylightBias; int32_t standardBias; SYSTEMTIME daylightDate; SYSTEMTIME standardDate; if (getSystemTimeInformation(tz, daylightDate, standardDate, bias, daylightBias, standardBias)) { uprv_memset(zoneInfo, 0, sizeof(TIME_ZONE_INFORMATION)); // We do not set standard/daylight names, so nullify first. zoneInfo->Bias = bias; zoneInfo->DaylightBias = daylightBias; zoneInfo->StandardBias = standardBias; zoneInfo->DaylightDate = daylightDate; zoneInfo->StandardDate = standardDate; result = true; } } return result; } /* * Given the timezone icuid, fill in zoneInfo by calling auxiliary functions that creates a timezone and extract the * information to put into zoneInfo. This includes bias and standard time date and daylight saving date. */ U_CAPI UBool U_EXPORT2 uprv_getWindowsTimeZoneInfo(TIME_ZONE_INFORMATION *zoneInfo, const char16_t *icuid, int32_t length) { if (getWindowsTimeZoneInfo(zoneInfo, icuid, length)) { return true; } else { return false; } } #endif stringi/src/icu74/i18n/rematch.cpp0000644000176200001440000065726414700200761016402 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ************************************************************************** * Copyright (C) 2002-2016 International Business Machines Corporation * and others. All rights reserved. ************************************************************************** */ // // file: rematch.cpp // // Contains the implementation of class RegexMatcher, // which is one of the main API classes for the ICU regular expression package. 
// #include "unicode/utypes.h" #if !UCONFIG_NO_REGULAR_EXPRESSIONS #include "unicode/regex.h" #include "unicode/uniset.h" #include "unicode/uchar.h" #include "unicode/ustring.h" #include "unicode/rbbi.h" #include "unicode/utf.h" #include "unicode/utf16.h" #include "uassert.h" #include "cmemory.h" #include "cstr.h" #include "uvector.h" #include "uvectr32.h" #include "uvectr64.h" #include "regeximp.h" #include "regexst.h" #include "regextxt.h" #include "ucase.h" // #include // Needed for heapcheck testing U_NAMESPACE_BEGIN // Default limit for the size of the back track stack, to avoid system // failures causedby heap exhaustion. Units are in 32 bit words, not bytes. // This value puts ICU's limits higher than most other regexp implementations, // which use recursion rather than the heap, and take more storage per // backtrack point. // static const int32_t DEFAULT_BACKTRACK_STACK_CAPACITY = 8000000; // Time limit counter constant. // Time limits for expression evaluation are in terms of quanta of work by // the engine, each of which is 10,000 state saves. // This constant determines that state saves per tick number. static const int32_t TIMER_INITIAL_VALUE = 10000; // Test for any of the Unicode line terminating characters. 
static inline UBool isLineTerminator(UChar32 c) { if (c & ~(0x0a | 0x0b | 0x0c | 0x0d | 0x85 | 0x2028 | 0x2029)) { return false; } return (c<=0x0d && c>=0x0a) || c==0x85 || c==0x2028 || c==0x2029; } //----------------------------------------------------------------------------- // // Constructor and Destructor // //----------------------------------------------------------------------------- RegexMatcher::RegexMatcher(const RegexPattern *pat) { fDeferredStatus = U_ZERO_ERROR; init(fDeferredStatus); if (U_FAILURE(fDeferredStatus)) { return; } if (pat==nullptr) { fDeferredStatus = U_ILLEGAL_ARGUMENT_ERROR; return; } fPattern = pat; init2(RegexStaticSets::gStaticSets->fEmptyText, fDeferredStatus); } RegexMatcher::RegexMatcher(const UnicodeString ®exp, const UnicodeString &input, uint32_t flags, UErrorCode &status) { init(status); if (U_FAILURE(status)) { return; } UParseError pe; fPatternOwned = RegexPattern::compile(regexp, flags, pe, status); fPattern = fPatternOwned; UText inputText = UTEXT_INITIALIZER; utext_openConstUnicodeString(&inputText, &input, &status); init2(&inputText, status); utext_close(&inputText); fInputUniStrMaybeMutable = true; } RegexMatcher::RegexMatcher(UText *regexp, UText *input, uint32_t flags, UErrorCode &status) { init(status); if (U_FAILURE(status)) { return; } UParseError pe; fPatternOwned = RegexPattern::compile(regexp, flags, pe, status); if (U_FAILURE(status)) { return; } fPattern = fPatternOwned; init2(input, status); } RegexMatcher::RegexMatcher(const UnicodeString ®exp, uint32_t flags, UErrorCode &status) { init(status); if (U_FAILURE(status)) { return; } UParseError pe; fPatternOwned = RegexPattern::compile(regexp, flags, pe, status); if (U_FAILURE(status)) { return; } fPattern = fPatternOwned; init2(RegexStaticSets::gStaticSets->fEmptyText, status); } RegexMatcher::RegexMatcher(UText *regexp, uint32_t flags, UErrorCode &status) { init(status); if (U_FAILURE(status)) { return; } UParseError pe; fPatternOwned = 
RegexPattern::compile(regexp, flags, pe, status); if (U_FAILURE(status)) { return; } fPattern = fPatternOwned; init2(RegexStaticSets::gStaticSets->fEmptyText, status); } RegexMatcher::~RegexMatcher() { delete fStack; if (fData != fSmallData) { uprv_free(fData); fData = nullptr; } if (fPatternOwned) { delete fPatternOwned; fPatternOwned = nullptr; fPattern = nullptr; } if (fInput) { delete fInput; } if (fInputText) { utext_close(fInputText); } if (fAltInputText) { utext_close(fAltInputText); } #if UCONFIG_NO_BREAK_ITERATION==0 delete fWordBreakItr; delete fGCBreakItr; #endif } // // init() common initialization for use by all constructors. // Initialize all fields, get the object into a consistent state. // This must be done even when the initial status shows an error, // so that the object is initialized sufficiently well for the destructor // to run safely. // void RegexMatcher::init(UErrorCode &status) { fPattern = nullptr; fPatternOwned = nullptr; fFrameSize = 0; fRegionStart = 0; fRegionLimit = 0; fAnchorStart = 0; fAnchorLimit = 0; fLookStart = 0; fLookLimit = 0; fActiveStart = 0; fActiveLimit = 0; fTransparentBounds = false; fAnchoringBounds = true; fMatch = false; fMatchStart = 0; fMatchEnd = 0; fLastMatchEnd = -1; fAppendPosition = 0; fHitEnd = false; fRequireEnd = false; fStack = nullptr; fFrame = nullptr; fTimeLimit = 0; fTime = 0; fTickCounter = 0; fStackLimit = DEFAULT_BACKTRACK_STACK_CAPACITY; fCallbackFn = nullptr; fCallbackContext = nullptr; fFindProgressCallbackFn = nullptr; fFindProgressCallbackContext = nullptr; fTraceDebug = false; fDeferredStatus = status; fData = fSmallData; fWordBreakItr = nullptr; fGCBreakItr = nullptr; fStack = nullptr; fInputText = nullptr; fAltInputText = nullptr; fInput = nullptr; fInputLength = 0; fInputUniStrMaybeMutable = false; } // // init2() Common initialization for use by RegexMatcher constructors, part 2. // This handles the common setup to be done after the Pattern is available. 
// void RegexMatcher::init2(UText *input, UErrorCode &status) { if (U_FAILURE(status)) { fDeferredStatus = status; return; } if (fPattern->fDataSize > UPRV_LENGTHOF(fSmallData)) { fData = (int64_t *)uprv_malloc(fPattern->fDataSize * sizeof(int64_t)); if (fData == nullptr) { status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR; return; } } fStack = new UVector64(status); if (fStack == nullptr) { status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR; return; } reset(input); setStackLimit(DEFAULT_BACKTRACK_STACK_CAPACITY, status); if (U_FAILURE(status)) { fDeferredStatus = status; return; } } static const char16_t BACKSLASH = 0x5c; static const char16_t DOLLARSIGN = 0x24; static const char16_t LEFTBRACKET = 0x7b; static const char16_t RIGHTBRACKET = 0x7d; //-------------------------------------------------------------------------------- // // appendReplacement // //-------------------------------------------------------------------------------- RegexMatcher &RegexMatcher::appendReplacement(UnicodeString &dest, const UnicodeString &replacement, UErrorCode &status) { UText replacementText = UTEXT_INITIALIZER; utext_openConstUnicodeString(&replacementText, &replacement, &status); if (U_SUCCESS(status)) { UText resultText = UTEXT_INITIALIZER; utext_openUnicodeString(&resultText, &dest, &status); if (U_SUCCESS(status)) { appendReplacement(&resultText, &replacementText, status); utext_close(&resultText); } utext_close(&replacementText); } return *this; } // // appendReplacement, UText mode // RegexMatcher &RegexMatcher::appendReplacement(UText *dest, UText *replacement, UErrorCode &status) { if (U_FAILURE(status)) { return *this; } if (U_FAILURE(fDeferredStatus)) { status = fDeferredStatus; return *this; } if (fMatch == false) { status = U_REGEX_INVALID_STATE; return *this; } // Copy input string from the end of previous match to start of current match int64_t destLen = utext_nativeLength(dest); if (fMatchStart > fAppendPosition) { if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, 
fInputLength)) { destLen += utext_replace(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition, (int32_t)(fMatchStart-fAppendPosition), &status); } else { int32_t len16; if (UTEXT_USES_U16(fInputText)) { len16 = (int32_t)(fMatchStart-fAppendPosition); } else { UErrorCode lengthStatus = U_ZERO_ERROR; len16 = utext_extract(fInputText, fAppendPosition, fMatchStart, nullptr, 0, &lengthStatus); } char16_t *inputChars = (char16_t *)uprv_malloc(sizeof(char16_t)*(len16+1)); if (inputChars == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; return *this; } utext_extract(fInputText, fAppendPosition, fMatchStart, inputChars, len16+1, &status); destLen += utext_replace(dest, destLen, destLen, inputChars, len16, &status); uprv_free(inputChars); } } fAppendPosition = fMatchEnd; // scan the replacement text, looking for substitutions ($n) and \escapes. // TODO: optimize this loop by efficiently scanning for '$' or '\', // move entire ranges not containing substitutions. UTEXT_SETNATIVEINDEX(replacement, 0); for (UChar32 c = UTEXT_NEXT32(replacement); U_SUCCESS(status) && c != U_SENTINEL; c = UTEXT_NEXT32(replacement)) { if (c == BACKSLASH) { // Backslash Escape. Copy the following char out without further checks. // Note: Surrogate pairs don't need any special handling // The second half wont be a '$' or a '\', and // will move to the dest normally on the next // loop iteration. c = UTEXT_CURRENT32(replacement); if (c == U_SENTINEL) { break; } if (c==0x55/*U*/ || c==0x75/*u*/) { // We have a \udddd or \Udddddddd escape sequence. 
int32_t offset = 0; struct URegexUTextUnescapeCharContext context = U_REGEX_UTEXT_UNESCAPE_CONTEXT(replacement); UChar32 escapedChar = u_unescapeAt(uregex_utext_unescape_charAt, &offset, INT32_MAX, &context); if (escapedChar != (UChar32)0xFFFFFFFF) { if (U_IS_BMP(escapedChar)) { char16_t c16 = (char16_t)escapedChar; destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status); } else { char16_t surrogate[2]; surrogate[0] = U16_LEAD(escapedChar); surrogate[1] = U16_TRAIL(escapedChar); if (U_SUCCESS(status)) { destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status); } } // TODO: Report errors for mal-formed \u escapes? // As this is, the original sequence is output, which may be OK. if (context.lastOffset == offset) { (void)UTEXT_PREVIOUS32(replacement); } else if (context.lastOffset != offset-1) { utext_moveIndex32(replacement, offset - context.lastOffset - 1); } } } else { (void)UTEXT_NEXT32(replacement); // Plain backslash escape. Just put out the escaped character. if (U_IS_BMP(c)) { char16_t c16 = (char16_t)c; destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status); } else { char16_t surrogate[2]; surrogate[0] = U16_LEAD(c); surrogate[1] = U16_TRAIL(c); if (U_SUCCESS(status)) { destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status); } } } } else if (c != DOLLARSIGN) { // Normal char, not a $. Copy it out without further checks. if (U_IS_BMP(c)) { char16_t c16 = (char16_t)c; destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status); } else { char16_t surrogate[2]; surrogate[0] = U16_LEAD(c); surrogate[1] = U16_TRAIL(c); if (U_SUCCESS(status)) { destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status); } } } else { // We've got a $. Pick up a capture group name or number if one follows. // Consume digits so long as the resulting group number <= the number of // number of capture groups in the pattern. 
int32_t groupNum = 0; int32_t numDigits = 0; UChar32 nextChar = utext_current32(replacement); if (nextChar == LEFTBRACKET) { // Scan for a Named Capture Group, ${name}. UnicodeString groupName; utext_next32(replacement); while(U_SUCCESS(status) && nextChar != RIGHTBRACKET) { nextChar = utext_next32(replacement); if (nextChar == U_SENTINEL) { status = U_REGEX_INVALID_CAPTURE_GROUP_NAME; } else if ((nextChar >= 0x41 && nextChar <= 0x5a) || // A..Z (nextChar >= 0x61 && nextChar <= 0x7a) || // a..z (nextChar >= 0x31 && nextChar <= 0x39)) { // 0..9 groupName.append(nextChar); } else if (nextChar == RIGHTBRACKET) { groupNum = fPattern->fNamedCaptureMap ? uhash_geti(fPattern->fNamedCaptureMap, &groupName) : 0; if (groupNum == 0) { status = U_REGEX_INVALID_CAPTURE_GROUP_NAME; } } else { // Character was something other than a name char or a closing '}' status = U_REGEX_INVALID_CAPTURE_GROUP_NAME; } } } else if (u_isdigit(nextChar)) { // $n Scan for a capture group number int32_t numCaptureGroups = fPattern->fGroupMap->size(); for (;;) { nextChar = UTEXT_CURRENT32(replacement); if (nextChar == U_SENTINEL) { break; } if (u_isdigit(nextChar) == false) { break; } int32_t nextDigitVal = u_charDigitValue(nextChar); if (groupNum*10 + nextDigitVal > numCaptureGroups) { // Don't consume the next digit if it makes the capture group number too big. if (numDigits == 0) { status = U_INDEX_OUTOFBOUNDS_ERROR; } break; } (void)UTEXT_NEXT32(replacement); groupNum=groupNum*10 + nextDigitVal; ++numDigits; } } else { // $ not followed by capture group name or number. status = U_REGEX_INVALID_CAPTURE_GROUP_NAME; } if (U_SUCCESS(status)) { destLen += appendGroup(groupNum, dest, status); } } // End of $ capture group handling } // End of per-character loop through the replacement string. 
return *this; } //-------------------------------------------------------------------------------- // // appendTail Intended to be used in conjunction with appendReplacement() // To the destination string, append everything following // the last match position from the input string. // // Note: Match ranges do not affect appendTail or appendReplacement // //-------------------------------------------------------------------------------- UnicodeString &RegexMatcher::appendTail(UnicodeString &dest) { UErrorCode status = U_ZERO_ERROR; UText resultText = UTEXT_INITIALIZER; utext_openUnicodeString(&resultText, &dest, &status); if (U_SUCCESS(status)) { appendTail(&resultText, status); utext_close(&resultText); } return dest; } // // appendTail, UText mode // UText *RegexMatcher::appendTail(UText *dest, UErrorCode &status) { if (U_FAILURE(status)) { return dest; } if (U_FAILURE(fDeferredStatus)) { status = fDeferredStatus; return dest; } if (fInputLength > fAppendPosition) { if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { int64_t destLen = utext_nativeLength(dest); utext_replace(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition, (int32_t)(fInputLength-fAppendPosition), &status); } else { int32_t len16; if (UTEXT_USES_U16(fInputText)) { len16 = (int32_t)(fInputLength-fAppendPosition); } else { len16 = utext_extract(fInputText, fAppendPosition, fInputLength, nullptr, 0, &status); status = U_ZERO_ERROR; // buffer overflow } char16_t *inputChars = (char16_t *)uprv_malloc(sizeof(char16_t)*(len16)); if (inputChars == nullptr) { fDeferredStatus = U_MEMORY_ALLOCATION_ERROR; } else { utext_extract(fInputText, fAppendPosition, fInputLength, inputChars, len16, &status); // unterminated int64_t destLen = utext_nativeLength(dest); utext_replace(dest, destLen, destLen, inputChars, len16, &status); uprv_free(inputChars); } } } return dest; } //-------------------------------------------------------------------------------- // // end // 
//-------------------------------------------------------------------------------- int32_t RegexMatcher::end(UErrorCode &err) const { return end(0, err); } int64_t RegexMatcher::end64(UErrorCode &err) const { return end64(0, err); } int64_t RegexMatcher::end64(int32_t group, UErrorCode &err) const { if (U_FAILURE(err)) { return -1; } if (fMatch == false) { err = U_REGEX_INVALID_STATE; return -1; } if (group < 0 || group > fPattern->fGroupMap->size()) { err = U_INDEX_OUTOFBOUNDS_ERROR; return -1; } int64_t e = -1; if (group == 0) { e = fMatchEnd; } else { // Get the position within the stack frame of the variables for // this capture group. int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1); U_ASSERT(groupOffset < fPattern->fFrameSize); U_ASSERT(groupOffset >= 0); e = fFrame->fExtra[groupOffset + 1]; } return e; } int32_t RegexMatcher::end(int32_t group, UErrorCode &err) const { return (int32_t)end64(group, err); } //-------------------------------------------------------------------------------- // // findProgressInterrupt This function is called once for each advance in the target // string from the find() function, and calls the user progress callback // function if there is one installed. // // Return: true if the find operation is to be terminated. // false if the find operation is to continue running. 
//
//--------------------------------------------------------------------------------
// Invokes the installed find-progress callback, if any, with `pos`. When the
// callback returns false, status is set to U_REGEX_STOPPED_BY_CALLER and true
// is returned so the caller abandons the search.
UBool RegexMatcher::findProgressInterrupt(int64_t pos, UErrorCode &status) {
    if (fFindProgressCallbackFn && !(*fFindProgressCallbackFn)(fFindProgressCallbackContext, pos)) {
        status = U_REGEX_STOPPED_BY_CALLER;
        return true;
    }
    return false;
}

//--------------------------------------------------------------------------------
//
//   find()
//
//--------------------------------------------------------------------------------
// Status-less overload: returns false if an error was deferred earlier;
// any error raised by this search is discarded.
UBool RegexMatcher::find() {
    if (U_FAILURE(fDeferredStatus)) {
        return false;
    }
    UErrorCode status = U_ZERO_ERROR;
    UBool result = find(status);
    return result;
}

//--------------------------------------------------------------------------------
//
//   find()
//
//--------------------------------------------------------------------------------
// Searches for the next match, starting after the previous match (or at the
// start of the active region). Returns true when a match is found. The scan
// strategy is chosen by fPattern->fStartType.
UBool RegexMatcher::find(UErrorCode &status) {
    // Start at the position of the last match end.  (Will be zero if the
    //   matcher has been reset.)
    //
    if (U_FAILURE(status)) {
        return false;
    }
    if (U_FAILURE(fDeferredStatus)) {
        status = fDeferredStatus;
        return false;
    }

    // Input that is entirely in the UText chunk buffer is handled by the
    // chunk-based variant.
    if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
        return findUsingChunk(status);
    }

    int64_t startPos = fMatchEnd;
    if (startPos==0) {
        startPos = fActiveStart;
    }

    if (fMatch) {
        // Save the position of any previous successful match.
        fLastMatchEnd = fMatchEnd;

        if (fMatchStart == fMatchEnd) {
            // Previous match had zero length.  Move start position up one position
            //  to avoid sending find() into a loop on zero-length matches.
            if (startPos >= fActiveLimit) {
                fMatch = false;
                fHitEnd = true;
                return false;
            }
            UTEXT_SETNATIVEINDEX(fInputText, startPos);
            (void)UTEXT_NEXT32(fInputText);
            startPos = UTEXT_GETNATIVEINDEX(fInputText);
        }
    } else {
        if (fLastMatchEnd >= 0) {
            // A previous find() failed to match.  Don't try again.
            //   (without this test, a pattern with a zero-length match
            //    could match again at the end of an input string.)
            fHitEnd = true;
            return false;
        }
    }


    // Compute the position in the input string beyond which a match can not begin, because
    //   the minimum length match would extend past the end of the input.
    //   Note:  some patterns that cannot match anything will have fMinMatchLength==Max Int.
    //          Be aware of possible overflows if making changes here.
    int64_t testStartLimit;
    if (UTEXT_USES_U16(fInputText)) {
        testStartLimit = fActiveLimit - fPattern->fMinMatchLen;
        if (startPos > testStartLimit) {
            fMatch = false;
            fHitEnd = true;
            return false;
        }
    } else {
        // We don't know exactly how long the minimum match length is in native characters.
        // Treat anything > 0 as 1.
        testStartLimit = fActiveLimit - (fPattern->fMinMatchLen > 0 ? 1 : 0);
    }

    UChar32  c;
    U_ASSERT(startPos >= 0);

    switch (fPattern->fStartType) {
    case START_NO_INFO:
        // No optimization was found.
        //  Try a match at each input position.
        for (;;) {
            MatchAt(startPos, false, status);
            if (U_FAILURE(status)) {
                return false;
            }
            if (fMatch) {
                return true;
            }
            if (startPos >= testStartLimit) {
                fHitEnd = true;
                return false;
            }
            UTEXT_SETNATIVEINDEX(fInputText, startPos);
            (void)UTEXT_NEXT32(fInputText);
            startPos = UTEXT_GETNATIVEINDEX(fInputText);
            // Note that it's perfectly OK for a pattern to have a zero-length
            //   match at the end of a string, so we must make sure that the loop
            //   runs with startPos == testStartLimit the last time through.
            if  (findProgressInterrupt(startPos, status))
                return false;
        }
        UPRV_UNREACHABLE_EXIT;

    case START_START:
        // Matches are only possible at the start of the input string
        //   (pattern begins with ^ or \A)
        if (startPos > fActiveStart) {
            fMatch = false;
            return false;
        }
        MatchAt(startPos, false, status);
        if (U_FAILURE(status)) {
            return false;
        }
        return fMatch;


    case START_SET:
        {
            // Match may start on any char from a pre-computed set.
            U_ASSERT(fPattern->fMinMatchLen > 0);
            UTEXT_SETNATIVEINDEX(fInputText, startPos);
            for (;;) {
                // pos is where c begins; startPos moves on to the next character.
                int64_t pos = startPos;
                c = UTEXT_NEXT32(fInputText);
                startPos = UTEXT_GETNATIVEINDEX(fInputText);
                // c will be -1 (U_SENTINEL) at end of text, in which case we
                // skip this next block (so we don't have a negative array index)
                // and handle end of text in the following block.
                if (c >= 0 && ((c<256 && fPattern->fInitialChars8->contains(c)) ||
                              (c>=256 && fPattern->fInitialChars->contains(c)))) {
                    MatchAt(pos, false, status);
                    if (U_FAILURE(status)) {
                        return false;
                    }
                    if (fMatch) {
                        return true;
                    }
                    // NOTE(review): this repositions to pos, whereas the
                    // START_STRING/START_CHAR loop repositions to startPos -
                    // confirm which is intended.
                    UTEXT_SETNATIVEINDEX(fInputText, pos);
                }
                if (startPos > testStartLimit) {
                    fMatch = false;
                    fHitEnd = true;
                    return false;
                }
                if  (findProgressInterrupt(startPos, status))
                    return false;
            }
        }
        UPRV_UNREACHABLE_EXIT;

    case START_STRING:
    case START_CHAR:
        {
            // Match starts on exactly one char.
            U_ASSERT(fPattern->fMinMatchLen > 0);
            UChar32 theChar = fPattern->fInitialChar;
            UTEXT_SETNATIVEINDEX(fInputText, startPos);
            for (;;) {
                int64_t pos = startPos;
                c = UTEXT_NEXT32(fInputText);
                startPos = UTEXT_GETNATIVEINDEX(fInputText);
                if (c == theChar) {
                    MatchAt(pos, false, status);
                    if (U_FAILURE(status)) {
                        return false;
                    }
                    if (fMatch) {
                        return true;
                    }
                    UTEXT_SETNATIVEINDEX(fInputText, startPos);
                }
                if (startPos > testStartLimit) {
                    fMatch = false;
                    fHitEnd = true;
                    return false;
                }
                if  (findProgressInterrupt(startPos, status))
                    return false;
           }
        }
        UPRV_UNREACHABLE_EXIT;

    case START_LINE:
        {
            // Candidate positions are fAnchorStart and the positions following
            // a line terminator; ch holds the character just before startPos.
            UChar32 ch;
            if (startPos == fAnchorStart) {
                MatchAt(startPos, false, status);
                if (U_FAILURE(status)) {
                    return false;
                }
                if (fMatch) {
                    return true;
                }
                UTEXT_SETNATIVEINDEX(fInputText, startPos);
                ch = UTEXT_NEXT32(fInputText);
                startPos = UTEXT_GETNATIVEINDEX(fInputText);
            } else {
                UTEXT_SETNATIVEINDEX(fInputText, startPos);
                ch = UTEXT_PREVIOUS32(fInputText);
                UTEXT_SETNATIVEINDEX(fInputText, startPos);
            }

            if (fPattern->fFlags & UREGEX_UNIX_LINES) {
                // With UREGEX_UNIX_LINES only 0x0a ends a line.
                for (;;) {
                    if (ch == 0x0a) {
                            MatchAt(startPos, false, status);
                            if (U_FAILURE(status)) {
                                return false;
                            }
                            if (fMatch) {
                                return true;
                            }
                            UTEXT_SETNATIVEINDEX(fInputText, startPos);
                    }
                    if (startPos >= testStartLimit) {
                        fMatch = false;
                        fHitEnd = true;
                        return false;
                    }
                    ch = UTEXT_NEXT32(fInputText);
                    startPos = UTEXT_GETNATIVEINDEX(fInputText);
                    // Note that it's perfectly OK for a pattern to have a zero-length
                    //   match at the end of a string, so we must make sure that the loop
                    //   runs with startPos == testStartLimit the last time through.
                    if  (findProgressInterrupt(startPos, status))
                        return false;
                }
            } else {
                for (;;) {
                    if (isLineTerminator(ch)) {
                        // Step over the 0x0a of a 0x0d 0x0a pair so the
                        // sequence counts as a single line end.
                        if (ch == 0x0d && startPos < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) {
                            (void)UTEXT_NEXT32(fInputText);
                            startPos = UTEXT_GETNATIVEINDEX(fInputText);
                        }
                        MatchAt(startPos, false, status);
                        if (U_FAILURE(status)) {
                            return false;
                        }
                        if (fMatch) {
                            return true;
                        }
                        UTEXT_SETNATIVEINDEX(fInputText, startPos);
                    }
                    if (startPos >= testStartLimit) {
                        fMatch = false;
                        fHitEnd = true;
                        return false;
                    }
                    ch = UTEXT_NEXT32(fInputText);
                    startPos = UTEXT_GETNATIVEINDEX(fInputText);
                    // Note that it's perfectly OK for a pattern to have a zero-length
                    //   match at the end of a string, so we must make sure that the loop
                    //   runs with startPos == testStartLimit the last time through.
                    if  (findProgressInterrupt(startPos, status))
                        return false;
                }
            }
        }

    default:
        UPRV_UNREACHABLE_ASSERT;
        // Unknown value in fPattern->fStartType, should be from StartOfMatch enum. But
        // we have reports of this in production code, don't use UPRV_UNREACHABLE_EXIT.
        // See ICU-21669.
        status = U_INTERNAL_PROGRAM_ERROR;
        return false;
    }

    UPRV_UNREACHABLE_EXIT;
}



UBool RegexMatcher::find(int64_t start, UErrorCode &status) {
    if (U_FAILURE(status)) {
        return false;
    }
    if (U_FAILURE(fDeferredStatus)) {
        status = fDeferredStatus;
        return false;
    }
    this->reset();                        // Note:  Reset() is specified by Java Matcher documentation.
                                          //        This will reset the region to be the full input length.
if (start < 0) { status = U_INDEX_OUTOFBOUNDS_ERROR; return false; } int64_t nativeStart = start; if (nativeStart < fActiveStart || nativeStart > fActiveLimit) { status = U_INDEX_OUTOFBOUNDS_ERROR; return false; } fMatchEnd = nativeStart; return find(status); } //-------------------------------------------------------------------------------- // // findUsingChunk() -- like find(), but with the advance knowledge that the // entire string is available in the UText's chunk buffer. // //-------------------------------------------------------------------------------- UBool RegexMatcher::findUsingChunk(UErrorCode &status) { // Start at the position of the last match end. (Will be zero if the // matcher has been reset. // int32_t startPos = (int32_t)fMatchEnd; if (startPos==0) { startPos = (int32_t)fActiveStart; } const char16_t *inputBuf = fInputText->chunkContents; if (fMatch) { // Save the position of any previous successful match. fLastMatchEnd = fMatchEnd; if (fMatchStart == fMatchEnd) { // Previous match had zero length. Move start position up one position // to avoid sending find() into a loop on zero-length matches. if (startPos >= fActiveLimit) { fMatch = false; fHitEnd = true; return false; } U16_FWD_1(inputBuf, startPos, fInputLength); } } else { if (fLastMatchEnd >= 0) { // A previous find() failed to match. Don't try again. // (without this test, a pattern with a zero-length match // could match again at the end of an input string.) fHitEnd = true; return false; } } // Compute the position in the input string beyond which a match can not begin, because // the minimum length match would extend past the end of the input. // Note: some patterns that cannot match anything will have fMinMatchLength==Max Int. // Be aware of possible overflows if making changes here. // Note: a match can begin at inputBuf + testLen; it is an inclusive limit. 
int32_t testLen = (int32_t)(fActiveLimit - fPattern->fMinMatchLen); if (startPos > testLen) { fMatch = false; fHitEnd = true; return false; } UChar32 c; U_ASSERT(startPos >= 0); switch (fPattern->fStartType) { case START_NO_INFO: // No optimization was found. // Try a match at each input position. for (;;) { MatchChunkAt(startPos, false, status); if (U_FAILURE(status)) { return false; } if (fMatch) { return true; } if (startPos >= testLen) { fHitEnd = true; return false; } U16_FWD_1(inputBuf, startPos, fActiveLimit); // Note that it's perfectly OK for a pattern to have a zero-length // match at the end of a string, so we must make sure that the loop // runs with startPos == testLen the last time through. if (findProgressInterrupt(startPos, status)) return false; } UPRV_UNREACHABLE_EXIT; case START_START: // Matches are only possible at the start of the input string // (pattern begins with ^ or \A) if (startPos > fActiveStart) { fMatch = false; return false; } MatchChunkAt(startPos, false, status); if (U_FAILURE(status)) { return false; } return fMatch; case START_SET: { // Match may start on any char from a pre-computed set. U_ASSERT(fPattern->fMinMatchLen > 0); for (;;) { int32_t pos = startPos; U16_NEXT(inputBuf, startPos, fActiveLimit, c); // like c = inputBuf[startPos++]; if ((c<256 && fPattern->fInitialChars8->contains(c)) || (c>=256 && fPattern->fInitialChars->contains(c))) { MatchChunkAt(pos, false, status); if (U_FAILURE(status)) { return false; } if (fMatch) { return true; } } if (startPos > testLen) { fMatch = false; fHitEnd = true; return false; } if (findProgressInterrupt(startPos, status)) return false; } } UPRV_UNREACHABLE_EXIT; case START_STRING: case START_CHAR: { // Match starts on exactly one char. 
U_ASSERT(fPattern->fMinMatchLen > 0); UChar32 theChar = fPattern->fInitialChar; for (;;) { int32_t pos = startPos; U16_NEXT(inputBuf, startPos, fActiveLimit, c); // like c = inputBuf[startPos++]; if (c == theChar) { MatchChunkAt(pos, false, status); if (U_FAILURE(status)) { return false; } if (fMatch) { return true; } } if (startPos > testLen) { fMatch = false; fHitEnd = true; return false; } if (findProgressInterrupt(startPos, status)) return false; } } UPRV_UNREACHABLE_EXIT; case START_LINE: { UChar32 ch; if (startPos == fAnchorStart) { MatchChunkAt(startPos, false, status); if (U_FAILURE(status)) { return false; } if (fMatch) { return true; } U16_FWD_1(inputBuf, startPos, fActiveLimit); } if (fPattern->fFlags & UREGEX_UNIX_LINES) { for (;;) { ch = inputBuf[startPos-1]; if (ch == 0x0a) { MatchChunkAt(startPos, false, status); if (U_FAILURE(status)) { return false; } if (fMatch) { return true; } } if (startPos >= testLen) { fMatch = false; fHitEnd = true; return false; } U16_FWD_1(inputBuf, startPos, fActiveLimit); // Note that it's perfectly OK for a pattern to have a zero-length // match at the end of a string, so we must make sure that the loop // runs with startPos == testLen the last time through. if (findProgressInterrupt(startPos, status)) return false; } } else { for (;;) { ch = inputBuf[startPos-1]; if (isLineTerminator(ch)) { if (ch == 0x0d && startPos < fActiveLimit && inputBuf[startPos] == 0x0a) { startPos++; } MatchChunkAt(startPos, false, status); if (U_FAILURE(status)) { return false; } if (fMatch) { return true; } } if (startPos >= testLen) { fMatch = false; fHitEnd = true; return false; } U16_FWD_1(inputBuf, startPos, fActiveLimit); // Note that it's perfectly OK for a pattern to have a zero-length // match at the end of a string, so we must make sure that the loop // runs with startPos == testLen the last time through. 
if (findProgressInterrupt(startPos, status)) return false; } } } default: UPRV_UNREACHABLE_ASSERT; // Unknown value in fPattern->fStartType, should be from StartOfMatch enum. But // we have reports of this in production code, don't use UPRV_UNREACHABLE_EXIT. // See ICU-21669. status = U_INTERNAL_PROGRAM_ERROR; return false; } UPRV_UNREACHABLE_EXIT; } //-------------------------------------------------------------------------------- // // group() // //-------------------------------------------------------------------------------- UnicodeString RegexMatcher::group(UErrorCode &status) const { return group(0, status); } // Return immutable shallow clone UText *RegexMatcher::group(UText *dest, int64_t &group_len, UErrorCode &status) const { return group(0, dest, group_len, status); } // Return immutable shallow clone UText *RegexMatcher::group(int32_t groupNum, UText *dest, int64_t &group_len, UErrorCode &status) const { group_len = 0; if (U_FAILURE(status)) { return dest; } if (U_FAILURE(fDeferredStatus)) { status = fDeferredStatus; } else if (fMatch == false) { status = U_REGEX_INVALID_STATE; } else if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) { status = U_INDEX_OUTOFBOUNDS_ERROR; } if (U_FAILURE(status)) { return dest; } int64_t s, e; if (groupNum == 0) { s = fMatchStart; e = fMatchEnd; } else { int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1); U_ASSERT(groupOffset < fPattern->fFrameSize); U_ASSERT(groupOffset >= 0); s = fFrame->fExtra[groupOffset]; e = fFrame->fExtra[groupOffset+1]; } if (s < 0) { // A capture group wasn't part of the match return utext_clone(dest, fInputText, false, true, &status); } U_ASSERT(s <= e); group_len = e - s; dest = utext_clone(dest, fInputText, false, true, &status); if (dest) UTEXT_SETNATIVEINDEX(dest, s); return dest; } UnicodeString RegexMatcher::group(int32_t groupNum, UErrorCode &status) const { UnicodeString result; int64_t groupStart = start64(groupNum, status); int64_t groupEnd = end64(groupNum, 
status); if (U_FAILURE(status) || groupStart == -1 || groupStart == groupEnd) { return result; } // Get the group length using a utext_extract preflight. // UText is actually pretty efficient at this when underlying encoding is UTF-16. int32_t length = utext_extract(fInputText, groupStart, groupEnd, nullptr, 0, &status); if (status != U_BUFFER_OVERFLOW_ERROR) { return result; } status = U_ZERO_ERROR; char16_t *buf = result.getBuffer(length); if (buf == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; } else { int32_t extractLength = utext_extract(fInputText, groupStart, groupEnd, buf, length, &status); result.releaseBuffer(extractLength); U_ASSERT(length == extractLength); } return result; } //-------------------------------------------------------------------------------- // // appendGroup() -- currently internal only, appends a group to a UText rather // than replacing its contents // //-------------------------------------------------------------------------------- int64_t RegexMatcher::appendGroup(int32_t groupNum, UText *dest, UErrorCode &status) const { if (U_FAILURE(status)) { return 0; } if (U_FAILURE(fDeferredStatus)) { status = fDeferredStatus; return 0; } int64_t destLen = utext_nativeLength(dest); if (fMatch == false) { status = U_REGEX_INVALID_STATE; return utext_replace(dest, destLen, destLen, nullptr, 0, &status); } if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) { status = U_INDEX_OUTOFBOUNDS_ERROR; return utext_replace(dest, destLen, destLen, nullptr, 0, &status); } int64_t s, e; if (groupNum == 0) { s = fMatchStart; e = fMatchEnd; } else { int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1); U_ASSERT(groupOffset < fPattern->fFrameSize); U_ASSERT(groupOffset >= 0); s = fFrame->fExtra[groupOffset]; e = fFrame->fExtra[groupOffset+1]; } if (s < 0) { // A capture group wasn't part of the match return utext_replace(dest, destLen, destLen, nullptr, 0, &status); } U_ASSERT(s <= e); int64_t deltaLen; if 
(UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { U_ASSERT(e <= fInputLength); deltaLen = utext_replace(dest, destLen, destLen, fInputText->chunkContents+s, (int32_t)(e-s), &status); } else { int32_t len16; if (UTEXT_USES_U16(fInputText)) { len16 = (int32_t)(e-s); } else { UErrorCode lengthStatus = U_ZERO_ERROR; len16 = utext_extract(fInputText, s, e, nullptr, 0, &lengthStatus); } char16_t *groupChars = (char16_t *)uprv_malloc(sizeof(char16_t)*(len16+1)); if (groupChars == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; return 0; } utext_extract(fInputText, s, e, groupChars, len16+1, &status); deltaLen = utext_replace(dest, destLen, destLen, groupChars, len16, &status); uprv_free(groupChars); } return deltaLen; } //-------------------------------------------------------------------------------- // // groupCount() // //-------------------------------------------------------------------------------- int32_t RegexMatcher::groupCount() const { return fPattern->fGroupMap->size(); } //-------------------------------------------------------------------------------- // // hasAnchoringBounds() // //-------------------------------------------------------------------------------- UBool RegexMatcher::hasAnchoringBounds() const { return fAnchoringBounds; } //-------------------------------------------------------------------------------- // // hasTransparentBounds() // //-------------------------------------------------------------------------------- UBool RegexMatcher::hasTransparentBounds() const { return fTransparentBounds; } //-------------------------------------------------------------------------------- // // hitEnd() // //-------------------------------------------------------------------------------- UBool RegexMatcher::hitEnd() const { return fHitEnd; } //-------------------------------------------------------------------------------- // // input() // //-------------------------------------------------------------------------------- const UnicodeString 
&RegexMatcher::input() const { if (!fInput) { UErrorCode status = U_ZERO_ERROR; int32_t len16; if (UTEXT_USES_U16(fInputText)) { len16 = (int32_t)fInputLength; } else { len16 = utext_extract(fInputText, 0, fInputLength, nullptr, 0, &status); status = U_ZERO_ERROR; // overflow, length status } UnicodeString *result = new UnicodeString(len16, 0, 0); char16_t *inputChars = result->getBuffer(len16); utext_extract(fInputText, 0, fInputLength, inputChars, len16, &status); // unterminated warning result->releaseBuffer(len16); (*(const UnicodeString **)&fInput) = result; // pointer assignment, rather than operator= } return *fInput; } //-------------------------------------------------------------------------------- // // inputText() // //-------------------------------------------------------------------------------- UText *RegexMatcher::inputText() const { return fInputText; } //-------------------------------------------------------------------------------- // // getInput() -- like inputText(), but makes a clone or copies into another UText // //-------------------------------------------------------------------------------- UText *RegexMatcher::getInput (UText *dest, UErrorCode &status) const { if (U_FAILURE(status)) { return dest; } if (U_FAILURE(fDeferredStatus)) { status = fDeferredStatus; return dest; } if (dest) { if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { utext_replace(dest, 0, utext_nativeLength(dest), fInputText->chunkContents, (int32_t)fInputLength, &status); } else { int32_t input16Len; if (UTEXT_USES_U16(fInputText)) { input16Len = (int32_t)fInputLength; } else { UErrorCode lengthStatus = U_ZERO_ERROR; input16Len = utext_extract(fInputText, 0, fInputLength, nullptr, 0, &lengthStatus); // buffer overflow error } char16_t *inputChars = (char16_t *)uprv_malloc(sizeof(char16_t)*(input16Len)); if (inputChars == nullptr) { return dest; } status = U_ZERO_ERROR; utext_extract(fInputText, 0, fInputLength, inputChars, input16Len, &status); // not 
terminated warning status = U_ZERO_ERROR; utext_replace(dest, 0, utext_nativeLength(dest), inputChars, input16Len, &status); uprv_free(inputChars); } return dest; } else { return utext_clone(nullptr, fInputText, false, true, &status); } } static UBool compat_SyncMutableUTextContents(UText *ut); static UBool compat_SyncMutableUTextContents(UText *ut) { UBool retVal = false; // In the following test, we're really only interested in whether the UText should switch // between heap and stack allocation. If length hasn't changed, we won't, so the chunkContents // will still point to the correct data. if (utext_nativeLength(ut) != ut->nativeIndexingLimit) { UnicodeString *us=(UnicodeString *)ut->context; // Update to the latest length. // For example, (utext_nativeLength(ut) != ut->nativeIndexingLimit). int32_t newLength = us->length(); // Update the chunk description. // The buffer may have switched between stack- and heap-based. ut->chunkContents = us->getBuffer(); ut->chunkLength = newLength; ut->chunkNativeLimit = newLength; ut->nativeIndexingLimit = newLength; retVal = true; } return retVal; } //-------------------------------------------------------------------------------- // // lookingAt() // //-------------------------------------------------------------------------------- UBool RegexMatcher::lookingAt(UErrorCode &status) { if (U_FAILURE(status)) { return false; } if (U_FAILURE(fDeferredStatus)) { status = fDeferredStatus; return false; } if (fInputUniStrMaybeMutable) { if (compat_SyncMutableUTextContents(fInputText)) { fInputLength = utext_nativeLength(fInputText); reset(); } } else { resetPreserveRegion(); } if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { MatchChunkAt((int32_t)fActiveStart, false, status); } else { MatchAt(fActiveStart, false, status); } return fMatch; } UBool RegexMatcher::lookingAt(int64_t start, UErrorCode &status) { if (U_FAILURE(status)) { return false; } if (U_FAILURE(fDeferredStatus)) { status = fDeferredStatus; return false; } 
reset(); if (start < 0) { status = U_INDEX_OUTOFBOUNDS_ERROR; return false; } if (fInputUniStrMaybeMutable) { if (compat_SyncMutableUTextContents(fInputText)) { fInputLength = utext_nativeLength(fInputText); reset(); } } int64_t nativeStart; nativeStart = start; if (nativeStart < fActiveStart || nativeStart > fActiveLimit) { status = U_INDEX_OUTOFBOUNDS_ERROR; return false; } if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { MatchChunkAt((int32_t)nativeStart, false, status); } else { MatchAt(nativeStart, false, status); } return fMatch; } //-------------------------------------------------------------------------------- // // matches() // //-------------------------------------------------------------------------------- UBool RegexMatcher::matches(UErrorCode &status) { if (U_FAILURE(status)) { return false; } if (U_FAILURE(fDeferredStatus)) { status = fDeferredStatus; return false; } if (fInputUniStrMaybeMutable) { if (compat_SyncMutableUTextContents(fInputText)) { fInputLength = utext_nativeLength(fInputText); reset(); } } else { resetPreserveRegion(); } if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { MatchChunkAt((int32_t)fActiveStart, true, status); } else { MatchAt(fActiveStart, true, status); } return fMatch; } UBool RegexMatcher::matches(int64_t start, UErrorCode &status) { if (U_FAILURE(status)) { return false; } if (U_FAILURE(fDeferredStatus)) { status = fDeferredStatus; return false; } reset(); if (start < 0) { status = U_INDEX_OUTOFBOUNDS_ERROR; return false; } if (fInputUniStrMaybeMutable) { if (compat_SyncMutableUTextContents(fInputText)) { fInputLength = utext_nativeLength(fInputText); reset(); } } int64_t nativeStart; nativeStart = start; if (nativeStart < fActiveStart || nativeStart > fActiveLimit) { status = U_INDEX_OUTOFBOUNDS_ERROR; return false; } if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { MatchChunkAt((int32_t)nativeStart, true, status); } else { MatchAt(nativeStart, true, status); } return fMatch; } 
//-------------------------------------------------------------------------------- // // pattern // //-------------------------------------------------------------------------------- const RegexPattern &RegexMatcher::pattern() const { return *fPattern; } //-------------------------------------------------------------------------------- // // region // //-------------------------------------------------------------------------------- RegexMatcher &RegexMatcher::region(int64_t regionStart, int64_t regionLimit, int64_t startIndex, UErrorCode &status) { if (U_FAILURE(status)) { return *this; } if (regionStart>regionLimit || regionStart<0 || regionLimit<0) { status = U_ILLEGAL_ARGUMENT_ERROR; } int64_t nativeStart = regionStart; int64_t nativeLimit = regionLimit; if (nativeStart > fInputLength || nativeLimit > fInputLength) { status = U_ILLEGAL_ARGUMENT_ERROR; } if (startIndex == -1) this->reset(); else resetPreserveRegion(); fRegionStart = nativeStart; fRegionLimit = nativeLimit; fActiveStart = nativeStart; fActiveLimit = nativeLimit; if (startIndex != -1) { if (startIndex < fActiveStart || startIndex > fActiveLimit) { status = U_INDEX_OUTOFBOUNDS_ERROR; } fMatchEnd = startIndex; } if (!fTransparentBounds) { fLookStart = nativeStart; fLookLimit = nativeLimit; } if (fAnchoringBounds) { fAnchorStart = nativeStart; fAnchorLimit = nativeLimit; } return *this; } RegexMatcher &RegexMatcher::region(int64_t start, int64_t limit, UErrorCode &status) { return region(start, limit, -1, status); } //-------------------------------------------------------------------------------- // // regionEnd // //-------------------------------------------------------------------------------- int32_t RegexMatcher::regionEnd() const { return (int32_t)fRegionLimit; } int64_t RegexMatcher::regionEnd64() const { return fRegionLimit; } //-------------------------------------------------------------------------------- // // regionStart // 
//-------------------------------------------------------------------------------- int32_t RegexMatcher::regionStart() const { return (int32_t)fRegionStart; } int64_t RegexMatcher::regionStart64() const { return fRegionStart; } //-------------------------------------------------------------------------------- // // replaceAll // //-------------------------------------------------------------------------------- UnicodeString RegexMatcher::replaceAll(const UnicodeString &replacement, UErrorCode &status) { UText replacementText = UTEXT_INITIALIZER; UText resultText = UTEXT_INITIALIZER; UnicodeString resultString; if (U_FAILURE(status)) { return resultString; } utext_openConstUnicodeString(&replacementText, &replacement, &status); utext_openUnicodeString(&resultText, &resultString, &status); replaceAll(&replacementText, &resultText, status); utext_close(&resultText); utext_close(&replacementText); return resultString; } // // replaceAll, UText mode // UText *RegexMatcher::replaceAll(UText *replacement, UText *dest, UErrorCode &status) { if (U_FAILURE(status)) { return dest; } if (U_FAILURE(fDeferredStatus)) { status = fDeferredStatus; return dest; } if (dest == nullptr) { UnicodeString emptyString; UText empty = UTEXT_INITIALIZER; utext_openUnicodeString(&empty, &emptyString, &status); dest = utext_clone(nullptr, &empty, true, false, &status); utext_close(&empty); } if (U_SUCCESS(status)) { reset(); while (find()) { appendReplacement(dest, replacement, status); if (U_FAILURE(status)) { break; } } appendTail(dest, status); } return dest; } //-------------------------------------------------------------------------------- // // replaceFirst // //-------------------------------------------------------------------------------- UnicodeString RegexMatcher::replaceFirst(const UnicodeString &replacement, UErrorCode &status) { UText replacementText = UTEXT_INITIALIZER; UText resultText = UTEXT_INITIALIZER; UnicodeString resultString; 
utext_openConstUnicodeString(&replacementText, &replacement, &status); utext_openUnicodeString(&resultText, &resultString, &status); replaceFirst(&replacementText, &resultText, status); utext_close(&resultText); utext_close(&replacementText); return resultString; } // // replaceFirst, UText mode // UText *RegexMatcher::replaceFirst(UText *replacement, UText *dest, UErrorCode &status) { if (U_FAILURE(status)) { return dest; } if (U_FAILURE(fDeferredStatus)) { status = fDeferredStatus; return dest; } reset(); if (!find()) { return getInput(dest, status); } if (dest == nullptr) { UnicodeString emptyString; UText empty = UTEXT_INITIALIZER; utext_openUnicodeString(&empty, &emptyString, &status); dest = utext_clone(nullptr, &empty, true, false, &status); utext_close(&empty); } appendReplacement(dest, replacement, status); appendTail(dest, status); return dest; } //-------------------------------------------------------------------------------- // // requireEnd // //-------------------------------------------------------------------------------- UBool RegexMatcher::requireEnd() const { return fRequireEnd; } //-------------------------------------------------------------------------------- // // reset // //-------------------------------------------------------------------------------- RegexMatcher &RegexMatcher::reset() { fRegionStart = 0; fRegionLimit = fInputLength; fActiveStart = 0; fActiveLimit = fInputLength; fAnchorStart = 0; fAnchorLimit = fInputLength; fLookStart = 0; fLookLimit = fInputLength; resetPreserveRegion(); return *this; } void RegexMatcher::resetPreserveRegion() { fMatchStart = 0; fMatchEnd = 0; fLastMatchEnd = -1; fAppendPosition = 0; fMatch = false; fHitEnd = false; fRequireEnd = false; fTime = 0; fTickCounter = TIMER_INITIAL_VALUE; //resetStack(); // more expensive than it looks... 
} RegexMatcher &RegexMatcher::reset(const UnicodeString &input) { fInputText = utext_openConstUnicodeString(fInputText, &input, &fDeferredStatus); if (fPattern->fNeedsAltInput) { fAltInputText = utext_clone(fAltInputText, fInputText, false, true, &fDeferredStatus); } if (U_FAILURE(fDeferredStatus)) { return *this; } fInputLength = utext_nativeLength(fInputText); reset(); delete fInput; fInput = nullptr; // Do the following for any UnicodeString. // This is for compatibility for those clients who modify the input string "live" during regex operations. fInputUniStrMaybeMutable = true; #if UCONFIG_NO_BREAK_ITERATION==0 if (fWordBreakItr) { fWordBreakItr->setText(fInputText, fDeferredStatus); } if (fGCBreakItr) { fGCBreakItr->setText(fInputText, fDeferredStatus); } #endif return *this; } RegexMatcher &RegexMatcher::reset(UText *input) { if (fInputText != input) { fInputText = utext_clone(fInputText, input, false, true, &fDeferredStatus); if (fPattern->fNeedsAltInput) fAltInputText = utext_clone(fAltInputText, fInputText, false, true, &fDeferredStatus); if (U_FAILURE(fDeferredStatus)) { return *this; } fInputLength = utext_nativeLength(fInputText); delete fInput; fInput = nullptr; #if UCONFIG_NO_BREAK_ITERATION==0 if (fWordBreakItr) { fWordBreakItr->setText(input, fDeferredStatus); } if (fGCBreakItr) { fGCBreakItr->setText(fInputText, fDeferredStatus); } #endif } reset(); fInputUniStrMaybeMutable = false; return *this; } /*RegexMatcher &RegexMatcher::reset(const char16_t *) { fDeferredStatus = U_INTERNAL_PROGRAM_ERROR; return *this; }*/ RegexMatcher &RegexMatcher::reset(int64_t position, UErrorCode &status) { if (U_FAILURE(status)) { return *this; } reset(); // Reset also resets the region to be the entire string. 
if (position < 0 || position > fActiveLimit) { status = U_INDEX_OUTOFBOUNDS_ERROR; return *this; } fMatchEnd = position; return *this; } //-------------------------------------------------------------------------------- // // refresh // //-------------------------------------------------------------------------------- RegexMatcher &RegexMatcher::refreshInputText(UText *input, UErrorCode &status) { if (U_FAILURE(status)) { return *this; } if (input == nullptr) { status = U_ILLEGAL_ARGUMENT_ERROR; return *this; } if (utext_nativeLength(fInputText) != utext_nativeLength(input)) { status = U_ILLEGAL_ARGUMENT_ERROR; return *this; } int64_t pos = utext_getNativeIndex(fInputText); // Shallow read-only clone of the new UText into the existing input UText fInputText = utext_clone(fInputText, input, false, true, &status); if (U_FAILURE(status)) { return *this; } utext_setNativeIndex(fInputText, pos); if (fAltInputText != nullptr) { pos = utext_getNativeIndex(fAltInputText); fAltInputText = utext_clone(fAltInputText, input, false, true, &status); if (U_FAILURE(status)) { return *this; } utext_setNativeIndex(fAltInputText, pos); } return *this; } //-------------------------------------------------------------------------------- // // setTrace // //-------------------------------------------------------------------------------- void RegexMatcher::setTrace(UBool state) { fTraceDebug = state; } /** * UText, replace entire contents of the destination UText with a substring of the source UText. * * @param src The source UText * @param dest The destination UText. Must be writable. * May be nullptr, in which case a new UText will be allocated. * @param start Start index of source substring. * @param limit Limit index of source substring. * @param status An error code. 
*/ static UText *utext_extract_replace(UText *src, UText *dest, int64_t start, int64_t limit, UErrorCode *status) { if (U_FAILURE(*status)) { return dest; } if (start == limit) { if (dest) { utext_replace(dest, 0, utext_nativeLength(dest), nullptr, 0, status); return dest; } else { return utext_openUChars(nullptr, nullptr, 0, status); } } int32_t length = utext_extract(src, start, limit, nullptr, 0, status); if (*status != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(*status)) { return dest; } *status = U_ZERO_ERROR; MaybeStackArray buffer; if (length >= buffer.getCapacity()) { char16_t *newBuf = buffer.resize(length+1); // Leave space for terminating Nul. if (newBuf == nullptr) { *status = U_MEMORY_ALLOCATION_ERROR; } } utext_extract(src, start, limit, buffer.getAlias(), length+1, status); if (dest) { utext_replace(dest, 0, utext_nativeLength(dest), buffer.getAlias(), length, status); return dest; } // Caller did not provide a preexisting UText. // Open a new one, and have it adopt the text buffer storage. 
if (U_FAILURE(*status)) { return nullptr; } int32_t ownedLength = 0; char16_t *ownedBuf = buffer.orphanOrClone(length+1, ownedLength); if (ownedBuf == nullptr) { *status = U_MEMORY_ALLOCATION_ERROR; return nullptr; } UText *result = utext_openUChars(nullptr, ownedBuf, length, status); if (U_FAILURE(*status)) { uprv_free(ownedBuf); return nullptr; } result->providerProperties |= (1 << UTEXT_PROVIDER_OWNS_TEXT); return result; } //--------------------------------------------------------------------- // // split // //--------------------------------------------------------------------- int32_t RegexMatcher::split(const UnicodeString &input, UnicodeString dest[], int32_t destCapacity, UErrorCode &status) { UText inputText = UTEXT_INITIALIZER; utext_openConstUnicodeString(&inputText, &input, &status); if (U_FAILURE(status)) { return 0; } UText **destText = (UText **)uprv_malloc(sizeof(UText*)*destCapacity); if (destText == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; return 0; } int32_t i; for (i = 0; i < destCapacity; i++) { destText[i] = utext_openUnicodeString(nullptr, &dest[i], &status); } int32_t fieldCount = split(&inputText, destText, destCapacity, status); for (i = 0; i < destCapacity; i++) { utext_close(destText[i]); } uprv_free(destText); utext_close(&inputText); return fieldCount; } // // split, UText mode // int32_t RegexMatcher::split(UText *input, UText *dest[], int32_t destCapacity, UErrorCode &status) { // // Check arguments for validity // if (U_FAILURE(status)) { return 0; } if (destCapacity < 1) { status = U_ILLEGAL_ARGUMENT_ERROR; return 0; } // // Reset for the input text // reset(input); int64_t nextOutputStringStart = 0; if (fActiveLimit == 0) { return 0; } // // Loop through the input text, searching for the delimiter pattern // int32_t i; int32_t numCaptureGroups = fPattern->fGroupMap->size(); for (i=0; ; i++) { if (i>=destCapacity-1) { // There is one or zero output string left. 
// Fill the last output string with whatever is left from the input, then exit the loop. // ( i will be == destCapacity if we filled the output array while processing // capture groups of the delimiter expression, in which case we will discard the // last capture group saved in favor of the unprocessed remainder of the // input string.) i = destCapacity-1; if (fActiveLimit > nextOutputStringStart) { if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) { if (dest[i]) { utext_replace(dest[i], 0, utext_nativeLength(dest[i]), input->chunkContents+nextOutputStringStart, (int32_t)(fActiveLimit-nextOutputStringStart), &status); } else { UText remainingText = UTEXT_INITIALIZER; utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart, fActiveLimit-nextOutputStringStart, &status); dest[i] = utext_clone(nullptr, &remainingText, true, false, &status); utext_close(&remainingText); } } else { UErrorCode lengthStatus = U_ZERO_ERROR; int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fActiveLimit, nullptr, 0, &lengthStatus); char16_t *remainingChars = (char16_t *)uprv_malloc(sizeof(char16_t)*(remaining16Length+1)); if (remainingChars == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; break; } utext_extract(input, nextOutputStringStart, fActiveLimit, remainingChars, remaining16Length+1, &status); if (dest[i]) { utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status); } else { UText remainingText = UTEXT_INITIALIZER; utext_openUChars(&remainingText, remainingChars, remaining16Length, &status); dest[i] = utext_clone(nullptr, &remainingText, true, false, &status); utext_close(&remainingText); } uprv_free(remainingChars); } } break; } if (find()) { // We found another delimiter. Move everything from where we started looking // up until the start of the delimiter into the next output string. 
if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) { if (dest[i]) { utext_replace(dest[i], 0, utext_nativeLength(dest[i]), input->chunkContents+nextOutputStringStart, (int32_t)(fMatchStart-nextOutputStringStart), &status); } else { UText remainingText = UTEXT_INITIALIZER; utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart, fMatchStart-nextOutputStringStart, &status); dest[i] = utext_clone(nullptr, &remainingText, true, false, &status); utext_close(&remainingText); } } else { UErrorCode lengthStatus = U_ZERO_ERROR; int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fMatchStart, nullptr, 0, &lengthStatus); char16_t *remainingChars = (char16_t *)uprv_malloc(sizeof(char16_t)*(remaining16Length+1)); if (remainingChars == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; break; } utext_extract(input, nextOutputStringStart, fMatchStart, remainingChars, remaining16Length+1, &status); if (dest[i]) { utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status); } else { UText remainingText = UTEXT_INITIALIZER; utext_openUChars(&remainingText, remainingChars, remaining16Length, &status); dest[i] = utext_clone(nullptr, &remainingText, true, false, &status); utext_close(&remainingText); } uprv_free(remainingChars); } nextOutputStringStart = fMatchEnd; // If the delimiter pattern has capturing parentheses, the captured // text goes out into the next n destination strings. int32_t groupNum; for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) { if (i >= destCapacity-2) { // Never fill the last available output string with capture group text. // It will filled with the last field, the remainder of the // unsplit input text. break; } i++; dest[i] = utext_extract_replace(fInputText, dest[i], start64(groupNum, status), end64(groupNum, status), &status); } if (nextOutputStringStart == fActiveLimit) { // The delimiter was at the end of the string. 
We're done, but first // we output one last empty string, for the empty field following // the delimiter at the end of input. if (i+1 < destCapacity) { ++i; if (dest[i] == nullptr) { dest[i] = utext_openUChars(nullptr, nullptr, 0, &status); } else { static const char16_t emptyString[] = {(char16_t)0}; utext_replace(dest[i], 0, utext_nativeLength(dest[i]), emptyString, 0, &status); } } break; } } else { // We ran off the end of the input while looking for the next delimiter. // All the remaining text goes into the current output string. if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) { if (dest[i]) { utext_replace(dest[i], 0, utext_nativeLength(dest[i]), input->chunkContents+nextOutputStringStart, (int32_t)(fActiveLimit-nextOutputStringStart), &status); } else { UText remainingText = UTEXT_INITIALIZER; utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart, fActiveLimit-nextOutputStringStart, &status); dest[i] = utext_clone(nullptr, &remainingText, true, false, &status); utext_close(&remainingText); } } else { UErrorCode lengthStatus = U_ZERO_ERROR; int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fActiveLimit, nullptr, 0, &lengthStatus); char16_t *remainingChars = (char16_t *)uprv_malloc(sizeof(char16_t)*(remaining16Length+1)); if (remainingChars == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; break; } utext_extract(input, nextOutputStringStart, fActiveLimit, remainingChars, remaining16Length+1, &status); if (dest[i]) { utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status); } else { UText remainingText = UTEXT_INITIALIZER; utext_openUChars(&remainingText, remainingChars, remaining16Length, &status); dest[i] = utext_clone(nullptr, &remainingText, true, false, &status); utext_close(&remainingText); } uprv_free(remainingChars); } break; } if (U_FAILURE(status)) { break; } } // end of for loop return i+1; } 
//-------------------------------------------------------------------------------- // // start // //-------------------------------------------------------------------------------- int32_t RegexMatcher::start(UErrorCode &status) const { return start(0, status); } int64_t RegexMatcher::start64(UErrorCode &status) const { return start64(0, status); } //-------------------------------------------------------------------------------- // // start(int32_t group, UErrorCode &status) // //-------------------------------------------------------------------------------- int64_t RegexMatcher::start64(int32_t group, UErrorCode &status) const { if (U_FAILURE(status)) { return -1; } if (U_FAILURE(fDeferredStatus)) { status = fDeferredStatus; return -1; } if (fMatch == false) { status = U_REGEX_INVALID_STATE; return -1; } if (group < 0 || group > fPattern->fGroupMap->size()) { status = U_INDEX_OUTOFBOUNDS_ERROR; return -1; } int64_t s; if (group == 0) { s = fMatchStart; } else { int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1); U_ASSERT(groupOffset < fPattern->fFrameSize); U_ASSERT(groupOffset >= 0); s = fFrame->fExtra[groupOffset]; } return s; } int32_t RegexMatcher::start(int32_t group, UErrorCode &status) const { return (int32_t)start64(group, status); } //-------------------------------------------------------------------------------- // // useAnchoringBounds // //-------------------------------------------------------------------------------- RegexMatcher &RegexMatcher::useAnchoringBounds(UBool b) { fAnchoringBounds = b; fAnchorStart = (fAnchoringBounds ? fRegionStart : 0); fAnchorLimit = (fAnchoringBounds ? 
fRegionLimit : fInputLength); return *this; } //-------------------------------------------------------------------------------- // // useTransparentBounds // //-------------------------------------------------------------------------------- RegexMatcher &RegexMatcher::useTransparentBounds(UBool b) { fTransparentBounds = b; fLookStart = (fTransparentBounds ? 0 : fRegionStart); fLookLimit = (fTransparentBounds ? fInputLength : fRegionLimit); return *this; } //-------------------------------------------------------------------------------- // // setTimeLimit // //-------------------------------------------------------------------------------- void RegexMatcher::setTimeLimit(int32_t limit, UErrorCode &status) { if (U_FAILURE(status)) { return; } if (U_FAILURE(fDeferredStatus)) { status = fDeferredStatus; return; } if (limit < 0) { status = U_ILLEGAL_ARGUMENT_ERROR; return; } fTimeLimit = limit; } //-------------------------------------------------------------------------------- // // getTimeLimit // //-------------------------------------------------------------------------------- int32_t RegexMatcher::getTimeLimit() const { return fTimeLimit; } //-------------------------------------------------------------------------------- // // setStackLimit // //-------------------------------------------------------------------------------- void RegexMatcher::setStackLimit(int32_t limit, UErrorCode &status) { if (U_FAILURE(status)) { return; } if (U_FAILURE(fDeferredStatus)) { status = fDeferredStatus; return; } if (limit < 0) { status = U_ILLEGAL_ARGUMENT_ERROR; return; } // Reset the matcher. This is needed here in case there is a current match // whose final stack frame (containing the match results, pointed to by fFrame) // would be lost by resizing to a smaller stack size. 
reset(); if (limit == 0) { // Unlimited stack expansion fStack->setMaxCapacity(0); } else { // Change the units of the limit from bytes to ints, and bump the size up // to be big enough to hold at least one stack frame for the pattern, // if it isn't there already. int32_t adjustedLimit = limit / sizeof(int32_t); if (adjustedLimit < fPattern->fFrameSize) { adjustedLimit = fPattern->fFrameSize; } fStack->setMaxCapacity(adjustedLimit); } fStackLimit = limit; } //-------------------------------------------------------------------------------- // // getStackLimit // //-------------------------------------------------------------------------------- int32_t RegexMatcher::getStackLimit() const { return fStackLimit; } //-------------------------------------------------------------------------------- // // setMatchCallback // //-------------------------------------------------------------------------------- void RegexMatcher::setMatchCallback(URegexMatchCallback *callback, const void *context, UErrorCode &status) { if (U_FAILURE(status)) { return; } fCallbackFn = callback; fCallbackContext = context; } //-------------------------------------------------------------------------------- // // getMatchCallback // //-------------------------------------------------------------------------------- void RegexMatcher::getMatchCallback(URegexMatchCallback *&callback, const void *&context, UErrorCode &status) { if (U_FAILURE(status)) { return; } callback = fCallbackFn; context = fCallbackContext; } //-------------------------------------------------------------------------------- // // setMatchCallback // //-------------------------------------------------------------------------------- void RegexMatcher::setFindProgressCallback(URegexFindProgressCallback *callback, const void *context, UErrorCode &status) { if (U_FAILURE(status)) { return; } fFindProgressCallbackFn = callback; fFindProgressCallbackContext = context; } 
//-------------------------------------------------------------------------------- // // getMatchCallback // //-------------------------------------------------------------------------------- void RegexMatcher::getFindProgressCallback(URegexFindProgressCallback *&callback, const void *&context, UErrorCode &status) { if (U_FAILURE(status)) { return; } callback = fFindProgressCallbackFn; context = fFindProgressCallbackContext; } //================================================================================ // // Code following this point in this file is the internal // Match Engine Implementation. // //================================================================================ //-------------------------------------------------------------------------------- // // resetStack // Discard any previous contents of the state save stack, and initialize a // new stack frame to all -1. The -1s are needed for capture group limits, // where they indicate that a group has not yet matched anything. //-------------------------------------------------------------------------------- REStackFrame *RegexMatcher::resetStack() { // Discard any previous contents of the state save stack, and initialize a // new stack frame with all -1 data. The -1s are needed for capture group limits, // where they indicate that a group has not yet matched anything. fStack->removeAllElements(); REStackFrame *iFrame = (REStackFrame *)fStack->reserveBlock(fPattern->fFrameSize, fDeferredStatus); if(U_FAILURE(fDeferredStatus)) { return nullptr; } int32_t i; for (i=0; ifFrameSize-RESTACKFRAME_HDRCOUNT; i++) { iFrame->fExtra[i] = -1; } return iFrame; } //-------------------------------------------------------------------------------- // // isWordBoundary // in perl, "xab..cd..", \b is true at positions 0,3,5,7 // For us, // If the current char is a combining mark, // \b is false. // Else Scan backwards to the first non-combining char. 
//                       We are at a boundary if this char and the original char are
//                          opposite in membership in the \w set
//
//          parameters:   pos   - the current position in the input buffer
//
//              TODO:  double-check edge cases at region boundaries.
//
//--------------------------------------------------------------------------------
UBool RegexMatcher::isWordBoundary(int64_t pos) {
    UBool isBoundary = false;
    UBool cIsWord    = false;

    if (pos >= fLookLimit) {
        // At or beyond the look-ahead limit: record that the end was hit and
        // treat the (non-existent) current char as a non-word char.
        // NOTE(review): the text iterator is not repositioned to pos on this
        // path, so the backwards scan below starts from wherever fInputText
        // currently points - confirm that callers guarantee this is pos.
        fHitEnd = true;
    } else {
        // Determine whether char c at current position is a member of the word set of chars.
        // If we're off the end of the string, behave as though we're not at a word char.
        UTEXT_SETNATIVEINDEX(fInputText, pos);
        UChar32  c = UTEXT_CURRENT32(fInputText);
        if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) {
            // Current char is a combining one.  Not a boundary.
            return false;
        }
        cIsWord = RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET].contains(c);
    }

    // Back up until we come to a non-combining char, determine whether
    //  that char is a word char.
    // If the start of the look-behind range is reached first, prevCIsWord
    // stays false.
    UBool prevCIsWord = false;
    for (;;) {
        if (UTEXT_GETNATIVEINDEX(fInputText) <= fLookStart) {
            break;
        }
        UChar32 prevChar = UTEXT_PREVIOUS32(fInputText);
        if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND)
              || u_charType(prevChar) == U_FORMAT_CHAR)) {
            prevCIsWord = RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET].contains(prevChar);
            break;
        }
    }
    // A boundary exists when exactly one of the two chars is a word char.
    isBoundary = cIsWord ^ prevCIsWord;
    return isBoundary;
}

// Same test as isWordBoundary(), but reads the UTF-16 code units directly from
// the chunk buffer of fInputText instead of going through the UText iterator.
//
//          parameters:   pos   - the current position, used as an index into
//                                fInputText->chunkContents
//
UBool RegexMatcher::isChunkWordBoundary(int32_t pos) {
    UBool isBoundary = false;
    UBool cIsWord    = false;

    const char16_t *inputBuf = fInputText->chunkContents;

    if (pos >= fLookLimit) {
        fHitEnd = true;
    } else {
        // Determine whether char c at current position is a member of the word set of chars.
        // If we're off the end of the string, behave as though we're not at a word char.
        UChar32 c;
        U16_GET(inputBuf, fLookStart, pos, fLookLimit, c);
        if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) {
            // Current char is a combining one.  Not a boundary.
            return false;
        }
        cIsWord = RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET].contains(c);
    }

    // Back up until we come to a non-combining char, determine whether
    //  that char is a word char.
    // pos is a by-value parameter; U16_PREV moves it backwards as it reads.
    UBool prevCIsWord = false;
    for (;;) {
        if (pos <= fLookStart) {
            break;
        }
        UChar32 prevChar;
        U16_PREV(inputBuf, fLookStart, pos, prevChar);
        if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND)
              || u_charType(prevChar) == U_FORMAT_CHAR)) {
            prevCIsWord = RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET].contains(prevChar);
            break;
        }
    }
    // A boundary exists when exactly one of the two chars is a word char.
    isBoundary = cIsWord ^ prevCIsWord;
    return isBoundary;
}

//--------------------------------------------------------------------------------
//
//   isUWordBoundary
//
//         Test for a word boundary using RBBI word break.
//
//          parameters:   pos   - the current position in the input buffer
//                        status - receives any error from creating the break
//                                 iterator or setting its text
//
//          Returns false if the word break iterator cannot be created, and
//          always returns false when break iteration is configured out.
//
//--------------------------------------------------------------------------------
UBool RegexMatcher::isUWordBoundary(int64_t pos, UErrorCode &status) {
    UBool       returnVal = false;

#if UCONFIG_NO_BREAK_ITERATION==0
    // Note: this point will never be reached if break iteration is configured out.
    //       Regex patterns that would require this function will fail to compile.

    // If we haven't yet created a break iterator for this matcher, do it now.
    if (fWordBreakItr == nullptr) {
        fWordBreakItr = BreakIterator::createWordInstance(Locale::getEnglish(), status);
        if (U_FAILURE(status)) {
            return false;
        }
        fWordBreakItr->setText(fInputText, status);
    }

    // Note: zero width boundary tests like \b see through transparent region bounds,
    //       which is why fLookLimit is used here, rather than fActiveLimit.
    if (pos >= fLookLimit) {
        fHitEnd = true;
        returnVal = true;   // With Unicode word rules, only positions within the interior of "real"
                            //    words are not boundaries.  All non-word chars stand by themselves,
                            //    with word boundaries on both sides.
    } else {
        // The break iterator takes a 32-bit offset, hence the narrowing cast.
        returnVal = fWordBreakItr->isBoundary((int32_t)pos);
    }
#endif
    return   returnVal;
}


// Find the next grapheme cluster boundary after pos, using a character break
// iterator that is created on first use.
//
//          parameters:   pos    - the current position in the input buffer
//                        status - receives any error from creating the break
//                                 iterator or setting its text
//
//          Returns pos unchanged if the iterator cannot be created, if the
//          iterator reports BreakIterator::DONE, or if break iteration is
//          configured out.
//
int64_t RegexMatcher::followingGCBoundary(int64_t pos, UErrorCode &status) {
    int64_t result = pos;

#if UCONFIG_NO_BREAK_ITERATION==0
    // Note: this point will never be reached if break iteration is configured out.
    //       Regex patterns that would require this function will fail to compile.

    // If we haven't yet created a break iterator for this matcher, do it now.
    if (fGCBreakItr == nullptr) {
        fGCBreakItr = BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
        if (U_FAILURE(status)) {
            return pos;
        }
        fGCBreakItr->setText(fInputText, status);
    }
    result = fGCBreakItr->following(pos);
    if (result == BreakIterator::DONE) {
        // No following boundary was reported; leave the position where it was.
        result = pos;
    }
#endif
    return result;
}

//--------------------------------------------------------------------------------
//
//   IncrementTime     This function is called once each TIMER_INITIAL_VALUE state
//                     saves. Increment the "time" counter, and call the
//                     user callback function if there is one installed.
//
//                     If the match operation needs to be aborted, either for a time-out
//                     or because the user callback asked for it, just set an error status.
//                     The engine will pick that up and stop in its outer loop.
//
//--------------------------------------------------------------------------------
void RegexMatcher::IncrementTime(UErrorCode &status) {
    // Restart the countdown to the next call of this function.
    fTickCounter = TIMER_INITIAL_VALUE;
    fTime++;
    if (fCallbackFn != nullptr) {
        // A false return from the user callback asks for the match to stop.
        if ((*fCallbackFn)(fCallbackContext, fTime) == false) {
            status = U_REGEX_STOPPED_BY_CALLER;
            return;
        }
    }
    // A time limit of zero or less never triggers a time-out.
    if (fTimeLimit > 0 && fTime >= fTimeLimit) {
        status = U_REGEX_TIME_OUT;
    }
}

//--------------------------------------------------------------------------------
//
//   StateSave
//       Make a new stack frame, initialized as a copy of the current stack frame.
//       Set the pattern index in the original stack frame from the operand value
//       in the opcode.
Execution of the engine continues with the state in // the newly created stack frame // // Note that reserveBlock() may grow the stack, resulting in the // whole thing being relocated in memory. // // Parameters: // fp The top frame pointer when called. At return, a new // fame will be present // savePatIdx An index into the compiled pattern. Goes into the original // (not new) frame. If execution ever back-tracks out of the // new frame, this will be where we continue from in the pattern. // Return // The new frame pointer. // //-------------------------------------------------------------------------------- inline REStackFrame *RegexMatcher::StateSave(REStackFrame *fp, int64_t savePatIdx, UErrorCode &status) { if (U_FAILURE(status)) { return fp; } // push storage for a new frame. int64_t *newFP = fStack->reserveBlock(fFrameSize, status); if (U_FAILURE(status)) { // Failure on attempted stack expansion. // Stack function set some other error code, change it to a more // specific one for regular expressions. status = U_REGEX_STACK_OVERFLOW; // We need to return a writable stack frame, so just return the // previous frame. The match operation will stop quickly // because of the error status, after which the frame will never // be looked at again. return fp; } fp = (REStackFrame *)(newFP - fFrameSize); // in case of realloc of stack. // New stack frame = copy of old top frame. 
int64_t *source = (int64_t *)fp; int64_t *dest = newFP; for (;;) { *dest++ = *source++; if (source == newFP) { break; } } fTickCounter--; if (fTickCounter <= 0) { IncrementTime(status); // Re-initializes fTickCounter } fp->fPatIdx = savePatIdx; return (REStackFrame *)newFP; } #if defined(REGEX_DEBUG) namespace { UnicodeString StringFromUText(UText *ut) { UnicodeString result; for (UChar32 c = utext_next32From(ut, 0); c != U_SENTINEL; c = UTEXT_NEXT32(ut)) { result.append(c); } return result; } } #endif // REGEX_DEBUG //-------------------------------------------------------------------------------- // // MatchAt This is the actual matching engine. // // startIdx: begin matching a this index. // toEnd: if true, match must extend to end of the input region // //-------------------------------------------------------------------------------- void RegexMatcher::MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status) { UBool isMatch = false; // True if the we have a match. int64_t backSearchIndex = U_INT64_MAX; // used after greedy single-character matches for searching backwards int32_t op; // Operation from the compiled pattern, split into int32_t opType; // the opcode int32_t opValue; // and the operand value. 
#ifdef REGEX_RUN_DEBUG if (fTraceDebug) { printf("MatchAt(startIdx=%ld)\n", startIdx); printf("Original Pattern: \"%s\"\n", CStr(StringFromUText(fPattern->fPattern))()); printf("Input String: \"%s\"\n\n", CStr(StringFromUText(fInputText))()); } #endif if (U_FAILURE(status)) { return; } // Cache frequently referenced items from the compiled pattern // int64_t *pat = fPattern->fCompiledPat->getBuffer(); const char16_t *litText = fPattern->fLiteralText.getBuffer(); UVector *fSets = fPattern->fSets; fFrameSize = fPattern->fFrameSize; REStackFrame *fp = resetStack(); if (U_FAILURE(fDeferredStatus)) { status = fDeferredStatus; return; } fp->fPatIdx = 0; fp->fInputIdx = startIdx; // Zero out the pattern's static data int32_t i; for (i = 0; ifDataSize; i++) { fData[i] = 0; } // // Main loop for interpreting the compiled pattern. // One iteration of the loop per pattern operation performed. // for (;;) { op = (int32_t)pat[fp->fPatIdx]; opType = URX_TYPE(op); opValue = URX_VAL(op); #ifdef REGEX_RUN_DEBUG if (fTraceDebug) { UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); printf("inputIdx=%ld inputChar=%x sp=%3ld activeLimit=%ld ", fp->fInputIdx, UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit); fPattern->dumpOp(fp->fPatIdx); } #endif fp->fPatIdx++; switch (opType) { case URX_NOP: break; case URX_BACKTRACK: // Force a backtrack. In some circumstances, the pattern compiler // will notice that the pattern can't possibly match anything, and will // emit one of these at that point. fp = (REStackFrame *)fStack->popFrame(fFrameSize); break; case URX_ONECHAR: if (fp->fInputIdx < fActiveLimit) { UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); UChar32 c = UTEXT_NEXT32(fInputText); if (c == opValue) { fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); break; } } else { fHitEnd = true; } fp = (REStackFrame *)fStack->popFrame(fFrameSize); break; case URX_STRING: { // Test input against a literal string. 
// Strings require two slots in the compiled pattern, one for the // offset to the string text, and one for the length. int32_t stringStartIdx = opValue; op = (int32_t)pat[fp->fPatIdx]; // Fetch the second operand fp->fPatIdx++; opType = URX_TYPE(op); int32_t stringLen = URX_VAL(op); U_ASSERT(opType == URX_STRING_LEN); U_ASSERT(stringLen >= 2); const char16_t *patternString = litText+stringStartIdx; int32_t patternStringIndex = 0; UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); UChar32 inputChar; UChar32 patternChar; UBool success = true; while (patternStringIndex < stringLen) { if (UTEXT_GETNATIVEINDEX(fInputText) >= fActiveLimit) { success = false; fHitEnd = true; break; } inputChar = UTEXT_NEXT32(fInputText); U16_NEXT(patternString, patternStringIndex, stringLen, patternChar); if (patternChar != inputChar) { success = false; break; } } if (success) { fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); } else { fp = (REStackFrame *)fStack->popFrame(fFrameSize); } } break; case URX_STATE_SAVE: fp = StateSave(fp, opValue, status); break; case URX_END: // The match loop will exit via this path on a successful match, // when we reach the end of the pattern. if (toEnd && fp->fInputIdx != fActiveLimit) { // The pattern matched, but not to the end of input. Try some more. fp = (REStackFrame *)fStack->popFrame(fFrameSize); break; } isMatch = true; goto breakFromLoop; // Start and End Capture stack frame variables are laid out out like this: // fp->fExtra[opValue] - The start of a completed capture group // opValue+1 - The end of a completed capture group // opValue+2 - the start of a capture group whose end // has not yet been reached (and might not ever be). case URX_START_CAPTURE: U_ASSERT(opValue >= 0 && opValue < fFrameSize-3); fp->fExtra[opValue+2] = fp->fInputIdx; break; case URX_END_CAPTURE: U_ASSERT(opValue >= 0 && opValue < fFrameSize-3); U_ASSERT(fp->fExtra[opValue+2] >= 0); // Start pos for this group must be set. 
fp->fExtra[opValue] = fp->fExtra[opValue+2]; // Tentative start becomes real. fp->fExtra[opValue+1] = fp->fInputIdx; // End position U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]); break; case URX_DOLLAR: // $, test for End of line // or for position before new line at end of input { if (fp->fInputIdx >= fAnchorLimit) { // We really are at the end of input. Success. fHitEnd = true; fRequireEnd = true; break; } UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); // If we are positioned just before a new-line that is located at the // end of input, succeed. UChar32 c = UTEXT_NEXT32(fInputText); if (UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) { if (isLineTerminator(c)) { // If not in the middle of a CR/LF sequence if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && ((void)UTEXT_PREVIOUS32(fInputText), UTEXT_PREVIOUS32(fInputText))==0x0d)) { // At new-line at end of input. Success fHitEnd = true; fRequireEnd = true; break; } } } else { UChar32 nextC = UTEXT_NEXT32(fInputText); if (c == 0x0d && nextC == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) { fHitEnd = true; fRequireEnd = true; break; // At CR/LF at end of input. Success } } fp = (REStackFrame *)fStack->popFrame(fFrameSize); } break; case URX_DOLLAR_D: // $, test for End of Line, in UNIX_LINES mode. if (fp->fInputIdx >= fAnchorLimit) { // Off the end of input. Success. fHitEnd = true; fRequireEnd = true; break; } else { UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); UChar32 c = UTEXT_NEXT32(fInputText); // Either at the last character of input, or off the end. if (c == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) == fAnchorLimit) { fHitEnd = true; fRequireEnd = true; break; } } // Not at end of input. Back-track out. fp = (REStackFrame *)fStack->popFrame(fFrameSize); break; case URX_DOLLAR_M: // $, test for End of line in multi-line mode { if (fp->fInputIdx >= fAnchorLimit) { // We really are at the end of input. Success. 
fHitEnd = true; fRequireEnd = true; break; } // If we are positioned just before a new-line, succeed. // It makes no difference where the new-line is within the input. UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); UChar32 c = UTEXT_CURRENT32(fInputText); if (isLineTerminator(c)) { // At a line end, except for the odd chance of being in the middle of a CR/LF sequence // In multi-line mode, hitting a new-line just before the end of input does not // set the hitEnd or requireEnd flags if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && UTEXT_PREVIOUS32(fInputText)==0x0d)) { break; } } // not at a new line. Fail. fp = (REStackFrame *)fStack->popFrame(fFrameSize); } break; case URX_DOLLAR_MD: // $, test for End of line in multi-line and UNIX_LINES mode { if (fp->fInputIdx >= fAnchorLimit) { // We really are at the end of input. Success. fHitEnd = true; fRequireEnd = true; // Java set requireEnd in this case, even though break; // adding a new-line would not lose the match. } // If we are not positioned just before a new-line, the test fails; backtrack out. // It makes no difference where the new-line is within the input. UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); if (UTEXT_CURRENT32(fInputText) != 0x0a) { fp = (REStackFrame *)fStack->popFrame(fFrameSize); } } break; case URX_CARET: // ^, test for start of line if (fp->fInputIdx != fAnchorStart) { fp = (REStackFrame *)fStack->popFrame(fFrameSize); } break; case URX_CARET_M: // ^, test for start of line in mulit-line mode { if (fp->fInputIdx == fAnchorStart) { // We are at the start input. Success. break; } // Check whether character just before the current pos is a new-line // unless we are at the end of input UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); UChar32 c = UTEXT_PREVIOUS32(fInputText); if ((fp->fInputIdx < fAnchorLimit) && isLineTerminator(c)) { // It's a new-line. ^ is true. Success. // TODO: what should be done with positions between a CR and LF? break; } // Not at the start of a line. Fail. 
fp = (REStackFrame *)fStack->popFrame(fFrameSize); } break; case URX_CARET_M_UNIX: // ^, test for start of line in mulit-line + Unix-line mode { U_ASSERT(fp->fInputIdx >= fAnchorStart); if (fp->fInputIdx <= fAnchorStart) { // We are at the start input. Success. break; } // Check whether character just before the current pos is a new-line U_ASSERT(fp->fInputIdx <= fAnchorLimit); UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); UChar32 c = UTEXT_PREVIOUS32(fInputText); if (c != 0x0a) { // Not at the start of a line. Back-track out. fp = (REStackFrame *)fStack->popFrame(fFrameSize); } } break; case URX_BACKSLASH_B: // Test for word boundaries { UBool success = isWordBoundary(fp->fInputIdx); success ^= (UBool)(opValue != 0); // flip sense for \B if (!success) { fp = (REStackFrame *)fStack->popFrame(fFrameSize); } } break; case URX_BACKSLASH_BU: // Test for word boundaries, Unicode-style { UBool success = isUWordBoundary(fp->fInputIdx, status); success ^= (UBool)(opValue != 0); // flip sense for \B if (!success) { fp = (REStackFrame *)fStack->popFrame(fFrameSize); } } break; case URX_BACKSLASH_D: // Test for decimal digit { if (fp->fInputIdx >= fActiveLimit) { fHitEnd = true; fp = (REStackFrame *)fStack->popFrame(fFrameSize); break; } UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); UChar32 c = UTEXT_NEXT32(fInputText); int8_t ctype = u_charType(c); // TODO: make a unicode set for this. Will be faster. UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER); success ^= (UBool)(opValue != 0); // flip sense for \D if (success) { fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); } else { fp = (REStackFrame *)fStack->popFrame(fFrameSize); } } break; case URX_BACKSLASH_G: // Test for position at end of previous match if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==false && fp->fInputIdx==fActiveStart))) { fp = (REStackFrame *)fStack->popFrame(fFrameSize); } break; case URX_BACKSLASH_H: // Test for \h, horizontal white space. 
{ if (fp->fInputIdx >= fActiveLimit) { fHitEnd = true; fp = (REStackFrame *)fStack->popFrame(fFrameSize); break; } UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); UChar32 c = UTEXT_NEXT32(fInputText); int8_t ctype = u_charType(c); UBool success = (ctype == U_SPACE_SEPARATOR || c == 9); // SPACE_SEPARATOR || TAB success ^= (UBool)(opValue != 0); // flip sense for \H if (success) { fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); } else { fp = (REStackFrame *)fStack->popFrame(fFrameSize); } } break; case URX_BACKSLASH_R: // Test for \R, any line break sequence. { if (fp->fInputIdx >= fActiveLimit) { fHitEnd = true; fp = (REStackFrame *)fStack->popFrame(fFrameSize); break; } UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); UChar32 c = UTEXT_NEXT32(fInputText); if (isLineTerminator(c)) { if (c == 0x0d && utext_current32(fInputText) == 0x0a) { utext_next32(fInputText); } fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); } else { fp = (REStackFrame *)fStack->popFrame(fFrameSize); } } break; case URX_BACKSLASH_V: // \v, any single line ending character. { if (fp->fInputIdx >= fActiveLimit) { fHitEnd = true; fp = (REStackFrame *)fStack->popFrame(fFrameSize); break; } UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); UChar32 c = UTEXT_NEXT32(fInputText); UBool success = isLineTerminator(c); success ^= (UBool)(opValue != 0); // flip sense for \V if (success) { fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); } else { fp = (REStackFrame *)fStack->popFrame(fFrameSize); } } break; case URX_BACKSLASH_X: // Match a Grapheme, as defined by Unicode UAX 29. 
// Fail if at end of input if (fp->fInputIdx >= fActiveLimit) { fHitEnd = true; fp = (REStackFrame *)fStack->popFrame(fFrameSize); break; } fp->fInputIdx = followingGCBoundary(fp->fInputIdx, status); if (fp->fInputIdx >= fActiveLimit) { fHitEnd = true; fp->fInputIdx = fActiveLimit; } break; case URX_BACKSLASH_Z: // Test for end of Input if (fp->fInputIdx < fAnchorLimit) { fp = (REStackFrame *)fStack->popFrame(fFrameSize); } else { fHitEnd = true; fRequireEnd = true; } break; case URX_STATIC_SETREF: { // Test input character against one of the predefined sets // (Word Characters, for example) // The high bit of the op value is a flag for the match polarity. // 0: success if input char is in set. // 1: success if input char is not in set. if (fp->fInputIdx >= fActiveLimit) { fHitEnd = true; fp = (REStackFrame *)fStack->popFrame(fFrameSize); break; } UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET); opValue &= ~URX_NEG_SET; U_ASSERT(opValue > 0 && opValue < URX_LAST_SET); UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); UChar32 c = UTEXT_NEXT32(fInputText); if (c < 256) { Regex8BitSet &s8 = RegexStaticSets::gStaticSets->fPropSets8[opValue]; if (s8.contains(c)) { success = !success; } } else { const UnicodeSet &s = RegexStaticSets::gStaticSets->fPropSets[opValue]; if (s.contains(c)) { success = !success; } } if (success) { fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); } else { // the character wasn't in the set. 
fp = (REStackFrame *)fStack->popFrame(fFrameSize); } } break; case URX_STAT_SETREF_N: { // Test input character for NOT being a member of one of // the predefined sets (Word Characters, for example) if (fp->fInputIdx >= fActiveLimit) { fHitEnd = true; fp = (REStackFrame *)fStack->popFrame(fFrameSize); break; } U_ASSERT(opValue > 0 && opValue < URX_LAST_SET); UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); UChar32 c = UTEXT_NEXT32(fInputText); if (c < 256) { Regex8BitSet &s8 = RegexStaticSets::gStaticSets->fPropSets8[opValue]; if (s8.contains(c) == false) { fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); break; } } else { const UnicodeSet &s = RegexStaticSets::gStaticSets->fPropSets[opValue]; if (s.contains(c) == false) { fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); break; } } // the character wasn't in the set. fp = (REStackFrame *)fStack->popFrame(fFrameSize); } break; case URX_SETREF: if (fp->fInputIdx >= fActiveLimit) { fHitEnd = true; fp = (REStackFrame *)fStack->popFrame(fFrameSize); break; } else { UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); // There is input left. Pick up one char and test it for set membership. UChar32 c = UTEXT_NEXT32(fInputText); U_ASSERT(opValue > 0 && opValue < fSets->size()); if (c<256) { Regex8BitSet *s8 = &fPattern->fSets8[opValue]; if (s8->contains(c)) { fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); break; } } else { UnicodeSet *s = (UnicodeSet *)fSets->elementAt(opValue); if (s->contains(c)) { // The character is in the set. A Match. fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); break; } } // the character wasn't in the set. fp = (REStackFrame *)fStack->popFrame(fFrameSize); } break; case URX_DOTANY: { // . matches anything, but stops at end-of-line. if (fp->fInputIdx >= fActiveLimit) { // At end of input. Match failed. Backtrack out. fHitEnd = true; fp = (REStackFrame *)fStack->popFrame(fFrameSize); break; } UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); // There is input left. 
Advance over one char, unless we've hit end-of-line UChar32 c = UTEXT_NEXT32(fInputText); if (isLineTerminator(c)) { // End of line in normal mode. . does not match. fp = (REStackFrame *)fStack->popFrame(fFrameSize); break; } fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); } break; case URX_DOTANY_ALL: { // ., in dot-matches-all (including new lines) mode if (fp->fInputIdx >= fActiveLimit) { // At end of input. Match failed. Backtrack out. fHitEnd = true; fp = (REStackFrame *)fStack->popFrame(fFrameSize); break; } UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); // There is input left. Advance over one char, except if we are // at a cr/lf, advance over both of them. UChar32 c; c = UTEXT_NEXT32(fInputText); fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); if (c==0x0d && fp->fInputIdx < fActiveLimit) { // In the case of a CR/LF, we need to advance over both. UChar32 nextc = UTEXT_CURRENT32(fInputText); if (nextc == 0x0a) { (void)UTEXT_NEXT32(fInputText); fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); } } } break; case URX_DOTANY_UNIX: { // '.' operator, matches all, but stops at end-of-line. // UNIX_LINES mode, so 0x0a is the only recognized line ending. if (fp->fInputIdx >= fActiveLimit) { // At end of input. Match failed. Backtrack out. fHitEnd = true; fp = (REStackFrame *)fStack->popFrame(fFrameSize); break; } UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); // There is input left. Advance over one char, unless we've hit end-of-line UChar32 c = UTEXT_NEXT32(fInputText); if (c == 0x0a) { // End of line in normal mode. '.' does not match the \n fp = (REStackFrame *)fStack->popFrame(fFrameSize); } else { fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); } } break; case URX_JMP: fp->fPatIdx = opValue; break; case URX_FAIL: isMatch = false; goto breakFromLoop; case URX_JMP_SAV: U_ASSERT(opValue < fPattern->fCompiledPat->size()); fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current fp->fPatIdx = opValue; // Then JMP. 
break; case URX_JMP_SAV_X: // This opcode is used with (x)+, when x can match a zero length string. // Same as JMP_SAV, except conditional on the match having made forward progress. // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the // data address of the input position at the start of the loop. { U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size()); int32_t stoOp = (int32_t)pat[opValue-1]; U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC); int32_t frameLoc = URX_VAL(stoOp); U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize); int64_t prevInputIdx = fp->fExtra[frameLoc]; U_ASSERT(prevInputIdx <= fp->fInputIdx); if (prevInputIdx < fp->fInputIdx) { // The match did make progress. Repeat the loop. fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current fp->fPatIdx = opValue; fp->fExtra[frameLoc] = fp->fInputIdx; } // If the input position did not advance, we do nothing here, // execution will fall out of the loop. } break; case URX_CTR_INIT: { U_ASSERT(opValue >= 0 && opValue < fFrameSize-2); fp->fExtra[opValue] = 0; // Set the loop counter variable to zero // Pick up the three extra operands that CTR_INIT has, and // skip the pattern location counter past int32_t instrOperandLoc = (int32_t)fp->fPatIdx; fp->fPatIdx += 3; int32_t loopLoc = URX_VAL(pat[instrOperandLoc]); int32_t minCount = (int32_t)pat[instrOperandLoc+1]; int32_t maxCount = (int32_t)pat[instrOperandLoc+2]; U_ASSERT(minCount>=0); U_ASSERT(maxCount>=minCount || maxCount==-1); U_ASSERT(loopLoc>=fp->fPatIdx); if (minCount == 0) { fp = StateSave(fp, loopLoc+1, status); } if (maxCount == -1) { fp->fExtra[opValue+1] = fp->fInputIdx; // For loop breaking. 
} else if (maxCount == 0) { fp = (REStackFrame *)fStack->popFrame(fFrameSize); } } break; case URX_CTR_LOOP: { U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2); int32_t initOp = (int32_t)pat[opValue]; U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT); int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)]; int32_t minCount = (int32_t)pat[opValue+2]; int32_t maxCount = (int32_t)pat[opValue+3]; (*pCounter)++; if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) { U_ASSERT(*pCounter == maxCount); break; } if (*pCounter >= minCount) { if (maxCount == -1) { // Loop has no hard upper bound. // Check that it is progressing through the input, break if it is not. int64_t *pLastInputIdx = &fp->fExtra[URX_VAL(initOp) + 1]; if (fp->fInputIdx == *pLastInputIdx) { break; } else { *pLastInputIdx = fp->fInputIdx; } } fp = StateSave(fp, fp->fPatIdx, status); } else { // Increment time-out counter. (StateSave() does it if count >= minCount) fTickCounter--; if (fTickCounter <= 0) { IncrementTime(status); // Re-initializes fTickCounter } } fp->fPatIdx = opValue + 4; // Loop back. } break; case URX_CTR_INIT_NG: { // Initialize a non-greedy loop U_ASSERT(opValue >= 0 && opValue < fFrameSize-2); fp->fExtra[opValue] = 0; // Set the loop counter variable to zero // Pick up the three extra operands that CTR_INIT_NG has, and // skip the pattern location counter past int32_t instrOperandLoc = (int32_t)fp->fPatIdx; fp->fPatIdx += 3; int32_t loopLoc = URX_VAL(pat[instrOperandLoc]); int32_t minCount = (int32_t)pat[instrOperandLoc+1]; int32_t maxCount = (int32_t)pat[instrOperandLoc+2]; U_ASSERT(minCount>=0); U_ASSERT(maxCount>=minCount || maxCount==-1); U_ASSERT(loopLoc>fp->fPatIdx); if (maxCount == -1) { fp->fExtra[opValue+1] = fp->fInputIdx; // Save initial input index for loop breaking. 
} if (minCount == 0) { if (maxCount != 0) { fp = StateSave(fp, fp->fPatIdx, status); } fp->fPatIdx = loopLoc+1; // Continue with stuff after repeated block } } break; case URX_CTR_LOOP_NG: { // Non-greedy {min, max} loops U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2); int32_t initOp = (int32_t)pat[opValue]; U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG); int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)]; int32_t minCount = (int32_t)pat[opValue+2]; int32_t maxCount = (int32_t)pat[opValue+3]; (*pCounter)++; if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) { // The loop has matched the maximum permitted number of times. // Break out of here with no action. Matching will // continue with the following pattern. U_ASSERT(*pCounter == maxCount); break; } if (*pCounter < minCount) { // We haven't met the minimum number of matches yet. // Loop back for another one. fp->fPatIdx = opValue + 4; // Loop back. // Increment time-out counter. (StateSave() does it if count >= minCount) fTickCounter--; if (fTickCounter <= 0) { IncrementTime(status); // Re-initializes fTickCounter } } else { // We do have the minimum number of matches. // If there is no upper bound on the loop iterations, check that the input index // is progressing, and stop the loop if it is not. if (maxCount == -1) { int64_t *pLastInputIdx = &fp->fExtra[URX_VAL(initOp) + 1]; if (fp->fInputIdx == *pLastInputIdx) { break; } *pLastInputIdx = fp->fInputIdx; } // Loop Continuation: we will fall into the pattern following the loop // (non-greedy, don't execute loop body first), but first do // a state save to the top of the loop, so that a match failure // in the following pattern will try another iteration of the loop. 
fp = StateSave(fp, opValue + 4, status); } } break; case URX_STO_SP: U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize); fData[opValue] = fStack->size(); break; case URX_LD_SP: { U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize); int32_t newStackSize = (int32_t)fData[opValue]; U_ASSERT(newStackSize <= fStack->size()); int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize; if (newFP == (int64_t *)fp) { break; } int32_t j; for (j=0; jsetSize(newStackSize); } break; case URX_BACKREF: { U_ASSERT(opValue < fFrameSize); int64_t groupStartIdx = fp->fExtra[opValue]; int64_t groupEndIdx = fp->fExtra[opValue+1]; U_ASSERT(groupStartIdx <= groupEndIdx); if (groupStartIdx < 0) { // This capture group has not participated in the match thus far, fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match. break; } UTEXT_SETNATIVEINDEX(fAltInputText, groupStartIdx); UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); // Note: if the capture group match was of an empty string the backref // match succeeds. Verified by testing: Perl matches succeed // in this case, so we do too. UBool success = true; for (;;) { if (utext_getNativeIndex(fAltInputText) >= groupEndIdx) { success = true; break; } if (utext_getNativeIndex(fInputText) >= fActiveLimit) { success = false; fHitEnd = true; break; } UChar32 captureGroupChar = utext_next32(fAltInputText); UChar32 inputChar = utext_next32(fInputText); if (inputChar != captureGroupChar) { success = false; break; } } if (success) { fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); } else { fp = (REStackFrame *)fStack->popFrame(fFrameSize); } } break; case URX_BACKREF_I: { U_ASSERT(opValue < fFrameSize); int64_t groupStartIdx = fp->fExtra[opValue]; int64_t groupEndIdx = fp->fExtra[opValue+1]; U_ASSERT(groupStartIdx <= groupEndIdx); if (groupStartIdx < 0) { // This capture group has not participated in the match thus far, fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match. 
break; } utext_setNativeIndex(fAltInputText, groupStartIdx); utext_setNativeIndex(fInputText, fp->fInputIdx); CaseFoldingUTextIterator captureGroupItr(*fAltInputText); CaseFoldingUTextIterator inputItr(*fInputText); // Note: if the capture group match was of an empty string the backref // match succeeds. Verified by testing: Perl matches succeed // in this case, so we do too. UBool success = true; for (;;) { if (!captureGroupItr.inExpansion() && utext_getNativeIndex(fAltInputText) >= groupEndIdx) { success = true; break; } if (!inputItr.inExpansion() && utext_getNativeIndex(fInputText) >= fActiveLimit) { success = false; fHitEnd = true; break; } UChar32 captureGroupChar = captureGroupItr.next(); UChar32 inputChar = inputItr.next(); if (inputChar != captureGroupChar) { success = false; break; } } if (success && inputItr.inExpansion()) { // We obtained a match by consuming part of a string obtained from // case-folding a single code point of the input text. // This does not count as an overall match. success = false; } if (success) { fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); } else { fp = (REStackFrame *)fStack->popFrame(fFrameSize); } } break; case URX_STO_INP_LOC: { U_ASSERT(opValue >= 0 && opValue < fFrameSize); fp->fExtra[opValue] = fp->fInputIdx; } break; case URX_JMPX: { int32_t instrOperandLoc = (int32_t)fp->fPatIdx; fp->fPatIdx += 1; int32_t dataLoc = URX_VAL(pat[instrOperandLoc]); U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize); int64_t savedInputIdx = fp->fExtra[dataLoc]; U_ASSERT(savedInputIdx <= fp->fInputIdx); if (savedInputIdx < fp->fInputIdx) { fp->fPatIdx = opValue; // JMP } else { fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no progress in loop. } } break; case URX_LA_START: { // Entering a look around block. // Save Stack Ptr, Input Pos. 
U_ASSERT(opValue>=0 && opValue+3fDataSize); fData[opValue] = fStack->size(); fData[opValue+1] = fp->fInputIdx; fData[opValue+2] = fActiveStart; fData[opValue+3] = fActiveLimit; fActiveStart = fLookStart; // Set the match region change for fActiveLimit = fLookLimit; // transparent bounds. } break; case URX_LA_END: { // Leaving a look-ahead block. // restore Stack Ptr, Input Pos to positions they had on entry to block. U_ASSERT(opValue>=0 && opValue+3fDataSize); int32_t stackSize = fStack->size(); int32_t newStackSize =(int32_t)fData[opValue]; U_ASSERT(stackSize >= newStackSize); if (stackSize > newStackSize) { // Copy the current top frame back to the new (cut back) top frame. // This makes the capture groups from within the look-ahead // expression available. int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize; int32_t j; for (j=0; jsetSize(newStackSize); } fp->fInputIdx = fData[opValue+1]; // Restore the active region bounds in the input string; they may have // been changed because of transparent bounds on a Region. fActiveStart = fData[opValue+2]; fActiveLimit = fData[opValue+3]; U_ASSERT(fActiveStart >= 0); U_ASSERT(fActiveLimit <= fInputLength); } break; case URX_ONECHAR_I: // Case insensitive one char. The char from the pattern is already case folded. // Input text is not, but case folding the input can not reduce two or more code // points to one. if (fp->fInputIdx < fActiveLimit) { UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); UChar32 c = UTEXT_NEXT32(fInputText); if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) { fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); break; } } else { fHitEnd = true; } fp = (REStackFrame *)fStack->popFrame(fFrameSize); break; case URX_STRING_I: { // Case-insensitive test input against a literal string. // Strings require two slots in the compiled pattern, one for the // offset to the string text, and one for the length. // The compiled string has already been case folded. 
{ const char16_t *patternString = litText + opValue; int32_t patternStringIdx = 0; op = (int32_t)pat[fp->fPatIdx]; fp->fPatIdx++; opType = URX_TYPE(op); opValue = URX_VAL(op); U_ASSERT(opType == URX_STRING_LEN); int32_t patternStringLen = opValue; // Length of the string from the pattern. UChar32 cPattern; UChar32 cText; UBool success = true; UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); CaseFoldingUTextIterator inputIterator(*fInputText); while (patternStringIdx < patternStringLen) { if (!inputIterator.inExpansion() && UTEXT_GETNATIVEINDEX(fInputText) >= fActiveLimit) { success = false; fHitEnd = true; break; } U16_NEXT(patternString, patternStringIdx, patternStringLen, cPattern); cText = inputIterator.next(); if (cText != cPattern) { success = false; break; } } if (inputIterator.inExpansion()) { success = false; } if (success) { fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); } else { fp = (REStackFrame *)fStack->popFrame(fFrameSize); } } } break; case URX_LB_START: { // Entering a look-behind block. // Save Stack Ptr, Input Pos and active input region. // TODO: implement transparent bounds. Ticket #6067 U_ASSERT(opValue>=0 && opValue+4fDataSize); fData[opValue] = fStack->size(); fData[opValue+1] = fp->fInputIdx; // Save input string length, then reset to pin any matches to end at // the current position. fData[opValue+2] = fActiveStart; fData[opValue+3] = fActiveLimit; fActiveStart = fRegionStart; fActiveLimit = fp->fInputIdx; // Init the variable containing the start index for attempted matches. fData[opValue+4] = -1; } break; case URX_LB_CONT: { // Positive Look-Behind, at top of loop checking for matches of LB expression // at all possible input starting positions. // Fetch the min and max possible match lengths. They are the operands // of this op in the pattern. int32_t minML = (int32_t)pat[fp->fPatIdx++]; int32_t maxML = (int32_t)pat[fp->fPatIdx++]; if (!UTEXT_USES_U16(fInputText)) { // utf-8 fix to maximum match length. 
The pattern compiler assumes utf-16. // The max length need not be exact; it just needs to be >= actual maximum. maxML *= 3; } U_ASSERT(minML <= maxML); U_ASSERT(minML >= 0); // Fetch (from data) the last input index where a match was attempted. U_ASSERT(opValue>=0 && opValue+4fDataSize); int64_t &lbStartIdx = fData[opValue+4]; if (lbStartIdx < 0) { // First time through loop. lbStartIdx = fp->fInputIdx - minML; if (lbStartIdx > 0) { // move index to a code point boundary, if it's not on one already. UTEXT_SETNATIVEINDEX(fInputText, lbStartIdx); lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText); } } else { // 2nd through nth time through the loop. // Back up start position for match by one. if (lbStartIdx == 0) { (lbStartIdx)--; } else { UTEXT_SETNATIVEINDEX(fInputText, lbStartIdx); (void)UTEXT_PREVIOUS32(fInputText); lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText); } } if (lbStartIdx < 0 || lbStartIdx < fp->fInputIdx - maxML) { // We have tried all potential match starting points without // getting a match. Backtrack out, and out of the // Look Behind altogether. fp = (REStackFrame *)fStack->popFrame(fFrameSize); fActiveStart = fData[opValue+2]; fActiveLimit = fData[opValue+3]; U_ASSERT(fActiveStart >= 0); U_ASSERT(fActiveLimit <= fInputLength); break; } // Save state to this URX_LB_CONT op, so failure to match will repeat the loop. // (successful match will fall off the end of the loop.) fp = StateSave(fp, fp->fPatIdx-3, status); fp->fInputIdx = lbStartIdx; } break; case URX_LB_END: // End of a look-behind block, after a successful match. { U_ASSERT(opValue>=0 && opValue+4fDataSize); if (fp->fInputIdx != fActiveLimit) { // The look-behind expression matched, but the match did not // extend all the way to the point that we are looking behind from. // FAIL out of here, which will take us back to the LB_CONT, which // will retry the match starting at another position or fail // the look-behind altogether, whichever is appropriate. 
fp = (REStackFrame *)fStack->popFrame(fFrameSize); break; } // Look-behind match is good. Restore the original input string region, // which had been truncated to pin the end of the lookbehind match to the // position being looked-behind. fActiveStart = fData[opValue+2]; fActiveLimit = fData[opValue+3]; U_ASSERT(fActiveStart >= 0); U_ASSERT(fActiveLimit <= fInputLength); } break; case URX_LBN_CONT: { // Negative Look-Behind, at top of loop checking for matches of LB expression // at all possible input starting positions. // Fetch the extra parameters of this op. int32_t minML = (int32_t)pat[fp->fPatIdx++]; int32_t maxML = (int32_t)pat[fp->fPatIdx++]; if (!UTEXT_USES_U16(fInputText)) { // utf-8 fix to maximum match length. The pattern compiler assumes utf-16. // The max length need not be exact; it just needs to be >= actual maximum. maxML *= 3; } int32_t continueLoc = (int32_t)pat[fp->fPatIdx++]; continueLoc = URX_VAL(continueLoc); U_ASSERT(minML <= maxML); U_ASSERT(minML >= 0); U_ASSERT(continueLoc > fp->fPatIdx); // Fetch (from data) the last input index where a match was attempted. U_ASSERT(opValue>=0 && opValue+4fDataSize); int64_t &lbStartIdx = fData[opValue+4]; if (lbStartIdx < 0) { // First time through loop. lbStartIdx = fp->fInputIdx - minML; if (lbStartIdx > 0) { // move index to a code point boundary, if it's not on one already. UTEXT_SETNATIVEINDEX(fInputText, lbStartIdx); lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText); } } else { // 2nd through nth time through the loop. // Back up start position for match by one. if (lbStartIdx == 0) { (lbStartIdx)--; } else { UTEXT_SETNATIVEINDEX(fInputText, lbStartIdx); (void)UTEXT_PREVIOUS32(fInputText); lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText); } } if (lbStartIdx < 0 || lbStartIdx < fp->fInputIdx - maxML) { // We have tried all potential match starting points without // getting a match, which means that the negative lookbehind as // a whole has succeeded. 
Jump forward to the continue location fActiveStart = fData[opValue+2]; fActiveLimit = fData[opValue+3]; U_ASSERT(fActiveStart >= 0); U_ASSERT(fActiveLimit <= fInputLength); fp->fPatIdx = continueLoc; break; } // Save state to this URX_LB_CONT op, so failure to match will repeat the loop. // (successful match will cause a FAIL out of the loop altogether.) fp = StateSave(fp, fp->fPatIdx-4, status); fp->fInputIdx = lbStartIdx; } break; case URX_LBN_END: // End of a negative look-behind block, after a successful match. { U_ASSERT(opValue>=0 && opValue+4fDataSize); if (fp->fInputIdx != fActiveLimit) { // The look-behind expression matched, but the match did not // extend all the way to the point that we are looking behind from. // FAIL out of here, which will take us back to the LB_CONT, which // will retry the match starting at another position or succeed // the look-behind altogether, whichever is appropriate. fp = (REStackFrame *)fStack->popFrame(fFrameSize); break; } // Look-behind expression matched, which means look-behind test as // a whole Fails // Restore the original input string length, which had been truncated // inorder to pin the end of the lookbehind match // to the position being looked-behind. fActiveStart = fData[opValue+2]; fActiveLimit = fData[opValue+3]; U_ASSERT(fActiveStart >= 0); U_ASSERT(fActiveLimit <= fInputLength); // Restore original stack position, discarding any state saved // by the successful pattern match. U_ASSERT(opValue>=0 && opValue+1fDataSize); int32_t newStackSize = (int32_t)fData[opValue]; U_ASSERT(fStack->size() > newStackSize); fStack->setSize(newStackSize); // FAIL, which will take control back to someplace // prior to entering the look-behind test. fp = (REStackFrame *)fStack->popFrame(fFrameSize); } break; case URX_LOOP_SR_I: // Loop Initialization for the optimized implementation of // [some character set]* // This op scans through all matching input. 
// The following LOOP_C op emulates stack unwinding if the following pattern fails. { U_ASSERT(opValue > 0 && opValue < fSets->size()); Regex8BitSet *s8 = &fPattern->fSets8[opValue]; UnicodeSet *s = (UnicodeSet *)fSets->elementAt(opValue); // Loop through input, until either the input is exhausted or // we reach a character that is not a member of the set. int64_t ix = fp->fInputIdx; UTEXT_SETNATIVEINDEX(fInputText, ix); for (;;) { if (ix >= fActiveLimit) { fHitEnd = true; break; } UChar32 c = UTEXT_NEXT32(fInputText); if (c<256) { if (s8->contains(c) == false) { break; } } else { if (s->contains(c) == false) { break; } } ix = UTEXT_GETNATIVEINDEX(fInputText); } // If there were no matching characters, skip over the loop altogether. // The loop doesn't run at all, a * op always succeeds. if (ix == fp->fInputIdx) { fp->fPatIdx++; // skip the URX_LOOP_C op. break; } // Peek ahead in the compiled pattern, to the URX_LOOP_C that // must follow. It's operand is the stack location // that holds the starting input index for the match of this [set]* int32_t loopcOp = (int32_t)pat[fp->fPatIdx]; U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C); int32_t stackLoc = URX_VAL(loopcOp); U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize); fp->fExtra[stackLoc] = fp->fInputIdx; fp->fInputIdx = ix; // Save State to the URX_LOOP_C op that follows this one, // so that match failures in the following code will return to there. // Then bump the pattern idx so the LOOP_C is skipped on the way out of here. fp = StateSave(fp, fp->fPatIdx, status); fp->fPatIdx++; } break; case URX_LOOP_DOT_I: // Loop Initialization for the optimized implementation of .* // This op scans through all remaining input. // The following LOOP_C op emulates stack unwinding if the following pattern fails. { // Loop through input until the input is exhausted (we reach an end-of-line) // In DOTALL mode, we can just go straight to the end of the input. int64_t ix; if ((opValue & 1) == 1) { // Dot-matches-All mode. 
Jump straight to the end of the string. ix = fActiveLimit; fHitEnd = true; } else { // NOT DOT ALL mode. Line endings do not match '.' // Scan forward until a line ending or end of input. ix = fp->fInputIdx; UTEXT_SETNATIVEINDEX(fInputText, ix); for (;;) { if (ix >= fActiveLimit) { fHitEnd = true; break; } UChar32 c = UTEXT_NEXT32(fInputText); if ((c & 0x7f) <= 0x29) { // Fast filter of non-new-line-s if ((c == 0x0a) || // 0x0a is newline in both modes. (((opValue & 2) == 0) && // IF not UNIX_LINES mode isLineTerminator(c))) { // char is a line ending. Exit the scanning loop. break; } } ix = UTEXT_GETNATIVEINDEX(fInputText); } } // If there were no matching characters, skip over the loop altogether. // The loop doesn't run at all, a * op always succeeds. if (ix == fp->fInputIdx) { fp->fPatIdx++; // skip the URX_LOOP_C op. break; } // Peek ahead in the compiled pattern, to the URX_LOOP_C that // must follow. It's operand is the stack location // that holds the starting input index for the match of this .* int32_t loopcOp = (int32_t)pat[fp->fPatIdx]; U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C); int32_t stackLoc = URX_VAL(loopcOp); U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize); fp->fExtra[stackLoc] = fp->fInputIdx; fp->fInputIdx = ix; // Save State to the URX_LOOP_C op that follows this one, // so that match failures in the following code will return to there. // Then bump the pattern idx so the LOOP_C is skipped on the way out of here. fp = StateSave(fp, fp->fPatIdx, status); fp->fPatIdx++; } break; case URX_LOOP_C: { U_ASSERT(opValue>=0 && opValuefExtra[opValue]; U_ASSERT(backSearchIndex <= fp->fInputIdx); if (backSearchIndex == fp->fInputIdx) { // We've backed up the input idx to the point that the loop started. // The loop is done. Leave here without saving state. // Subsequent failures won't come back here. 
break; } // Set up for the next iteration of the loop, with input index // backed up by one from the last time through, // and a state save to this instruction in case the following code fails again. // (We're going backwards because this loop emulates stack unwinding, not // the initial scan forward.) U_ASSERT(fp->fInputIdx > 0); UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); UChar32 prevC = UTEXT_PREVIOUS32(fInputText); fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); UChar32 twoPrevC = UTEXT_PREVIOUS32(fInputText); if (prevC == 0x0a && fp->fInputIdx > backSearchIndex && twoPrevC == 0x0d) { int32_t prevOp = (int32_t)pat[fp->fPatIdx-2]; if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) { // .*, stepping back over CRLF pair. fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); } } fp = StateSave(fp, fp->fPatIdx-1, status); } break; default: // Trouble. The compiled pattern contains an entry with an // unrecognized type tag. UPRV_UNREACHABLE_ASSERT; // Unknown opcode type in opType = URX_TYPE(pat[fp->fPatIdx]). But we have // reports of this in production code, don't use UPRV_UNREACHABLE_EXIT. // See ICU-21669. status = U_INTERNAL_PROGRAM_ERROR; } if (U_FAILURE(status)) { isMatch = false; break; } } breakFromLoop: fMatch = isMatch; if (isMatch) { fLastMatchEnd = fMatchEnd; fMatchStart = startIdx; fMatchEnd = fp->fInputIdx; } #ifdef REGEX_RUN_DEBUG if (fTraceDebug) { if (isMatch) { printf("Match. start=%ld end=%ld\n\n", fMatchStart, fMatchEnd); } else { printf("No match\n\n"); } } #endif fFrame = fp; // The active stack frame when the engine stopped. // Contains the capture group results that we need to // access later. return; } //-------------------------------------------------------------------------------- // // MatchChunkAt This is the actual matching engine. Like MatchAt, but with the // assumption that the entire string is available in the UText's // chunk buffer. 
For now, that means we can use int32_t indexes, // except for anything that needs to be saved (like group starts // and ends). // // startIdx: begin matching at this index. // toEnd: if true, match must extend to end of the input region // //-------------------------------------------------------------------------------- void RegexMatcher::MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &status) { UBool isMatch = false; // True if the we have a match. int32_t backSearchIndex = INT32_MAX; // used after greedy single-character matches for searching backwards int32_t op; // Operation from the compiled pattern, split into int32_t opType; // the opcode int32_t opValue; // and the operand value. #ifdef REGEX_RUN_DEBUG if (fTraceDebug) { printf("MatchAt(startIdx=%d)\n", startIdx); printf("Original Pattern: \"%s\"\n", CStr(StringFromUText(fPattern->fPattern))()); printf("Input String: \"%s\"\n\n", CStr(StringFromUText(fInputText))()); } #endif if (U_FAILURE(status)) { return; } // Cache frequently referenced items from the compiled pattern // int64_t *pat = fPattern->fCompiledPat->getBuffer(); const char16_t *litText = fPattern->fLiteralText.getBuffer(); UVector *fSets = fPattern->fSets; const char16_t *inputBuf = fInputText->chunkContents; fFrameSize = fPattern->fFrameSize; REStackFrame *fp = resetStack(); if (U_FAILURE(fDeferredStatus)) { status = fDeferredStatus; return; } fp->fPatIdx = 0; fp->fInputIdx = startIdx; // Zero out the pattern's static data int32_t i; for (i = 0; ifDataSize; i++) { fData[i] = 0; } // // Main loop for interpreting the compiled pattern. // One iteration of the loop per pattern operation performed.
// for (;;) { op = (int32_t)pat[fp->fPatIdx]; opType = URX_TYPE(op); opValue = URX_VAL(op); #ifdef REGEX_RUN_DEBUG if (fTraceDebug) { UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); printf("inputIdx=%ld inputChar=%x sp=%3ld activeLimit=%ld ", fp->fInputIdx, UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit); fPattern->dumpOp(fp->fPatIdx); } #endif fp->fPatIdx++; switch (opType) { case URX_NOP: break; case URX_BACKTRACK: // Force a backtrack. In some circumstances, the pattern compiler // will notice that the pattern can't possibly match anything, and will // emit one of these at that point. fp = (REStackFrame *)fStack->popFrame(fFrameSize); break; case URX_ONECHAR: if (fp->fInputIdx < fActiveLimit) { UChar32 c; U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); if (c == opValue) { break; } } else { fHitEnd = true; } fp = (REStackFrame *)fStack->popFrame(fFrameSize); break; case URX_STRING: { // Test input against a literal string. // Strings require two slots in the compiled pattern, one for the // offset to the string text, and one for the length. int32_t stringStartIdx = opValue; int32_t stringLen; op = (int32_t)pat[fp->fPatIdx]; // Fetch the second operand fp->fPatIdx++; opType = URX_TYPE(op); stringLen = URX_VAL(op); U_ASSERT(opType == URX_STRING_LEN); U_ASSERT(stringLen >= 2); const char16_t * pInp = inputBuf + fp->fInputIdx; const char16_t * pInpLimit = inputBuf + fActiveLimit; const char16_t * pPat = litText+stringStartIdx; const char16_t * pEnd = pInp + stringLen; UBool success = true; while (pInp < pEnd) { if (pInp >= pInpLimit) { fHitEnd = true; success = false; break; } if (*pInp++ != *pPat++) { success = false; break; } } if (success) { fp->fInputIdx += stringLen; } else { fp = (REStackFrame *)fStack->popFrame(fFrameSize); } } break; case URX_STATE_SAVE: fp = StateSave(fp, opValue, status); break; case URX_END: // The match loop will exit via this path on a successful match, // when we reach the end of the pattern. 
if (toEnd && fp->fInputIdx != fActiveLimit) { // The pattern matched, but not to the end of input. Try some more. fp = (REStackFrame *)fStack->popFrame(fFrameSize); break; } isMatch = true; goto breakFromLoop; // Start and End Capture stack frame variables are laid out out like this: // fp->fExtra[opValue] - The start of a completed capture group // opValue+1 - The end of a completed capture group // opValue+2 - the start of a capture group whose end // has not yet been reached (and might not ever be). case URX_START_CAPTURE: U_ASSERT(opValue >= 0 && opValue < fFrameSize-3); fp->fExtra[opValue+2] = fp->fInputIdx; break; case URX_END_CAPTURE: U_ASSERT(opValue >= 0 && opValue < fFrameSize-3); U_ASSERT(fp->fExtra[opValue+2] >= 0); // Start pos for this group must be set. fp->fExtra[opValue] = fp->fExtra[opValue+2]; // Tentative start becomes real. fp->fExtra[opValue+1] = fp->fInputIdx; // End position U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]); break; case URX_DOLLAR: // $, test for End of line // or for position before new line at end of input if (fp->fInputIdx < fAnchorLimit-2) { // We are no where near the end of input. Fail. // This is the common case. Keep it first. fp = (REStackFrame *)fStack->popFrame(fFrameSize); break; } if (fp->fInputIdx >= fAnchorLimit) { // We really are at the end of input. Success. fHitEnd = true; fRequireEnd = true; break; } // If we are positioned just before a new-line that is located at the // end of input, succeed. if (fp->fInputIdx == fAnchorLimit-1) { UChar32 c; U16_GET(inputBuf, fAnchorStart, fp->fInputIdx, fAnchorLimit, c); if (isLineTerminator(c)) { if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) { // At new-line at end of input. Success fHitEnd = true; fRequireEnd = true; break; } } } else if (fp->fInputIdx == fAnchorLimit-2 && inputBuf[fp->fInputIdx]==0x0d && inputBuf[fp->fInputIdx+1]==0x0a) { fHitEnd = true; fRequireEnd = true; break; // At CR/LF at end of input. 
Success } fp = (REStackFrame *)fStack->popFrame(fFrameSize); break; case URX_DOLLAR_D: // $, test for End of Line, in UNIX_LINES mode. if (fp->fInputIdx >= fAnchorLimit-1) { // Either at the last character of input, or off the end. if (fp->fInputIdx == fAnchorLimit-1) { // At last char of input. Success if it's a new line. if (inputBuf[fp->fInputIdx] == 0x0a) { fHitEnd = true; fRequireEnd = true; break; } } else { // Off the end of input. Success. fHitEnd = true; fRequireEnd = true; break; } } // Not at end of input. Back-track out. fp = (REStackFrame *)fStack->popFrame(fFrameSize); break; case URX_DOLLAR_M: // $, test for End of line in multi-line mode { if (fp->fInputIdx >= fAnchorLimit) { // We really are at the end of input. Success. fHitEnd = true; fRequireEnd = true; break; } // If we are positioned just before a new-line, succeed. // It makes no difference where the new-line is within the input. UChar32 c = inputBuf[fp->fInputIdx]; if (isLineTerminator(c)) { // At a line end, except for the odd chance of being in the middle of a CR/LF sequence // In multi-line mode, hitting a new-line just before the end of input does not // set the hitEnd or requireEnd flags if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) { break; } } // not at a new line. Fail. fp = (REStackFrame *)fStack->popFrame(fFrameSize); } break; case URX_DOLLAR_MD: // $, test for End of line in multi-line and UNIX_LINES mode { if (fp->fInputIdx >= fAnchorLimit) { // We really are at the end of input. Success. fHitEnd = true; fRequireEnd = true; // Java set requireEnd in this case, even though break; // adding a new-line would not lose the match. } // If we are not positioned just before a new-line, the test fails; backtrack out. // It makes no difference where the new-line is within the input. 
if (inputBuf[fp->fInputIdx] != 0x0a) { fp = (REStackFrame *)fStack->popFrame(fFrameSize); } } break; case URX_CARET: // ^, test for start of line if (fp->fInputIdx != fAnchorStart) { fp = (REStackFrame *)fStack->popFrame(fFrameSize); } break; case URX_CARET_M: // ^, test for start of line in mulit-line mode { if (fp->fInputIdx == fAnchorStart) { // We are at the start input. Success. break; } // Check whether character just before the current pos is a new-line // unless we are at the end of input char16_t c = inputBuf[fp->fInputIdx - 1]; if ((fp->fInputIdx < fAnchorLimit) && isLineTerminator(c)) { // It's a new-line. ^ is true. Success. // TODO: what should be done with positions between a CR and LF? break; } // Not at the start of a line. Fail. fp = (REStackFrame *)fStack->popFrame(fFrameSize); } break; case URX_CARET_M_UNIX: // ^, test for start of line in mulit-line + Unix-line mode { U_ASSERT(fp->fInputIdx >= fAnchorStart); if (fp->fInputIdx <= fAnchorStart) { // We are at the start input. Success. break; } // Check whether character just before the current pos is a new-line U_ASSERT(fp->fInputIdx <= fAnchorLimit); char16_t c = inputBuf[fp->fInputIdx - 1]; if (c != 0x0a) { // Not at the start of a line. Back-track out. 
fp = (REStackFrame *)fStack->popFrame(fFrameSize); } } break; case URX_BACKSLASH_B: // Test for word boundaries { UBool success = isChunkWordBoundary((int32_t)fp->fInputIdx); success ^= (UBool)(opValue != 0); // flip sense for \B if (!success) { fp = (REStackFrame *)fStack->popFrame(fFrameSize); } } break; case URX_BACKSLASH_BU: // Test for word boundaries, Unicode-style { UBool success = isUWordBoundary(fp->fInputIdx, status); success ^= (UBool)(opValue != 0); // flip sense for \B if (!success) { fp = (REStackFrame *)fStack->popFrame(fFrameSize); } } break; case URX_BACKSLASH_D: // Test for decimal digit { if (fp->fInputIdx >= fActiveLimit) { fHitEnd = true; fp = (REStackFrame *)fStack->popFrame(fFrameSize); break; } UChar32 c; U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); int8_t ctype = u_charType(c); // TODO: make a unicode set for this. Will be faster. UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER); success ^= (UBool)(opValue != 0); // flip sense for \D if (!success) { fp = (REStackFrame *)fStack->popFrame(fFrameSize); } } break; case URX_BACKSLASH_G: // Test for position at end of previous match if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==false && fp->fInputIdx==fActiveStart))) { fp = (REStackFrame *)fStack->popFrame(fFrameSize); } break; case URX_BACKSLASH_H: // Test for \h, horizontal white space. { if (fp->fInputIdx >= fActiveLimit) { fHitEnd = true; fp = (REStackFrame *)fStack->popFrame(fFrameSize); break; } UChar32 c; U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); int8_t ctype = u_charType(c); UBool success = (ctype == U_SPACE_SEPARATOR || c == 9); // SPACE_SEPARATOR || TAB success ^= (UBool)(opValue != 0); // flip sense for \H if (!success) { fp = (REStackFrame *)fStack->popFrame(fFrameSize); } } break; case URX_BACKSLASH_R: // Test for \R, any line break sequence. 
{ if (fp->fInputIdx >= fActiveLimit) { fHitEnd = true; fp = (REStackFrame *)fStack->popFrame(fFrameSize); break; } UChar32 c; U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); if (isLineTerminator(c)) { if (c == 0x0d && fp->fInputIdx < fActiveLimit) { // Check for CR/LF sequence. Consume both together when found. char16_t c2; U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c2); if (c2 != 0x0a) { U16_PREV(inputBuf, 0, fp->fInputIdx, c2); } } } else { fp = (REStackFrame *)fStack->popFrame(fFrameSize); } } break; case URX_BACKSLASH_V: // Any single code point line ending. { if (fp->fInputIdx >= fActiveLimit) { fHitEnd = true; fp = (REStackFrame *)fStack->popFrame(fFrameSize); break; } UChar32 c; U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); UBool success = isLineTerminator(c); success ^= (UBool)(opValue != 0); // flip sense for \V if (!success) { fp = (REStackFrame *)fStack->popFrame(fFrameSize); } } break; case URX_BACKSLASH_X: // Match a Grapheme, as defined by Unicode UAX 29. // Fail if at end of input if (fp->fInputIdx >= fActiveLimit) { fHitEnd = true; fp = (REStackFrame *)fStack->popFrame(fFrameSize); break; } fp->fInputIdx = followingGCBoundary(fp->fInputIdx, status); if (fp->fInputIdx >= fActiveLimit) { fHitEnd = true; fp->fInputIdx = fActiveLimit; } break; case URX_BACKSLASH_Z: // Test for end of Input if (fp->fInputIdx < fAnchorLimit) { fp = (REStackFrame *)fStack->popFrame(fFrameSize); } else { fHitEnd = true; fRequireEnd = true; } break; case URX_STATIC_SETREF: { // Test input character against one of the predefined sets // (Word Characters, for example) // The high bit of the op value is a flag for the match polarity. // 0: success if input char is in set. // 1: success if input char is not in set. 
if (fp->fInputIdx >= fActiveLimit) { fHitEnd = true; fp = (REStackFrame *)fStack->popFrame(fFrameSize); break; } UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET); opValue &= ~URX_NEG_SET; U_ASSERT(opValue > 0 && opValue < URX_LAST_SET); UChar32 c; U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); if (c < 256) { Regex8BitSet &s8 = RegexStaticSets::gStaticSets->fPropSets8[opValue]; if (s8.contains(c)) { success = !success; } } else { const UnicodeSet &s = RegexStaticSets::gStaticSets->fPropSets[opValue]; if (s.contains(c)) { success = !success; } } if (!success) { fp = (REStackFrame *)fStack->popFrame(fFrameSize); } } break; case URX_STAT_SETREF_N: { // Test input character for NOT being a member of one of // the predefined sets (Word Characters, for example) if (fp->fInputIdx >= fActiveLimit) { fHitEnd = true; fp = (REStackFrame *)fStack->popFrame(fFrameSize); break; } U_ASSERT(opValue > 0 && opValue < URX_LAST_SET); UChar32 c; U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); if (c < 256) { Regex8BitSet &s8 = RegexStaticSets::gStaticSets->fPropSets8[opValue]; if (s8.contains(c) == false) { break; } } else { const UnicodeSet &s = RegexStaticSets::gStaticSets->fPropSets[opValue]; if (s.contains(c) == false) { break; } } fp = (REStackFrame *)fStack->popFrame(fFrameSize); } break; case URX_SETREF: { if (fp->fInputIdx >= fActiveLimit) { fHitEnd = true; fp = (REStackFrame *)fStack->popFrame(fFrameSize); break; } U_ASSERT(opValue > 0 && opValue < fSets->size()); // There is input left. Pick up one char and test it for set membership. UChar32 c; U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); if (c<256) { Regex8BitSet *s8 = &fPattern->fSets8[opValue]; if (s8->contains(c)) { // The character is in the set. A Match. break; } } else { UnicodeSet *s = (UnicodeSet *)fSets->elementAt(opValue); if (s->contains(c)) { // The character is in the set. A Match. break; } } // the character wasn't in the set. 
fp = (REStackFrame *)fStack->popFrame(fFrameSize); } break; case URX_DOTANY: { // . matches anything, but stops at end-of-line. if (fp->fInputIdx >= fActiveLimit) { // At end of input. Match failed. Backtrack out. fHitEnd = true; fp = (REStackFrame *)fStack->popFrame(fFrameSize); break; } // There is input left. Advance over one char, unless we've hit end-of-line UChar32 c; U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); if (isLineTerminator(c)) { // End of line in normal mode. . does not match. fp = (REStackFrame *)fStack->popFrame(fFrameSize); break; } } break; case URX_DOTANY_ALL: { // . in dot-matches-all (including new lines) mode if (fp->fInputIdx >= fActiveLimit) { // At end of input. Match failed. Backtrack out. fHitEnd = true; fp = (REStackFrame *)fStack->popFrame(fFrameSize); break; } // There is input left. Advance over one char, except if we are // at a cr/lf, advance over both of them. UChar32 c; U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); if (c==0x0d && fp->fInputIdx < fActiveLimit) { // In the case of a CR/LF, we need to advance over both. if (inputBuf[fp->fInputIdx] == 0x0a) { U16_FWD_1(inputBuf, fp->fInputIdx, fActiveLimit); } } } break; case URX_DOTANY_UNIX: { // '.' operator, matches all, but stops at end-of-line. // UNIX_LINES mode, so 0x0a is the only recognized line ending. if (fp->fInputIdx >= fActiveLimit) { // At end of input. Match failed. Backtrack out. fHitEnd = true; fp = (REStackFrame *)fStack->popFrame(fFrameSize); break; } // There is input left. Advance over one char, unless we've hit end-of-line UChar32 c; U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); if (c == 0x0a) { // End of line in normal mode. '.' 
does not match the \n fp = (REStackFrame *)fStack->popFrame(fFrameSize); } } break; case URX_JMP: fp->fPatIdx = opValue; break; case URX_FAIL: isMatch = false; goto breakFromLoop; case URX_JMP_SAV: U_ASSERT(opValue < fPattern->fCompiledPat->size()); fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current fp->fPatIdx = opValue; // Then JMP. break; case URX_JMP_SAV_X: // This opcode is used with (x)+, when x can match a zero length string. // Same as JMP_SAV, except conditional on the match having made forward progress. // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the // data address of the input position at the start of the loop. { U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size()); int32_t stoOp = (int32_t)pat[opValue-1]; U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC); int32_t frameLoc = URX_VAL(stoOp); U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize); int32_t prevInputIdx = (int32_t)fp->fExtra[frameLoc]; U_ASSERT(prevInputIdx <= fp->fInputIdx); if (prevInputIdx < fp->fInputIdx) { // The match did make progress. Repeat the loop. fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current fp->fPatIdx = opValue; fp->fExtra[frameLoc] = fp->fInputIdx; } // If the input position did not advance, we do nothing here, // execution will fall out of the loop. 
} break; case URX_CTR_INIT: { U_ASSERT(opValue >= 0 && opValue < fFrameSize-2); fp->fExtra[opValue] = 0; // Set the loop counter variable to zero // Pick up the three extra operands that CTR_INIT has, and // skip the pattern location counter past int32_t instrOperandLoc = (int32_t)fp->fPatIdx; fp->fPatIdx += 3; int32_t loopLoc = URX_VAL(pat[instrOperandLoc]); int32_t minCount = (int32_t)pat[instrOperandLoc+1]; int32_t maxCount = (int32_t)pat[instrOperandLoc+2]; U_ASSERT(minCount>=0); U_ASSERT(maxCount>=minCount || maxCount==-1); U_ASSERT(loopLoc>=fp->fPatIdx); if (minCount == 0) { fp = StateSave(fp, loopLoc+1, status); } if (maxCount == -1) { fp->fExtra[opValue+1] = fp->fInputIdx; // For loop breaking. } else if (maxCount == 0) { fp = (REStackFrame *)fStack->popFrame(fFrameSize); } } break; case URX_CTR_LOOP: { U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2); int32_t initOp = (int32_t)pat[opValue]; U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT); int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)]; int32_t minCount = (int32_t)pat[opValue+2]; int32_t maxCount = (int32_t)pat[opValue+3]; (*pCounter)++; if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) { U_ASSERT(*pCounter == maxCount); break; } if (*pCounter >= minCount) { if (maxCount == -1) { // Loop has no hard upper bound. // Check that it is progressing through the input, break if it is not. int64_t *pLastInputIdx = &fp->fExtra[URX_VAL(initOp) + 1]; if (fp->fInputIdx == *pLastInputIdx) { break; } else { *pLastInputIdx = fp->fInputIdx; } } fp = StateSave(fp, fp->fPatIdx, status); } else { // Increment time-out counter. (StateSave() does it if count >= minCount) fTickCounter--; if (fTickCounter <= 0) { IncrementTime(status); // Re-initializes fTickCounter } } fp->fPatIdx = opValue + 4; // Loop back. 
} break; case URX_CTR_INIT_NG: { // Initialize a non-greedy loop U_ASSERT(opValue >= 0 && opValue < fFrameSize-2); fp->fExtra[opValue] = 0; // Set the loop counter variable to zero // Pick up the three extra operands that CTR_INIT_NG has, and // skip the pattern location counter past int32_t instrOperandLoc = (int32_t)fp->fPatIdx; fp->fPatIdx += 3; int32_t loopLoc = URX_VAL(pat[instrOperandLoc]); int32_t minCount = (int32_t)pat[instrOperandLoc+1]; int32_t maxCount = (int32_t)pat[instrOperandLoc+2]; U_ASSERT(minCount>=0); U_ASSERT(maxCount>=minCount || maxCount==-1); U_ASSERT(loopLoc>fp->fPatIdx); if (maxCount == -1) { fp->fExtra[opValue+1] = fp->fInputIdx; // Save initial input index for loop breaking. } if (minCount == 0) { if (maxCount != 0) { fp = StateSave(fp, fp->fPatIdx, status); } fp->fPatIdx = loopLoc+1; // Continue with stuff after repeated block } } break; case URX_CTR_LOOP_NG: { // Non-greedy {min, max} loops U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2); int32_t initOp = (int32_t)pat[opValue]; U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG); int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)]; int32_t minCount = (int32_t)pat[opValue+2]; int32_t maxCount = (int32_t)pat[opValue+3]; (*pCounter)++; if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) { // The loop has matched the maximum permitted number of times. // Break out of here with no action. Matching will // continue with the following pattern. U_ASSERT(*pCounter == maxCount); break; } if (*pCounter < minCount) { // We haven't met the minimum number of matches yet. // Loop back for another one. fp->fPatIdx = opValue + 4; // Loop back. fTickCounter--; if (fTickCounter <= 0) { IncrementTime(status); // Re-initializes fTickCounter } } else { // We do have the minimum number of matches. // If there is no upper bound on the loop iterations, check that the input index // is progressing, and stop the loop if it is not. 
if (maxCount == -1) { int64_t *pLastInputIdx = &fp->fExtra[URX_VAL(initOp) + 1]; if (fp->fInputIdx == *pLastInputIdx) { break; } *pLastInputIdx = fp->fInputIdx; } // Loop Continuation: we will fall into the pattern following the loop // (non-greedy, don't execute loop body first), but first do // a state save to the top of the loop, so that a match failure // in the following pattern will try another iteration of the loop. fp = StateSave(fp, opValue + 4, status); } } break; case URX_STO_SP: U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize); fData[opValue] = fStack->size(); break; case URX_LD_SP: { U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize); int32_t newStackSize = (int32_t)fData[opValue]; U_ASSERT(newStackSize <= fStack->size()); int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize; if (newFP == (int64_t *)fp) { break; } int32_t j; for (j=0; jsetSize(newStackSize); } break; case URX_BACKREF: { U_ASSERT(opValue < fFrameSize); int64_t groupStartIdx = fp->fExtra[opValue]; int64_t groupEndIdx = fp->fExtra[opValue+1]; U_ASSERT(groupStartIdx <= groupEndIdx); int64_t inputIndex = fp->fInputIdx; if (groupStartIdx < 0) { // This capture group has not participated in the match thus far, fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match. break; } UBool success = true; for (int64_t groupIndex = groupStartIdx; groupIndex < groupEndIdx; ++groupIndex,++inputIndex) { if (inputIndex >= fActiveLimit) { success = false; fHitEnd = true; break; } if (inputBuf[groupIndex] != inputBuf[inputIndex]) { success = false; break; } } if (success && groupStartIdx < groupEndIdx && U16_IS_LEAD(inputBuf[groupEndIdx-1]) && inputIndex < fActiveLimit && U16_IS_TRAIL(inputBuf[inputIndex])) { // Capture group ended with an unpaired lead surrogate. // Back reference is not permitted to match lead only of a surrogatge pair. 
success = false; } if (success) { fp->fInputIdx = inputIndex; } else { fp = (REStackFrame *)fStack->popFrame(fFrameSize); } } break; case URX_BACKREF_I: { U_ASSERT(opValue < fFrameSize); int64_t groupStartIdx = fp->fExtra[opValue]; int64_t groupEndIdx = fp->fExtra[opValue+1]; U_ASSERT(groupStartIdx <= groupEndIdx); if (groupStartIdx < 0) { // This capture group has not participated in the match thus far, fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match. break; } CaseFoldingUCharIterator captureGroupItr(inputBuf, groupStartIdx, groupEndIdx); CaseFoldingUCharIterator inputItr(inputBuf, fp->fInputIdx, fActiveLimit); // Note: if the capture group match was of an empty string the backref // match succeeds. Verified by testing: Perl matches succeed // in this case, so we do too. UBool success = true; for (;;) { UChar32 captureGroupChar = captureGroupItr.next(); if (captureGroupChar == U_SENTINEL) { success = true; break; } UChar32 inputChar = inputItr.next(); if (inputChar == U_SENTINEL) { success = false; fHitEnd = true; break; } if (inputChar != captureGroupChar) { success = false; break; } } if (success && inputItr.inExpansion()) { // We obtained a match by consuming part of a string obtained from // case-folding a single code point of the input text. // This does not count as an overall match. 
success = false; } if (success) { fp->fInputIdx = inputItr.getIndex(); } else { fp = (REStackFrame *)fStack->popFrame(fFrameSize); } } break; case URX_STO_INP_LOC: { U_ASSERT(opValue >= 0 && opValue < fFrameSize); fp->fExtra[opValue] = fp->fInputIdx; } break; case URX_JMPX: { int32_t instrOperandLoc = (int32_t)fp->fPatIdx; fp->fPatIdx += 1; int32_t dataLoc = URX_VAL(pat[instrOperandLoc]); U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize); int32_t savedInputIdx = (int32_t)fp->fExtra[dataLoc]; U_ASSERT(savedInputIdx <= fp->fInputIdx); if (savedInputIdx < fp->fInputIdx) { fp->fPatIdx = opValue; // JMP } else { fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no progress in loop. } } break; case URX_LA_START: { // Entering a look around block. // Save Stack Ptr, Input Pos. U_ASSERT(opValue>=0 && opValue+3fDataSize); fData[opValue] = fStack->size(); fData[opValue+1] = fp->fInputIdx; fData[opValue+2] = fActiveStart; fData[opValue+3] = fActiveLimit; fActiveStart = fLookStart; // Set the match region change for fActiveLimit = fLookLimit; // transparent bounds. } break; case URX_LA_END: { // Leaving a look around block. // restore Stack Ptr, Input Pos to positions they had on entry to block. U_ASSERT(opValue>=0 && opValue+3fDataSize); int32_t stackSize = fStack->size(); int32_t newStackSize = (int32_t)fData[opValue]; U_ASSERT(stackSize >= newStackSize); if (stackSize > newStackSize) { // Copy the current top frame back to the new (cut back) top frame. // This makes the capture groups from within the look-ahead // expression available. int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize; int32_t j; for (j=0; jsetSize(newStackSize); } fp->fInputIdx = fData[opValue+1]; // Restore the active region bounds in the input string; they may have // been changed because of transparent bounds on a Region. 
fActiveStart = fData[opValue+2]; fActiveLimit = fData[opValue+3]; U_ASSERT(fActiveStart >= 0); U_ASSERT(fActiveLimit <= fInputLength); } break; case URX_ONECHAR_I: if (fp->fInputIdx < fActiveLimit) { UChar32 c; U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) { break; } } else { fHitEnd = true; } fp = (REStackFrame *)fStack->popFrame(fFrameSize); break; case URX_STRING_I: // Case-insensitive test input against a literal string. // Strings require two slots in the compiled pattern, one for the // offset to the string text, and one for the length. // The compiled string has already been case folded. { const char16_t *patternString = litText + opValue; op = (int32_t)pat[fp->fPatIdx]; fp->fPatIdx++; opType = URX_TYPE(op); opValue = URX_VAL(op); U_ASSERT(opType == URX_STRING_LEN); int32_t patternStringLen = opValue; // Length of the string from the pattern. UChar32 cText; UChar32 cPattern; UBool success = true; int32_t patternStringIdx = 0; CaseFoldingUCharIterator inputIterator(inputBuf, fp->fInputIdx, fActiveLimit); while (patternStringIdx < patternStringLen) { U16_NEXT(patternString, patternStringIdx, patternStringLen, cPattern); cText = inputIterator.next(); if (cText != cPattern) { success = false; if (cText == U_SENTINEL) { fHitEnd = true; } break; } } if (inputIterator.inExpansion()) { success = false; } if (success) { fp->fInputIdx = inputIterator.getIndex(); } else { fp = (REStackFrame *)fStack->popFrame(fFrameSize); } } break; case URX_LB_START: { // Entering a look-behind block. // Save Stack Ptr, Input Pos and active input region. // TODO: implement transparent bounds. Ticket #6067 U_ASSERT(opValue>=0 && opValue+4fDataSize); fData[opValue] = fStack->size(); fData[opValue+1] = fp->fInputIdx; // Save input string length, then reset to pin any matches to end at // the current position. 
fData[opValue+2] = fActiveStart; fData[opValue+3] = fActiveLimit; fActiveStart = fRegionStart; fActiveLimit = fp->fInputIdx; // Init the variable containing the start index for attempted matches. fData[opValue+4] = -1; } break; case URX_LB_CONT: { // Positive Look-Behind, at top of loop checking for matches of LB expression // at all possible input starting positions. // Fetch the min and max possible match lengths. They are the operands // of this op in the pattern. int32_t minML = (int32_t)pat[fp->fPatIdx++]; int32_t maxML = (int32_t)pat[fp->fPatIdx++]; U_ASSERT(minML <= maxML); U_ASSERT(minML >= 0); // Fetch (from data) the last input index where a match was attempted. U_ASSERT(opValue>=0 && opValue+4fDataSize); int64_t &lbStartIdx = fData[opValue+4]; if (lbStartIdx < 0) { // First time through loop. lbStartIdx = fp->fInputIdx - minML; if (lbStartIdx > 0 && lbStartIdx < fInputLength) { U16_SET_CP_START(inputBuf, 0, lbStartIdx); } } else { // 2nd through nth time through the loop. // Back up start position for match by one. if (lbStartIdx == 0) { lbStartIdx--; } else { U16_BACK_1(inputBuf, 0, lbStartIdx); } } if (lbStartIdx < 0 || lbStartIdx < fp->fInputIdx - maxML) { // We have tried all potential match starting points without // getting a match. Backtrack out, and out of the // Look Behind altogether. fp = (REStackFrame *)fStack->popFrame(fFrameSize); fActiveStart = fData[opValue+2]; fActiveLimit = fData[opValue+3]; U_ASSERT(fActiveStart >= 0); U_ASSERT(fActiveLimit <= fInputLength); break; } // Save state to this URX_LB_CONT op, so failure to match will repeat the loop. // (successful match will fall off the end of the loop.) fp = StateSave(fp, fp->fPatIdx-3, status); fp->fInputIdx = lbStartIdx; } break; case URX_LB_END: // End of a look-behind block, after a successful match. 
{ U_ASSERT(opValue>=0 && opValue+4fDataSize); if (fp->fInputIdx != fActiveLimit) { // The look-behind expression matched, but the match did not // extend all the way to the point that we are looking behind from. // FAIL out of here, which will take us back to the LB_CONT, which // will retry the match starting at another position or fail // the look-behind altogether, whichever is appropriate. fp = (REStackFrame *)fStack->popFrame(fFrameSize); break; } // Look-behind match is good. Restore the original input string region, // which had been truncated to pin the end of the lookbehind match to the // position being looked-behind. fActiveStart = fData[opValue+2]; fActiveLimit = fData[opValue+3]; U_ASSERT(fActiveStart >= 0); U_ASSERT(fActiveLimit <= fInputLength); } break; case URX_LBN_CONT: { // Negative Look-Behind, at top of loop checking for matches of LB expression // at all possible input starting positions. // Fetch the extra parameters of this op. int32_t minML = (int32_t)pat[fp->fPatIdx++]; int32_t maxML = (int32_t)pat[fp->fPatIdx++]; int32_t continueLoc = (int32_t)pat[fp->fPatIdx++]; continueLoc = URX_VAL(continueLoc); U_ASSERT(minML <= maxML); U_ASSERT(minML >= 0); U_ASSERT(continueLoc > fp->fPatIdx); // Fetch (from data) the last input index where a match was attempted. U_ASSERT(opValue>=0 && opValue+4fDataSize); int64_t &lbStartIdx = fData[opValue+4]; if (lbStartIdx < 0) { // First time through loop. lbStartIdx = fp->fInputIdx - minML; if (lbStartIdx > 0 && lbStartIdx < fInputLength) { U16_SET_CP_START(inputBuf, 0, lbStartIdx); } } else { // 2nd through nth time through the loop. // Back up start position for match by one. if (lbStartIdx == 0) { lbStartIdx--; // Because U16_BACK is unsafe starting at 0. 
} else { U16_BACK_1(inputBuf, 0, lbStartIdx); } } if (lbStartIdx < 0 || lbStartIdx < fp->fInputIdx - maxML) { // We have tried all potential match starting points without // getting a match, which means that the negative lookbehind as // a whole has succeeded. Jump forward to the continue location fActiveStart = fData[opValue+2]; fActiveLimit = fData[opValue+3]; U_ASSERT(fActiveStart >= 0); U_ASSERT(fActiveLimit <= fInputLength); fp->fPatIdx = continueLoc; break; } // Save state to this URX_LB_CONT op, so failure to match will repeat the loop. // (successful match will cause a FAIL out of the loop altogether.) fp = StateSave(fp, fp->fPatIdx-4, status); fp->fInputIdx = lbStartIdx; } break; case URX_LBN_END: // End of a negative look-behind block, after a successful match. { U_ASSERT(opValue>=0 && opValue+4fDataSize); if (fp->fInputIdx != fActiveLimit) { // The look-behind expression matched, but the match did not // extend all the way to the point that we are looking behind from. // FAIL out of here, which will take us back to the LB_CONT, which // will retry the match starting at another position or succeed // the look-behind altogether, whichever is appropriate. fp = (REStackFrame *)fStack->popFrame(fFrameSize); break; } // Look-behind expression matched, which means look-behind test as // a whole Fails // Restore the original input string length, which had been truncated // inorder to pin the end of the lookbehind match // to the position being looked-behind. fActiveStart = fData[opValue+2]; fActiveLimit = fData[opValue+3]; U_ASSERT(fActiveStart >= 0); U_ASSERT(fActiveLimit <= fInputLength); // Restore original stack position, discarding any state saved // by the successful pattern match. U_ASSERT(opValue>=0 && opValue+1fDataSize); int32_t newStackSize = (int32_t)fData[opValue]; U_ASSERT(fStack->size() > newStackSize); fStack->setSize(newStackSize); // FAIL, which will take control back to someplace // prior to entering the look-behind test. 
fp = (REStackFrame *)fStack->popFrame(fFrameSize); } break; case URX_LOOP_SR_I: // Loop Initialization for the optimized implementation of // [some character set]* // This op scans through all matching input. // The following LOOP_C op emulates stack unwinding if the following pattern fails. { U_ASSERT(opValue > 0 && opValue < fSets->size()); Regex8BitSet *s8 = &fPattern->fSets8[opValue]; UnicodeSet *s = (UnicodeSet *)fSets->elementAt(opValue); // Loop through input, until either the input is exhausted or // we reach a character that is not a member of the set. int32_t ix = (int32_t)fp->fInputIdx; for (;;) { if (ix >= fActiveLimit) { fHitEnd = true; break; } UChar32 c; U16_NEXT(inputBuf, ix, fActiveLimit, c); if (c<256) { if (s8->contains(c) == false) { U16_BACK_1(inputBuf, 0, ix); break; } } else { if (s->contains(c) == false) { U16_BACK_1(inputBuf, 0, ix); break; } } } // If there were no matching characters, skip over the loop altogether. // The loop doesn't run at all, a * op always succeeds. if (ix == fp->fInputIdx) { fp->fPatIdx++; // skip the URX_LOOP_C op. break; } // Peek ahead in the compiled pattern, to the URX_LOOP_C that // must follow. It's operand is the stack location // that holds the starting input index for the match of this [set]* int32_t loopcOp = (int32_t)pat[fp->fPatIdx]; U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C); int32_t stackLoc = URX_VAL(loopcOp); U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize); fp->fExtra[stackLoc] = fp->fInputIdx; fp->fInputIdx = ix; // Save State to the URX_LOOP_C op that follows this one, // so that match failures in the following code will return to there. // Then bump the pattern idx so the LOOP_C is skipped on the way out of here. fp = StateSave(fp, fp->fPatIdx, status); fp->fPatIdx++; } break; case URX_LOOP_DOT_I: // Loop Initialization for the optimized implementation of .* // This op scans through all remaining input. // The following LOOP_C op emulates stack unwinding if the following pattern fails. 
{ // Loop through input until the input is exhausted (we reach an end-of-line) // In DOTALL mode, we can just go straight to the end of the input. int32_t ix; if ((opValue & 1) == 1) { // Dot-matches-All mode. Jump straight to the end of the string. ix = (int32_t)fActiveLimit; fHitEnd = true; } else { // NOT DOT ALL mode. Line endings do not match '.' // Scan forward until a line ending or end of input. ix = (int32_t)fp->fInputIdx; for (;;) { if (ix >= fActiveLimit) { fHitEnd = true; break; } UChar32 c; U16_NEXT(inputBuf, ix, fActiveLimit, c); // c = inputBuf[ix++] if ((c & 0x7f) <= 0x29) { // Fast filter of non-new-line-s if ((c == 0x0a) || // 0x0a is newline in both modes. (((opValue & 2) == 0) && // IF not UNIX_LINES mode isLineTerminator(c))) { // char is a line ending. Put the input pos back to the // line ending char, and exit the scanning loop. U16_BACK_1(inputBuf, 0, ix); break; } } } } // If there were no matching characters, skip over the loop altogether. // The loop doesn't run at all, a * op always succeeds. if (ix == fp->fInputIdx) { fp->fPatIdx++; // skip the URX_LOOP_C op. break; } // Peek ahead in the compiled pattern, to the URX_LOOP_C that // must follow. It's operand is the stack location // that holds the starting input index for the match of this .* int32_t loopcOp = (int32_t)pat[fp->fPatIdx]; U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C); int32_t stackLoc = URX_VAL(loopcOp); U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize); fp->fExtra[stackLoc] = fp->fInputIdx; fp->fInputIdx = ix; // Save State to the URX_LOOP_C op that follows this one, // so that match failures in the following code will return to there. // Then bump the pattern idx so the LOOP_C is skipped on the way out of here. 
fp = StateSave(fp, fp->fPatIdx, status); fp->fPatIdx++; } break; case URX_LOOP_C: { U_ASSERT(opValue>=0 && opValuefExtra[opValue]; U_ASSERT(backSearchIndex <= fp->fInputIdx); if (backSearchIndex == fp->fInputIdx) { // We've backed up the input idx to the point that the loop started. // The loop is done. Leave here without saving state. // Subsequent failures won't come back here. break; } // Set up for the next iteration of the loop, with input index // backed up by one from the last time through, // and a state save to this instruction in case the following code fails again. // (We're going backwards because this loop emulates stack unwinding, not // the initial scan forward.) U_ASSERT(fp->fInputIdx > 0); UChar32 prevC; U16_PREV(inputBuf, 0, fp->fInputIdx, prevC); // !!!: should this 0 be one of f*Limit? if (prevC == 0x0a && fp->fInputIdx > backSearchIndex && inputBuf[fp->fInputIdx-1] == 0x0d) { int32_t prevOp = (int32_t)pat[fp->fPatIdx-2]; if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) { // .*, stepping back over CRLF pair. U16_BACK_1(inputBuf, 0, fp->fInputIdx); } } fp = StateSave(fp, fp->fPatIdx-1, status); } break; default: // Trouble. The compiled pattern contains an entry with an // unrecognized type tag. UPRV_UNREACHABLE_ASSERT; // Unknown opcode type in opType = URX_TYPE(pat[fp->fPatIdx]). But we have // reports of this in production code, don't use UPRV_UNREACHABLE_EXIT. // See ICU-21669. status = U_INTERNAL_PROGRAM_ERROR; } if (U_FAILURE(status)) { isMatch = false; break; } } breakFromLoop: fMatch = isMatch; if (isMatch) { fLastMatchEnd = fMatchEnd; fMatchStart = startIdx; fMatchEnd = fp->fInputIdx; } #ifdef REGEX_RUN_DEBUG if (fTraceDebug) { if (isMatch) { printf("Match. start=%ld end=%ld\n\n", fMatchStart, fMatchEnd); } else { printf("No match\n\n"); } } #endif fFrame = fp; // The active stack frame when the engine stopped. // Contains the capture group results that we need to // access later. 
return; } UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RegexMatcher) U_NAMESPACE_END #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS stringi/src/icu74/i18n/formattedval_sbimpl.cpp0000644000176200001440000003251214700200761020774 0ustar liggesusers// © 2018 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html #include "unicode/utypes.h" #if !UCONFIG_NO_FORMATTING // This file contains one implementation of FormattedValue. // Other independent implementations should go into their own cpp file for // better dependency modularization. #include "unicode/ustring.h" #include "formattedval_impl.h" #include "number_types.h" #include "formatted_string_builder.h" #include "number_utils.h" #include "static_unicode_sets.h" #include "unicode/listformatter.h" U_NAMESPACE_BEGIN typedef FormattedStringBuilder::Field Field; FormattedValueStringBuilderImpl::FormattedValueStringBuilderImpl(Field numericField) : fNumericField(numericField) { } FormattedValueStringBuilderImpl::~FormattedValueStringBuilderImpl() { } UnicodeString FormattedValueStringBuilderImpl::toString(UErrorCode&) const { return fString.toUnicodeString(); } UnicodeString FormattedValueStringBuilderImpl::toTempString(UErrorCode&) const { return fString.toTempUnicodeString(); } Appendable& FormattedValueStringBuilderImpl::appendTo(Appendable& appendable, UErrorCode&) const { appendable.appendString(fString.chars(), fString.length()); return appendable; } UBool FormattedValueStringBuilderImpl::nextPosition(ConstrainedFieldPosition& cfpos, UErrorCode& status) const { // NOTE: MSVC sometimes complains when implicitly converting between bool and UBool return nextPositionImpl(cfpos, fNumericField, status) ? 
true : false; } UBool FormattedValueStringBuilderImpl::nextFieldPosition(FieldPosition& fp, UErrorCode& status) const { int32_t rawField = fp.getField(); if (rawField == FieldPosition::DONT_CARE) { return false; } if (rawField < 0 || rawField >= UNUM_FIELD_COUNT) { status = U_ILLEGAL_ARGUMENT_ERROR; return false; } ConstrainedFieldPosition cfpos; cfpos.constrainField(UFIELD_CATEGORY_NUMBER, rawField); cfpos.setState(UFIELD_CATEGORY_NUMBER, rawField, fp.getBeginIndex(), fp.getEndIndex()); if (nextPositionImpl(cfpos, kUndefinedField, status)) { fp.setBeginIndex(cfpos.getStart()); fp.setEndIndex(cfpos.getLimit()); return true; } // Special case: fraction should start after integer if fraction is not present if (rawField == UNUM_FRACTION_FIELD && fp.getEndIndex() == 0) { bool inside = false; int32_t i = fString.fZero; for (; i < fString.fZero + fString.fLength; i++) { if (isIntOrGroup(fString.getFieldPtr()[i]) || fString.getFieldPtr()[i] == Field(UFIELD_CATEGORY_NUMBER, UNUM_DECIMAL_SEPARATOR_FIELD)) { inside = true; } else if (inside) { break; } } fp.setBeginIndex(i - fString.fZero); fp.setEndIndex(i - fString.fZero); } return false; } void FormattedValueStringBuilderImpl::getAllFieldPositions(FieldPositionIteratorHandler& fpih, UErrorCode& status) const { ConstrainedFieldPosition cfpos; while (nextPositionImpl(cfpos, kUndefinedField, status)) { fpih.addAttribute(cfpos.getField(), cfpos.getStart(), cfpos.getLimit()); } } void FormattedValueStringBuilderImpl::resetString() { fString.clear(); spanIndicesCount = 0; } // Signal the end of the string using a field that doesn't exist and that is // different from kUndefinedField, which is used for "null field". 
static constexpr Field kEndField = Field(0xf, 0xf);

/**
 * Finds the next field position that satisfies the constraint stored in cfpos.
 *
 * Scanning starts at the string index given by cfpos.getLimit() (relative to
 * fString.fZero) and walks one index past the last character, where the
 * synthetic kEndField is used so that a field still being scanned is closed.
 *
 * The int64 iteration context of cfpos is used as the index of the next entry
 * of spanIndices that has not been consumed yet.
 *
 * @param cfpos        In/out iteration state. On success it is updated through
 *                     setState() with the category, field, start and end of the
 *                     match. On failure its start and limit are both set to
 *                     fString.fLength.
 * @param numericField If not kUndefinedField, a field that is emitted once for
 *                     each run of adjacent fields whose isNumeric() is true.
 * @return true if a position was found, false when the scan is exhausted.
 */
bool FormattedValueStringBuilderImpl::nextPositionImpl(ConstrainedFieldPosition& cfpos, Field numericField, UErrorCode& /*status*/) const {
    int32_t fieldStart = -1;
    Field currField = kUndefinedField;
    bool prevIsSpan = false;
    int32_t nextSpanStart = -1;
    if (spanIndicesCount > 0) {
        int64_t si = cfpos.getInt64IterationContext();
        U_ASSERT(si <= spanIndicesCount);
        if (si < spanIndicesCount) {
            nextSpanStart = spanIndices[si].start;
        }
        if (si > 0) {
            // The previous call returned a span if cfpos still carries the
            // category/value of the most recently consumed span entry.
            prevIsSpan = cfpos.getCategory() == spanIndices[si-1].category
                && cfpos.getField() == spanIndices[si-1].spanValue;
        }
    }
    bool prevIsNumeric = false;
    if (numericField != kUndefinedField) {
        prevIsNumeric = cfpos.getCategory() == numericField.getCategory()
            && cfpos.getField() == numericField.getField();
    }
    bool prevIsInteger = cfpos.getCategory() == UFIELD_CATEGORY_NUMBER
        && cfpos.getField() == UNUM_INTEGER_FIELD;

    // Note the "<=": the final iteration sees kEndField one past the last char.
    for (int32_t i = fString.fZero + cfpos.getLimit(); i <= fString.fZero + fString.fLength; i++) {
        Field _field = (i < fString.fZero + fString.fLength) ? fString.getFieldPtr()[i] : kEndField;
        // Case 1: currently scanning a field.
        if (currField != kUndefinedField) {
            if (currField != _field) {
                int32_t end = i - fString.fZero;
                // Grouping separators can be whitespace; don't throw them out!
                if (isTrimmable(currField)) {
                    end = trimBack(i - fString.fZero);
                }
                if (end <= fieldStart) {
                    // Entire field position is ignorable; skip.
                    fieldStart = -1;
                    currField = kUndefinedField;
                    i--;  // look at this index again
                    continue;
                }
                int32_t start = fieldStart;
                if (isTrimmable(currField)) {
                    start = trimFront(start);
                }
                cfpos.setState(currField.getCategory(), currField.getField(), start, end);
                return true;
            }
            continue;
        }
        // Special case: emit normalField if we are pointing at the end of spanField.
        if (i > fString.fZero && prevIsSpan) {
            int64_t si = cfpos.getInt64IterationContext() - 1;
            U_ASSERT(si >= 0);
            int32_t previ = i - spanIndices[si].length;
            U_ASSERT(previ >= fString.fZero);
            Field prevField = fString.getFieldPtr()[previ];
            if (prevField == Field(UFIELD_CATEGORY_LIST, ULISTFMT_ELEMENT_FIELD)) {
                // Special handling for ULISTFMT_ELEMENT_FIELD
                if (cfpos.matchesField(UFIELD_CATEGORY_LIST, ULISTFMT_ELEMENT_FIELD)) {
                    fieldStart = i - fString.fZero - spanIndices[si].length;
                    int32_t end = fieldStart + spanIndices[si].length;
                    cfpos.setState(
                        UFIELD_CATEGORY_LIST,
                        ULISTFMT_ELEMENT_FIELD,
                        fieldStart,
                        end);
                    return true;
                } else {
                    prevIsSpan = false;
                }
            } else {
                // Re-wind, since there may be multiple fields in the span.
                i = previ;
                _field = prevField;
            }
        }
        // Special case: coalesce the INTEGER if we are pointing at the end of the INTEGER.
        if (cfpos.matchesField(UFIELD_CATEGORY_NUMBER, UNUM_INTEGER_FIELD)
                && i > fString.fZero
                && !prevIsInteger
                && !prevIsNumeric
                && isIntOrGroup(fString.getFieldPtr()[i - 1])
                && !isIntOrGroup(_field)) {
            // Walk back to the first index of the integer/grouping run.
            int j = i - 1;
            for (; j >= fString.fZero && isIntOrGroup(fString.getFieldPtr()[j]); j--) {}
            cfpos.setState(
                UFIELD_CATEGORY_NUMBER,
                UNUM_INTEGER_FIELD,
                j - fString.fZero + 1,
                i - fString.fZero);
            return true;
        }
        // Special case: coalesce NUMERIC if we are pointing at the end of the NUMERIC.
        if (numericField != kUndefinedField
                && cfpos.matchesField(numericField.getCategory(), numericField.getField())
                && i > fString.fZero
                && !prevIsNumeric
                && fString.getFieldPtr()[i - 1].isNumeric()
                && !_field.isNumeric()) {
            // Re-wind to the beginning of the field and then emit it
            int32_t j = i - 1;
            for (; j >= fString.fZero && fString.getFieldPtr()[j].isNumeric(); j--) {}
            cfpos.setState(
                numericField.getCategory(),
                numericField.getField(),
                j - fString.fZero + 1,
                i - fString.fZero);
            return true;
        }
        // Check for span field
        if (!prevIsSpan && (
                _field == Field(UFIELD_CATEGORY_LIST, ULISTFMT_ELEMENT_FIELD) ||
                i - fString.fZero == nextSpanStart)) {
            int64_t si = cfpos.getInt64IterationContext();
            if (si >= spanIndicesCount) {
                break;
            }
            UFieldCategory spanCategory = spanIndices[si].category;
            int32_t spanValue = spanIndices[si].spanValue;
            int32_t length = spanIndices[si].length;
            // Mark this span entry as consumed before deciding what to emit.
            cfpos.setInt64IterationContext(si + 1);
            if (si + 1 < spanIndicesCount) {
                nextSpanStart = spanIndices[si + 1].start;
            }
            if (length == 0) {
                // ICU-21871: Don't return fields on empty spans
                i--;
                continue;
            }
            if (cfpos.matchesField(spanCategory, spanValue)) {
                fieldStart = i - fString.fZero;
                int32_t end = fieldStart + length;
                cfpos.setState(
                    spanCategory,
                    spanValue,
                    fieldStart,
                    end);
                return true;
            } else if (_field == Field(UFIELD_CATEGORY_LIST, ULISTFMT_ELEMENT_FIELD)) {
                // Special handling for ULISTFMT_ELEMENT_FIELD
                if (cfpos.matchesField(UFIELD_CATEGORY_LIST, ULISTFMT_ELEMENT_FIELD)) {
                    fieldStart = i - fString.fZero;
                    int32_t end = fieldStart + length;
                    cfpos.setState(
                        UFIELD_CATEGORY_LIST,
                        ULISTFMT_ELEMENT_FIELD,
                        fieldStart,
                        end);
                    return true;
                } else {
                    // Failed to match; jump ahead
                    i += length - 1;
                    // goto loopend
                }
            }
        }
        // Special case: skip over INTEGER; will be coalesced later.
        else if (_field == Field(UFIELD_CATEGORY_NUMBER, UNUM_INTEGER_FIELD)) {
            _field = kUndefinedField;
        }
        // No field starting at this position.
        else if (_field.isUndefined() || _field == kEndField) {
            // goto loopend
        }
        // No SpanField
        else if (cfpos.matchesField(_field.getCategory(), _field.getField())) {
            fieldStart = i - fString.fZero;
            currField = _field;
        }
        // loopend:
        // The "previous result" flags only apply to the first index examined.
        prevIsSpan = false;
        prevIsNumeric = false;
        prevIsInteger = false;
    }

    U_ASSERT(currField == kUndefinedField);
    // Always set the position to the end so that we don't revisit previous sections
    cfpos.setState(
        cfpos.getCategory(),
        cfpos.getField(),
        fString.fLength,
        fString.fLength);
    return false;
}

/**
 * Records a span entry {category, spanValue, start, length} at the end of
 * spanIndices, doubling the array capacity first when it is full.
 * Does nothing if status already indicates failure; sets
 * U_MEMORY_ALLOCATION_ERROR if the resize fails.
 */
void FormattedValueStringBuilderImpl::appendSpanInfo(UFieldCategory category, int32_t spanValue, int32_t start, int32_t length, UErrorCode& status) {
    if (U_FAILURE(status)) { return; }
    U_ASSERT(spanIndices.getCapacity() >= spanIndicesCount);
    if (spanIndices.getCapacity() == spanIndicesCount) {
        if (!spanIndices.resize(spanIndicesCount * 2, spanIndicesCount)) {
            status = U_MEMORY_ALLOCATION_ERROR;
            return;
        }
    }
    spanIndices[spanIndicesCount] = {category, spanValue, start, length};
    spanIndicesCount++;
}

/**
 * Records a span entry {category, spanValue, start, length} at index 0 of
 * spanIndices, shifting every existing entry up by one. Capacity growth and
 * error handling are the same as in appendSpanInfo().
 */
void FormattedValueStringBuilderImpl::prependSpanInfo(UFieldCategory category, int32_t spanValue, int32_t start, int32_t length, UErrorCode& status) {
    if (U_FAILURE(status)) { return; }
    U_ASSERT(spanIndices.getCapacity() >= spanIndicesCount);
    if (spanIndices.getCapacity() == spanIndicesCount) {
        if (!spanIndices.resize(spanIndicesCount * 2, spanIndicesCount)) {
            status = U_MEMORY_ALLOCATION_ERROR;
            return;
        }
    }
    // Shift from the back so that no entry is overwritten before it is moved.
    for (int32_t i = spanIndicesCount - 1; i >= 0; i--) {
        spanIndices[i+1] = spanIndices[i];
    }
    spanIndices[0] = {category, spanValue, start, length};
    spanIndicesCount++;
}

/** Returns true if field is the number INTEGER field or the number GROUPING_SEPARATOR field. */
bool FormattedValueStringBuilderImpl::isIntOrGroup(Field field) {
    return field == Field(UFIELD_CATEGORY_NUMBER, UNUM_INTEGER_FIELD)
        || field == Field(UFIELD_CATEGORY_NUMBER, UNUM_GROUPING_SEPARATOR_FIELD);
}

/**
 * Returns true if nextPositionImpl() may trim the ends of a run of this field:
 * every field except the number GROUPING_SEPARATOR field and fields in the
 * LIST category.
 */
bool FormattedValueStringBuilderImpl::isTrimmable(Field field) {
    return field != Field(UFIELD_CATEGORY_NUMBER, UNUM_GROUPING_SEPARATOR_FIELD)
        && field.getCategory() != UFIELD_CATEGORY_LIST;
}

/**
 * Returns the result of spanBack() over the DEFAULT_IGNORABLES set on the
 * first `limit` code units of the string (offsets relative to fString.fZero).
 */
int32_t FormattedValueStringBuilderImpl::trimBack(int32_t limit) const {
    return unisets::get(unisets::DEFAULT_IGNORABLES)->spanBack(
        fString.getCharPtr() + fString.fZero,
        limit,
        USET_SPAN_CONTAINED);
}

/**
 * Returns `start` plus the result of span() over the DEFAULT_IGNORABLES set,
 * applied to the string from `start` (relative to fString.fZero) to its end.
 */
int32_t FormattedValueStringBuilderImpl::trimFront(int32_t start) const {
    return start + unisets::get(unisets::DEFAULT_IGNORABLES)->span(
        fString.getCharPtr() + fString.fZero + start,
        fString.fLength - start,
        USET_SPAN_CONTAINED);
}

U_NAMESPACE_END

#endif /* #if !UCONFIG_NO_FORMATTING */
stringi/src/icu74/i18n/double-conversion-bignum.h0000644000176200001440000001440214700200761021315 0ustar liggesusers// © 2018 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
//
// From the double-conversion library. Original license:
//
// Copyright 2010 the V8 project authors. All rights reserved.
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following
// disclaimer in the documentation and/or other materials provided
// with the distribution.
// * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // ICU PATCH: ifdef around UCONFIG_NO_FORMATTING #include "unicode/utypes.h" #if !UCONFIG_NO_FORMATTING #ifndef DOUBLE_CONVERSION_BIGNUM_H_ #define DOUBLE_CONVERSION_BIGNUM_H_ // ICU PATCH: Customize header file paths for ICU. #include "double-conversion-utils.h" // ICU PATCH: Wrap in ICU namespace U_NAMESPACE_BEGIN namespace double_conversion { class Bignum { public: // 3584 = 128 * 28. We can represent 2^3584 > 10^1000 accurately. // This bignum can encode much bigger numbers, since it contains an // exponent. static const int kMaxSignificantBits = 3584; Bignum() : used_bigits_(0), exponent_(0) {} void AssignUInt16(const uint16_t value); void AssignUInt64(uint64_t value); void AssignBignum(const Bignum& other); void AssignDecimalString(const Vector value); void AssignHexString(const Vector value); void AssignPowerUInt16(uint16_t base, const int exponent); void AddUInt64(const uint64_t operand); void AddBignum(const Bignum& other); // Precondition: this >= other. void SubtractBignum(const Bignum& other); void Square(); void ShiftLeft(const int shift_amount); void MultiplyByUInt32(const uint32_t factor); void MultiplyByUInt64(const uint64_t factor); void MultiplyByPowerOfTen(const int exponent); void Times10() { return MultiplyByUInt32(10); } // Pseudocode: // int result = this / other; // this = this % other; // In the worst case this function is in O(this/other). 
uint16_t DivideModuloIntBignum(const Bignum& other); bool ToHexString(char* buffer, const int buffer_size) const; // Returns // -1 if a < b, // 0 if a == b, and // +1 if a > b. static int Compare(const Bignum& a, const Bignum& b); static bool Equal(const Bignum& a, const Bignum& b) { return Compare(a, b) == 0; } static bool LessEqual(const Bignum& a, const Bignum& b) { return Compare(a, b) <= 0; } static bool Less(const Bignum& a, const Bignum& b) { return Compare(a, b) < 0; } // Returns Compare(a + b, c); static int PlusCompare(const Bignum& a, const Bignum& b, const Bignum& c); // Returns a + b == c static bool PlusEqual(const Bignum& a, const Bignum& b, const Bignum& c) { return PlusCompare(a, b, c) == 0; } // Returns a + b <= c static bool PlusLessEqual(const Bignum& a, const Bignum& b, const Bignum& c) { return PlusCompare(a, b, c) <= 0; } // Returns a + b < c static bool PlusLess(const Bignum& a, const Bignum& b, const Bignum& c) { return PlusCompare(a, b, c) < 0; } private: typedef uint32_t Chunk; typedef uint64_t DoubleChunk; static const int kChunkSize = sizeof(Chunk) * 8; static const int kDoubleChunkSize = sizeof(DoubleChunk) * 8; // With bigit size of 28 we loose some bits, but a double still fits easily // into two chunks, and more importantly we can use the Comba multiplication. static const int kBigitSize = 28; static const Chunk kBigitMask = (1 << kBigitSize) - 1; // Every instance allocates kBigitLength chunks on the stack. Bignums cannot // grow. There are no checks if the stack-allocated space is sufficient. static const int kBigitCapacity = kMaxSignificantBits / kBigitSize; static void EnsureCapacity(const int size) { if (size > kBigitCapacity) { DOUBLE_CONVERSION_UNREACHABLE(); } } void Align(const Bignum& other); void Clamp(); bool IsClamped() const { return used_bigits_ == 0 || RawBigit(used_bigits_ - 1) != 0; } void Zero() { used_bigits_ = 0; exponent_ = 0; } // Requires this to have enough capacity (no tests done). 
// Updates used_bigits_ if necessary. // shift_amount must be < kBigitSize. void BigitsShiftLeft(const int shift_amount); // BigitLength includes the "hidden" bigits encoded in the exponent. int BigitLength() const { return used_bigits_ + exponent_; } Chunk& RawBigit(const int index); const Chunk& RawBigit(const int index) const; Chunk BigitOrZero(const int index) const; void SubtractTimes(const Bignum& other, const int factor); // The Bignum's value is value(bigits_buffer_) * 2^(exponent_ * kBigitSize), // where the value of the buffer consists of the lower kBigitSize bits of // the first used_bigits_ Chunks in bigits_buffer_, first chunk has lowest // significant bits. int16_t used_bigits_; int16_t exponent_; Chunk bigits_buffer_[kBigitCapacity]; DOUBLE_CONVERSION_DISALLOW_COPY_AND_ASSIGN(Bignum); }; } // namespace double_conversion // ICU PATCH: Close ICU namespace U_NAMESPACE_END #endif // DOUBLE_CONVERSION_BIGNUM_H_ #endif // ICU PATCH: close #if !UCONFIG_NO_FORMATTING stringi/src/icu74/i18n/measunit_impl.h0000644000176200001440000003326214700200761017254 0ustar liggesusers// © 2020 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html #ifndef __MEASUNIT_IMPL_H__ #define __MEASUNIT_IMPL_H__ #include "unicode/utypes.h" #if !UCONFIG_NO_FORMATTING #include "unicode/measunit.h" #include "cmemory.h" #include "charstr.h" U_NAMESPACE_BEGIN namespace number { namespace impl { class LongNameHandler; } } // namespace number static const char16_t kDefaultCurrency[] = u"XXX"; static const char kDefaultCurrency8[] = "XXX"; /** * Looks up the "unitQuantity" (aka "type" or "category") of a base unit * identifier. The category is returned via `result`, which must initially be * empty. * * This only supports base units: other units must be resolved to base units * before passing to this function, otherwise U_UNSUPPORTED_ERROR status may be * returned. 
* * Categories are found in `unitQuantities` in the `units` resource (see * `units.txt`). */ // TODO: make this function accepts any `MeasureUnit` as Java and move it to the `UnitsData` class. CharString U_I18N_API getUnitQuantity(const MeasureUnitImpl &baseMeasureUnitImpl, UErrorCode &status); /** * A struct representing a single unit (optional SI or binary prefix, and dimensionality). */ struct U_I18N_API SingleUnitImpl : public UMemory { /** * Gets a single unit from the MeasureUnit. If there are multiple single units, sets an error * code and returns the base dimensionless unit. Parses if necessary. */ static SingleUnitImpl forMeasureUnit(const MeasureUnit& measureUnit, UErrorCode& status); /** Transform this SingleUnitImpl into a MeasureUnit, simplifying if possible. */ MeasureUnit build(UErrorCode& status) const; /** * Returns the "simple unit ID", without SI or dimensionality prefix: this * instance may represent a square-kilometer, but only "meter" will be * returned. * * The returned pointer points at memory that exists for the duration of the * program's running. */ const char *getSimpleUnitID() const; /** * Generates and append a neutral identifier string for a single unit which means we do not include * the dimension signal. */ void appendNeutralIdentifier(CharString &result, UErrorCode &status) const; /** * Returns the index of this unit's "quantity" in unitQuantities (in * measunit_extra.cpp). The value of this index determines sort order for * normalization of unit identifiers. */ int32_t getUnitCategoryIndex() const; /** * Compare this SingleUnitImpl to another SingleUnitImpl for the sake of * sorting and coalescing. * * Sort order of units is specified by UTS #35 * (https://unicode.org/reports/tr35/tr35-info.html#Unit_Identifier_Normalization). * * Takes the sign of dimensionality into account, but not the absolute * value: per-meter is not considered the same as meter, but meter is * considered the same as square-meter. 
* * The dimensionless unit generally does not get compared, but if it did, it * would sort before other units by virtue of index being < 0 and * dimensionality not being negative. */ int32_t compareTo(const SingleUnitImpl& other) const { if (dimensionality < 0 && other.dimensionality > 0) { // Positive dimensions first return 1; } if (dimensionality > 0 && other.dimensionality < 0) { return -1; } // Sort by official quantity order int32_t thisQuantity = this->getUnitCategoryIndex(); int32_t otherQuantity = other.getUnitCategoryIndex(); if (thisQuantity < otherQuantity) { return -1; } if (thisQuantity > otherQuantity) { return 1; } // If quantity order didn't help, then we go by index. if (index < other.index) { return -1; } if (index > other.index) { return 1; } // When comparing binary prefixes vs SI prefixes, instead of comparing the actual values, we can // multiply the binary prefix power by 3 and compare the powers. if they are equal, we can can // compare the bases. // NOTE: this methodology will fail if the binary prefix more than or equal 98. int32_t unitBase = umeas_getPrefixBase(unitPrefix); int32_t otherUnitBase = umeas_getPrefixBase(other.unitPrefix); // Values for comparison purposes only. int32_t unitPower = unitBase == 1024 /* Binary Prefix */ ? umeas_getPrefixPower(unitPrefix) * 3 : umeas_getPrefixPower(unitPrefix); int32_t otherUnitPower = otherUnitBase == 1024 /* Binary Prefix */ ? umeas_getPrefixPower(other.unitPrefix) * 3 : umeas_getPrefixPower(other.unitPrefix); // NOTE: if the unitPower is less than the other, // we return 1 not -1. Thus because we want th sorting order // for the bigger prefix to be before the smaller. // Example: megabyte should come before kilobyte. 
if (unitPower < otherUnitPower) { return 1; } if (unitPower > otherUnitPower) { return -1; } if (unitBase < otherUnitBase) { return 1; } if (unitBase > otherUnitBase) { return -1; } return 0; } /** * Return whether this SingleUnitImpl is compatible with another for the purpose of coalescing. * * Units with the same base unit and SI or binary prefix should match, except that they must also * have the same dimensionality sign, such that we don't merge numerator and denominator. */ bool isCompatibleWith(const SingleUnitImpl& other) const { return (compareTo(other) == 0); } /** * Returns true if this unit is the "dimensionless base unit", as produced * by the MeasureUnit() default constructor. (This does not include the * likes of concentrations or angles.) */ bool isDimensionless() const { return index == -1; } /** * Simple unit index, unique for every simple unit, -1 for the dimensionless * unit. This is an index into a string list in measunit_extra.cpp, as * loaded by SimpleUnitIdentifiersSink. * * The default value is -1, meaning the dimensionless unit: * isDimensionless() will return true, until index is changed. */ int32_t index = -1; /** * SI or binary prefix. * * This is ignored for the dimensionless unit. */ UMeasurePrefix unitPrefix = UMEASURE_PREFIX_ONE; /** * Dimensionality. * * This is meaningless for the dimensionless unit. */ int32_t dimensionality = 1; }; // Forward declaration struct MeasureUnitImplWithIndex; // Export explicit template instantiations of MaybeStackArray, MemoryPool and // MaybeStackVector. This is required when building DLLs for Windows. (See // datefmt.h, collationiterator.h, erarules.h and others for similar examples.) #if U_PF_WINDOWS <= U_PLATFORM && U_PLATFORM <= U_PF_CYGWIN template class U_I18N_API MaybeStackArray; template class U_I18N_API MemoryPool; template class U_I18N_API MaybeStackVector; #endif /** * Internal representation of measurement units. 
Capable of representing all complexities of units, * including mixed and compound units. */ class U_I18N_API MeasureUnitImpl : public UMemory { public: MeasureUnitImpl() = default; MeasureUnitImpl(MeasureUnitImpl &&other) = default; // No copy constructor, use MeasureUnitImpl::copy() to make it explicit. MeasureUnitImpl(const MeasureUnitImpl &other, UErrorCode &status) = delete; MeasureUnitImpl(const SingleUnitImpl &singleUnit, UErrorCode &status); MeasureUnitImpl &operator=(MeasureUnitImpl &&other) noexcept = default; /** Extract the MeasureUnitImpl from a MeasureUnit. */ static inline const MeasureUnitImpl *get(const MeasureUnit &measureUnit) { return measureUnit.fImpl; } /** * Parse a unit identifier into a MeasureUnitImpl. * * @param identifier The unit identifier string. * @param status Set if the identifier string is not valid. * @return A newly parsed value object. Behaviour of this unit is * unspecified if an error is returned via status. */ static MeasureUnitImpl forIdentifier(StringPiece identifier, UErrorCode& status); /** * Extract the MeasureUnitImpl from a MeasureUnit, or parse if it is not present. * * @param measureUnit The source MeasureUnit. * @param memory A place to write the new MeasureUnitImpl if parsing is required. * @param status Set if an error occurs. * @return A reference to either measureUnit.fImpl or memory. */ static const MeasureUnitImpl& forMeasureUnit( const MeasureUnit& measureUnit, MeasureUnitImpl& memory, UErrorCode& status); /** * Extract the MeasureUnitImpl from a MeasureUnit, or parse if it is not present. * * @param measureUnit The source MeasureUnit. * @param status Set if an error occurs. * @return A value object, either newly parsed or copied from measureUnit. */ static MeasureUnitImpl forMeasureUnitMaybeCopy( const MeasureUnit& measureUnit, UErrorCode& status); /** * Used for currency units. 
*/ static inline MeasureUnitImpl forCurrencyCode(StringPiece currencyCode) { MeasureUnitImpl result; UErrorCode localStatus = U_ZERO_ERROR; result.identifier.append(currencyCode, localStatus); // localStatus is not expected to fail since currencyCode should be 3 chars long return result; } /** Transform this MeasureUnitImpl into a MeasureUnit, simplifying if possible. */ MeasureUnit build(UErrorCode& status) &&; /** * Create a copy of this MeasureUnitImpl. Don't use copy constructor to make this explicit. */ MeasureUnitImpl copy(UErrorCode& status) const; /** * Extracts the list of all the individual units inside the `MeasureUnitImpl` with their indices. * For example: * - if the `MeasureUnitImpl` is `foot-per-hour` * it will return a list of 1 {(0, `foot-per-hour`)} * - if the `MeasureUnitImpl` is `foot-and-inch` * it will return a list of 2 {(0, `foot`), (1, `inch`)} */ MaybeStackVector extractIndividualUnitsWithIndices(UErrorCode &status) const; /** Mutates this MeasureUnitImpl to take the reciprocal. */ void takeReciprocal(UErrorCode& status); /** * Returns a simplified version of the unit. * NOTE: the simplification happen when there are two units equals in their base unit and their * prefixes. * * Example 1: "square-meter-per-meter" --> "meter" * Example 2: "square-millimeter-per-meter" --> "square-millimeter-per-meter" */ MeasureUnitImpl copyAndSimplify(UErrorCode &status) const; /** * Mutates this MeasureUnitImpl to append a single unit. * * @return true if a new item was added. If unit is the dimensionless unit, * it is never added: the return value will always be false. */ bool appendSingleUnit(const SingleUnitImpl& singleUnit, UErrorCode& status); /** * Normalizes a MeasureUnitImpl and generate the identifier string in place. */ void serialize(UErrorCode &status); /** The complexity, either SINGLE, COMPOUND, or MIXED. */ UMeasureUnitComplexity complexity = UMEASURE_UNIT_SINGLE; /** * The list of single units. 
These may be summed or multiplied, based on the * value of the complexity field. * * The "dimensionless" unit (SingleUnitImpl default constructor) must not be * added to this list. */ MaybeStackVector singleUnits; /** * The full unit identifier. Owned by the MeasureUnitImpl. Empty if not computed. */ CharString identifier; // For calling serialize // TODO(icu-units#147): revisit serialization friend class number::impl::LongNameHandler; }; struct U_I18N_API MeasureUnitImplWithIndex : public UMemory { const int32_t index; MeasureUnitImpl unitImpl; // Makes a copy of unitImpl. MeasureUnitImplWithIndex(int32_t index, const MeasureUnitImpl &unitImpl, UErrorCode &status) : index(index), unitImpl(unitImpl.copy(status)) { } MeasureUnitImplWithIndex(int32_t index, const SingleUnitImpl &singleUnitImpl, UErrorCode &status) : index(index), unitImpl(MeasureUnitImpl(singleUnitImpl, status)) { } }; // Export explicit template instantiations of MaybeStackArray, MemoryPool and // MaybeStackVector. This is required when building DLLs for Windows. (See // datefmt.h, collationiterator.h, erarules.h and others for similar examples.) #if U_PF_WINDOWS <= U_PLATFORM && U_PLATFORM <= U_PF_CYGWIN template class U_I18N_API MaybeStackArray; template class U_I18N_API MemoryPool; template class U_I18N_API MaybeStackVector; // Export an explicit template instantiation of the LocalPointer that is used as a // data member of MeasureUnitImpl. // (When building DLLs for Windows this is required.) 
#if defined(_MSC_VER) // Ignore warning 4661 as LocalPointerBase does not use operator== or operator!= #pragma warning(push) #pragma warning(disable : 4661) #endif template class U_I18N_API LocalPointerBase; template class U_I18N_API LocalPointer; #if defined(_MSC_VER) #pragma warning(pop) #endif #endif U_NAMESPACE_END #endif /* #if !UCONFIG_NO_FORMATTING */ #endif //__MEASUNIT_IMPL_H__ stringi/src/icu74/i18n/uitercollationiterator.cpp0000644000176200001440000003410714700200761021547 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ******************************************************************************* * Copyright (C) 2012-2014, International Business Machines * Corporation and others. All Rights Reserved. ******************************************************************************* * uitercollationiterator.cpp * * created on: 2012sep23 (from utf16collationiterator.cpp) * created by: Markus W. Scherer */ #include "unicode/utypes.h" #if !UCONFIG_NO_COLLATION #include "unicode/uiter.h" #include "charstr.h" #include "cmemory.h" #include "collation.h" #include "collationdata.h" #include "collationfcd.h" #include "collationiterator.h" #include "normalizer2impl.h" #include "uassert.h" #include "uitercollationiterator.h" U_NAMESPACE_BEGIN UIterCollationIterator::~UIterCollationIterator() {} void UIterCollationIterator::resetToOffset(int32_t newOffset) { reset(); iter.move(&iter, newOffset, UITER_START); } int32_t UIterCollationIterator::getOffset() const { return iter.getIndex(&iter, UITER_CURRENT); } uint32_t UIterCollationIterator::handleNextCE32(UChar32 &c, UErrorCode & /*errorCode*/) { c = iter.next(&iter); if(c < 0) { return Collation::FALLBACK_CE32; } return UTRIE2_GET32_FROM_U16_SINGLE_LEAD(trie, c); } char16_t UIterCollationIterator::handleGetTrailSurrogate() { UChar32 trail = iter.next(&iter); if(!U16_IS_TRAIL(trail) && trail >= 0) { iter.previous(&iter); } return 
(char16_t)trail; } UChar32 UIterCollationIterator::nextCodePoint(UErrorCode & /*errorCode*/) { return uiter_next32(&iter); } UChar32 UIterCollationIterator::previousCodePoint(UErrorCode & /*errorCode*/) { return uiter_previous32(&iter); } void UIterCollationIterator::forwardNumCodePoints(int32_t num, UErrorCode & /*errorCode*/) { while(num > 0 && (uiter_next32(&iter)) >= 0) { --num; } } void UIterCollationIterator::backwardNumCodePoints(int32_t num, UErrorCode & /*errorCode*/) { while(num > 0 && (uiter_previous32(&iter)) >= 0) { --num; } } // FCDUIterCollationIterator ----------------------------------------------- *** FCDUIterCollationIterator::~FCDUIterCollationIterator() {} void FCDUIterCollationIterator::resetToOffset(int32_t newOffset) { UIterCollationIterator::resetToOffset(newOffset); start = newOffset; state = ITER_CHECK_FWD; } int32_t FCDUIterCollationIterator::getOffset() const { if(state <= ITER_CHECK_BWD) { return iter.getIndex(&iter, UITER_CURRENT); } else if(state == ITER_IN_FCD_SEGMENT) { return pos; } else if(pos == 0) { return start; } else { return limit; } } uint32_t FCDUIterCollationIterator::handleNextCE32(UChar32 &c, UErrorCode &errorCode) { for(;;) { if(state == ITER_CHECK_FWD) { c = iter.next(&iter); if(c < 0) { return Collation::FALLBACK_CE32; } if(CollationFCD::hasTccc(c)) { if(CollationFCD::maybeTibetanCompositeVowel(c) || CollationFCD::hasLccc(iter.current(&iter))) { iter.previous(&iter); if(!nextSegment(errorCode)) { c = U_SENTINEL; return Collation::FALLBACK_CE32; } continue; } } break; } else if(state == ITER_IN_FCD_SEGMENT && pos != limit) { c = iter.next(&iter); ++pos; U_ASSERT(c >= 0); break; } else if(state >= IN_NORM_ITER_AT_LIMIT && pos != normalized.length()) { c = normalized[pos++]; break; } else { switchToForward(); } } return UTRIE2_GET32_FROM_U16_SINGLE_LEAD(trie, c); } char16_t FCDUIterCollationIterator::handleGetTrailSurrogate() { if(state <= ITER_IN_FCD_SEGMENT) { UChar32 trail = iter.next(&iter); if(U16_IS_TRAIL(trail)) 
{ if(state == ITER_IN_FCD_SEGMENT) { ++pos; } } else if(trail >= 0) { iter.previous(&iter); } return (char16_t)trail; } else { U_ASSERT(pos < normalized.length()); char16_t trail; if(U16_IS_TRAIL(trail = normalized[pos])) { ++pos; } return trail; } } UChar32 FCDUIterCollationIterator::nextCodePoint(UErrorCode &errorCode) { UChar32 c; for(;;) { if(state == ITER_CHECK_FWD) { c = iter.next(&iter); if(c < 0) { return c; } if(CollationFCD::hasTccc(c)) { if(CollationFCD::maybeTibetanCompositeVowel(c) || CollationFCD::hasLccc(iter.current(&iter))) { iter.previous(&iter); if(!nextSegment(errorCode)) { return U_SENTINEL; } continue; } } if(U16_IS_LEAD(c)) { UChar32 trail = iter.next(&iter); if(U16_IS_TRAIL(trail)) { return U16_GET_SUPPLEMENTARY(c, trail); } else if(trail >= 0) { iter.previous(&iter); } } return c; } else if(state == ITER_IN_FCD_SEGMENT && pos != limit) { c = uiter_next32(&iter); pos += U16_LENGTH(c); U_ASSERT(c >= 0); return c; } else if(state >= IN_NORM_ITER_AT_LIMIT && pos != normalized.length()) { c = normalized.char32At(pos); pos += U16_LENGTH(c); return c; } else { switchToForward(); } } } UChar32 FCDUIterCollationIterator::previousCodePoint(UErrorCode &errorCode) { UChar32 c; for(;;) { if(state == ITER_CHECK_BWD) { c = iter.previous(&iter); if(c < 0) { start = pos = 0; state = ITER_IN_FCD_SEGMENT; return U_SENTINEL; } if(CollationFCD::hasLccc(c)) { UChar32 prev = U_SENTINEL; if(CollationFCD::maybeTibetanCompositeVowel(c) || CollationFCD::hasTccc(prev = iter.previous(&iter))) { iter.next(&iter); if(prev >= 0) { iter.next(&iter); } if(!previousSegment(errorCode)) { return U_SENTINEL; } continue; } // hasLccc(trail)=true for all trail surrogates if(U16_IS_TRAIL(c)) { if(prev < 0) { prev = iter.previous(&iter); } if(U16_IS_LEAD(prev)) { return U16_GET_SUPPLEMENTARY(prev, c); } } if(prev >= 0) { iter.next(&iter); } } return c; } else if(state == ITER_IN_FCD_SEGMENT && pos != start) { c = uiter_previous32(&iter); pos -= U16_LENGTH(c); U_ASSERT(c >= 0); 
return c; } else if(state >= IN_NORM_ITER_AT_LIMIT && pos != 0) { c = normalized.char32At(pos - 1); pos -= U16_LENGTH(c); return c; } else { switchToBackward(); } } } void FCDUIterCollationIterator::forwardNumCodePoints(int32_t num, UErrorCode &errorCode) { // Specify the class to avoid a virtual-function indirection. // In Java, we would declare this class final. while(num > 0 && FCDUIterCollationIterator::nextCodePoint(errorCode) >= 0) { --num; } } void FCDUIterCollationIterator::backwardNumCodePoints(int32_t num, UErrorCode &errorCode) { // Specify the class to avoid a virtual-function indirection. // In Java, we would declare this class final. while(num > 0 && FCDUIterCollationIterator::previousCodePoint(errorCode) >= 0) { --num; } } void FCDUIterCollationIterator::switchToForward() { U_ASSERT(state == ITER_CHECK_BWD || (state == ITER_IN_FCD_SEGMENT && pos == limit) || (state >= IN_NORM_ITER_AT_LIMIT && pos == normalized.length())); if(state == ITER_CHECK_BWD) { // Turn around from backward checking. start = pos = iter.getIndex(&iter, UITER_CURRENT); if(pos == limit) { state = ITER_CHECK_FWD; // Check forward. } else { // pos < limit state = ITER_IN_FCD_SEGMENT; // Stay in FCD segment. } } else { // Reached the end of the FCD segment. if(state == ITER_IN_FCD_SEGMENT) { // The input text segment is FCD, extend it forward. } else { // The input text segment needed to be normalized. // Switch to checking forward from it. if(state == IN_NORM_ITER_AT_START) { iter.move(&iter, limit - start, UITER_CURRENT); } start = limit; } state = ITER_CHECK_FWD; } } UBool FCDUIterCollationIterator::nextSegment(UErrorCode &errorCode) { if(U_FAILURE(errorCode)) { return false; } U_ASSERT(state == ITER_CHECK_FWD); // The input text [start..(iter index)[ passes the FCD check. pos = iter.getIndex(&iter, UITER_CURRENT); // Collect the characters being checked, in case they need to be normalized. 
UnicodeString s; uint8_t prevCC = 0; for(;;) { // Fetch the next character and its fcd16 value. UChar32 c = uiter_next32(&iter); if(c < 0) { break; } uint16_t fcd16 = nfcImpl.getFCD16(c); uint8_t leadCC = (uint8_t)(fcd16 >> 8); if(leadCC == 0 && !s.isEmpty()) { // FCD boundary before this character. uiter_previous32(&iter); break; } s.append(c); if(leadCC != 0 && (prevCC > leadCC || CollationFCD::isFCD16OfTibetanCompositeVowel(fcd16))) { // Fails FCD check. Find the next FCD boundary and normalize. for(;;) { c = uiter_next32(&iter); if(c < 0) { break; } if(nfcImpl.getFCD16(c) <= 0xff) { uiter_previous32(&iter); break; } s.append(c); } if(!normalize(s, errorCode)) { return false; } start = pos; limit = pos + s.length(); state = IN_NORM_ITER_AT_LIMIT; pos = 0; return true; } prevCC = (uint8_t)fcd16; if(prevCC == 0) { // FCD boundary after the last character. break; } } limit = pos + s.length(); U_ASSERT(pos != limit); iter.move(&iter, -s.length(), UITER_CURRENT); state = ITER_IN_FCD_SEGMENT; return true; } void FCDUIterCollationIterator::switchToBackward() { U_ASSERT(state == ITER_CHECK_FWD || (state == ITER_IN_FCD_SEGMENT && pos == start) || (state >= IN_NORM_ITER_AT_LIMIT && pos == 0)); if(state == ITER_CHECK_FWD) { // Turn around from forward checking. limit = pos = iter.getIndex(&iter, UITER_CURRENT); if(pos == start) { state = ITER_CHECK_BWD; // Check backward. } else { // pos > start state = ITER_IN_FCD_SEGMENT; // Stay in FCD segment. } } else { // Reached the start of the FCD segment. if(state == ITER_IN_FCD_SEGMENT) { // The input text segment is FCD, extend it backward. } else { // The input text segment needed to be normalized. // Switch to checking backward from it. 
if(state == IN_NORM_ITER_AT_LIMIT) { iter.move(&iter, start - limit, UITER_CURRENT); } limit = start; } state = ITER_CHECK_BWD; } } UBool FCDUIterCollationIterator::previousSegment(UErrorCode &errorCode) { if(U_FAILURE(errorCode)) { return false; } U_ASSERT(state == ITER_CHECK_BWD); // The input text [(iter index)..limit[ passes the FCD check. pos = iter.getIndex(&iter, UITER_CURRENT); // Collect the characters being checked, in case they need to be normalized. UnicodeString s; uint8_t nextCC = 0; for(;;) { // Fetch the previous character and its fcd16 value. UChar32 c = uiter_previous32(&iter); if(c < 0) { break; } uint16_t fcd16 = nfcImpl.getFCD16(c); uint8_t trailCC = (uint8_t)fcd16; if(trailCC == 0 && !s.isEmpty()) { // FCD boundary after this character. uiter_next32(&iter); break; } s.append(c); if(trailCC != 0 && ((nextCC != 0 && trailCC > nextCC) || CollationFCD::isFCD16OfTibetanCompositeVowel(fcd16))) { // Fails FCD check. Find the previous FCD boundary and normalize. while(fcd16 > 0xff) { c = uiter_previous32(&iter); if(c < 0) { break; } fcd16 = nfcImpl.getFCD16(c); if(fcd16 == 0) { (void)uiter_next32(&iter); break; } s.append(c); } s.reverse(); if(!normalize(s, errorCode)) { return false; } limit = pos; start = pos - s.length(); state = IN_NORM_ITER_AT_START; pos = normalized.length(); return true; } nextCC = (uint8_t)(fcd16 >> 8); if(nextCC == 0) { // FCD boundary before the following character. break; } } start = pos - s.length(); U_ASSERT(pos != start); iter.move(&iter, s.length(), UITER_CURRENT); state = ITER_IN_FCD_SEGMENT; return true; } UBool FCDUIterCollationIterator::normalize(const UnicodeString &s, UErrorCode &errorCode) { // NFD without argument checking. 
U_ASSERT(U_SUCCESS(errorCode)); nfcImpl.decompose(s, normalized, errorCode); return U_SUCCESS(errorCode); } U_NAMESPACE_END #endif // !UCONFIG_NO_COLLATION stringi/src/icu74/i18n/number_output.cpp0000644000176200001440000000506414700200761017650 0ustar liggesusers// © 2019 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html #include "unicode/utypes.h" #if !UCONFIG_NO_FORMATTING #include "unicode/measunit.h" #include "unicode/numberformatter.h" #include "number_utypes.h" #include "util.h" #include "number_decimalquantity.h" #include "number_decnum.h" #include "numrange_impl.h" U_NAMESPACE_BEGIN namespace number { UPRV_FORMATTED_VALUE_SUBCLASS_AUTO_IMPL(FormattedNumber) #define UPRV_NOARG void FormattedNumber::toDecimalNumber(ByteSink& sink, UErrorCode& status) const { UPRV_FORMATTED_VALUE_METHOD_GUARD(UPRV_NOARG) impl::DecNum decnum; fData->quantity.toDecNum(decnum, status); decnum.toString(sink, status); } void FormattedNumber::getAllFieldPositionsImpl(FieldPositionIteratorHandler& fpih, UErrorCode& status) const { UPRV_FORMATTED_VALUE_METHOD_GUARD(UPRV_NOARG) fData->getAllFieldPositions(fpih, status); } MeasureUnit FormattedNumber::getOutputUnit(UErrorCode& status) const { UPRV_FORMATTED_VALUE_METHOD_GUARD(MeasureUnit()) return fData->outputUnit; } UDisplayOptionsNounClass FormattedNumber::getNounClass(UErrorCode &status) const { UPRV_FORMATTED_VALUE_METHOD_GUARD(UDISPOPT_NOUN_CLASS_UNDEFINED); const char *nounClass = fData->gender; return udispopt_fromNounClassIdentifier(nounClass); } void FormattedNumber::getDecimalQuantity(impl::DecimalQuantity& output, UErrorCode& status) const { UPRV_FORMATTED_VALUE_METHOD_GUARD(UPRV_NOARG) output = fData->quantity; } impl::UFormattedNumberData::~UFormattedNumberData() = default; UPRV_FORMATTED_VALUE_SUBCLASS_AUTO_IMPL(FormattedNumberRange) #define UPRV_NOARG void FormattedNumberRange::getDecimalNumbers(ByteSink& sink1, ByteSink& sink2, UErrorCode& status) const { 
UPRV_FORMATTED_VALUE_METHOD_GUARD(UPRV_NOARG) impl::DecNum decnum1; impl::DecNum decnum2; fData->quantity1.toDecNum(decnum1, status).toString(sink1, status); fData->quantity2.toDecNum(decnum2, status).toString(sink2, status); } UNumberRangeIdentityResult FormattedNumberRange::getIdentityResult(UErrorCode& status) const { UPRV_FORMATTED_VALUE_METHOD_GUARD(UNUM_IDENTITY_RESULT_NOT_EQUAL) return fData->identityResult; } const impl::UFormattedNumberRangeData* FormattedNumberRange::getData(UErrorCode& status) const { UPRV_FORMATTED_VALUE_METHOD_GUARD(nullptr) return fData; } impl::UFormattedNumberRangeData::~UFormattedNumberRangeData() = default; } // namespace number U_NAMESPACE_END #endif /* #if !UCONFIG_NO_FORMATTING */ stringi/src/icu74/i18n/remtrans.h0000644000176200001440000000424014700200761016233 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ********************************************************************** * Copyright (c) 2001-2007, International Business Machines * Corporation and others. All Rights Reserved. ********************************************************************** * Date Name Description * 04/02/2001 aliu Creation. ********************************************************************** */ #ifndef REMTRANS_H #define REMTRANS_H #include "unicode/utypes.h" #if !UCONFIG_NO_TRANSLITERATION #include "unicode/translit.h" U_NAMESPACE_BEGIN /** * A transliterator that removes text. * @author Alan Liu */ class RemoveTransliterator : public Transliterator { public: /** * Constructs a transliterator. */ RemoveTransliterator(); /** * Destructor. */ virtual ~RemoveTransliterator(); /** * System registration hook. */ static void registerIDs(); /** * Transliterator API. * @return A copy of the object. */ virtual RemoveTransliterator* clone() const override; /** * Implements {@link Transliterator#handleTransliterate}. 
* @param text the buffer holding transliterated and * untransliterated text * @param offset the start and limit of the text, the position * of the cursor, and the start and limit of transliteration. * @param incremental if true, assume more text may be coming after * pos.contextLimit. Otherwise, assume the text is complete. */ virtual void handleTransliterate(Replaceable& text, UTransPosition& offset, UBool isIncremental) const override; /** * ICU "poor man's RTTI", returns a UClassID for the actual class. */ virtual UClassID getDynamicClassID() const override; /** * ICU "poor man's RTTI", returns a UClassID for this class. */ U_I18N_API static UClassID U_EXPORT2 getStaticClassID(); }; U_NAMESPACE_END #endif /* #if !UCONFIG_NO_TRANSLITERATION */ #endif stringi/src/icu74/i18n/scriptset.cpp0000644000176200001440000002057614700200761016765 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ********************************************************************** * Copyright (C) 2014, International Business Machines * Corporation and others. All Rights Reserved. 
********************************************************************** * * scriptset.cpp * * created on: 2013 Jan 7 * created by: Andy Heninger */ #include "unicode/utypes.h" #include "unicode/uchar.h" #include "unicode/unistr.h" #include "scriptset.h" #include "uassert.h" #include "cmemory.h" U_NAMESPACE_BEGIN //---------------------------------------------------------------------------- // // ScriptSet implementation // //---------------------------------------------------------------------------- ScriptSet::ScriptSet() { uprv_memset(bits, 0, sizeof(bits)); } ScriptSet::~ScriptSet() { } ScriptSet::ScriptSet(const ScriptSet &other) { *this = other; } ScriptSet & ScriptSet::operator =(const ScriptSet &other) { uprv_memcpy(bits, other.bits, sizeof(bits)); return *this; } bool ScriptSet::operator == (const ScriptSet &other) const { for (uint32_t i=0; i= SCRIPT_LIMIT) { status = U_ILLEGAL_ARGUMENT_ERROR; return false; } uint32_t index = script / 32; uint32_t bit = 1 << (script & 31); return ((bits[index] & bit) != 0); } ScriptSet &ScriptSet::set(UScriptCode script, UErrorCode &status) { if (U_FAILURE(status)) { return *this; } if (script < 0 || (int32_t)script >= SCRIPT_LIMIT) { status = U_ILLEGAL_ARGUMENT_ERROR; return *this; } uint32_t index = script / 32; uint32_t bit = 1 << (script & 31); bits[index] |= bit; return *this; } ScriptSet &ScriptSet::reset(UScriptCode script, UErrorCode &status) { if (U_FAILURE(status)) { return *this; } if (script < 0 || (int32_t)script >= SCRIPT_LIMIT) { status = U_ILLEGAL_ARGUMENT_ERROR; return *this; } uint32_t index = script / 32; uint32_t bit = 1 << (script & 31); bits[index] &= ~bit; return *this; } ScriptSet &ScriptSet::Union(const ScriptSet &other) { for (uint32_t i=0; iintersect(t); } return *this; } UBool ScriptSet::intersects(const ScriptSet &other) const { for (uint32_t i=0; i 0) { count++; x &= (x - 1); // and off the least significant one bit. 
} } return count; } int32_t ScriptSet::hashCode() const { int32_t hash = 0; for (int32_t i=0; i= 0; i = nextSetBit(i + 1)) { if (!firstTime) { dest.append((char16_t)0x20); } firstTime = false; const char *scriptName = uscript_getShortName((UScriptCode(i))); dest.append(UnicodeString(scriptName, -1, US_INV)); } return dest; } ScriptSet &ScriptSet::parseScripts(const UnicodeString &scriptString, UErrorCode &status) { resetAll(); if (U_FAILURE(status)) { return *this; } UnicodeString oneScriptName; for (int32_t i=0; i 0) { char buf[40]; oneScriptName.extract(0, oneScriptName.length(), buf, sizeof(buf)-1, US_INV); buf[sizeof(buf)-1] = 0; int32_t sc = u_getPropertyValueEnum(UCHAR_SCRIPT, buf); if (sc == UCHAR_INVALID_CODE) { status = U_ILLEGAL_ARGUMENT_ERROR; } else { this->set((UScriptCode)sc, status); } if (U_FAILURE(status)) { return *this; } oneScriptName.remove(); } } return *this; } void ScriptSet::setScriptExtensions(UChar32 codePoint, UErrorCode& status) { if (U_FAILURE(status)) { return; } static const int32_t FIRST_GUESS_SCRIPT_CAPACITY = 20; MaybeStackArray scripts; UErrorCode internalStatus = U_ZERO_ERROR; int32_t script_count = -1; while (true) { script_count = uscript_getScriptExtensions( codePoint, scripts.getAlias(), scripts.getCapacity(), &internalStatus); if (internalStatus == U_BUFFER_OVERFLOW_ERROR) { // Need to allocate more space if (scripts.resize(script_count) == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; return; } internalStatus = U_ZERO_ERROR; } else { break; } } // Check if we failed for some reason other than buffer overflow if (U_FAILURE(internalStatus)) { status = internalStatus; return; } // Load the scripts into the ScriptSet and return for (int32_t i = 0; i < script_count; i++) { this->set(scripts[i], status); if (U_FAILURE(status)) { return; } } } U_NAMESPACE_END U_CAPI UBool U_EXPORT2 uhash_equalsScriptSet(const UElement key1, const UElement key2) { icu::ScriptSet *s1 = static_cast(key1.pointer); icu::ScriptSet *s2 = 
static_cast(key2.pointer); return (*s1 == *s2); } U_CAPI int8_t U_EXPORT2 uhash_compareScriptSet(UElement key0, UElement key1) { icu::ScriptSet *s0 = static_cast(key0.pointer); icu::ScriptSet *s1 = static_cast(key1.pointer); int32_t diff = s0->countMembers() - s1->countMembers(); if (diff != 0) return static_cast(diff); int32_t i0 = s0->nextSetBit(0); int32_t i1 = s1->nextSetBit(0); while ((diff = i0-i1) == 0 && i0 > 0) { i0 = s0->nextSetBit(i0+1); i1 = s1->nextSetBit(i1+1); } return (int8_t)diff; } U_CAPI int32_t U_EXPORT2 uhash_hashScriptSet(const UElement key) { icu::ScriptSet *s = static_cast(key.pointer); return s->hashCode(); } U_CAPI void U_EXPORT2 uhash_deleteScriptSet(void *obj) { icu::ScriptSet *s = static_cast(obj); delete s; } stringi/src/icu74/i18n/anytrans.h0000644000176200001440000000746614700200761016254 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* *********************************************************************** * Copyright (c) 2002-2007, International Business Machines Corporation * and others. All Rights Reserved. *********************************************************************** * Date Name Description * 06/06/2002 aliu Creation. *********************************************************************** */ #ifndef _ANYTRANS_H_ #define _ANYTRANS_H_ #include "unicode/utypes.h" #if !UCONFIG_NO_TRANSLITERATION #include "unicode/translit.h" #include "unicode/uscript.h" #include "uhash.h" U_NAMESPACE_BEGIN /** * A transliterator named Any-T or Any-T/V, where T is the target * script and V is the optional variant, that uses multiple * transliterators, all going to T or T/V, all with script sources. * The target must be a script. It partitions text into runs of the * same script, and then based on the script of each run, * transliterates from that script to the given target or * target/variant. 
Adjacent COMMON or INHERITED script characters are * included in each run. * * @author Alan Liu */ class AnyTransliterator : public Transliterator { /** * Cache mapping UScriptCode values to Transliterator*. */ UHashtable* cache; /** * The target or target/variant string. */ UnicodeString target; /** * The target script code. Never USCRIPT_INVALID_CODE. */ UScriptCode targetScript; public: /** * Destructor. */ virtual ~AnyTransliterator(); /** * Copy constructor. */ AnyTransliterator(const AnyTransliterator&); /** * Transliterator API. */ virtual AnyTransliterator* clone() const override; /** * Implements {@link Transliterator#handleTransliterate}. */ virtual void handleTransliterate(Replaceable& text, UTransPosition& index, UBool incremental) const override; /** * ICU "poor man's RTTI", returns a UClassID for the actual class. */ virtual UClassID getDynamicClassID() const override; /** * ICU "poor man's RTTI", returns a UClassID for this class. */ U_I18N_API static UClassID U_EXPORT2 getStaticClassID(); private: /** * Private constructor * @param id the ID of the form S-T or S-T/V, where T is theTarget * and V is theVariant. Must not be empty. * @param theTarget the target name. Must not be empty, and must * name a script corresponding to theTargetScript. * @param theVariant the variant name, or the empty string if * there is no variant * @param theTargetScript the script code corresponding to * theTarget. * @param ec error code, fails if the internal hashtable cannot be * allocated */ AnyTransliterator(const UnicodeString& id, const UnicodeString& theTarget, const UnicodeString& theVariant, UScriptCode theTargetScript, UErrorCode& ec); /** * Returns a transliterator from the given source to our target or * target/variant. Returns nullptr if the source is the same as our * target script, or if the source is USCRIPT_INVALID_CODE. * Caches the result and returns the same transliterator the next * time. The caller does NOT own the result and must not delete * it. 
*/ Transliterator* getTransliterator(UScriptCode source) const; /** * Registers standard transliterators with the system. Called by * Transliterator during initialization. */ static void registerIDs(); friend class Transliterator; // for registerIDs() }; U_NAMESPACE_END #endif /* #if !UCONFIG_NO_TRANSLITERATION */ #endif stringi/src/icu74/i18n/sharedcalendar.h0000644000176200001440000000232114700200761017336 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ****************************************************************************** * Copyright (C) 2014, International Business Machines * Corporation and others. All Rights Reserved. ****************************************************************************** * sharedcalendar.h */ #ifndef __SHARED_CALENDAR_H__ #define __SHARED_CALENDAR_H__ #include "unicode/utypes.h" #include "sharedobject.h" #include "unifiedcache.h" U_NAMESPACE_BEGIN class Calendar; class U_I18N_API SharedCalendar : public SharedObject { public: SharedCalendar(Calendar *calToAdopt) : ptr(calToAdopt) { } virtual ~SharedCalendar(); const Calendar *get() const { return ptr; } const Calendar *operator->() const { return ptr; } const Calendar &operator*() const { return *ptr; } private: Calendar *ptr; SharedCalendar(const SharedCalendar &) = delete; SharedCalendar &operator=(const SharedCalendar &) = delete; }; template<> U_I18N_API const SharedCalendar *LocaleCacheKey::createObject( const void * /*unusedCreationContext*/, UErrorCode &status) const; U_NAMESPACE_END #endif stringi/src/icu74/i18n/double-conversion-strtod.h0000644000176200001440000000673514700200761021365 0ustar liggesusers// © 2018 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html // // From the double-conversion library. Original license: // // Copyright 2010 the V8 project authors. All rights reserved. 
// Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are // met: // // * Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // * Redistributions in binary form must reproduce the above // copyright notice, this list of conditions and the following // disclaimer in the documentation and/or other materials provided // with the distribution. // * Neither the name of Google Inc. nor the names of its // contributors may be used to endorse or promote products derived // from this software without specific prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // ICU PATCH: ifdef around UCONFIG_NO_FORMATTING #include "unicode/utypes.h" #if !UCONFIG_NO_FORMATTING #ifndef DOUBLE_CONVERSION_STRTOD_H_ #define DOUBLE_CONVERSION_STRTOD_H_ // ICU PATCH: Customize header file paths for ICU. #include "double-conversion-utils.h" // ICU PATCH: Wrap in ICU namespace U_NAMESPACE_BEGIN namespace double_conversion { // The buffer must only contain digits in the range [0-9]. It must not // contain a dot or a sign. It must not start with '0', and must not be empty. 
double Strtod(Vector buffer, int exponent); // The buffer must only contain digits in the range [0-9]. It must not // contain a dot or a sign. It must not start with '0', and must not be empty. float Strtof(Vector buffer, int exponent); // Same as Strtod, but assumes that 'trimmed' is already trimmed, as if run // through TrimAndCut. That is, 'trimmed' must have no leading or trailing // zeros, must not be a lone zero, and must not have 'too many' digits. double StrtodTrimmed(Vector trimmed, int exponent); // Same as Strtof, but assumes that 'trimmed' is already trimmed, as if run // through TrimAndCut. That is, 'trimmed' must have no leading or trailing // zeros, must not be a lone zero, and must not have 'too many' digits. float StrtofTrimmed(Vector trimmed, int exponent); inline Vector TrimTrailingZeros(Vector buffer) { for (int i = buffer.length() - 1; i >= 0; --i) { if (buffer[i] != '0') { return buffer.SubVector(0, i + 1); } } return Vector(buffer.start(), 0); } } // namespace double_conversion // ICU PATCH: Close ICU namespace U_NAMESPACE_END #endif // DOUBLE_CONVERSION_STRTOD_H_ #endif // ICU PATCH: close #if !UCONFIG_NO_FORMATTING stringi/src/icu74/i18n/rbt_set.cpp0000644000176200001440000003675214700200761016412 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ********************************************************************** * Copyright (C) 1999-2011, International Business Machines * Corporation and others. All Rights Reserved. ********************************************************************** * Date Name Description * 11/17/99 aliu Creation. 
********************************************************************** */ #include "unicode/utypes.h" #if !UCONFIG_NO_TRANSLITERATION #include "unicode/unistr.h" #include "unicode/uniset.h" #include "unicode/utf16.h" #include "rbt_set.h" #include "rbt_rule.h" #include "cmemory.h" #include "putilimp.h" U_CDECL_BEGIN static void U_CALLCONV _deleteRule(void *rule) { delete (icu::TransliterationRule *)rule; } U_CDECL_END //---------------------------------------------------------------------- // BEGIN Debugging support //---------------------------------------------------------------------- // #define DEBUG_RBT #ifdef DEBUG_RBT #include #include "charstr.h" /** * @param appendTo result is appended to this param. * @param input the string being transliterated * @param pos the index struct */ static UnicodeString& _formatInput(UnicodeString &appendTo, const UnicodeString& input, const UTransPosition& pos) { // Output a string of the form aaa{bbb|ccc|ddd}eee, where // the {} indicate the context start and limit, and the || // indicate the start and limit. if (0 <= pos.contextStart && pos.contextStart <= pos.start && pos.start <= pos.limit && pos.limit <= pos.contextLimit && pos.contextLimit <= input.length()) { UnicodeString a, b, c, d, e; input.extractBetween(0, pos.contextStart, a); input.extractBetween(pos.contextStart, pos.start, b); input.extractBetween(pos.start, pos.limit, c); input.extractBetween(pos.limit, pos.contextLimit, d); input.extractBetween(pos.contextLimit, input.length(), e); appendTo.append(a).append((char16_t)123/*{*/).append(b). append((char16_t)124/*|*/).append(c).append((char16_t)124/*|*/).append(d). 
append((char16_t)125/*}*/).append(e); } else { appendTo.append("INVALID UTransPosition"); //appendTo.append((UnicodeString)"INVALID UTransPosition {cs=" + // pos.contextStart + ", s=" + pos.start + ", l=" + // pos.limit + ", cl=" + pos.contextLimit + "} on " + // input); } return appendTo; } // Append a hex string to the target UnicodeString& _appendHex(uint32_t number, int32_t digits, UnicodeString& target) { static const char16_t digitString[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0 }; while (digits--) { target += digitString[(number >> (digits*4)) & 0xF]; } return target; } // Replace nonprintable characters with unicode escapes UnicodeString& _escape(const UnicodeString &source, UnicodeString &target) { for (int32_t i = 0; i < source.length(); ) { UChar32 ch = source.char32At(i); i += U16_LENGTH(ch); if (ch < 0x09 || (ch > 0x0A && ch < 0x20)|| ch > 0x7E) { if (ch <= 0xFFFF) { target += "\\u"; _appendHex(ch, 4, target); } else { target += "\\U"; _appendHex(ch, 8, target); } } else { target += ch; } } return target; } inline void _debugOut(const char* msg, TransliterationRule* rule, const Replaceable& theText, UTransPosition& pos) { UnicodeString buf(msg, ""); if (rule) { UnicodeString r; rule->toRule(r, true); buf.append((char16_t)32).append(r); } buf.append(UnicodeString(" => ", "")); UnicodeString* text = (UnicodeString*)&theText; _formatInput(buf, *text, pos); UnicodeString esc; _escape(buf, esc); CharString cbuf(esc); printf("%s\n", (const char*) cbuf); } #else #define _debugOut(msg, rule, theText, pos) #endif //---------------------------------------------------------------------- // END Debugging support //---------------------------------------------------------------------- // Fill the precontext and postcontext with the patterns of the rules // that are masking one another. 
static void maskingError(const icu::TransliterationRule& rule1, const icu::TransliterationRule& rule2, UParseError& parseError) { icu::UnicodeString r; int32_t len; parseError.line = parseError.offset = -1; // for pre-context rule1.toRule(r, false); len = uprv_min(r.length(), U_PARSE_CONTEXT_LEN-1); r.extract(0, len, parseError.preContext); parseError.preContext[len] = 0; //for post-context r.truncate(0); rule2.toRule(r, false); len = uprv_min(r.length(), U_PARSE_CONTEXT_LEN-1); r.extract(0, len, parseError.postContext); parseError.postContext[len] = 0; } U_NAMESPACE_BEGIN /** * Construct a new empty rule set. */ TransliterationRuleSet::TransliterationRuleSet(UErrorCode& status) : UMemory(), ruleVector(nullptr), rules(nullptr), index {}, maxContextLength(0) { LocalPointer lpRuleVector(new UVector(_deleteRule, nullptr, status), status); if (U_FAILURE(status)) { return; } ruleVector = lpRuleVector.orphan(); } /** * Copy constructor. */ TransliterationRuleSet::TransliterationRuleSet(const TransliterationRuleSet& other) : UMemory(other), ruleVector(nullptr), rules(nullptr), maxContextLength(other.maxContextLength) { int32_t i, len; uprv_memcpy(index, other.index, sizeof(index)); UErrorCode status = U_ZERO_ERROR; LocalPointer lpRuleVector(new UVector(_deleteRule, nullptr, status), status); if (U_FAILURE(status)) { return; } ruleVector = lpRuleVector.orphan(); if (other.ruleVector != nullptr && U_SUCCESS(status)) { len = other.ruleVector->size(); for (i=0; i tempTranslitRule( new TransliterationRule(*(TransliterationRule*)other.ruleVector->elementAt(i)), status); ruleVector->adoptElement(tempTranslitRule.orphan(), status); } } if (other.rules != 0 && U_SUCCESS(status)) { UParseError p; freeze(p, status); } } /** * Destructor. 
*/ TransliterationRuleSet::~TransliterationRuleSet() { delete ruleVector; // This deletes the contained rules uprv_free(rules); } void TransliterationRuleSet::setData(const TransliterationRuleData* d) { /** * We assume that the ruleset has already been frozen. */ int32_t len = index[256]; // see freeze() for (int32_t i=0; isetData(d); } } /** * Return the maximum context length. * @return the length of the longest preceding context. */ int32_t TransliterationRuleSet::getMaximumContextLength() const { return maxContextLength; } /** * Add a rule to this set. Rules are added in order, and order is * significant. The last call to this method must be followed by * a call to freeze() before the rule set is used. * *

If freeze() has already been called, calling addRule() * unfreezes the rules, and freeze() must be called again. * * @param adoptedRule the rule to add */ void TransliterationRuleSet::addRule(TransliterationRule* adoptedRule, UErrorCode& status) { LocalPointer lpAdoptedRule(adoptedRule); ruleVector->adoptElement(lpAdoptedRule.orphan(), status); if (U_FAILURE(status)) { return; } int32_t len; if ((len = adoptedRule->getContextLength()) > maxContextLength) { maxContextLength = len; } uprv_free(rules); rules = 0; } /** * Check this for masked rules and index it to optimize performance. * The sequence of operations is: (1) add rules to a set using * addRule(); (2) freeze the set using * freeze(); (3) use the rule set. If * addRule() is called after calling this method, it * invalidates this object, and this method must be called again. * That is, freeze() may be called multiple times, * although for optimal performance it shouldn't be. */ void TransliterationRuleSet::freeze(UParseError& parseError,UErrorCode& status) { /* Construct the rule array and index table. We reorder the * rules by sorting them into 256 bins. Each bin contains all * rules matching the index value for that bin. A rule * matches an index value if string whose first key character * has a low byte equal to the index value can match the rule. * * Each bin contains zero or more rules, in the same order * they were found originally. However, the total rules in * the bins may exceed the number in the original vector, * since rules that have a variable as their first key * character will generally fall into more than one bin. * * That is, each bin contains all rules that either have that * first index value as their first key character, or have * a set containing the index value as their first character. */ int32_t n = ruleVector->size(); int32_t j; int16_t x; UVector v(2*n, status); // heuristic; adjust as needed if (U_FAILURE(status)) { return; } /* Precompute the index values. 
This saves a LOT of time. * Be careful not to call malloc(0). */ int16_t* indexValue = (int16_t*) uprv_malloc( sizeof(int16_t) * (n > 0 ? n : 1) ); /* test for nullptr */ if (indexValue == 0) { status = U_MEMORY_ALLOCATION_ERROR; return; } for (j=0; jelementAt(j); indexValue[j] = r->getIndexValue(); } for (x=0; x<256; ++x) { index[x] = v.size(); for (j=0; j= 0) { if (indexValue[j] == x) { v.addElement(ruleVector->elementAt(j), status); } } else { // If the indexValue is < 0, then the first key character is // a set, and we must use the more time-consuming // matchesIndexValue check. In practice this happens // rarely, so we seldom treat this code path. TransliterationRule* r = (TransliterationRule*) ruleVector->elementAt(j); if (r->matchesIndexValue((uint8_t)x)) { v.addElement(r, status); } } } } uprv_free(indexValue); index[256] = v.size(); if (U_FAILURE(status)) { return; } /* Freeze things into an array. */ uprv_free(rules); // Contains alias pointers /* You can't do malloc(0)! */ if (v.size() == 0) { rules = nullptr; return; } rules = (TransliterationRule **)uprv_malloc(v.size() * sizeof(TransliterationRule *)); /* test for nullptr */ if (rules == 0) { status = U_MEMORY_ALLOCATION_ERROR; return; } for (j=0; jmasks(*r2)) { //| if (errors == null) { //| errors = new StringBuffer(); //| } else { //| errors.append("\n"); //| } //| errors.append("Rule " + r1 + " masks " + r2); status = U_RULE_MASK_ERROR; maskingError(*r1, *r2, parseError); return; } } } } //if (errors != null) { // throw new IllegalArgumentException(errors.toString()); //} } /** * Transliterate the given text with the given UTransPosition * indices. Return true if the transliteration should continue * or false if it should halt (because of a U_PARTIAL_MATCH match). * Note that false is only ever returned if isIncremental is true. 
* @param text the text to be transliterated * @param pos the position indices, which will be updated * @param incremental if true, assume new text may be inserted * at index.limit, and return false if there is a partial match. * @return true unless a U_PARTIAL_MATCH has been obtained, * indicating that transliteration should stop until more text * arrives. */ UBool TransliterationRuleSet::transliterate(Replaceable& text, UTransPosition& pos, UBool incremental) { int16_t indexByte = (int16_t) (text.char32At(pos.start) & 0xFF); for (int32_t i=index[indexByte]; imatchAndReplace(text, pos, incremental); switch (m) { case U_MATCH: _debugOut("match", rules[i], text, pos); return true; case U_PARTIAL_MATCH: _debugOut("partial match", rules[i], text, pos); return false; default: /* Ram: added default to make GCC happy */ break; } } // No match or partial match from any rule pos.start += U16_LENGTH(text.char32At(pos.start)); _debugOut("no match", nullptr, text, pos); return true; } /** * Create rule strings that represents this rule set. */ UnicodeString& TransliterationRuleSet::toRules(UnicodeString& ruleSource, UBool escapeUnprintable) const { int32_t i; int32_t count = ruleVector->size(); ruleSource.truncate(0); for (i=0; ielementAt(i); r->toRule(ruleSource, escapeUnprintable); } return ruleSource; } /** * Return the set of all characters that may be modified * (getTarget=false) or emitted (getTarget=true) by this set. */ UnicodeSet& TransliterationRuleSet::getSourceTargetSet(UnicodeSet& result, UBool getTarget) const { result.clear(); int32_t count = ruleVector->size(); for (int32_t i=0; ielementAt(i); if (getTarget) { r->addTargetSetTo(result); } else { r->addSourceSetTo(result); } } return result; } U_NAMESPACE_END #endif /* #if !UCONFIG_NO_TRANSLITERATION */ stringi/src/icu74/i18n/number_integerwidth.cpp0000644000176200001440000000461214700200761021003 0ustar liggesusers// © 2017 and later: Unicode, Inc. and others. 
// License & terms of use: http://www.unicode.org/copyright.html #include "unicode/utypes.h" #if !UCONFIG_NO_FORMATTING #include "unicode/numberformatter.h" #include "number_types.h" #include "number_decimalquantity.h" using namespace icu; using namespace icu::number; using namespace icu::number::impl; IntegerWidth::IntegerWidth(digits_t minInt, digits_t maxInt, bool formatFailIfMoreThanMaxDigits) { fUnion.minMaxInt.fMinInt = minInt; fUnion.minMaxInt.fMaxInt = maxInt; fUnion.minMaxInt.fFormatFailIfMoreThanMaxDigits = formatFailIfMoreThanMaxDigits; } IntegerWidth IntegerWidth::zeroFillTo(int32_t minInt) { if (minInt >= 0 && minInt <= kMaxIntFracSig) { return {static_cast(minInt), -1, false}; } else { return {U_NUMBER_ARG_OUTOFBOUNDS_ERROR}; } } IntegerWidth IntegerWidth::truncateAt(int32_t maxInt) { if (fHasError) { return *this; } // No-op on error digits_t minInt = fUnion.minMaxInt.fMinInt; if (maxInt >= 0 && maxInt <= kMaxIntFracSig && minInt <= maxInt) { return {minInt, static_cast(maxInt), false}; } else if (maxInt == -1) { return {minInt, -1, false}; } else { return {U_NUMBER_ARG_OUTOFBOUNDS_ERROR}; } } void IntegerWidth::apply(impl::DecimalQuantity& quantity, UErrorCode& status) const { if (U_FAILURE(status)) { return; } if (fHasError) { status = U_ILLEGAL_ARGUMENT_ERROR; } else if (fUnion.minMaxInt.fMaxInt == -1) { quantity.setMinInteger(fUnion.minMaxInt.fMinInt); } else { // Enforce the backwards-compatibility feature "FormatFailIfMoreThanMaxDigits" if (fUnion.minMaxInt.fFormatFailIfMoreThanMaxDigits && fUnion.minMaxInt.fMaxInt < quantity.getMagnitude()) { status = U_ILLEGAL_ARGUMENT_ERROR; } quantity.setMinInteger(fUnion.minMaxInt.fMinInt); quantity.applyMaxInteger(fUnion.minMaxInt.fMaxInt); } } bool IntegerWidth::operator==(const IntegerWidth& other) const { // Private operator==; do error and bogus checking first! 
U_ASSERT(!fHasError); U_ASSERT(!other.fHasError); U_ASSERT(!isBogus()); U_ASSERT(!other.isBogus()); return fUnion.minMaxInt.fMinInt == other.fUnion.minMaxInt.fMinInt && fUnion.minMaxInt.fMaxInt == other.fUnion.minMaxInt.fMaxInt; } #endif /* #if !UCONFIG_NO_FORMATTING */ stringi/src/icu74/i18n/basictz.cpp0000644000176200001440000005140114700200761016373 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ******************************************************************************* * Copyright (C) 2007-2013, International Business Machines Corporation and * others. All Rights Reserved. ******************************************************************************* */ #include "unicode/utypes.h" #if !UCONFIG_NO_FORMATTING #include "unicode/basictz.h" #include "gregoimp.h" #include "uvector.h" #include "cmemory.h" U_NAMESPACE_BEGIN #define MILLIS_PER_YEAR (365*24*60*60*1000.0) BasicTimeZone::BasicTimeZone() : TimeZone() { } BasicTimeZone::BasicTimeZone(const UnicodeString &id) : TimeZone(id) { } BasicTimeZone::BasicTimeZone(const BasicTimeZone& source) : TimeZone(source) { } BasicTimeZone::~BasicTimeZone() { } UBool BasicTimeZone::hasEquivalentTransitions(const BasicTimeZone& tz, UDate start, UDate end, UBool ignoreDstAmount, UErrorCode& status) const { if (U_FAILURE(status)) { return false; } if (hasSameRules(tz)) { return true; } // Check the offsets at the start time int32_t raw1, raw2, dst1, dst2; getOffset(start, false, raw1, dst1, status); if (U_FAILURE(status)) { return false; } tz.getOffset(start, false, raw2, dst2, status); if (U_FAILURE(status)) { return false; } if (ignoreDstAmount) { if ((raw1 + dst1 != raw2 + dst2) || (dst1 != 0 && dst2 == 0) || (dst1 == 0 && dst2 != 0)) { return false; } } else { if (raw1 != raw2 || dst1 != dst2) { return false; } } // Check transitions in the range UDate time = start; TimeZoneTransition tr1, tr2; while (true) { UBool avail1 = 
getNextTransition(time, false, tr1); UBool avail2 = tz.getNextTransition(time, false, tr2); if (ignoreDstAmount) { // Skip a transition which only differ the amount of DST savings while (true) { if (avail1 && tr1.getTime() <= end && (tr1.getFrom()->getRawOffset() + tr1.getFrom()->getDSTSavings() == tr1.getTo()->getRawOffset() + tr1.getTo()->getDSTSavings()) && (tr1.getFrom()->getDSTSavings() != 0 && tr1.getTo()->getDSTSavings() != 0)) { getNextTransition(tr1.getTime(), false, tr1); } else { break; } } while (true) { if (avail2 && tr2.getTime() <= end && (tr2.getFrom()->getRawOffset() + tr2.getFrom()->getDSTSavings() == tr2.getTo()->getRawOffset() + tr2.getTo()->getDSTSavings()) && (tr2.getFrom()->getDSTSavings() != 0 && tr2.getTo()->getDSTSavings() != 0)) { tz.getNextTransition(tr2.getTime(), false, tr2); } else { break; } } } UBool inRange1 = (avail1 && tr1.getTime() <= end); UBool inRange2 = (avail2 && tr2.getTime() <= end); if (!inRange1 && !inRange2) { // No more transition in the range break; } if (!inRange1 || !inRange2) { return false; } if (tr1.getTime() != tr2.getTime()) { return false; } if (ignoreDstAmount) { if (tr1.getTo()->getRawOffset() + tr1.getTo()->getDSTSavings() != tr2.getTo()->getRawOffset() + tr2.getTo()->getDSTSavings() || (tr1.getTo()->getDSTSavings() != 0 && tr2.getTo()->getDSTSavings() == 0) || (tr1.getTo()->getDSTSavings() == 0 && tr2.getTo()->getDSTSavings() != 0)) { return false; } } else { if (tr1.getTo()->getRawOffset() != tr2.getTo()->getRawOffset() || tr1.getTo()->getDSTSavings() != tr2.getTo()->getDSTSavings()) { return false; } } time = tr1.getTime(); } return true; } void BasicTimeZone::getSimpleRulesNear(UDate date, InitialTimeZoneRule*& initial, AnnualTimeZoneRule*& std, AnnualTimeZoneRule*& dst, UErrorCode& status) const { initial = nullptr; std = nullptr; dst = nullptr; if (U_FAILURE(status)) { return; } int32_t initialRaw, initialDst; UnicodeString initialName; AnnualTimeZoneRule *ar1 = nullptr; AnnualTimeZoneRule *ar2 = 
nullptr; UnicodeString name; UBool avail; TimeZoneTransition tr; // Get the next transition avail = getNextTransition(date, false, tr); if (avail) { tr.getFrom()->getName(initialName); initialRaw = tr.getFrom()->getRawOffset(); initialDst = tr.getFrom()->getDSTSavings(); // Check if the next transition is either DST->STD or STD->DST and // within roughly 1 year from the specified date UDate nextTransitionTime = tr.getTime(); if (((tr.getFrom()->getDSTSavings() == 0 && tr.getTo()->getDSTSavings() != 0) || (tr.getFrom()->getDSTSavings() != 0 && tr.getTo()->getDSTSavings() == 0)) && (date + MILLIS_PER_YEAR > nextTransitionTime)) { int32_t year, month, dom, dow, doy, mid; UDate d; // Get local wall time for the next transition time Grego::timeToFields(nextTransitionTime + initialRaw + initialDst, year, month, dom, dow, doy, mid); int32_t weekInMonth = Grego::dayOfWeekInMonth(year, month, dom); // Create DOW rule DateTimeRule *dtr = new DateTimeRule(month, weekInMonth, dow, mid, DateTimeRule::WALL_TIME); tr.getTo()->getName(name); // Note: SimpleTimeZone does not support raw offset change. // So we always use raw offset of the given time for the rule, // even raw offset is changed. This will result that the result // zone to return wrong offset after the transition. // When we encounter such case, we do not inspect next next // transition for another rule. 
ar1 = new AnnualTimeZoneRule(name, initialRaw, tr.getTo()->getDSTSavings(), dtr, year, AnnualTimeZoneRule::MAX_YEAR); if (tr.getTo()->getRawOffset() == initialRaw) { // Get the next next transition avail = getNextTransition(nextTransitionTime, false, tr); if (avail) { // Check if the next next transition is either DST->STD or STD->DST // and within roughly 1 year from the next transition if (((tr.getFrom()->getDSTSavings() == 0 && tr.getTo()->getDSTSavings() != 0) || (tr.getFrom()->getDSTSavings() != 0 && tr.getTo()->getDSTSavings() == 0)) && nextTransitionTime + MILLIS_PER_YEAR > tr.getTime()) { // Get local wall time for the next transition time Grego::timeToFields(tr.getTime() + tr.getFrom()->getRawOffset() + tr.getFrom()->getDSTSavings(), year, month, dom, dow, doy, mid); weekInMonth = Grego::dayOfWeekInMonth(year, month, dom); // Generate another DOW rule dtr = new DateTimeRule(month, weekInMonth, dow, mid, DateTimeRule::WALL_TIME); tr.getTo()->getName(name); ar2 = new AnnualTimeZoneRule(name, tr.getTo()->getRawOffset(), tr.getTo()->getDSTSavings(), dtr, year - 1, AnnualTimeZoneRule::MAX_YEAR); // Make sure this rule can be applied to the specified date avail = ar2->getPreviousStart(date, tr.getFrom()->getRawOffset(), tr.getFrom()->getDSTSavings(), true, d); if (!avail || d > date || initialRaw != tr.getTo()->getRawOffset() || initialDst != tr.getTo()->getDSTSavings()) { // We cannot use this rule as the second transition rule delete ar2; ar2 = nullptr; } } } } if (ar2 == nullptr) { // Try previous transition avail = getPreviousTransition(date, true, tr); if (avail) { // Check if the previous transition is either DST->STD or STD->DST. // The actual transition time does not matter here. 
if ((tr.getFrom()->getDSTSavings() == 0 && tr.getTo()->getDSTSavings() != 0) || (tr.getFrom()->getDSTSavings() != 0 && tr.getTo()->getDSTSavings() == 0)) { // Generate another DOW rule Grego::timeToFields(tr.getTime() + tr.getFrom()->getRawOffset() + tr.getFrom()->getDSTSavings(), year, month, dom, dow, doy, mid); weekInMonth = Grego::dayOfWeekInMonth(year, month, dom); dtr = new DateTimeRule(month, weekInMonth, dow, mid, DateTimeRule::WALL_TIME); tr.getTo()->getName(name); // second rule raw/dst offsets should match raw/dst offsets // at the given time ar2 = new AnnualTimeZoneRule(name, initialRaw, initialDst, dtr, ar1->getStartYear() - 1, AnnualTimeZoneRule::MAX_YEAR); // Check if this rule start after the first rule after the specified date avail = ar2->getNextStart(date, tr.getFrom()->getRawOffset(), tr.getFrom()->getDSTSavings(), false, d); if (!avail || d <= nextTransitionTime) { // We cannot use this rule as the second transition rule delete ar2; ar2 = nullptr; } } } } if (ar2 == nullptr) { // Cannot find a good pair of AnnualTimeZoneRule delete ar1; ar1 = nullptr; } else { // The initial rule should represent the rule before the previous transition ar1->getName(initialName); initialRaw = ar1->getRawOffset(); initialDst = ar1->getDSTSavings(); } } } else { // Try the previous one avail = getPreviousTransition(date, true, tr); if (avail) { tr.getTo()->getName(initialName); initialRaw = tr.getTo()->getRawOffset(); initialDst = tr.getTo()->getDSTSavings(); } else { // No transitions in the past. 
Just use the current offsets getOffset(date, false, initialRaw, initialDst, status); if (U_FAILURE(status)) { return; } } } // Set the initial rule initial = new InitialTimeZoneRule(initialName, initialRaw, initialDst); // Set the standard and daylight saving rules if (ar1 != nullptr && ar2 != nullptr) { if (ar1->getDSTSavings() != 0) { dst = ar1; std = ar2; } else { std = ar1; dst = ar2; } } } void BasicTimeZone::getTimeZoneRulesAfter(UDate start, InitialTimeZoneRule*& initial, UVector*& transitionRules, UErrorCode& status) const { if (U_FAILURE(status)) { return; } const InitialTimeZoneRule *orgini; TimeZoneTransition tzt; bool avail; int32_t ruleCount; TimeZoneRule *r = nullptr; UnicodeString name; int32_t i; UDate time, t; UDate firstStart; UBool bFinalStd = false, bFinalDst = false; initial = nullptr; transitionRules = nullptr; // Original transition rules ruleCount = countTransitionRules(status); if (U_FAILURE(status)) { return; } LocalPointer orgRules( new UVector(uprv_deleteUObject, nullptr, ruleCount, status), status); if (U_FAILURE(status)) { return; } LocalMemory orgtrs( static_cast(uprv_malloc(sizeof(TimeZoneRule*)*ruleCount))); if (orgtrs.isNull()) { status = U_MEMORY_ALLOCATION_ERROR; return; } getTimeZoneRules(orgini, &orgtrs[0], ruleCount, status); if (U_FAILURE(status)) { return; } for (i = 0; i < ruleCount; i++) { LocalPointer lpRule(orgtrs[i]->clone(), status); orgRules->adoptElement(lpRule.orphan(), status); if (U_FAILURE(status)) { return; } } avail = getPreviousTransition(start, true, tzt); if (!avail) { // No need to filter out rules only applicable to time before the start initial = orgini->clone(); if (initial == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; return; } transitionRules = orgRules.orphan(); return; } LocalMemory done(static_cast(uprv_malloc(sizeof(bool)*ruleCount))); if (done.isNull()) { status = U_MEMORY_ALLOCATION_ERROR; return; } LocalPointer filteredRules( new UVector(uprv_deleteUObject, nullptr, status), status); if 
(U_FAILURE(status)) { return; } // Create initial rule tzt.getTo()->getName(name); LocalPointer res_initial( new InitialTimeZoneRule(name, tzt.getTo()->getRawOffset(), tzt.getTo()->getDSTSavings()), status); if (U_FAILURE(status)) { return; } // Mark rules which does not need to be processed for (i = 0; i < ruleCount; i++) { r = (TimeZoneRule*)orgRules->elementAt(i); avail = r->getNextStart(start, res_initial->getRawOffset(), res_initial->getDSTSavings(), false, time); done[i] = !avail; } time = start; while (!bFinalStd || !bFinalDst) { avail = getNextTransition(time, false, tzt); if (!avail) { break; } UDate updatedTime = tzt.getTime(); if (updatedTime == time) { // Can get here if rules for start & end of daylight time have exactly // the same time. // TODO: fix getNextTransition() to prevent it? status = U_INVALID_STATE_ERROR; return; } time = updatedTime; const TimeZoneRule *toRule = tzt.getTo(); for (i = 0; i < ruleCount; i++) { r = (TimeZoneRule*)orgRules->elementAt(i); if (*r == *toRule) { break; } } if (i >= ruleCount) { // This case should never happen status = U_INVALID_STATE_ERROR; return; } if (done[i]) { continue; } const TimeArrayTimeZoneRule *tar = dynamic_cast(toRule); const AnnualTimeZoneRule *ar; if (tar != nullptr) { // Get the previous raw offset and DST savings before the very first start time TimeZoneTransition tzt0; t = start; while (true) { avail = getNextTransition(t, false, tzt0); if (!avail) { break; } if (*(tzt0.getTo()) == *tar) { break; } t = tzt0.getTime(); } if (avail) { // Check if the entire start times to be added tar->getFirstStart(tzt.getFrom()->getRawOffset(), tzt.getFrom()->getDSTSavings(), firstStart); if (firstStart > start) { // Just add the rule as is LocalPointer lpTar(tar->clone(), status); filteredRules->adoptElement(lpTar.orphan(), status); if (U_FAILURE(status)) { return; } } else { // Collect transitions after the start time int32_t startTimes; DateTimeRule::TimeRuleType timeType; int32_t idx; startTimes = 
tar->countStartTimes(); timeType = tar->getTimeType(); for (idx = 0; idx < startTimes; idx++) { tar->getStartTimeAt(idx, t); if (timeType == DateTimeRule::STANDARD_TIME) { t -= tzt.getFrom()->getRawOffset(); } if (timeType == DateTimeRule::WALL_TIME) { t -= tzt.getFrom()->getDSTSavings(); } if (t > start) { break; } } if (U_FAILURE(status)) { return; } int32_t asize = startTimes - idx; if (asize > 0) { LocalMemory newTimes(static_cast(uprv_malloc(sizeof(UDate) * asize))); if (newTimes.isNull()) { status = U_MEMORY_ALLOCATION_ERROR; return; } for (int32_t newidx = 0; newidx < asize; newidx++) { tar->getStartTimeAt(idx + newidx, newTimes[newidx]); } tar->getName(name); LocalPointer newTar(new TimeArrayTimeZoneRule( name, tar->getRawOffset(), tar->getDSTSavings(), &newTimes[0], asize, timeType), status); filteredRules->adoptElement(newTar.orphan(), status); if (U_FAILURE(status)) { return; } } } } } else if ((ar = dynamic_cast(toRule)) != nullptr) { ar->getFirstStart(tzt.getFrom()->getRawOffset(), tzt.getFrom()->getDSTSavings(), firstStart); if (firstStart == tzt.getTime()) { // Just add the rule as is LocalPointer arClone(ar->clone(), status); filteredRules->adoptElement(arClone.orphan(), status); if (U_FAILURE(status)) { return; } } else { // Calculate the transition year int32_t year, month, dom, dow, doy, mid; Grego::timeToFields(tzt.getTime(), year, month, dom, dow, doy, mid); // Re-create the rule ar->getName(name); LocalPointer newAr(new AnnualTimeZoneRule(name, ar->getRawOffset(), ar->getDSTSavings(), *(ar->getRule()), year, ar->getEndYear()), status); filteredRules->adoptElement(newAr.orphan(), status); if (U_FAILURE(status)) { return; } } // check if this is a final rule if (ar->getEndYear() == AnnualTimeZoneRule::MAX_YEAR) { // After bot final standard and dst rules are processed, // exit this while loop. 
if (ar->getDSTSavings() == 0) { bFinalStd = true; } else { bFinalDst = true; } } } done[i] = true; } // Set the results initial = res_initial.orphan(); transitionRules = filteredRules.orphan(); return; } void BasicTimeZone::getOffsetFromLocal(UDate /*date*/, UTimeZoneLocalOption /*nonExistingTimeOpt*/, UTimeZoneLocalOption /*duplicatedTimeOpt*/, int32_t& /*rawOffset*/, int32_t& /*dstOffset*/, UErrorCode& status) const { if (U_FAILURE(status)) { return; } status = U_UNSUPPORTED_ERROR; } void BasicTimeZone::getOffsetFromLocal(UDate date, int32_t nonExistingTimeOpt, int32_t duplicatedTimeOpt, int32_t& rawOffset, int32_t& dstOffset, UErrorCode& status) const { getOffsetFromLocal(date, (UTimeZoneLocalOption)nonExistingTimeOpt, (UTimeZoneLocalOption)duplicatedTimeOpt, rawOffset, dstOffset, status); } U_NAMESPACE_END #endif /* #if !UCONFIG_NO_FORMATTING */ //eof stringi/src/icu74/i18n/buddhcal.cpp0000644000176200001440000001167514700200761016513 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ******************************************************************************* * Copyright (C) 2003-2013, International Business Machines Corporation and * * others. All Rights Reserved. 
* ******************************************************************************* * * File BUDDHCAL.CPP * * Modification History: * 05/13/2003 srl copied from gregocal.cpp * */ #include "unicode/utypes.h" #if !UCONFIG_NO_FORMATTING #include "buddhcal.h" #include "unicode/gregocal.h" #include "umutex.h" #include U_NAMESPACE_BEGIN UOBJECT_DEFINE_RTTI_IMPLEMENTATION(BuddhistCalendar) //static const int32_t kMaxEra = 0; // only 1 era static const int32_t kBuddhistEraStart = -543; // 544 BC (Gregorian) static const int32_t kGregorianEpoch = 1970; // used as the default value of EXTENDED_YEAR BuddhistCalendar::BuddhistCalendar(const Locale& aLocale, UErrorCode& success) : GregorianCalendar(aLocale, success) { setTimeInMillis(getNow(), success); // Call this again now that the vtable is set up properly. } BuddhistCalendar::~BuddhistCalendar() { } BuddhistCalendar::BuddhistCalendar(const BuddhistCalendar& source) : GregorianCalendar(source) { } BuddhistCalendar& BuddhistCalendar::operator= ( const BuddhistCalendar& right) { GregorianCalendar::operator=(right); return *this; } BuddhistCalendar* BuddhistCalendar::clone() const { return new BuddhistCalendar(*this); } const char *BuddhistCalendar::getType() const { return "buddhist"; } int32_t BuddhistCalendar::handleGetExtendedYear() { // EXTENDED_YEAR in BuddhistCalendar is a Gregorian year. 
// The default value of EXTENDED_YEAR is 1970 (Buddhist 2513) int32_t year; if (newerField(UCAL_EXTENDED_YEAR, UCAL_YEAR) == UCAL_EXTENDED_YEAR) { year = internalGet(UCAL_EXTENDED_YEAR, kGregorianEpoch); } else { // extended year is a gregorian year, where 1 = 1AD, 0 = 1BC, -1 = 2BC, etc year = internalGet(UCAL_YEAR, kGregorianEpoch - kBuddhistEraStart) + kBuddhistEraStart; } return year; } void BuddhistCalendar::handleComputeFields(int32_t julianDay, UErrorCode& status) { GregorianCalendar::handleComputeFields(julianDay, status); int32_t y = internalGet(UCAL_EXTENDED_YEAR) - kBuddhistEraStart; internalSet(UCAL_ERA, 0); internalSet(UCAL_YEAR, y); } int32_t BuddhistCalendar::handleGetLimit(UCalendarDateFields field, ELimitType limitType) const { if(field == UCAL_ERA) { return BE; } else { return GregorianCalendar::handleGetLimit(field,limitType); } } #if 0 void BuddhistCalendar::timeToFields(UDate theTime, UBool quick, UErrorCode& status) { //Calendar::timeToFields(theTime, quick, status); int32_t era = internalGet(UCAL_ERA); int32_t year = internalGet(UCAL_YEAR); if(era == GregorianCalendar::BC) { year = 1-year; era = BuddhistCalendar::BE; } else if(era == GregorianCalendar::AD) { era = BuddhistCalendar::BE; } else { status = U_INTERNAL_PROGRAM_ERROR; } year = year - kBuddhistEraStart; internalSet(UCAL_ERA, era); internalSet(UCAL_YEAR, year); } #endif /** * The system maintains a static default century start date. This is initialized * the first time it is used. Once the system default century date and year * are set, they do not change. */ static UDate gSystemDefaultCenturyStart = DBL_MIN; static int32_t gSystemDefaultCenturyStartYear = -1; static icu::UInitOnce gBCInitOnce {}; UBool BuddhistCalendar::haveDefaultCentury() const { return true; } static void U_CALLCONV initializeSystemDefaultCentury() { // initialize systemDefaultCentury and systemDefaultCenturyYear based // on the current time. They'll be set to 80 years before // the current time. 
UErrorCode status = U_ZERO_ERROR; BuddhistCalendar calendar(Locale("@calendar=buddhist"),status); if (U_SUCCESS(status)) { calendar.setTime(Calendar::getNow(), status); calendar.add(UCAL_YEAR, -80, status); UDate newStart = calendar.getTime(status); int32_t newYear = calendar.get(UCAL_YEAR, status); gSystemDefaultCenturyStartYear = newYear; gSystemDefaultCenturyStart = newStart; } // We have no recourse upon failure unless we want to propagate the failure // out. } UDate BuddhistCalendar::defaultCenturyStart() const { // lazy-evaluate systemDefaultCenturyStart and systemDefaultCenturyStartYear umtx_initOnce(gBCInitOnce, &initializeSystemDefaultCentury); return gSystemDefaultCenturyStart; } int32_t BuddhistCalendar::defaultCenturyStartYear() const { // lazy-evaluate systemDefaultCenturyStartYear and systemDefaultCenturyStart umtx_initOnce(gBCInitOnce, &initializeSystemDefaultCentury); return gSystemDefaultCenturyStartYear; } U_NAMESPACE_END #endif stringi/src/icu74/i18n/regeximp.h0000644000176200001440000004163414700200761016230 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html // // Copyright (C) 2002-2015 International Business Machines Corporation // and others. All rights reserved. // // file: regeximp.h // // ICU Regular Expressions, // Definitions of constant values used in the compiled form of // a regular expression pattern. // #ifndef _REGEXIMP_H #define _REGEXIMP_H #include "unicode/utypes.h" #include "unicode/uobject.h" #include "unicode/uniset.h" #include "unicode/utext.h" #include "cmemory.h" #include "ucase.h" U_NAMESPACE_BEGIN // For debugging, define REGEX_DEBUG // To define with configure, // CPPFLAGS="-DREGEX_DEBUG" ./runConfigureICU --enable-debug --disable-release Linux #ifdef REGEX_DEBUG // // debugging options. 
Enable one or more of the three #defines immediately following // //#define REGEX_SCAN_DEBUG #define REGEX_DUMP_DEBUG #define REGEX_RUN_DEBUG // End of #defines intended to be directly set. #include #endif #ifdef REGEX_SCAN_DEBUG #define REGEX_SCAN_DEBUG_PRINTF(a) printf a #else #define REGEX_SCAN_DEBUG_PRINTF(a) #endif // // Opcode types In the compiled form of the regexp, these are the type, or opcodes, // of the entries. // enum { URX_RESERVED_OP = 0, // For multi-operand ops, most non-first words. URX_RESERVED_OP_N = 255, // For multi-operand ops, negative operand values. URX_BACKTRACK = 1, // Force a backtrack, as if a match test had failed. URX_END = 2, URX_ONECHAR = 3, // Value field is the 21 bit unicode char to match URX_STRING = 4, // Value field is index of string start URX_STRING_LEN = 5, // Value field is string length (code units) URX_STATE_SAVE = 6, // Value field is pattern position to push URX_NOP = 7, URX_START_CAPTURE = 8, // Value field is capture group number. URX_END_CAPTURE = 9, // Value field is capture group number URX_STATIC_SETREF = 10, // Value field is index of set in array of sets. URX_SETREF = 11, // Value field is index of set in array of sets. URX_DOTANY = 12, URX_JMP = 13, // Value field is destination position in // the pattern. URX_FAIL = 14, // Stop match operation, No match. URX_JMP_SAV = 15, // Operand: JMP destination location URX_BACKSLASH_B = 16, // Value field: 0: \b 1: \B URX_BACKSLASH_G = 17, URX_JMP_SAV_X = 18, // Conditional JMP_SAV, // Used in (x)+, breaks loop on zero length match. // Operand: Jmp destination. URX_BACKSLASH_X = 19, URX_BACKSLASH_Z = 20, // \z Unconditional end of line. URX_DOTANY_ALL = 21, // ., in the . matches any mode. URX_BACKSLASH_D = 22, // Value field: 0: \d 1: \D URX_CARET = 23, // Value field: 1: multi-line mode. URX_DOLLAR = 24, // Also for \Z URX_CTR_INIT = 25, // Counter Inits for {Interval} loops. URX_CTR_INIT_NG = 26, // 2 kinds, normal and non-greedy. // These are 4 word opcodes. 
See description. // First Operand: Data loc of counter variable // 2nd Operand: Pat loc of the URX_CTR_LOOPx // at the end of the loop. // 3rd Operand: Minimum count. // 4th Operand: Max count, -1 for unbounded. URX_DOTANY_UNIX = 27, // '.' operator in UNIX_LINES mode, only \n marks end of line. URX_CTR_LOOP = 28, // Loop Ops for {interval} loops. URX_CTR_LOOP_NG = 29, // Also in three flavors. // Operand is loc of corresponding CTR_INIT. URX_CARET_M_UNIX = 30, // '^' operator, test for start of line in multi-line // plus UNIX_LINES mode. URX_RELOC_OPRND = 31, // Operand value in multi-operand ops that refers // back into compiled pattern code, and thus must // be relocated when inserting/deleting ops in code. URX_STO_SP = 32, // Store the stack ptr. Operand is location within // matcher data (not stack data) to store it. URX_LD_SP = 33, // Load the stack pointer. Operand is location // to load from. URX_BACKREF = 34, // Back Reference. Parameter is the index of the // capture group variables in the state stack frame. URX_STO_INP_LOC = 35, // Store the input location. Operand is location // within the matcher stack frame. URX_JMPX = 36, // Conditional JMP. // First Operand: JMP target location. // Second Operand: Data location containing an // input position. If current input position == // saved input position, FAIL rather than taking // the JMP URX_LA_START = 37, // Starting a LookAround expression. // Save InputPos, SP and active region in static data. // Operand: Static data offset for the save URX_LA_END = 38, // Ending a Lookaround expression. // Restore InputPos and Stack to saved values. // Operand: Static data offset for saved data. URX_ONECHAR_I = 39, // Test for case-insensitive match of a literal character. // Operand: the literal char. URX_STRING_I = 40, // Case insensitive string compare. // First Operand: Index of start of string in string literals // Second Operand (next word in compiled code): // the length of the string. 
URX_BACKREF_I = 41, // Case insensitive back reference. // Parameter is the index of the // capture group variables in the state stack frame. URX_DOLLAR_M = 42, // $ in multi-line mode. URX_CARET_M = 43, // ^ in multi-line mode. URX_LB_START = 44, // LookBehind Start. // Parameter is data location URX_LB_CONT = 45, // LookBehind Continue. // Param 0: the data location // Param 1: The minimum length of the look-behind match // Param 2: The max length of the look-behind match URX_LB_END = 46, // LookBehind End. // Parameter is the data location. // Check that match ended at the right spot, // Restore original input string len. URX_LBN_CONT = 47, // Negative LookBehind Continue // Param 0: the data location // Param 1: The minimum length of the look-behind match // Param 2: The max length of the look-behind match // Param 3: The pattern loc following the look-behind block. URX_LBN_END = 48, // Negative LookBehind end // Parameter is the data location. // Check that the match ended at the right spot. URX_STAT_SETREF_N = 49, // Reference to a prebuilt set (e.g. \w), negated // Operand is index of set in array of sets. URX_LOOP_SR_I = 50, // Init a [set]* loop. // Operand is the sets index in array of user sets. URX_LOOP_C = 51, // Continue a [set]* or OneChar* loop. // Operand is a matcher static data location. // Must always immediately follow LOOP_x_I instruction. URX_LOOP_DOT_I = 52, // .*, initialization of the optimized loop. // Operand value: // bit 0: // 0: Normal (. doesn't match new-line) mode. // 1: . matches new-line mode. // bit 1: controls what new-lines are recognized by this operation. // 0: All Unicode New-lines // 1: UNIX_LINES, \u000a only. URX_BACKSLASH_BU = 53, // \b or \B in UREGEX_UWORD mode, using Unicode style // word boundaries. URX_DOLLAR_D = 54, // $ end of input test, in UNIX_LINES mode. URX_DOLLAR_MD = 55, // $ end of input test, in MULTI_LINE and UNIX_LINES mode. 
URX_BACKSLASH_H = 56, // Value field: 0: \h 1: \H URX_BACKSLASH_R = 57, // Any line break sequence. URX_BACKSLASH_V = 58 // Value field: 0: \v 1: \V }; // Keep this list of opcode names in sync with the above enum // Used for debug printing only. #define URX_OPCODE_NAMES \ " ", \ "BACKTRACK", \ "END", \ "ONECHAR", \ "STRING", \ "STRING_LEN", \ "STATE_SAVE", \ "NOP", \ "START_CAPTURE", \ "END_CAPTURE", \ "URX_STATIC_SETREF", \ "SETREF", \ "DOTANY", \ "JMP", \ "FAIL", \ "JMP_SAV", \ "BACKSLASH_B", \ "BACKSLASH_G", \ "JMP_SAV_X", \ "BACKSLASH_X", \ "BACKSLASH_Z", \ "DOTANY_ALL", \ "BACKSLASH_D", \ "CARET", \ "DOLLAR", \ "CTR_INIT", \ "CTR_INIT_NG", \ "DOTANY_UNIX", \ "CTR_LOOP", \ "CTR_LOOP_NG", \ "URX_CARET_M_UNIX", \ "RELOC_OPRND", \ "STO_SP", \ "LD_SP", \ "BACKREF", \ "STO_INP_LOC", \ "JMPX", \ "LA_START", \ "LA_END", \ "ONECHAR_I", \ "STRING_I", \ "BACKREF_I", \ "DOLLAR_M", \ "CARET_M", \ "LB_START", \ "LB_CONT", \ "LB_END", \ "LBN_CONT", \ "LBN_END", \ "STAT_SETREF_N", \ "LOOP_SR_I", \ "LOOP_C", \ "LOOP_DOT_I", \ "BACKSLASH_BU", \ "DOLLAR_D", \ "DOLLAR_MD", \ "URX_BACKSLASH_H", \ "URX_BACKSLASH_R", \ "URX_BACKSLASH_V" // // Convenience macros for assembling and disassembling a compiled operation. // #define URX_TYPE(x) ((uint32_t)(x) >> 24) #define URX_VAL(x) ((x) & 0xffffff) // // Access to Unicode Sets composite character properties // The sets are accessed by the match engine for things like \w (word boundary) // enum { URX_ISWORD_SET = 1, URX_ISALNUM_SET = 2, URX_ISALPHA_SET = 3, URX_ISSPACE_SET = 4, URX_GC_NORMAL, // Sets for finding grapheme cluster boundaries. URX_GC_EXTEND, URX_GC_CONTROL, URX_GC_L, URX_GC_LV, URX_GC_LVT, URX_GC_V, URX_GC_T, URX_LAST_SET, URX_NEG_SET = 0x800000 // Flag bit to reverse sense of set // membership test. }; // // Match Engine State Stack Frame Layout. 
// struct REStackFrame { // Header int64_t fInputIdx; // Position of next character in the input string int64_t fPatIdx; // Position of next Op in the compiled pattern // (int64_t for UVector64, values fit in an int32_t) // Remainder int64_t fExtra[1]; // Extra state, for capture group start/ends // atomic parentheses, repeat counts, etc. // Locations assigned at pattern compile time. // Variable-length array. }; // number of UVector elements in the header #define RESTACKFRAME_HDRCOUNT 2 // // Start-Of-Match type. Used by find() to quickly scan to positions where a // match might start before firing up the full match engine. // enum StartOfMatch { START_NO_INFO, // No hint available. START_CHAR, // Match starts with a literal code point. START_SET, // Match starts with something matching a set. START_START, // Match starts at start of buffer only (^ or \A) START_LINE, // Match starts with ^ in multi-line mode. START_STRING // Match starts with a literal string. }; #define START_OF_MATCH_STR(v) ((v)==START_NO_INFO? "START_NO_INFO" : \ (v)==START_CHAR? "START_CHAR" : \ (v)==START_SET? "START_SET" : \ (v)==START_START? "START_START" : \ (v)==START_LINE? "START_LINE" : \ (v)==START_STRING? "START_STRING" : \ "ILLEGAL") // // 8 bit set, to fast-path latin-1 set membership tests. // struct Regex8BitSet : public UMemory { inline Regex8BitSet(); inline void operator = (const Regex8BitSet &s); inline void init(const UnicodeSet *src); inline UBool contains(UChar32 c); inline void add(UChar32 c); int8_t d[32]; }; inline Regex8BitSet::Regex8BitSet() { uprv_memset(d, 0, sizeof(d)); } inline UBool Regex8BitSet::contains(UChar32 c) { // No bounds checking! This is deliberate. 
return ((d[c>>3] & 1 <<(c&7)) != 0); } inline void Regex8BitSet::add(UChar32 c) { d[c>>3] |= 1 << (c&7); } inline void Regex8BitSet::init(const UnicodeSet *s) { if (s != nullptr) { for (int32_t i=0; i<=255; i++) { if (s->contains(i)) { this->add(i); } } } } inline void Regex8BitSet::operator = (const Regex8BitSet &s) { uprv_memcpy(d, s.d, sizeof(d)); } // Case folded UText Iterator helper class. // Wraps a UText, provides a case-folded enumeration over its contents. // Used in implementing case insensitive matching constructs. // Implementation in rematch.cpp class CaseFoldingUTextIterator: public UMemory { public: CaseFoldingUTextIterator(UText &text); ~CaseFoldingUTextIterator(); UChar32 next(); // Next case folded character UBool inExpansion(); // True if last char returned from next() and the // next to be returned both originated from a string // folding of the same code point from the original UText. private: UText &fUText; const char16_t *fFoldChars; int32_t fFoldLength; int32_t fFoldIndex; }; // Case folded char16_t * string iterator. // Wraps a char16_t *, provides a case-folded enumeration over its contents. // Used in implementing case insensitive matching constructs. // Implementation in rematch.cpp class CaseFoldingUCharIterator: public UMemory { public: CaseFoldingUCharIterator(const char16_t *chars, int64_t start, int64_t limit); ~CaseFoldingUCharIterator(); UChar32 next(); // Next case folded character UBool inExpansion(); // True if last char returned from next() and the // next to be returned both originated from a string // folding of the same code point from the original UText. int64_t getIndex(); // Return the current input buffer index. private: const char16_t *fChars; int64_t fIndex; int64_t fLimit; const char16_t *fFoldChars; int32_t fFoldLength; int32_t fFoldIndex; }; U_NAMESPACE_END #endif stringi/src/icu74/i18n/tmutamt.cpp0000644000176200001440000000346114700200761016432 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others. 
// License & terms of use: http://www.unicode.org/copyright.html /* ******************************************************************************* * Copyright (C) 2008, Google, International Business Machines Corporation and * * others. All Rights Reserved. * ******************************************************************************* */ #include "unicode/tmutamt.h" #if !UCONFIG_NO_FORMATTING U_NAMESPACE_BEGIN UOBJECT_DEFINE_RTTI_IMPLEMENTATION(TimeUnitAmount) TimeUnitAmount::TimeUnitAmount(const Formattable& number, TimeUnit::UTimeUnitFields timeUnitField, UErrorCode& status) : Measure(number, TimeUnit::createInstance(timeUnitField, status), status) { } TimeUnitAmount::TimeUnitAmount(double amount, TimeUnit::UTimeUnitFields timeUnitField, UErrorCode& status) : Measure(Formattable(amount), TimeUnit::createInstance(timeUnitField, status), status) { } TimeUnitAmount::TimeUnitAmount(const TimeUnitAmount& other) : Measure(other) { } TimeUnitAmount& TimeUnitAmount::operator=(const TimeUnitAmount& other) { Measure::operator=(other); return *this; } bool TimeUnitAmount::operator==(const UObject& other) const { return Measure::operator==(other); } TimeUnitAmount* TimeUnitAmount::clone() const { return new TimeUnitAmount(*this); } TimeUnitAmount::~TimeUnitAmount() { } const TimeUnit& TimeUnitAmount::getTimeUnit() const { return static_cast(getUnit()); } TimeUnit::UTimeUnitFields TimeUnitAmount::getTimeUnitField() const { return getTimeUnit().getTimeUnitField(); } U_NAMESPACE_END #endif stringi/src/icu74/i18n/nortrans.h0000644000176200001440000000553214700200761016253 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ********************************************************************** * Copyright (C) 2001-2010, International Business Machines * Corporation and others. All Rights Reserved. 
********************************************************************** * Date Name Description * 07/03/01 aliu Creation. ********************************************************************** */ #ifndef NORTRANS_H #define NORTRANS_H #include "unicode/utypes.h" #if !UCONFIG_NO_TRANSLITERATION #include "unicode/translit.h" #include "unicode/normalizer2.h" U_NAMESPACE_BEGIN /** * A transliterator that performs normalization. * @author Alan Liu */ class NormalizationTransliterator : public Transliterator { const Normalizer2 &fNorm2; public: /** * Destructor. */ virtual ~NormalizationTransliterator(); /** * Copy constructor. */ NormalizationTransliterator(const NormalizationTransliterator&); /** * Transliterator API. * @return A copy of the object. */ virtual NormalizationTransliterator* clone() const override; /** * ICU "poor man's RTTI", returns a UClassID for the actual class. */ virtual UClassID getDynamicClassID() const override; /** * ICU "poor man's RTTI", returns a UClassID for this class. */ U_I18N_API static UClassID U_EXPORT2 getStaticClassID(); protected: /** * Implements {@link Transliterator#handleTransliterate}. * @param text the buffer holding transliterated and * untransliterated text * @param offset the start and limit of the text, the position * of the cursor, and the start and limit of transliteration. * @param incremental if true, assume more text may be coming after * pos.contextLimit. Otherwise, assume the text is complete. */ virtual void handleTransliterate(Replaceable& text, UTransPosition& offset, UBool isIncremental) const override; public: /** * System registration hook. Public to Transliterator only. */ static void registerIDs(); private: // Transliterator::Factory methods static Transliterator* _create(const UnicodeString& ID, Token context); /** * Constructs a transliterator. This method is private. * Public users must use the factory method createInstance(). 
*/ NormalizationTransliterator(const UnicodeString& id, const Normalizer2 &norm2); private: /** * Assignment operator. */ NormalizationTransliterator& operator=(const NormalizationTransliterator&); }; U_NAMESPACE_END #endif /* #if !UCONFIG_NO_TRANSLITERATION */ #endif stringi/src/icu74/i18n/collationfastlatin.h0000644000176200001440000003377414700200761020310 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ******************************************************************************* * Copyright (C) 2013-2015, International Business Machines * Corporation and others. All Rights Reserved. ******************************************************************************* * collationfastlatin.h * * created on: 2013aug09 * created by: Markus W. Scherer */ #ifndef __COLLATIONFASTLATIN_H__ #define __COLLATIONFASTLATIN_H__ #include "unicode/utypes.h" #if !UCONFIG_NO_COLLATION U_NAMESPACE_BEGIN struct CollationData; struct CollationSettings; class U_I18N_API CollationFastLatin /* all static */ { public: /** * Fast Latin format version (one byte 1..FF). * Must be incremented for any runtime-incompatible changes, * in particular, for changes to any of the following constants. * * When the major version number of the main data format changes, * we can reset this fast Latin version to 1. 
*/ static const uint16_t VERSION = 2; static const int32_t LATIN_MAX = 0x17f; static const int32_t LATIN_LIMIT = LATIN_MAX + 1; static const int32_t LATIN_MAX_UTF8_LEAD = 0xc5; // UTF-8 lead byte of LATIN_MAX static const int32_t PUNCT_START = 0x2000; static const int32_t PUNCT_LIMIT = 0x2040; // excludes U+FFFE & U+FFFF static const int32_t NUM_FAST_CHARS = LATIN_LIMIT + (PUNCT_LIMIT - PUNCT_START); // Note on the supported weight ranges: // Analysis of UCA 6.3 and CLDR 23 non-search tailorings shows that // the CEs for characters in the above ranges, excluding expansions with length >2, // excluding contractions of >2 characters, and other restrictions // (see the builder's getCEsFromCE32()), // use at most about 150 primary weights, // where about 94 primary weights are possibly-variable (space/punct/symbol/currency), // at most 4 secondary before-common weights, // at most 4 secondary after-common weights, // at most 16 secondary high weights (in secondary CEs), and // at most 4 tertiary after-common weights. // The following ranges are designed to support slightly more weights than that. // (en_US_POSIX is unusual: It creates about 64 variable + 116 Latin primaries.) // Digits may use long primaries (preserving more short ones) // or short primaries (faster) without changing this data structure. // (If we supported numeric collation, then digits would have to have long primaries // so that special handling does not affect the fast path.) 
static const uint32_t SHORT_PRIMARY_MASK = 0xfc00; // bits 15..10 static const uint32_t INDEX_MASK = 0x3ff; // bits 9..0 for expansions & contractions static const uint32_t SECONDARY_MASK = 0x3e0; // bits 9..5 static const uint32_t CASE_MASK = 0x18; // bits 4..3 static const uint32_t LONG_PRIMARY_MASK = 0xfff8; // bits 15..3 static const uint32_t TERTIARY_MASK = 7; // bits 2..0 static const uint32_t CASE_AND_TERTIARY_MASK = CASE_MASK | TERTIARY_MASK; static const uint32_t TWO_SHORT_PRIMARIES_MASK = (SHORT_PRIMARY_MASK << 16) | SHORT_PRIMARY_MASK; // 0xfc00fc00 static const uint32_t TWO_LONG_PRIMARIES_MASK = (LONG_PRIMARY_MASK << 16) | LONG_PRIMARY_MASK; // 0xfff8fff8 static const uint32_t TWO_SECONDARIES_MASK = (SECONDARY_MASK << 16) | SECONDARY_MASK; // 0x3e003e0 static const uint32_t TWO_CASES_MASK = (CASE_MASK << 16) | CASE_MASK; // 0x180018 static const uint32_t TWO_TERTIARIES_MASK = (TERTIARY_MASK << 16) | TERTIARY_MASK; // 0x70007 /** * Contraction with one fast Latin character. * Use INDEX_MASK to find the start of the contraction list after the fixed table. * The first entry contains the default mapping. * Otherwise use CONTR_CHAR_MASK for the contraction character index * (in ascending order). * Use CONTR_LENGTH_SHIFT for the length of the entry * (1=BAIL_OUT, 2=one CE, 3=two CEs). * * Also, U+0000 maps to a contraction entry, so that the fast path need not * check for NUL termination. * It usually maps to a contraction list with only the completely ignorable default value. */ static const uint32_t CONTRACTION = 0x400; /** * An expansion encodes two CEs. * Use INDEX_MASK to find the pair of CEs after the fixed table. * * The higher a mini CE value, the easier it is to process. * For expansions and higher, no context needs to be considered. */ static const uint32_t EXPANSION = 0x800; /** * Encodes one CE with a long/low mini primary (there are 128). 
* All potentially-variable primaries must be in this range, * to make the short-primary path as fast as possible. */ static const uint32_t MIN_LONG = 0xc00; static const uint32_t LONG_INC = 8; static const uint32_t MAX_LONG = 0xff8; /** * Encodes one CE with a short/high primary (there are 60), * plus a secondary CE if the secondary weight is high. * Fast handling: At least all letter primaries should be in this range. */ static const uint32_t MIN_SHORT = 0x1000; static const uint32_t SHORT_INC = 0x400; /** The highest primary weight is reserved for U+FFFF. */ static const uint32_t MAX_SHORT = SHORT_PRIMARY_MASK; static const uint32_t MIN_SEC_BEFORE = 0; // must add SEC_OFFSET static const uint32_t SEC_INC = 0x20; static const uint32_t MAX_SEC_BEFORE = MIN_SEC_BEFORE + 4 * SEC_INC; // 5 before common static const uint32_t COMMON_SEC = MAX_SEC_BEFORE + SEC_INC; static const uint32_t MIN_SEC_AFTER = COMMON_SEC + SEC_INC; static const uint32_t MAX_SEC_AFTER = MIN_SEC_AFTER + 5 * SEC_INC; // 6 after common static const uint32_t MIN_SEC_HIGH = MAX_SEC_AFTER + SEC_INC; // 20 high secondaries static const uint32_t MAX_SEC_HIGH = SECONDARY_MASK; /** * Lookup: Add this offset to secondary weights, except for completely ignorable CEs. * Must be greater than any special value, e.g., MERGE_WEIGHT. * The exact value is not relevant for the format version. 
*/ static const uint32_t SEC_OFFSET = SEC_INC; static const uint32_t COMMON_SEC_PLUS_OFFSET = COMMON_SEC + SEC_OFFSET; static const uint32_t TWO_SEC_OFFSETS = (SEC_OFFSET << 16) | SEC_OFFSET; // 0x200020 static const uint32_t TWO_COMMON_SEC_PLUS_OFFSET = (COMMON_SEC_PLUS_OFFSET << 16) | COMMON_SEC_PLUS_OFFSET; static const uint32_t LOWER_CASE = 8; // case bits include this offset static const uint32_t TWO_LOWER_CASES = (LOWER_CASE << 16) | LOWER_CASE; // 0x80008 static const uint32_t COMMON_TER = 0; // must add TER_OFFSET static const uint32_t MAX_TER_AFTER = 7; // 7 after common /** * Lookup: Add this offset to tertiary weights, except for completely ignorable CEs. * Must be greater than any special value, e.g., MERGE_WEIGHT. * Must be greater than case bits as well, so that with combined case+tertiary weights * plus the offset the tertiary bits does not spill over into the case bits. * The exact value is not relevant for the format version. */ static const uint32_t TER_OFFSET = SEC_OFFSET; static const uint32_t COMMON_TER_PLUS_OFFSET = COMMON_TER + TER_OFFSET; static const uint32_t TWO_TER_OFFSETS = (TER_OFFSET << 16) | TER_OFFSET; static const uint32_t TWO_COMMON_TER_PLUS_OFFSET = (COMMON_TER_PLUS_OFFSET << 16) | COMMON_TER_PLUS_OFFSET; static const uint32_t MERGE_WEIGHT = 3; static const uint32_t EOS = 2; // end of string static const uint32_t BAIL_OUT = 1; /** * Contraction result first word bits 8..0 contain the * second contraction character, as a char index 0..NUM_FAST_CHARS-1. * Each contraction list is terminated with a word containing CONTR_CHAR_MASK. */ static const uint32_t CONTR_CHAR_MASK = 0x1ff; /** * Contraction result first word bits 10..9 contain the result length: * 1=bail out, 2=one mini CE, 3=two mini CEs */ static const uint32_t CONTR_LENGTH_SHIFT = 9; /** * Comparison return value when the regular comparison must be used. * The exact value is not relevant for the format version. 
*/ static const int32_t BAIL_OUT_RESULT = -2; static inline int32_t getCharIndex(char16_t c) { if(c <= LATIN_MAX) { return c; } else if(PUNCT_START <= c && c < PUNCT_LIMIT) { return c - (PUNCT_START - LATIN_LIMIT); } else { // Not a fast Latin character. // Note: U+FFFE & U+FFFF are forbidden in tailorings // and thus do not occur in any contractions. return -1; } } /** * Computes the options value for the compare functions * and writes the precomputed primary weights. * Returns -1 if the Latin fastpath is not supported for the data and settings. * The capacity must be LATIN_LIMIT. */ static int32_t getOptions(const CollationData *data, const CollationSettings &settings, uint16_t *primaries, int32_t capacity); static int32_t compareUTF16(const uint16_t *table, const uint16_t *primaries, int32_t options, const char16_t *left, int32_t leftLength, const char16_t *right, int32_t rightLength); static int32_t compareUTF8(const uint16_t *table, const uint16_t *primaries, int32_t options, const uint8_t *left, int32_t leftLength, const uint8_t *right, int32_t rightLength); private: static uint32_t lookup(const uint16_t *table, UChar32 c); static uint32_t lookupUTF8(const uint16_t *table, UChar32 c, const uint8_t *s8, int32_t &sIndex, int32_t sLength); static uint32_t lookupUTF8Unsafe(const uint16_t *table, UChar32 c, const uint8_t *s8, int32_t &sIndex); static uint32_t nextPair(const uint16_t *table, UChar32 c, uint32_t ce, const char16_t *s16, const uint8_t *s8, int32_t &sIndex, int32_t &sLength); static inline uint32_t getPrimaries(uint32_t variableTop, uint32_t pair) { uint32_t ce = pair & 0xffff; if(ce >= MIN_SHORT) { return pair & TWO_SHORT_PRIMARIES_MASK; } if(ce > variableTop) { return pair & TWO_LONG_PRIMARIES_MASK; } if(ce >= MIN_LONG) { return 0; } // variable return pair; // special mini CE } static inline uint32_t getSecondariesFromOneShortCE(uint32_t ce) { ce &= SECONDARY_MASK; if(ce < MIN_SEC_HIGH) { return ce + SEC_OFFSET; } else { return ((ce + SEC_OFFSET) 
<< 16) | COMMON_SEC_PLUS_OFFSET; } } static uint32_t getSecondaries(uint32_t variableTop, uint32_t pair); static uint32_t getCases(uint32_t variableTop, UBool strengthIsPrimary, uint32_t pair); static uint32_t getTertiaries(uint32_t variableTop, UBool withCaseBits, uint32_t pair); static uint32_t getQuaternaries(uint32_t variableTop, uint32_t pair); private: CollationFastLatin() = delete; // no constructor }; /* * Format of the CollationFastLatin data table. * CollationFastLatin::VERSION = 2. * * This table contains data for a Latin-text collation fastpath. * The data is stored as an array of uint16_t which contains the following parts. * * uint16_t -- version & header length * Bits 15..8: version, must match the VERSION * 7..0: length of the header * * uint16_t varTops[header length - 1] * Version 2: * varTops[m] is the highest CollationFastLatin long-primary weight * of supported maxVariable group m * (special reorder group space, punct, symbol, currency). * * Version 1: * Each of these values maps the variable top lead byte of a supported maxVariable group * to the highest CollationFastLatin long-primary weight. * The values are stored in ascending order. * Bits 15..7: max fast-Latin long-primary weight (bits 11..3 shifted left by 4 bits) * 6..0: regular primary lead byte * * uint16_t miniCEs[0x1c0] * A mini collation element for each character U+0000..U+017F and U+2000..U+203F. * Each value encodes one or two mini CEs (two are possible if the first one * has a short mini primary and the second one is a secondary CE, i.e., primary == 0), * or points to an expansion or to a contraction table. * U+0000 always has a contraction entry, * so that NUL-termination need not be tested in the fastpath. * If the collation elements for a character or contraction cannot be encoded in this format, * then the BAIL_OUT value is stored. * For details see the comments for the class constants. 
* * uint16_t expansions[variable length]; * Expansion mini CEs contain an offset relative to just after the miniCEs table. * An expansions contains exactly 2 mini CEs. * * uint16_t contractions[variable length]; * Contraction mini CEs contain an offset relative to just after the miniCEs table. * It points to a list of tuples which map from a contraction suffix character to a result. * First uint16_t of each tuple: * Bits 10..9: Length of the result (1..3), see comments on CONTR_LENGTH_SHIFT. * Bits 8..0: Contraction character, see comments on CONTR_CHAR_MASK. * This is followed by 0, 1, or 2 uint16_t according to the length. * Each list is terminated by an entry with CONTR_CHAR_MASK. * Each list starts with such an entry which also contains the default result * for when there is no contraction match. * * ----------------- * Changes for version 2 (ICU 55) * * Special reorder groups do not necessarily start on whole primary lead bytes any more. * Therefore, the varTops data has a new format: * Version 1 stored the lead bytes of the highest root primaries for * the maxVariable-supported special reorder groups. * Now the top 16 bits would need to be stored, * and it is simpler to store only the fast-Latin weights. */ U_NAMESPACE_END #endif // !UCONFIG_NO_COLLATION #endif // __COLLATIONFASTLATIN_H__ stringi/src/icu74/i18n/nultrans.cpp0000644000176200001440000000225314700200761016603 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ********************************************************************** * Copyright (c) 2000-2005, International Business Machines * Corporation and others. All Rights Reserved. ********************************************************************** * Date Name Description * 01/11/2000 aliu Creation. 
**********************************************************************
*/

#include "unicode/utypes.h"

#if !UCONFIG_NO_TRANSLITERATION

#include "nultrans.h"

U_NAMESPACE_BEGIN

UOBJECT_DEFINE_RTTI_IMPLEMENTATION(NullTransliterator)

/**
 * Constructs the transliterator under the ID "Any-Null", passing 0 as the
 * second base-class constructor argument.
 */
NullTransliterator::NullTransliterator() : Transliterator(UNICODE_STRING_SIMPLE("Any-Null"), 0) {}

/**
 * Destructor. Nothing to release.
 */
NullTransliterator::~NullTransliterator() {}

/**
 * Returns a newly allocated, default-constructed NullTransliterator.
 */
NullTransliterator* NullTransliterator::clone() const {
    return new NullTransliterator();
}

/**
 * Leaves the text untouched and only advances offsets.start to
 * offsets.limit.
 */
void NullTransliterator::handleTransliterate(Replaceable& /*text*/, UTransPosition& offsets,
                                             UBool /*isIncremental*/) const {
    offsets.start = offsets.limit;
}

U_NAMESPACE_END

#endif /* #if !UCONFIG_NO_TRANSLITERATION */
stringi/src/icu74/i18n/uni2name.h0000644000176200001440000000470414700200761016123 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
**********************************************************************
* Copyright (C) 2001-2007, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* Date Name Description
* 06/06/01 aliu Creation.
**********************************************************************
*/
#ifndef UNI2NAME_H
#define UNI2NAME_H

#include "unicode/utypes.h"

#if !UCONFIG_NO_TRANSLITERATION

#include "unicode/translit.h"

U_NAMESPACE_BEGIN

/**
 * A transliterator that performs character to name mapping.
 * It generates the Perl syntax \N{name}.
 * @author Alan Liu
 */
class UnicodeNameTransliterator : public Transliterator {

 public:

    /**
     * Constructs a transliterator.
     * @param adoptedFilter the filter to be adopted; defaults to 0 (no filter).
     */
    UnicodeNameTransliterator(UnicodeFilter* adoptedFilter = 0);

    /**
     * Destructor.
     */
    virtual ~UnicodeNameTransliterator();

    /**
     * Copy constructor.
     */
    UnicodeNameTransliterator(const UnicodeNameTransliterator&);

    /**
     * Transliterator API.
     * @return a copy of this transliterator.
     */
    virtual UnicodeNameTransliterator* clone() const override;

    /**
     * ICU "poor man's RTTI", returns a UClassID for the actual class.
     */
    virtual UClassID getDynamicClassID() const override;

    /**
     * ICU "poor man's RTTI", returns a UClassID for this class.
     */
    U_I18N_API static UClassID U_EXPORT2 getStaticClassID();

 protected:

    /**
     * Implements {@link Transliterator#handleTransliterate}.
     * @param text          the buffer holding transliterated and
     *                      untransliterated text
     * @param offset        the start and limit of the text, the position
     *                      of the cursor, and the start and limit of transliteration.
     * @param isIncremental if true, assume more text may be coming after
     *                      pos.contextLimit. Otherwise, assume the text is complete.
     */
    virtual void handleTransliterate(Replaceable& text, UTransPosition& offset,
                                     UBool isIncremental) const override;

 private:
    /**
     * Assignment operator. Declared private, so it is not usable from
     * outside the class.
     */
    UnicodeNameTransliterator& operator=(const UnicodeNameTransliterator&);

};

U_NAMESPACE_END

#endif /* #if !UCONFIG_NO_TRANSLITERATION */

#endif
stringi/src/icu74/i18n/tzgnames.cpp0000644000176200001440000012717414700200761016577 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
* Copyright (C) 2011-2016, International Business Machines Corporation and
* others. All Rights Reserved.
******************************************************************************* */ #include "unicode/utypes.h" #if !UCONFIG_NO_FORMATTING #include "tzgnames.h" #include "unicode/basictz.h" #include "unicode/locdspnm.h" #include "unicode/rbtz.h" #include "unicode/simpleformatter.h" #include "unicode/simpletz.h" #include "unicode/strenum.h" #include "unicode/vtzone.h" #include "bytesinkutil.h" #include "charstr.h" #include "cmemory.h" #include "cstring.h" #include "mutex.h" #include "uhash.h" #include "uassert.h" #include "umutex.h" #include "ulocimp.h" #include "uresimp.h" #include "ureslocs.h" #include "zonemeta.h" #include "tznames_impl.h" #include "olsontz.h" #include "ucln_in.h" U_NAMESPACE_BEGIN #define ZID_KEY_MAX 128 static const char gZoneStrings[] = "zoneStrings"; static const char gRegionFormatTag[] = "regionFormat"; static const char gFallbackFormatTag[] = "fallbackFormat"; static const char16_t gEmpty[] = {0x00}; static const char16_t gDefRegionPattern[] = {0x7B, 0x30, 0x7D, 0x00}; // "{0}" static const char16_t gDefFallbackPattern[] = {0x7B, 0x31, 0x7D, 0x20, 0x28, 0x7B, 0x30, 0x7D, 0x29, 0x00}; // "{1} ({0})" static const double kDstCheckRange = (double)184*U_MILLIS_PER_DAY; U_CDECL_BEGIN typedef struct PartialLocationKey { const char16_t* tzID; const char16_t* mzID; UBool isLong; } PartialLocationKey; /** * Hash function for partial location name hash key */ static int32_t U_CALLCONV hashPartialLocationKey(const UHashTok key) { // &#[L|S] PartialLocationKey *p = (PartialLocationKey *)key.pointer; UnicodeString str(p->tzID); str.append((char16_t)0x26) .append(p->mzID, -1) .append((char16_t)0x23) .append((char16_t)(p->isLong ? 
0x4C : 0x53)); return str.hashCode(); } /** * Comparer for partial location name hash key */ static UBool U_CALLCONV comparePartialLocationKey(const UHashTok key1, const UHashTok key2) { PartialLocationKey *p1 = (PartialLocationKey *)key1.pointer; PartialLocationKey *p2 = (PartialLocationKey *)key2.pointer; if (p1 == p2) { return true; } if (p1 == nullptr || p2 == nullptr) { return false; } // We just check identity of tzID/mzID return (p1->tzID == p2->tzID && p1->mzID == p2->mzID && p1->isLong == p2->isLong); } /** * Deleter for GNameInfo */ static void U_CALLCONV deleteGNameInfo(void *obj) { uprv_free(obj); } /** * GNameInfo stores zone name information in the local trie */ typedef struct GNameInfo { UTimeZoneGenericNameType type; const char16_t* tzID; } ZNameInfo; /** * GMatchInfo stores zone name match information used by find method */ typedef struct GMatchInfo { const GNameInfo* gnameInfo; int32_t matchLength; UTimeZoneFormatTimeType timeType; } ZMatchInfo; U_CDECL_END // --------------------------------------------------- // The class stores time zone generic name match information // --------------------------------------------------- class TimeZoneGenericNameMatchInfo : public UMemory { public: TimeZoneGenericNameMatchInfo(UVector* matches); ~TimeZoneGenericNameMatchInfo(); int32_t size() const; UTimeZoneGenericNameType getGenericNameType(int32_t index) const; int32_t getMatchLength(int32_t index) const; UnicodeString& getTimeZoneID(int32_t index, UnicodeString& tzID) const; private: UVector* fMatches; // vector of MatchEntry }; TimeZoneGenericNameMatchInfo::TimeZoneGenericNameMatchInfo(UVector* matches) : fMatches(matches) { } TimeZoneGenericNameMatchInfo::~TimeZoneGenericNameMatchInfo() { if (fMatches != nullptr) { delete fMatches; } } int32_t TimeZoneGenericNameMatchInfo::size() const { if (fMatches == nullptr) { return 0; } return fMatches->size(); } UTimeZoneGenericNameType TimeZoneGenericNameMatchInfo::getGenericNameType(int32_t index) const { 
GMatchInfo *minfo = (GMatchInfo *)fMatches->elementAt(index); if (minfo != nullptr) { return static_cast(minfo->gnameInfo->type); } return UTZGNM_UNKNOWN; } int32_t TimeZoneGenericNameMatchInfo::getMatchLength(int32_t index) const { ZMatchInfo *minfo = (ZMatchInfo *)fMatches->elementAt(index); if (minfo != nullptr) { return minfo->matchLength; } return -1; } UnicodeString& TimeZoneGenericNameMatchInfo::getTimeZoneID(int32_t index, UnicodeString& tzID) const { GMatchInfo *minfo = (GMatchInfo *)fMatches->elementAt(index); if (minfo != nullptr && minfo->gnameInfo->tzID != nullptr) { tzID.setTo(true, minfo->gnameInfo->tzID, -1); } else { tzID.setToBogus(); } return tzID; } // --------------------------------------------------- // GNameSearchHandler // --------------------------------------------------- class GNameSearchHandler : public TextTrieMapSearchResultHandler { public: GNameSearchHandler(uint32_t types); virtual ~GNameSearchHandler(); UBool handleMatch(int32_t matchLength, const CharacterNode *node, UErrorCode &status) override; UVector* getMatches(int32_t& maxMatchLen); private: uint32_t fTypes; UVector* fResults; int32_t fMaxMatchLen; }; GNameSearchHandler::GNameSearchHandler(uint32_t types) : fTypes(types), fResults(nullptr), fMaxMatchLen(0) { } GNameSearchHandler::~GNameSearchHandler() { if (fResults != nullptr) { delete fResults; } } UBool GNameSearchHandler::handleMatch(int32_t matchLength, const CharacterNode *node, UErrorCode &status) { if (U_FAILURE(status)) { return false; } if (node->hasValues()) { int32_t valuesCount = node->countValues(); for (int32_t i = 0; i < valuesCount; i++) { GNameInfo *nameinfo = (ZNameInfo *)node->getValue(i); if (nameinfo == nullptr) { break; } if ((nameinfo->type & fTypes) != 0) { // matches a requested type if (fResults == nullptr) { LocalPointer lpResults(new UVector(uprv_free, nullptr, status), status); if (U_FAILURE(status)) { return false; } fResults = lpResults.orphan(); } GMatchInfo *gmatch = (GMatchInfo 
*)uprv_malloc(sizeof(GMatchInfo)); if (gmatch == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; return false; } // add the match to the vector gmatch->gnameInfo = nameinfo; gmatch->matchLength = matchLength; gmatch->timeType = UTZFMT_TIME_TYPE_UNKNOWN; fResults->adoptElement(gmatch, status); if (U_FAILURE(status)) { return false; } if (matchLength > fMaxMatchLen) { fMaxMatchLen = matchLength; } } } } return true; } UVector* GNameSearchHandler::getMatches(int32_t& maxMatchLen) { // give the ownership to the caller UVector *results = fResults; maxMatchLen = fMaxMatchLen; // reset fResults = nullptr; fMaxMatchLen = 0; return results; } static UMutex gLock; class TZGNCore : public UMemory { public: TZGNCore(const Locale& locale, UErrorCode& status); virtual ~TZGNCore(); UnicodeString& getDisplayName(const TimeZone& tz, UTimeZoneGenericNameType type, UDate date, UnicodeString& name) const; UnicodeString& getGenericLocationName(const UnicodeString& tzCanonicalID, UnicodeString& name) const; int32_t findBestMatch(const UnicodeString& text, int32_t start, uint32_t types, UnicodeString& tzID, UTimeZoneFormatTimeType& timeType, UErrorCode& status) const; private: Locale fLocale; const TimeZoneNames* fTimeZoneNames; UHashtable* fLocationNamesMap; UHashtable* fPartialLocationNamesMap; SimpleFormatter fRegionFormat; SimpleFormatter fFallbackFormat; LocaleDisplayNames* fLocaleDisplayNames; ZNStringPool fStringPool; TextTrieMap fGNamesTrie; UBool fGNamesTrieFullyLoaded; char fTargetRegion[ULOC_COUNTRY_CAPACITY]; void initialize(const Locale& locale, UErrorCode& status); void cleanup(); void loadStrings(const UnicodeString& tzCanonicalID); const char16_t* getGenericLocationName(const UnicodeString& tzCanonicalID); UnicodeString& formatGenericNonLocationName(const TimeZone& tz, UTimeZoneGenericNameType type, UDate date, UnicodeString& name) const; UnicodeString& getPartialLocationName(const UnicodeString& tzCanonicalID, const UnicodeString& mzID, UBool isLong, const UnicodeString& 
mzDisplayName, UnicodeString& name) const; const char16_t* getPartialLocationName(const UnicodeString& tzCanonicalID, const UnicodeString& mzID, UBool isLong, const UnicodeString& mzDisplayName); TimeZoneGenericNameMatchInfo* findLocal(const UnicodeString& text, int32_t start, uint32_t types, UErrorCode& status) const; TimeZoneNames::MatchInfoCollection* findTimeZoneNames(const UnicodeString& text, int32_t start, uint32_t types, UErrorCode& status) const; }; // --------------------------------------------------- // TZGNCore - core implementation of TimeZoneGenericNames // // TimeZoneGenericNames is parallel to TimeZoneNames, // but handles run-time generated time zone names. // This is the main part of this module. // --------------------------------------------------- TZGNCore::TZGNCore(const Locale& locale, UErrorCode& status) : fLocale(locale), fTimeZoneNames(nullptr), fLocationNamesMap(nullptr), fPartialLocationNamesMap(nullptr), fLocaleDisplayNames(nullptr), fStringPool(status), fGNamesTrie(true, deleteGNameInfo), fGNamesTrieFullyLoaded(false) { initialize(locale, status); } TZGNCore::~TZGNCore() { cleanup(); } void TZGNCore::initialize(const Locale& locale, UErrorCode& status) { if (U_FAILURE(status)) { return; } // TimeZoneNames fTimeZoneNames = TimeZoneNames::createInstance(locale, status); if (U_FAILURE(status)) { return; } // Initialize format patterns UnicodeString rpat(true, gDefRegionPattern, -1); UnicodeString fpat(true, gDefFallbackPattern, -1); UErrorCode tmpsts = U_ZERO_ERROR; // OK with fallback warning.. 
UResourceBundle *zoneStrings = ures_open(U_ICUDATA_ZONE, locale.getName(), &tmpsts); zoneStrings = ures_getByKeyWithFallback(zoneStrings, gZoneStrings, zoneStrings, &tmpsts); if (U_SUCCESS(tmpsts)) { const char16_t *regionPattern = ures_getStringByKeyWithFallback(zoneStrings, gRegionFormatTag, nullptr, &tmpsts); if (U_SUCCESS(tmpsts) && u_strlen(regionPattern) > 0) { rpat.setTo(regionPattern, -1); } tmpsts = U_ZERO_ERROR; const char16_t *fallbackPattern = ures_getStringByKeyWithFallback(zoneStrings, gFallbackFormatTag, nullptr, &tmpsts); if (U_SUCCESS(tmpsts) && u_strlen(fallbackPattern) > 0) { fpat.setTo(fallbackPattern, -1); } } ures_close(zoneStrings); fRegionFormat.applyPatternMinMaxArguments(rpat, 1, 1, status); fFallbackFormat.applyPatternMinMaxArguments(fpat, 2, 2, status); if (U_FAILURE(status)) { cleanup(); return; } // locale display names fLocaleDisplayNames = LocaleDisplayNames::createInstance(locale); // hash table for names - no key/value deleters fLocationNamesMap = uhash_open(uhash_hashUChars, uhash_compareUChars, nullptr, &status); if (U_FAILURE(status)) { cleanup(); return; } fPartialLocationNamesMap = uhash_open(hashPartialLocationKey, comparePartialLocationKey, nullptr, &status); if (U_FAILURE(status)) { cleanup(); return; } uhash_setKeyDeleter(fPartialLocationNamesMap, uprv_free); // no value deleter // target region const char* region = fLocale.getCountry(); int32_t regionLen = static_cast(uprv_strlen(region)); if (regionLen == 0) { CharString loc; { CharStringByteSink sink(&loc); ulocimp_addLikelySubtags(fLocale.getName(), sink, &status); } regionLen = uloc_getCountry(loc.data(), fTargetRegion, sizeof(fTargetRegion), &status); if (U_SUCCESS(status)) { fTargetRegion[regionLen] = 0; } else { cleanup(); return; } } else if (regionLen < (int32_t)sizeof(fTargetRegion)) { uprv_strcpy(fTargetRegion, region); } else { fTargetRegion[0] = 0; } // preload generic names for the default zone TimeZone *tz = TimeZone::createDefault(); const char16_t *tzID = 
ZoneMeta::getCanonicalCLDRID(*tz); if (tzID != nullptr) { loadStrings(UnicodeString(true, tzID, -1)); } delete tz; } void TZGNCore::cleanup() { if (fLocaleDisplayNames != nullptr) { delete fLocaleDisplayNames; } if (fTimeZoneNames != nullptr) { delete fTimeZoneNames; } uhash_close(fLocationNamesMap); uhash_close(fPartialLocationNamesMap); } UnicodeString& TZGNCore::getDisplayName(const TimeZone& tz, UTimeZoneGenericNameType type, UDate date, UnicodeString& name) const { name.setToBogus(); switch (type) { case UTZGNM_LOCATION: { const char16_t* tzCanonicalID = ZoneMeta::getCanonicalCLDRID(tz); if (tzCanonicalID != nullptr) { getGenericLocationName(UnicodeString(true, tzCanonicalID, -1), name); } } break; case UTZGNM_LONG: case UTZGNM_SHORT: formatGenericNonLocationName(tz, type, date, name); if (name.isEmpty()) { const char16_t* tzCanonicalID = ZoneMeta::getCanonicalCLDRID(tz); if (tzCanonicalID != nullptr) { getGenericLocationName(UnicodeString(true, tzCanonicalID, -1), name); } } break; default: break; } return name; } UnicodeString& TZGNCore::getGenericLocationName(const UnicodeString& tzCanonicalID, UnicodeString& name) const { if (tzCanonicalID.isEmpty()) { name.setToBogus(); return name; } const char16_t *locname = nullptr; TZGNCore *nonConstThis = const_cast(this); umtx_lock(&gLock); { locname = nonConstThis->getGenericLocationName(tzCanonicalID); } umtx_unlock(&gLock); if (locname == nullptr) { name.setToBogus(); } else { name.setTo(locname, u_strlen(locname)); } return name; } /* * This method updates the cache and must be called with a lock */ const char16_t* TZGNCore::getGenericLocationName(const UnicodeString& tzCanonicalID) { U_ASSERT(!tzCanonicalID.isEmpty()); if (tzCanonicalID.length() > ZID_KEY_MAX) { return nullptr; } UErrorCode status = U_ZERO_ERROR; char16_t tzIDKey[ZID_KEY_MAX + 1]; int32_t tzIDKeyLen = tzCanonicalID.extract(tzIDKey, ZID_KEY_MAX + 1, status); U_ASSERT(status == U_ZERO_ERROR); // already checked length above tzIDKey[tzIDKeyLen] = 
0; const char16_t *locname = (const char16_t *)uhash_get(fLocationNamesMap, tzIDKey); if (locname != nullptr) { // gEmpty indicate the name is not available if (locname == gEmpty) { return nullptr; } return locname; } // Construct location name UnicodeString name; UnicodeString usCountryCode; UBool isPrimary = false; ZoneMeta::getCanonicalCountry(tzCanonicalID, usCountryCode, &isPrimary); if (!usCountryCode.isEmpty()) { if (isPrimary) { // If this is the primary zone in the country, use the country name. char countryCode[ULOC_COUNTRY_CAPACITY]; U_ASSERT(usCountryCode.length() < ULOC_COUNTRY_CAPACITY); int32_t ccLen = usCountryCode.extract(0, usCountryCode.length(), countryCode, sizeof(countryCode), US_INV); countryCode[ccLen] = 0; UnicodeString country; fLocaleDisplayNames->regionDisplayName(countryCode, country); fRegionFormat.format(country, name, status); } else { // If this is not the primary zone in the country, // use the exemplar city name. // getExemplarLocationName should return non-empty string // if the time zone is associated with a region UnicodeString city; fTimeZoneNames->getExemplarLocationName(tzCanonicalID, city); fRegionFormat.format(city, name, status); } if (U_FAILURE(status)) { return nullptr; } } locname = name.isEmpty() ? 
nullptr : fStringPool.get(name, status); if (U_SUCCESS(status)) { // Cache the result const char16_t* cacheID = ZoneMeta::findTimeZoneID(tzCanonicalID); U_ASSERT(cacheID != nullptr); if (locname == nullptr) { // gEmpty to indicate - no location name available uhash_put(fLocationNamesMap, (void *)cacheID, (void *)gEmpty, &status); } else { uhash_put(fLocationNamesMap, (void *)cacheID, (void *)locname, &status); if (U_FAILURE(status)) { locname = nullptr; } else { // put the name info into the trie GNameInfo *nameinfo = (ZNameInfo *)uprv_malloc(sizeof(GNameInfo)); if (nameinfo != nullptr) { nameinfo->type = UTZGNM_LOCATION; nameinfo->tzID = cacheID; fGNamesTrie.put(locname, nameinfo, status); } } } } return locname; } UnicodeString& TZGNCore::formatGenericNonLocationName(const TimeZone& tz, UTimeZoneGenericNameType type, UDate date, UnicodeString& name) const { U_ASSERT(type == UTZGNM_LONG || type == UTZGNM_SHORT); name.setToBogus(); const char16_t* uID = ZoneMeta::getCanonicalCLDRID(tz); if (uID == nullptr) { return name; } UnicodeString tzID(true, uID, -1); // Try to get a name from time zone first UTimeZoneNameType nameType = (type == UTZGNM_LONG) ? 
UTZNM_LONG_GENERIC : UTZNM_SHORT_GENERIC; fTimeZoneNames->getTimeZoneDisplayName(tzID, nameType, name); if (!name.isEmpty()) { return name; } // Try meta zone char16_t mzIDBuf[32]; UnicodeString mzID(mzIDBuf, 0, UPRV_LENGTHOF(mzIDBuf)); fTimeZoneNames->getMetaZoneID(tzID, date, mzID); if (!mzID.isEmpty()) { UErrorCode status = U_ZERO_ERROR; UBool useStandard = false; int32_t raw, sav; char16_t tmpNameBuf[ZONE_NAME_U16_MAX]; tz.getOffset(date, false, raw, sav, status); if (U_FAILURE(status)) { return name; } if (sav == 0) { useStandard = true; TimeZone *tmptz = tz.clone(); // Check if the zone actually uses daylight saving time around the time BasicTimeZone *btz = nullptr; if (dynamic_cast(tmptz) != nullptr || dynamic_cast(tmptz) != nullptr || dynamic_cast(tmptz) != nullptr || dynamic_cast(tmptz) != nullptr) { btz = (BasicTimeZone*)tmptz; } if (btz != nullptr) { TimeZoneTransition before; UBool beforTrs = btz->getPreviousTransition(date, true, before); if (beforTrs && (date - before.getTime() < kDstCheckRange) && before.getFrom()->getDSTSavings() != 0) { useStandard = false; } else { TimeZoneTransition after; UBool afterTrs = btz->getNextTransition(date, false, after); if (afterTrs && (after.getTime() - date < kDstCheckRange) && after.getTo()->getDSTSavings() != 0) { useStandard = false; } } } else { // If not BasicTimeZone... only if the instance is not an ICU's implementation. // We may get a wrong answer in edge case, but it should practically work OK. tmptz->getOffset(date - kDstCheckRange, false, raw, sav, status); if (sav != 0) { useStandard = false; } else { tmptz->getOffset(date + kDstCheckRange, false, raw, sav, status); if (sav != 0){ useStandard = false; } } if (U_FAILURE(status)) { delete tmptz; return name; } } delete tmptz; } if (useStandard) { UTimeZoneNameType stdNameType = (nameType == UTZNM_LONG_GENERIC) ? 
UTZNM_LONG_STANDARD : UTZNM_SHORT_STANDARD; UnicodeString stdName(tmpNameBuf, 0, UPRV_LENGTHOF(tmpNameBuf)); fTimeZoneNames->getDisplayName(tzID, stdNameType, date, stdName); if (!stdName.isEmpty()) { name.setTo(stdName); // TODO: revisit this issue later // In CLDR, a same display name is used for both generic and standard // for some meta zones in some locales. This looks like a data bugs. // For now, we check if the standard name is different from its generic // name below. char16_t genNameBuf[ZONE_NAME_U16_MAX]; UnicodeString mzGenericName(genNameBuf, 0, UPRV_LENGTHOF(genNameBuf)); fTimeZoneNames->getMetaZoneDisplayName(mzID, nameType, mzGenericName); if (stdName.caseCompare(mzGenericName, 0) == 0) { name.setToBogus(); } } } if (name.isEmpty()) { // Get a name from meta zone UnicodeString mzName(tmpNameBuf, 0, UPRV_LENGTHOF(tmpNameBuf)); fTimeZoneNames->getMetaZoneDisplayName(mzID, nameType, mzName); if (!mzName.isEmpty()) { // Check if we need to use a partial location format. // This check is done by comparing offset with the meta zone's // golden zone at the given date. char16_t idBuf[32]; UnicodeString goldenID(idBuf, 0, UPRV_LENGTHOF(idBuf)); fTimeZoneNames->getReferenceZoneID(mzID, fTargetRegion, goldenID); if (!goldenID.isEmpty() && goldenID != tzID) { TimeZone *goldenZone = TimeZone::createTimeZone(goldenID); int32_t raw1, sav1; // Check offset in the golden zone with wall time. // With getOffset(date, false, offsets1), // you may get incorrect results because of time overlap at DST->STD // transition. 
goldenZone->getOffset(date + raw + sav, true, raw1, sav1, status); delete goldenZone; if (U_SUCCESS(status)) { if (raw != raw1 || sav != sav1) { // Now we need to use a partial location format getPartialLocationName(tzID, mzID, (nameType == UTZNM_LONG_GENERIC), mzName, name); } else { name.setTo(mzName); } } } else { name.setTo(mzName); } } } } return name; } UnicodeString& TZGNCore::getPartialLocationName(const UnicodeString& tzCanonicalID, const UnicodeString& mzID, UBool isLong, const UnicodeString& mzDisplayName, UnicodeString& name) const { name.setToBogus(); if (tzCanonicalID.isEmpty() || mzID.isEmpty() || mzDisplayName.isEmpty()) { return name; } const char16_t *uplname = nullptr; TZGNCore *nonConstThis = const_cast(this); umtx_lock(&gLock); { uplname = nonConstThis->getPartialLocationName(tzCanonicalID, mzID, isLong, mzDisplayName); } umtx_unlock(&gLock); if (uplname == nullptr) { name.setToBogus(); } else { name.setTo(true, uplname, -1); } return name; } /* * This method updates the cache and must be called with a lock */ const char16_t* TZGNCore::getPartialLocationName(const UnicodeString& tzCanonicalID, const UnicodeString& mzID, UBool isLong, const UnicodeString& mzDisplayName) { U_ASSERT(!tzCanonicalID.isEmpty()); U_ASSERT(!mzID.isEmpty()); U_ASSERT(!mzDisplayName.isEmpty()); PartialLocationKey key; key.tzID = ZoneMeta::findTimeZoneID(tzCanonicalID); key.mzID = ZoneMeta::findMetaZoneID(mzID); key.isLong = isLong; U_ASSERT(key.tzID != nullptr && key.mzID != nullptr); const char16_t* uplname = (const char16_t*)uhash_get(fPartialLocationNamesMap, (void *)&key); if (uplname != nullptr) { return uplname; } UnicodeString location; UnicodeString usCountryCode; ZoneMeta::getCanonicalCountry(tzCanonicalID, usCountryCode); if (!usCountryCode.isEmpty()) { char countryCode[ULOC_COUNTRY_CAPACITY]; U_ASSERT(usCountryCode.length() < ULOC_COUNTRY_CAPACITY); int32_t ccLen = usCountryCode.extract(0, usCountryCode.length(), countryCode, sizeof(countryCode), US_INV); 
countryCode[ccLen] = 0; UnicodeString regionalGolden; fTimeZoneNames->getReferenceZoneID(mzID, countryCode, regionalGolden); if (tzCanonicalID == regionalGolden) { // Use country name fLocaleDisplayNames->regionDisplayName(countryCode, location); } else { // Otherwise, use exemplar city name fTimeZoneNames->getExemplarLocationName(tzCanonicalID, location); } } else { fTimeZoneNames->getExemplarLocationName(tzCanonicalID, location); if (location.isEmpty()) { // This could happen when the time zone is not associated with a country, // and its ID is not hierarchical, for example, CST6CDT. // We use the canonical ID itself as the location for this case. location.setTo(tzCanonicalID); } } UErrorCode status = U_ZERO_ERROR; UnicodeString name; fFallbackFormat.format(location, mzDisplayName, name, status); if (U_FAILURE(status)) { return nullptr; } uplname = fStringPool.get(name, status); if (U_SUCCESS(status)) { // Add the name to cache PartialLocationKey* cacheKey = (PartialLocationKey *)uprv_malloc(sizeof(PartialLocationKey)); if (cacheKey != nullptr) { cacheKey->tzID = key.tzID; cacheKey->mzID = key.mzID; cacheKey->isLong = key.isLong; uhash_put(fPartialLocationNamesMap, (void *)cacheKey, (void *)uplname, &status); if (U_FAILURE(status)) { uprv_free(cacheKey); } else { // put the name to the local trie as well GNameInfo *nameinfo = (ZNameInfo *)uprv_malloc(sizeof(GNameInfo)); if (nameinfo != nullptr) { nameinfo->type = isLong ? UTZGNM_LONG : UTZGNM_SHORT; nameinfo->tzID = key.tzID; fGNamesTrie.put(uplname, nameinfo, status); } } } } return uplname; } /* * This method updates the cache and must be called with a lock, * except initializer. 
*/ void TZGNCore::loadStrings(const UnicodeString& tzCanonicalID) { // load the generic location name getGenericLocationName(tzCanonicalID); // partial location names UErrorCode status = U_ZERO_ERROR; const UnicodeString *mzID; UnicodeString goldenID; UnicodeString mzGenName; UTimeZoneNameType genNonLocTypes[] = { UTZNM_LONG_GENERIC, UTZNM_SHORT_GENERIC, UTZNM_UNKNOWN /*terminator*/ }; StringEnumeration *mzIDs = fTimeZoneNames->getAvailableMetaZoneIDs(tzCanonicalID, status); while ((mzID = mzIDs->snext(status)) != nullptr) { if (U_FAILURE(status)) { break; } // if this time zone is not the golden zone of the meta zone, // partial location name (such as "PT (Los Angeles)") might be // available. fTimeZoneNames->getReferenceZoneID(*mzID, fTargetRegion, goldenID); if (tzCanonicalID != goldenID) { for (int32_t i = 0; genNonLocTypes[i] != UTZNM_UNKNOWN; i++) { fTimeZoneNames->getMetaZoneDisplayName(*mzID, genNonLocTypes[i], mzGenName); if (!mzGenName.isEmpty()) { // getPartialLocationName formats a name and put it into the trie getPartialLocationName(tzCanonicalID, *mzID, (genNonLocTypes[i] == UTZNM_LONG_GENERIC), mzGenName); } } } } if (mzIDs != nullptr) { delete mzIDs; } } int32_t TZGNCore::findBestMatch(const UnicodeString& text, int32_t start, uint32_t types, UnicodeString& tzID, UTimeZoneFormatTimeType& timeType, UErrorCode& status) const { timeType = UTZFMT_TIME_TYPE_UNKNOWN; tzID.setToBogus(); if (U_FAILURE(status)) { return 0; } // Find matches in the TimeZoneNames first TimeZoneNames::MatchInfoCollection *tznamesMatches = findTimeZoneNames(text, start, types, status); if (U_FAILURE(status)) { return 0; } int32_t bestMatchLen = 0; UTimeZoneFormatTimeType bestMatchTimeType = UTZFMT_TIME_TYPE_UNKNOWN; UnicodeString bestMatchTzID; // UBool isLongStandard = false; // workaround - see the comments below UBool isStandard = false; // TODO: Temporary hack (on hack) for short standard name/location name conflict (found in zh_Hant), should be removed after CLDR 21m1 
integration if (tznamesMatches != nullptr) { UnicodeString mzID; for (int32_t i = 0; i < tznamesMatches->size(); i++) { int32_t len = tznamesMatches->getMatchLengthAt(i); if (len > bestMatchLen) { bestMatchLen = len; if (!tznamesMatches->getTimeZoneIDAt(i, bestMatchTzID)) { // name for a meta zone if (tznamesMatches->getMetaZoneIDAt(i, mzID)) { fTimeZoneNames->getReferenceZoneID(mzID, fTargetRegion, bestMatchTzID); } } UTimeZoneNameType nameType = tznamesMatches->getNameTypeAt(i); if (U_FAILURE(status)) { break; } switch (nameType) { case UTZNM_LONG_STANDARD: // isLongStandard = true; case UTZNM_SHORT_STANDARD: // this one is never used for generic, but just in case isStandard = true; // TODO: Remove this later, see the comments above. bestMatchTimeType = UTZFMT_TIME_TYPE_STANDARD; break; case UTZNM_LONG_DAYLIGHT: case UTZNM_SHORT_DAYLIGHT: // this one is never used for generic, but just in case bestMatchTimeType = UTZFMT_TIME_TYPE_DAYLIGHT; break; default: bestMatchTimeType = UTZFMT_TIME_TYPE_UNKNOWN; } } } delete tznamesMatches; if (U_FAILURE(status)) { return 0; } if (bestMatchLen == (text.length() - start)) { // Full match //tzID.setTo(bestMatchTzID); //timeType = bestMatchTimeType; //return bestMatchLen; // TODO Some time zone uses a same name for the long standard name // and the location name. When the match is a long standard name, // then we need to check if the name is same with the location name. // This is probably a data error or a design bug. /* if (!isLongStandard) { tzID.setTo(bestMatchTzID); timeType = bestMatchTimeType; return bestMatchLen; } */ // TODO The deprecation of commonlyUsed flag introduced the name // conflict not only for long standard names, but short standard names too. // These short names (found in zh_Hant) should be gone once we clean // up CLDR time zone display name data. Once the short name conflict // problem (with location name) is resolved, we should change the condition // below back to the original one above. 
-Yoshito (2011-09-14) if (!isStandard) { tzID.setTo(bestMatchTzID); timeType = bestMatchTimeType; return bestMatchLen; } } } // Find matches in the local trie TimeZoneGenericNameMatchInfo *localMatches = findLocal(text, start, types, status); if (U_FAILURE(status)) { return 0; } if (localMatches != nullptr) { for (int32_t i = 0; i < localMatches->size(); i++) { int32_t len = localMatches->getMatchLength(i); // TODO See the above TODO. We use len >= bestMatchLen // because of the long standard/location name collision // problem. If it is also a location name, carrying // timeType = UTZFMT_TIME_TYPE_STANDARD will cause a // problem in SimpleDateFormat if (len >= bestMatchLen) { bestMatchLen = localMatches->getMatchLength(i); bestMatchTimeType = UTZFMT_TIME_TYPE_UNKNOWN; // because generic localMatches->getTimeZoneID(i, bestMatchTzID); } } delete localMatches; } if (bestMatchLen > 0) { timeType = bestMatchTimeType; tzID.setTo(bestMatchTzID); } return bestMatchLen; } TimeZoneGenericNameMatchInfo* TZGNCore::findLocal(const UnicodeString& text, int32_t start, uint32_t types, UErrorCode& status) const { GNameSearchHandler handler(types); TZGNCore *nonConstThis = const_cast(this); umtx_lock(&gLock); { fGNamesTrie.search(text, start, (TextTrieMapSearchResultHandler *)&handler, status); } umtx_unlock(&gLock); if (U_FAILURE(status)) { return nullptr; } TimeZoneGenericNameMatchInfo *gmatchInfo = nullptr; int32_t maxLen = 0; UVector *results = handler.getMatches(maxLen); if (results != nullptr && ((maxLen == (text.length() - start)) || fGNamesTrieFullyLoaded)) { // perfect match gmatchInfo = new TimeZoneGenericNameMatchInfo(results); if (gmatchInfo == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; delete results; return nullptr; } return gmatchInfo; } if (results != nullptr) { delete results; } // All names are not yet loaded into the local trie. // Load all available names into the trie. This could be very heavy. 
umtx_lock(&gLock); { if (!fGNamesTrieFullyLoaded) { StringEnumeration *tzIDs = TimeZone::createTimeZoneIDEnumeration(UCAL_ZONE_TYPE_CANONICAL, nullptr, nullptr, status); if (U_SUCCESS(status)) { const UnicodeString *tzID; while ((tzID = tzIDs->snext(status)) != nullptr) { if (U_FAILURE(status)) { break; } nonConstThis->loadStrings(*tzID); } } if (tzIDs != nullptr) { delete tzIDs; } if (U_SUCCESS(status)) { nonConstThis->fGNamesTrieFullyLoaded = true; } } } umtx_unlock(&gLock); if (U_FAILURE(status)) { return nullptr; } umtx_lock(&gLock); { // now try it again fGNamesTrie.search(text, start, (TextTrieMapSearchResultHandler *)&handler, status); } umtx_unlock(&gLock); results = handler.getMatches(maxLen); if (results != nullptr && maxLen > 0) { gmatchInfo = new TimeZoneGenericNameMatchInfo(results); if (gmatchInfo == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; delete results; return nullptr; } } return gmatchInfo; } TimeZoneNames::MatchInfoCollection* TZGNCore::findTimeZoneNames(const UnicodeString& text, int32_t start, uint32_t types, UErrorCode& status) const { // Check if the target name typs is really in the TimeZoneNames uint32_t nameTypes = 0; if (types & UTZGNM_LONG) { nameTypes |= (UTZNM_LONG_GENERIC | UTZNM_LONG_STANDARD); } if (types & UTZGNM_SHORT) { nameTypes |= (UTZNM_SHORT_GENERIC | UTZNM_SHORT_STANDARD); } if (types) { // Find matches in the TimeZoneNames return fTimeZoneNames->find(text, start, nameTypes, status); } return nullptr; } typedef struct TZGNCoreRef { TZGNCore* obj; int32_t refCount; double lastAccess; } TZGNCoreRef; // TZGNCore object cache handling static UMutex gTZGNLock; static UHashtable *gTZGNCoreCache = nullptr; static UBool gTZGNCoreCacheInitialized = false; // Access count - incremented every time up to SWEEP_INTERVAL, // then reset to 0 static int32_t gAccessCount = 0; // Interval for calling the cache sweep function - every 100 times #define SWEEP_INTERVAL 100 // Cache expiration in millisecond. 
When a cached entry is no // longer referenced and exceeding this threshold since last // access time, then the cache entry will be deleted by the sweep // function. For now, 3 minutes. #define CACHE_EXPIRATION 180000.0 U_CDECL_BEGIN /** * Cleanup callback func */ static UBool U_CALLCONV tzgnCore_cleanup() { if (gTZGNCoreCache != nullptr) { uhash_close(gTZGNCoreCache); gTZGNCoreCache = nullptr; } gTZGNCoreCacheInitialized = false; return true; } /** * Deleter for TZGNCoreRef */ static void U_CALLCONV deleteTZGNCoreRef(void *obj) { icu::TZGNCoreRef *entry = (icu::TZGNCoreRef*)obj; delete (icu::TZGNCore*) entry->obj; uprv_free(entry); } U_CDECL_END /** * Function used for removing unreferrenced cache entries exceeding * the expiration time. This function must be called with in the mutex * block. */ static void sweepCache() { int32_t pos = UHASH_FIRST; const UHashElement* elem; double now = (double)uprv_getUTCtime(); while ((elem = uhash_nextElement(gTZGNCoreCache, &pos)) != nullptr) { TZGNCoreRef *entry = (TZGNCoreRef *)elem->value.pointer; if (entry->refCount <= 0 && (now - entry->lastAccess) > CACHE_EXPIRATION) { // delete this entry uhash_removeElement(gTZGNCoreCache, elem); } } } TimeZoneGenericNames::TimeZoneGenericNames() : fRef(0) { } TimeZoneGenericNames::~TimeZoneGenericNames() { umtx_lock(&gTZGNLock); { U_ASSERT(fRef->refCount > 0); // Just decrement the reference count fRef->refCount--; } umtx_unlock(&gTZGNLock); } TimeZoneGenericNames* TimeZoneGenericNames::createInstance(const Locale& locale, UErrorCode& status) { if (U_FAILURE(status)) { return nullptr; } TimeZoneGenericNames* instance = new TimeZoneGenericNames(); if (instance == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; return nullptr; } TZGNCoreRef *cacheEntry = nullptr; { Mutex lock(&gTZGNLock); if (!gTZGNCoreCacheInitialized) { // Create empty hashtable gTZGNCoreCache = uhash_open(uhash_hashChars, uhash_compareChars, nullptr, &status); if (U_SUCCESS(status)) { 
uhash_setKeyDeleter(gTZGNCoreCache, uprv_free); uhash_setValueDeleter(gTZGNCoreCache, deleteTZGNCoreRef); gTZGNCoreCacheInitialized = true; ucln_i18n_registerCleanup(UCLN_I18N_TIMEZONEGENERICNAMES, tzgnCore_cleanup); } } if (U_FAILURE(status)) { return nullptr; } // Check the cache, if not available, create new one and cache const char *key = locale.getName(); cacheEntry = (TZGNCoreRef *)uhash_get(gTZGNCoreCache, key); if (cacheEntry == nullptr) { TZGNCore *tzgnCore = nullptr; char *newKey = nullptr; tzgnCore = new TZGNCore(locale, status); if (tzgnCore == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; } if (U_SUCCESS(status)) { newKey = (char *)uprv_malloc(uprv_strlen(key) + 1); if (newKey == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; } else { uprv_strcpy(newKey, key); } } if (U_SUCCESS(status)) { cacheEntry = (TZGNCoreRef *)uprv_malloc(sizeof(TZGNCoreRef)); if (cacheEntry == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; } else { cacheEntry->obj = tzgnCore; cacheEntry->refCount = 1; cacheEntry->lastAccess = (double)uprv_getUTCtime(); uhash_put(gTZGNCoreCache, newKey, cacheEntry, &status); } } if (U_FAILURE(status)) { if (tzgnCore != nullptr) { delete tzgnCore; } if (newKey != nullptr) { uprv_free(newKey); } if (cacheEntry != nullptr) { uprv_free(cacheEntry); } cacheEntry = nullptr; } } else { // Update the reference count cacheEntry->refCount++; cacheEntry->lastAccess = (double)uprv_getUTCtime(); } gAccessCount++; if (gAccessCount >= SWEEP_INTERVAL) { // sweep sweepCache(); gAccessCount = 0; } } // End of mutex locked block if (cacheEntry == nullptr) { delete instance; return nullptr; } instance->fRef = cacheEntry; return instance; } bool TimeZoneGenericNames::operator==(const TimeZoneGenericNames& other) const { // Just compare if the other object also use the same // ref entry return fRef == other.fRef; } TimeZoneGenericNames* TimeZoneGenericNames::clone() const { TimeZoneGenericNames* other = new TimeZoneGenericNames(); if (other) { umtx_lock(&gTZGNLock); 
/**
 * Finds the best generic-name match for `text` starting at `start`.
 *
 * This is a thin forwarding wrapper: all arguments are passed unchanged to the
 * TZGNCore object held by this instance's cache entry (fRef->obj), and that
 * call's return value is returned as-is. fRef is dereferenced without a null
 * check.
 */
int32_t
TimeZoneGenericNames::findBestMatch(const UnicodeString& text, int32_t start, uint32_t types,
        UnicodeString& tzID, UTimeZoneFormatTimeType& timeType, UErrorCode& status) const {
    return fRef->obj->findBestMatch(text, start, types, tzID, timeType, status);
}
class U_I18N_API UFormattedNumberData : public FormattedValueStringBuilderImpl {
public:
    // Initializes the string-builder base with kUndefinedField.
    UFormattedNumberData() : FormattedValueStringBuilderImpl(kUndefinedField) {}
    // Declared here, defined out of line.
    virtual ~UFormattedNumberData();

    // Move operations are explicitly defaulted. No copy operations are declared,
    // so the language rules leave this type move-only.
    UFormattedNumberData(UFormattedNumberData&&) = default;
    UFormattedNumberData& operator=(UFormattedNumberData&&) = default;

    // The formatted quantity.
    DecimalQuantity quantity;

    // The output unit for the formatted quantity.
    // TODO(units,hugovdm): populate this correctly for the general case - it's
    // currently only implemented for the .usage() use case.
    MeasureUnit outputUnit;

    // The gender of the formatted output. Defaults to the empty string.
    // NOTE(review): raw pointer with no visible ownership handling here;
    // presumably points at static storage - confirm where it is assigned.
    const char *gender = "";
};
class U_I18N_API CollationRootElements : public UMemory {
public:
    /**
     * Wraps an existing root elements table.
     * Only the pointer and the length are stored; the array is not copied.
     * NOTE(review): the array presumably has to outlive this object - confirm with callers.
     */
    CollationRootElements(const uint32_t *rootElements, int32_t rootElementsLength)
            : elements(rootElements), length(rootElementsLength) {}

    /**
     * Higher than any root primary.
     */
    static const uint32_t PRIMARY_SENTINEL = 0xffffff00;

    /**
     * Flag in a root element, set if the element contains secondary & tertiary weights,
     * rather than a primary.
     */
    static const uint32_t SEC_TER_DELTA_FLAG = 0x80;
    /**
     * Mask for getting the primary range step value from a primary-range-end element.
     */
    static const uint8_t PRIMARY_STEP_MASK = 0x7f;

    enum {
        /**
         * Index of the first CE with a non-zero tertiary weight.
         * Same as the start of the compact root elements table.
         */
        IX_FIRST_TERTIARY_INDEX,
        /**
         * Index of the first CE with a non-zero secondary weight.
         */
        IX_FIRST_SECONDARY_INDEX,
        /**
         * Index of the first CE with a non-zero primary weight.
         */
        IX_FIRST_PRIMARY_INDEX,
        /**
         * Must match Collation::COMMON_SEC_AND_TER_CE.
         */
        IX_COMMON_SEC_AND_TER_CE,
        /**
         * Secondary & tertiary boundaries.
         * Bits 31..24: [fixed last secondary common byte 45]
         * Bits 23..16: [fixed first ignorable secondary byte 80]
         * Bits 15.. 8: reserved, 0
         * Bits  7.. 0: [fixed first ignorable tertiary byte 3C]
         */
        IX_SEC_TER_BOUNDARIES,
        /**
         * The current number of indexes.
         * Currently the same as elements[IX_FIRST_TERTIARY_INDEX].
         */
        IX_COUNT
    };

    /**
     * Returns the boundary between tertiary weights of primary/secondary CEs
     * and those of tertiary CEs.
     * This is the upper limit for tertiaries of primary/secondary CEs.
     * This minus one is the lower limit for tertiaries of tertiary CEs.
     *
     * Implementation: bits 7..0 of elements[IX_SEC_TER_BOUNDARIES] are moved
     * into bits 15..8 of the result.
     */
    uint32_t getTertiaryBoundary() const {
        return (elements[IX_SEC_TER_BOUNDARIES] << 8) & 0xff00;
    }

    /**
     * Returns the first assigned tertiary CE.
     */
    uint32_t getFirstTertiaryCE() const {
        return elements[elements[IX_FIRST_TERTIARY_INDEX]] & ~SEC_TER_DELTA_FLAG;
    }

    /**
     * Returns the last assigned tertiary CE.
     * This is the element just before the first secondary index.
     */
    uint32_t getLastTertiaryCE() const {
        return elements[elements[IX_FIRST_SECONDARY_INDEX] - 1] & ~SEC_TER_DELTA_FLAG;
    }

    /**
     * Returns the last common secondary weight.
     * This is the lower limit for secondaries of primary CEs.
     *
     * Implementation: bits 31..24 of elements[IX_SEC_TER_BOUNDARIES] are moved
     * into bits 15..8 of the result.
     */
    uint32_t getLastCommonSecondary() const {
        return (elements[IX_SEC_TER_BOUNDARIES] >> 16) & 0xff00;
    }

    /**
     * Returns the boundary between secondary weights of primary CEs
     * and those of secondary CEs.
     * This is the upper limit for secondaries of primary CEs.
     * This minus one is the lower limit for secondaries of secondary CEs.
     *
     * Implementation: bits 23..16 of elements[IX_SEC_TER_BOUNDARIES] are moved
     * into bits 15..8 of the result.
     */
    uint32_t getSecondaryBoundary() const {
        return (elements[IX_SEC_TER_BOUNDARIES] >> 8) & 0xff00;
    }

    /**
     * Returns the first assigned secondary CE.
     */
    uint32_t getFirstSecondaryCE() const {
        return elements[elements[IX_FIRST_SECONDARY_INDEX]] & ~SEC_TER_DELTA_FLAG;
    }

    /**
     * Returns the last assigned secondary CE.
     * This is the element just before the first primary index.
     */
    uint32_t getLastSecondaryCE() const {
        return elements[elements[IX_FIRST_PRIMARY_INDEX] - 1] & ~SEC_TER_DELTA_FLAG;
    }

    /**
     * Returns the first assigned primary weight.
     */
    uint32_t getFirstPrimary() const {
        return elements[elements[IX_FIRST_PRIMARY_INDEX]];  // step=0: cannot be a range end
    }

    /**
     * Returns the first assigned primary CE.
     */
    int64_t getFirstPrimaryCE() const {
        return Collation::makeCE(getFirstPrimary());
    }

    /**
     * Returns the last root CE with a primary weight before p.
     * Intended only for reordering group boundaries.
     */
    int64_t lastCEWithPrimaryBefore(uint32_t p) const;

    /**
     * Returns the first root CE with a primary weight of at least p.
     * Intended only for reordering group boundaries.
     */
    int64_t firstCEWithPrimaryAtLeast(uint32_t p) const;

    /**
     * Returns the primary weight before p.
     * p must be greater than the first root primary.
     */
    uint32_t getPrimaryBefore(uint32_t p, UBool isCompressible) const;

    /** Returns the secondary weight before [p, s]. */
    uint32_t getSecondaryBefore(uint32_t p, uint32_t s) const;

    /** Returns the tertiary weight before [p, s, t]. */
    uint32_t getTertiaryBefore(uint32_t p, uint32_t s, uint32_t t) const;

    /**
     * Finds the index of the input primary.
     * p must occur as a root primary, and must not be 0.
     */
    int32_t findPrimary(uint32_t p) const;

    /**
     * Returns the primary weight after p where index=findPrimary(p).
     * p must be at least the first root primary.
     */
    uint32_t getPrimaryAfter(uint32_t p, int32_t index, UBool isCompressible) const;
    /**
     * Returns the secondary weight after [p, s] where index=findPrimary(p)
     * except use index=0 for p=0.
     *
     * Must return a weight for every root [p, s] as well as for every weight
     * returned by getSecondaryBefore(). If p!=0 then s can be BEFORE_WEIGHT16.
     *
     * Exception: [0, 0] is handled by the CollationBuilder:
     * Both its lower and upper boundaries are special.
     */
    uint32_t getSecondaryAfter(int32_t index, uint32_t s) const;
    /**
     * Returns the tertiary weight after [p, s, t] where index=findPrimary(p)
     * except use index=0 for p=0.
     *
     * Must return a weight for every root [p, s, t] as well as for every weight
     * returned by getTertiaryBefore(). If s!=0 then t can be BEFORE_WEIGHT16.
     *
     * Exception: [0, 0, 0] is handled by the CollationBuilder:
     * Both its lower and upper boundaries are special.
     */
    uint32_t getTertiaryAfter(int32_t index, uint32_t s, uint32_t t) const;

private:
    /**
     * Returns the first secondary & tertiary weights for p where index=findPrimary(p)+1.
     */
    uint32_t getFirstSecTerForPrimary(int32_t index) const;

    /**
     * Finds the largest index i where elements[i]<=p.
     * Requires first primary<=p<0xffffff00 (PRIMARY_SENTINEL).
     * Does not require that p is a root collator primary.
     */
    int32_t findP(uint32_t p) const;

    /**
     * Returns true if q is a primary element (SEC_TER_DELTA_FLAG not set)
     * whose step bits (PRIMARY_STEP_MASK) are non-zero, i.e. the end of a primary range.
     */
    static inline UBool isEndOfPrimaryRange(uint32_t q) {
        return (q & SEC_TER_DELTA_FLAG) == 0 && (q & PRIMARY_STEP_MASK) != 0;
    }

    /**
     * Data structure:
     *
     * The first few entries are indexes, up to elements[IX_FIRST_TERTIARY_INDEX].
     * See the comments on the IX_ constants.
     *
     * All other elements are a compact form of the root collator CEs
     * in mostly collation order.
     *
     * A sequence of one or more root CEs with the same primary weight is stored as
     * one element with the primary weight, with the SEC_TER_DELTA_FLAG flag not set,
     * followed by elements with only the secondary/tertiary weights,
     * each with that flag set.
     * If the lowest secondary/tertiary combination is Collation::COMMON_SEC_AND_TER_CE,
     * then the element for that combination is omitted.
     *
     * Note: If the first actual secondary/tertiary combination is higher than
     * Collation::COMMON_SEC_AND_TER_CE (which is unusual),
     * the runtime code will assume anyway that Collation::COMMON_SEC_AND_TER_CE is present.
     *
     * A range of only-primary CEs with a consistent "step" increment
     * from each primary to the next may be stored as a range.
     * Only the first and last primary are stored, and the last has the step
     * value in the low bits (PRIMARY_STEP_MASK).
     *
     * A range-end element may also either start a new range or be followed by
     * elements with secondary/tertiary deltas.
     *
     * A primary element that is not a range end has zero step bits.
     *
     * There is no element for the completely ignorable CE (all weights 0).
     *
     * Before elements[IX_FIRST_PRIMARY_INDEX], all elements are secondary/tertiary deltas,
     * for all of the ignorable root CEs.
     *
     * There are no elements for unassigned-implicit primary CEs.
     * All primaries stored here are at most 3 bytes long.
     */
    const uint32_t *elements;
    /** Number of entries in elements, as passed to the constructor. */
    int32_t length;
};
// License & terms of use: http://www.unicode.org/copyright.html #include "unicode/utypes.h" #if !UCONFIG_NO_FORMATTING #ifndef __NUMPARSE_SYMBOLS_H__ #define __NUMPARSE_SYMBOLS_H__ #include "numparse_types.h" #include "unicode/uniset.h" #include "static_unicode_sets.h" U_NAMESPACE_BEGIN namespace numparse { namespace impl { /** * A base class for many matchers that performs a simple match against a UnicodeString and/or UnicodeSet. * * @author sffc */ // Exported as U_I18N_API for tests class U_I18N_API SymbolMatcher : public NumberParseMatcher, public UMemory { public: SymbolMatcher() = default; // WARNING: Leaves the object in an unusable state const UnicodeSet* getSet() const; bool match(StringSegment& segment, ParsedNumber& result, UErrorCode& status) const override; bool smokeTest(const StringSegment& segment) const override; UnicodeString toString() const override; virtual bool isDisabled(const ParsedNumber& result) const = 0; virtual void accept(StringSegment& segment, ParsedNumber& result) const = 0; protected: UnicodeString fString; const UnicodeSet* fUniSet; // a reference from numparse_unisets.h; never owned SymbolMatcher(const UnicodeString& symbolString, unisets::Key key); }; // Exported as U_I18N_API for tests class U_I18N_API IgnorablesMatcher : public SymbolMatcher { public: IgnorablesMatcher() = default; // WARNING: Leaves the object in an unusable state IgnorablesMatcher(parse_flags_t parseFlags); bool isFlexible() const override; UnicodeString toString() const override; protected: bool isDisabled(const ParsedNumber& result) const override; void accept(StringSegment& segment, ParsedNumber& result) const override; }; class InfinityMatcher : public SymbolMatcher { public: InfinityMatcher() = default; // WARNING: Leaves the object in an unusable state InfinityMatcher(const DecimalFormatSymbols& dfs); protected: bool isDisabled(const ParsedNumber& result) const override; void accept(StringSegment& segment, ParsedNumber& result) const override; }; // 
Exported as U_I18N_API for tests class U_I18N_API MinusSignMatcher : public SymbolMatcher { public: MinusSignMatcher() = default; // WARNING: Leaves the object in an unusable state MinusSignMatcher(const DecimalFormatSymbols& dfs, bool allowTrailing); protected: bool isDisabled(const ParsedNumber& result) const override; void accept(StringSegment& segment, ParsedNumber& result) const override; private: bool fAllowTrailing; }; class NanMatcher : public SymbolMatcher { public: NanMatcher() = default; // WARNING: Leaves the object in an unusable state NanMatcher(const DecimalFormatSymbols& dfs); protected: bool isDisabled(const ParsedNumber& result) const override; void accept(StringSegment& segment, ParsedNumber& result) const override; }; class PaddingMatcher : public SymbolMatcher { public: PaddingMatcher() = default; // WARNING: Leaves the object in an unusable state PaddingMatcher(const UnicodeString& padString); bool isFlexible() const override; protected: bool isDisabled(const ParsedNumber& result) const override; void accept(StringSegment& segment, ParsedNumber& result) const override; }; // Exported as U_I18N_API for tests class U_I18N_API PercentMatcher : public SymbolMatcher { public: PercentMatcher() = default; // WARNING: Leaves the object in an unusable state PercentMatcher(const DecimalFormatSymbols& dfs); protected: bool isDisabled(const ParsedNumber& result) const override; void accept(StringSegment& segment, ParsedNumber& result) const override; }; // Exported as U_I18N_API for tests class U_I18N_API PermilleMatcher : public SymbolMatcher { public: PermilleMatcher() = default; // WARNING: Leaves the object in an unusable state PermilleMatcher(const DecimalFormatSymbols& dfs); protected: bool isDisabled(const ParsedNumber& result) const override; void accept(StringSegment& segment, ParsedNumber& result) const override; }; // Exported as U_I18N_API for tests class U_I18N_API PlusSignMatcher : public SymbolMatcher { public: PlusSignMatcher() = default; 
// WARNING: Leaves the object in an unusable state PlusSignMatcher(const DecimalFormatSymbols& dfs, bool allowTrailing); protected: bool isDisabled(const ParsedNumber& result) const override; void accept(StringSegment& segment, ParsedNumber& result) const override; private: bool fAllowTrailing; }; } // namespace impl } // namespace numparse U_NAMESPACE_END #endif //__NUMPARSE_SYMBOLS_H__ #endif /* #if !UCONFIG_NO_FORMATTING */ stringi/src/icu74/i18n/uni2name.cpp0000644000176200001440000000710214700200761016451 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ********************************************************************** * Copyright (C) 2001-2011, International Business Machines * Corporation and others. All Rights Reserved. ********************************************************************** * Date Name Description * 06/06/01 aliu Creation. ********************************************************************** */ #include "unicode/utypes.h" #if !UCONFIG_NO_TRANSLITERATION #include "unicode/unifilt.h" #include "unicode/uchar.h" #include "unicode/utf16.h" #include "uni2name.h" #include "cstring.h" #include "cmemory.h" #include "uprops.h" U_NAMESPACE_BEGIN UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnicodeNameTransliterator) static const char16_t OPEN_DELIM[] = {92,78,123,0}; // "\N{" static const char16_t CLOSE_DELIM = 125; // "}" #define OPEN_DELIM_LEN 3 /** * Constructs a transliterator. */ UnicodeNameTransliterator::UnicodeNameTransliterator(UnicodeFilter* adoptedFilter) : Transliterator(UNICODE_STRING("Any-Name", 8), adoptedFilter) { } /** * Destructor. */ UnicodeNameTransliterator::~UnicodeNameTransliterator() {} /** * Copy constructor. */ UnicodeNameTransliterator::UnicodeNameTransliterator(const UnicodeNameTransliterator& o) : Transliterator(o) {} /** * Assignment operator. 
*/ /*UnicodeNameTransliterator& UnicodeNameTransliterator::operator=( const UnicodeNameTransliterator& o) { Transliterator::operator=(o); return *this; }*/ /** * Transliterator API. */ UnicodeNameTransliterator* UnicodeNameTransliterator::clone() const { return new UnicodeNameTransliterator(*this); } /** * Implements {@link Transliterator#handleTransliterate}. * Ignore isIncremental since we don't need the context, and * we work on codepoints. */ void UnicodeNameTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets, UBool /*isIncremental*/) const { // The failure mode, here and below, is to behave like Any-Null, // if either there is no name data (max len == 0) or there is no // memory (malloc() => nullptr). int32_t maxLen = uprv_getMaxCharNameLength(); if (maxLen == 0) { offsets.start = offsets.limit; return; } // Accommodate the longest possible name plus padding char* buf = (char*) uprv_malloc(maxLen); if (buf == nullptr) { offsets.start = offsets.limit; return; } int32_t cursor = offsets.start; int32_t limit = offsets.limit; UnicodeString str(false, OPEN_DELIM, OPEN_DELIM_LEN); UErrorCode status; int32_t len; while (cursor < limit) { UChar32 c = text.char32At(cursor); int32_t clen = U16_LENGTH(c); status = U_ZERO_ERROR; if ((len = u_charName(c, U_EXTENDED_CHAR_NAME, buf, maxLen, &status)) >0 && !U_FAILURE(status)) { str.truncate(OPEN_DELIM_LEN); str.append(UnicodeString(buf, len, US_INV)).append(CLOSE_DELIM); text.handleReplaceBetween(cursor, cursor+clen, str); len += OPEN_DELIM_LEN + 1; // adjust for delimiters cursor += len; // advance cursor and adjust for new text limit += len-clen; // change in length } else { cursor += clen; } } offsets.contextLimit += limit - offsets.limit; offsets.limit = limit; offsets.start = cursor; uprv_free(buf); } U_NAMESPACE_END #endif /* #if !UCONFIG_NO_TRANSLITERATION */ stringi/src/icu74/i18n/standardplural.cpp0000644000176200001440000001007014700200761017751 0ustar liggesusers// © 2016 and later: 
Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ******************************************************************************* * Copyright (C) 2015, International Business Machines Corporation * and others. All Rights Reserved. ******************************************************************************* * standardplural.cpp * * created on: 2015dec14 * created by: Markus W. Scherer */ #include "unicode/utypes.h" #if !UCONFIG_NO_FORMATTING #include "unicode/unistr.h" #include "cstring.h" #include "standardplural.h" #include "uassert.h" U_NAMESPACE_BEGIN static const char *gKeywords[StandardPlural::COUNT] = { "zero", "one", "two", "few", "many", "other", "=0", "=1" }; const char *StandardPlural::getKeyword(Form p) { U_ASSERT(ZERO <= p && p < COUNT); return gKeywords[p]; } int32_t StandardPlural::indexOrNegativeFromString(const char *keyword) { switch (*keyword++) { case 'f': if (uprv_strcmp(keyword, "ew") == 0) { return FEW; } break; case 'm': if (uprv_strcmp(keyword, "any") == 0) { return MANY; } break; case 'o': if (uprv_strcmp(keyword, "ther") == 0) { return OTHER; } else if (uprv_strcmp(keyword, "ne") == 0) { return ONE; } break; case 't': if (uprv_strcmp(keyword, "wo") == 0) { return TWO; } break; case 'z': if (uprv_strcmp(keyword, "ero") == 0) { return ZERO; } break; case '=': if (uprv_strcmp(keyword, "0") == 0) { return EQ_0; } else if (uprv_strcmp(keyword, "1") == 0) { return EQ_1; } break; // Also allow "0" and "1" case '0': if (*keyword == 0) { return EQ_0; } break; case '1': if (*keyword == 0) { return EQ_1; } break; default: break; } return -1; } static const char16_t gZero[] = u"zero"; static const char16_t gOne[] = u"one"; static const char16_t gTwo[] = u"two"; static const char16_t gFew[] = u"few"; static const char16_t gMany[] = u"many"; static const char16_t gOther[] = u"other"; static const char16_t gEq0[] = u"=0"; static const char16_t gEq1[] = u"=1"; int32_t 
StandardPlural::indexOrNegativeFromString(const UnicodeString &keyword) { switch (keyword.length()) { case 1: if (keyword.charAt(0) == '0') { return EQ_0; } else if (keyword.charAt(0) == '1') { return EQ_1; } break; case 2: if (keyword.compare(gEq0, 2) == 0) { return EQ_0; } else if (keyword.compare(gEq1, 2) == 0) { return EQ_1; } break; case 3: if (keyword.compare(gOne, 3) == 0) { return ONE; } else if (keyword.compare(gTwo, 3) == 0) { return TWO; } else if (keyword.compare(gFew, 3) == 0) { return FEW; } break; case 4: if (keyword.compare(gMany, 4) == 0) { return MANY; } else if (keyword.compare(gZero, 4) == 0) { return ZERO; } break; case 5: if (keyword.compare(gOther, 5) == 0) { return OTHER; } break; default: break; } return -1; } int32_t StandardPlural::indexFromString(const char *keyword, UErrorCode &errorCode) { if (U_FAILURE(errorCode)) { return OTHER; } int32_t i = indexOrNegativeFromString(keyword); if (i >= 0) { return i; } else { errorCode = U_ILLEGAL_ARGUMENT_ERROR; return OTHER; } } int32_t StandardPlural::indexFromString(const UnicodeString &keyword, UErrorCode &errorCode) { if (U_FAILURE(errorCode)) { return OTHER; } int32_t i = indexOrNegativeFromString(keyword); if (i >= 0) { return i; } else { errorCode = U_ILLEGAL_ARGUMENT_ERROR; return OTHER; } } U_NAMESPACE_END #endif // !UCONFIG_NO_FORMATTING stringi/src/icu74/i18n/collationrootelements.cpp0000644000176200001440000002661414700200761021371 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ******************************************************************************* * Copyright (C) 2013-2014, International Business Machines * Corporation and others. All Rights Reserved. ******************************************************************************* * collationrootelements.cpp * * created on: 2013mar05 * created by: Markus W. 
/**
 * Returns the last root CE whose primary weight is before p.
 *
 * The result packs the primary into the upper 32 bits and the
 * secondary/tertiary weights (with SEC_TER_DELTA_FLAG cleared) into the lower
 * 32 bits. Returns 0 for p == 0.
 */
int64_t
CollationRootElements::lastCEWithPrimaryBefore(uint32_t p) const {
    if(p == 0) { return 0; }
    U_ASSERT(p > elements[elements[IX_FIRST_PRIMARY_INDEX]]);
    // index of the largest element <= p
    int32_t index = findP(p);
    uint32_t q = elements[index];
    uint32_t secTer;
    if(p == (q & 0xffffff00)) {
        // p == elements[index] is a root primary. Find the CE before it.
        // We must not be in a primary range.
        U_ASSERT((q & PRIMARY_STEP_MASK) == 0);
        secTer = elements[index - 1];
        if((secTer & SEC_TER_DELTA_FLAG) == 0) {
            // Primary CE just before p.
            // The preceding element is itself a primary: strip its low byte
            // and pair it with the common secondary/tertiary weights.
            p = secTer & 0xffffff00;
            secTer = Collation::COMMON_SEC_AND_TER_CE;
        } else {
            // secTer = last secondary & tertiary for the previous primary
            // Walk further back, past the sec/ter delta elements, to the
            // primary element they belong to.
            index -= 2;
            for(;;) {
                p = elements[index];
                if((p & SEC_TER_DELTA_FLAG) == 0) {
                    p &= 0xffffff00;
                    break;
                }
                --index;
            }
        }
    } else {
        // p > elements[index] which is the previous primary.
        // Find the last secondary & tertiary weights for it.
        p = q & 0xffffff00;
        secTer = Collation::COMMON_SEC_AND_TER_CE;
        // Scan forward over the delta elements that follow; the last one seen
        // before the next primary element is kept in secTer.
        for(;;) {
            q = elements[++index];
            if((q & SEC_TER_DELTA_FLAG) == 0) {
                // We must not be in a primary range.
                U_ASSERT((q & PRIMARY_STEP_MASK) == 0);
                break;
            }
            secTer = q;
        }
    }
    // Primary in the high word, sec/ter (flag bit cleared) in the low word.
    return ((int64_t)p << 32) | (secTer & ~SEC_TER_DELTA_FLAG);
}
return ((int64_t)p << 32) | Collation::COMMON_SEC_AND_TER_CE; } uint32_t CollationRootElements::getPrimaryBefore(uint32_t p, UBool isCompressible) const { int32_t index = findPrimary(p); int32_t step; uint32_t q = elements[index]; if(p == (q & 0xffffff00)) { // Found p itself. Return the previous primary. // See if p is at the end of a previous range. step = (int32_t)q & PRIMARY_STEP_MASK; if(step == 0) { // p is not at the end of a range. Look for the previous primary. do { p = elements[--index]; } while((p & SEC_TER_DELTA_FLAG) != 0); return p & 0xffffff00; } } else { // p is in a range, and not at the start. uint32_t nextElement = elements[index + 1]; U_ASSERT(isEndOfPrimaryRange(nextElement)); step = (int32_t)nextElement & PRIMARY_STEP_MASK; } // Return the previous range primary. if((p & 0xffff) == 0) { return Collation::decTwoBytePrimaryByOneStep(p, isCompressible, step); } else { return Collation::decThreeBytePrimaryByOneStep(p, isCompressible, step); } } uint32_t CollationRootElements::getSecondaryBefore(uint32_t p, uint32_t s) const { int32_t index; uint32_t previousSec, sec; if(p == 0) { index = (int32_t)elements[IX_FIRST_SECONDARY_INDEX]; // Gap at the beginning of the secondary CE range. previousSec = 0; sec = elements[index] >> 16; } else { index = findPrimary(p) + 1; previousSec = Collation::BEFORE_WEIGHT16; sec = getFirstSecTerForPrimary(index) >> 16; } U_ASSERT(s >= sec); while(s > sec) { previousSec = sec; U_ASSERT((elements[index] & SEC_TER_DELTA_FLAG) != 0); sec = elements[index++] >> 16; } U_ASSERT(sec == s); return previousSec; } uint32_t CollationRootElements::getTertiaryBefore(uint32_t p, uint32_t s, uint32_t t) const { U_ASSERT((t & ~Collation::ONLY_TERTIARY_MASK) == 0); int32_t index; uint32_t previousTer, secTer; if(p == 0) { if(s == 0) { index = (int32_t)elements[IX_FIRST_TERTIARY_INDEX]; // Gap at the beginning of the tertiary CE range. 
previousTer = 0; } else { index = (int32_t)elements[IX_FIRST_SECONDARY_INDEX]; previousTer = Collation::BEFORE_WEIGHT16; } secTer = elements[index] & ~SEC_TER_DELTA_FLAG; } else { index = findPrimary(p) + 1; previousTer = Collation::BEFORE_WEIGHT16; secTer = getFirstSecTerForPrimary(index); } uint32_t st = (s << 16) | t; while(st > secTer) { if((secTer >> 16) == s) { previousTer = secTer; } U_ASSERT((elements[index] & SEC_TER_DELTA_FLAG) != 0); secTer = elements[index++] & ~SEC_TER_DELTA_FLAG; } U_ASSERT(secTer == st); return previousTer & 0xffff; } uint32_t CollationRootElements::getPrimaryAfter(uint32_t p, int32_t index, UBool isCompressible) const { U_ASSERT(p == (elements[index] & 0xffffff00) || isEndOfPrimaryRange(elements[index + 1])); uint32_t q = elements[++index]; int32_t step; if((q & SEC_TER_DELTA_FLAG) == 0 && (step = (int32_t)q & PRIMARY_STEP_MASK) != 0) { // Return the next primary in this range. if((p & 0xffff) == 0) { return Collation::incTwoBytePrimaryByOffset(p, isCompressible, step); } else { return Collation::incThreeBytePrimaryByOffset(p, isCompressible, step); } } else { // Return the next primary in the list. while((q & SEC_TER_DELTA_FLAG) != 0) { q = elements[++index]; } U_ASSERT((q & PRIMARY_STEP_MASK) == 0); return q; } } uint32_t CollationRootElements::getSecondaryAfter(int32_t index, uint32_t s) const { uint32_t secTer; uint32_t secLimit; if(index == 0) { // primary = 0 U_ASSERT(s != 0); index = (int32_t)elements[IX_FIRST_SECONDARY_INDEX]; secTer = elements[index]; // Gap at the end of the secondary CE range. secLimit = 0x10000; } else { U_ASSERT(index >= (int32_t)elements[IX_FIRST_PRIMARY_INDEX]); secTer = getFirstSecTerForPrimary(index + 1); // If this is an explicit sec/ter unit, then it will be read once more. // Gap for secondaries of primary CEs. 
secLimit = getSecondaryBoundary(); } for(;;) { uint32_t sec = secTer >> 16; if(sec > s) { return sec; } secTer = elements[++index]; if((secTer & SEC_TER_DELTA_FLAG) == 0) { return secLimit; } } } uint32_t CollationRootElements::getTertiaryAfter(int32_t index, uint32_t s, uint32_t t) const { uint32_t secTer; uint32_t terLimit; if(index == 0) { // primary = 0 if(s == 0) { U_ASSERT(t != 0); index = (int32_t)elements[IX_FIRST_TERTIARY_INDEX]; // Gap at the end of the tertiary CE range. terLimit = 0x4000; } else { index = (int32_t)elements[IX_FIRST_SECONDARY_INDEX]; // Gap for tertiaries of primary/secondary CEs. terLimit = getTertiaryBoundary(); } secTer = elements[index] & ~SEC_TER_DELTA_FLAG; } else { U_ASSERT(index >= (int32_t)elements[IX_FIRST_PRIMARY_INDEX]); secTer = getFirstSecTerForPrimary(index + 1); // If this is an explicit sec/ter unit, then it will be read once more. terLimit = getTertiaryBoundary(); } uint32_t st = (s << 16) | t; for(;;) { if(secTer > st) { U_ASSERT((secTer >> 16) == s); return secTer & 0xffff; } secTer = elements[++index]; // No tertiary greater than t for this primary+secondary. if((secTer & SEC_TER_DELTA_FLAG) == 0 || (secTer >> 16) > s) { return terLimit; } secTer &= ~SEC_TER_DELTA_FLAG; } } uint32_t CollationRootElements::getFirstSecTerForPrimary(int32_t index) const { uint32_t secTer = elements[index]; if((secTer & SEC_TER_DELTA_FLAG) == 0) { // No sec/ter delta. return Collation::COMMON_SEC_AND_TER_CE; } secTer &= ~SEC_TER_DELTA_FLAG; if(secTer > Collation::COMMON_SEC_AND_TER_CE) { // Implied sec/ter. return Collation::COMMON_SEC_AND_TER_CE; } // Explicit sec/ter below common/common. return secTer; } int32_t CollationRootElements::findPrimary(uint32_t p) const { // Requirement: p must occur as a root primary. U_ASSERT((p & 0xff) == 0); // at most a 3-byte primary int32_t index = findP(p); // If p is in a range, then we just assume that p is an actual primary in this range. // (Too cumbersome/expensive to check.) 
// Otherwise, it must be an exact match. U_ASSERT(isEndOfPrimaryRange(elements[index + 1]) || p == (elements[index] & 0xffffff00)); return index; } int32_t CollationRootElements::findP(uint32_t p) const { // p need not occur as a root primary. // For example, it might be a reordering group boundary. U_ASSERT((p >> 24) != Collation::UNASSIGNED_IMPLICIT_BYTE); // modified binary search int32_t start = (int32_t)elements[IX_FIRST_PRIMARY_INDEX]; U_ASSERT(p >= elements[start]); int32_t limit = length - 1; U_ASSERT(elements[limit] >= PRIMARY_SENTINEL); U_ASSERT(p < elements[limit]); while((start + 1) < limit) { // Invariant: elements[start] and elements[limit] are primaries, // and elements[start]<=p<=elements[limit]. int32_t i = (start + limit) / 2; uint32_t q = elements[i]; if((q & SEC_TER_DELTA_FLAG) != 0) { // Find the next primary. int32_t j = i + 1; for(;;) { if(j == limit) { break; } q = elements[j]; if((q & SEC_TER_DELTA_FLAG) == 0) { i = j; break; } ++j; } if((q & SEC_TER_DELTA_FLAG) != 0) { // Find the preceding primary. j = i - 1; for(;;) { if(j == start) { break; } q = elements[j]; if((q & SEC_TER_DELTA_FLAG) == 0) { i = j; break; } --j; } if((q & SEC_TER_DELTA_FLAG) != 0) { // No primary between start and limit. break; } } } if(p < (q & 0xffffff00)) { // Reset the "step" bits of a range end primary. limit = i; } else { start = i; } } return start; } U_NAMESPACE_END #endif // !UCONFIG_NO_COLLATION stringi/src/icu74/i18n/rbtz.cpp0000644000176200001440000007732214700200761015727 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ******************************************************************************* * Copyright (C) 2007-2013, International Business Machines Corporation and * others. All Rights Reserved. 
******************************************************************************* */ #include "utypeinfo.h" // for 'typeid' to work #include "unicode/utypes.h" #if !UCONFIG_NO_FORMATTING #include "unicode/rbtz.h" #include "unicode/gregocal.h" #include "uvector.h" #include "gregoimp.h" #include "cmemory.h" #include "umutex.h" U_NAMESPACE_BEGIN /** * A struct representing a time zone transition */ struct Transition : public UMemory { UDate time; TimeZoneRule* from; TimeZoneRule* to; }; U_CDECL_BEGIN static void U_CALLCONV deleteTransition(void* obj) { delete static_cast(obj); } U_CDECL_END static UBool compareRules(UVector* rules1, UVector* rules2) { if (rules1 == nullptr && rules2 == nullptr) { return true; } else if (rules1 == nullptr || rules2 == nullptr) { return false; } int32_t size = rules1->size(); if (size != rules2->size()) { return false; } for (int32_t i = 0; i < size; i++) { TimeZoneRule *r1 = (TimeZoneRule*)rules1->elementAt(i); TimeZoneRule *r2 = (TimeZoneRule*)rules2->elementAt(i); if (*r1 != *r2) { return false; } } return true; } UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedTimeZone) RuleBasedTimeZone::RuleBasedTimeZone(const UnicodeString& id, InitialTimeZoneRule* initialRule) : BasicTimeZone(id), fInitialRule(initialRule), fHistoricRules(nullptr), fFinalRules(nullptr), fHistoricTransitions(nullptr), fUpToDate(false) { } RuleBasedTimeZone::RuleBasedTimeZone(const RuleBasedTimeZone& source) : BasicTimeZone(source), fInitialRule(source.fInitialRule->clone()), fHistoricTransitions(nullptr), fUpToDate(false) { fHistoricRules = copyRules(source.fHistoricRules); fFinalRules = copyRules(source.fFinalRules); if (source.fUpToDate) { UErrorCode status = U_ZERO_ERROR; complete(status); } } RuleBasedTimeZone::~RuleBasedTimeZone() { deleteTransitions(); deleteRules(); } RuleBasedTimeZone& RuleBasedTimeZone::operator=(const RuleBasedTimeZone& right) { if (*this != right) { BasicTimeZone::operator=(right); deleteRules(); fInitialRule = right.fInitialRule->clone(); 
fHistoricRules = copyRules(right.fHistoricRules); fFinalRules = copyRules(right.fFinalRules); deleteTransitions(); fUpToDate = false; } return *this; } bool RuleBasedTimeZone::operator==(const TimeZone& that) const { if (this == &that) { return true; } if (typeid(*this) != typeid(that) || !BasicTimeZone::operator==(that)) { return false; } RuleBasedTimeZone *rbtz = (RuleBasedTimeZone*)&that; if (*fInitialRule != *(rbtz->fInitialRule)) { return false; } if (compareRules(fHistoricRules, rbtz->fHistoricRules) && compareRules(fFinalRules, rbtz->fFinalRules)) { return true; } return false; } bool RuleBasedTimeZone::operator!=(const TimeZone& that) const { return !operator==(that); } void RuleBasedTimeZone::addTransitionRule(TimeZoneRule* rule, UErrorCode& status) { LocalPointerlpRule(rule); if (U_FAILURE(status)) { return; } AnnualTimeZoneRule* atzrule = dynamic_cast(rule); if (atzrule != nullptr && atzrule->getEndYear() == AnnualTimeZoneRule::MAX_YEAR) { // A final rule if (fFinalRules == nullptr) { LocalPointer lpFinalRules(new UVector(uprv_deleteUObject, nullptr, status), status); if (U_FAILURE(status)) { return; } fFinalRules = lpFinalRules.orphan(); } else if (fFinalRules->size() >= 2) { // Cannot handle more than two final rules status = U_INVALID_STATE_ERROR; return; } fFinalRules->adoptElement(lpRule.orphan(), status); } else { // Non-final rule if (fHistoricRules == nullptr) { LocalPointer lpHistoricRules(new UVector(uprv_deleteUObject, nullptr, status), status); if (U_FAILURE(status)) { return; } fHistoricRules = lpHistoricRules.orphan(); } fHistoricRules->adoptElement(lpRule.orphan(), status); } // Mark dirty, so transitions are recalculated at next complete() call fUpToDate = false; } void RuleBasedTimeZone::completeConst(UErrorCode& status) const { static UMutex gLock; if (U_FAILURE(status)) { return; } umtx_lock(&gLock); if (!fUpToDate) { RuleBasedTimeZone *ncThis = const_cast(this); ncThis->complete(status); } umtx_unlock(&gLock); } void 
RuleBasedTimeZone::complete(UErrorCode& status) { if (U_FAILURE(status)) { return; } if (fUpToDate) { return; } // Make sure either no final rules or a pair of AnnualTimeZoneRules // are available. if (fFinalRules != nullptr && fFinalRules->size() != 2) { status = U_INVALID_STATE_ERROR; return; } // Create a TimezoneTransition and add to the list if (fHistoricRules != nullptr || fFinalRules != nullptr) { TimeZoneRule *curRule = fInitialRule; UDate lastTransitionTime = MIN_MILLIS; // Build the transition array which represents historical time zone // transitions. if (fHistoricRules != nullptr && fHistoricRules->size() > 0) { int32_t i; int32_t historicCount = fHistoricRules->size(); LocalMemory done((bool *)uprv_malloc(sizeof(bool) * historicCount)); if (done == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; goto cleanup; } for (i = 0; i < historicCount; i++) { done[i] = false; } while (true) { int32_t curStdOffset = curRule->getRawOffset(); int32_t curDstSavings = curRule->getDSTSavings(); UDate nextTransitionTime = MAX_MILLIS; TimeZoneRule *nextRule = nullptr; TimeZoneRule *r = nullptr; UBool avail; UDate tt; UnicodeString curName, name; curRule->getName(curName); for (i = 0; i < historicCount; i++) { if (done[i]) { continue; } r = (TimeZoneRule*)fHistoricRules->elementAt(i); avail = r->getNextStart(lastTransitionTime, curStdOffset, curDstSavings, false, tt); if (!avail) { // No more transitions from this rule - skip this rule next time done[i] = true; } else { r->getName(name); if (*r == *curRule || (name == curName && r->getRawOffset() == curRule->getRawOffset() && r->getDSTSavings() == curRule->getDSTSavings())) { continue; } if (tt < nextTransitionTime) { nextTransitionTime = tt; nextRule = r; } } } if (nextRule == nullptr) { // Check if all historic rules are done UBool bDoneAll = true; for (int32_t j = 0; j < historicCount; j++) { if (!done[j]) { bDoneAll = false; break; } } if (bDoneAll) { break; } } if (fFinalRules != nullptr) { // Check if one of final 
rules has earlier transition date for (i = 0; i < 2 /* fFinalRules->size() */; i++) { TimeZoneRule *fr = (TimeZoneRule*)fFinalRules->elementAt(i); if (*fr == *curRule) { continue; } r = (TimeZoneRule*)fFinalRules->elementAt(i); avail = r->getNextStart(lastTransitionTime, curStdOffset, curDstSavings, false, tt); if (avail) { if (tt < nextTransitionTime) { nextTransitionTime = tt; nextRule = r; } } } } if (nextRule == nullptr) { // Nothing more break; } if (fHistoricTransitions == nullptr) { LocalPointer lpHistoricTransitions( new UVector(deleteTransition, nullptr, status), status); if (U_FAILURE(status)) { goto cleanup; } fHistoricTransitions = lpHistoricTransitions.orphan(); } LocalPointer trst(new Transition, status); if (U_FAILURE(status)) { goto cleanup; } trst->time = nextTransitionTime; trst->from = curRule; trst->to = nextRule; fHistoricTransitions->adoptElement(trst.orphan(), status); if (U_FAILURE(status)) { goto cleanup; } lastTransitionTime = nextTransitionTime; curRule = nextRule; } } if (fFinalRules != nullptr) { if (fHistoricTransitions == nullptr) { LocalPointer lpHistoricTransitions( new UVector(deleteTransition, nullptr, status), status); if (U_FAILURE(status)) { goto cleanup; } fHistoricTransitions = lpHistoricTransitions.orphan(); } // Append the first transition for each TimeZoneRule *rule0 = (TimeZoneRule*)fFinalRules->elementAt(0); TimeZoneRule *rule1 = (TimeZoneRule*)fFinalRules->elementAt(1); UDate tt0, tt1; UBool avail0 = rule0->getNextStart(lastTransitionTime, curRule->getRawOffset(), curRule->getDSTSavings(), false, tt0); UBool avail1 = rule1->getNextStart(lastTransitionTime, curRule->getRawOffset(), curRule->getDSTSavings(), false, tt1); if (!avail0 || !avail1) { // Should not happen, because both rules are permanent status = U_INVALID_STATE_ERROR; goto cleanup; } LocalPointer final0(new Transition, status); LocalPointer final1(new Transition, status); if (U_FAILURE(status)) { goto cleanup; } if (tt0 < tt1) { final0->time = tt0; 
final0->from = curRule; final0->to = rule0; rule1->getNextStart(tt0, rule0->getRawOffset(), rule0->getDSTSavings(), false, final1->time); final1->from = rule0; final1->to = rule1; } else { final0->time = tt1; final0->from = curRule; final0->to = rule1; rule0->getNextStart(tt1, rule1->getRawOffset(), rule1->getDSTSavings(), false, final1->time); final1->from = rule1; final1->to = rule0; } fHistoricTransitions->adoptElement(final0.orphan(), status); fHistoricTransitions->adoptElement(final1.orphan(), status); if (U_FAILURE(status)) { goto cleanup; } } } fUpToDate = true; return; cleanup: deleteTransitions(); fUpToDate = false; } RuleBasedTimeZone* RuleBasedTimeZone::clone() const { return new RuleBasedTimeZone(*this); } int32_t RuleBasedTimeZone::getOffset(uint8_t era, int32_t year, int32_t month, int32_t day, uint8_t dayOfWeek, int32_t millis, UErrorCode& status) const { if (U_FAILURE(status)) { return 0; } if (month < UCAL_JANUARY || month > UCAL_DECEMBER) { status = U_ILLEGAL_ARGUMENT_ERROR; return 0; } else { return getOffset(era, year, month, day, dayOfWeek, millis, Grego::monthLength(year, month), status); } } int32_t RuleBasedTimeZone::getOffset(uint8_t era, int32_t year, int32_t month, int32_t day, uint8_t /*dayOfWeek*/, int32_t millis, int32_t /*monthLength*/, UErrorCode& status) const { // dayOfWeek and monthLength are unused if (U_FAILURE(status)) { return 0; } if (era == GregorianCalendar::BC) { // Convert to extended year year = 1 - year; } int32_t rawOffset, dstOffset; UDate time = (UDate)Grego::fieldsToDay(year, month, day) * U_MILLIS_PER_DAY + millis; getOffsetInternal(time, true, kDaylight, kStandard, rawOffset, dstOffset, status); if (U_FAILURE(status)) { return 0; } return (rawOffset + dstOffset); } void RuleBasedTimeZone::getOffset(UDate date, UBool local, int32_t& rawOffset, int32_t& dstOffset, UErrorCode& status) const { getOffsetInternal(date, local, kFormer, kLatter, rawOffset, dstOffset, status); } void 
RuleBasedTimeZone::getOffsetFromLocal(UDate date, UTimeZoneLocalOption nonExistingTimeOpt, UTimeZoneLocalOption duplicatedTimeOpt, int32_t& rawOffset, int32_t& dstOffset, UErrorCode& status) const { getOffsetInternal(date, true, nonExistingTimeOpt, duplicatedTimeOpt, rawOffset, dstOffset, status); } /* * The internal getOffset implementation */ void RuleBasedTimeZone::getOffsetInternal(UDate date, UBool local, int32_t NonExistingTimeOpt, int32_t DuplicatedTimeOpt, int32_t& rawOffset, int32_t& dstOffset, UErrorCode& status) const { rawOffset = 0; dstOffset = 0; if (U_FAILURE(status)) { return; } if (!fUpToDate) { // Transitions are not yet resolved. We cannot do it here // because this method is const. Thus, do nothing and return // error status. status = U_INVALID_STATE_ERROR; return; } const TimeZoneRule *rule = nullptr; if (fHistoricTransitions == nullptr) { rule = fInitialRule; } else { UDate tstart = getTransitionTime((Transition*)fHistoricTransitions->elementAt(0), local, NonExistingTimeOpt, DuplicatedTimeOpt); if (date < tstart) { rule = fInitialRule; } else { int32_t idx = fHistoricTransitions->size() - 1; UDate tend = getTransitionTime((Transition*)fHistoricTransitions->elementAt(idx), local, NonExistingTimeOpt, DuplicatedTimeOpt); if (date > tend) { if (fFinalRules != nullptr) { rule = findRuleInFinal(date, local, NonExistingTimeOpt, DuplicatedTimeOpt); } if (rule == nullptr) { // no final rules or the given time is before the first transition // specified by the final rules -> use the last rule rule = ((Transition*)fHistoricTransitions->elementAt(idx))->to; } } else { // Find a historical transition while (idx >= 0) { if (date >= getTransitionTime((Transition*)fHistoricTransitions->elementAt(idx), local, NonExistingTimeOpt, DuplicatedTimeOpt)) { break; } idx--; } rule = ((Transition*)fHistoricTransitions->elementAt(idx))->to; } } } if (rule != nullptr) { rawOffset = rule->getRawOffset(); dstOffset = rule->getDSTSavings(); } } void 
RuleBasedTimeZone::setRawOffset(int32_t /*offsetMillis*/) { // We don't support this operation at this moment. // Nothing to do! } int32_t RuleBasedTimeZone::getRawOffset() const { // Note: This implementation returns standard GMT offset // as of current time. UErrorCode status = U_ZERO_ERROR; int32_t raw, dst; getOffset(uprv_getUTCtime(), false, raw, dst, status); return raw; } UBool RuleBasedTimeZone::useDaylightTime() const { // Note: This implementation returns true when // daylight saving time is used as of now or // after the next transition. UErrorCode status = U_ZERO_ERROR; UDate now = uprv_getUTCtime(); int32_t raw, dst; getOffset(now, false, raw, dst, status); if (dst != 0) { return true; } // If DST is not used now, check if DST is used after the next transition UDate time; TimeZoneRule *from, *to; UBool avail = findNext(now, false, time, from, to); if (avail && to->getDSTSavings() != 0) { return true; } return false; } UBool RuleBasedTimeZone::inDaylightTime(UDate date, UErrorCode& status) const { if (U_FAILURE(status)) { return false; } int32_t raw, dst; getOffset(date, false, raw, dst, status); if (dst != 0) { return true; } return false; } UBool RuleBasedTimeZone::hasSameRules(const TimeZone& other) const { if (this == &other) { return true; } if (typeid(*this) != typeid(other)) { return false; } const RuleBasedTimeZone& that = static_cast(other); if (*fInitialRule != *(that.fInitialRule)) { return false; } if (compareRules(fHistoricRules, that.fHistoricRules) && compareRules(fFinalRules, that.fFinalRules)) { return true; } return false; } UBool RuleBasedTimeZone::getNextTransition(UDate base, UBool inclusive, TimeZoneTransition& result) const { UErrorCode status = U_ZERO_ERROR; completeConst(status); if (U_FAILURE(status)) { return false; } UDate transitionTime; TimeZoneRule *fromRule, *toRule; UBool found = findNext(base, inclusive, transitionTime, fromRule, toRule); if (found) { result.setTime(transitionTime); result.setFrom(*fromRule); 
result.setTo(*toRule); return true; } return false; } UBool RuleBasedTimeZone::getPreviousTransition(UDate base, UBool inclusive, TimeZoneTransition& result) const { UErrorCode status = U_ZERO_ERROR; completeConst(status); if (U_FAILURE(status)) { return false; } UDate transitionTime; TimeZoneRule *fromRule, *toRule; UBool found = findPrev(base, inclusive, transitionTime, fromRule, toRule); if (found) { result.setTime(transitionTime); result.setFrom(*fromRule); result.setTo(*toRule); return true; } return false; } int32_t RuleBasedTimeZone::countTransitionRules(UErrorCode& /*status*/) const { int32_t count = 0; if (fHistoricRules != nullptr) { count += fHistoricRules->size(); } if (fFinalRules != nullptr) { count += fFinalRules->size(); } return count; } void RuleBasedTimeZone::getTimeZoneRules(const InitialTimeZoneRule*& initial, const TimeZoneRule* trsrules[], int32_t& trscount, UErrorCode& status) const { if (U_FAILURE(status)) { return; } // Initial rule initial = fInitialRule; // Transition rules int32_t cnt = 0; int32_t idx; if (fHistoricRules != nullptr && cnt < trscount) { int32_t historicCount = fHistoricRules->size(); idx = 0; while (cnt < trscount && idx < historicCount) { trsrules[cnt++] = (const TimeZoneRule*)fHistoricRules->elementAt(idx++); } } if (fFinalRules != nullptr && cnt < trscount) { int32_t finalCount = fFinalRules->size(); idx = 0; while (cnt < trscount && idx < finalCount) { trsrules[cnt++] = (const TimeZoneRule*)fFinalRules->elementAt(idx++); } } // Set the result length trscount = cnt; } void RuleBasedTimeZone::deleteRules() { delete fInitialRule; fInitialRule = nullptr; if (fHistoricRules != nullptr) { delete fHistoricRules; fHistoricRules = nullptr; } if (fFinalRules != nullptr) { delete fFinalRules; fFinalRules = nullptr; } } void RuleBasedTimeZone::deleteTransitions() { if (fHistoricTransitions != nullptr) { delete fHistoricTransitions; } fHistoricTransitions = nullptr; } UVector* RuleBasedTimeZone::copyRules(UVector* source) { if 
(source == nullptr) { return nullptr; } UErrorCode ec = U_ZERO_ERROR; int32_t size = source->size(); LocalPointer rules(new UVector(uprv_deleteUObject, nullptr, size, ec), ec); if (U_FAILURE(ec)) { return nullptr; } int32_t i; for (i = 0; i < size; i++) { LocalPointer rule(((TimeZoneRule*)source->elementAt(i))->clone(), ec); rules->adoptElement(rule.orphan(), ec); if (U_FAILURE(ec)) { return nullptr; } } return rules.orphan(); } TimeZoneRule* RuleBasedTimeZone::findRuleInFinal(UDate date, UBool local, int32_t NonExistingTimeOpt, int32_t DuplicatedTimeOpt) const { if (fFinalRules == nullptr) { return nullptr; } AnnualTimeZoneRule* fr0 = (AnnualTimeZoneRule*)fFinalRules->elementAt(0); AnnualTimeZoneRule* fr1 = (AnnualTimeZoneRule*)fFinalRules->elementAt(1); if (fr0 == nullptr || fr1 == nullptr) { return nullptr; } UDate start0, start1; UDate base; int32_t localDelta; base = date; if (local) { localDelta = getLocalDelta(fr1->getRawOffset(), fr1->getDSTSavings(), fr0->getRawOffset(), fr0->getDSTSavings(), NonExistingTimeOpt, DuplicatedTimeOpt); base -= localDelta; } UBool avail0 = fr0->getPreviousStart(base, fr1->getRawOffset(), fr1->getDSTSavings(), true, start0); base = date; if (local) { localDelta = getLocalDelta(fr0->getRawOffset(), fr0->getDSTSavings(), fr1->getRawOffset(), fr1->getDSTSavings(), NonExistingTimeOpt, DuplicatedTimeOpt); base -= localDelta; } UBool avail1 = fr1->getPreviousStart(base, fr0->getRawOffset(), fr0->getDSTSavings(), true, start1); if (!avail0 || !avail1) { if (avail0) { return fr0; } else if (avail1) { return fr1; } // Both rules take effect after the given time return nullptr; } return (start0 > start1) ? 
fr0 : fr1; } UBool RuleBasedTimeZone::findNext(UDate base, UBool inclusive, UDate& transitionTime, TimeZoneRule*& fromRule, TimeZoneRule*& toRule) const { if (fHistoricTransitions == nullptr) { return false; } UBool isFinal = false; UBool found = false; Transition result; Transition *tzt = (Transition*)fHistoricTransitions->elementAt(0); UDate tt = tzt->time; if (tt > base || (inclusive && tt == base)) { result = *tzt; found = true; } else { int32_t idx = fHistoricTransitions->size() - 1; tzt = (Transition*)fHistoricTransitions->elementAt(idx); tt = tzt->time; if (inclusive && tt == base) { result = *tzt; found = true; } else if (tt <= base) { if (fFinalRules != nullptr) { // Find a transion time with finalRules TimeZoneRule *r0 = (TimeZoneRule*)fFinalRules->elementAt(0); TimeZoneRule *r1 = (TimeZoneRule*)fFinalRules->elementAt(1); UDate start0, start1; UBool avail0 = r0->getNextStart(base, r1->getRawOffset(), r1->getDSTSavings(), inclusive, start0); UBool avail1 = r1->getNextStart(base, r0->getRawOffset(), r0->getDSTSavings(), inclusive, start1); // avail0/avail1 should be always true if (!avail0 && !avail1) { return false; } if (!avail1 || start0 < start1) { result.time = start0; result.from = r1; result.to = r0; } else { result.time = start1; result.from = r0; result.to = r1; } isFinal = true; found = true; } } else { // Find a transition within the historic transitions idx--; Transition *prev = tzt; while (idx > 0) { tzt = (Transition*)fHistoricTransitions->elementAt(idx); tt = tzt->time; if (tt < base || (!inclusive && tt == base)) { break; } idx--; prev = tzt; } result.time = prev->time; result.from = prev->from; result.to = prev->to; found = true; } } if (found) { // For now, this implementation ignore transitions with only zone name changes. if (result.from->getRawOffset() == result.to->getRawOffset() && result.from->getDSTSavings() == result.to->getDSTSavings()) { if (isFinal) { return false; } else { // No offset changes. 
Try next one if not final return findNext(result.time, false /* always exclusive */, transitionTime, fromRule, toRule); } } transitionTime = result.time; fromRule = result.from; toRule = result.to; return true; } return false; } UBool RuleBasedTimeZone::findPrev(UDate base, UBool inclusive, UDate& transitionTime, TimeZoneRule*& fromRule, TimeZoneRule*& toRule) const { if (fHistoricTransitions == nullptr) { return false; } UBool found = false; Transition result; Transition *tzt = (Transition*)fHistoricTransitions->elementAt(0); UDate tt = tzt->time; if (inclusive && tt == base) { result = *tzt; found = true; } else if (tt < base) { int32_t idx = fHistoricTransitions->size() - 1; tzt = (Transition*)fHistoricTransitions->elementAt(idx); tt = tzt->time; if (inclusive && tt == base) { result = *tzt; found = true; } else if (tt < base) { if (fFinalRules != nullptr) { // Find a transion time with finalRules TimeZoneRule *r0 = (TimeZoneRule*)fFinalRules->elementAt(0); TimeZoneRule *r1 = (TimeZoneRule*)fFinalRules->elementAt(1); UDate start0, start1; UBool avail0 = r0->getPreviousStart(base, r1->getRawOffset(), r1->getDSTSavings(), inclusive, start0); UBool avail1 = r1->getPreviousStart(base, r0->getRawOffset(), r0->getDSTSavings(), inclusive, start1); // avail0/avail1 should be always true if (!avail0 && !avail1) { return false; } if (!avail1 || start0 > start1) { result.time = start0; result.from = r1; result.to = r0; } else { result.time = start1; result.from = r0; result.to = r1; } } else { result = *tzt; } found = true; } else { // Find a transition within the historic transitions idx--; while (idx >= 0) { tzt = (Transition*)fHistoricTransitions->elementAt(idx); tt = tzt->time; if (tt < base || (inclusive && tt == base)) { break; } idx--; } result = *tzt; found = true; } } if (found) { // For now, this implementation ignore transitions with only zone name changes. 
if (result.from->getRawOffset() == result.to->getRawOffset() && result.from->getDSTSavings() == result.to->getDSTSavings()) { // No offset changes. Try next one if not final return findPrev(result.time, false /* always exclusive */, transitionTime, fromRule, toRule); } transitionTime = result.time; fromRule = result.from; toRule = result.to; return true; } return false; } UDate RuleBasedTimeZone::getTransitionTime(Transition* transition, UBool local, int32_t NonExistingTimeOpt, int32_t DuplicatedTimeOpt) const { UDate time = transition->time; if (local) { time += getLocalDelta(transition->from->getRawOffset(), transition->from->getDSTSavings(), transition->to->getRawOffset(), transition->to->getDSTSavings(), NonExistingTimeOpt, DuplicatedTimeOpt); } return time; } int32_t RuleBasedTimeZone::getLocalDelta(int32_t rawBefore, int32_t dstBefore, int32_t rawAfter, int32_t dstAfter, int32_t NonExistingTimeOpt, int32_t DuplicatedTimeOpt) const { int32_t delta = 0; int32_t offsetBefore = rawBefore + dstBefore; int32_t offsetAfter = rawAfter + dstAfter; UBool dstToStd = (dstBefore != 0) && (dstAfter == 0); UBool stdToDst = (dstBefore == 0) && (dstAfter != 0); if (offsetAfter - offsetBefore >= 0) { // Positive transition, which makes a non-existing local time range if (((NonExistingTimeOpt & kStdDstMask) == kStandard && dstToStd) || ((NonExistingTimeOpt & kStdDstMask) == kDaylight && stdToDst)) { delta = offsetBefore; } else if (((NonExistingTimeOpt & kStdDstMask) == kStandard && stdToDst) || ((NonExistingTimeOpt & kStdDstMask) == kDaylight && dstToStd)) { delta = offsetAfter; } else if ((NonExistingTimeOpt & kFormerLatterMask) == kLatter) { delta = offsetBefore; } else { // Interprets the time with rule before the transition, // default for non-existing time range delta = offsetAfter; } } else { // Negative transition, which makes a duplicated local time range if (((DuplicatedTimeOpt & kStdDstMask) == kStandard && dstToStd) || ((DuplicatedTimeOpt & kStdDstMask) == kDaylight 
&& stdToDst)) { delta = offsetAfter; } else if (((DuplicatedTimeOpt & kStdDstMask) == kStandard && stdToDst) || ((DuplicatedTimeOpt & kStdDstMask) == kDaylight && dstToStd)) { delta = offsetBefore; } else if ((DuplicatedTimeOpt & kFormerLatterMask) == kFormer) { delta = offsetBefore; } else { // Interprets the time with rule after the transition, // default for duplicated local time range delta = offsetAfter; } } return delta; } U_NAMESPACE_END #endif /* #if !UCONFIG_NO_FORMATTING */ //eof stringi/src/icu74/i18n/number_asformat.h0000644000176200001440000000636314700200761017574 0ustar liggesusers// © 2017 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html #include "unicode/utypes.h" #if !UCONFIG_NO_FORMATTING #ifndef __NUMBER_ASFORMAT_H__ #define __NUMBER_ASFORMAT_H__ #include "unicode/numberformatter.h" #include "number_types.h" #include "number_decimalquantity.h" #include "number_scientific.h" #include "number_patternstring.h" #include "number_modifiers.h" #include "number_multiplier.h" #include "number_roundingutils.h" #include "decNumber.h" #include "charstr.h" U_NAMESPACE_BEGIN namespace number { namespace impl { /** * A wrapper around LocalizedNumberFormatter implementing the Format interface, enabling improved * compatibility with other APIs. * * @see NumberFormatter */ class U_I18N_API LocalizedNumberFormatterAsFormat : public Format { public: LocalizedNumberFormatterAsFormat(const LocalizedNumberFormatter& formatter, const Locale& locale); /** * Destructor. */ ~LocalizedNumberFormatterAsFormat() override; /** * Equals operator. */ bool operator==(const Format& other) const override; /** * Creates a copy of this object. */ LocalizedNumberFormatterAsFormat* clone() const override; /** * Formats a Number using the wrapped LocalizedNumberFormatter. The provided formattable must be a * number type. 
*/ UnicodeString& format(const Formattable& obj, UnicodeString& appendTo, FieldPosition& pos, UErrorCode& status) const override; /** * Formats a Number using the wrapped LocalizedNumberFormatter. The provided formattable must be a * number type. */ UnicodeString& format(const Formattable& obj, UnicodeString& appendTo, FieldPositionIterator* posIter, UErrorCode& status) const override; /** * Not supported: sets an error index and returns. */ void parseObject(const UnicodeString& source, Formattable& result, ParsePosition& parse_pos) const override; /** * Gets the LocalizedNumberFormatter that this wrapper class uses to format numbers. * * For maximum efficiency, this function returns by const reference. You must copy the return value * into a local variable if you want to use it beyond the lifetime of the current object: * *

     * LocalizedNumberFormatter localFormatter = fmt->getNumberFormatter();
     * 
* * You can however use the return value directly when chaining: * *
     * FormattedNumber result = fmt->getNumberFormatter().formatDouble(514.23, status);
     * 
* * @return The unwrapped LocalizedNumberFormatter. */ const LocalizedNumberFormatter& getNumberFormatter() const; UClassID getDynamicClassID() const override; static UClassID U_EXPORT2 getStaticClassID(); private: LocalizedNumberFormatter fFormatter; // Even though the locale is inside the LocalizedNumberFormatter, we have to keep it here, too, because // LocalizedNumberFormatter doesn't have a getLocale() method, and ICU-TC didn't want to add one. Locale fLocale; }; } // namespace impl } // namespace number U_NAMESPACE_END #endif // __NUMBER_ASFORMAT_H__ #endif /* #if !UCONFIG_NO_FORMATTING */ stringi/src/icu74/i18n/rbnf.cpp0000644000176200001440000017141014700200761015666 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ******************************************************************************* * Copyright (C) 1997-2015, International Business Machines Corporation * and others. All Rights Reserved. 
******************************************************************************* */ #include "unicode/utypes.h" #include "utypeinfo.h" // for 'typeid' to work #include "unicode/rbnf.h" #if U_HAVE_RBNF #include "unicode/normlzr.h" #include "unicode/plurfmt.h" #include "unicode/tblcoll.h" #include "unicode/uchar.h" #include "unicode/ucol.h" #include "unicode/uloc.h" #include "unicode/unum.h" #include "unicode/ures.h" #include "unicode/ustring.h" #include "unicode/utf16.h" #include "unicode/udata.h" #include "unicode/udisplaycontext.h" #include "unicode/brkiter.h" #include "unicode/ucasemap.h" #include "cmemory.h" #include "cstring.h" #include "patternprops.h" #include "uresimp.h" #include "nfrs.h" #include "number_decimalquantity.h" // debugging // #define RBNF_DEBUG #ifdef RBNF_DEBUG #include #endif #define U_ICUDATA_RBNF U_ICUDATA_NAME U_TREE_SEPARATOR_STRING "rbnf" static const char16_t gPercentPercent[] = { 0x25, 0x25, 0 }; /* "%%" */ // All urbnf objects are created through openRules, so we init all of the // Unicode string constants required by rbnf, nfrs, or nfr here. static const char16_t gLenientParse[] = { 0x25, 0x25, 0x6C, 0x65, 0x6E, 0x69, 0x65, 0x6E, 0x74, 0x2D, 0x70, 0x61, 0x72, 0x73, 0x65, 0x3A, 0 }; /* "%%lenient-parse:" */ static const char16_t gSemiColon = 0x003B; static const char16_t gSemiPercent[] = { 0x3B, 0x25, 0 }; /* ";%" */ #define kSomeNumberOfBitsDiv2 22 #define kHalfMaxDouble (double)(1 << kSomeNumberOfBitsDiv2) #define kMaxDouble (kHalfMaxDouble * kHalfMaxDouble) U_NAMESPACE_BEGIN using number::impl::DecimalQuantity; UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedNumberFormat) /* This is a utility class. It does not use ICU's RTTI. If ICU's RTTI is needed again, you can uncomment the RTTI code and derive from UObject. Please make sure that intltest passes on Windows in Release mode, since the string pooling per compilation unit will mess up how RTTI works. The RTTI code was also removed due to lack of code coverage. 
*/ class LocalizationInfo : public UMemory { protected: virtual ~LocalizationInfo(); uint32_t refcount; public: LocalizationInfo() : refcount(0) {} LocalizationInfo* ref() { ++refcount; return this; } LocalizationInfo* unref() { if (refcount && --refcount == 0) { delete this; } return nullptr; } virtual bool operator==(const LocalizationInfo* rhs) const; inline bool operator!=(const LocalizationInfo* rhs) const { return !operator==(rhs); } virtual int32_t getNumberOfRuleSets() const = 0; virtual const char16_t* getRuleSetName(int32_t index) const = 0; virtual int32_t getNumberOfDisplayLocales() const = 0; virtual const char16_t* getLocaleName(int32_t index) const = 0; virtual const char16_t* getDisplayName(int32_t localeIndex, int32_t ruleIndex) const = 0; virtual int32_t indexForLocale(const char16_t* locale) const; virtual int32_t indexForRuleSet(const char16_t* ruleset) const; // virtual UClassID getDynamicClassID() const = 0; // static UClassID getStaticClassID(); }; LocalizationInfo::~LocalizationInfo() {} //UOBJECT_DEFINE_ABSTRACT_RTTI_IMPLEMENTATION(LocalizationInfo) // if both strings are nullptr, this returns true static UBool streq(const char16_t* lhs, const char16_t* rhs) { if (rhs == lhs) { return true; } if (lhs && rhs) { return u_strcmp(lhs, rhs) == 0; } return false; } bool LocalizationInfo::operator==(const LocalizationInfo* rhs) const { if (rhs) { if (this == rhs) { return true; } int32_t rsc = getNumberOfRuleSets(); if (rsc == rhs->getNumberOfRuleSets()) { for (int i = 0; i < rsc; ++i) { if (!streq(getRuleSetName(i), rhs->getRuleSetName(i))) { return false; } } int32_t dlc = getNumberOfDisplayLocales(); if (dlc == rhs->getNumberOfDisplayLocales()) { for (int i = 0; i < dlc; ++i) { const char16_t* locale = getLocaleName(i); int32_t ix = rhs->indexForLocale(locale); // if no locale, ix is -1, getLocaleName returns null, so streq returns false if (!streq(locale, rhs->getLocaleName(ix))) { return false; } for (int j = 0; j < rsc; ++j) { if 
(!streq(getDisplayName(i, j), rhs->getDisplayName(ix, j))) { return false; } } } return true; } } } return false; } int32_t LocalizationInfo::indexForLocale(const char16_t* locale) const { for (int i = 0; i < getNumberOfDisplayLocales(); ++i) { if (streq(locale, getLocaleName(i))) { return i; } } return -1; } int32_t LocalizationInfo::indexForRuleSet(const char16_t* ruleset) const { if (ruleset) { for (int i = 0; i < getNumberOfRuleSets(); ++i) { if (streq(ruleset, getRuleSetName(i))) { return i; } } } return -1; } typedef void (*Fn_Deleter)(void*); class VArray { void** buf; int32_t cap; int32_t size; Fn_Deleter deleter; public: VArray() : buf(nullptr), cap(0), size(0), deleter(nullptr) {} VArray(Fn_Deleter del) : buf(nullptr), cap(0), size(0), deleter(del) {} ~VArray() { if (deleter) { for (int i = 0; i < size; ++i) { (*deleter)(buf[i]); } } uprv_free(buf); } int32_t length() { return size; } void add(void* elem, UErrorCode& status) { if (U_SUCCESS(status)) { if (size == cap) { if (cap == 0) { cap = 1; } else if (cap < 256) { cap *= 2; } else { cap += 256; } if (buf == nullptr) { buf = (void**)uprv_malloc(cap * sizeof(void*)); } else { buf = (void**)uprv_realloc(buf, cap * sizeof(void*)); } if (buf == nullptr) { // if we couldn't realloc, we leak the memory we've already allocated, but we're in deep trouble anyway status = U_MEMORY_ALLOCATION_ERROR; return; } void* start = &buf[size]; size_t count = (cap - size) * sizeof(void*); uprv_memset(start, 0, count); // fill with nulls, just because } buf[size++] = elem; } } void** release() { void** result = buf; buf = nullptr; cap = 0; size = 0; return result; } }; class LocDataParser; class StringLocalizationInfo : public LocalizationInfo { char16_t* info; char16_t*** data; int32_t numRuleSets; int32_t numLocales; friend class LocDataParser; StringLocalizationInfo(char16_t* i, char16_t*** d, int32_t numRS, int32_t numLocs) : info(i), data(d), numRuleSets(numRS), numLocales(numLocs) { } public: static 
StringLocalizationInfo* create(const UnicodeString& info, UParseError& perror, UErrorCode& status); virtual ~StringLocalizationInfo(); virtual int32_t getNumberOfRuleSets() const override { return numRuleSets; } virtual const char16_t* getRuleSetName(int32_t index) const override; virtual int32_t getNumberOfDisplayLocales() const override { return numLocales; } virtual const char16_t* getLocaleName(int32_t index) const override; virtual const char16_t* getDisplayName(int32_t localeIndex, int32_t ruleIndex) const override; // virtual UClassID getDynamicClassID() const; // static UClassID getStaticClassID(); private: void init(UErrorCode& status) const; }; enum { OPEN_ANGLE = 0x003c, /* '<' */ CLOSE_ANGLE = 0x003e, /* '>' */ COMMA = 0x002c, TICK = 0x0027, QUOTE = 0x0022, SPACE = 0x0020 }; /** * Utility for parsing a localization string and returning a StringLocalizationInfo*. */ class LocDataParser { char16_t* data; const char16_t* e; char16_t* p; char16_t ch; UParseError& pe; UErrorCode& ec; public: LocDataParser(UParseError& parseError, UErrorCode& status) : data(nullptr), e(nullptr), p(nullptr), ch(0xffff), pe(parseError), ec(status) {} ~LocDataParser() {} /* * On a successful parse, return a StringLocalizationInfo*, otherwise delete locData, set perror and status, * and return nullptr. The StringLocalizationInfo will adopt locData if it is created. */ StringLocalizationInfo* parse(char16_t* data, int32_t len); private: inline void inc() { ++p; ch = 0xffff; } inline UBool checkInc(char16_t c) { if (p < e && (ch == c || *p == c)) { inc(); return true; } return false; } inline UBool check(char16_t c) { return p < e && (ch == c || *p == c); } inline void skipWhitespace() { while (p < e && PatternProps::isWhiteSpace(ch != 0xffff ? 
ch : *p)) { inc(); } } inline UBool inList(char16_t c, const char16_t* list) const { if (*list == SPACE && PatternProps::isWhiteSpace(c)) { return true; } while (*list && *list != c) { ++list; } return *list == c; } void parseError(const char* msg); StringLocalizationInfo* doParse(); char16_t** nextArray(int32_t& requiredLength); char16_t* nextString(); }; #ifdef RBNF_DEBUG #define ERROR(msg) UPRV_BLOCK_MACRO_BEGIN { \ parseError(msg); \ return nullptr; \ } UPRV_BLOCK_MACRO_END #define EXPLANATION_ARG explanationArg #else #define ERROR(msg) UPRV_BLOCK_MACRO_BEGIN { \ parseError(nullptr); \ return nullptr; \ } UPRV_BLOCK_MACRO_END #define EXPLANATION_ARG #endif static const char16_t DQUOTE_STOPLIST[] = { QUOTE, 0 }; static const char16_t SQUOTE_STOPLIST[] = { TICK, 0 }; static const char16_t NOQUOTE_STOPLIST[] = { SPACE, COMMA, CLOSE_ANGLE, OPEN_ANGLE, TICK, QUOTE, 0 }; static void DeleteFn(void* p) { uprv_free(p); } StringLocalizationInfo* LocDataParser::parse(char16_t* _data, int32_t len) { if (U_FAILURE(ec)) { if (_data) uprv_free(_data); return nullptr; } pe.line = 0; pe.offset = -1; pe.postContext[0] = 0; pe.preContext[0] = 0; if (_data == nullptr) { ec = U_ILLEGAL_ARGUMENT_ERROR; return nullptr; } if (len <= 0) { ec = U_ILLEGAL_ARGUMENT_ERROR; uprv_free(_data); return nullptr; } data = _data; e = data + len; p = _data; ch = 0xffff; return doParse(); } StringLocalizationInfo* LocDataParser::doParse() { skipWhitespace(); if (!checkInc(OPEN_ANGLE)) { ERROR("Missing open angle"); } else { VArray array(DeleteFn); UBool mightHaveNext = true; int32_t requiredLength = -1; while (mightHaveNext) { mightHaveNext = false; char16_t** elem = nextArray(requiredLength); skipWhitespace(); UBool haveComma = check(COMMA); if (elem) { array.add(elem, ec); if (haveComma) { inc(); mightHaveNext = true; } } else if (haveComma) { ERROR("Unexpected character"); } } skipWhitespace(); if (!checkInc(CLOSE_ANGLE)) { if (check(OPEN_ANGLE)) { ERROR("Missing comma in outer array"); } else { 
ERROR("Missing close angle bracket in outer array"); } } skipWhitespace(); if (p != e) { ERROR("Extra text after close of localization data"); } array.add(nullptr, ec); if (U_SUCCESS(ec)) { int32_t numLocs = array.length() - 2; // subtract first, nullptr char16_t*** result = (char16_t***)array.release(); return new StringLocalizationInfo(data, result, requiredLength-2, numLocs); // subtract first, nullptr } } ERROR("Unknown error"); } char16_t** LocDataParser::nextArray(int32_t& requiredLength) { if (U_FAILURE(ec)) { return nullptr; } skipWhitespace(); if (!checkInc(OPEN_ANGLE)) { ERROR("Missing open angle"); } VArray array; UBool mightHaveNext = true; while (mightHaveNext) { mightHaveNext = false; char16_t* elem = nextString(); skipWhitespace(); UBool haveComma = check(COMMA); if (elem) { array.add(elem, ec); if (haveComma) { inc(); mightHaveNext = true; } } else if (haveComma) { ERROR("Unexpected comma"); } } skipWhitespace(); if (!checkInc(CLOSE_ANGLE)) { if (check(OPEN_ANGLE)) { ERROR("Missing close angle bracket in inner array"); } else { ERROR("Missing comma in inner array"); } } array.add(nullptr, ec); if (U_SUCCESS(ec)) { if (requiredLength == -1) { requiredLength = array.length() + 1; } else if (array.length() != requiredLength) { ec = U_ILLEGAL_ARGUMENT_ERROR; ERROR("Array not of required length"); } return (char16_t**)array.release(); } ERROR("Unknown Error"); } char16_t* LocDataParser::nextString() { char16_t* result = nullptr; skipWhitespace(); if (p < e) { const char16_t* terminators; char16_t c = *p; UBool haveQuote = c == QUOTE || c == TICK; if (haveQuote) { inc(); terminators = c == QUOTE ? 
DQUOTE_STOPLIST : SQUOTE_STOPLIST; } else { terminators = NOQUOTE_STOPLIST; } char16_t* start = p; while (p < e && !inList(*p, terminators)) ++p; if (p == e) { ERROR("Unexpected end of data"); } char16_t x = *p; if (p > start) { ch = x; *p = 0x0; // terminate by writing to data result = start; // just point into data } if (haveQuote) { if (x != c) { ERROR("Missing matching quote"); } else if (p == start) { ERROR("Empty string"); } inc(); } else if (x == OPEN_ANGLE || x == TICK || x == QUOTE) { ERROR("Unexpected character in string"); } } // ok for there to be no next string return result; } void LocDataParser::parseError(const char* EXPLANATION_ARG) { if (!data) { return; } const char16_t* start = p - U_PARSE_CONTEXT_LEN - 1; if (start < data) { start = data; } for (char16_t* x = p; --x >= start;) { if (!*x) { start = x+1; break; } } const char16_t* limit = p + U_PARSE_CONTEXT_LEN - 1; if (limit > e) { limit = e; } u_strncpy(pe.preContext, start, (int32_t)(p-start)); pe.preContext[p-start] = 0; u_strncpy(pe.postContext, p, (int32_t)(limit-p)); pe.postContext[limit-p] = 0; pe.offset = (int32_t)(p - data); #ifdef RBNF_DEBUG fprintf(stderr, "%s at or near character %ld: ", EXPLANATION_ARG, p-data); UnicodeString msg; msg.append(start, p - start); msg.append((char16_t)0x002f); /* SOLIDUS/SLASH */ msg.append(p, limit-p); msg.append(UNICODE_STRING_SIMPLE("'")); char buf[128]; int32_t len = msg.extract(0, msg.length(), buf, 128); if (len >= 128) { buf[127] = 0; } else { buf[len] = 0; } fprintf(stderr, "%s\n", buf); fflush(stderr); #endif uprv_free(data); data = nullptr; p = nullptr; e = nullptr; if (U_SUCCESS(ec)) { ec = U_PARSE_ERROR; } } //UOBJECT_DEFINE_RTTI_IMPLEMENTATION(StringLocalizationInfo) StringLocalizationInfo* StringLocalizationInfo::create(const UnicodeString& info, UParseError& perror, UErrorCode& status) { if (U_FAILURE(status)) { return nullptr; } int32_t len = info.length(); if (len == 0) { return nullptr; // no error; } char16_t* p = 
(char16_t*)uprv_malloc(len * sizeof(char16_t)); if (!p) { status = U_MEMORY_ALLOCATION_ERROR; return nullptr; } info.extract(p, len, status); if (!U_FAILURE(status)) { status = U_ZERO_ERROR; // clear warning about non-termination } LocDataParser parser(perror, status); return parser.parse(p, len); } StringLocalizationInfo::~StringLocalizationInfo() { for (char16_t*** p = (char16_t***)data; *p; ++p) { // remaining data is simply pointer into our unicode string data. if (*p) uprv_free(*p); } if (data) uprv_free(data); if (info) uprv_free(info); } const char16_t* StringLocalizationInfo::getRuleSetName(int32_t index) const { if (index >= 0 && index < getNumberOfRuleSets()) { return data[0][index]; } return nullptr; } const char16_t* StringLocalizationInfo::getLocaleName(int32_t index) const { if (index >= 0 && index < getNumberOfDisplayLocales()) { return data[index+1][0]; } return nullptr; } const char16_t* StringLocalizationInfo::getDisplayName(int32_t localeIndex, int32_t ruleIndex) const { if (localeIndex >= 0 && localeIndex < getNumberOfDisplayLocales() && ruleIndex >= 0 && ruleIndex < getNumberOfRuleSets()) { return data[localeIndex+1][ruleIndex+1]; } return nullptr; } // ---------- RuleBasedNumberFormat::RuleBasedNumberFormat(const UnicodeString& description, const UnicodeString& locs, const Locale& alocale, UParseError& perror, UErrorCode& status) : fRuleSets(nullptr) , ruleSetDescriptions(nullptr) , numRuleSets(0) , defaultRuleSet(nullptr) , locale(alocale) , collator(nullptr) , decimalFormatSymbols(nullptr) , defaultInfinityRule(nullptr) , defaultNaNRule(nullptr) , fRoundingMode(DecimalFormat::ERoundingMode::kRoundUnnecessary) , lenient(false) , lenientParseRules(nullptr) , localizations(nullptr) , capitalizationInfoSet(false) , capitalizationForUIListMenu(false) , capitalizationForStandAlone(false) , capitalizationBrkIter(nullptr) { LocalizationInfo* locinfo = StringLocalizationInfo::create(locs, perror, status); init(description, locinfo, perror, status); } 
RuleBasedNumberFormat::RuleBasedNumberFormat(const UnicodeString& description, const UnicodeString& locs, UParseError& perror, UErrorCode& status) : fRuleSets(nullptr) , ruleSetDescriptions(nullptr) , numRuleSets(0) , defaultRuleSet(nullptr) , locale(Locale::getDefault()) , collator(nullptr) , decimalFormatSymbols(nullptr) , defaultInfinityRule(nullptr) , defaultNaNRule(nullptr) , fRoundingMode(DecimalFormat::ERoundingMode::kRoundUnnecessary) , lenient(false) , lenientParseRules(nullptr) , localizations(nullptr) , capitalizationInfoSet(false) , capitalizationForUIListMenu(false) , capitalizationForStandAlone(false) , capitalizationBrkIter(nullptr) { LocalizationInfo* locinfo = StringLocalizationInfo::create(locs, perror, status); init(description, locinfo, perror, status); } RuleBasedNumberFormat::RuleBasedNumberFormat(const UnicodeString& description, LocalizationInfo* info, const Locale& alocale, UParseError& perror, UErrorCode& status) : fRuleSets(nullptr) , ruleSetDescriptions(nullptr) , numRuleSets(0) , defaultRuleSet(nullptr) , locale(alocale) , collator(nullptr) , decimalFormatSymbols(nullptr) , defaultInfinityRule(nullptr) , defaultNaNRule(nullptr) , fRoundingMode(DecimalFormat::ERoundingMode::kRoundUnnecessary) , lenient(false) , lenientParseRules(nullptr) , localizations(nullptr) , capitalizationInfoSet(false) , capitalizationForUIListMenu(false) , capitalizationForStandAlone(false) , capitalizationBrkIter(nullptr) { init(description, info, perror, status); } RuleBasedNumberFormat::RuleBasedNumberFormat(const UnicodeString& description, UParseError& perror, UErrorCode& status) : fRuleSets(nullptr) , ruleSetDescriptions(nullptr) , numRuleSets(0) , defaultRuleSet(nullptr) , locale(Locale::getDefault()) , collator(nullptr) , decimalFormatSymbols(nullptr) , defaultInfinityRule(nullptr) , defaultNaNRule(nullptr) , fRoundingMode(DecimalFormat::ERoundingMode::kRoundUnnecessary) , lenient(false) , lenientParseRules(nullptr) , localizations(nullptr) , 
capitalizationInfoSet(false)
  , capitalizationForUIListMenu(false)
  , capitalizationForStandAlone(false)
  , capitalizationBrkIter(nullptr)
{
    init(description, nullptr, perror, status);
}

/**
 * Builds a formatter from a rule description for the given locale, with no
 * localization info.
 */
RuleBasedNumberFormat::RuleBasedNumberFormat(const UnicodeString& description,
                                             const Locale& aLocale,
                                             UParseError& perror,
                                             UErrorCode& status)
  : fRuleSets(nullptr)
  , ruleSetDescriptions(nullptr)
  , numRuleSets(0)
  , defaultRuleSet(nullptr)
  , locale(aLocale)
  , collator(nullptr)
  , decimalFormatSymbols(nullptr)
  , defaultInfinityRule(nullptr)
  , defaultNaNRule(nullptr)
  , fRoundingMode(DecimalFormat::ERoundingMode::kRoundUnnecessary)
  , lenient(false)
  , lenientParseRules(nullptr)
  , localizations(nullptr)
  , capitalizationInfoSet(false)
  , capitalizationForUIListMenu(false)
  , capitalizationForStandAlone(false)
  , capitalizationBrkIter(nullptr)
{
    init(description, nullptr, perror, status);
}

/**
 * Builds a formatter from the rules stored in the ICU RBNF resource data for
 * `alocale`.
 *
 * @param tag      which predefined rule group to load; any value other than
 *                 the four handled below sets U_ILLEGAL_ARGUMENT_ERROR
 * @param alocale  locale whose rules are loaded
 * @param status   in/out error code; nothing is done if it already indicates failure
 */
RuleBasedNumberFormat::RuleBasedNumberFormat(URBNFRuleSetTag tag, const Locale& alocale, UErrorCode& status)
  : fRuleSets(nullptr)
  , ruleSetDescriptions(nullptr)
  , numRuleSets(0)
  , defaultRuleSet(nullptr)
  , locale(alocale)
  , collator(nullptr)
  , decimalFormatSymbols(nullptr)
  , defaultInfinityRule(nullptr)
  , defaultNaNRule(nullptr)
  , fRoundingMode(DecimalFormat::ERoundingMode::kRoundUnnecessary)
  , lenient(false)
  , lenientParseRules(nullptr)
  , localizations(nullptr)
  , capitalizationInfoSet(false)
  , capitalizationForUIListMenu(false)
  , capitalizationForStandAlone(false)
  , capitalizationBrkIter(nullptr)
{
    if (U_FAILURE(status)) {
        return;
    }

    const char* rules_tag = "RBNFRules";
    const char* fmt_tag = "";
    switch (tag) {
    case URBNF_SPELLOUT: fmt_tag = "SpelloutRules"; break;
    case URBNF_ORDINAL: fmt_tag = "OrdinalRules"; break;
    case URBNF_DURATION: fmt_tag = "DurationRules"; break;
    case URBNF_NUMBERING_SYSTEM: fmt_tag = "NumberingSystemRules"; break;
    default: status = U_ILLEGAL_ARGUMENT_ERROR; return;
    }

    // TODO: read localization info from resource
    LocalizationInfo* locinfo = nullptr;

    UResourceBundle* nfrb = ures_open(U_ICUDATA_RBNF, locale.getName(), &status);
    if (U_SUCCESS(status)) {
        setLocaleIDs(ures_getLocaleByType(nfrb, ULOC_VALID_LOCALE, &status),
                     ures_getLocaleByType(nfrb, ULOC_ACTUAL_LOCALE, &status));

        UResourceBundle* rbnfRules = ures_getByKeyWithFallback(nfrb, rules_tag, nullptr, &status);
        if (U_FAILURE(status)) {
            // Must return here: previously execution fell through, and the
            // failure branch below closed nfrb a second time.
            ures_close(rbnfRules);
            ures_close(nfrb);
            return;
        }

        UResourceBundle* ruleSets = ures_getByKeyWithFallback(rbnfRules, fmt_tag, nullptr, &status);
        if (U_FAILURE(status)) {
            ures_close(rbnfRules);
            ures_close(nfrb);
            return;
        }

        // The rule description is stored as a sequence of strings; concatenate them.
        UnicodeString desc;
        while (ures_hasNext(ruleSets)) {
           desc.append(ures_getNextUnicodeString(ruleSets,nullptr,&status));
        }
        UParseError perror;

        init(desc, locinfo, perror, status);

        ures_close(ruleSets);
        ures_close(rbnfRules);
    }
    ures_close(nfrb);
}

/**
 * Copy constructor: starts from an empty formatter and delegates to
 * operator= to copy rhs.
 */
RuleBasedNumberFormat::RuleBasedNumberFormat(const RuleBasedNumberFormat& rhs)
  : NumberFormat(rhs)
  , fRuleSets(nullptr)
  , ruleSetDescriptions(nullptr)
  , numRuleSets(0)
  , defaultRuleSet(nullptr)
  , locale(rhs.locale)
  , collator(nullptr)
  , decimalFormatSymbols(nullptr)
  , defaultInfinityRule(nullptr)
  , defaultNaNRule(nullptr)
  , fRoundingMode(DecimalFormat::ERoundingMode::kRoundUnnecessary)
  , lenient(false)
  , lenientParseRules(nullptr)
  , localizations(nullptr)
  , capitalizationInfoSet(false)
  , capitalizationForUIListMenu(false)
  , capitalizationForStandAlone(false)
  , capitalizationBrkIter(nullptr)
{
    this->operator=(rhs);
}

// --------

RuleBasedNumberFormat&
RuleBasedNumberFormat::operator=(const RuleBasedNumberFormat& rhs)
{
    if (this == &rhs) {
        return *this;
    }
    NumberFormat::operator=(rhs);
    UErrorCode status = U_ZERO_ERROR;
    dispose();
    locale = rhs.locale;
    lenient = rhs.lenient;

    UParseError perror;
    setDecimalFormatSymbols(*rhs.getDecimalFormatSymbols());
    init(rhs.originalDescription, rhs.localizations ?
rhs.localizations->ref() : nullptr, perror, status); setDefaultRuleSet(rhs.getDefaultRuleSetName(), status); setRoundingMode(rhs.getRoundingMode()); capitalizationInfoSet = rhs.capitalizationInfoSet; capitalizationForUIListMenu = rhs.capitalizationForUIListMenu; capitalizationForStandAlone = rhs.capitalizationForStandAlone; #if !UCONFIG_NO_BREAK_ITERATION capitalizationBrkIter = (rhs.capitalizationBrkIter!=nullptr)? rhs.capitalizationBrkIter->clone(): nullptr; #endif return *this; } RuleBasedNumberFormat::~RuleBasedNumberFormat() { dispose(); } RuleBasedNumberFormat* RuleBasedNumberFormat::clone() const { return new RuleBasedNumberFormat(*this); } bool RuleBasedNumberFormat::operator==(const Format& other) const { if (this == &other) { return true; } if (typeid(*this) == typeid(other)) { const RuleBasedNumberFormat& rhs = static_cast(other); // test for capitalization info equality is adequately handled // by the NumberFormat test for fCapitalizationContext equality; // the info here is just derived from that. if (locale == rhs.locale && lenient == rhs.lenient && (localizations == nullptr ? rhs.localizations == nullptr : (rhs.localizations == nullptr ? 
false : *localizations == rhs.localizations))) { NFRuleSet** p = fRuleSets; NFRuleSet** q = rhs.fRuleSets; if (p == nullptr) { return q == nullptr; } else if (q == nullptr) { return false; } while (*p && *q && (**p == **q)) { ++p; ++q; } return *q == nullptr && *p == nullptr; } } return false; } UnicodeString RuleBasedNumberFormat::getRules() const { UnicodeString result; if (fRuleSets != nullptr) { for (NFRuleSet** p = fRuleSets; *p; ++p) { (*p)->appendRules(result); } } return result; } UnicodeString RuleBasedNumberFormat::getRuleSetName(int32_t index) const { if (localizations) { UnicodeString string(true, localizations->getRuleSetName(index), (int32_t)-1); return string; } else if (fRuleSets) { UnicodeString result; for (NFRuleSet** p = fRuleSets; *p; ++p) { NFRuleSet* rs = *p; if (rs->isPublic()) { if (--index == -1) { rs->getName(result); return result; } } } } UnicodeString empty; return empty; } int32_t RuleBasedNumberFormat::getNumberOfRuleSetNames() const { int32_t result = 0; if (localizations) { result = localizations->getNumberOfRuleSets(); } else if (fRuleSets) { for (NFRuleSet** p = fRuleSets; *p; ++p) { if ((**p).isPublic()) { ++result; } } } return result; } int32_t RuleBasedNumberFormat::getNumberOfRuleSetDisplayNameLocales() const { if (localizations) { return localizations->getNumberOfDisplayLocales(); } return 0; } Locale RuleBasedNumberFormat::getRuleSetDisplayNameLocale(int32_t index, UErrorCode& status) const { if (U_FAILURE(status)) { return Locale(""); } if (localizations && index >= 0 && index < localizations->getNumberOfDisplayLocales()) { UnicodeString name(true, localizations->getLocaleName(index), -1); char buffer[64]; int32_t cap = name.length() + 1; char* bp = buffer; if (cap > 64) { bp = (char *)uprv_malloc(cap); if (bp == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; return Locale(""); } } name.extract(0, name.length(), bp, cap, UnicodeString::kInvariant); Locale retLocale(bp); if (bp != buffer) { uprv_free(bp); } return 
retLocale; } status = U_ILLEGAL_ARGUMENT_ERROR; Locale retLocale; return retLocale; } UnicodeString RuleBasedNumberFormat::getRuleSetDisplayName(int32_t index, const Locale& localeParam) { if (localizations && index >= 0 && index < localizations->getNumberOfRuleSets()) { UnicodeString localeName(localeParam.getBaseName(), -1, UnicodeString::kInvariant); int32_t len = localeName.length(); char16_t* localeStr = localeName.getBuffer(len + 1); while (len >= 0) { localeStr[len] = 0; int32_t ix = localizations->indexForLocale(localeStr); if (ix >= 0) { UnicodeString name(true, localizations->getDisplayName(ix, index), -1); return name; } // trim trailing portion, skipping over omitted sections do { --len;} while (len > 0 && localeStr[len] != 0x005f); // underscore while (len > 0 && localeStr[len-1] == 0x005F) --len; } UnicodeString name(true, localizations->getRuleSetName(index), -1); return name; } UnicodeString bogus; bogus.setToBogus(); return bogus; } UnicodeString RuleBasedNumberFormat::getRuleSetDisplayName(const UnicodeString& ruleSetName, const Locale& localeParam) { if (localizations) { UnicodeString rsn(ruleSetName); int32_t ix = localizations->indexForRuleSet(rsn.getTerminatedBuffer()); return getRuleSetDisplayName(ix, localeParam); } UnicodeString bogus; bogus.setToBogus(); return bogus; } NFRuleSet* RuleBasedNumberFormat::findRuleSet(const UnicodeString& name, UErrorCode& status) const { if (U_SUCCESS(status) && fRuleSets) { for (NFRuleSet** p = fRuleSets; *p; ++p) { NFRuleSet* rs = *p; if (rs->isNamed(name)) { return rs; } } status = U_ILLEGAL_ARGUMENT_ERROR; } return nullptr; } UnicodeString& RuleBasedNumberFormat::format(const DecimalQuantity &number, UnicodeString& appendTo, FieldPosition& pos, UErrorCode &status) const { if (U_FAILURE(status)) { return appendTo; } DecimalQuantity copy(number); if (copy.fitsInLong()) { format(number.toLong(), appendTo, pos, status); } else { copy.roundToMagnitude(0, number::impl::RoundingMode::UNUM_ROUND_HALFEVEN, 
status); if (copy.fitsInLong()) { format(number.toDouble(), appendTo, pos, status); } else { // We're outside of our normal range that this framework can handle. // The DecimalFormat will provide more accurate results. // TODO this section should probably be optimized. The DecimalFormat is shared in ICU4J. LocalPointer decimalFormat(NumberFormat::createInstance(locale, UNUM_DECIMAL, status), status); if (decimalFormat.isNull()) { return appendTo; } Formattable f; LocalPointer decimalQuantity(new DecimalQuantity(number), status); if (decimalQuantity.isNull()) { return appendTo; } f.adoptDecimalQuantity(decimalQuantity.orphan()); // f now owns decimalQuantity. decimalFormat->format(f, appendTo, pos, status); } } return appendTo; } UnicodeString& RuleBasedNumberFormat::format(int32_t number, UnicodeString& toAppendTo, FieldPosition& pos) const { return format((int64_t)number, toAppendTo, pos); } UnicodeString& RuleBasedNumberFormat::format(int64_t number, UnicodeString& toAppendTo, FieldPosition& /* pos */) const { if (defaultRuleSet) { UErrorCode status = U_ZERO_ERROR; format(number, defaultRuleSet, toAppendTo, status); } return toAppendTo; } UnicodeString& RuleBasedNumberFormat::format(double number, UnicodeString& toAppendTo, FieldPosition& /* pos */) const { UErrorCode status = U_ZERO_ERROR; if (defaultRuleSet) { format(number, *defaultRuleSet, toAppendTo, status); } return toAppendTo; } UnicodeString& RuleBasedNumberFormat::format(int32_t number, const UnicodeString& ruleSetName, UnicodeString& toAppendTo, FieldPosition& pos, UErrorCode& status) const { return format((int64_t)number, ruleSetName, toAppendTo, pos, status); } UnicodeString& RuleBasedNumberFormat::format(int64_t number, const UnicodeString& ruleSetName, UnicodeString& toAppendTo, FieldPosition& /* pos */, UErrorCode& status) const { if (U_SUCCESS(status)) { if (ruleSetName.indexOf(gPercentPercent, 2, 0) == 0) { // throw new IllegalArgumentException("Can't use internal rule set"); status = 
U_ILLEGAL_ARGUMENT_ERROR; } else { NFRuleSet *rs = findRuleSet(ruleSetName, status); if (rs) { format(number, rs, toAppendTo, status); } } } return toAppendTo; } UnicodeString& RuleBasedNumberFormat::format(double number, const UnicodeString& ruleSetName, UnicodeString& toAppendTo, FieldPosition& /* pos */, UErrorCode& status) const { if (U_SUCCESS(status)) { if (ruleSetName.indexOf(gPercentPercent, 2, 0) == 0) { // throw new IllegalArgumentException("Can't use internal rule set"); status = U_ILLEGAL_ARGUMENT_ERROR; } else { NFRuleSet *rs = findRuleSet(ruleSetName, status); if (rs) { format(number, *rs, toAppendTo, status); } } } return toAppendTo; } void RuleBasedNumberFormat::format(double number, NFRuleSet& rs, UnicodeString& toAppendTo, UErrorCode& status) const { int32_t startPos = toAppendTo.length(); if (getRoundingMode() != DecimalFormat::ERoundingMode::kRoundUnnecessary && !uprv_isNaN(number) && !uprv_isInfinite(number)) { DecimalQuantity digitList; digitList.setToDouble(number); digitList.roundToMagnitude( -getMaximumFractionDigits(), static_cast(getRoundingMode()), status); number = digitList.toDouble(); } rs.format(number, toAppendTo, toAppendTo.length(), 0, status); adjustForCapitalizationContext(startPos, toAppendTo, status); } /** * Bottleneck through which all the public format() methods * that take a long pass. By the time we get here, we know * which rule set we're using to do the formatting. * @param number The number to format * @param ruleSet The rule set to use to format the number * @return The text that resulted from formatting the number */ UnicodeString& RuleBasedNumberFormat::format(int64_t number, NFRuleSet *ruleSet, UnicodeString& toAppendTo, UErrorCode& status) const { // all API format() routines that take a double vector through // here. 
We have these two identical functions-- one taking a // double and one taking a long-- the couple digits of precision // that long has but double doesn't (both types are 8 bytes long, // but double has to borrow some of the mantissa bits to hold // the exponent). // Create an empty string buffer where the result will // be built, and pass it to the rule set (along with an insertion // position of 0 and the number being formatted) to the rule set // for formatting if (U_SUCCESS(status)) { if (number == U_INT64_MIN) { // We can't handle this value right now. Provide an accurate default value. // TODO this section should probably be optimized. The DecimalFormat is shared in ICU4J. NumberFormat *decimalFormat = NumberFormat::createInstance(locale, UNUM_DECIMAL, status); if (decimalFormat == nullptr) { return toAppendTo; } Formattable f; FieldPosition pos(FieldPosition::DONT_CARE); DecimalQuantity *decimalQuantity = new DecimalQuantity(); if (decimalQuantity == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; delete decimalFormat; return toAppendTo; } decimalQuantity->setToLong(number); f.adoptDecimalQuantity(decimalQuantity); // f now owns decimalQuantity. 
decimalFormat->format(f, toAppendTo, pos, status); delete decimalFormat; } else { int32_t startPos = toAppendTo.length(); ruleSet->format(number, toAppendTo, toAppendTo.length(), 0, status); adjustForCapitalizationContext(startPos, toAppendTo, status); } } return toAppendTo; } UnicodeString& RuleBasedNumberFormat::adjustForCapitalizationContext(int32_t startPos, UnicodeString& currentResult, UErrorCode& status) const { #if !UCONFIG_NO_BREAK_ITERATION UDisplayContext capitalizationContext = getContext(UDISPCTX_TYPE_CAPITALIZATION, status); if (capitalizationContext != UDISPCTX_CAPITALIZATION_NONE && startPos == 0 && currentResult.length() > 0) { // capitalize currentResult according to context UChar32 ch = currentResult.char32At(0); if (u_islower(ch) && U_SUCCESS(status) && capitalizationBrkIter != nullptr && ( capitalizationContext == UDISPCTX_CAPITALIZATION_FOR_BEGINNING_OF_SENTENCE || (capitalizationContext == UDISPCTX_CAPITALIZATION_FOR_UI_LIST_OR_MENU && capitalizationForUIListMenu) || (capitalizationContext == UDISPCTX_CAPITALIZATION_FOR_STANDALONE && capitalizationForStandAlone)) ) { // titlecase first word of currentResult, here use sentence iterator unlike current implementations // in LocaleDisplayNamesImpl::adjustForUsageAndContext and RelativeDateFormat::format currentResult.toTitle(capitalizationBrkIter, locale, U_TITLECASE_NO_LOWERCASE | U_TITLECASE_NO_BREAK_ADJUSTMENT); } } #endif return currentResult; } void RuleBasedNumberFormat::parse(const UnicodeString& text, Formattable& result, ParsePosition& parsePosition) const { if (!fRuleSets) { parsePosition.setErrorIndex(0); return; } UnicodeString workingText(text, parsePosition.getIndex()); ParsePosition workingPos(0); ParsePosition high_pp(0); Formattable high_result; for (NFRuleSet** p = fRuleSets; *p; ++p) { NFRuleSet *rp = *p; if (rp->isPublic() && rp->isParseable()) { ParsePosition working_pp(0); Formattable working_result; rp->parse(workingText, working_pp, kMaxDouble, 0, working_result); if 
(working_pp.getIndex() > high_pp.getIndex()) { high_pp = working_pp; high_result = working_result; if (high_pp.getIndex() == workingText.length()) { break; } } } } int32_t startIndex = parsePosition.getIndex(); parsePosition.setIndex(startIndex + high_pp.getIndex()); if (high_pp.getIndex() > 0) { parsePosition.setErrorIndex(-1); } else { int32_t errorIndex = (high_pp.getErrorIndex()>0)? high_pp.getErrorIndex(): 0; parsePosition.setErrorIndex(startIndex + errorIndex); } result = high_result; if (result.getType() == Formattable::kDouble) { double d = result.getDouble(); if (!uprv_isNaN(d) && d == uprv_trunc(d) && INT32_MIN <= d && d <= INT32_MAX) { // Note: casting a double to an int when the double is too large or small // to fit the destination is undefined behavior. The explicit range checks, // above, are required. Just casting and checking the result value is undefined. result.setLong(static_cast(d)); } } } #if !UCONFIG_NO_COLLATION void RuleBasedNumberFormat::setLenient(UBool enabled) { lenient = enabled; if (!enabled && collator) { delete collator; collator = nullptr; } } #endif void RuleBasedNumberFormat::setDefaultRuleSet(const UnicodeString& ruleSetName, UErrorCode& status) { if (U_SUCCESS(status)) { if (ruleSetName.isEmpty()) { if (localizations) { UnicodeString name(true, localizations->getRuleSetName(0), -1); defaultRuleSet = findRuleSet(name, status); } else { initDefaultRuleSet(); } } else if (ruleSetName.startsWith(UNICODE_STRING_SIMPLE("%%"))) { status = U_ILLEGAL_ARGUMENT_ERROR; } else { NFRuleSet* result = findRuleSet(ruleSetName, status); if (result != nullptr) { defaultRuleSet = result; } } } } UnicodeString RuleBasedNumberFormat::getDefaultRuleSetName() const { UnicodeString result; if (defaultRuleSet && defaultRuleSet->isPublic()) { defaultRuleSet->getName(result); } else { result.setToBogus(); } return result; } void RuleBasedNumberFormat::initDefaultRuleSet() { defaultRuleSet = nullptr; if (!fRuleSets) { return; } const UnicodeString 
spellout(UNICODE_STRING_SIMPLE("%spellout-numbering")); const UnicodeString ordinal(UNICODE_STRING_SIMPLE("%digits-ordinal")); const UnicodeString duration(UNICODE_STRING_SIMPLE("%duration")); NFRuleSet**p = &fRuleSets[0]; while (*p) { if ((*p)->isNamed(spellout) || (*p)->isNamed(ordinal) || (*p)->isNamed(duration)) { defaultRuleSet = *p; return; } else { ++p; } } defaultRuleSet = *--p; if (!defaultRuleSet->isPublic()) { while (p != fRuleSets) { if ((*--p)->isPublic()) { defaultRuleSet = *p; break; } } } } void RuleBasedNumberFormat::init(const UnicodeString& rules, LocalizationInfo* localizationInfos, UParseError& pErr, UErrorCode& status) { // TODO: implement UParseError uprv_memset(&pErr, 0, sizeof(UParseError)); // Note: this can leave ruleSets == nullptr, so remaining code should check if (U_FAILURE(status)) { return; } initializeDecimalFormatSymbols(status); initializeDefaultInfinityRule(status); initializeDefaultNaNRule(status); if (U_FAILURE(status)) { return; } this->localizations = localizationInfos == nullptr ? nullptr : localizationInfos->ref(); UnicodeString description(rules); if (!description.length()) { status = U_MEMORY_ALLOCATION_ERROR; return; } // start by stripping the trailing whitespace from all the rules // (this is all the whitespace following each semicolon in the // description). This allows us to look for rule-set boundaries // by searching for ";%" without having to worry about whitespace // between the ; and the % stripWhitespace(description); // check to see if there's a set of lenient-parse rules. 
If there // is, pull them out into our temporary holding place for them, // and delete them from the description before the real desciption- // parsing code sees them int32_t lp = description.indexOf(gLenientParse, -1, 0); if (lp != -1) { // we've got to make sure we're not in the middle of a rule // (where "%%lenient-parse" would actually get treated as // rule text) if (lp == 0 || description.charAt(lp - 1) == gSemiColon) { // locate the beginning and end of the actual collation // rules (there may be whitespace between the name and // the first token in the description) int lpEnd = description.indexOf(gSemiPercent, 2, lp); if (lpEnd == -1) { lpEnd = description.length() - 1; } int lpStart = lp + u_strlen(gLenientParse); while (PatternProps::isWhiteSpace(description.charAt(lpStart))) { ++lpStart; } // copy out the lenient-parse rules and delete them // from the description lenientParseRules = new UnicodeString(); /* test for nullptr */ if (lenientParseRules == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; return; } lenientParseRules->setTo(description, lpStart, lpEnd - lpStart); description.remove(lp, lpEnd + 1 - lp); } } // pre-flight parsing the description and count the number of // rule sets (";%" marks the end of one rule set and the beginning // of the next) numRuleSets = 0; for (int32_t p = description.indexOf(gSemiPercent, 2, 0); p != -1; p = description.indexOf(gSemiPercent, 2, p)) { ++numRuleSets; ++p; } ++numRuleSets; // our rule list is an array of the appropriate size fRuleSets = (NFRuleSet **)uprv_malloc((numRuleSets + 1) * sizeof(NFRuleSet *)); /* test for nullptr */ if (fRuleSets == 0) { status = U_MEMORY_ALLOCATION_ERROR; return; } for (int i = 0; i <= numRuleSets; ++i) { fRuleSets[i] = nullptr; } // divide up the descriptions into individual rule-set descriptions // and store them in a temporary array. At each step, we also // new up a rule set, but all this does is initialize its name // and remove it from its description. 
We can't actually parse // the rest of the descriptions and finish initializing everything // because we have to know the names and locations of all the rule // sets before we can actually set everything up if(!numRuleSets) { status = U_ILLEGAL_ARGUMENT_ERROR; return; } ruleSetDescriptions = new UnicodeString[numRuleSets]; if (ruleSetDescriptions == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; return; } { int curRuleSet = 0; int32_t start = 0; for (int32_t p = description.indexOf(gSemiPercent, 2, 0); p != -1; p = description.indexOf(gSemiPercent, 2, start)) { ruleSetDescriptions[curRuleSet].setTo(description, start, p + 1 - start); fRuleSets[curRuleSet] = new NFRuleSet(this, ruleSetDescriptions, curRuleSet, status); if (fRuleSets[curRuleSet] == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; return; } ++curRuleSet; start = p + 1; } ruleSetDescriptions[curRuleSet].setTo(description, start, description.length() - start); fRuleSets[curRuleSet] = new NFRuleSet(this, ruleSetDescriptions, curRuleSet, status); if (fRuleSets[curRuleSet] == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; return; } } // now we can take note of the formatter's default rule set, which // is the last public rule set in the description (it's the last // rather than the first so that a user can create a new formatter // from an existing formatter and change its default behavior just // by appending more rule sets to the end) // {dlf} Initialization of a fraction rule set requires the default rule // set to be known. For purposes of initialization, this is always the // last public rule set, no matter what the localization data says. 
initDefaultRuleSet(); // finally, we can go back through the temporary descriptions // list and finish setting up the substructure (and we throw // away the temporary descriptions as we go) { for (int i = 0; i < numRuleSets; i++) { fRuleSets[i]->parseRules(ruleSetDescriptions[i], status); } } // Now that the rules are initialized, the 'real' default rule // set can be adjusted by the localization data. // The C code keeps the localization array as is, rather than building // a separate array of the public rule set names, so we have less work // to do here-- but we still need to check the names. if (localizationInfos) { // confirm the names, if any aren't in the rules, that's an error // it is ok if the rules contain public rule sets that are not in this list for (int32_t i = 0; i < localizationInfos->getNumberOfRuleSets(); ++i) { UnicodeString name(true, localizationInfos->getRuleSetName(i), -1); NFRuleSet* rs = findRuleSet(name, status); if (rs == nullptr) { break; // error } if (i == 0) { defaultRuleSet = rs; } } } else { defaultRuleSet = getDefaultRuleSet(); } originalDescription = rules; } // override the NumberFormat implementation in order to // lazily initialize relevant items void RuleBasedNumberFormat::setContext(UDisplayContext value, UErrorCode& status) { NumberFormat::setContext(value, status); if (U_SUCCESS(status)) { if (!capitalizationInfoSet && (value==UDISPCTX_CAPITALIZATION_FOR_UI_LIST_OR_MENU || value==UDISPCTX_CAPITALIZATION_FOR_STANDALONE)) { initCapitalizationContextInfo(locale); capitalizationInfoSet = true; } #if !UCONFIG_NO_BREAK_ITERATION if ( capitalizationBrkIter == nullptr && (value==UDISPCTX_CAPITALIZATION_FOR_BEGINNING_OF_SENTENCE || (value==UDISPCTX_CAPITALIZATION_FOR_UI_LIST_OR_MENU && capitalizationForUIListMenu) || (value==UDISPCTX_CAPITALIZATION_FOR_STANDALONE && capitalizationForStandAlone)) ) { status = U_ZERO_ERROR; capitalizationBrkIter = BreakIterator::createSentenceInstance(locale, status); if (U_FAILURE(status)) { delete 
capitalizationBrkIter; capitalizationBrkIter = nullptr; } } #endif } } void RuleBasedNumberFormat::initCapitalizationContextInfo(const Locale& thelocale) { #if !UCONFIG_NO_BREAK_ITERATION const char * localeID = (thelocale != nullptr)? thelocale.getBaseName(): nullptr; UErrorCode status = U_ZERO_ERROR; UResourceBundle *rb = ures_open(nullptr, localeID, &status); rb = ures_getByKeyWithFallback(rb, "contextTransforms", rb, &status); rb = ures_getByKeyWithFallback(rb, "number-spellout", rb, &status); if (U_SUCCESS(status) && rb != nullptr) { int32_t len = 0; const int32_t * intVector = ures_getIntVector(rb, &len, &status); if (U_SUCCESS(status) && intVector != nullptr && len >= 2) { capitalizationForUIListMenu = static_cast(intVector[0]); capitalizationForStandAlone = static_cast(intVector[1]); } } ures_close(rb); #endif } void RuleBasedNumberFormat::stripWhitespace(UnicodeString& description) { // iterate through the characters... UnicodeString result; int start = 0; while (start != -1 && start < description.length()) { // seek to the first non-whitespace character... 
while (start < description.length() && PatternProps::isWhiteSpace(description.charAt(start))) { ++start; } // locate the next semicolon in the text and copy the text from // our current position up to that semicolon into the result int32_t p = description.indexOf(gSemiColon, start); if (p == -1) { // or if we don't find a semicolon, just copy the rest of // the string into the result result.append(description, start, description.length() - start); start = -1; } else if (p < description.length()) { result.append(description, start, p + 1 - start); start = p + 1; } // when we get here, we've seeked off the end of the string, and // we terminate the loop (we continue until *start* is -1 rather // than until *p* is -1, because otherwise we'd miss the last // rule in the description) else { start = -1; } } description.setTo(result); } void RuleBasedNumberFormat::dispose() { if (fRuleSets) { for (NFRuleSet** p = fRuleSets; *p; ++p) { delete *p; } uprv_free(fRuleSets); fRuleSets = nullptr; } if (ruleSetDescriptions) { delete [] ruleSetDescriptions; ruleSetDescriptions = nullptr; } #if !UCONFIG_NO_COLLATION delete collator; #endif collator = nullptr; delete decimalFormatSymbols; decimalFormatSymbols = nullptr; delete defaultInfinityRule; defaultInfinityRule = nullptr; delete defaultNaNRule; defaultNaNRule = nullptr; delete lenientParseRules; lenientParseRules = nullptr; #if !UCONFIG_NO_BREAK_ITERATION delete capitalizationBrkIter; capitalizationBrkIter = nullptr; #endif if (localizations) { localizations = localizations->unref(); } } //----------------------------------------------------------------------- // package-internal API //----------------------------------------------------------------------- /** * Returns the collator to use for lenient parsing. The collator is lazily created: * this function creates it the first time it's called. * @return The collator to use for lenient parsing, or null if lenient parsing * is turned off. 
*/ const RuleBasedCollator* RuleBasedNumberFormat::getCollator() const { #if !UCONFIG_NO_COLLATION if (!fRuleSets) { return nullptr; } // lazy-evaluate the collator if (collator == nullptr && lenient) { // create a default collator based on the formatter's locale, // then pull out that collator's rules, append any additional // rules specified in the description, and create a _new_ // collator based on the combination of those rules UErrorCode status = U_ZERO_ERROR; Collator* temp = Collator::createInstance(locale, status); RuleBasedCollator* newCollator; if (U_SUCCESS(status) && (newCollator = dynamic_cast(temp)) != nullptr) { if (lenientParseRules) { UnicodeString rules(newCollator->getRules()); rules.append(*lenientParseRules); newCollator = new RuleBasedCollator(rules, status); // Exit if newCollator could not be created. if (newCollator == nullptr) { return nullptr; } } else { temp = nullptr; } if (U_SUCCESS(status)) { newCollator->setAttribute(UCOL_DECOMPOSITION_MODE, UCOL_ON, status); // cast away const ((RuleBasedNumberFormat*)this)->collator = newCollator; } else { delete newCollator; } } delete temp; } #endif // if lenient-parse mode is off, this will be null // (see setLenientParseMode()) return collator; } DecimalFormatSymbols* RuleBasedNumberFormat::initializeDecimalFormatSymbols(UErrorCode &status) { // lazy-evaluate the DecimalFormatSymbols object. This object // is shared by all DecimalFormat instances belonging to this // formatter if (decimalFormatSymbols == nullptr) { LocalPointer temp(new DecimalFormatSymbols(locale, status), status); if (U_SUCCESS(status)) { decimalFormatSymbols = temp.orphan(); } } return decimalFormatSymbols; } /** * Returns the DecimalFormatSymbols object that should be used by all DecimalFormat * instances owned by this formatter. 
*/
const DecimalFormatSymbols*
RuleBasedNumberFormat::getDecimalFormatSymbols() const
{
    return decimalFormatSymbols;
}

/**
 * Creates the default rule for infinity ("Inf: " followed by the infinity
 * symbol of the current DecimalFormatSymbols) if it does not exist yet.
 * @param status ICU in/out error code; nothing is done if it already failed
 * @return the rule, or nullptr on failure
 */
NFRule*
RuleBasedNumberFormat::initializeDefaultInfinityRule(UErrorCode &status)
{
    if (U_FAILURE(status)) {
        return nullptr;
    }
    if (defaultInfinityRule == nullptr) {
        UnicodeString rule(UNICODE_STRING_SIMPLE("Inf: "));
        rule.append(getDecimalFormatSymbols()->getSymbol(DecimalFormatSymbols::kInfinitySymbol));
        // temp deletes the rule again unless construction fully succeeded
        LocalPointer<NFRule> temp(new NFRule(this, rule, status), status);
        if (U_SUCCESS(status)) {
            defaultInfinityRule = temp.orphan();
        }
    }
    return defaultInfinityRule;
}

const NFRule*
RuleBasedNumberFormat::getDefaultInfinityRule() const
{
    return defaultInfinityRule;
}

/**
 * Creates the default rule for NaN ("NaN: " followed by the NaN symbol of the
 * current DecimalFormatSymbols) if it does not exist yet.
 * @param status ICU in/out error code; nothing is done if it already failed
 * @return the rule, or nullptr on failure
 */
NFRule*
RuleBasedNumberFormat::initializeDefaultNaNRule(UErrorCode &status)
{
    if (U_FAILURE(status)) {
        return nullptr;
    }
    if (defaultNaNRule == nullptr) {
        UnicodeString rule(UNICODE_STRING_SIMPLE("NaN: "));
        rule.append(getDecimalFormatSymbols()->getSymbol(DecimalFormatSymbols::kNaNSymbol));
        // temp deletes the rule again unless construction fully succeeded
        LocalPointer<NFRule> temp(new NFRule(this, rule, status), status);
        if (U_SUCCESS(status)) {
            defaultNaNRule = temp.orphan();
        }
    }
    return defaultNaNRule;
}

const NFRule*
RuleBasedNumberFormat::getDefaultNaNRule() const
{
    return defaultNaNRule;
}

// De-owning the current localized symbols and adopt the new symbols.
void RuleBasedNumberFormat::adoptDecimalFormatSymbols(DecimalFormatSymbols* symbolsToAdopt) { if (symbolsToAdopt == nullptr) { return; // do not allow caller to set decimalFormatSymbols to nullptr } if (decimalFormatSymbols != nullptr) { delete decimalFormatSymbols; } decimalFormatSymbols = symbolsToAdopt; { // Apply the new decimalFormatSymbols by reparsing the rulesets UErrorCode status = U_ZERO_ERROR; delete defaultInfinityRule; defaultInfinityRule = nullptr; initializeDefaultInfinityRule(status); // Reset with the new DecimalFormatSymbols delete defaultNaNRule; defaultNaNRule = nullptr; initializeDefaultNaNRule(status); // Reset with the new DecimalFormatSymbols if (fRuleSets) { for (int32_t i = 0; i < numRuleSets; i++) { fRuleSets[i]->setDecimalFormatSymbols(*symbolsToAdopt, status); } } } } // Setting the symbols is equivalent to adopting a newly created localized symbols. void RuleBasedNumberFormat::setDecimalFormatSymbols(const DecimalFormatSymbols& symbols) { adoptDecimalFormatSymbols(new DecimalFormatSymbols(symbols)); } PluralFormat * RuleBasedNumberFormat::createPluralFormat(UPluralType pluralType, const UnicodeString &pattern, UErrorCode& status) const { auto *pf = new PluralFormat(locale, pluralType, pattern, status); if (pf == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; } return pf; } /** * Get the rounding mode. * @return A rounding mode */ DecimalFormat::ERoundingMode RuleBasedNumberFormat::getRoundingMode() const { return fRoundingMode; } /** * Set the rounding mode. This has no effect unless the rounding * increment is greater than zero. * @param roundingMode A rounding mode */ void RuleBasedNumberFormat::setRoundingMode(DecimalFormat::ERoundingMode roundingMode) { fRoundingMode = roundingMode; } U_NAMESPACE_END /* U_HAVE_RBNF */ #endif stringi/src/icu74/i18n/collationcompare.h0000644000176200001440000000221514700200761017733 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others. 
// License & terms of use: http://www.unicode.org/copyright.html /* ******************************************************************************* * Copyright (C) 1996-2014, International Business Machines * Corporation and others. All Rights Reserved. ******************************************************************************* * collationcompare.h * * created on: 2012feb14 with new and old collation code * created by: Markus W. Scherer */ #ifndef __COLLATIONCOMPARE_H__ #define __COLLATIONCOMPARE_H__ #include "unicode/utypes.h" #if !UCONFIG_NO_COLLATION #include "unicode/ucol.h" U_NAMESPACE_BEGIN class CollationIterator; struct CollationSettings; class U_I18N_API CollationCompare /* not : public UObject because all methods are static */ { public: static UCollationResult compareUpToQuaternary(CollationIterator &left, CollationIterator &right, const CollationSettings &settings, UErrorCode &errorCode); }; U_NAMESPACE_END #endif // !UCONFIG_NO_COLLATION #endif // __COLLATIONCOMPARE_H__ stringi/src/icu74/i18n/msgfmt_impl.h0000644000176200001440000000245614700200761016725 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ******************************************************************************* * Copyright (C) 2007-2008, International Business Machines Corporation and * others. All Rights Reserved. 
* ******************************************************************************* * * File MSGFMT.H * ******************************************************************************* */ #ifndef __MSGFMT_IMPL_H__ #define __MSGFMT_IMPL_H__ #include "unicode/utypes.h" #if !UCONFIG_NO_FORMATTING #include "unicode/msgfmt.h" #include "uvector.h" #include "unicode/strenum.h" U_NAMESPACE_BEGIN class FormatNameEnumeration : public StringEnumeration { public: FormatNameEnumeration(LocalPointer fFormatNames, UErrorCode& status); virtual ~FormatNameEnumeration(); static UClassID U_EXPORT2 getStaticClassID(); virtual UClassID getDynamicClassID() const override; virtual const UnicodeString* snext(UErrorCode& status) override; virtual void reset(UErrorCode& status) override; virtual int32_t count(UErrorCode& status) const override; private: int32_t pos; LocalPointer fFormatNames; }; U_NAMESPACE_END #endif #endif stringi/src/icu74/i18n/gregoimp.h0000644000176200001440000002565514700200761016226 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ********************************************************************** * Copyright (c) 2003-2008, International Business Machines * Corporation and others. All Rights Reserved. ********************************************************************** * Author: Alan Liu * Created: September 2 2003 * Since: ICU 2.8 ********************************************************************** */ #ifndef GREGOIMP_H #define GREGOIMP_H #include "unicode/utypes.h" #if !UCONFIG_NO_FORMATTING #include "unicode/ures.h" #include "unicode/locid.h" #include "putilimp.h" U_NAMESPACE_BEGIN /** * A utility class providing mathematical functions used by time zone * and calendar code. Do not instantiate. Formerly just named 'Math'. * @internal */ class ClockMath { public: /** * Divide two integers, returning the floor of the quotient. 
* Unlike the built-in division, this is mathematically * well-behaved. E.g., -1/4 => 0 but * floorDivide(-1,4) => -1. * @param numerator the numerator * @param denominator a divisor which must be != 0 * @return the floor of the quotient */ static int32_t floorDivide(int32_t numerator, int32_t denominator); /** * Divide two integers, returning the floor of the quotient. * Unlike the built-in division, this is mathematically * well-behaved. E.g., -1/4 => 0 but * floorDivide(-1,4) => -1. * @param numerator the numerator * @param denominator a divisor which must be != 0 * @return the floor of the quotient */ static int64_t floorDivide(int64_t numerator, int64_t denominator); /** * Divide two numbers, returning the floor of the quotient. * Unlike the built-in division, this is mathematically * well-behaved. E.g., -1/4 => 0 but * floorDivide(-1,4) => -1. * @param numerator the numerator * @param denominator a divisor which must be != 0 * @return the floor of the quotient */ static inline double floorDivide(double numerator, double denominator); /** * Divide two numbers, returning the floor of the quotient and * the modulus remainder. Unlike the built-in division, this is * mathematically well-behaved. E.g., -1/4 => 0 and * -1%4 => -1, but floorDivide(-1,4) => * -1 with remainder => 3. NOTE: If numerator is * too large, the returned quotient may overflow. * @param numerator the numerator * @param denominator a divisor which must be != 0 * @param remainder output parameter to receive the * remainder. Unlike numerator % denominator, this * will always be non-negative, in the half-open range [0, * |denominator|). * @return the floor of the quotient */ static int32_t floorDivide(double numerator, int32_t denominator, int32_t* remainder); /** * For a positive divisor, return the quotient and remainder * such that dividend = quotient*divisor + remainder and * 0 <= remainder < divisor. * * Works around edge-case bugs. 
Handles pathological input * (dividend >> divisor) reasonably. * * Calling with a divisor <= 0 is disallowed. */ static double floorDivide(double dividend, double divisor, double* remainder); }; // Useful millisecond constants #define kOneDay (1.0 * U_MILLIS_PER_DAY) // 86,400,000 #define kOneHour (60*60*1000) #define kOneMinute 60000 #define kOneSecond 1000 #define kOneMillisecond 1 #define kOneWeek (7.0 * kOneDay) // 604,800,000 // Epoch constants #define kJan1_1JulianDay 1721426 // January 1, year 1 (Gregorian) #define kEpochStartAsJulianDay 2440588 // January 1, 1970 (Gregorian) #define kEpochYear 1970 #define kEarliestViableMillis -185331720384000000.0 // minimum representable by julian day -1e17 #define kLatestViableMillis 185753453990400000.0 // max representable by julian day +1e17 /** * The minimum supported Julian day. This value is equivalent to * MIN_MILLIS. */ #define MIN_JULIAN (-0x7F000000) /** * The minimum supported epoch milliseconds. This value is equivalent * to MIN_JULIAN. */ #define MIN_MILLIS ((MIN_JULIAN - kEpochStartAsJulianDay) * kOneDay) /** * The maximum supported Julian day. This value is equivalent to * MAX_MILLIS. */ #define MAX_JULIAN (+0x7F000000) /** * The maximum supported epoch milliseconds. This value is equivalent * to MAX_JULIAN. */ #define MAX_MILLIS ((MAX_JULIAN - kEpochStartAsJulianDay) * kOneDay) /** * A utility class providing proleptic Gregorian calendar functions * used by time zone and calendar code. Do not instantiate. * * Note: Unlike GregorianCalendar, all computations performed by this * class occur in the pure proleptic GregorianCalendar. */ class Grego { public: /** * Return true if the given year is a leap year. * @param year Gregorian year, with 0 == 1 BCE, -1 == 2 BCE, etc. * @return true if the year is a leap year */ static inline UBool isLeapYear(int32_t year); /** * Return the number of days in the given month. * @param year Gregorian year, with 0 == 1 BCE, -1 == 2 BCE, etc. 
* @param month 0-based month, with 0==Jan * @return the number of days in the given month */ static inline int8_t monthLength(int32_t year, int32_t month); /** * Return the length of a previous month of the Gregorian calendar. * @param y the extended year * @param m the 0-based month number * @return the number of days in the month previous to the given month */ static inline int8_t previousMonthLength(int y, int m); /** * Convert a year, month, and day-of-month, given in the proleptic * Gregorian calendar, to 1970 epoch days. * @param year Gregorian year, with 0 == 1 BCE, -1 == 2 BCE, etc. * @param month 0-based month, with 0==Jan * @param dom 1-based day of month * @return the day number, with day 0 == Jan 1 1970 */ static double fieldsToDay(int32_t year, int32_t month, int32_t dom); /** * Convert a 1970-epoch day number to proleptic Gregorian year, * month, day-of-month, and day-of-week. * @param day 1970-epoch day (integral value) * @param year output parameter to receive year * @param month output parameter to receive month (0-based, 0==Jan) * @param dom output parameter to receive day-of-month (1-based) * @param dow output parameter to receive day-of-week (1-based, 1==Sun) * @param doy output parameter to receive day-of-year (1-based) */ static void dayToFields(double day, int32_t& year, int32_t& month, int32_t& dom, int32_t& dow, int32_t& doy); /** * Convert a 1970-epoch day number to proleptic Gregorian year, * month, day-of-month, and day-of-week. 
* @param day 1970-epoch day (integral value) * @param year output parameter to receive year * @param month output parameter to receive month (0-based, 0==Jan) * @param dom output parameter to receive day-of-month (1-based) * @param dow output parameter to receive day-of-week (1-based, 1==Sun) */ static inline void dayToFields(double day, int32_t& year, int32_t& month, int32_t& dom, int32_t& dow); /** * Convert a 1970-epoch milliseconds to proleptic Gregorian year, * month, day-of-month, and day-of-week, day of year and millis-in-day. * @param time 1970-epoch milliseconds * @param year output parameter to receive year * @param month output parameter to receive month (0-based, 0==Jan) * @param dom output parameter to receive day-of-month (1-based) * @param dow output parameter to receive day-of-week (1-based, 1==Sun) * @param doy output parameter to receive day-of-year (1-based) * @param mid output parameter to receive millis-in-day */ static void timeToFields(UDate time, int32_t& year, int32_t& month, int32_t& dom, int32_t& dow, int32_t& doy, int32_t& mid); /** * Return the day of week on the 1970-epoch day * @param day the 1970-epoch day (integral value) * @return the day of week */ static int32_t dayOfWeek(double day); /** * Returns the ordinal number for the specified day of week within the month. * The valid return value is 1, 2, 3, 4 or -1. * @param year Gregorian year, with 0 == 1 BCE, -1 == 2 BCE, etc. * @param month 0-based month, with 0==Jan * @param dom 1-based day of month * @return The ordinal number for the specified day of week within the month */ static int32_t dayOfWeekInMonth(int32_t year, int32_t month, int32_t dom); /** * Converts Julian day to time as milliseconds. * @param julian the given Julian day number. * @return time as milliseconds. * @internal */ static inline double julianDayToMillis(int32_t julian); /** * Converts time as milliseconds to Julian day. * @param millis the given milliseconds. * @return the Julian day number. 
* @internal */ static inline int32_t millisToJulianDay(double millis); /** * Calculates the Gregorian day shift value for an extended year. * @param eyear Extended year * @returns number of days to ADD to Julian in order to convert from J->G */ static inline int32_t gregorianShift(int32_t eyear); private: static const int16_t DAYS_BEFORE[24]; static const int8_t MONTH_LENGTH[24]; }; inline double ClockMath::floorDivide(double numerator, double denominator) { return uprv_floor(numerator / denominator); } inline UBool Grego::isLeapYear(int32_t year) { // year&0x3 == year%4 return ((year&0x3) == 0) && ((year%100 != 0) || (year%400 == 0)); } inline int8_t Grego::monthLength(int32_t year, int32_t month) { return MONTH_LENGTH[month + (isLeapYear(year) ? 12 : 0)]; } inline int8_t Grego::previousMonthLength(int y, int m) { return (m > 0) ? monthLength(y, m-1) : 31; } inline void Grego::dayToFields(double day, int32_t& year, int32_t& month, int32_t& dom, int32_t& dow) { int32_t doy_unused; dayToFields(day,year,month,dom,dow,doy_unused); } inline double Grego::julianDayToMillis(int32_t julian) { return (julian - kEpochStartAsJulianDay) * kOneDay; } inline int32_t Grego::millisToJulianDay(double millis) { return (int32_t) (kEpochStartAsJulianDay + ClockMath::floorDivide(millis, (double)kOneDay)); } inline int32_t Grego::gregorianShift(int32_t eyear) { int64_t y = (int64_t)eyear-1; int32_t gregShift = static_cast(ClockMath::floorDivide(y, (int64_t)400) - ClockMath::floorDivide(y, (int64_t)100) + 2); return gregShift; } U_NAMESPACE_END #endif // !UCONFIG_NO_FORMATTING #endif // GREGOIMP_H //eof stringi/src/icu74/i18n/collationdatabuilder.h0000644000176200001440000002360214770512021020572 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others. 
// License & terms of use: http://www.unicode.org/copyright.html /* ******************************************************************************* * Copyright (C) 2012-2014, International Business Machines * Corporation and others. All Rights Reserved. ******************************************************************************* * collationdatabuilder.h * * created on: 2012apr01 * created by: Markus W. Scherer */ #ifndef __COLLATIONDATABUILDER_H__ #define __COLLATIONDATABUILDER_H__ #include "unicode/utypes.h" #if !UCONFIG_NO_COLLATION #include "unicode/uniset.h" #include "unicode/unistr.h" #include "unicode/uversion.h" #include "collation.h" #include "collationdata.h" #include "collationsettings.h" #include "normalizer2impl.h" #include "utrie2.h" #include "uvectr32.h" #include "uvectr64.h" #include "uvector.h" U_NAMESPACE_BEGIN struct ConditionalCE32; class CollationFastLatinBuilder; class CopyHelper; class DataBuilderCollationIterator; class UCharsTrieBuilder; /** * Low-level CollationData builder. * Takes (character, CE) pairs and builds them into runtime data structures. * Supports characters with context prefixes and contraction suffixes. */ class U_I18N_API CollationDataBuilder : public UObject { public: /** * Collation element modifier. Interface class for a modifier * that changes a tailoring builder's temporary CEs to final CEs. * Called for every non-special CE32 and every expansion CE. */ class CEModifier : public UObject { public: virtual ~CEModifier(); /** Returns a new CE to replace the non-special input CE32, or else Collation::NO_CE. */ virtual int64_t modifyCE32(uint32_t ce32) const = 0; /** Returns a new CE to replace the input CE, or else Collation::NO_CE. 
*/ virtual int64_t modifyCE(int64_t ce) const = 0; }; CollationDataBuilder(UBool icu4xMode, UErrorCode &errorCode); virtual ~CollationDataBuilder(); void initForTailoring(const CollationData *b, UErrorCode &errorCode); virtual UBool isCompressibleLeadByte(uint32_t b) const; inline UBool isCompressiblePrimary(uint32_t p) const { return isCompressibleLeadByte(p >> 24); } /** * @return true if this builder has mappings (e.g., add() has been called) */ UBool hasMappings() const { return modified; } /** * @return true if c has CEs in this builder */ UBool isAssigned(UChar32 c) const; /** * @return the three-byte primary if c maps to a single such CE and has no context data, * otherwise returns 0. */ uint32_t getLongPrimaryIfSingleCE(UChar32 c) const; /** * @return the single CE for c. * Sets an error code if c does not have a single CE. */ int64_t getSingleCE(UChar32 c, UErrorCode &errorCode) const; void add(const UnicodeString &prefix, const UnicodeString &s, const int64_t ces[], int32_t cesLength, UErrorCode &errorCode); /** * Encodes the ces as either the returned ce32 by itself, * or by storing an expansion, with the returned ce32 referring to that. * * add(p, s, ces, cesLength) = addCE32(p, s, encodeCEs(ces, cesLength)) */ virtual uint32_t encodeCEs(const int64_t ces[], int32_t cesLength, UErrorCode &errorCode); void addCE32(const UnicodeString &prefix, const UnicodeString &s, uint32_t ce32, UErrorCode &errorCode); /** * Sets three-byte-primary CEs for a range of code points in code point order, * if it is worth doing; otherwise no change is made. * None of the code points in the range should have complex mappings so far * (expansions/contractions/prefixes). 
* @param start first code point * @param end last code point (inclusive) * @param primary primary weight for 'start' * @param step per-code point primary-weight increment * @param errorCode ICU in/out error code * @return true if an OFFSET_TAG range was used for start..end */ UBool maybeSetPrimaryRange(UChar32 start, UChar32 end, uint32_t primary, int32_t step, UErrorCode &errorCode); /** * Sets three-byte-primary CEs for a range of code points in code point order. * Sets range values if that is worth doing, or else individual values. * None of the code points in the range should have complex mappings so far * (expansions/contractions/prefixes). * @param start first code point * @param end last code point (inclusive) * @param primary primary weight for 'start' * @param step per-code point primary-weight increment * @param errorCode ICU in/out error code * @return the next primary after 'end': start primary incremented by ((end-start)+1)*step */ uint32_t setPrimaryRangeAndReturnNext(UChar32 start, UChar32 end, uint32_t primary, int32_t step, UErrorCode &errorCode); /** * Copies all mappings from the src builder, with modifications. * This builder here must not be built yet, and should be empty. */ void copyFrom(const CollationDataBuilder &src, const CEModifier &modifier, UErrorCode &errorCode); void optimize(const UnicodeSet &set, UErrorCode &errorCode); void suppressContractions(const UnicodeSet &set, UErrorCode &errorCode); void enableFastLatin() { fastLatinEnabled = true; } virtual void build(CollationData &data, UErrorCode &errorCode); /** * Looks up CEs for s and appends them to the ces array. * Does not handle normalization: s should be in FCD form. * * Does not write completely ignorable CEs. * Does not write beyond Collation::MAX_EXPANSION_LENGTH. 
* * @return incremented cesLength */ int32_t getCEs(const UnicodeString &s, int64_t ces[], int32_t cesLength); int32_t getCEs(const UnicodeString &prefix, const UnicodeString &s, int64_t ces[], int32_t cesLength); protected: friend class CopyHelper; friend class DataBuilderCollationIterator; uint32_t getCE32FromOffsetCE32(UBool fromBase, UChar32 c, uint32_t ce32) const; int32_t addCE(int64_t ce, UErrorCode &errorCode); int32_t addCE32(uint32_t ce32, UErrorCode &errorCode); int32_t addConditionalCE32(const UnicodeString &context, uint32_t ce32, UErrorCode &errorCode); inline ConditionalCE32 *getConditionalCE32(int32_t index) const { return static_cast(conditionalCE32s[index]); } inline ConditionalCE32 *getConditionalCE32ForCE32(uint32_t ce32) const { return getConditionalCE32(Collation::indexFromCE32(ce32)); } static uint32_t makeBuilderContextCE32(int32_t index) { return Collation::makeCE32FromTagAndIndex(Collation::BUILDER_DATA_TAG, index); } static inline UBool isBuilderContextCE32(uint32_t ce32) { return Collation::hasCE32Tag(ce32, Collation::BUILDER_DATA_TAG); } static uint32_t encodeOneCEAsCE32(int64_t ce); uint32_t encodeOneCE(int64_t ce, UErrorCode &errorCode); uint32_t encodeExpansion(const int64_t ces[], int32_t length, UErrorCode &errorCode); uint32_t encodeExpansion32(const int32_t newCE32s[], int32_t length, UErrorCode &errorCode); uint32_t copyFromBaseCE32(UChar32 c, uint32_t ce32, UBool withContext, UErrorCode &errorCode); /** * Copies base contractions to a list of ConditionalCE32. * Sets cond->next to the index of the first new item * and returns the index of the last new item. 
*/ int32_t copyContractionsFromBaseCE32(UnicodeString &context, UChar32 c, uint32_t ce32, ConditionalCE32 *cond, UErrorCode &errorCode); UBool getJamoCE32s(uint32_t jamoCE32s[], UErrorCode &errorCode); void setDigitTags(UErrorCode &errorCode); void setLeadSurrogates(UErrorCode &errorCode); void buildMappings(CollationData &data, UErrorCode &errorCode); void clearContexts(); void buildContexts(UErrorCode &errorCode); uint32_t buildContext(ConditionalCE32 *head, UErrorCode &errorCode); int32_t addContextTrie(uint32_t defaultCE32, UCharsTrieBuilder &trieBuilder, UErrorCode &errorCode); void buildFastLatinTable(CollationData &data, UErrorCode &errorCode); int32_t getCEs(const UnicodeString &s, int32_t start, int64_t ces[], int32_t cesLength); static UChar32 jamoCpFromIndex(int32_t i) { // 0 <= i < CollationData::JAMO_CE32S_LENGTH = 19 + 21 + 27 if(i < Hangul::JAMO_L_COUNT) { return Hangul::JAMO_L_BASE + i; } i -= Hangul::JAMO_L_COUNT; if(i < Hangul::JAMO_V_COUNT) { return Hangul::JAMO_V_BASE + i; } i -= Hangul::JAMO_V_COUNT; // i < 27 return Hangul::JAMO_T_BASE + 1 + i; } /** @see Collation::BUILDER_DATA_TAG */ static const uint32_t IS_BUILDER_JAMO_CE32 = 0x100; const Normalizer2Impl &nfcImpl; const CollationData *base; const CollationSettings *baseSettings; UTrie2 *trie; UVector32 ce32s; UVector64 ce64s; UVector conditionalCE32s; // vector of ConditionalCE32 // Characters that have context (prefixes or contraction suffixes). UnicodeSet contextChars; // Serialized UCharsTrie structures for finalized contexts. UnicodeString contexts; private: /** * The "era" of building intermediate contexts. * When the array of cached, temporary contexts overflows, then clearContexts() * removes them all and invalidates the builtCE32 that used to point to built tries. * See ConditionalCE32::era. 
*/ int32_t contextsEra = 0; protected: UnicodeSet unsafeBackwardSet; UBool modified; UBool icu4xMode; UBool fastLatinEnabled; CollationFastLatinBuilder *fastLatinBuilder; DataBuilderCollationIterator *collIter; }; U_NAMESPACE_END #endif // !UCONFIG_NO_COLLATION #endif // __COLLATIONDATABUILDER_H__ stringi/src/icu74/i18n/double-conversion-diy-fp.h0000644000176200001440000001264514700200761021233 0ustar liggesusers// © 2018 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html // // From the double-conversion library. Original license: // // Copyright 2010 the V8 project authors. All rights reserved. // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are // met: // // * Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // * Redistributions in binary form must reproduce the above // copyright notice, this list of conditions and the following // disclaimer in the documentation and/or other materials provided // with the distribution. // * Neither the name of Google Inc. nor the names of its // contributors may be used to endorse or promote products derived // from this software without specific prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR // A PARTICULAR PURPOSE ARE DISCLAIMED. 
// "Do It Yourself Floating Point": a value f * 2^e made of a 64-bit unsigned
// significand f and an int exponent e.
//
// A DiyFp is normalized when the top bit of the significand is set.
// Multiplication and subtraction leave their results un-normalized.
// Only non-negative numbers are representable; NaN and Infinity are not.
class DiyFp {
 public:
  static const int kSignificandSize = 64;

  // Zero significand, zero exponent.
  DiyFp() : f_(0), e_(0) {}
  DiyFp(const uint64_t significand, const int32_t exponent)
      : f_(significand), e_(exponent) {}

  // this -= other.
  // Preconditions (asserted): both exponents are equal and this->f() >= other.f().
  // The result is not normalized.
  void Subtract(const DiyFp& other) {
    DOUBLE_CONVERSION_ASSERT(e_ == other.e_);
    DOUBLE_CONVERSION_ASSERT(f_ >= other.f_);
    f_ -= other.f_;
  }

  // Returns a - b under the same preconditions as Subtract().
  // The result is not normalized.
  static DiyFp Minus(const DiyFp& a, const DiyFp& b) {
    DiyFp difference(a);
    difference.Subtract(b);
    return difference;
  }

  // this *= other.
  // Emulates a 64x64 -> 128 bit multiplication from 32-bit halves and keeps the
  // upper 64 bits; the lower 64 bits only contribute to rounding.
  void Multiply(const DiyFp& other) {
    const uint64_t kLow32 = 0xFFFFFFFFU;
    const uint64_t this_hi = f_ >> 32;
    const uint64_t this_lo = f_ & kLow32;
    const uint64_t that_hi = other.f_ >> 32;
    const uint64_t that_lo = other.f_ & kLow32;

    // The four partial products.
    const uint64_t hi_hi = this_hi * that_hi;
    const uint64_t lo_hi = this_lo * that_hi;
    const uint64_t hi_lo = this_hi * that_lo;
    const uint64_t lo_lo = this_lo * that_lo;

    // Everything that lands in bits 32..63 of the low word, plus 1 << 31 so
    // that the carry into the high word rounds the result (halfway rounds up).
    uint64_t middle = lo_lo >> 32;
    middle += hi_lo & kLow32;
    middle += lo_hi & kLow32;
    middle += (1U << 31);

    e_ += other.e_ + 64;
    f_ = hi_hi + (hi_lo >> 32) + (lo_hi >> 32) + (middle >> 32);
  }

  // Returns a * b (see Multiply()).
  static DiyFp Times(const DiyFp& a, const DiyFp& b) {
    DiyFp product(a);
    product.Multiply(b);
    return product;
  }

  // Shifts the significand left until its top bit is set, lowering the
  // exponent by the same amount. Precondition (asserted): f() != 0.
  void Normalize() {
    DOUBLE_CONVERSION_ASSERT(f_ != 0);
    // Mostly used for boundaries, which typically need a 10-bit shift, so
    // take 10-bit steps first and finish bit by bit.
    const uint64_t kTopTenBits = DOUBLE_CONVERSION_UINT64_2PART_C(0xFFC00000, 00000000);
    for (; (f_ & kTopTenBits) == 0; e_ -= 10) {
      f_ <<= 10;
    }
    for (; (f_ & kUint64MSB) == 0; e_--) {
      f_ <<= 1;
    }
  }

  // Returns a normalized copy of a.
  static DiyFp Normalize(const DiyFp& a) {
    DiyFp normalized(a);
    normalized.Normalize();
    return normalized;
  }

  uint64_t f() const { return f_; }
  int32_t e() const { return e_; }

  void set_f(uint64_t new_value) { f_ = new_value; }
  void set_e(int32_t new_value) { e_ = new_value; }

 private:
  // Mask for the most significant bit of the significand.
  static const uint64_t kUint64MSB = DOUBLE_CONVERSION_UINT64_2PART_C(0x80000000, 00000000);

  uint64_t f_;
  int32_t e_;
};
Wraps a SimpleNumberFormatter. */ struct USimpleNumberData : public UMemory, // Magic number as ASCII == "SNM" (SimpleNuMber) public IcuCApiHelper { SimpleNumber fNumber; }; /** * Implementation class for USimpleNumberFormatter. Wraps a SimpleNumberFormatter. */ struct USimpleNumberFormatterData : public UMemory, // Magic number as ASCII == "SNF" (SimpleNumberFormatter) public IcuCApiHelper { SimpleNumberFormatter fFormatter; }; struct UFormattedNumberImpl; // Magic number as ASCII == "FDN" (FormatteDNumber) typedef IcuCApiHelper UFormattedNumberApiHelper; struct UFormattedNumberImpl : public UFormattedValueImpl, public UFormattedNumberApiHelper { UFormattedNumberImpl(); ~UFormattedNumberImpl(); FormattedNumber fImpl; UFormattedNumberData fData; void setTo(FormattedNumber value); }; UFormattedNumberImpl::UFormattedNumberImpl() : fImpl(&fData) { fFormattedValue = &fImpl; } UFormattedNumberImpl::~UFormattedNumberImpl() { // Disown the data from fImpl so it doesn't get deleted twice fImpl.fData = nullptr; } void UFormattedNumberImpl::setTo(FormattedNumber value) { fData = std::move(*value.fData); } } } U_NAMESPACE_END UPRV_FORMATTED_VALUE_CAPI_NO_IMPLTYPE_AUTO_IMPL( UFormattedNumber, UFormattedNumberImpl, UFormattedNumberApiHelper, unumf) const DecimalQuantity* icu::number::impl::validateUFormattedNumberToDecimalQuantity( const UFormattedNumber* uresult, UErrorCode& status) { auto* result = UFormattedNumberApiHelper::validate(uresult, status); if (U_FAILURE(status)) { return nullptr; } return &result->fData.quantity; } U_CAPI UNumberFormatter* U_EXPORT2 unumf_openForSkeletonAndLocale(const char16_t* skeleton, int32_t skeletonLen, const char* locale, UErrorCode* ec) { auto* impl = new UNumberFormatterData(); if (impl == nullptr) { *ec = U_MEMORY_ALLOCATION_ERROR; return nullptr; } // Readonly-alias constructor (first argument is whether we are NUL-terminated) UnicodeString skeletonString(skeletonLen == -1, skeleton, skeletonLen); impl->fFormatter = 
/**
 * C API: creates a UNumberFormatter from a skeleton string and a locale,
 * additionally reporting skeleton parse errors.
 *
 * @param skeleton    skeleton text (aliased read-only, not copied here)
 * @param skeletonLen length of skeleton, or -1 if it is NUL-terminated
 * @param locale      locale ID passed on to the formatter
 * @param perror      receives parse error details; may be nullptr, in which
 *                    case the details go to a local scratch object and are dropped
 * @param ec          ICU in/out error code; set to U_MEMORY_ALLOCATION_ERROR
 *                    if the wrapper object cannot be allocated
 * @return the new formatter handle, or nullptr on allocation failure
 */
U_CAPI UNumberFormatter* U_EXPORT2
unumf_openForSkeletonAndLocaleWithError(const char16_t* skeleton, int32_t skeletonLen,
                                        const char* locale, UParseError* perror,
                                        UErrorCode* ec) {
    auto* data = new UNumberFormatterData();
    if (data == nullptr) {
        *ec = U_MEMORY_ALLOCATION_ERROR;
        return nullptr;
    }

    // Readonly-alias constructor: the first argument says whether the text is NUL-terminated.
    UnicodeString skeletonAlias(skeletonLen == -1, skeleton, skeletonLen);

    // Always hand forSkeleton() a valid UParseError, even when the caller is not interested.
    UParseError scratchParseError;
    UParseError& parseErrorSink = (perror != nullptr) ? *perror : scratchParseError;

    data->fFormatter =
            NumberFormatter::forSkeleton(skeletonAlias, parseErrorSink, *ec).locale(locale);
    return data->exportForC();
}
UFormattedNumberApiHelper::validate(uresult, *ec); if (U_FAILURE(*ec)) { return; } result->fData.resetString(); result->fData.quantity.clear(); result->fData.quantity.setToDecNumber({value, valueLen}, *ec); if (U_FAILURE(*ec)) { return; } formatter->fFormatter.formatImpl(&result->fData, *ec); } U_CAPI int32_t U_EXPORT2 unumf_resultToString(const UFormattedNumber* uresult, char16_t* buffer, int32_t bufferCapacity, UErrorCode* ec) { const auto* result = UFormattedNumberApiHelper::validate(uresult, *ec); if (U_FAILURE(*ec)) { return 0; } if (buffer == nullptr ? bufferCapacity != 0 : bufferCapacity < 0) { *ec = U_ILLEGAL_ARGUMENT_ERROR; return 0; } return result->fData.toTempString(*ec).extract(buffer, bufferCapacity, *ec); } U_CAPI UBool U_EXPORT2 unumf_resultNextFieldPosition(const UFormattedNumber* uresult, UFieldPosition* ufpos, UErrorCode* ec) { const auto* result = UFormattedNumberApiHelper::validate(uresult, *ec); if (U_FAILURE(*ec)) { return false; } if (ufpos == nullptr) { *ec = U_ILLEGAL_ARGUMENT_ERROR; return false; } FieldPosition fp; fp.setField(ufpos->field); fp.setBeginIndex(ufpos->beginIndex); fp.setEndIndex(ufpos->endIndex); bool retval = result->fData.nextFieldPosition(fp, *ec); ufpos->beginIndex = fp.getBeginIndex(); ufpos->endIndex = fp.getEndIndex(); // NOTE: MSVC sometimes complains when implicitly converting between bool and UBool return retval ? 
/**
 * C API: reports all field positions of a formatted number through a
 * UFieldPositionIterator.
 *
 * @param uresult   the formatted-number handle (validated here)
 * @param ufpositer iterator handle to fill; must not be nullptr
 * @param ec        ICU in/out error code; set to U_ILLEGAL_ARGUMENT_ERROR
 *                  if ufpositer is nullptr
 */
U_CAPI void U_EXPORT2
unumf_resultGetAllFieldPositions(const UFormattedNumber* uresult, UFieldPositionIterator* ufpositer,
                                 UErrorCode* ec) {
    const auto* result = UFormattedNumberApiHelper::validate(uresult, *ec);
    if (U_FAILURE(*ec)) { return; }

    if (ufpositer == nullptr) {
        *ec = U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }

    // The C handle is an opaque alias for the C++ iterator; the cast must name
    // its target type explicitly to be well-formed.
    auto* fpi = reinterpret_cast<FieldPositionIterator*>(ufpositer);
    FieldPositionIteratorHandler fpih(fpi, *ec);
    result->fData.getAllFieldPositions(fpih, *ec);
}
USimpleNumberData::validate(unumber, *ec); if (U_FAILURE(*ec)) { return; } number->fNumber.roundTo(position, roundingMode, *ec); } U_CAPI void U_EXPORT2 usnum_setMinimumIntegerDigits(USimpleNumber* unumber, int32_t minimumIntegerDigits, UErrorCode* ec) { auto* number = USimpleNumberData::validate(unumber, *ec); if (U_FAILURE(*ec)) { return; } number->fNumber.setMinimumIntegerDigits(minimumIntegerDigits, *ec); } U_CAPI void U_EXPORT2 usnum_setMinimumFractionDigits(USimpleNumber* unumber, int32_t minimumFractionDigits, UErrorCode* ec) { auto* number = USimpleNumberData::validate(unumber, *ec); if (U_FAILURE(*ec)) { return; } number->fNumber.setMinimumFractionDigits(minimumFractionDigits, *ec); } U_CAPI void U_EXPORT2 usnum_truncateStart(USimpleNumber* unumber, int32_t maximumIntegerDigits, UErrorCode* ec) { auto* number = USimpleNumberData::validate(unumber, *ec); if (U_FAILURE(*ec)) { return; } number->fNumber.truncateStart(maximumIntegerDigits, *ec); } U_CAPI void U_EXPORT2 usnum_setSign(USimpleNumber* unumber, USimpleNumberSign sign, UErrorCode* ec) { auto* number = USimpleNumberData::validate(unumber, *ec); if (U_FAILURE(*ec)) { return; } number->fNumber.setSign(sign, *ec); } U_CAPI USimpleNumberFormatter* U_EXPORT2 usnumf_openForLocale(const char* locale, UErrorCode* ec) { auto* impl = new USimpleNumberFormatterData(); if (impl == nullptr) { *ec = U_MEMORY_ALLOCATION_ERROR; return nullptr; } impl->fFormatter = SimpleNumberFormatter::forLocale(locale, *ec); return impl->exportForC(); } U_CAPI USimpleNumberFormatter* U_EXPORT2 usnumf_openForLocaleAndGroupingStrategy( const char* locale, UNumberGroupingStrategy groupingStrategy, UErrorCode* ec) { auto* impl = new USimpleNumberFormatterData(); if (impl == nullptr) { *ec = U_MEMORY_ALLOCATION_ERROR; return nullptr; } impl->fFormatter = SimpleNumberFormatter::forLocaleAndGroupingStrategy(locale, groupingStrategy, *ec); return impl->exportForC(); } U_CAPI void U_EXPORT2 usnumf_format( const USimpleNumberFormatter* 
/**
 * C API: formats a 64-bit integer with a USimpleNumberFormatter and stores the
 * outcome in an existing UFormattedNumber.
 *
 * @param uformatter the simple number formatter handle (validated here)
 * @param value      the integer to format
 * @param uresult    the result handle to overwrite (validated here)
 * @param ec         ICU in/out error code; on failure uresult is left untouched
 */
U_CAPI void U_EXPORT2
usnumf_formatInt64(
        const USimpleNumberFormatter* uformatter,
        int64_t value,
        UFormattedNumber* uresult,
        UErrorCode* ec) {
    auto* formatter = USimpleNumberFormatterData::validate(uformatter, *ec);
    auto* result = UFormattedNumberApiHelper::validate(uresult, *ec);
    if (U_FAILURE(*ec)) {
        return;
    }
    auto localResult = formatter->fFormatter.formatInt64(value, *ec);
    // UFormattedNumberImpl::setTo() dereferences the FormattedNumber's data
    // pointer unconditionally, so a failed format must not be handed to it
    // (same guard as in usnumf_format).
    // NOTE(review): presumably a failed format leaves that pointer null — confirm
    // against SimpleNumberFormatter::formatInt64.
    if (U_FAILURE(*ec)) {
        return;
    }
    result->setTo(std::move(localResult));
}
// License & terms of use: http://www.unicode.org/copyright.html #include "unicode/utypes.h" #if !UCONFIG_NO_FORMATTING #ifndef __NUMBER_TYPES_H__ #define __NUMBER_TYPES_H__ #include #include "unicode/decimfmt.h" #include "unicode/unum.h" #include "unicode/numsys.h" #include "unicode/numberformatter.h" #include "unicode/utf16.h" #include "uassert.h" #include "unicode/platform.h" #include "unicode/uniset.h" #include "standardplural.h" #include "formatted_string_builder.h" U_NAMESPACE_BEGIN namespace number { namespace impl { // For convenience and historical reasons, import the Field typedef to the namespace. typedef FormattedStringBuilder::Field Field; // Typedef several enums for brevity and for easier comparison to Java. typedef UNumberFormatRoundingMode RoundingMode; typedef UNumberFormatPadPosition PadPosition; typedef UNumberCompactStyle CompactStyle; // ICU4J Equivalent: RoundingUtils.MAX_INT_FRAC_SIG static constexpr int32_t kMaxIntFracSig = 999; // ICU4J Equivalent: RoundingUtils.DEFAULT_ROUNDING_MODE static constexpr RoundingMode kDefaultMode = RoundingMode::UNUM_FOUND_HALFEVEN; // ICU4J Equivalent: Padder.FALLBACK_PADDING_STRING static constexpr char16_t kFallbackPaddingString[] = u" "; // Forward declarations: class Modifier; class MutablePatternModifier; class DecimalQuantity; class ModifierStore; struct MicroProps; enum AffixPatternType { // Represents a literal character; the value is stored in the code point field. TYPE_CODEPOINT = 0, // Represents a minus sign symbol '-'. TYPE_MINUS_SIGN = -1, // Represents a plus sign symbol '+'. TYPE_PLUS_SIGN = -2, // Represents an approximately sign symbol '~'. TYPE_APPROXIMATELY_SIGN = -3, // Represents a percent sign symbol '%'. TYPE_PERCENT = -4, // Represents a permille sign symbol '‰'. TYPE_PERMILLE = -5, // Represents a single currency symbol '¤'. TYPE_CURRENCY_SINGLE = -6, // Represents a double currency symbol '¤¤'. TYPE_CURRENCY_DOUBLE = -7, // Represents a triple currency symbol '¤¤¤'. 
/**
 * Abstract interface for querying the affixes of a number pattern.
 *
 * Most accessors take an integer "flags" argument built from the AFFIX_*
 * constants below.
 * NOTE(review): the exact meaning of each flag is defined by the implementing
 * classes, which are not visible here — confirm there before relying on the
 * summaries given for the individual bits.
 */
class U_I18N_API AffixPatternProvider {
  public:
    // Mask covering the low 8 bits of the flags (presumably a plural form — TODO confirm).
    static const int32_t AFFIX_PLURAL_MASK = 0xff;
    // Bit 8: set for a prefix; the *_SUFFIX compound flags below leave it clear.
    static const int32_t AFFIX_PREFIX = 0x100;
    // Bit 9: set for the negative subpattern; the AFFIX_POS_* flags below leave it clear.
    static const int32_t AFFIX_NEGATIVE_SUBPATTERN = 0x200;
    // Bit 10 (presumably selects padding-related affix text — TODO confirm).
    static const int32_t AFFIX_PADDING = 0x400;

    // Convenience compound flags
    static const int32_t AFFIX_POS_PREFIX = AFFIX_PREFIX;
    static const int32_t AFFIX_POS_SUFFIX = 0;
    static const int32_t AFFIX_NEG_PREFIX = AFFIX_PREFIX | AFFIX_NEGATIVE_SUBPATTERN;
    static const int32_t AFFIX_NEG_SUFFIX = AFFIX_NEGATIVE_SUBPATTERN;

    virtual ~AffixPatternProvider();

    // Character-level and string-level access to the affix selected by flags.
    virtual char16_t charAt(int flags, int i) const = 0;

    virtual int length(int flags) const = 0;

    virtual UnicodeString getString(int flags) const = 0;

    // Boolean queries about the pattern as a whole; the names are the only
    // contract visible in this header.
    virtual bool hasCurrencySign() const = 0;

    virtual bool positiveHasPlusSign() const = 0;

    virtual bool hasNegativeSubpattern() const = 0;

    virtual bool negativeHasMinusSign() const = 0;

    virtual bool containsSymbolType(AffixPatternType, UErrorCode&) const = 0;

    /**
     * True if the pattern has a number placeholder like "0" or "#,##0.00"; false if the pattern does not
     * have one. This is used in cases like compact notation, where the pattern replaces the entire
     * number instead of rendering the number.
     */
    virtual bool hasBody() const = 0;

    /**
     * True if the currency symbol should replace the decimal separator.
     */
    virtual bool currencyAsDecimal() const = 0;
};
A Modifier usually contains a prefix and a suffix that are applied, but it could contain something else, * like a {@link com.ibm.icu.text.SimpleFormatter} pattern. * * A Modifier is usually immutable, except in cases such as {@link MutablePatternModifier}, which are mutable for performance * reasons. * * Exported as U_I18N_API because it is a base class for other exported types */ class U_I18N_API Modifier { public: virtual ~Modifier(); /** * Apply this Modifier to the string builder. * * @param output * The string builder to which to apply this modifier. * @param leftIndex * The left index of the string within the builder. Equal to 0 when only one number is being formatted. * @param rightIndex * The right index of the string within the string builder. Equal to length when only one number is being * formatted. * @return The number of characters (UTF-16 code units) that were added to the string builder. */ virtual int32_t apply(FormattedStringBuilder& output, int leftIndex, int rightIndex, UErrorCode& status) const = 0; /** * Gets the length of the prefix. This information can be used in combination with {@link #apply} to extract the * prefix and suffix strings. * * @return The number of characters (UTF-16 code units) in the prefix. */ virtual int32_t getPrefixLength() const = 0; /** * Returns the number of code points in the modifier, prefix plus suffix. */ virtual int32_t getCodePointCount() const = 0; /** * Whether this modifier is strong. If a modifier is strong, it should always be applied immediately and not allowed * to bubble up. With regard to padding, strong modifiers are considered to be on the inside of the prefix and * suffix. * * @return Whether the modifier is strong. */ virtual bool isStrong() const = 0; /** * Whether the modifier contains at least one occurrence of the given field. */ virtual bool containsField(Field field) const = 0; /** * A fill-in for getParameters(). 
obj will always be set; if non-null, the other * two fields are also safe to read. */ struct U_I18N_API Parameters { const ModifierStore* obj = nullptr; Signum signum; StandardPlural::Form plural; Parameters(); Parameters(const ModifierStore* _obj, Signum _signum, StandardPlural::Form _plural); }; /** * Gets a set of "parameters" for this Modifier. * * TODO: Make this return a `const Parameters*` more like Java? */ virtual void getParameters(Parameters& output) const = 0; /** * Returns whether this Modifier is *semantically equivalent* to the other Modifier; * in many cases, this is the same as equal, but parameters should be ignored. */ virtual bool semanticallyEquivalent(const Modifier& other) const = 0; }; /** * This is *not* a modifier; rather, it is an object that can return modifiers * based on given parameters. * * Exported as U_I18N_API because it is a base class for other exported types. */ class U_I18N_API ModifierStore { public: virtual ~ModifierStore(); /** * Returns a Modifier with the given parameters (best-effort). */ virtual const Modifier* getModifier(Signum signum, StandardPlural::Form plural) const = 0; }; /** * This interface is used when all number formatting settings, including the locale, are known, except for the quantity * itself. The {@link #processQuantity} method performs the final step in the number processing pipeline: it uses the * quantity to generate a finalized {@link MicroProps}, which can be used to render the number to output. * * In other words, this interface is used for the parts of number processing that are quantity-dependent. * * In order to allow for multiple different objects to all mutate the same MicroProps, a "chain" of MicroPropsGenerators * are linked together, and each one is responsible for manipulating a certain quantity-dependent part of the * MicroProps. At the tail of the linked list is a base instance of {@link MicroProps} with properties that are not * quantity-dependent. 
Each element in the linked list calls {@link #processQuantity} on its "parent", then does its * work, and then returns the result. * * This chain of MicroPropsGenerators is typically constructed by NumberFormatterImpl::macrosToMicroGenerator() when * constructing a NumberFormatter. * * Exported as U_I18N_API because it is a base class for other exported types * */ class U_I18N_API MicroPropsGenerator { public: virtual ~MicroPropsGenerator() = default; /** * Considers the given {@link DecimalQuantity}, optionally mutates it, and * populates a {@link MicroProps} instance. * * @param quantity The quantity for consideration and optional mutation. * @param micros The MicroProps instance to populate. It will be modified as * needed for the given quantity. */ virtual void processQuantity(DecimalQuantity& quantity, MicroProps& micros, UErrorCode& status) const = 0; }; /** * An interface used by compact notation and scientific notation to choose a multiplier while rounding. */ class MultiplierProducer { public: virtual ~MultiplierProducer(); /** * Maps a magnitude to a multiplier in powers of ten. For example, in compact notation in English, a magnitude of 5 * (e.g., 100,000) should return a multiplier of -3, since the number is displayed in thousands. * * @param magnitude * The power of ten of the input number. * @return The shift in powers of ten. 
/**
 * A value of type T that can also be "null" (unset).
 *
 * The stored value is value-initialized when the object starts out null, so
 * get() and getNoError() return a determinate T even if no value was ever set.
 * T must be default-constructible, copy-assignable and comparable with ==.
 */
// Exported as U_I18N_API because it is a public member field of exported DecimalFormatProperties
template<typename T>
class U_I18N_API NullableValue {
  public:
    /** Creates a null value. */
    NullableValue()
            : fNull(true), fValue() {}

    NullableValue(const NullableValue<T>& other) = default;

    /** Creates a non-null value holding a copy of other. */
    explicit NullableValue(const T& other)
            : fNull(false), fValue(other) {}

    /** Copies the null flag, and the value only when other is non-null. */
    NullableValue<T>& operator=(const NullableValue<T>& other) {
        fNull = other.fNull;
        if (!fNull) {
            fValue = other.fValue;
        }
        return *this;
    }

    /** Stores other and marks this object as non-null. */
    NullableValue<T>& operator=(const T& other) {
        fValue = other;
        fNull = false;
        return *this;
    }

    /** Two null values are equal; otherwise both must be non-null with equal values. */
    bool operator==(const NullableValue& other) const {
        // "fValue == other.fValue" returns UBool, not bool (causes compiler warnings)
        return fNull ? other.fNull : (other.fNull ? false : static_cast<bool>(fValue == other.fValue));
    }

    /** Marks this object as null; the stored value is left as it is. */
    void nullify() {
        // TODO: It might be nice to call the destructor here.
        fNull = true;
    }

    bool isNull() const {
        return fNull;
    }

    /**
     * Returns the stored value; sets status to U_UNDEFINED_VARIABLE when null
     * (the stored value is still returned in that case).
     */
    T get(UErrorCode& status) const {
        if (fNull) {
            status = U_UNDEFINED_VARIABLE;
        }
        return fValue;
    }

    /** Returns the stored value without any null check. */
    T getNoError() const {
        return fValue;
    }

    /** Returns the stored value, or defaultValue when null. */
    T getOrDefault(T defaultValue) const {
        return fNull ? defaultValue : fValue;
    }

  private:
    bool fNull;
    T fValue;
};
*/ // Exported as U_I18N_API for tests class U_I18N_API MultiplierFormatHandler : public MicroPropsGenerator, public UMemory { public: MultiplierFormatHandler() = default; // WARNING: Leaves object in an unusable state; call setAndChain() void setAndChain(const Scale& multiplier, const MicroPropsGenerator* parent); void processQuantity(DecimalQuantity& quantity, MicroProps& micros, UErrorCode& status) const override; private: Scale fMultiplier; const MicroPropsGenerator *fParent; }; /** Gets a Scale from a DecimalFormatProperties. In Java, defined in RoundingUtils.java */ static inline Scale scaleFromProperties(const DecimalFormatProperties& properties) { int32_t magnitudeMultiplier = properties.magnitudeMultiplier + properties.multiplierScale; int32_t arbitraryMultiplier = properties.multiplier; if (magnitudeMultiplier != 0 && arbitraryMultiplier != 1) { return Scale::byDoubleAndPowerOfTen(arbitraryMultiplier, magnitudeMultiplier); } else if (magnitudeMultiplier != 0) { return Scale::powerOfTen(magnitudeMultiplier); } else if (arbitraryMultiplier != 1) { return Scale::byDouble(arbitraryMultiplier); } else { return Scale::none(); } } } // namespace impl } // namespace number U_NAMESPACE_END #endif //__SOURCE_NUMBER_MULTIPLIER_H__ #endif /* #if !UCONFIG_NO_FORMATTING */ stringi/src/icu74/i18n/nultrans.h0000644000176200001440000000423014700200761016245 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ********************************************************************** * Copyright (c) 2000-2007, International Business Machines * Corporation and others. All Rights Reserved. ********************************************************************** * Date Name Description * 01/11/2000 aliu Creation. 
********************************************************************** */ #ifndef NULTRANS_H #define NULTRANS_H #include "unicode/utypes.h" #if !UCONFIG_NO_TRANSLITERATION #include "unicode/translit.h" U_NAMESPACE_BEGIN /** * A transliterator that leaves text unchanged. * @author Alan Liu * @internal Use transliterator factory methods instead since this class will be removed in that release. */ class NullTransliterator : public Transliterator { public: /** * Constructs a transliterator. * @internal Use transliterator factory methods instead since this class will be removed in that release. */ NullTransliterator(); /** * Destructor. * @internal Use transliterator factory methods instead since this class will be removed in that release. */ virtual ~NullTransliterator(); /** * Transliterator API. * @internal Use transliterator factory methods instead since this class will be removed in that release. */ virtual NullTransliterator* clone() const override; /** * Implements {@link Transliterator#handleTransliterate}. * @internal Use transliterator factory methods instead since this class will be removed in that release. */ virtual void handleTransliterate(Replaceable& text, UTransPosition& offset, UBool isIncremental) const override; /** * ICU "poor man's RTTI", returns a UClassID for the actual class. */ virtual UClassID getDynamicClassID() const override; /** * ICU "poor man's RTTI", returns a UClassID for this class. */ U_I18N_API static UClassID U_EXPORT2 getStaticClassID(); }; U_NAMESPACE_END #endif /* #if !UCONFIG_NO_TRANSLITERATION */ #endif stringi/src/icu74/i18n/utf8collationiterator.h0000644000176200001440000001233714700200761020753 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ******************************************************************************* * Copyright (C) 2012-2016, International Business Machines * Corporation and others. All Rights Reserved. 
******************************************************************************* * utf8collationiterator.h * * created on: 2012nov12 (from utf16collationiterator.h & uitercollationiterator.h) * created by: Markus W. Scherer */ #ifndef __UTF8COLLATIONITERATOR_H__ #define __UTF8COLLATIONITERATOR_H__ #include "unicode/utypes.h" #if !UCONFIG_NO_COLLATION #include "cmemory.h" #include "collation.h" #include "collationdata.h" #include "collationiterator.h" #include "normalizer2impl.h" U_NAMESPACE_BEGIN /** * UTF-8 collation element and character iterator. * Handles normalized UTF-8 text inline, with length or NUL-terminated. * Unnormalized text is handled by a subclass. */ class U_I18N_API UTF8CollationIterator : public CollationIterator { public: UTF8CollationIterator(const CollationData *d, UBool numeric, const uint8_t *s, int32_t p, int32_t len) : CollationIterator(d, numeric), u8(s), pos(p), length(len) {} virtual ~UTF8CollationIterator(); virtual void resetToOffset(int32_t newOffset) override; virtual int32_t getOffset() const override; virtual UChar32 nextCodePoint(UErrorCode &errorCode) override; virtual UChar32 previousCodePoint(UErrorCode &errorCode) override; protected: /** * For byte sequences that are illegal in UTF-8, an error value may be returned * together with a bogus code point. The caller will ignore that code point. * * Special values may be returned for surrogate code points, which are also illegal in UTF-8, * but the caller will treat them like U+FFFD because forbidSurrogateCodePoints() returns true. * * Valid lead surrogates are returned from inside a normalized text segment, * where handleGetTrailSurrogate() will return the matching trail surrogate. 
*/ virtual uint32_t handleNextCE32(UChar32 &c, UErrorCode &errorCode) override; virtual UBool foundNULTerminator() override; virtual UBool forbidSurrogateCodePoints() const override; virtual void forwardNumCodePoints(int32_t num, UErrorCode &errorCode) override; virtual void backwardNumCodePoints(int32_t num, UErrorCode &errorCode) override; const uint8_t *u8; int32_t pos; int32_t length; // <0 for NUL-terminated strings }; /** * Incrementally checks the input text for FCD and normalizes where necessary. */ class U_I18N_API FCDUTF8CollationIterator : public UTF8CollationIterator { public: FCDUTF8CollationIterator(const CollationData *data, UBool numeric, const uint8_t *s, int32_t p, int32_t len) : UTF8CollationIterator(data, numeric, s, p, len), state(CHECK_FWD), start(p), nfcImpl(data->nfcImpl) {} virtual ~FCDUTF8CollationIterator(); virtual void resetToOffset(int32_t newOffset) override; virtual int32_t getOffset() const override; virtual UChar32 nextCodePoint(UErrorCode &errorCode) override; virtual UChar32 previousCodePoint(UErrorCode &errorCode) override; protected: virtual uint32_t handleNextCE32(UChar32 &c, UErrorCode &errorCode) override; virtual char16_t handleGetTrailSurrogate() override; virtual UBool foundNULTerminator() override; virtual void forwardNumCodePoints(int32_t num, UErrorCode &errorCode) override; virtual void backwardNumCodePoints(int32_t num, UErrorCode &errorCode) override; private: UBool nextHasLccc() const; UBool previousHasTccc() const; /** * Switches to forward checking if possible. */ void switchToForward(); /** * Extends the FCD text segment forward or normalizes around pos. * @return true if success */ UBool nextSegment(UErrorCode &errorCode); /** * Switches to backward checking. */ void switchToBackward(); /** * Extends the FCD text segment backward or normalizes around pos. 
* @return true if success */ UBool previousSegment(UErrorCode &errorCode); UBool normalize(const UnicodeString &s, UErrorCode &errorCode); enum State { /** * The input text [start..pos[ passes the FCD check. * Moving forward checks incrementally. * limit is undefined. */ CHECK_FWD, /** * The input text [pos..limit[ passes the FCD check. * Moving backward checks incrementally. * start is undefined. */ CHECK_BWD, /** * The input text [start..limit[ passes the FCD check. * pos tracks the current text index. */ IN_FCD_SEGMENT, /** * The input text [start..limit[ failed the FCD check and was normalized. * pos tracks the current index in the normalized string. */ IN_NORMALIZED }; State state; int32_t start; int32_t limit; const Normalizer2Impl &nfcImpl; UnicodeString normalized; }; U_NAMESPACE_END #endif // !UCONFIG_NO_COLLATION #endif // __UTF8COLLATIONITERATOR_H__ stringi/src/icu74/i18n/csrucode.cpp0000644000176200001440000001267714700200761016557 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ********************************************************************** * Copyright (C) 2005-2013, International Business Machines * Corporation and others. All Rights Reserved. ********************************************************************** */ #include "unicode/utypes.h" #if !UCONFIG_NO_CONVERSION #include "csrucode.h" #include "csmatch.h" U_NAMESPACE_BEGIN CharsetRecog_Unicode::~CharsetRecog_Unicode() { // nothing to do } CharsetRecog_UTF_16_BE::~CharsetRecog_UTF_16_BE() { // nothing to do } const char *CharsetRecog_UTF_16_BE::getName() const { return "UTF-16BE"; } // UTF-16 confidence calculation. Very simple minded, but better than nothing. // Any 8 bit non-control characters bump the confidence up. These have a zero high byte, // and are very likely to be UTF-16, although they could also be part of a UTF-32 code. 
// NULs are a contra-indication, they will appear commonly if the actual encoding is UTF-32. // NULs should be rare in actual text. static int32_t adjustConfidence(char16_t codeUnit, int32_t confidence) { if (codeUnit == 0) { confidence -= 10; } else if ((codeUnit >= 0x20 && codeUnit <= 0xff) || codeUnit == 0x0a) { confidence += 10; } if (confidence < 0) { confidence = 0; } else if (confidence > 100) { confidence = 100; } return confidence; } UBool CharsetRecog_UTF_16_BE::match(InputText* textIn, CharsetMatch *results) const { const uint8_t *input = textIn->fRawInput; int32_t confidence = 10; int32_t length = textIn->fRawLength; int32_t bytesToCheck = (length > 30) ? 30 : length; for (int32_t charIndex=0; charIndexset(textIn, this, confidence); return (confidence > 0); } CharsetRecog_UTF_16_LE::~CharsetRecog_UTF_16_LE() { // nothing to do } const char *CharsetRecog_UTF_16_LE::getName() const { return "UTF-16LE"; } UBool CharsetRecog_UTF_16_LE::match(InputText* textIn, CharsetMatch *results) const { const uint8_t *input = textIn->fRawInput; int32_t confidence = 10; int32_t length = textIn->fRawLength; int32_t bytesToCheck = (length > 30) ? 
30 : length; for (int32_t charIndex=0; charIndex= 4 && input[2] == 0 && input[3] == 0) { confidence = 0; // UTF-32 BOM } break; } confidence = adjustConfidence(codeUnit, confidence); if (confidence == 0 || confidence == 100) { break; } } if (bytesToCheck < 4 && confidence < 100) { confidence = 0; } results->set(textIn, this, confidence); return (confidence > 0); } CharsetRecog_UTF_32::~CharsetRecog_UTF_32() { // nothing to do } UBool CharsetRecog_UTF_32::match(InputText* textIn, CharsetMatch *results) const { const uint8_t *input = textIn->fRawInput; int32_t limit = (textIn->fRawLength / 4) * 4; int32_t numValid = 0; int32_t numInvalid = 0; bool hasBOM = false; int32_t confidence = 0; if (limit > 0 && getChar(input, 0) == 0x0000FEFFUL) { hasBOM = true; } for(int32_t i = 0; i < limit; i += 4) { int32_t ch = getChar(input, i); if (ch < 0 || ch >= 0x10FFFF || (ch >= 0xD800 && ch <= 0xDFFF)) { numInvalid += 1; } else { numValid += 1; } } // Cook up some sort of confidence score, based on presence of a BOM // and the existence of valid and/or invalid multi-byte sequences. if (hasBOM && numInvalid==0) { confidence = 100; } else if (hasBOM && numValid > numInvalid*10) { confidence = 80; } else if (numValid > 3 && numInvalid == 0) { confidence = 100; } else if (numValid > 0 && numInvalid == 0) { confidence = 80; } else if (numValid > numInvalid*10) { // Probably corrupt UTF-32BE data. Valid sequences aren't likely by chance. 
confidence = 25; } results->set(textIn, this, confidence); return (confidence > 0); } CharsetRecog_UTF_32_BE::~CharsetRecog_UTF_32_BE() { // nothing to do } const char *CharsetRecog_UTF_32_BE::getName() const { return "UTF-32BE"; } int32_t CharsetRecog_UTF_32_BE::getChar(const uint8_t *input, int32_t index) const { return input[index + 0] << 24 | input[index + 1] << 16 | input[index + 2] << 8 | input[index + 3]; } CharsetRecog_UTF_32_LE::~CharsetRecog_UTF_32_LE() { // nothing to do } const char *CharsetRecog_UTF_32_LE::getName() const { return "UTF-32LE"; } int32_t CharsetRecog_UTF_32_LE::getChar(const uint8_t *input, int32_t index) const { return input[index + 3] << 24 | input[index + 2] << 16 | input[index + 1] << 8 | input[index + 0]; } U_NAMESPACE_END #endif stringi/src/icu74/i18n/collationdatareader.h0000644000176200001440000002415214700200761020405 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ******************************************************************************* * Copyright (C) 2013-2015, International Business Machines * Corporation and others. All Rights Reserved. ******************************************************************************* * collationdatareader.h * * created on: 2013feb07 * created by: Markus W. Scherer */ #ifndef __COLLATIONDATAREADER_H__ #define __COLLATIONDATAREADER_H__ #include "unicode/utypes.h" #if !UCONFIG_NO_COLLATION #include "unicode/udata.h" struct UDataMemory; U_NAMESPACE_BEGIN struct CollationTailoring; /** * Collation binary data reader. */ struct U_I18N_API CollationDataReader /* all static */ { // The following constants are also copied into source/common/ucol_swp.cpp. // Keep them in sync! enum { /** * Number of int32_t indexes. * * Can be 2 if there are only options. * Can be 7 or 8 if there are only options and a script reordering. * The loader treats any index>=indexes[IX_INDEXES_LENGTH] as 0. 
*/ IX_INDEXES_LENGTH, // 0 /** * Bits 31..24: numericPrimary, for numeric collation * 23..16: fast Latin format version (0 = no fast Latin table) * 15.. 0: options bit set */ IX_OPTIONS, IX_RESERVED2, IX_RESERVED3, /** Array offset to Jamo CE32s in ce32s[], or <0 if none. */ IX_JAMO_CE32S_START, // 4 // Byte offsets from the start of the data, after the generic header. // The indexes[] are at byte offset 0, other data follows. // Each data item is aligned properly. // The data items should be in descending order of unit size, // to minimize the need for padding. // Each item's byte length is given by the difference between its offset and // the next index/offset value. /** Byte offset to int32_t reorderCodes[]. */ IX_REORDER_CODES_OFFSET, /** * Byte offset to uint8_t reorderTable[]. * Empty table if <256 bytes (padding only). * Otherwise 256 bytes or more (with padding). */ IX_REORDER_TABLE_OFFSET, /** Byte offset to the collation trie. Its length is a multiple of 8 bytes. */ IX_TRIE_OFFSET, IX_RESERVED8_OFFSET, // 8 /** Byte offset to int64_t ces[]. */ IX_CES_OFFSET, IX_RESERVED10_OFFSET, /** Byte offset to uint32_t ce32s[]. */ IX_CE32S_OFFSET, /** Byte offset to uint32_t rootElements[]. */ IX_ROOT_ELEMENTS_OFFSET, // 12 /** Byte offset to char16_t *contexts[]. */ IX_CONTEXTS_OFFSET, /** Byte offset to uint16_t [] with serialized unsafeBackwardSet. */ IX_UNSAFE_BWD_OFFSET, /** Byte offset to uint16_t fastLatinTable[]. */ IX_FAST_LATIN_TABLE_OFFSET, /** Byte offset to uint16_t scripts[]. */ IX_SCRIPTS_OFFSET, // 16 /** * Byte offset to UBool compressibleBytes[]. * Empty table if <256 bytes (padding only). * Otherwise 256 bytes or more (with padding). 
*/ IX_COMPRESSIBLE_BYTES_OFFSET, IX_RESERVED18_OFFSET, IX_TOTAL_SIZE }; static void read(const CollationTailoring *base, const uint8_t *inBytes, int32_t inLength, CollationTailoring &tailoring, UErrorCode &errorCode); static UBool U_CALLCONV isAcceptable(void *context, const char *type, const char *name, const UDataInfo *pInfo); private: CollationDataReader() = delete; // no constructor }; /* * Format of collation data (ucadata.icu, binary data in coll/ *.res files). * Format version 5. * * The root collation data is stored in the ucadata.icu file. * Tailorings are stored inside .res resource bundle files, with a complete file header. * * Collation data begins with a standard ICU data file header * (DataHeader, see ucmndata.h and unicode/udata.h). * The UDataInfo.dataVersion field contains the UCA and other version numbers, * see the comments for CollationTailoring.version. * * After the header, the file contains the following parts. * Constants are defined as enum values of the CollationDataReader class. * See also the Collation class. * * int32_t indexes[indexesLength]; * The indexes array has variable length. * Some tailorings only need the length and the options, * others only add reorderCodes and the reorderTable, * some need to store mappings. * Only as many indexes are stored as needed to read all of the data. * * Index 0: indexesLength * Index 1: numericPrimary, CollationFastLatin::VERSION, and options: see IX_OPTIONS * Index 2..3: Unused/reserved/0. * Index 4: Index into the ce32s array where the CE32s of the conjoining Jamo * are stored in a short, contiguous part of the ce32s array. * * Indexes 5..19 are byte offsets in ascending order. * Each byte offset marks the start of the next part in the data file, * and the end of the previous one. * When two consecutive byte offsets are the same (or too short), * then the corresponding part is empty. * Byte offsets are offsets from after the header, * that is, from the beginning of the indexes[]. 
* Each part starts at an offset with proper alignment for its data. * If necessary, the previous part may include padding bytes to achieve this alignment. * The last byte offset that is stored in the indexes indicates the total size of the data * (starting with the indexes). * * int32_t reorderCodes[]; -- empty in root * The list of script and reordering codes. * * Beginning with format version 5, this array may optionally * have trailing entries with a full list of reorder ranges * as described for CollationSettings::reorderRanges. * * Script or reorder codes are first and do not exceed 16-bit values. * Range limits are stored in the upper 16 bits, and are never 0. * Split this array into reorder codes and ranges at the first entry * with non-zero upper 16 bits. * * If the ranges are missing but needed for split-reordered primary lead bytes, * then they are regenerated at load time. * * uint8_t reorderTable[256]; -- empty in root; can be longer to include padding bytes * Primary-weight lead byte permutation table. * Normally present when the reorderCodes are, but can be built at load time. * * Beginning with format version 5, a 0 entry at a non-zero index * (which is otherwise an illegal value) * means that the primary lead byte is "split" * (there are different offsets for primaries that share that lead byte) * and the reordering offset must be determined via the reorder ranges * that are either stored as part of the reorderCodes array * or regenerated at load time. * * UTrie2 trie; -- see utrie2_impl.h and utrie2.h * The trie holds the main collation data. Each code point is mapped to a 32-bit value. * It encodes a simple collation element (CE) in compact form, unless bits 7..6 are both set, * in which case it is a special CE32 and contains a 4-bit tag and further data. * See the Collation class for details. 
* * The trie has a value for each lead surrogate code unit with some bits encoding * collective properties of the 1024 supplementary characters whose UTF-16 form starts with * the lead surrogate. See Collation::LEAD_SURROGATE_TAG.. * * int64_t ces[]; * 64-bit CEs and expansions that cannot be stored in a more compact form. * * uint32_t ce32s[]; * CE32s for expansions in compact form, and for characters whose trie values * contain special data. * * uint32_t rootElements[]; -- empty in all tailorings * Compact storage for all of the CEs that occur in the root collation. * See the CollationRootElements class. * * char16_t *contexts[]; * Serialized UCharsTrie structures with prefix (pre-context) and contraction mappings. * * uint16_t unsafeBackwardSet[]; -- see UnicodeSet::serialize() * Serialized form of characters that are unsafe when iterating backwards, * and at the end of an identical string prefix. * Back up to a safe character. * Lead surrogates are "unsafe" when any of their corresponding supplementary * code points are unsafe. * Does not include [:^lccc=0:][:^tccc=0:]. * For each tailoring, the root unsafeBackwardSet is subtracted. * (As a result, in many tailorings no set needs to be stored.) * * uint16_t fastLatinTable[]; * Optional optimization for Latin text. * See the CollationFastLatin class. * * uint16_t scripts[]; -- empty in all tailorings * Format version 5: * uint16_t numScripts; * uint16_t scriptsIndex[numScripts+16]; * uint16_t scriptStarts[]; * See CollationData::numScripts etc. * * Format version 4: * Table of the reordering groups with their first and last lead bytes, * and their script and reordering codes. * See CollationData::scripts. * * UBool compressibleBytes[]; -- empty in all tailorings * Flag for getSortKey(), indicating primary weight lead bytes that are compressible. * * ----------------- * Changes for formatVersion 5 (ICU 55) * * Reordering moves single scripts, not groups of scripts. 
* Reorder ranges are optionally appended to the reorderCodes, * and a 0 entry in the reorderTable indicates a split lead byte. * The scripts data has a new format. * * The rootElements may contain secondary and tertiary weights below common=05. * (Used for small Hiragana letters.) * Where is occurs, there is also an explicit unit with common secondary & tertiary weights. * There are no other data structure changes, but builder code needs to be able to handle such data. * * The collation element for the merge separator code point U+FFFE * does not necessarily have special, unique secondary/tertiary weights any more. */ U_NAMESPACE_END #endif // !UCONFIG_NO_COLLATION #endif // __COLLATIONDATAREADER_H__ stringi/src/icu74/i18n/unesctrn.h0000644000176200001440000000677214700200761016255 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ********************************************************************** * Copyright (c) 2001-2007, International Business Machines * Corporation and others. All Rights Reserved. ********************************************************************** * Date Name Description * 11/20/2001 aliu Creation. ********************************************************************** */ #ifndef UNESCTRN_H #define UNESCTRN_H #include "unicode/utypes.h" #if !UCONFIG_NO_TRANSLITERATION #include "unicode/translit.h" U_NAMESPACE_BEGIN /** * A transliterator that converts Unicode escape forms to the * characters they represent. Escape forms have a prefix, a suffix, a * radix, and minimum and maximum digit counts. * *

This class is package private. It registers several standard * variants with the system which are then accessed via their IDs. * * @author Alan Liu */ class UnescapeTransliterator : public Transliterator { private: /** * The encoded pattern specification. The pattern consists of * zero or more forms. Each form consists of a prefix, suffix, * radix, minimum digit count, and maximum digit count. These * values are stored as a five character header. That is, their * numeric values are cast to 16-bit characters and stored in the * string. Following these five characters, the prefix * characters, then suffix characters are stored. Each form thus * takes n+5 characters, where n is the total length of the prefix * and suffix. The end is marked by a header of length one * consisting of the character END. */ char16_t* spec; // owned; may not be nullptr public: /** * Registers standard variants with the system. Called by * Transliterator during initialization. */ static void registerIDs(); /** * Constructor. Takes the encoded spec array (does not adopt it). * @param ID the string identifier for this transliterator * @param spec the encoded spec array */ UnescapeTransliterator(const UnicodeString& ID, const char16_t *spec); /** * Copy constructor. */ UnescapeTransliterator(const UnescapeTransliterator&); /** * Destructor. */ virtual ~UnescapeTransliterator(); /** * Transliterator API. */ virtual UnescapeTransliterator* clone() const override; /** * ICU "poor man's RTTI", returns a UClassID for the actual class. */ virtual UClassID getDynamicClassID() const override; /** * ICU "poor man's RTTI", returns a UClassID for this class. */ U_I18N_API static UClassID U_EXPORT2 getStaticClassID(); protected: /** * Implements {@link Transliterator#handleTransliterate}. * @param text the buffer holding transliterated and * untransliterated text * @param offset the start and limit of the text, the position * of the cursor, and the start and limit of transliteration. 
* @param incremental if true, assume more text may be coming after * pos.contextLimit. Otherwise, assume the text is complete. */ virtual void handleTransliterate(Replaceable& text, UTransPosition& offset, UBool isIncremental) const override; }; U_NAMESPACE_END #endif /* #if !UCONFIG_NO_TRANSLITERATION */ #endif stringi/src/icu74/i18n/currpinf.cpp0000644000176200001440000003676714700200761016606 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ******************************************************************************* * Copyright (C) 2009-2014, International Business Machines Corporation and * others. All Rights Reserved. ******************************************************************************* */ #include "unicode/currpinf.h" #if !UCONFIG_NO_FORMATTING //#define CURRENCY_PLURAL_INFO_DEBUG 1 #ifdef CURRENCY_PLURAL_INFO_DEBUG #include #endif #include "unicode/locid.h" #include "unicode/plurrule.h" #include "unicode/strenum.h" #include "unicode/ures.h" #include "unicode/numsys.h" #include "cstring.h" #include "hash.h" #include "uresimp.h" #include "ureslocs.h" U_NAMESPACE_BEGIN static const char16_t gNumberPatternSeparator = 0x3B; // ; U_CDECL_BEGIN /** * @internal ICU 4.2 */ static UBool U_CALLCONV ValueComparator(UHashTok val1, UHashTok val2); UBool U_CALLCONV ValueComparator(UHashTok val1, UHashTok val2) { const UnicodeString* affix_1 = (UnicodeString*)val1.pointer; const UnicodeString* affix_2 = (UnicodeString*)val2.pointer; return *affix_1 == *affix_2; } U_CDECL_END UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CurrencyPluralInfo) static const char16_t gDefaultCurrencyPluralPattern[] = {'0', '.', '#', '#', ' ', 0xA4, 0xA4, 0xA4, 0}; static const char16_t gTripleCurrencySign[] = {0xA4, 0xA4, 0xA4, 0}; static const char16_t gPluralCountOther[] = {0x6F, 0x74, 0x68, 0x65, 0x72, 0}; static const char16_t gPart0[] = {0x7B, 0x30, 0x7D, 0}; static const char16_t gPart1[] = {0x7B, 0x31, 
0x7D, 0}; static const char gNumberElementsTag[]="NumberElements"; static const char gLatnTag[]="latn"; static const char gPatternsTag[]="patterns"; static const char gDecimalFormatTag[]="decimalFormat"; static const char gCurrUnitPtnTag[]="CurrencyUnitPatterns"; CurrencyPluralInfo::CurrencyPluralInfo(UErrorCode& status) : fPluralCountToCurrencyUnitPattern(nullptr), fPluralRules(nullptr), fLocale(nullptr), fInternalStatus(U_ZERO_ERROR) { initialize(Locale::getDefault(), status); } CurrencyPluralInfo::CurrencyPluralInfo(const Locale& locale, UErrorCode& status) : fPluralCountToCurrencyUnitPattern(nullptr), fPluralRules(nullptr), fLocale(nullptr), fInternalStatus(U_ZERO_ERROR) { initialize(locale, status); } CurrencyPluralInfo::CurrencyPluralInfo(const CurrencyPluralInfo& info) : UObject(info), fPluralCountToCurrencyUnitPattern(nullptr), fPluralRules(nullptr), fLocale(nullptr), fInternalStatus(U_ZERO_ERROR) { *this = info; } CurrencyPluralInfo& CurrencyPluralInfo::operator=(const CurrencyPluralInfo& info) { if (this == &info) { return *this; } fInternalStatus = info.fInternalStatus; if (U_FAILURE(fInternalStatus)) { // bail out early if the object we were copying from was already 'invalid'. return *this; } deleteHash(fPluralCountToCurrencyUnitPattern); fPluralCountToCurrencyUnitPattern = initHash(fInternalStatus); copyHash(info.fPluralCountToCurrencyUnitPattern, fPluralCountToCurrencyUnitPattern, fInternalStatus); if ( U_FAILURE(fInternalStatus) ) { return *this; } delete fPluralRules; fPluralRules = nullptr; delete fLocale; fLocale = nullptr; if (info.fPluralRules != nullptr) { fPluralRules = info.fPluralRules->clone(); if (fPluralRules == nullptr) { fInternalStatus = U_MEMORY_ALLOCATION_ERROR; return *this; } } if (info.fLocale != nullptr) { fLocale = info.fLocale->clone(); if (fLocale == nullptr) { // Note: If clone had an error parameter, then we could check/set that instead. 
fInternalStatus = U_MEMORY_ALLOCATION_ERROR; return *this; } // If the other locale wasn't bogus, but our clone'd locale is bogus, then OOM happened // during the call to clone(). if (!info.fLocale->isBogus() && fLocale->isBogus()) { fInternalStatus = U_MEMORY_ALLOCATION_ERROR; return *this; } } return *this; } CurrencyPluralInfo::~CurrencyPluralInfo() { deleteHash(fPluralCountToCurrencyUnitPattern); fPluralCountToCurrencyUnitPattern = nullptr; delete fPluralRules; delete fLocale; fPluralRules = nullptr; fLocale = nullptr; } bool CurrencyPluralInfo::operator==(const CurrencyPluralInfo& info) const { #ifdef CURRENCY_PLURAL_INFO_DEBUG if (*fPluralRules == *info.fPluralRules) { std::cout << "same plural rules\n"; } if (*fLocale == *info.fLocale) { std::cout << "same locale\n"; } if (fPluralCountToCurrencyUnitPattern->equals(*info.fPluralCountToCurrencyUnitPattern)) { std::cout << "same pattern\n"; } #endif return *fPluralRules == *info.fPluralRules && *fLocale == *info.fLocale && fPluralCountToCurrencyUnitPattern->equals(*info.fPluralCountToCurrencyUnitPattern); } CurrencyPluralInfo* CurrencyPluralInfo::clone() const { CurrencyPluralInfo* newObj = new CurrencyPluralInfo(*this); // Since clone doesn't have a 'status' parameter, the best we can do is return nullptr // if the new object was not full constructed properly (an error occurred). 
if (newObj != nullptr && U_FAILURE(newObj->fInternalStatus)) { delete newObj; newObj = nullptr; } return newObj; } const PluralRules* CurrencyPluralInfo::getPluralRules() const { return fPluralRules; } UnicodeString& CurrencyPluralInfo::getCurrencyPluralPattern(const UnicodeString& pluralCount, UnicodeString& result) const { const UnicodeString* currencyPluralPattern = (UnicodeString*)fPluralCountToCurrencyUnitPattern->get(pluralCount); if (currencyPluralPattern == nullptr) { // fall back to "other" if (pluralCount.compare(gPluralCountOther, 5)) { currencyPluralPattern = (UnicodeString*)fPluralCountToCurrencyUnitPattern->get(UnicodeString(true, gPluralCountOther, 5)); } if (currencyPluralPattern == nullptr) { // no currencyUnitPatterns defined, // fallback to predefined default. // This should never happen when ICU resource files are // available, since currencyUnitPattern of "other" is always // defined in root. result = UnicodeString(gDefaultCurrencyPluralPattern); return result; } } result = *currencyPluralPattern; return result; } const Locale& CurrencyPluralInfo::getLocale() const { return *fLocale; } void CurrencyPluralInfo::setPluralRules(const UnicodeString& ruleDescription, UErrorCode& status) { if (U_SUCCESS(status)) { delete fPluralRules; fPluralRules = PluralRules::createRules(ruleDescription, status); } } void CurrencyPluralInfo::setCurrencyPluralPattern(const UnicodeString& pluralCount, const UnicodeString& pattern, UErrorCode& status) { if (U_SUCCESS(status)) { UnicodeString* oldValue = static_cast( fPluralCountToCurrencyUnitPattern->get(pluralCount)); delete oldValue; LocalPointer p(new UnicodeString(pattern), status); if (U_SUCCESS(status)) { // the p object allocated above will be owned by fPluralCountToCurrencyUnitPattern // after the call to put(), even if the method returns failure. 
fPluralCountToCurrencyUnitPattern->put(pluralCount, p.orphan(), status); } } } void CurrencyPluralInfo::setLocale(const Locale& loc, UErrorCode& status) { initialize(loc, status); } void CurrencyPluralInfo::initialize(const Locale& loc, UErrorCode& status) { if (U_FAILURE(status)) { return; } delete fLocale; fLocale = nullptr; delete fPluralRules; fPluralRules = nullptr; fLocale = loc.clone(); if (fLocale == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; return; } // If the locale passed in wasn't bogus, but our clone'd locale is bogus, then OOM happened // during the call to loc.clone(). if (!loc.isBogus() && fLocale->isBogus()) { status = U_MEMORY_ALLOCATION_ERROR; return; } fPluralRules = PluralRules::forLocale(loc, status); setupCurrencyPluralPattern(loc, status); } void CurrencyPluralInfo::setupCurrencyPluralPattern(const Locale& loc, UErrorCode& status) { if (U_FAILURE(status)) { return; } deleteHash(fPluralCountToCurrencyUnitPattern); fPluralCountToCurrencyUnitPattern = initHash(status); if (U_FAILURE(status)) { return; } LocalPointer ns(NumberingSystem::createInstance(loc, status), status); if (U_FAILURE(status)) { return; } UErrorCode ec = U_ZERO_ERROR; LocalUResourceBundlePointer rb(ures_open(nullptr, loc.getName(), &ec)); LocalUResourceBundlePointer numElements(ures_getByKeyWithFallback(rb.getAlias(), gNumberElementsTag, nullptr, &ec)); ures_getByKeyWithFallback(numElements.getAlias(), ns->getName(), rb.getAlias(), &ec); ures_getByKeyWithFallback(rb.getAlias(), gPatternsTag, rb.getAlias(), &ec); int32_t ptnLen; const char16_t* numberStylePattern = ures_getStringByKeyWithFallback(rb.getAlias(), gDecimalFormatTag, &ptnLen, &ec); // Fall back to "latn" if num sys specific pattern isn't there. 
if ( ec == U_MISSING_RESOURCE_ERROR && (uprv_strcmp(ns->getName(), gLatnTag) != 0)) { ec = U_ZERO_ERROR; ures_getByKeyWithFallback(numElements.getAlias(), gLatnTag, rb.getAlias(), &ec); ures_getByKeyWithFallback(rb.getAlias(), gPatternsTag, rb.getAlias(), &ec); numberStylePattern = ures_getStringByKeyWithFallback(rb.getAlias(), gDecimalFormatTag, &ptnLen, &ec); } int32_t numberStylePatternLen = ptnLen; const char16_t* negNumberStylePattern = nullptr; int32_t negNumberStylePatternLen = 0; // TODO: Java // parse to check whether there is ";" separator in the numberStylePattern UBool hasSeparator = false; if (U_SUCCESS(ec)) { for (int32_t styleCharIndex = 0; styleCharIndex < ptnLen; ++styleCharIndex) { if (numberStylePattern[styleCharIndex] == gNumberPatternSeparator) { hasSeparator = true; // split the number style pattern into positive and negative negNumberStylePattern = numberStylePattern + styleCharIndex + 1; negNumberStylePatternLen = ptnLen - styleCharIndex - 1; numberStylePatternLen = styleCharIndex; } } } if (U_FAILURE(ec)) { // If OOM occurred during the above code, then we want to report that back to the caller. 
if (ec == U_MEMORY_ALLOCATION_ERROR) { status = ec; } return; } LocalUResourceBundlePointer currRb(ures_open(U_ICUDATA_CURR, loc.getName(), &ec)); LocalUResourceBundlePointer currencyRes(ures_getByKeyWithFallback(currRb.getAlias(), gCurrUnitPtnTag, nullptr, &ec)); #ifdef CURRENCY_PLURAL_INFO_DEBUG std::cout << "in set up\n"; #endif LocalPointer keywords(fPluralRules->getKeywords(ec), ec); if (U_SUCCESS(ec)) { const char* pluralCount; while (((pluralCount = keywords->next(nullptr, ec)) != nullptr) && U_SUCCESS(ec)) { int32_t ptnLength; UErrorCode err = U_ZERO_ERROR; const char16_t* patternChars = ures_getStringByKeyWithFallback(currencyRes.getAlias(), pluralCount, &ptnLength, &err); if (err == U_MEMORY_ALLOCATION_ERROR || patternChars == nullptr) { ec = err; break; } if (U_SUCCESS(err) && ptnLength > 0) { UnicodeString* pattern = new UnicodeString(patternChars, ptnLength); if (pattern == nullptr) { ec = U_MEMORY_ALLOCATION_ERROR; break; } #ifdef CURRENCY_PLURAL_INFO_DEBUG char result_1[1000]; pattern->extract(0, pattern->length(), result_1, "UTF-8"); std::cout << "pluralCount: " << pluralCount << "; pattern: " << result_1 << "\n"; #endif pattern->findAndReplace(UnicodeString(true, gPart0, 3), UnicodeString(numberStylePattern, numberStylePatternLen)); pattern->findAndReplace(UnicodeString(true, gPart1, 3), UnicodeString(true, gTripleCurrencySign, 3)); if (hasSeparator) { UnicodeString negPattern(patternChars, ptnLength); negPattern.findAndReplace(UnicodeString(true, gPart0, 3), UnicodeString(negNumberStylePattern, negNumberStylePatternLen)); negPattern.findAndReplace(UnicodeString(true, gPart1, 3), UnicodeString(true, gTripleCurrencySign, 3)); pattern->append(gNumberPatternSeparator); pattern->append(negPattern); } #ifdef CURRENCY_PLURAL_INFO_DEBUG pattern->extract(0, pattern->length(), result_1, "UTF-8"); std::cout << "pluralCount: " << pluralCount << "; pattern: " << result_1 << "\n"; #endif // the 'pattern' object allocated above will be owned by the 
fPluralCountToCurrencyUnitPattern after the call to // put(), even if the method returns failure. fPluralCountToCurrencyUnitPattern->put(UnicodeString(pluralCount, -1, US_INV), pattern, status); } } } // If OOM occurred during the above code, then we want to report that back to the caller. if (ec == U_MEMORY_ALLOCATION_ERROR) { status = ec; } } void CurrencyPluralInfo::deleteHash(Hashtable* hTable) { if ( hTable == nullptr ) { return; } int32_t pos = UHASH_FIRST; const UHashElement* element = nullptr; while ( (element = hTable->nextElement(pos)) != nullptr ) { const UHashTok valueTok = element->value; const UnicodeString* value = (UnicodeString*)valueTok.pointer; delete value; } delete hTable; hTable = nullptr; } Hashtable* CurrencyPluralInfo::initHash(UErrorCode& status) { if (U_FAILURE(status)) { return nullptr; } LocalPointer hTable(new Hashtable(true, status), status); if (U_FAILURE(status)) { return nullptr; } hTable->setValueComparator(ValueComparator); return hTable.orphan(); } void CurrencyPluralInfo::copyHash(const Hashtable* source, Hashtable* target, UErrorCode& status) { if (U_FAILURE(status)) { return; } int32_t pos = UHASH_FIRST; const UHashElement* element = nullptr; if (source) { while ( (element = source->nextElement(pos)) != nullptr ) { const UHashTok keyTok = element->key; const UnicodeString* key = (UnicodeString*)keyTok.pointer; const UHashTok valueTok = element->value; const UnicodeString* value = (UnicodeString*)valueTok.pointer; LocalPointer copy(new UnicodeString(*value), status); if (U_FAILURE(status)) { return; } // The HashTable owns the 'copy' object after the call to put(). target->put(UnicodeString(*key), copy.orphan(), status); if (U_FAILURE(status)) { return; } } } } U_NAMESPACE_END #endif stringi/src/icu74/i18n/astro.h0000644000176200001440000005526714700200761015547 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others. 
// License & terms of use: http://www.unicode.org/copyright.html /************************************************************************ * Copyright (C) 1996-2008, International Business Machines Corporation * * and others. All Rights Reserved. * ************************************************************************ * 2003-nov-07 srl Port from Java */ #ifndef ASTRO_H #define ASTRO_H #include "unicode/utypes.h" #if !UCONFIG_NO_FORMATTING #include "gregoimp.h" // for Math #include "unicode/unistr.h" U_NAMESPACE_BEGIN /** * CalendarAstronomer is a class that can perform the calculations to * determine the positions of the sun and moon, the time of sunrise and * sunset, and other astronomy-related data. The calculations it performs * are in some cases quite complicated, and this utility class saves you * the trouble of worrying about them. *

* The measurement of time is a very important part of astronomy. Because * astronomical bodies are constantly in motion, observations are only valid * at a given moment in time. Accordingly, each CalendarAstronomer * object has a time property that determines the date * and time for which its calculations are performed. You can set and * retrieve this property with {@link #setDate setDate}, {@link #getDate getDate} * and related methods. *

* Almost all of the calculations performed by this class, or by any * astronomer, are approximations to various degrees of accuracy. The * calculations in this class are mostly modelled after those described * in the book * * Practical Astronomy With Your Calculator, by Peter J. * Duffett-Smith, Cambridge University Press, 1990. This is an excellent * book, and if you want a greater understanding of how these calculations * are performed it a very good, readable starting point. *

* WARNING: This class is very early in its development, and * it is highly likely that its API will change to some degree in the future. * At the moment, it basically does just enough to support {@link IslamicCalendar} * and {@link ChineseCalendar}. * * @author Laura Werner * @author Alan Liu * @internal */ class U_I18N_API CalendarAstronomer : public UMemory { public: // some classes public: /** * Represents the position of an object in the sky relative to the ecliptic, * the plane of the earth's orbit around the Sun. * This is a spherical coordinate system in which the latitude * specifies the position north or south of the plane of the ecliptic. * The longitude specifies the position along the ecliptic plane * relative to the "First Point of Aries", which is the Sun's position in the sky * at the Vernal Equinox. *

* Note that Ecliptic objects are immutable and cannot be modified * once they are constructed. This allows them to be passed and returned by * value without worrying about whether other code will modify them. * * @see CalendarAstronomer.Equatorial * @see CalendarAstronomer.Horizon * @internal */ class U_I18N_API Ecliptic : public UMemory { public: /** * Constructs an Ecliptic coordinate object. *

* @param lat The ecliptic latitude, measured in radians. * @param lon The ecliptic longitude, measured in radians. * @internal */ Ecliptic(double lat = 0, double lon = 0) { latitude = lat; longitude = lon; } /** * Setter for Ecliptic Coordinate object * @param lat The ecliptic latitude, measured in radians. * @param lon The ecliptic longitude, measured in radians. * @internal */ void set(double lat, double lon) { latitude = lat; longitude = lon; } /** * Return a string representation of this object * @internal */ UnicodeString toString() const; /** * The ecliptic latitude, in radians. This specifies an object's * position north or south of the plane of the ecliptic, * with positive angles representing north. * @internal */ double latitude; /** * The ecliptic longitude, in radians. * This specifies an object's position along the ecliptic plane * relative to the "First Point of Aries", which is the Sun's position * in the sky at the Vernal Equinox, * with positive angles representing east. *

* A bit of trivia: the first point of Aries is currently in the * constellation Pisces, due to the precession of the earth's axis. * @internal */ double longitude; }; /** * Represents the position of an * object in the sky relative to the plane of the earth's equator. * The Right Ascension specifies the position east or west * along the equator, relative to the sun's position at the vernal * equinox. The Declination is the position north or south * of the equatorial plane. *

* Note that Equatorial objects are immutable and cannot be modified * once they are constructed. This allows them to be passed and returned by * value without worrying about whether other code will modify them. * * @see CalendarAstronomer.Ecliptic * @see CalendarAstronomer.Horizon * @internal */ class U_I18N_API Equatorial : public UMemory { public: /** * Constructs an Equatorial coordinate object. *

* @param asc The right ascension, measured in radians. * @param dec The declination, measured in radians. * @internal */ Equatorial(double asc = 0, double dec = 0) : ascension(asc), declination(dec) { } /** * Setter * @param asc The right ascension, measured in radians. * @param dec The declination, measured in radians. * @internal */ void set(double asc, double dec) { ascension = asc; declination = dec; } /** * Return a string representation of this object, with the * angles measured in degrees. * @internal */ UnicodeString toString() const; /** * Return a string representation of this object with the right ascension * measured in hours, minutes, and seconds. * @internal */ //String toHmsString() { //return radToHms(ascension) + "," + radToDms(declination); //} /** * The right ascension, in radians. * This is the position east or west along the equator * relative to the sun's position at the vernal equinox, * with positive angles representing East. * @internal */ double ascension; /** * The declination, in radians. * This is the position north or south of the equatorial plane, * with positive angles representing north. * @internal */ double declination; }; /** * Represents the position of an object in the sky relative to * the local horizon. * The Altitude represents the object's elevation above the horizon, * with objects below the horizon having a negative altitude. * The Azimuth is the geographic direction of the object from the * observer's position, with 0 representing north. The azimuth increases * clockwise from north. *

* Note that Horizon objects are immutable and cannot be modified * once they are constructed. This allows them to be passed and returned by * value without worrying about whether other code will modify them. * * @see CalendarAstronomer.Ecliptic * @see CalendarAstronomer.Equatorial * @internal */ class U_I18N_API Horizon : public UMemory { public: /** * Constructs a Horizon coordinate object. *

* @param alt The altitude, measured in radians above the horizon. * @param azim The azimuth, measured in radians clockwise from north. * @internal */ Horizon(double alt=0, double azim=0) : altitude(alt), azimuth(azim) { } /** * Setter for Ecliptic Coordinate object * @param alt The altitude, measured in radians above the horizon. * @param azim The azimuth, measured in radians clockwise from north. * @internal */ void set(double alt, double azim) { altitude = alt; azimuth = azim; } /** * Return a string representation of this object, with the * angles measured in degrees. * @internal */ UnicodeString toString() const; /** * The object's altitude above the horizon, in radians. * @internal */ double altitude; /** * The object's direction, in radians clockwise from north. * @internal */ double azimuth; }; public: //------------------------------------------------------------------------- // Assorted private data used for conversions //------------------------------------------------------------------------- // My own copies of these so compilers are more likely to optimize them away static const double PI; /** * The average number of solar days from one new moon to the next. This is the time * it takes for the moon to return the same ecliptic longitude as the sun. * It is longer than the sidereal month because the sun's longitude increases * during the year due to the revolution of the earth around the sun. * Approximately 29.53. * * @see #SIDEREAL_MONTH * @internal * @deprecated ICU 2.4. This class may be removed or modified. */ static const double SYNODIC_MONTH; //------------------------------------------------------------------------- // Constructors //------------------------------------------------------------------------- /** * Construct a new CalendarAstronomer object that is initialized to * the current date and time. * @internal */ CalendarAstronomer(); /** * Construct a new CalendarAstronomer object that is initialized to * the specified date and time. 
* @internal */ CalendarAstronomer(UDate d); /** * Construct a new CalendarAstronomer object with the given * latitude and longitude. The object's time is set to the current * date and time. *

* @param longitude The desired longitude, in degrees east of * the Greenwich meridian. * * @param latitude The desired latitude, in degrees. Positive * values signify North, negative South. * * @see java.util.Date#getTime() * @internal */ CalendarAstronomer(double longitude, double latitude); /** * Destructor * @internal */ ~CalendarAstronomer(); //------------------------------------------------------------------------- // Time and date getters and setters //------------------------------------------------------------------------- /** * Set the current date and time of this CalendarAstronomer object. All * astronomical calculations are performed based on this time setting. * * @param aTime the date and time, expressed as the number of milliseconds since * 1/1/1970 0:00 GMT (Gregorian). * * @see #setDate * @see #getTime * @internal */ void setTime(UDate aTime); /** * Set the current date and time of this CalendarAstronomer object. All * astronomical calculations are performed based on this time setting. * * @param aTime the date and time, expressed as the number of milliseconds since * 1/1/1970 0:00 GMT (Gregorian). * * @see #getTime * @internal */ void setDate(UDate aDate) { setTime(aDate); } /** * Set the current date and time of this CalendarAstronomer object. All * astronomical calculations are performed based on this time setting. * * @param jdn the desired time, expressed as a "julian day number", * which is the number of elapsed days since * 1/1/4713 BC (Julian), 12:00 GMT. Note that julian day * numbers start at noon. To get the jdn for * the corresponding midnight, subtract 0.5. * * @see #getJulianDay * @see #JULIAN_EPOCH_MS * @internal */ void setJulianDay(double jdn); /** * Get the current time of this CalendarAstronomer object, * represented as the number of milliseconds since * 1/1/1970 AD 0:00 GMT (Gregorian). 
* * @see #setTime * @see #getDate * @internal */ UDate getTime(); /** * Get the current time of this CalendarAstronomer object, * expressed as a "julian day number", which is the number of elapsed * days since 1/1/4713 BC (Julian), 12:00 GMT. * * @see #setJulianDay * @see #JULIAN_EPOCH_MS * @internal */ double getJulianDay(); /** * Return this object's time expressed in julian centuries: * the number of centuries after 1/1/1900 AD, 12:00 GMT * * @see #getJulianDay * @internal */ double getJulianCentury(); /** * Returns the current Greenwich sidereal time, measured in hours * @internal */ double getGreenwichSidereal(); private: double getSiderealOffset(); public: /** * Returns the current local sidereal time, measured in hours * @internal */ double getLocalSidereal(); /** * Converts local sidereal time to Universal Time. * * @param lst The Local Sidereal Time, in hours since sidereal midnight * on this object's current date. * * @return The corresponding Universal Time, in milliseconds since * 1 Jan 1970, GMT. */ //private: double lstToUT(double lst); /** * * Convert from ecliptic to equatorial coordinates. * * @param ecliptic The ecliptic * @param result Fillin result * @return reference to result */ Equatorial& eclipticToEquatorial(Equatorial& result, const Ecliptic& ecliptic); /** * Convert from ecliptic to equatorial coordinates. * * @param eclipLong The ecliptic longitude * @param eclipLat The ecliptic latitude * * @return The corresponding point in equatorial coordinates. * @internal */ Equatorial& eclipticToEquatorial(Equatorial& result, double eclipLong, double eclipLat); /** * Convert from ecliptic longitude to equatorial coordinates. * * @param eclipLong The ecliptic longitude * * @return The corresponding point in equatorial coordinates. 
* @internal */ Equatorial& eclipticToEquatorial(Equatorial& result, double eclipLong) ; /** * @internal */ Horizon& eclipticToHorizon(Horizon& result, double eclipLong) ; //------------------------------------------------------------------------- // The Sun //------------------------------------------------------------------------- /** * The longitude of the sun at the time specified by this object. * The longitude is measured in radians along the ecliptic * from the "first point of Aries," the point at which the ecliptic * crosses the earth's equatorial plane at the vernal equinox. *

* Currently, this method uses an approximation of the two-body Kepler's * equation for the earth and the sun. It does not take into account the * perturbations caused by the other planets, the moon, etc. * @internal */ double getSunLongitude(); /** * TODO Make this public when the entire class is package-private. */ /*public*/ void getSunLongitude(double julianDay, double &longitude, double &meanAnomaly); /** * The position of the sun at this object's current date and time, * in equatorial coordinates. * @param result fillin for the result * @internal */ Equatorial& getSunPosition(Equatorial& result); public: /** * Constant representing the vernal equinox. * For use with {@link #getSunTime getSunTime}. * Note: In this case, "vernal" refers to the northern hemisphere's seasons. * @internal */ // static double VERNAL_EQUINOX(); /** * Constant representing the summer solstice. * For use with {@link #getSunTime getSunTime}. * Note: In this case, "summer" refers to the northern hemisphere's seasons. * @internal */ static double SUMMER_SOLSTICE(); /** * Constant representing the autumnal equinox. * For use with {@link #getSunTime getSunTime}. * Note: In this case, "autumn" refers to the northern hemisphere's seasons. * @internal */ // static double AUTUMN_EQUINOX(); /** * Constant representing the winter solstice. * For use with {@link #getSunTime getSunTime}. * Note: In this case, "winter" refers to the northern hemisphere's seasons. * @internal */ static double WINTER_SOLSTICE(); /** * Find the next time at which the sun's ecliptic longitude will have * the desired value. * @internal */ UDate getSunTime(double desired, UBool next); /** * Returns the time (GMT) of sunrise or sunset on the local date to which * this calendar is currently set. * * NOTE: This method only works well if this object is set to a * time near local noon. 
Because of variations between the local * official time zone and the geographic longitude, the * computation can flop over into an adjacent day if this object * is set to a time near local midnight. * * @internal */ UDate getSunRiseSet(UBool rise); //------------------------------------------------------------------------- // The Moon //------------------------------------------------------------------------- /** * The position of the moon at the time set on this * object, in equatorial coordinates. * @internal * @return const reference to internal field of calendar astronomer. Do not use outside of the lifetime of this astronomer. */ const Equatorial& getMoonPosition(); /** * The "age" of the moon at the time specified in this object. * This is really the angle between the * current ecliptic longitudes of the sun and the moon, * measured in radians. * * @see #getMoonPhase * @internal */ double getMoonAge(); /** * Calculate the phase of the moon at the time set in this object. * The returned phase is a double in the range * 0 <= phase < 1, interpreted as follows: *

    *
  • 0.00: New moon *
  • 0.25: First quarter *
  • 0.50: Full moon *
  • 0.75: Last quarter *
* * @see #getMoonAge * @internal */ double getMoonPhase(); class U_I18N_API MoonAge : public UMemory { public: MoonAge(double l) : value(l) { } void set(double l) { value = l; } double value; }; /** * Constant representing a new moon. * For use with {@link #getMoonTime getMoonTime} * @internal */ static const MoonAge NEW_MOON(); /** * Constant representing the moon's first quarter. * For use with {@link #getMoonTime getMoonTime} * @internal */ // static const MoonAge FIRST_QUARTER(); /** * Constant representing a full moon. * For use with {@link #getMoonTime getMoonTime} * @internal */ static const MoonAge FULL_MOON(); /** * Constant representing the moon's last quarter. * For use with {@link #getMoonTime getMoonTime} * @internal */ // static const MoonAge LAST_QUARTER(); /** * Find the next or previous time at which the Moon's ecliptic * longitude will have the desired value. *

* @param desired The desired longitude. * @param next true if the next occurrence of the phase * is desired, false for the previous occurrence. * @internal */ UDate getMoonTime(double desired, UBool next); UDate getMoonTime(const MoonAge& desired, UBool next); /** * Returns the time (GMT) of sunrise or sunset on the local date to which * this calendar is currently set. * @internal */ UDate getMoonRiseSet(UBool rise); //------------------------------------------------------------------------- // Interpolation methods for finding the time at which a given event occurs //------------------------------------------------------------------------- // private class AngleFunc : public UMemory { public: virtual double eval(CalendarAstronomer&) = 0; virtual ~AngleFunc(); }; friend class AngleFunc; UDate timeOfAngle(AngleFunc& func, double desired, double periodDays, double epsilon, UBool next); class CoordFunc : public UMemory { public: virtual void eval(Equatorial& result, CalendarAstronomer&) = 0; virtual ~CoordFunc(); }; friend class CoordFunc; double riseOrSet(CoordFunc& func, UBool rise, double diameter, double refraction, double epsilon); //------------------------------------------------------------------------- // Other utility methods //------------------------------------------------------------------------- private: /** * Return the obliquity of the ecliptic (the angle between the ecliptic * and the earth's equator) at the current time. This varies due to * the precession of the earth's axis. * * @return the obliquity of the ecliptic relative to the equator, * measured in radians. 
*/ double eclipticObliquity(); //------------------------------------------------------------------------- // Private data //------------------------------------------------------------------------- private: /** * Current time in milliseconds since 1/1/1970 AD * @see java.util.Date#getTime */ UDate fTime; /* These aren't used yet, but they'll be needed for sunset calculations * and equatorial to horizon coordinate conversions */ double fLongitude; double fLatitude; double fGmtOffset; // // The following fields are used to cache calculated results for improved // performance. These values all depend on the current time setting // of this object, so the clearCache method is provided. // double julianDay; double julianCentury; double sunLongitude; double meanAnomalySun; double moonLongitude; double moonEclipLong; double meanAnomalyMoon; double eclipObliquity; double siderealT0; double siderealTime; void clearCache(); Equatorial moonPosition; UBool moonPositionSet; /** * @internal */ // UDate local(UDate localMillis); }; U_NAMESPACE_END struct UHashtable; U_NAMESPACE_BEGIN /** * Cache of month -> julian day * @internal */ class CalendarCache : public UMemory { public: static int32_t get(CalendarCache** cache, int32_t key, UErrorCode &status); static void put(CalendarCache** cache, int32_t key, int32_t value, UErrorCode &status); virtual ~CalendarCache(); private: CalendarCache(int32_t size, UErrorCode& status); static void createCache(CalendarCache** cache, UErrorCode& status); /** * not implemented */ CalendarCache(); UHashtable *fTable; }; U_NAMESPACE_END #endif #endif stringi/src/icu74/i18n/dtfmtsym.cpp0000644000176200001440000032347614700200761016621 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ******************************************************************************* * Copyright (C) 1997-2016, International Business Machines Corporation and * * others. All Rights Reserved. 
* ******************************************************************************* * * File DTFMTSYM.CPP * * Modification History: * * Date Name Description * 02/19/97 aliu Converted from java. * 07/21/98 stephen Added getZoneIndex * Changed weekdays/short weekdays to be one-based * 06/14/99 stephen Removed SimpleDateFormat::fgTimeZoneDataSuffix * 11/16/99 weiv Added 'Y' and 'e' to fgPatternChars * 03/27/00 weiv Keeping resource bundle around! * 06/30/05 emmons Added eraNames, narrow month/day, standalone context * 10/12/05 emmons Added setters for eraNames, month/day by width/context ******************************************************************************* */ #include #include "unicode/utypes.h" #if !UCONFIG_NO_FORMATTING #include "unicode/ustring.h" #include "unicode/localpointer.h" #include "unicode/dtfmtsym.h" #include "unicode/smpdtfmt.h" #include "unicode/msgfmt.h" #include "unicode/numsys.h" #include "unicode/tznames.h" #include "cpputils.h" #include "umutex.h" #include "cmemory.h" #include "cstring.h" #include "charstr.h" #include "dt_impl.h" #include "locbased.h" #include "gregoimp.h" #include "hash.h" #include "uassert.h" #include "uresimp.h" #include "ureslocs.h" #include "uvector.h" #include "shareddateformatsymbols.h" #include "unicode/calendar.h" #include "unifiedcache.h" // ***************************************************************************** // class DateFormatSymbols // ***************************************************************************** /** * These are static arrays we use only in the case where we have no * resource data. */ #if UDAT_HAS_PATTERN_CHAR_FOR_TIME_SEPARATOR #define PATTERN_CHARS_LEN 38 #else #define PATTERN_CHARS_LEN 37 #endif /** * Unlocalized date-time pattern characters. For example: 'y', 'd', etc. All * locales use the same these unlocalized pattern characters. 
*/ static const char16_t gPatternChars[] = { // if UDAT_HAS_PATTERN_CHAR_FOR_TIME_SEPARATOR: // GyMdkHmsSEDFwWahKzYeugAZvcLQqVUOXxrbB: // else: // GyMdkHmsSEDFwWahKzYeugAZvcLQqVUOXxrbB 0x47, 0x79, 0x4D, 0x64, 0x6B, 0x48, 0x6D, 0x73, 0x53, 0x45, 0x44, 0x46, 0x77, 0x57, 0x61, 0x68, 0x4B, 0x7A, 0x59, 0x65, 0x75, 0x67, 0x41, 0x5A, 0x76, 0x63, 0x4c, 0x51, 0x71, 0x56, 0x55, 0x4F, 0x58, 0x78, 0x72, 0x62, 0x42, #if UDAT_HAS_PATTERN_CHAR_FOR_TIME_SEPARATOR 0x3a, #endif 0 }; //------------------------------------------------------ // Strings of last resort. These are only used if we have no resource // files. They aren't designed for actual use, just for backup. // These are the month names and abbreviations of last resort. static const char16_t gLastResortMonthNames[13][3] = { {0x0030, 0x0031, 0x0000}, /* "01" */ {0x0030, 0x0032, 0x0000}, /* "02" */ {0x0030, 0x0033, 0x0000}, /* "03" */ {0x0030, 0x0034, 0x0000}, /* "04" */ {0x0030, 0x0035, 0x0000}, /* "05" */ {0x0030, 0x0036, 0x0000}, /* "06" */ {0x0030, 0x0037, 0x0000}, /* "07" */ {0x0030, 0x0038, 0x0000}, /* "08" */ {0x0030, 0x0039, 0x0000}, /* "09" */ {0x0031, 0x0030, 0x0000}, /* "10" */ {0x0031, 0x0031, 0x0000}, /* "11" */ {0x0031, 0x0032, 0x0000}, /* "12" */ {0x0031, 0x0033, 0x0000} /* "13" */ }; // These are the weekday names and abbreviations of last resort. static const char16_t gLastResortDayNames[8][2] = { {0x0030, 0x0000}, /* "0" */ {0x0031, 0x0000}, /* "1" */ {0x0032, 0x0000}, /* "2" */ {0x0033, 0x0000}, /* "3" */ {0x0034, 0x0000}, /* "4" */ {0x0035, 0x0000}, /* "5" */ {0x0036, 0x0000}, /* "6" */ {0x0037, 0x0000} /* "7" */ }; // These are the quarter names and abbreviations of last resort. static const char16_t gLastResortQuarters[4][2] = { {0x0031, 0x0000}, /* "1" */ {0x0032, 0x0000}, /* "2" */ {0x0033, 0x0000}, /* "3" */ {0x0034, 0x0000}, /* "4" */ }; // These are the am/pm and BC/AD markers of last resort. 
static const char16_t gLastResortAmPmMarkers[2][3] = { {0x0041, 0x004D, 0x0000}, /* "AM" */ {0x0050, 0x004D, 0x0000} /* "PM" */ }; static const char16_t gLastResortEras[2][3] = { {0x0042, 0x0043, 0x0000}, /* "BC" */ {0x0041, 0x0044, 0x0000} /* "AD" */ }; /* Sizes for the last resort string arrays */ typedef enum LastResortSize { kMonthNum = 13, kMonthLen = 3, kDayNum = 8, kDayLen = 2, kAmPmNum = 2, kAmPmLen = 3, kQuarterNum = 4, kQuarterLen = 2, kEraNum = 2, kEraLen = 3, kZoneNum = 5, kZoneLen = 4, kGmtHourNum = 4, kGmtHourLen = 10 } LastResortSize; U_NAMESPACE_BEGIN SharedDateFormatSymbols::~SharedDateFormatSymbols() { } template<> U_I18N_API const SharedDateFormatSymbols * LocaleCacheKey::createObject( const void * /*unusedContext*/, UErrorCode &status) const { char type[256]; Calendar::getCalendarTypeFromLocale(fLoc, type, UPRV_LENGTHOF(type), status); if (U_FAILURE(status)) { return nullptr; } SharedDateFormatSymbols *shared = new SharedDateFormatSymbols(fLoc, type, status); if (shared == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; return nullptr; } if (U_FAILURE(status)) { delete shared; return nullptr; } shared->addRef(); return shared; } UOBJECT_DEFINE_RTTI_IMPLEMENTATION(DateFormatSymbols) #define kSUPPLEMENTAL "supplementalData" /** * These are the tags we expect to see in normal resource bundle files associated * with a locale and calendar */ static const char gCalendarTag[]="calendar"; static const char gGregorianTag[]="gregorian"; static const char gErasTag[]="eras"; static const char gCyclicNameSetsTag[]="cyclicNameSets"; static const char gNameSetYearsTag[]="years"; static const char gNameSetZodiacsTag[]="zodiacs"; static const char gMonthNamesTag[]="monthNames"; static const char gMonthPatternsTag[]="monthPatterns"; static const char gDayNamesTag[]="dayNames"; static const char gNamesWideTag[]="wide"; static const char gNamesAbbrTag[]="abbreviated"; static const char gNamesShortTag[]="short"; static const char gNamesNarrowTag[]="narrow"; static 
const char gNamesAllTag[]="all"; static const char gNamesFormatTag[]="format"; static const char gNamesStandaloneTag[]="stand-alone"; static const char gNamesNumericTag[]="numeric"; static const char gAmPmMarkersTag[]="AmPmMarkers"; static const char gAmPmMarkersAbbrTag[]="AmPmMarkersAbbr"; static const char gAmPmMarkersNarrowTag[]="AmPmMarkersNarrow"; static const char gQuartersTag[]="quarters"; static const char gNumberElementsTag[]="NumberElements"; static const char gSymbolsTag[]="symbols"; static const char gTimeSeparatorTag[]="timeSeparator"; static const char gDayPeriodTag[]="dayPeriod"; // static const char gZoneStringsTag[]="zoneStrings"; // static const char gLocalPatternCharsTag[]="localPatternChars"; static const char gContextTransformsTag[]="contextTransforms"; /** * Jitterbug 2974: MSVC has a bug whereby new X[0] behaves badly. * Work around this. */ static inline UnicodeString* newUnicodeStringArray(size_t count) { return new UnicodeString[count ? count : 1]; } //------------------------------------------------------ DateFormatSymbols * U_EXPORT2 DateFormatSymbols::createForLocale( const Locale& locale, UErrorCode &status) { const SharedDateFormatSymbols *shared = nullptr; UnifiedCache::getByLocale(locale, shared, status); if (U_FAILURE(status)) { return nullptr; } DateFormatSymbols *result = new DateFormatSymbols(shared->get()); shared->removeRef(); if (result == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; return nullptr; } return result; } DateFormatSymbols::DateFormatSymbols(const Locale& locale, UErrorCode& status) : UObject() { initializeData(locale, nullptr, status); } DateFormatSymbols::DateFormatSymbols(UErrorCode& status) : UObject() { initializeData(Locale::getDefault(), nullptr, status, true); } DateFormatSymbols::DateFormatSymbols(const Locale& locale, const char *type, UErrorCode& status) : UObject() { initializeData(locale, type, status); } DateFormatSymbols::DateFormatSymbols(const char *type, UErrorCode& status) : UObject() { 
initializeData(Locale::getDefault(), type, status, true); } DateFormatSymbols::DateFormatSymbols(const DateFormatSymbols& other) : UObject(other) { copyData(other); } void DateFormatSymbols::assignArray(UnicodeString*& dstArray, int32_t& dstCount, const UnicodeString* srcArray, int32_t srcCount) { // assignArray() is only called by copyData() and initializeData(), which in turn // implements the copy constructor and the assignment operator. // All strings in a DateFormatSymbols object are created in one of the following // three ways that all allow to safely use UnicodeString::fastCopyFrom(): // - readonly-aliases from resource bundles // - readonly-aliases or allocated strings from constants // - safely cloned strings (with owned buffers) from setXYZ() functions // // Note that this is true for as long as DateFormatSymbols can be constructed // only from a locale bundle or set via the cloning API, // *and* for as long as all the strings are in *private* fields, preventing // a subclass from creating these strings in an "unsafe" way (with respect to fastCopyFrom()). if(srcArray == nullptr) { // Do not attempt to copy bogus input (which will crash). // Note that this assignArray method already had the potential to return a null dstArray; // see handling below for "if(dstArray != nullptr)". dstCount = 0; dstArray = nullptr; return; } dstCount = srcCount; dstArray = newUnicodeStringArray(srcCount); if(dstArray != nullptr) { int32_t i; for(i=0; i= 0; i--) { delete[] fZoneStrings[i]; } uprv_free(fZoneStrings); fZoneStrings = nullptr; } } /** * Copy all of the other's data to this. 
*/ void DateFormatSymbols::copyData(const DateFormatSymbols& other) { UErrorCode status = U_ZERO_ERROR; U_LOCALE_BASED(locBased, *this); locBased.setLocaleIDs( other.getLocale(ULOC_VALID_LOCALE, status), other.getLocale(ULOC_ACTUAL_LOCALE, status)); assignArray(fEras, fErasCount, other.fEras, other.fErasCount); assignArray(fEraNames, fEraNamesCount, other.fEraNames, other.fEraNamesCount); assignArray(fNarrowEras, fNarrowErasCount, other.fNarrowEras, other.fNarrowErasCount); assignArray(fMonths, fMonthsCount, other.fMonths, other.fMonthsCount); assignArray(fShortMonths, fShortMonthsCount, other.fShortMonths, other.fShortMonthsCount); assignArray(fNarrowMonths, fNarrowMonthsCount, other.fNarrowMonths, other.fNarrowMonthsCount); assignArray(fStandaloneMonths, fStandaloneMonthsCount, other.fStandaloneMonths, other.fStandaloneMonthsCount); assignArray(fStandaloneShortMonths, fStandaloneShortMonthsCount, other.fStandaloneShortMonths, other.fStandaloneShortMonthsCount); assignArray(fStandaloneNarrowMonths, fStandaloneNarrowMonthsCount, other.fStandaloneNarrowMonths, other.fStandaloneNarrowMonthsCount); assignArray(fWeekdays, fWeekdaysCount, other.fWeekdays, other.fWeekdaysCount); assignArray(fShortWeekdays, fShortWeekdaysCount, other.fShortWeekdays, other.fShortWeekdaysCount); assignArray(fShorterWeekdays, fShorterWeekdaysCount, other.fShorterWeekdays, other.fShorterWeekdaysCount); assignArray(fNarrowWeekdays, fNarrowWeekdaysCount, other.fNarrowWeekdays, other.fNarrowWeekdaysCount); assignArray(fStandaloneWeekdays, fStandaloneWeekdaysCount, other.fStandaloneWeekdays, other.fStandaloneWeekdaysCount); assignArray(fStandaloneShortWeekdays, fStandaloneShortWeekdaysCount, other.fStandaloneShortWeekdays, other.fStandaloneShortWeekdaysCount); assignArray(fStandaloneShorterWeekdays, fStandaloneShorterWeekdaysCount, other.fStandaloneShorterWeekdays, other.fStandaloneShorterWeekdaysCount); assignArray(fStandaloneNarrowWeekdays, fStandaloneNarrowWeekdaysCount, 
other.fStandaloneNarrowWeekdays, other.fStandaloneNarrowWeekdaysCount); assignArray(fAmPms, fAmPmsCount, other.fAmPms, other.fAmPmsCount); assignArray(fNarrowAmPms, fNarrowAmPmsCount, other.fNarrowAmPms, other.fNarrowAmPmsCount ); fTimeSeparator.fastCopyFrom(other.fTimeSeparator); // fastCopyFrom() - see assignArray comments assignArray(fQuarters, fQuartersCount, other.fQuarters, other.fQuartersCount); assignArray(fShortQuarters, fShortQuartersCount, other.fShortQuarters, other.fShortQuartersCount); assignArray(fNarrowQuarters, fNarrowQuartersCount, other.fNarrowQuarters, other.fNarrowQuartersCount); assignArray(fStandaloneQuarters, fStandaloneQuartersCount, other.fStandaloneQuarters, other.fStandaloneQuartersCount); assignArray(fStandaloneShortQuarters, fStandaloneShortQuartersCount, other.fStandaloneShortQuarters, other.fStandaloneShortQuartersCount); assignArray(fStandaloneNarrowQuarters, fStandaloneNarrowQuartersCount, other.fStandaloneNarrowQuarters, other.fStandaloneNarrowQuartersCount); assignArray(fWideDayPeriods, fWideDayPeriodsCount, other.fWideDayPeriods, other.fWideDayPeriodsCount); assignArray(fNarrowDayPeriods, fNarrowDayPeriodsCount, other.fNarrowDayPeriods, other.fNarrowDayPeriodsCount); assignArray(fAbbreviatedDayPeriods, fAbbreviatedDayPeriodsCount, other.fAbbreviatedDayPeriods, other.fAbbreviatedDayPeriodsCount); assignArray(fStandaloneWideDayPeriods, fStandaloneWideDayPeriodsCount, other.fStandaloneWideDayPeriods, other.fStandaloneWideDayPeriodsCount); assignArray(fStandaloneNarrowDayPeriods, fStandaloneNarrowDayPeriodsCount, other.fStandaloneNarrowDayPeriods, other.fStandaloneNarrowDayPeriodsCount); assignArray(fStandaloneAbbreviatedDayPeriods, fStandaloneAbbreviatedDayPeriodsCount, other.fStandaloneAbbreviatedDayPeriods, other.fStandaloneAbbreviatedDayPeriodsCount); if (other.fLeapMonthPatterns != nullptr) { assignArray(fLeapMonthPatterns, fLeapMonthPatternsCount, other.fLeapMonthPatterns, other.fLeapMonthPatternsCount); } else { 
fLeapMonthPatterns = nullptr; fLeapMonthPatternsCount = 0; } if (other.fShortYearNames != nullptr) { assignArray(fShortYearNames, fShortYearNamesCount, other.fShortYearNames, other.fShortYearNamesCount); } else { fShortYearNames = nullptr; fShortYearNamesCount = 0; } if (other.fShortZodiacNames != nullptr) { assignArray(fShortZodiacNames, fShortZodiacNamesCount, other.fShortZodiacNames, other.fShortZodiacNamesCount); } else { fShortZodiacNames = nullptr; fShortZodiacNamesCount = 0; } if (other.fZoneStrings != nullptr) { fZoneStringsColCount = other.fZoneStringsColCount; fZoneStringsRowCount = other.fZoneStringsRowCount; createZoneStrings((const UnicodeString**)other.fZoneStrings); } else { fZoneStrings = nullptr; fZoneStringsColCount = 0; fZoneStringsRowCount = 0; } fZSFLocale = other.fZSFLocale; // Other zone strings data is created on demand fLocaleZoneStrings = nullptr; // fastCopyFrom() - see assignArray comments fLocalPatternChars.fastCopyFrom(other.fLocalPatternChars); uprv_memcpy(fCapitalization, other.fCapitalization, sizeof(fCapitalization)); } /** * Assignment operator. 
*/ DateFormatSymbols& DateFormatSymbols::operator=(const DateFormatSymbols& other) { if (this == &other) { return *this; } // self-assignment: no-op dispose(); copyData(other); return *this; } DateFormatSymbols::~DateFormatSymbols() { dispose(); } void DateFormatSymbols::dispose() { delete[] fEras; delete[] fEraNames; delete[] fNarrowEras; delete[] fMonths; delete[] fShortMonths; delete[] fNarrowMonths; delete[] fStandaloneMonths; delete[] fStandaloneShortMonths; delete[] fStandaloneNarrowMonths; delete[] fWeekdays; delete[] fShortWeekdays; delete[] fShorterWeekdays; delete[] fNarrowWeekdays; delete[] fStandaloneWeekdays; delete[] fStandaloneShortWeekdays; delete[] fStandaloneShorterWeekdays; delete[] fStandaloneNarrowWeekdays; delete[] fAmPms; delete[] fNarrowAmPms; delete[] fQuarters; delete[] fShortQuarters; delete[] fNarrowQuarters; delete[] fStandaloneQuarters; delete[] fStandaloneShortQuarters; delete[] fStandaloneNarrowQuarters; delete[] fLeapMonthPatterns; delete[] fShortYearNames; delete[] fShortZodiacNames; delete[] fAbbreviatedDayPeriods; delete[] fWideDayPeriods; delete[] fNarrowDayPeriods; delete[] fStandaloneAbbreviatedDayPeriods; delete[] fStandaloneWideDayPeriods; delete[] fStandaloneNarrowDayPeriods; disposeZoneStrings(); } void DateFormatSymbols::disposeZoneStrings() { if (fZoneStrings) { for (int32_t row = 0; row < fZoneStringsRowCount; ++row) { delete[] fZoneStrings[row]; } uprv_free(fZoneStrings); } if (fLocaleZoneStrings) { for (int32_t row = 0; row < fZoneStringsRowCount; ++row) { delete[] fLocaleZoneStrings[row]; } uprv_free(fLocaleZoneStrings); } fZoneStrings = nullptr; fLocaleZoneStrings = nullptr; fZoneStringsRowCount = 0; fZoneStringsColCount = 0; } UBool DateFormatSymbols::arrayCompare(const UnicodeString* array1, const UnicodeString* array2, int32_t count) { if (array1 == array2) return true; while (count>0) { --count; if (array1[count] != array2[count]) return false; } return true; } bool DateFormatSymbols::operator==(const 
DateFormatSymbols& other) const { // First do cheap comparisons if (this == &other) { return true; } if (fErasCount == other.fErasCount && fEraNamesCount == other.fEraNamesCount && fNarrowErasCount == other.fNarrowErasCount && fMonthsCount == other.fMonthsCount && fShortMonthsCount == other.fShortMonthsCount && fNarrowMonthsCount == other.fNarrowMonthsCount && fStandaloneMonthsCount == other.fStandaloneMonthsCount && fStandaloneShortMonthsCount == other.fStandaloneShortMonthsCount && fStandaloneNarrowMonthsCount == other.fStandaloneNarrowMonthsCount && fWeekdaysCount == other.fWeekdaysCount && fShortWeekdaysCount == other.fShortWeekdaysCount && fShorterWeekdaysCount == other.fShorterWeekdaysCount && fNarrowWeekdaysCount == other.fNarrowWeekdaysCount && fStandaloneWeekdaysCount == other.fStandaloneWeekdaysCount && fStandaloneShortWeekdaysCount == other.fStandaloneShortWeekdaysCount && fStandaloneShorterWeekdaysCount == other.fStandaloneShorterWeekdaysCount && fStandaloneNarrowWeekdaysCount == other.fStandaloneNarrowWeekdaysCount && fAmPmsCount == other.fAmPmsCount && fNarrowAmPmsCount == other.fNarrowAmPmsCount && fQuartersCount == other.fQuartersCount && fShortQuartersCount == other.fShortQuartersCount && fNarrowQuartersCount == other.fNarrowQuartersCount && fStandaloneQuartersCount == other.fStandaloneQuartersCount && fStandaloneShortQuartersCount == other.fStandaloneShortQuartersCount && fStandaloneNarrowQuartersCount == other.fStandaloneNarrowQuartersCount && fLeapMonthPatternsCount == other.fLeapMonthPatternsCount && fShortYearNamesCount == other.fShortYearNamesCount && fShortZodiacNamesCount == other.fShortZodiacNamesCount && fAbbreviatedDayPeriodsCount == other.fAbbreviatedDayPeriodsCount && fWideDayPeriodsCount == other.fWideDayPeriodsCount && fNarrowDayPeriodsCount == other.fNarrowDayPeriodsCount && fStandaloneAbbreviatedDayPeriodsCount == other.fStandaloneAbbreviatedDayPeriodsCount && fStandaloneWideDayPeriodsCount == other.fStandaloneWideDayPeriodsCount 
&& fStandaloneNarrowDayPeriodsCount == other.fStandaloneNarrowDayPeriodsCount && (uprv_memcmp(fCapitalization, other.fCapitalization, sizeof(fCapitalization))==0)) { // Now compare the arrays themselves if (arrayCompare(fEras, other.fEras, fErasCount) && arrayCompare(fEraNames, other.fEraNames, fEraNamesCount) && arrayCompare(fNarrowEras, other.fNarrowEras, fNarrowErasCount) && arrayCompare(fMonths, other.fMonths, fMonthsCount) && arrayCompare(fShortMonths, other.fShortMonths, fShortMonthsCount) && arrayCompare(fNarrowMonths, other.fNarrowMonths, fNarrowMonthsCount) && arrayCompare(fStandaloneMonths, other.fStandaloneMonths, fStandaloneMonthsCount) && arrayCompare(fStandaloneShortMonths, other.fStandaloneShortMonths, fStandaloneShortMonthsCount) && arrayCompare(fStandaloneNarrowMonths, other.fStandaloneNarrowMonths, fStandaloneNarrowMonthsCount) && arrayCompare(fWeekdays, other.fWeekdays, fWeekdaysCount) && arrayCompare(fShortWeekdays, other.fShortWeekdays, fShortWeekdaysCount) && arrayCompare(fShorterWeekdays, other.fShorterWeekdays, fShorterWeekdaysCount) && arrayCompare(fNarrowWeekdays, other.fNarrowWeekdays, fNarrowWeekdaysCount) && arrayCompare(fStandaloneWeekdays, other.fStandaloneWeekdays, fStandaloneWeekdaysCount) && arrayCompare(fStandaloneShortWeekdays, other.fStandaloneShortWeekdays, fStandaloneShortWeekdaysCount) && arrayCompare(fStandaloneShorterWeekdays, other.fStandaloneShorterWeekdays, fStandaloneShorterWeekdaysCount) && arrayCompare(fStandaloneNarrowWeekdays, other.fStandaloneNarrowWeekdays, fStandaloneNarrowWeekdaysCount) && arrayCompare(fAmPms, other.fAmPms, fAmPmsCount) && arrayCompare(fNarrowAmPms, other.fNarrowAmPms, fNarrowAmPmsCount) && fTimeSeparator == other.fTimeSeparator && arrayCompare(fQuarters, other.fQuarters, fQuartersCount) && arrayCompare(fShortQuarters, other.fShortQuarters, fShortQuartersCount) && arrayCompare(fNarrowQuarters, other.fNarrowQuarters, fNarrowQuartersCount) && arrayCompare(fStandaloneQuarters, 
other.fStandaloneQuarters, fStandaloneQuartersCount) && arrayCompare(fStandaloneShortQuarters, other.fStandaloneShortQuarters, fStandaloneShortQuartersCount) && arrayCompare(fStandaloneNarrowQuarters, other.fStandaloneNarrowQuarters, fStandaloneNarrowQuartersCount) && arrayCompare(fLeapMonthPatterns, other.fLeapMonthPatterns, fLeapMonthPatternsCount) && arrayCompare(fShortYearNames, other.fShortYearNames, fShortYearNamesCount) && arrayCompare(fShortZodiacNames, other.fShortZodiacNames, fShortZodiacNamesCount) && arrayCompare(fAbbreviatedDayPeriods, other.fAbbreviatedDayPeriods, fAbbreviatedDayPeriodsCount) && arrayCompare(fWideDayPeriods, other.fWideDayPeriods, fWideDayPeriodsCount) && arrayCompare(fNarrowDayPeriods, other.fNarrowDayPeriods, fNarrowDayPeriodsCount) && arrayCompare(fStandaloneAbbreviatedDayPeriods, other.fStandaloneAbbreviatedDayPeriods, fStandaloneAbbreviatedDayPeriodsCount) && arrayCompare(fStandaloneWideDayPeriods, other.fStandaloneWideDayPeriods, fStandaloneWideDayPeriodsCount) && arrayCompare(fStandaloneNarrowDayPeriods, other.fStandaloneNarrowDayPeriods, fStandaloneWideDayPeriodsCount)) { // Compare the contents of fZoneStrings if (fZoneStrings == nullptr && other.fZoneStrings == nullptr) { if (fZSFLocale == other.fZSFLocale) { return true; } } else if (fZoneStrings != nullptr && other.fZoneStrings != nullptr) { if (fZoneStringsRowCount == other.fZoneStringsRowCount && fZoneStringsColCount == other.fZoneStringsColCount) { bool cmpres = true; for (int32_t i = 0; (i < fZoneStringsRowCount) && cmpres; i++) { cmpres = arrayCompare(fZoneStrings[i], other.fZoneStrings[i], fZoneStringsColCount); } return cmpres; } } return false; } } return false; } //------------------------------------------------------ const UnicodeString* DateFormatSymbols::getEras(int32_t &count) const { count = fErasCount; return fEras; } const UnicodeString* DateFormatSymbols::getEraNames(int32_t &count) const { count = fEraNamesCount; return fEraNames; } const UnicodeString* 
DateFormatSymbols::getNarrowEras(int32_t &count) const
{
    count = fNarrowErasCount;
    return fNarrowEras;
}

const UnicodeString*
DateFormatSymbols::getMonths(int32_t &count) const
{
    count = fMonthsCount;
    return fMonths;
}

const UnicodeString*
DateFormatSymbols::getShortMonths(int32_t &count) const
{
    count = fShortMonthsCount;
    return fShortMonths;
}

/**
 * Month names for the given context and width. SHORT has no month data
 * of its own and maps to ABBREVIATED. For any other combination nullptr
 * is returned and count is left untouched.
 */
const UnicodeString*
DateFormatSymbols::getMonths(int32_t &count, DtContextType context, DtWidthType width ) const
{
    switch (context) {
    case FORMAT :
        switch (width) {
        case WIDE :
            count = fMonthsCount;
            return fMonths;
        case ABBREVIATED :
        case SHORT : // no month data for this, defaults to ABBREVIATED
            count = fShortMonthsCount;
            return fShortMonths;
        case NARROW :
            count = fNarrowMonthsCount;
            return fNarrowMonths;
        case DT_WIDTH_COUNT :
            break;
        }
        break;
    case STANDALONE :
        switch (width) {
        case WIDE :
            count = fStandaloneMonthsCount;
            return fStandaloneMonths;
        case ABBREVIATED :
        case SHORT : // no month data for this, defaults to ABBREVIATED
            count = fStandaloneShortMonthsCount;
            return fStandaloneShortMonths;
        case NARROW :
            count = fStandaloneNarrowMonthsCount;
            return fStandaloneNarrowMonths;
        case DT_WIDTH_COUNT :
            break;
        }
        break;
    case DT_CONTEXT_COUNT :
        break;
    }
    return nullptr;
}

const UnicodeString*
DateFormatSymbols::getWeekdays(int32_t &count) const
{
    count = fWeekdaysCount;
    return fWeekdays;
}

const UnicodeString*
DateFormatSymbols::getShortWeekdays(int32_t &count) const
{
    count = fShortWeekdaysCount;
    return fShortWeekdays;
}

/**
 * Weekday names for the given context and width. For an unhandled
 * combination nullptr is returned and count is left untouched.
 */
const UnicodeString*
DateFormatSymbols::getWeekdays(int32_t &count, DtContextType context, DtWidthType width) const
{
    switch (context) {
    case FORMAT :
        switch (width) {
        case WIDE :
            count = fWeekdaysCount;
            return fWeekdays;
        case ABBREVIATED :
            count = fShortWeekdaysCount;
            return fShortWeekdays;
        case SHORT :
            count = fShorterWeekdaysCount;
            return fShorterWeekdays;
        case NARROW :
            count = fNarrowWeekdaysCount;
            return fNarrowWeekdays;
        case DT_WIDTH_COUNT :
            break;
        }
        break;
    case STANDALONE :
        switch (width) {
        case WIDE :
            count = fStandaloneWeekdaysCount;
            return fStandaloneWeekdays;
        case ABBREVIATED :
            count = fStandaloneShortWeekdaysCount;
            return fStandaloneShortWeekdays;
        case SHORT :
            count = fStandaloneShorterWeekdaysCount;
            return fStandaloneShorterWeekdays;
        case NARROW :
            count = fStandaloneNarrowWeekdaysCount;
            return fStandaloneNarrowWeekdays;
        case DT_WIDTH_COUNT :
            break;
        }
        break;
    case DT_CONTEXT_COUNT :
        break;
    }
    return nullptr;
}

/**
 * Quarter names for the given context and width. SHORT has no quarter
 * data of its own and maps to ABBREVIATED. For any other combination
 * nullptr is returned and count is left untouched.
 */
const UnicodeString*
DateFormatSymbols::getQuarters(int32_t &count, DtContextType context, DtWidthType width ) const
{
    switch (context) {
    case FORMAT :
        switch (width) {
        case WIDE :
            count = fQuartersCount;
            return fQuarters;
        case ABBREVIATED :
        case SHORT : // no quarter data for this, defaults to ABBREVIATED
            count = fShortQuartersCount;
            return fShortQuarters;
        case NARROW :
            count = fNarrowQuartersCount;
            return fNarrowQuarters;
        case DT_WIDTH_COUNT :
            break;
        }
        break;
    case STANDALONE :
        switch (width) {
        case WIDE :
            count = fStandaloneQuartersCount;
            return fStandaloneQuarters;
        case ABBREVIATED :
        case SHORT : // no quarter data for this, defaults to ABBREVIATED
            count = fStandaloneShortQuartersCount;
            return fStandaloneShortQuarters;
        case NARROW :
            count = fStandaloneNarrowQuartersCount;
            return fStandaloneNarrowQuarters;
        case DT_WIDTH_COUNT :
            break;
        }
        break;
    case DT_CONTEXT_COUNT :
        break;
    }
    return nullptr;
}

UnicodeString&
DateFormatSymbols::getTimeSeparatorString(UnicodeString& result) const
{
    // fastCopyFrom() - see assignArray comments
    result.fastCopyFrom(fTimeSeparator);
    return result;
}

const UnicodeString*
DateFormatSymbols::getAmPmStrings(int32_t &count) const
{
count = fAmPmsCount; return fAmPms; } const UnicodeString* DateFormatSymbols::getLeapMonthPatterns(int32_t &count) const { count = fLeapMonthPatternsCount; return fLeapMonthPatterns; } const UnicodeString* DateFormatSymbols::getYearNames(int32_t& count, DtContextType /*ignored*/, DtWidthType /*ignored*/) const { count = fShortYearNamesCount; return fShortYearNames; } void DateFormatSymbols::setYearNames(const UnicodeString* yearNames, int32_t count, DtContextType context, DtWidthType width) { if (context == FORMAT && width == ABBREVIATED) { if (fShortYearNames) { delete[] fShortYearNames; } fShortYearNames = newUnicodeStringArray(count); uprv_arrayCopy(yearNames, fShortYearNames, count); fShortYearNamesCount = count; } } const UnicodeString* DateFormatSymbols::getZodiacNames(int32_t& count, DtContextType /*ignored*/, DtWidthType /*ignored*/) const { count = fShortZodiacNamesCount; return fShortZodiacNames; } void DateFormatSymbols::setZodiacNames(const UnicodeString* zodiacNames, int32_t count, DtContextType context, DtWidthType width) { if (context == FORMAT && width == ABBREVIATED) { if (fShortZodiacNames) { delete[] fShortZodiacNames; } fShortZodiacNames = newUnicodeStringArray(count); uprv_arrayCopy(zodiacNames, fShortZodiacNames, count); fShortZodiacNamesCount = count; } } //------------------------------------------------------ void DateFormatSymbols::setEras(const UnicodeString* erasArray, int32_t count) { // delete the old list if we own it if (fEras) delete[] fEras; // we always own the new list, which we create here (we duplicate rather // than adopting the list passed in) fEras = newUnicodeStringArray(count); uprv_arrayCopy(erasArray,fEras, count); fErasCount = count; } void DateFormatSymbols::setEraNames(const UnicodeString* eraNamesArray, int32_t count) { // delete the old list if we own it if (fEraNames) delete[] fEraNames; // we always own the new list, which we create here (we duplicate rather // than adopting the list passed in) fEraNames = 
newUnicodeStringArray(count); uprv_arrayCopy(eraNamesArray,fEraNames, count); fEraNamesCount = count; } void DateFormatSymbols::setNarrowEras(const UnicodeString* narrowErasArray, int32_t count) { // delete the old list if we own it if (fNarrowEras) delete[] fNarrowEras; // we always own the new list, which we create here (we duplicate rather // than adopting the list passed in) fNarrowEras = newUnicodeStringArray(count); uprv_arrayCopy(narrowErasArray,fNarrowEras, count); fNarrowErasCount = count; } void DateFormatSymbols::setMonths(const UnicodeString* monthsArray, int32_t count) { // delete the old list if we own it if (fMonths) delete[] fMonths; // we always own the new list, which we create here (we duplicate rather // than adopting the list passed in) fMonths = newUnicodeStringArray(count); uprv_arrayCopy( monthsArray,fMonths,count); fMonthsCount = count; } void DateFormatSymbols::setShortMonths(const UnicodeString* shortMonthsArray, int32_t count) { // delete the old list if we own it if (fShortMonths) delete[] fShortMonths; // we always own the new list, which we create here (we duplicate rather // than adopting the list passed in) fShortMonths = newUnicodeStringArray(count); uprv_arrayCopy(shortMonthsArray,fShortMonths, count); fShortMonthsCount = count; } void DateFormatSymbols::setMonths(const UnicodeString* monthsArray, int32_t count, DtContextType context, DtWidthType width) { // delete the old list if we own it // we always own the new list, which we create here (we duplicate rather // than adopting the list passed in) switch (context) { case FORMAT : switch (width) { case WIDE : if (fMonths) delete[] fMonths; fMonths = newUnicodeStringArray(count); uprv_arrayCopy( monthsArray,fMonths,count); fMonthsCount = count; break; case ABBREVIATED : if (fShortMonths) delete[] fShortMonths; fShortMonths = newUnicodeStringArray(count); uprv_arrayCopy( monthsArray,fShortMonths,count); fShortMonthsCount = count; break; case NARROW : if (fNarrowMonths) delete[] 
fNarrowMonths; fNarrowMonths = newUnicodeStringArray(count); uprv_arrayCopy( monthsArray,fNarrowMonths,count); fNarrowMonthsCount = count; break; default : break; } break; case STANDALONE : switch (width) { case WIDE : if (fStandaloneMonths) delete[] fStandaloneMonths; fStandaloneMonths = newUnicodeStringArray(count); uprv_arrayCopy( monthsArray,fStandaloneMonths,count); fStandaloneMonthsCount = count; break; case ABBREVIATED : if (fStandaloneShortMonths) delete[] fStandaloneShortMonths; fStandaloneShortMonths = newUnicodeStringArray(count); uprv_arrayCopy( monthsArray,fStandaloneShortMonths,count); fStandaloneShortMonthsCount = count; break; case NARROW : if (fStandaloneNarrowMonths) delete[] fStandaloneNarrowMonths; fStandaloneNarrowMonths = newUnicodeStringArray(count); uprv_arrayCopy( monthsArray,fStandaloneNarrowMonths,count); fStandaloneNarrowMonthsCount = count; break; default : break; } break; case DT_CONTEXT_COUNT : break; } } void DateFormatSymbols::setWeekdays(const UnicodeString* weekdaysArray, int32_t count) { // delete the old list if we own it if (fWeekdays) delete[] fWeekdays; // we always own the new list, which we create here (we duplicate rather // than adopting the list passed in) fWeekdays = newUnicodeStringArray(count); uprv_arrayCopy(weekdaysArray,fWeekdays,count); fWeekdaysCount = count; } void DateFormatSymbols::setShortWeekdays(const UnicodeString* shortWeekdaysArray, int32_t count) { // delete the old list if we own it if (fShortWeekdays) delete[] fShortWeekdays; // we always own the new list, which we create here (we duplicate rather // than adopting the list passed in) fShortWeekdays = newUnicodeStringArray(count); uprv_arrayCopy(shortWeekdaysArray, fShortWeekdays, count); fShortWeekdaysCount = count; } void DateFormatSymbols::setWeekdays(const UnicodeString* weekdaysArray, int32_t count, DtContextType context, DtWidthType width) { // delete the old list if we own it // we always own the new list, which we create here (we duplicate 
rather // than adopting the list passed in) switch (context) { case FORMAT : switch (width) { case WIDE : if (fWeekdays) delete[] fWeekdays; fWeekdays = newUnicodeStringArray(count); uprv_arrayCopy(weekdaysArray, fWeekdays, count); fWeekdaysCount = count; break; case ABBREVIATED : if (fShortWeekdays) delete[] fShortWeekdays; fShortWeekdays = newUnicodeStringArray(count); uprv_arrayCopy(weekdaysArray, fShortWeekdays, count); fShortWeekdaysCount = count; break; case SHORT : if (fShorterWeekdays) delete[] fShorterWeekdays; fShorterWeekdays = newUnicodeStringArray(count); uprv_arrayCopy(weekdaysArray, fShorterWeekdays, count); fShorterWeekdaysCount = count; break; case NARROW : if (fNarrowWeekdays) delete[] fNarrowWeekdays; fNarrowWeekdays = newUnicodeStringArray(count); uprv_arrayCopy(weekdaysArray, fNarrowWeekdays, count); fNarrowWeekdaysCount = count; break; case DT_WIDTH_COUNT : break; } break; case STANDALONE : switch (width) { case WIDE : if (fStandaloneWeekdays) delete[] fStandaloneWeekdays; fStandaloneWeekdays = newUnicodeStringArray(count); uprv_arrayCopy(weekdaysArray, fStandaloneWeekdays, count); fStandaloneWeekdaysCount = count; break; case ABBREVIATED : if (fStandaloneShortWeekdays) delete[] fStandaloneShortWeekdays; fStandaloneShortWeekdays = newUnicodeStringArray(count); uprv_arrayCopy(weekdaysArray, fStandaloneShortWeekdays, count); fStandaloneShortWeekdaysCount = count; break; case SHORT : if (fStandaloneShorterWeekdays) delete[] fStandaloneShorterWeekdays; fStandaloneShorterWeekdays = newUnicodeStringArray(count); uprv_arrayCopy(weekdaysArray, fStandaloneShorterWeekdays, count); fStandaloneShorterWeekdaysCount = count; break; case NARROW : if (fStandaloneNarrowWeekdays) delete[] fStandaloneNarrowWeekdays; fStandaloneNarrowWeekdays = newUnicodeStringArray(count); uprv_arrayCopy(weekdaysArray, fStandaloneNarrowWeekdays, count); fStandaloneNarrowWeekdaysCount = count; break; case DT_WIDTH_COUNT : break; } break; case DT_CONTEXT_COUNT : break; } } void 
DateFormatSymbols::setQuarters(const UnicodeString* quartersArray, int32_t count, DtContextType context, DtWidthType width) { // delete the old list if we own it // we always own the new list, which we create here (we duplicate rather // than adopting the list passed in) switch (context) { case FORMAT : switch (width) { case WIDE : if (fQuarters) delete[] fQuarters; fQuarters = newUnicodeStringArray(count); uprv_arrayCopy( quartersArray,fQuarters,count); fQuartersCount = count; break; case ABBREVIATED : if (fShortQuarters) delete[] fShortQuarters; fShortQuarters = newUnicodeStringArray(count); uprv_arrayCopy( quartersArray,fShortQuarters,count); fShortQuartersCount = count; break; case NARROW : if (fNarrowQuarters) delete[] fNarrowQuarters; fNarrowQuarters = newUnicodeStringArray(count); uprv_arrayCopy( quartersArray,fNarrowQuarters,count); fNarrowQuartersCount = count; break; default : break; } break; case STANDALONE : switch (width) { case WIDE : if (fStandaloneQuarters) delete[] fStandaloneQuarters; fStandaloneQuarters = newUnicodeStringArray(count); uprv_arrayCopy( quartersArray,fStandaloneQuarters,count); fStandaloneQuartersCount = count; break; case ABBREVIATED : if (fStandaloneShortQuarters) delete[] fStandaloneShortQuarters; fStandaloneShortQuarters = newUnicodeStringArray(count); uprv_arrayCopy( quartersArray,fStandaloneShortQuarters,count); fStandaloneShortQuartersCount = count; break; case NARROW : if (fStandaloneNarrowQuarters) delete[] fStandaloneNarrowQuarters; fStandaloneNarrowQuarters = newUnicodeStringArray(count); uprv_arrayCopy( quartersArray,fStandaloneNarrowQuarters,count); fStandaloneNarrowQuartersCount = count; break; default : break; } break; case DT_CONTEXT_COUNT : break; } } void DateFormatSymbols::setAmPmStrings(const UnicodeString* amPmsArray, int32_t count) { // delete the old list if we own it if (fAmPms) delete[] fAmPms; // we always own the new list, which we create here (we duplicate rather // than adopting the list passed in) 
fAmPms = newUnicodeStringArray(count); uprv_arrayCopy(amPmsArray,fAmPms,count); fAmPmsCount = count; } void DateFormatSymbols::setTimeSeparatorString(const UnicodeString& newTimeSeparator) { fTimeSeparator = newTimeSeparator; } const UnicodeString** DateFormatSymbols::getZoneStrings(int32_t& rowCount, int32_t& columnCount) const { const UnicodeString **result = nullptr; static UMutex LOCK; umtx_lock(&LOCK); if (fZoneStrings == nullptr) { if (fLocaleZoneStrings == nullptr) { ((DateFormatSymbols*)this)->initZoneStringsArray(); } result = (const UnicodeString**)fLocaleZoneStrings; } else { result = (const UnicodeString**)fZoneStrings; } rowCount = fZoneStringsRowCount; columnCount = fZoneStringsColCount; umtx_unlock(&LOCK); return result; } // For now, we include all zones #define ZONE_SET UCAL_ZONE_TYPE_ANY // This code must be called within a synchronized block void DateFormatSymbols::initZoneStringsArray() { if (fZoneStrings != nullptr || fLocaleZoneStrings != nullptr) { return; } UErrorCode status = U_ZERO_ERROR; StringEnumeration *tzids = nullptr; UnicodeString ** zarray = nullptr; TimeZoneNames *tzNames = nullptr; int32_t rows = 0; static const UTimeZoneNameType TYPES[] = { UTZNM_LONG_STANDARD, UTZNM_SHORT_STANDARD, UTZNM_LONG_DAYLIGHT, UTZNM_SHORT_DAYLIGHT }; static const int32_t NUM_TYPES = 4; do { // dummy do-while tzids = TimeZone::createTimeZoneIDEnumeration(ZONE_SET, nullptr, nullptr, status); rows = tzids->count(status); if (U_FAILURE(status)) { break; } // Allocate array int32_t size = rows * sizeof(UnicodeString*); zarray = (UnicodeString**)uprv_malloc(size); if (zarray == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; break; } uprv_memset(zarray, 0, size); tzNames = TimeZoneNames::createInstance(fZSFLocale, status); tzNames->loadAllDisplayNames(status); if (U_FAILURE(status)) { break; } const UnicodeString *tzid; int32_t i = 0; UDate now = Calendar::getNow(); UnicodeString tzDispName; while ((tzid = tzids->snext(status)) != 0) { if (U_FAILURE(status)) 
{ break; } zarray[i] = new UnicodeString[5]; if (zarray[i] == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; break; } zarray[i][0].setTo(*tzid); tzNames->getDisplayNames(*tzid, TYPES, NUM_TYPES, now, zarray[i]+1, status); i++; } } while (false); if (U_FAILURE(status)) { if (zarray) { for (int32_t i = 0; i < rows; i++) { if (zarray[i]) { delete[] zarray[i]; } } uprv_free(zarray); zarray = nullptr; } } if (tzNames) { delete tzNames; } if (tzids) { delete tzids; } fLocaleZoneStrings = zarray; fZoneStringsRowCount = rows; fZoneStringsColCount = 1 + NUM_TYPES; } void DateFormatSymbols::setZoneStrings(const UnicodeString* const *strings, int32_t rowCount, int32_t columnCount) { // since deleting a 2-d array is a pain in the butt, we offload that task to // a separate function disposeZoneStrings(); // we always own the new list, which we create here (we duplicate rather // than adopting the list passed in) fZoneStringsRowCount = rowCount; fZoneStringsColCount = columnCount; createZoneStrings((const UnicodeString**)strings); } //------------------------------------------------------ const char16_t * U_EXPORT2 DateFormatSymbols::getPatternUChars() { return gPatternChars; } UDateFormatField U_EXPORT2 DateFormatSymbols::getPatternCharIndex(char16_t c) { const char16_t *p = u_strchr(gPatternChars, c); if (p == nullptr) { return UDAT_FIELD_COUNT; } else { return static_cast(p - gPatternChars); } } static const uint64_t kNumericFieldsAlways = ((uint64_t)1 << UDAT_YEAR_FIELD) | // y ((uint64_t)1 << UDAT_DATE_FIELD) | // d ((uint64_t)1 << UDAT_HOUR_OF_DAY1_FIELD) | // k ((uint64_t)1 << UDAT_HOUR_OF_DAY0_FIELD) | // H ((uint64_t)1 << UDAT_MINUTE_FIELD) | // m ((uint64_t)1 << UDAT_SECOND_FIELD) | // s ((uint64_t)1 << UDAT_FRACTIONAL_SECOND_FIELD) | // S ((uint64_t)1 << UDAT_DAY_OF_YEAR_FIELD) | // D ((uint64_t)1 << UDAT_DAY_OF_WEEK_IN_MONTH_FIELD) | // F ((uint64_t)1 << UDAT_WEEK_OF_YEAR_FIELD) | // w ((uint64_t)1 << UDAT_WEEK_OF_MONTH_FIELD) | // W ((uint64_t)1 << UDAT_HOUR1_FIELD) 
| // h ((uint64_t)1 << UDAT_HOUR0_FIELD) | // K ((uint64_t)1 << UDAT_YEAR_WOY_FIELD) | // Y ((uint64_t)1 << UDAT_EXTENDED_YEAR_FIELD) | // u ((uint64_t)1 << UDAT_JULIAN_DAY_FIELD) | // g ((uint64_t)1 << UDAT_MILLISECONDS_IN_DAY_FIELD) | // A ((uint64_t)1 << UDAT_RELATED_YEAR_FIELD); // r static const uint64_t kNumericFieldsForCount12 = ((uint64_t)1 << UDAT_MONTH_FIELD) | // M or MM ((uint64_t)1 << UDAT_DOW_LOCAL_FIELD) | // e or ee ((uint64_t)1 << UDAT_STANDALONE_DAY_FIELD) | // c or cc ((uint64_t)1 << UDAT_STANDALONE_MONTH_FIELD) | // L or LL ((uint64_t)1 << UDAT_QUARTER_FIELD) | // Q or QQ ((uint64_t)1 << UDAT_STANDALONE_QUARTER_FIELD); // q or qq UBool U_EXPORT2 DateFormatSymbols::isNumericField(UDateFormatField f, int32_t count) { if (f == UDAT_FIELD_COUNT) { return false; } uint64_t flag = ((uint64_t)1 << f); return ((kNumericFieldsAlways & flag) != 0 || ((kNumericFieldsForCount12 & flag) != 0 && count < 3)); } UBool U_EXPORT2 DateFormatSymbols::isNumericPatternChar(char16_t c, int32_t count) { return isNumericField(getPatternCharIndex(c), count); } //------------------------------------------------------ UnicodeString& DateFormatSymbols::getLocalPatternChars(UnicodeString& result) const { // fastCopyFrom() - see assignArray comments return result.fastCopyFrom(fLocalPatternChars); } //------------------------------------------------------ void DateFormatSymbols::setLocalPatternChars(const UnicodeString& newLocalPatternChars) { fLocalPatternChars = newLocalPatternChars; } //------------------------------------------------------ namespace { // Constants declarations static const char16_t kCalendarAliasPrefixUChar[] = { SOLIDUS, CAP_L, CAP_O, CAP_C, CAP_A, CAP_L, CAP_E, SOLIDUS, LOW_C, LOW_A, LOW_L, LOW_E, LOW_N, LOW_D, LOW_A, LOW_R, SOLIDUS }; static const char16_t kGregorianTagUChar[] = { LOW_G, LOW_R, LOW_E, LOW_G, LOW_O, LOW_R, LOW_I, LOW_A, LOW_N }; static const char16_t kVariantTagUChar[] = { PERCENT, LOW_V, LOW_A, LOW_R, LOW_I, LOW_A, LOW_N, LOW_T }; 
static const char16_t kLeapTagUChar[] = { LOW_L, LOW_E, LOW_A, LOW_P }; static const char16_t kCyclicNameSetsTagUChar[] = { LOW_C, LOW_Y, LOW_C, LOW_L, LOW_I, LOW_C, CAP_N, LOW_A, LOW_M, LOW_E, CAP_S, LOW_E, LOW_T, LOW_S }; static const char16_t kYearsTagUChar[] = { SOLIDUS, LOW_Y, LOW_E, LOW_A, LOW_R, LOW_S }; static const char16_t kZodiacsUChar[] = { SOLIDUS, LOW_Z, LOW_O, LOW_D, LOW_I, LOW_A, LOW_C, LOW_S }; static const char16_t kDayPartsTagUChar[] = { SOLIDUS, LOW_D, LOW_A, LOW_Y, CAP_P, LOW_A, LOW_R, LOW_T, LOW_S }; static const char16_t kFormatTagUChar[] = { SOLIDUS, LOW_F, LOW_O, LOW_R, LOW_M, LOW_A, LOW_T }; static const char16_t kAbbrTagUChar[] = { SOLIDUS, LOW_A, LOW_B, LOW_B, LOW_R, LOW_E, LOW_V, LOW_I, LOW_A, LOW_T, LOW_E, LOW_D }; // ResourceSink to enumerate all calendar resources struct CalendarDataSink : public ResourceSink { // Enum which specifies the type of alias received, or no alias enum AliasType { SAME_CALENDAR, DIFFERENT_CALENDAR, GREGORIAN, NONE }; // Data structures to store resources from the current resource bundle Hashtable arrays; Hashtable arraySizes; Hashtable maps; /** * Whenever there are aliases, the same object will be added twice to 'map'. * To avoid double deletion, 'maps' won't take ownership of the objects. Instead, * 'mapRefs' will own them and will delete them when CalendarDataSink is deleted. 
*/ MemoryPool mapRefs; // Paths and the aliases they point to UVector aliasPathPairs; // Current and next calendar resource table which should be loaded UnicodeString currentCalendarType; UnicodeString nextCalendarType; // Resources to visit when enumerating fallback calendars LocalPointer resourcesToVisit; // Alias' relative path populated whenever an alias is read UnicodeString aliasRelativePath; // Initializes CalendarDataSink with default values CalendarDataSink(UErrorCode& status) : arrays(false, status), arraySizes(false, status), maps(false, status), mapRefs(), aliasPathPairs(uprv_deleteUObject, uhash_compareUnicodeString, status), currentCalendarType(), nextCalendarType(), resourcesToVisit(nullptr), aliasRelativePath() { if (U_FAILURE(status)) { return; } } virtual ~CalendarDataSink(); // Configure the CalendarSink to visit all the resources void visitAllResources() { resourcesToVisit.adoptInstead(nullptr); } // Actions to be done before enumerating void preEnumerate(const UnicodeString &calendarType) { currentCalendarType = calendarType; nextCalendarType.setToBogus(); aliasPathPairs.removeAllElements(); } virtual void put(const char *key, ResourceValue &value, UBool, UErrorCode &errorCode) override { if (U_FAILURE(errorCode)) { return; } U_ASSERT(!currentCalendarType.isEmpty()); // Stores the resources to visit on the next calendar. LocalPointer resourcesToVisitNext(nullptr); ResourceTable calendarData = value.getTable(errorCode); if (U_FAILURE(errorCode)) { return; } // Enumerate all resources for this calendar for (int i = 0; calendarData.getKeyAndValue(i, key, value); i++) { UnicodeString keyUString(key, -1, US_INV); // == Handle aliases == AliasType aliasType = processAliasFromValue(keyUString, value, errorCode); if (U_FAILURE(errorCode)) { return; } if (aliasType == GREGORIAN) { // Ignore aliases to the gregorian calendar, all of its resources will be loaded anyway. 
continue; } else if (aliasType == DIFFERENT_CALENDAR) { // Whenever an alias to the next calendar (except gregorian) is encountered, register the // calendar type it's pointing to if (resourcesToVisitNext.isNull()) { resourcesToVisitNext .adoptInsteadAndCheckErrorCode(new UVector(uprv_deleteUObject, uhash_compareUnicodeString, errorCode), errorCode); if (U_FAILURE(errorCode)) { return; } } LocalPointer aliasRelativePathCopy(aliasRelativePath.clone(), errorCode); resourcesToVisitNext->adoptElement(aliasRelativePathCopy.orphan(), errorCode); if (U_FAILURE(errorCode)) { return; } continue; } else if (aliasType == SAME_CALENDAR) { // Register same-calendar alias if (arrays.get(aliasRelativePath) == nullptr && maps.get(aliasRelativePath) == nullptr) { LocalPointer aliasRelativePathCopy(aliasRelativePath.clone(), errorCode); aliasPathPairs.adoptElement(aliasRelativePathCopy.orphan(), errorCode); if (U_FAILURE(errorCode)) { return; } LocalPointer keyUStringCopy(keyUString.clone(), errorCode); aliasPathPairs.adoptElement(keyUStringCopy.orphan(), errorCode); if (U_FAILURE(errorCode)) { return; } } continue; } // Only visit the resources that were referenced by an alias on the previous calendar // (AmPmMarkersAbbr is an exception). 
if (!resourcesToVisit.isNull() && !resourcesToVisit->isEmpty() && !resourcesToVisit->contains(&keyUString) && uprv_strcmp(key, gAmPmMarkersAbbrTag) != 0) { continue; } // == Handle data == if (uprv_strcmp(key, gAmPmMarkersTag) == 0 || uprv_strcmp(key, gAmPmMarkersAbbrTag) == 0 || uprv_strcmp(key, gAmPmMarkersNarrowTag) == 0) { if (arrays.get(keyUString) == nullptr) { ResourceArray resourceArray = value.getArray(errorCode); int32_t arraySize = resourceArray.getSize(); LocalArray stringArray(new UnicodeString[arraySize], errorCode); value.getStringArray(stringArray.getAlias(), arraySize, errorCode); arrays.put(keyUString, stringArray.orphan(), errorCode); arraySizes.puti(keyUString, arraySize, errorCode); if (U_FAILURE(errorCode)) { return; } } } else if (uprv_strcmp(key, gErasTag) == 0 || uprv_strcmp(key, gDayNamesTag) == 0 || uprv_strcmp(key, gMonthNamesTag) == 0 || uprv_strcmp(key, gQuartersTag) == 0 || uprv_strcmp(key, gDayPeriodTag) == 0 || uprv_strcmp(key, gMonthPatternsTag) == 0 || uprv_strcmp(key, gCyclicNameSetsTag) == 0) { processResource(keyUString, key, value, errorCode); } } // Apply same-calendar aliases UBool modified; do { modified = false; for (int32_t i = 0; i < aliasPathPairs.size();) { UBool mod = false; UnicodeString *alias = (UnicodeString*)aliasPathPairs[i]; UnicodeString *aliasArray; Hashtable *aliasMap; if ((aliasArray = (UnicodeString*)arrays.get(*alias)) != nullptr) { UnicodeString *path = (UnicodeString*)aliasPathPairs[i + 1]; if (arrays.get(*path) == nullptr) { // Clone the array int32_t aliasArraySize = arraySizes.geti(*alias); LocalArray aliasArrayCopy(new UnicodeString[aliasArraySize], errorCode); if (U_FAILURE(errorCode)) { return; } uprv_arrayCopy(aliasArray, aliasArrayCopy.getAlias(), aliasArraySize); // Put the array on the 'arrays' map arrays.put(*path, aliasArrayCopy.orphan(), errorCode); arraySizes.puti(*path, aliasArraySize, errorCode); } if (U_FAILURE(errorCode)) { return; } mod = true; } else if ((aliasMap = 
(Hashtable*)maps.get(*alias)) != nullptr) { UnicodeString *path = (UnicodeString*)aliasPathPairs[i + 1]; if (maps.get(*path) == nullptr) { maps.put(*path, aliasMap, errorCode); } if (U_FAILURE(errorCode)) { return; } mod = true; } if (mod) { aliasPathPairs.removeElementAt(i + 1); aliasPathPairs.removeElementAt(i); modified = true; } else { i += 2; } } } while (modified && !aliasPathPairs.isEmpty()); // Set the resources to visit on the next calendar if (!resourcesToVisitNext.isNull()) { resourcesToVisit = std::move(resourcesToVisitNext); } } // Process the nested resource bundle tables void processResource(UnicodeString &path, const char *key, ResourceValue &value, UErrorCode &errorCode) { if (U_FAILURE(errorCode)) return; ResourceTable table = value.getTable(errorCode); if (U_FAILURE(errorCode)) return; Hashtable* stringMap = nullptr; // Iterate over all the elements of the table and add them to the map for (int i = 0; table.getKeyAndValue(i, key, value); i++) { UnicodeString keyUString(key, -1, US_INV); // Ignore '%variant' keys if (keyUString.endsWith(kVariantTagUChar, UPRV_LENGTHOF(kVariantTagUChar))) { continue; } // == Handle String elements == if (value.getType() == URES_STRING) { // We are on a leaf, store the map elements into the stringMap if (i == 0) { // mapRefs will keep ownership of 'stringMap': stringMap = mapRefs.create(false, errorCode); if (stringMap == nullptr) { errorCode = U_MEMORY_ALLOCATION_ERROR; return; } maps.put(path, stringMap, errorCode); if (U_FAILURE(errorCode)) { return; } stringMap->setValueDeleter(uprv_deleteUObject); } U_ASSERT(stringMap != nullptr); int32_t valueStringSize; const char16_t *valueString = value.getString(valueStringSize, errorCode); if (U_FAILURE(errorCode)) { return; } LocalPointer valueUString(new UnicodeString(true, valueString, valueStringSize), errorCode); stringMap->put(keyUString, valueUString.orphan(), errorCode); if (U_FAILURE(errorCode)) { return; } continue; } U_ASSERT(stringMap == nullptr); // Store the 
current path's length and append the current key to the path. int32_t pathLength = path.length(); path.append(SOLIDUS).append(keyUString); // In cyclicNameSets ignore everything but years/format/abbreviated // and zodiacs/format/abbreviated if (path.startsWith(kCyclicNameSetsTagUChar, UPRV_LENGTHOF(kCyclicNameSetsTagUChar))) { UBool skip = true; int32_t startIndex = UPRV_LENGTHOF(kCyclicNameSetsTagUChar); int32_t length = 0; if (startIndex == path.length() || path.compare(startIndex, (length = UPRV_LENGTHOF(kZodiacsUChar)), kZodiacsUChar, 0, UPRV_LENGTHOF(kZodiacsUChar)) == 0 || path.compare(startIndex, (length = UPRV_LENGTHOF(kYearsTagUChar)), kYearsTagUChar, 0, UPRV_LENGTHOF(kYearsTagUChar)) == 0 || path.compare(startIndex, (length = UPRV_LENGTHOF(kDayPartsTagUChar)), kDayPartsTagUChar, 0, UPRV_LENGTHOF(kDayPartsTagUChar)) == 0) { startIndex += length; length = 0; if (startIndex == path.length() || path.compare(startIndex, (length = UPRV_LENGTHOF(kFormatTagUChar)), kFormatTagUChar, 0, UPRV_LENGTHOF(kFormatTagUChar)) == 0) { startIndex += length; length = 0; if (startIndex == path.length() || path.compare(startIndex, (length = UPRV_LENGTHOF(kAbbrTagUChar)), kAbbrTagUChar, 0, UPRV_LENGTHOF(kAbbrTagUChar)) == 0) { skip = false; } } } if (skip) { // Drop the latest key on the path and continue path.retainBetween(0, pathLength); continue; } } // == Handle aliases == if (arrays.get(path) != nullptr || maps.get(path) != nullptr) { // Drop the latest key on the path and continue path.retainBetween(0, pathLength); continue; } AliasType aliasType = processAliasFromValue(path, value, errorCode); if (U_FAILURE(errorCode)) { return; } if (aliasType == SAME_CALENDAR) { // Store the alias path and the current path on aliasPathPairs LocalPointer aliasRelativePathCopy(aliasRelativePath.clone(), errorCode); aliasPathPairs.adoptElement(aliasRelativePathCopy.orphan(), errorCode); if (U_FAILURE(errorCode)) { return; } LocalPointer pathCopy(path.clone(), errorCode); 
aliasPathPairs.adoptElement(pathCopy.orphan(), errorCode); if (U_FAILURE(errorCode)) { return; } // Drop the latest key on the path and continue path.retainBetween(0, pathLength); continue; } U_ASSERT(aliasType == NONE); // == Handle data == if (value.getType() == URES_ARRAY) { // We are on a leaf, store the array ResourceArray rDataArray = value.getArray(errorCode); int32_t dataArraySize = rDataArray.getSize(); LocalArray dataArray(new UnicodeString[dataArraySize], errorCode); value.getStringArray(dataArray.getAlias(), dataArraySize, errorCode); arrays.put(path, dataArray.orphan(), errorCode); arraySizes.puti(path, dataArraySize, errorCode); if (U_FAILURE(errorCode)) { return; } } else if (value.getType() == URES_TABLE) { // We are not on a leaf, recursively process the subtable. processResource(path, key, value, errorCode); if (U_FAILURE(errorCode)) { return; } } // Drop the latest key on the path path.retainBetween(0, pathLength); } } // Populates an AliasIdentifier with the alias information contained on the UResource.Value. 
AliasType processAliasFromValue(UnicodeString ¤tRelativePath, ResourceValue &value, UErrorCode &errorCode) { if (U_FAILURE(errorCode)) { return NONE; } if (value.getType() == URES_ALIAS) { int32_t aliasPathSize; const char16_t* aliasPathUChar = value.getAliasString(aliasPathSize, errorCode); if (U_FAILURE(errorCode)) { return NONE; } UnicodeString aliasPath(aliasPathUChar, aliasPathSize); const int32_t aliasPrefixLength = UPRV_LENGTHOF(kCalendarAliasPrefixUChar); if (aliasPath.startsWith(kCalendarAliasPrefixUChar, aliasPrefixLength) && aliasPath.length() > aliasPrefixLength) { int32_t typeLimit = aliasPath.indexOf(SOLIDUS, aliasPrefixLength); if (typeLimit > aliasPrefixLength) { const UnicodeString aliasCalendarType = aliasPath.tempSubStringBetween(aliasPrefixLength, typeLimit); aliasRelativePath.setTo(aliasPath, typeLimit + 1, aliasPath.length()); if (currentCalendarType == aliasCalendarType && currentRelativePath != aliasRelativePath) { // If we have an alias to the same calendar, the path to the resource must be different return SAME_CALENDAR; } else if (currentCalendarType != aliasCalendarType && currentRelativePath == aliasRelativePath) { // If we have an alias to a different calendar, the path to the resource must be the same if (aliasCalendarType.compare(kGregorianTagUChar, UPRV_LENGTHOF(kGregorianTagUChar)) == 0) { return GREGORIAN; } else if (nextCalendarType.isBogus()) { nextCalendarType = aliasCalendarType; return DIFFERENT_CALENDAR; } else if (nextCalendarType == aliasCalendarType) { return DIFFERENT_CALENDAR; } } } } errorCode = U_INTERNAL_PROGRAM_ERROR; return NONE; } return NONE; } // Deleter function to be used by 'arrays' static void U_CALLCONV deleteUnicodeStringArray(void *uArray) { delete[] static_cast(uArray); } }; // Virtual destructors have to be defined out of line CalendarDataSink::~CalendarDataSink() { arrays.setValueDeleter(deleteUnicodeStringArray); } } //------------------------------------------------------ static void 
initField(UnicodeString **field, int32_t& length, const char16_t *data, LastResortSize numStr, LastResortSize strLen, UErrorCode &status) { if (U_SUCCESS(status)) { length = numStr; *field = newUnicodeStringArray((size_t)numStr); if (*field) { for(int32_t i = 0; isetTo(true, data+(i*((int32_t)strLen)), -1); } } else { length = 0; status = U_MEMORY_ALLOCATION_ERROR; } } } static void initField(UnicodeString **field, int32_t& length, CalendarDataSink &sink, CharString &key, UErrorCode &status) { if (U_SUCCESS(status)) { UnicodeString keyUString(key.data(), -1, US_INV); UnicodeString* array = static_cast(sink.arrays.get(keyUString)); if (array != nullptr) { length = sink.arraySizes.geti(keyUString); *field = array; // DateFormatSymbols takes ownership of the array: sink.arrays.remove(keyUString); } else { length = 0; status = U_MISSING_RESOURCE_ERROR; } } } static void initField(UnicodeString **field, int32_t& length, CalendarDataSink &sink, CharString &key, int32_t arrayOffset, UErrorCode &status) { if (U_SUCCESS(status)) { UnicodeString keyUString(key.data(), -1, US_INV); UnicodeString* array = static_cast(sink.arrays.get(keyUString)); if (array != nullptr) { int32_t arrayLength = sink.arraySizes.geti(keyUString); length = arrayLength + arrayOffset; *field = new UnicodeString[length]; if (*field == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; return; } uprv_arrayCopy(array, 0, *field, arrayOffset, arrayLength); } else { length = 0; status = U_MISSING_RESOURCE_ERROR; } } } static void initLeapMonthPattern(UnicodeString *field, int32_t index, CalendarDataSink &sink, CharString &path, UErrorCode &status) { field[index].remove(); if (U_SUCCESS(status)) { UnicodeString pathUString(path.data(), -1, US_INV); Hashtable *leapMonthTable = static_cast(sink.maps.get(pathUString)); if (leapMonthTable != nullptr) { UnicodeString leapLabel(false, kLeapTagUChar, UPRV_LENGTHOF(kLeapTagUChar)); UnicodeString *leapMonthPattern = static_cast(leapMonthTable->get(leapLabel)); if 
(leapMonthPattern != nullptr) { field[index].fastCopyFrom(*leapMonthPattern); } else { field[index].setToBogus(); } return; } status = U_MISSING_RESOURCE_ERROR; } } static CharString &buildResourcePath(CharString &path, const char* segment1, UErrorCode &errorCode) { return path.clear().append(segment1, -1, errorCode); } static CharString &buildResourcePath(CharString &path, const char* segment1, const char* segment2, UErrorCode &errorCode) { return buildResourcePath(path, segment1, errorCode).append('/', errorCode) .append(segment2, -1, errorCode); } static CharString &buildResourcePath(CharString &path, const char* segment1, const char* segment2, const char* segment3, UErrorCode &errorCode) { return buildResourcePath(path, segment1, segment2, errorCode).append('/', errorCode) .append(segment3, -1, errorCode); } static CharString &buildResourcePath(CharString &path, const char* segment1, const char* segment2, const char* segment3, const char* segment4, UErrorCode &errorCode) { return buildResourcePath(path, segment1, segment2, segment3, errorCode).append('/', errorCode) .append(segment4, -1, errorCode); } typedef struct { const char * usageTypeName; DateFormatSymbols::ECapitalizationContextUsageType usageTypeEnumValue; } ContextUsageTypeNameToEnumValue; static const ContextUsageTypeNameToEnumValue contextUsageTypeMap[] = { // Entries must be sorted by usageTypeName; entry with nullptr name terminates list. 
{ "day-format-except-narrow", DateFormatSymbols::kCapContextUsageDayFormat }, { "day-narrow", DateFormatSymbols::kCapContextUsageDayNarrow }, { "day-standalone-except-narrow", DateFormatSymbols::kCapContextUsageDayStandalone }, { "era-abbr", DateFormatSymbols::kCapContextUsageEraAbbrev }, { "era-name", DateFormatSymbols::kCapContextUsageEraWide }, { "era-narrow", DateFormatSymbols::kCapContextUsageEraNarrow }, { "metazone-long", DateFormatSymbols::kCapContextUsageMetazoneLong }, { "metazone-short", DateFormatSymbols::kCapContextUsageMetazoneShort }, { "month-format-except-narrow", DateFormatSymbols::kCapContextUsageMonthFormat }, { "month-narrow", DateFormatSymbols::kCapContextUsageMonthNarrow }, { "month-standalone-except-narrow", DateFormatSymbols::kCapContextUsageMonthStandalone }, { "zone-long", DateFormatSymbols::kCapContextUsageZoneLong }, { "zone-short", DateFormatSymbols::kCapContextUsageZoneShort }, { nullptr, (DateFormatSymbols::ECapitalizationContextUsageType)0 }, }; // Resource keys to look up localized strings for day periods. // The first one must be midnight and the second must be noon, so that their indices coincide // with the am/pm field. Formatting and parsing code for day periods relies on this coincidence. 
static const char *dayPeriodKeys[] = {"midnight", "noon",
                         "morning1", "afternoon1", "evening1", "night1",
                         "morning2", "afternoon2", "evening2", "night2"};

/**
 * Builds an array with one string per entry of dayPeriodKeys, taken from the
 * string map stored in the sink under `path`.
 *
 * Entries whose key is absent from the map are set to bogus, and so is every
 * entry when the map itself is absent.
 *
 * @param stringCount Receives the number of entries (the size of dayPeriodKeys);
 *                    left untouched when status already indicates failure.
 * @return A new[]-allocated array (presumably owned by the caller; the visible
 *         code never frees it here), or nullptr on failure.
 */
UnicodeString* loadDayPeriodStrings(CalendarDataSink &sink, CharString &path,
                                    int32_t &stringCount,  UErrorCode &status) {
    if (U_FAILURE(status)) { return nullptr; }

    UnicodeString pathUString(path.data(), -1, US_INV);
    Hashtable* map = static_cast<Hashtable*>(sink.maps.get(pathUString));

    stringCount = UPRV_LENGTHOF(dayPeriodKeys);
    UnicodeString *strings = new UnicodeString[stringCount];
    if (strings == nullptr) {
        status = U_MEMORY_ALLOCATION_ERROR;
        return nullptr;
    }

    if (map != nullptr) {
        for (int32_t i = 0; i < stringCount; ++i) {
            UnicodeString dayPeriodKey(dayPeriodKeys[i], -1, US_INV);
            UnicodeString *dayPeriod = static_cast<UnicodeString*>(map->get(dayPeriodKey));
            if (dayPeriod != nullptr) {
                strings[i].fastCopyFrom(*dayPeriod);
            } else {
                strings[i].setToBogus();
            }
        }
    } else {
        // No data for this path: mark every entry as missing.
        for (int32_t i = 0; i < stringCount; i++) {
            strings[i].setToBogus();
        }
    }
    return strings;
}
*/ fEras = nullptr; fErasCount = 0; fEraNames = nullptr; fEraNamesCount = 0; fNarrowEras = nullptr; fNarrowErasCount = 0; fMonths = nullptr; fMonthsCount=0; fShortMonths = nullptr; fShortMonthsCount=0; fNarrowMonths = nullptr; fNarrowMonthsCount=0; fStandaloneMonths = nullptr; fStandaloneMonthsCount=0; fStandaloneShortMonths = nullptr; fStandaloneShortMonthsCount=0; fStandaloneNarrowMonths = nullptr; fStandaloneNarrowMonthsCount=0; fWeekdays = nullptr; fWeekdaysCount=0; fShortWeekdays = nullptr; fShortWeekdaysCount=0; fShorterWeekdays = nullptr; fShorterWeekdaysCount=0; fNarrowWeekdays = nullptr; fNarrowWeekdaysCount=0; fStandaloneWeekdays = nullptr; fStandaloneWeekdaysCount=0; fStandaloneShortWeekdays = nullptr; fStandaloneShortWeekdaysCount=0; fStandaloneShorterWeekdays = nullptr; fStandaloneShorterWeekdaysCount=0; fStandaloneNarrowWeekdays = nullptr; fStandaloneNarrowWeekdaysCount=0; fAmPms = nullptr; fAmPmsCount=0; fNarrowAmPms = nullptr; fNarrowAmPmsCount=0; fTimeSeparator.setToBogus(); fQuarters = nullptr; fQuartersCount = 0; fShortQuarters = nullptr; fShortQuartersCount = 0; fNarrowQuarters = nullptr; fNarrowQuartersCount = 0; fStandaloneQuarters = nullptr; fStandaloneQuartersCount = 0; fStandaloneShortQuarters = nullptr; fStandaloneShortQuartersCount = 0; fStandaloneNarrowQuarters = nullptr; fStandaloneNarrowQuartersCount = 0; fLeapMonthPatterns = nullptr; fLeapMonthPatternsCount = 0; fShortYearNames = nullptr; fShortYearNamesCount = 0; fShortZodiacNames = nullptr; fShortZodiacNamesCount = 0; fZoneStringsRowCount = 0; fZoneStringsColCount = 0; fZoneStrings = nullptr; fLocaleZoneStrings = nullptr; fAbbreviatedDayPeriods = nullptr; fAbbreviatedDayPeriodsCount = 0; fWideDayPeriods = nullptr; fWideDayPeriodsCount = 0; fNarrowDayPeriods = nullptr; fNarrowDayPeriodsCount = 0; fStandaloneAbbreviatedDayPeriods = nullptr; fStandaloneAbbreviatedDayPeriodsCount = 0; fStandaloneWideDayPeriods = nullptr; fStandaloneWideDayPeriodsCount = 0; fStandaloneNarrowDayPeriods = 
nullptr; fStandaloneNarrowDayPeriodsCount = 0; uprv_memset(fCapitalization, 0, sizeof(fCapitalization)); // We need to preserve the requested locale for // lazy ZoneStringFormat instantiation. ZoneStringFormat // is region sensitive, thus, bundle locale bundle's locale // is not sufficient. fZSFLocale = locale; if (U_FAILURE(status)) return; // Create a CalendarDataSink to process this data and the resource bundles CalendarDataSink calendarSink(status); UResourceBundle *rb = ures_open(nullptr, locale.getBaseName(), &status); UResourceBundle *cb = ures_getByKey(rb, gCalendarTag, nullptr, &status); if (U_FAILURE(status)) return; // Iterate over the resource bundle data following the fallbacks through different calendar types UnicodeString calendarType((type != nullptr && *type != '\0')? type : gGregorianTag, -1, US_INV); while (!calendarType.isBogus()) { CharString calendarTypeBuffer; calendarTypeBuffer.appendInvariantChars(calendarType, status); if (U_FAILURE(status)) { return; } const char *calendarTypeCArray = calendarTypeBuffer.data(); // Enumerate this calendar type. 
If the calendar is not found fallback to gregorian UErrorCode oldStatus = status; UResourceBundle *ctb = ures_getByKeyWithFallback(cb, calendarTypeCArray, nullptr, &status); if (status == U_MISSING_RESOURCE_ERROR) { ures_close(ctb); if (uprv_strcmp(calendarTypeCArray, gGregorianTag) != 0) { calendarType.setTo(false, kGregorianTagUChar, UPRV_LENGTHOF(kGregorianTagUChar)); calendarSink.visitAllResources(); status = oldStatus; continue; } return; } calendarSink.preEnumerate(calendarType); ures_getAllItemsWithFallback(ctb, "", calendarSink, status); ures_close(ctb); if (U_FAILURE(status)) break; // Stop loading when gregorian was loaded if (uprv_strcmp(calendarTypeCArray, gGregorianTag) == 0) { break; } // Get the next calendar type to process from the sink calendarType = calendarSink.nextCalendarType; // Gregorian is always the last fallback if (calendarType.isBogus()) { calendarType.setTo(false, kGregorianTagUChar, UPRV_LENGTHOF(kGregorianTagUChar)); calendarSink.visitAllResources(); } } // CharString object to build paths CharString path; // Load Leap Month Patterns UErrorCode tempStatus = status; fLeapMonthPatterns = newUnicodeStringArray(kMonthPatternsCount); if (fLeapMonthPatterns) { initLeapMonthPattern(fLeapMonthPatterns, kLeapMonthPatternFormatWide, calendarSink, buildResourcePath(path, gMonthPatternsTag, gNamesFormatTag, gNamesWideTag, tempStatus), tempStatus); initLeapMonthPattern(fLeapMonthPatterns, kLeapMonthPatternFormatAbbrev, calendarSink, buildResourcePath(path, gMonthPatternsTag, gNamesFormatTag, gNamesAbbrTag, tempStatus), tempStatus); initLeapMonthPattern(fLeapMonthPatterns, kLeapMonthPatternFormatNarrow, calendarSink, buildResourcePath(path, gMonthPatternsTag, gNamesFormatTag, gNamesNarrowTag, tempStatus), tempStatus); initLeapMonthPattern(fLeapMonthPatterns, kLeapMonthPatternStandaloneWide, calendarSink, buildResourcePath(path, gMonthPatternsTag, gNamesStandaloneTag, gNamesWideTag, tempStatus), tempStatus); initLeapMonthPattern(fLeapMonthPatterns, 
kLeapMonthPatternStandaloneAbbrev, calendarSink, buildResourcePath(path, gMonthPatternsTag, gNamesStandaloneTag, gNamesAbbrTag, tempStatus), tempStatus); initLeapMonthPattern(fLeapMonthPatterns, kLeapMonthPatternStandaloneNarrow, calendarSink, buildResourcePath(path, gMonthPatternsTag, gNamesStandaloneTag, gNamesNarrowTag, tempStatus), tempStatus); initLeapMonthPattern(fLeapMonthPatterns, kLeapMonthPatternNumeric, calendarSink, buildResourcePath(path, gMonthPatternsTag, gNamesNumericTag, gNamesAllTag, tempStatus), tempStatus); if (U_SUCCESS(tempStatus)) { // Hack to fix bad C inheritance for dangi monthPatterns (OK in J); this should be handled by aliases in root, but isn't. // The ordering of the following statements is important. if (fLeapMonthPatterns[kLeapMonthPatternFormatAbbrev].isEmpty()) { fLeapMonthPatterns[kLeapMonthPatternFormatAbbrev].setTo(fLeapMonthPatterns[kLeapMonthPatternFormatWide]); } if (fLeapMonthPatterns[kLeapMonthPatternFormatNarrow].isEmpty()) { fLeapMonthPatterns[kLeapMonthPatternFormatNarrow].setTo(fLeapMonthPatterns[kLeapMonthPatternStandaloneNarrow]); } if (fLeapMonthPatterns[kLeapMonthPatternStandaloneWide].isEmpty()) { fLeapMonthPatterns[kLeapMonthPatternStandaloneWide].setTo(fLeapMonthPatterns[kLeapMonthPatternFormatWide]); } if (fLeapMonthPatterns[kLeapMonthPatternStandaloneAbbrev].isEmpty()) { fLeapMonthPatterns[kLeapMonthPatternStandaloneAbbrev].setTo(fLeapMonthPatterns[kLeapMonthPatternFormatAbbrev]); } // end of hack fLeapMonthPatternsCount = kMonthPatternsCount; } else { delete[] fLeapMonthPatterns; fLeapMonthPatterns = nullptr; } } // Load cyclic names sets tempStatus = status; initField(&fShortYearNames, fShortYearNamesCount, calendarSink, buildResourcePath(path, gCyclicNameSetsTag, gNameSetYearsTag, gNamesFormatTag, gNamesAbbrTag, tempStatus), tempStatus); initField(&fShortZodiacNames, fShortZodiacNamesCount, calendarSink, buildResourcePath(path, gCyclicNameSetsTag, gNameSetZodiacsTag, gNamesFormatTag, gNamesAbbrTag, 
tempStatus), tempStatus); // Load context transforms and capitalization tempStatus = U_ZERO_ERROR; UResourceBundle *localeBundle = ures_open(nullptr, locale.getName(), &tempStatus); if (U_SUCCESS(tempStatus)) { UResourceBundle *contextTransforms = ures_getByKeyWithFallback(localeBundle, gContextTransformsTag, nullptr, &tempStatus); if (U_SUCCESS(tempStatus)) { UResourceBundle *contextTransformUsage; while ( (contextTransformUsage = ures_getNextResource(contextTransforms, nullptr, &tempStatus)) != nullptr ) { const int32_t * intVector = ures_getIntVector(contextTransformUsage, &len, &status); if (U_SUCCESS(tempStatus) && intVector != nullptr && len >= 2) { const char* usageType = ures_getKey(contextTransformUsage); if (usageType != nullptr) { const ContextUsageTypeNameToEnumValue * typeMapPtr = contextUsageTypeMap; int32_t compResult = 0; // linear search; list is short and we cannot be sure that bsearch is available while ( typeMapPtr->usageTypeName != nullptr && (compResult = uprv_strcmp(usageType, typeMapPtr->usageTypeName)) > 0 ) { ++typeMapPtr; } if (typeMapPtr->usageTypeName != nullptr && compResult == 0) { fCapitalization[typeMapPtr->usageTypeEnumValue][0] = static_cast(intVector[0]); fCapitalization[typeMapPtr->usageTypeEnumValue][1] = static_cast(intVector[1]); } } } tempStatus = U_ZERO_ERROR; ures_close(contextTransformUsage); } ures_close(contextTransforms); } tempStatus = U_ZERO_ERROR; const LocalPointer numberingSystem( NumberingSystem::createInstance(locale, tempStatus), tempStatus); if (U_SUCCESS(tempStatus)) { // These functions all fail gracefully if passed nullptr pointers and // do nothing unless U_SUCCESS(tempStatus), so it's only necessary // to check for errors once after all calls are made. 
const LocalUResourceBundlePointer numberElementsData(ures_getByKeyWithFallback( localeBundle, gNumberElementsTag, nullptr, &tempStatus)); const LocalUResourceBundlePointer nsNameData(ures_getByKeyWithFallback( numberElementsData.getAlias(), numberingSystem->getName(), nullptr, &tempStatus)); const LocalUResourceBundlePointer symbolsData(ures_getByKeyWithFallback( nsNameData.getAlias(), gSymbolsTag, nullptr, &tempStatus)); fTimeSeparator = ures_getUnicodeStringByKey( symbolsData.getAlias(), gTimeSeparatorTag, &tempStatus); if (U_FAILURE(tempStatus)) { fTimeSeparator.setToBogus(); } } ures_close(localeBundle); } if (fTimeSeparator.isBogus()) { fTimeSeparator.setTo(DateFormatSymbols::DEFAULT_TIME_SEPARATOR); } // Load day periods fAbbreviatedDayPeriods = loadDayPeriodStrings(calendarSink, buildResourcePath(path, gDayPeriodTag, gNamesFormatTag, gNamesAbbrTag, status), fAbbreviatedDayPeriodsCount, status); fWideDayPeriods = loadDayPeriodStrings(calendarSink, buildResourcePath(path, gDayPeriodTag, gNamesFormatTag, gNamesWideTag, status), fWideDayPeriodsCount, status); fNarrowDayPeriods = loadDayPeriodStrings(calendarSink, buildResourcePath(path, gDayPeriodTag, gNamesFormatTag, gNamesNarrowTag, status), fNarrowDayPeriodsCount, status); fStandaloneAbbreviatedDayPeriods = loadDayPeriodStrings(calendarSink, buildResourcePath(path, gDayPeriodTag, gNamesStandaloneTag, gNamesAbbrTag, status), fStandaloneAbbreviatedDayPeriodsCount, status); fStandaloneWideDayPeriods = loadDayPeriodStrings(calendarSink, buildResourcePath(path, gDayPeriodTag, gNamesStandaloneTag, gNamesWideTag, status), fStandaloneWideDayPeriodsCount, status); fStandaloneNarrowDayPeriods = loadDayPeriodStrings(calendarSink, buildResourcePath(path, gDayPeriodTag, gNamesStandaloneTag, gNamesNarrowTag, status), fStandaloneNarrowDayPeriodsCount, status); // Fill in for missing/bogus items (dayPeriods are a map so single items might be missing) if (U_SUCCESS(status)) { for (int32_t dpidx = 0; dpidx < 
fAbbreviatedDayPeriodsCount; ++dpidx) { if (dpidx < fWideDayPeriodsCount && fWideDayPeriods != nullptr && fWideDayPeriods[dpidx].isBogus()) { fWideDayPeriods[dpidx].fastCopyFrom(fAbbreviatedDayPeriods[dpidx]); } if (dpidx < fNarrowDayPeriodsCount && fNarrowDayPeriods != nullptr && fNarrowDayPeriods[dpidx].isBogus()) { fNarrowDayPeriods[dpidx].fastCopyFrom(fAbbreviatedDayPeriods[dpidx]); } if (dpidx < fStandaloneAbbreviatedDayPeriodsCount && fStandaloneAbbreviatedDayPeriods != nullptr && fStandaloneAbbreviatedDayPeriods[dpidx].isBogus()) { fStandaloneAbbreviatedDayPeriods[dpidx].fastCopyFrom(fAbbreviatedDayPeriods[dpidx]); } if (dpidx < fStandaloneWideDayPeriodsCount && fStandaloneWideDayPeriods != nullptr && fStandaloneWideDayPeriods[dpidx].isBogus()) { fStandaloneWideDayPeriods[dpidx].fastCopyFrom(fStandaloneAbbreviatedDayPeriods[dpidx]); } if (dpidx < fStandaloneNarrowDayPeriodsCount && fStandaloneNarrowDayPeriods != nullptr && fStandaloneNarrowDayPeriods[dpidx].isBogus()) { fStandaloneNarrowDayPeriods[dpidx].fastCopyFrom(fStandaloneAbbreviatedDayPeriods[dpidx]); } } } U_LOCALE_BASED(locBased, *this); // if we make it to here, the resource data is cool, and we can get everything out // of it that we need except for the time-zone and localized-pattern data, which // are stored in a separate file locBased.setLocaleIDs(ures_getLocaleByType(cb, ULOC_VALID_LOCALE, &status), ures_getLocaleByType(cb, ULOC_ACTUAL_LOCALE, &status)); // Load eras initField(&fEras, fErasCount, calendarSink, buildResourcePath(path, gErasTag, gNamesAbbrTag, status), status); UErrorCode oldStatus = status; initField(&fEraNames, fEraNamesCount, calendarSink, buildResourcePath(path, gErasTag, gNamesWideTag, status), status); if (status == U_MISSING_RESOURCE_ERROR) { // Workaround because eras/wide was omitted from CLDR 1.3 status = oldStatus; assignArray(fEraNames, fEraNamesCount, fEras, fErasCount); } // current ICU4J falls back to abbreviated if narrow eras are missing, so we will too 
oldStatus = status; initField(&fNarrowEras, fNarrowErasCount, calendarSink, buildResourcePath(path, gErasTag, gNamesNarrowTag, status), status); if (status == U_MISSING_RESOURCE_ERROR) { // Workaround because eras/wide was omitted from CLDR 1.3 status = oldStatus; assignArray(fNarrowEras, fNarrowErasCount, fEras, fErasCount); } // Load month names initField(&fMonths, fMonthsCount, calendarSink, buildResourcePath(path, gMonthNamesTag, gNamesFormatTag, gNamesWideTag, status), status); initField(&fShortMonths, fShortMonthsCount, calendarSink, buildResourcePath(path, gMonthNamesTag, gNamesFormatTag, gNamesAbbrTag, status), status); initField(&fStandaloneMonths, fStandaloneMonthsCount, calendarSink, buildResourcePath(path, gMonthNamesTag, gNamesStandaloneTag, gNamesWideTag, status), status); if (status == U_MISSING_RESOURCE_ERROR) { /* If standalone/wide not available, use format/wide */ status = U_ZERO_ERROR; assignArray(fStandaloneMonths, fStandaloneMonthsCount, fMonths, fMonthsCount); } initField(&fStandaloneShortMonths, fStandaloneShortMonthsCount, calendarSink, buildResourcePath(path, gMonthNamesTag, gNamesStandaloneTag, gNamesAbbrTag, status), status); if (status == U_MISSING_RESOURCE_ERROR) { /* If standalone/abbreviated not available, use format/abbreviated */ status = U_ZERO_ERROR; assignArray(fStandaloneShortMonths, fStandaloneShortMonthsCount, fShortMonths, fShortMonthsCount); } UErrorCode narrowMonthsEC = status; UErrorCode standaloneNarrowMonthsEC = status; initField(&fNarrowMonths, fNarrowMonthsCount, calendarSink, buildResourcePath(path, gMonthNamesTag, gNamesFormatTag, gNamesNarrowTag, narrowMonthsEC), narrowMonthsEC); initField(&fStandaloneNarrowMonths, fStandaloneNarrowMonthsCount, calendarSink, buildResourcePath(path, gMonthNamesTag, gNamesStandaloneTag, gNamesNarrowTag, narrowMonthsEC), standaloneNarrowMonthsEC); if (narrowMonthsEC == U_MISSING_RESOURCE_ERROR && standaloneNarrowMonthsEC != U_MISSING_RESOURCE_ERROR) { // If format/narrow not 
available, use standalone/narrow assignArray(fNarrowMonths, fNarrowMonthsCount, fStandaloneNarrowMonths, fStandaloneNarrowMonthsCount); } else if (narrowMonthsEC != U_MISSING_RESOURCE_ERROR && standaloneNarrowMonthsEC == U_MISSING_RESOURCE_ERROR) { // If standalone/narrow not available, use format/narrow assignArray(fStandaloneNarrowMonths, fStandaloneNarrowMonthsCount, fNarrowMonths, fNarrowMonthsCount); } else if (narrowMonthsEC == U_MISSING_RESOURCE_ERROR && standaloneNarrowMonthsEC == U_MISSING_RESOURCE_ERROR) { // If neither is available, use format/abbreviated assignArray(fNarrowMonths, fNarrowMonthsCount, fShortMonths, fShortMonthsCount); assignArray(fStandaloneNarrowMonths, fStandaloneNarrowMonthsCount, fShortMonths, fShortMonthsCount); } // Load AM/PM markers; if wide or narrow not available, use short UErrorCode ampmStatus = U_ZERO_ERROR; initField(&fAmPms, fAmPmsCount, calendarSink, buildResourcePath(path, gAmPmMarkersTag, ampmStatus), ampmStatus); if (U_FAILURE(ampmStatus)) { initField(&fAmPms, fAmPmsCount, calendarSink, buildResourcePath(path, gAmPmMarkersAbbrTag, status), status); } ampmStatus = U_ZERO_ERROR; initField(&fNarrowAmPms, fNarrowAmPmsCount, calendarSink, buildResourcePath(path, gAmPmMarkersNarrowTag, ampmStatus), ampmStatus); if (U_FAILURE(ampmStatus)) { initField(&fNarrowAmPms, fNarrowAmPmsCount, calendarSink, buildResourcePath(path, gAmPmMarkersAbbrTag, status), status); } if(status == U_MISSING_RESOURCE_ERROR) { status = U_ZERO_ERROR; assignArray(fNarrowAmPms, fNarrowAmPmsCount, fAmPms, fAmPmsCount); } // Load quarters initField(&fQuarters, fQuartersCount, calendarSink, buildResourcePath(path, gQuartersTag, gNamesFormatTag, gNamesWideTag, status), status); initField(&fShortQuarters, fShortQuartersCount, calendarSink, buildResourcePath(path, gQuartersTag, gNamesFormatTag, gNamesAbbrTag, status), status); if(status == U_MISSING_RESOURCE_ERROR) { status = U_ZERO_ERROR; assignArray(fShortQuarters, fShortQuartersCount, fQuarters, 
fQuartersCount); } initField(&fStandaloneQuarters, fStandaloneQuartersCount, calendarSink, buildResourcePath(path, gQuartersTag, gNamesStandaloneTag, gNamesWideTag, status), status); if(status == U_MISSING_RESOURCE_ERROR) { status = U_ZERO_ERROR; assignArray(fStandaloneQuarters, fStandaloneQuartersCount, fQuarters, fQuartersCount); } initField(&fStandaloneShortQuarters, fStandaloneShortQuartersCount, calendarSink, buildResourcePath(path, gQuartersTag, gNamesStandaloneTag, gNamesAbbrTag, status), status); if(status == U_MISSING_RESOURCE_ERROR) { status = U_ZERO_ERROR; assignArray(fStandaloneShortQuarters, fStandaloneShortQuartersCount, fShortQuarters, fShortQuartersCount); } // unlike the fields above, narrow format quarters fall back on narrow standalone quarters initField(&fStandaloneNarrowQuarters, fStandaloneNarrowQuartersCount, calendarSink, buildResourcePath(path, gQuartersTag, gNamesStandaloneTag, gNamesNarrowTag, status), status); initField(&fNarrowQuarters, fNarrowQuartersCount, calendarSink, buildResourcePath(path, gQuartersTag, gNamesFormatTag, gNamesNarrowTag, status), status); if(status == U_MISSING_RESOURCE_ERROR) { status = U_ZERO_ERROR; assignArray(fNarrowQuarters, fNarrowQuartersCount, fStandaloneNarrowQuarters, fStandaloneNarrowQuartersCount); } // ICU 3.8 or later version no longer uses localized date-time pattern characters by default (ticket#5597) /* // fastCopyFrom()/setTo() - see assignArray comments resStr = ures_getStringByKey(fResourceBundle, gLocalPatternCharsTag, &len, &status); fLocalPatternChars.setTo(true, resStr, len); // If the locale data does not include new pattern chars, use the defaults // TODO: Consider making this an error, since this may add conflicting characters. 
if (len < PATTERN_CHARS_LEN) { fLocalPatternChars.append(UnicodeString(true, &gPatternChars[len], PATTERN_CHARS_LEN-len)); } */ fLocalPatternChars.setTo(true, gPatternChars, PATTERN_CHARS_LEN); // Format wide weekdays -> fWeekdays // {sfb} fixed to handle 1-based weekdays initField(&fWeekdays, fWeekdaysCount, calendarSink, buildResourcePath(path, gDayNamesTag, gNamesFormatTag, gNamesWideTag, status), 1, status); // Format abbreviated weekdays -> fShortWeekdays initField(&fShortWeekdays, fShortWeekdaysCount, calendarSink, buildResourcePath(path, gDayNamesTag, gNamesFormatTag, gNamesAbbrTag, status), 1, status); // Format short weekdays -> fShorterWeekdays (fall back to abbreviated) initField(&fShorterWeekdays, fShorterWeekdaysCount, calendarSink, buildResourcePath(path, gDayNamesTag, gNamesFormatTag, gNamesShortTag, status), 1, status); if (status == U_MISSING_RESOURCE_ERROR) { status = U_ZERO_ERROR; assignArray(fShorterWeekdays, fShorterWeekdaysCount, fShortWeekdays, fShortWeekdaysCount); } // Stand-alone wide weekdays -> fStandaloneWeekdays initField(&fStandaloneWeekdays, fStandaloneWeekdaysCount, calendarSink, buildResourcePath(path, gDayNamesTag, gNamesStandaloneTag, gNamesWideTag, status), 1, status); if (status == U_MISSING_RESOURCE_ERROR) { /* If standalone/wide is not available, use format/wide */ status = U_ZERO_ERROR; assignArray(fStandaloneWeekdays, fStandaloneWeekdaysCount, fWeekdays, fWeekdaysCount); } // Stand-alone abbreviated weekdays -> fStandaloneShortWeekdays initField(&fStandaloneShortWeekdays, fStandaloneShortWeekdaysCount, calendarSink, buildResourcePath(path, gDayNamesTag, gNamesStandaloneTag, gNamesAbbrTag, status), 1, status); if (status == U_MISSING_RESOURCE_ERROR) { /* If standalone/abbreviated is not available, use format/abbreviated */ status = U_ZERO_ERROR; assignArray(fStandaloneShortWeekdays, fStandaloneShortWeekdaysCount, fShortWeekdays, fShortWeekdaysCount); } // Stand-alone short weekdays -> fStandaloneShorterWeekdays (fall back to 
format abbreviated) initField(&fStandaloneShorterWeekdays, fStandaloneShorterWeekdaysCount, calendarSink, buildResourcePath(path, gDayNamesTag, gNamesStandaloneTag, gNamesShortTag, status), 1, status); if (status == U_MISSING_RESOURCE_ERROR) { /* If standalone/short is not available, use format/short */ status = U_ZERO_ERROR; assignArray(fStandaloneShorterWeekdays, fStandaloneShorterWeekdaysCount, fShorterWeekdays, fShorterWeekdaysCount); } // Format narrow weekdays -> fNarrowWeekdays UErrorCode narrowWeeksEC = status; initField(&fNarrowWeekdays, fNarrowWeekdaysCount, calendarSink, buildResourcePath(path, gDayNamesTag, gNamesFormatTag, gNamesNarrowTag, status), 1, narrowWeeksEC); // Stand-alone narrow weekdays -> fStandaloneNarrowWeekdays UErrorCode standaloneNarrowWeeksEC = status; initField(&fStandaloneNarrowWeekdays, fStandaloneNarrowWeekdaysCount, calendarSink, buildResourcePath(path, gDayNamesTag, gNamesStandaloneTag, gNamesNarrowTag, status), 1, standaloneNarrowWeeksEC); if (narrowWeeksEC == U_MISSING_RESOURCE_ERROR && standaloneNarrowWeeksEC != U_MISSING_RESOURCE_ERROR) { // If format/narrow not available, use standalone/narrow assignArray(fNarrowWeekdays, fNarrowWeekdaysCount, fStandaloneNarrowWeekdays, fStandaloneNarrowWeekdaysCount); } else if (narrowWeeksEC != U_MISSING_RESOURCE_ERROR && standaloneNarrowWeeksEC == U_MISSING_RESOURCE_ERROR) { // If standalone/narrow not available, use format/narrow assignArray(fStandaloneNarrowWeekdays, fStandaloneNarrowWeekdaysCount, fNarrowWeekdays, fNarrowWeekdaysCount); } else if (narrowWeeksEC == U_MISSING_RESOURCE_ERROR && standaloneNarrowWeeksEC == U_MISSING_RESOURCE_ERROR ) { // If neither is available, use format/abbreviated assignArray(fNarrowWeekdays, fNarrowWeekdaysCount, fShortWeekdays, fShortWeekdaysCount); assignArray(fStandaloneNarrowWeekdays, fStandaloneNarrowWeekdaysCount, fShortWeekdays, fShortWeekdaysCount); } // Last resort fallback in case previous data wasn't loaded if (U_FAILURE(status)) { if 
(useLastResortData) { // Handle the case in which there is no resource data present. // We don't have to generate usable patterns in this situation; // we just need to produce something that will be semi-intelligible // in most locales. status = U_USING_FALLBACK_WARNING; //TODO(fabalbon): make sure we are storing las resort data for all fields in here. initField(&fEras, fErasCount, (const char16_t *)gLastResortEras, kEraNum, kEraLen, status); initField(&fEraNames, fEraNamesCount, (const char16_t *)gLastResortEras, kEraNum, kEraLen, status); initField(&fNarrowEras, fNarrowErasCount, (const char16_t *)gLastResortEras, kEraNum, kEraLen, status); initField(&fMonths, fMonthsCount, (const char16_t *)gLastResortMonthNames, kMonthNum, kMonthLen, status); initField(&fShortMonths, fShortMonthsCount, (const char16_t *)gLastResortMonthNames, kMonthNum, kMonthLen, status); initField(&fNarrowMonths, fNarrowMonthsCount, (const char16_t *)gLastResortMonthNames, kMonthNum, kMonthLen, status); initField(&fStandaloneMonths, fStandaloneMonthsCount, (const char16_t *)gLastResortMonthNames, kMonthNum, kMonthLen, status); initField(&fStandaloneShortMonths, fStandaloneShortMonthsCount, (const char16_t *)gLastResortMonthNames, kMonthNum, kMonthLen, status); initField(&fStandaloneNarrowMonths, fStandaloneNarrowMonthsCount, (const char16_t *)gLastResortMonthNames, kMonthNum, kMonthLen, status); initField(&fWeekdays, fWeekdaysCount, (const char16_t *)gLastResortDayNames, kDayNum, kDayLen, status); initField(&fShortWeekdays, fShortWeekdaysCount, (const char16_t *)gLastResortDayNames, kDayNum, kDayLen, status); initField(&fShorterWeekdays, fShorterWeekdaysCount, (const char16_t *)gLastResortDayNames, kDayNum, kDayLen, status); initField(&fNarrowWeekdays, fNarrowWeekdaysCount, (const char16_t *)gLastResortDayNames, kDayNum, kDayLen, status); initField(&fStandaloneWeekdays, fStandaloneWeekdaysCount, (const char16_t *)gLastResortDayNames, kDayNum, kDayLen, status); 
initField(&fStandaloneShortWeekdays, fStandaloneShortWeekdaysCount, (const char16_t *)gLastResortDayNames, kDayNum, kDayLen, status); initField(&fStandaloneShorterWeekdays, fStandaloneShorterWeekdaysCount, (const char16_t *)gLastResortDayNames, kDayNum, kDayLen, status); initField(&fStandaloneNarrowWeekdays, fStandaloneNarrowWeekdaysCount, (const char16_t *)gLastResortDayNames, kDayNum, kDayLen, status); initField(&fAmPms, fAmPmsCount, (const char16_t *)gLastResortAmPmMarkers, kAmPmNum, kAmPmLen, status); initField(&fNarrowAmPms, fNarrowAmPmsCount, (const char16_t *)gLastResortAmPmMarkers, kAmPmNum, kAmPmLen, status); initField(&fQuarters, fQuartersCount, (const char16_t *)gLastResortQuarters, kQuarterNum, kQuarterLen, status); initField(&fShortQuarters, fShortQuartersCount, (const char16_t *)gLastResortQuarters, kQuarterNum, kQuarterLen, status); initField(&fNarrowQuarters, fNarrowQuartersCount, (const char16_t *)gLastResortQuarters, kQuarterNum, kQuarterLen, status); initField(&fStandaloneQuarters, fStandaloneQuartersCount, (const char16_t *)gLastResortQuarters, kQuarterNum, kQuarterLen, status); initField(&fStandaloneShortQuarters, fStandaloneShortQuartersCount, (const char16_t *)gLastResortQuarters, kQuarterNum, kQuarterLen, status); initField(&fStandaloneNarrowQuarters, fStandaloneNarrowQuartersCount, (const char16_t *)gLastResortQuarters, kQuarterNum, kQuarterLen, status); fLocalPatternChars.setTo(true, gPatternChars, PATTERN_CHARS_LEN); } } // Close resources ures_close(cb); ures_close(rb); } Locale DateFormatSymbols::getLocale(ULocDataLocaleType type, UErrorCode& status) const { U_LOCALE_BASED(locBased, *this); return locBased.getLocale(type, status); } U_NAMESPACE_END #endif /* #if !UCONFIG_NO_FORMATTING */ //eof stringi/src/icu74/i18n/dt_impl.h0000644000176200001440000000634514700200761016040 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others. 
// License & terms of use: http://www.unicode.org/copyright.html /* ******************************************************************************* * Copyright (C) 2007-2016, International Business Machines Corporation and * others. All Rights Reserved. ******************************************************************************* * * File dt_impl.h * ******************************************************************************* */ #ifndef DT_IMPL_H__ #define DT_IMPL_H__ /** * \file * \brief C++ API: Defines macros for interval format implementation */ #if !UCONFIG_NO_FORMATTING #include "unicode/unistr.h" #define QUOTE ((char16_t)0x0027) #define LOW_LINE ((char16_t)0x005F) #define COLON ((char16_t)0x003A) #define LEFT_CURLY_BRACKET ((char16_t)0x007B) #define RIGHT_CURLY_BRACKET ((char16_t)0x007D) #define SPACE ((char16_t)0x0020) #define EN_DASH ((char16_t)0x2013) #define SOLIDUS ((char16_t)0x002F) #define PERCENT ((char16_t)0x0025) #define DIGIT_ZERO ((char16_t)0x0030) #define DIGIT_ONE ((char16_t)0x0031) #define LOW_A ((char16_t)0x0061) #define LOW_B ((char16_t)0x0062) #define LOW_C ((char16_t)0x0063) #define LOW_D ((char16_t)0x0064) #define LOW_E ((char16_t)0x0065) #define LOW_F ((char16_t)0x0066) #define LOW_G ((char16_t)0x0067) #define LOW_H ((char16_t)0x0068) #define LOW_I ((char16_t)0x0069) #define LOW_J ((char16_t)0x006a) #define LOW_K ((char16_t)0x006B) #define LOW_L ((char16_t)0x006C) #define LOW_M ((char16_t)0x006D) #define LOW_N ((char16_t)0x006E) #define LOW_O ((char16_t)0x006F) #define LOW_P ((char16_t)0x0070) #define LOW_Q ((char16_t)0x0071) #define LOW_R ((char16_t)0x0072) #define LOW_S ((char16_t)0x0073) #define LOW_T ((char16_t)0x0074) #define LOW_U ((char16_t)0x0075) #define LOW_V ((char16_t)0x0076) #define LOW_W ((char16_t)0x0077) #define LOW_Y ((char16_t)0x0079) #define LOW_Z ((char16_t)0x007A) #define CAP_A ((char16_t)0x0041) #define CAP_C ((char16_t)0x0043) #define CAP_D ((char16_t)0x0044) #define CAP_E ((char16_t)0x0045) #define CAP_F 
((char16_t)0x0046) #define CAP_G ((char16_t)0x0047) #define CAP_H ((char16_t)0x0048) #define CAP_K ((char16_t)0x004B) #define CAP_L ((char16_t)0x004C) #define CAP_M ((char16_t)0x004D) #define CAP_N ((char16_t)0x004E) #define CAP_O ((char16_t)0x004F) #define CAP_P ((char16_t)0x0050) #define CAP_Q ((char16_t)0x0051) #define CAP_S ((char16_t)0x0053) #define CAP_T ((char16_t)0x0054) #define CAP_U ((char16_t)0x0055) #define CAP_V ((char16_t)0x0056) #define CAP_W ((char16_t)0x0057) #define CAP_Y ((char16_t)0x0059) #define CAP_Z ((char16_t)0x005A) #endif /* #if !UCONFIG_NO_FORMATTING */ #endif //eof stringi/src/icu74/i18n/number_scientific.cpp0000644000176200001440000001525014700200761020426 0ustar liggesusers// © 2017 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html #include "unicode/utypes.h" #if !UCONFIG_NO_FORMATTING #include #include "number_scientific.h" #include "number_utils.h" #include "formatted_string_builder.h" #include "unicode/unum.h" #include "number_microprops.h" using namespace icu; using namespace icu::number; using namespace icu::number::impl; // NOTE: The object lifecycle of ScientificModifier and ScientificHandler differ greatly in Java and C++. // // During formatting, we need to provide an object with state (the exponent) as the inner modifier. // // In Java, where the priority is put on reducing object creations, the unsafe code path re-uses the // ScientificHandler as a ScientificModifier, and the safe code path pre-computes 25 ScientificModifier // instances. This scheme reduces the number of object creations by 1 in both safe and unsafe. // // In C++, MicroProps provides a pre-allocated ScientificModifier, and ScientificHandler simply populates // the state (the exponent) into that ScientificModifier. There is no difference between safe and unsafe. 
ScientificModifier::ScientificModifier() : fExponent(0), fHandler(nullptr) {} void ScientificModifier::set(int32_t exponent, const ScientificHandler *handler) { // ScientificModifier should be set only once. U_ASSERT(fHandler == nullptr); fExponent = exponent; fHandler = handler; } int32_t ScientificModifier::apply(FormattedStringBuilder &output, int32_t /*leftIndex*/, int32_t rightIndex, UErrorCode &status) const { // FIXME: Localized exponent separator location. int i = rightIndex; // Append the exponent separator and sign i += output.insert( i, fHandler->fSymbols->getSymbol(DecimalFormatSymbols::ENumberFormatSymbol::kExponentialSymbol), {UFIELD_CATEGORY_NUMBER, UNUM_EXPONENT_SYMBOL_FIELD}, status); if (fExponent < 0 && fHandler->fSettings.fExponentSignDisplay != UNUM_SIGN_NEVER) { i += output.insert( i, fHandler->fSymbols ->getSymbol(DecimalFormatSymbols::ENumberFormatSymbol::kMinusSignSymbol), {UFIELD_CATEGORY_NUMBER, UNUM_EXPONENT_SIGN_FIELD}, status); } else if (fExponent >= 0 && fHandler->fSettings.fExponentSignDisplay == UNUM_SIGN_ALWAYS) { i += output.insert( i, fHandler->fSymbols ->getSymbol(DecimalFormatSymbols::ENumberFormatSymbol::kPlusSignSymbol), {UFIELD_CATEGORY_NUMBER, UNUM_EXPONENT_SIGN_FIELD}, status); } // Append the exponent digits (using a simple inline algorithm) int32_t disp = std::abs(fExponent); for (int j = 0; j < fHandler->fSettings.fMinExponentDigits || disp > 0; j++, disp /= 10) { auto d = static_cast(disp % 10); i += utils::insertDigitFromSymbols( output, i - j, d, *fHandler->fSymbols, {UFIELD_CATEGORY_NUMBER, UNUM_EXPONENT_FIELD}, status); } return i - rightIndex; } int32_t ScientificModifier::getPrefixLength() const { // TODO: Localized exponent separator location. return 0; } int32_t ScientificModifier::getCodePointCount() const { // NOTE: This method is only called one place, NumberRangeFormatterImpl. // The call site only cares about != 0 and != 1. 
// Return a very large value so that if this method is used elsewhere, we should notice. return 999; } bool ScientificModifier::isStrong() const { // Scientific is always strong return true; } bool ScientificModifier::containsField(Field field) const { (void)field; // This method is not used for inner modifiers. UPRV_UNREACHABLE_EXIT; } void ScientificModifier::getParameters(Parameters& output) const { // Not part of any plural sets output.obj = nullptr; } bool ScientificModifier::semanticallyEquivalent(const Modifier& other) const { auto* _other = dynamic_cast(&other); if (_other == nullptr) { return false; } // TODO: Check for locale symbols and settings as well? Could be less efficient. return fExponent == _other->fExponent; } // Note: Visual Studio does not compile this function without full name space. Why? icu::number::impl::ScientificHandler::ScientificHandler(const Notation *notation, const DecimalFormatSymbols *symbols, const MicroPropsGenerator *parent) : fSettings(notation->fUnion.scientific), fSymbols(symbols), fParent(parent) {} void ScientificHandler::processQuantity(DecimalQuantity &quantity, MicroProps µs, UErrorCode &status) const { fParent->processQuantity(quantity, micros, status); if (U_FAILURE(status)) { return; } // Do not apply scientific notation to special doubles if (quantity.isInfinite() || quantity.isNaN()) { micros.modInner = µs.helpers.emptyStrongModifier; return; } // Treat zero as if it had magnitude 0 int32_t exponent; if (quantity.isZeroish()) { if (fSettings.fRequireMinInt && micros.rounder.isSignificantDigits()) { // Show "00.000E0" on pattern "00.000E0" micros.rounder.apply(quantity, fSettings.fEngineeringInterval, status); exponent = 0; } else { micros.rounder.apply(quantity, status); exponent = 0; } } else { exponent = -micros.rounder.chooseMultiplierAndApply(quantity, *this, status); } // Use MicroProps's helper ScientificModifier and save it as the modInner. 
ScientificModifier &mod = micros.helpers.scientificModifier; mod.set(exponent, this); micros.modInner = &mod; // Change the exponent only after we select appropriate plural form // for formatting purposes so that we preserve expected formatted // string behavior. quantity.adjustExponent(exponent); // We already performed rounding. Do not perform it again. micros.rounder = RoundingImpl::passThrough(); } int32_t ScientificHandler::getMultiplier(int32_t magnitude) const { int32_t interval = fSettings.fEngineeringInterval; int32_t digitsShown; if (fSettings.fRequireMinInt) { // For patterns like "000.00E0" and ".00E0" digitsShown = interval; } else if (interval <= 1) { // For patterns like "0.00E0" and "@@@E0" digitsShown = 1; } else { // For patterns like "##0.00" digitsShown = ((magnitude % interval + interval) % interval) + 1; } return digitsShown - magnitude - 1; } #endif /* #if !UCONFIG_NO_FORMATTING */ stringi/src/icu74/i18n/units_router.cpp0000644000176200001440000001314514700200761017501 0ustar liggesusers// © 2020 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html #include "unicode/utypes.h" #if !UCONFIG_NO_FORMATTING #include "charstr.h" #include "cmemory.h" #include "cstring.h" #include "measunit_impl.h" #include "number_decimalquantity.h" #include "number_roundingutils.h" #include "resource.h" #include "unicode/measure.h" #include "units_data.h" #include "units_router.h" #include U_NAMESPACE_BEGIN namespace units { using number::Precision; using number::impl::parseIncrementOption; Precision UnitsRouter::parseSkeletonToPrecision(icu::UnicodeString precisionSkeleton, UErrorCode &status) { if (U_FAILURE(status)) { // As a member of UsagePrefsHandler, which is a friend of Precision, we // get access to the default constructor. 
return {}; } constexpr int32_t kSkelPrefixLen = 20; if (!precisionSkeleton.startsWith(UNICODE_STRING_SIMPLE("precision-increment/"))) { status = U_INVALID_FORMAT_ERROR; return {}; } U_ASSERT(precisionSkeleton[kSkelPrefixLen - 1] == u'/'); StringSegment segment(precisionSkeleton, false); segment.adjustOffset(kSkelPrefixLen); Precision result; parseIncrementOption(segment, result, status); return result; } UnitsRouter::UnitsRouter(StringPiece inputUnitIdentifier, const Locale &locale, StringPiece usage, UErrorCode &status) { this->init(MeasureUnit::forIdentifier(inputUnitIdentifier, status), locale, usage, status); } UnitsRouter::UnitsRouter(const MeasureUnit &inputUnit, const Locale &locale, StringPiece usage, UErrorCode &status) { this->init(std::move(inputUnit), locale, usage, status); } void UnitsRouter::init(const MeasureUnit &inputUnit, const Locale &locale, StringPiece usage, UErrorCode &status) { if (U_FAILURE(status)) { return; } // TODO: do we want to pass in ConversionRates and UnitPreferences instead // of loading in each UnitsRouter instance? (Or make global?) ConversionRates conversionRates(status); UnitPreferences prefs(status); MeasureUnitImpl inputUnitImpl = MeasureUnitImpl::forMeasureUnitMaybeCopy(inputUnit, status); MeasureUnitImpl baseUnitImpl = (extractCompoundBaseUnit(inputUnitImpl, conversionRates, status)); CharString category = getUnitQuantity(baseUnitImpl, status); if (U_FAILURE(status)) { return; } const MaybeStackVector unitPrefs = prefs.getPreferencesFor(category.toStringPiece(), usage, locale, status); for (int32_t i = 0, n = unitPrefs.length(); i < n; ++i) { U_ASSERT(unitPrefs[i] != nullptr); const auto preference = unitPrefs[i]; MeasureUnitImpl complexTargetUnitImpl = MeasureUnitImpl::forIdentifier(preference->unit.data(), status); if (U_FAILURE(status)) { return; } UnicodeString precision = preference->skeleton; // For now, we only have "precision-increment" in Units Preferences skeleton. 
// Therefore, we check if the skeleton starts with "precision-increment" and force the program to // fail otherwise. // NOTE: // It is allowed to have an empty precision. if (!precision.isEmpty() && !precision.startsWith(u"precision-increment", 19)) { status = U_INTERNAL_PROGRAM_ERROR; return; } outputUnits_.emplaceBackAndCheckErrorCode(status, complexTargetUnitImpl.copy(status).build(status)); converterPreferences_.emplaceBackAndCheckErrorCode(status, inputUnitImpl, complexTargetUnitImpl, preference->geq, std::move(precision), conversionRates, status); if (U_FAILURE(status)) { return; } } } RouteResult UnitsRouter::route(double quantity, icu::number::impl::RoundingImpl *rounder, UErrorCode &status) const { // Find the matching preference const ConverterPreference *converterPreference = nullptr; for (int32_t i = 0, n = converterPreferences_.length(); i < n; i++) { converterPreference = converterPreferences_[i]; if (converterPreference->converter.greaterThanOrEqual(std::abs(quantity) * (1 + DBL_EPSILON), converterPreference->limit)) { break; } } U_ASSERT(converterPreference != nullptr); // Set up the rounder for this preference's precision if (rounder != nullptr && rounder->fPrecision.isBogus()) { if (converterPreference->precision.length() > 0) { rounder->fPrecision = parseSkeletonToPrecision(converterPreference->precision, status); } else { // We use the same rounding mode as COMPACT notation: known to be a // human-friendly rounding mode: integers, but add a decimal digit // as needed to ensure we have at least 2 significant digits. rounder->fPrecision = Precision::integer().withMinDigits(2); } } return RouteResult(converterPreference->converter.convert(quantity, rounder, status), converterPreference->targetUnit.copy(status)); } const MaybeStackVector *UnitsRouter::getOutputUnits() const { // TODO: consider pulling this from converterPreferences_ and dropping // outputUnits_? 
return &outputUnits_; } } // namespace units U_NAMESPACE_END #endif /* #if !UCONFIG_NO_FORMATTING */ stringi/src/icu74/i18n/transreg.cpp0000644000176200001440000014250514700200761016567 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ********************************************************************** * Copyright (c) 2001-2014, International Business Machines * Corporation and others. All Rights Reserved. ********************************************************************** * Date Name Description * 08/10/2001 aliu Creation. ********************************************************************** */ #include "unicode/utypes.h" #include "unicode/rep.h" #if !UCONFIG_NO_TRANSLITERATION #include "unicode/translit.h" #include "unicode/resbund.h" #include "unicode/uniset.h" #include "unicode/uscript.h" #include "rbt.h" #include "cpdtrans.h" #include "nultrans.h" #include "transreg.h" #include "rbt_data.h" #include "rbt_pars.h" #include "tridpars.h" #include "charstr.h" #include "uassert.h" #include "locutil.h" // Enable the following symbol to add debugging code that tracks the // allocation, deletion, and use of Entry objects. BoundsChecker has // reported dangling pointer errors with these objects, but I have // been unable to confirm them. I suspect BoundsChecker is getting // confused with pointers going into and coming out of a UHashtable, // despite the hinting code that is designed to help it. 
// #define DEBUG_MEM #ifdef DEBUG_MEM #include #endif // char16_t constants static const char16_t LOCALE_SEP = 95; // '_' //static const char16_t ID_SEP = 0x002D; /*-*/ //static const char16_t VARIANT_SEP = 0x002F; // '/' // String constants static const char16_t ANY[] = { 0x41, 0x6E, 0x79, 0 }; // Any static const char16_t LAT[] = { 0x4C, 0x61, 0x74, 0 }; // Lat // empty string #define NO_VARIANT UnicodeString() // initial estimate for specDAG size // ICU 60 Transliterator::countAvailableSources() #define SPECDAG_INIT_SIZE 149 // initial estimate for number of variant names #define VARIANT_LIST_INIT_SIZE 11 #define VARIANT_LIST_MAX_SIZE 31 // initial estimate for availableIDs count (default estimate is 8 => multiple reallocs) // ICU 60 Transliterator::countAvailableIDs() #define AVAILABLE_IDS_INIT_SIZE 641 // initial estimate for number of targets for source "Any", "Lat" // ICU 60 Transliterator::countAvailableTargets("Any")/("Latn") #define ANY_TARGETS_INIT_SIZE 125 #define LAT_TARGETS_INIT_SIZE 23 /** * Resource bundle key for the RuleBasedTransliterator rule. 
*/ //static const char RB_RULE[] = "Rule"; U_NAMESPACE_BEGIN //------------------------------------------------------------------ // Alias //------------------------------------------------------------------ TransliteratorAlias::TransliteratorAlias(const UnicodeString& theAliasID, const UnicodeSet* cpdFilter) : ID(), aliasesOrRules(theAliasID), transes(0), compoundFilter(cpdFilter), direction(UTRANS_FORWARD), type(TransliteratorAlias::SIMPLE) { } TransliteratorAlias::TransliteratorAlias(const UnicodeString& theID, const UnicodeString& idBlocks, UVector* adoptedTransliterators, const UnicodeSet* cpdFilter) : ID(theID), aliasesOrRules(idBlocks), transes(adoptedTransliterators), compoundFilter(cpdFilter), direction(UTRANS_FORWARD), type(TransliteratorAlias::COMPOUND) { } TransliteratorAlias::TransliteratorAlias(const UnicodeString& theID, const UnicodeString& rules, UTransDirection dir) : ID(theID), aliasesOrRules(rules), transes(0), compoundFilter(0), direction(dir), type(TransliteratorAlias::RULES) { } TransliteratorAlias::~TransliteratorAlias() { delete transes; } Transliterator* TransliteratorAlias::create(UParseError& pe, UErrorCode& ec) { if (U_FAILURE(ec)) { return 0; } Transliterator *t = nullptr; switch (type) { case SIMPLE: t = Transliterator::createInstance(aliasesOrRules, UTRANS_FORWARD, pe, ec); if(U_FAILURE(ec)){ return 0; } if (compoundFilter != 0) t->adoptFilter(compoundFilter->clone()); break; case COMPOUND: { // the total number of transliterators in the compound is the total number of anonymous transliterators // plus the total number of ID blocks-- we start by assuming the list begins and ends with an ID // block and that each pair anonymous transliterators has an ID block between them. 
Then we go back // to see whether there really are ID blocks at the beginning and end (by looking for U+FFFF, which // marks the position where an anonymous transliterator goes) and adjust accordingly int32_t anonymousRBTs = transes->size(); UnicodeString noIDBlock((char16_t)(0xffff)); noIDBlock += ((char16_t)(0xffff)); int32_t pos = aliasesOrRules.indexOf(noIDBlock); while (pos >= 0) { pos = aliasesOrRules.indexOf(noIDBlock, pos + 1); } UVector transliterators(uprv_deleteUObject, nullptr, ec); UnicodeString idBlock; int32_t blockSeparatorPos = aliasesOrRules.indexOf((char16_t)(0xffff)); while (blockSeparatorPos >= 0) { aliasesOrRules.extract(0, blockSeparatorPos, idBlock); aliasesOrRules.remove(0, blockSeparatorPos + 1); if (!idBlock.isEmpty()) transliterators.adoptElement(Transliterator::createInstance(idBlock, UTRANS_FORWARD, pe, ec), ec); if (!transes->isEmpty()) transliterators.adoptElement(transes->orphanElementAt(0), ec); blockSeparatorPos = aliasesOrRules.indexOf((char16_t)(0xffff)); } if (!aliasesOrRules.isEmpty()) transliterators.adoptElement(Transliterator::createInstance(aliasesOrRules, UTRANS_FORWARD, pe, ec), ec); while (!transes->isEmpty()) transliterators.adoptElement(transes->orphanElementAt(0), ec); transliterators.setDeleter(nullptr); if (U_SUCCESS(ec)) { t = new CompoundTransliterator(ID, transliterators, (compoundFilter ? compoundFilter->clone() : nullptr), anonymousRBTs, pe, ec); if (t == 0) { ec = U_MEMORY_ALLOCATION_ERROR; return 0; } } else { for (int32_t i = 0; i < transliterators.size(); i++) delete (Transliterator*)(transliterators.elementAt(i)); } } break; case RULES: UPRV_UNREACHABLE_EXIT; // don't call create() if isRuleBased() returns true! 
} return t; } UBool TransliteratorAlias::isRuleBased() const { return type == RULES; } void TransliteratorAlias::parse(TransliteratorParser& parser, UParseError& pe, UErrorCode& ec) const { U_ASSERT(type == RULES); if (U_FAILURE(ec)) { return; } parser.parse(aliasesOrRules, direction, pe, ec); } //---------------------------------------------------------------------- // class TransliteratorSpec //---------------------------------------------------------------------- /** * A TransliteratorSpec is a string specifying either a source or a target. In more * general terms, it may also specify a variant, but we only use the * Spec class for sources and targets. * * A Spec may be a locale or a script. If it is a locale, it has a * fallback chain that goes xx_YY_ZZZ -> xx_YY -> xx -> ssss, where * ssss is the script mapping of xx_YY_ZZZ. The Spec API methods * hasFallback(), next(), and reset() iterate over this fallback * sequence. * * The Spec class canonicalizes itself, so the locale is put into * canonical form, or the script is transformed from an abbreviation * to a full name. 
*/ class TransliteratorSpec : public UMemory { public: TransliteratorSpec(const UnicodeString& spec); ~TransliteratorSpec(); const UnicodeString& get() const; UBool hasFallback() const; const UnicodeString& next(); void reset(); UBool isLocale() const; ResourceBundle& getBundle() const; operator const UnicodeString&() const { return get(); } const UnicodeString& getTop() const { return top; } private: void setupNext(); UnicodeString top; UnicodeString spec; UnicodeString nextSpec; UnicodeString scriptName; UBool isSpecLocale; // true if spec is a locale UBool isNextLocale; // true if nextSpec is a locale ResourceBundle* res; TransliteratorSpec(const TransliteratorSpec &other); // forbid copying of this class TransliteratorSpec &operator=(const TransliteratorSpec &other); // forbid copying of this class }; TransliteratorSpec::TransliteratorSpec(const UnicodeString& theSpec) : top(theSpec), res(0) { UErrorCode status = U_ZERO_ERROR; Locale topLoc(""); LocaleUtility::initLocaleFromName(theSpec, topLoc); if (!topLoc.isBogus()) { res = new ResourceBundle(U_ICUDATA_TRANSLIT, topLoc, status); /* test for nullptr */ if (res == 0) { return; } if (U_FAILURE(status) || status == U_USING_DEFAULT_WARNING) { delete res; res = 0; } } // Canonicalize script name -or- do locale->script mapping status = U_ZERO_ERROR; static const int32_t capacity = 10; UScriptCode script[capacity]={USCRIPT_INVALID_CODE}; int32_t num = uscript_getCode(CharString().appendInvariantChars(theSpec, status).data(), script, capacity, &status); if (num > 0 && script[0] != USCRIPT_INVALID_CODE) { scriptName = UnicodeString(uscript_getName(script[0]), -1, US_INV); } // Canonicalize top if (res != 0) { // Canonicalize locale name UnicodeString locStr; LocaleUtility::initNameFromLocale(topLoc, locStr); if (!locStr.isBogus()) { top = locStr; } } else if (scriptName.length() != 0) { // We are a script; use canonical name top = scriptName; } // assert(spec != top); reset(); } 
TransliteratorSpec::~TransliteratorSpec() { delete res; } UBool TransliteratorSpec::hasFallback() const { return nextSpec.length() != 0; } void TransliteratorSpec::reset() { if (spec != top) { spec = top; isSpecLocale = (res != 0); setupNext(); } } void TransliteratorSpec::setupNext() { isNextLocale = false; if (isSpecLocale) { nextSpec = spec; int32_t i = nextSpec.lastIndexOf(LOCALE_SEP); // If i == 0 then we have _FOO, so we fall through // to the scriptName. if (i > 0) { nextSpec.truncate(i); isNextLocale = true; } else { nextSpec = scriptName; // scriptName may be empty } } else { // spec is a script, so we are at the end nextSpec.truncate(0); } } // Protocol: // for(const UnicodeString& s(spec.get()); // spec.hasFallback(); s(spec.next())) { ... const UnicodeString& TransliteratorSpec::next() { spec = nextSpec; isSpecLocale = isNextLocale; setupNext(); return spec; } const UnicodeString& TransliteratorSpec::get() const { return spec; } UBool TransliteratorSpec::isLocale() const { return isSpecLocale; } ResourceBundle& TransliteratorSpec::getBundle() const { return *res; } //---------------------------------------------------------------------- #ifdef DEBUG_MEM // Vector of Entry pointers currently in use static UVector* DEBUG_entries = nullptr; static void DEBUG_setup() { if (DEBUG_entries == nullptr) { UErrorCode ec = U_ZERO_ERROR; DEBUG_entries = new UVector(ec); } } // Caller must call DEBUG_setup first. Return index of given Entry, // if it is in use (not deleted yet), or -1 if not found. 
static int DEBUG_findEntry(TransliteratorEntry* e) { for (int i=0; isize(); ++i) { if (e == (TransliteratorEntry*) DEBUG_entries->elementAt(i)) { return i; } } return -1; } // Track object creation static void DEBUG_newEntry(TransliteratorEntry* e) { DEBUG_setup(); if (DEBUG_findEntry(e) >= 0) { // This should really never happen unless the heap is broken printf("ERROR DEBUG_newEntry duplicate new pointer %08X\n", e); return; } UErrorCode ec = U_ZERO_ERROR; DEBUG_entries->addElement(e, ec); } // Track object deletion static void DEBUG_delEntry(TransliteratorEntry* e) { DEBUG_setup(); int i = DEBUG_findEntry(e); if (i < 0) { printf("ERROR DEBUG_delEntry possible double deletion %08X\n", e); return; } DEBUG_entries->removeElementAt(i); } // Track object usage static void DEBUG_useEntry(TransliteratorEntry* e) { if (e == nullptr) return; DEBUG_setup(); int i = DEBUG_findEntry(e); if (i < 0) { printf("ERROR DEBUG_useEntry possible dangling pointer %08X\n", e); } } #else // If we're not debugging then make these macros into NOPs #define DEBUG_newEntry(x) #define DEBUG_delEntry(x) #define DEBUG_useEntry(x) #endif //---------------------------------------------------------------------- // class Entry //---------------------------------------------------------------------- /** * The Entry object stores objects of different types and * singleton objects as placeholders for rule-based transliterators to * be built as needed. Instances of this struct can be placeholders, * can represent prototype transliterators to be cloned, or can * represent TransliteratorData objects. We don't support storing * classes in the registry because we don't have the rtti infrastructure * for it. We could easily add this if there is a need for it in the * future. 
*/ class TransliteratorEntry : public UMemory { public: enum Type { RULES_FORWARD, RULES_REVERSE, LOCALE_RULES, PROTOTYPE, RBT_DATA, COMPOUND_RBT, ALIAS, FACTORY, NONE // Only used for uninitialized entries } entryType; // NOTE: stringArg cannot go inside the union because // it has a copy constructor UnicodeString stringArg; // For RULES_*, ALIAS, COMPOUND_RBT int32_t intArg; // For COMPOUND_RBT, LOCALE_RULES UnicodeSet* compoundFilter; // For COMPOUND_RBT union { Transliterator* prototype; // For PROTOTYPE TransliterationRuleData* data; // For RBT_DATA UVector* dataVector; // For COMPOUND_RBT struct { Transliterator::Factory function; Transliterator::Token context; } factory; // For FACTORY } u; TransliteratorEntry(); ~TransliteratorEntry(); void adoptPrototype(Transliterator* adopted); void setFactory(Transliterator::Factory factory, Transliterator::Token context); private: TransliteratorEntry(const TransliteratorEntry &other); // forbid copying of this class TransliteratorEntry &operator=(const TransliteratorEntry &other); // forbid copying of this class }; TransliteratorEntry::TransliteratorEntry() { u.prototype = 0; compoundFilter = nullptr; entryType = NONE; DEBUG_newEntry(this); } TransliteratorEntry::~TransliteratorEntry() { DEBUG_delEntry(this); if (entryType == PROTOTYPE) { delete u.prototype; } else if (entryType == RBT_DATA) { // The data object is shared between instances of RBT. The // entry object owns it. It should only be deleted when the // transliterator component is being cleaned up. Doing so // invalidates any RBTs that the user has instantiated. 
delete u.data; } else if (entryType == COMPOUND_RBT) { while (u.dataVector != nullptr && !u.dataVector->isEmpty()) delete (TransliterationRuleData*)u.dataVector->orphanElementAt(0); delete u.dataVector; } delete compoundFilter; } void TransliteratorEntry::adoptPrototype(Transliterator* adopted) { if (entryType == PROTOTYPE) { delete u.prototype; } entryType = PROTOTYPE; u.prototype = adopted; } void TransliteratorEntry::setFactory(Transliterator::Factory factory, Transliterator::Token context) { if (entryType == PROTOTYPE) { delete u.prototype; } entryType = FACTORY; u.factory.function = factory; u.factory.context = context; } // UObjectDeleter for Hashtable::setValueDeleter U_CDECL_BEGIN static void U_CALLCONV deleteEntry(void* obj) { delete (TransliteratorEntry*) obj; } U_CDECL_END //---------------------------------------------------------------------- // class TransliteratorRegistry: Basic public API //---------------------------------------------------------------------- TransliteratorRegistry::TransliteratorRegistry(UErrorCode& status) : registry(true, status), specDAG(true, SPECDAG_INIT_SIZE, status), variantList(VARIANT_LIST_INIT_SIZE, status), availableIDs(true, AVAILABLE_IDS_INIT_SIZE, status) { registry.setValueDeleter(deleteEntry); variantList.setDeleter(uprv_deleteUObject); variantList.setComparer(uhash_compareCaselessUnicodeString); UnicodeString *emptyString = new UnicodeString(); if (emptyString != nullptr) { variantList.adoptElement(emptyString, status); } specDAG.setValueDeleter(uhash_deleteHashtable); } TransliteratorRegistry::~TransliteratorRegistry() { // Through the magic of C++, everything cleans itself up } Transliterator* TransliteratorRegistry::get(const UnicodeString& ID, TransliteratorAlias*& aliasReturn, UErrorCode& status) { U_ASSERT(aliasReturn == nullptr); TransliteratorEntry *entry = find(ID); return (entry == 0) ? 
0 : instantiateEntry(ID, entry, aliasReturn, status); } Transliterator* TransliteratorRegistry::reget(const UnicodeString& ID, TransliteratorParser& parser, TransliteratorAlias*& aliasReturn, UErrorCode& status) { U_ASSERT(aliasReturn == nullptr); TransliteratorEntry *entry = find(ID); if (entry == 0) { // We get to this point if there are two threads, one of which // is instantiating an ID, and another of which is removing // the same ID from the registry, and the timing is just right. return 0; } // The usage model for the caller is that they will first call // reg->get() inside the mutex, they'll get back an alias, they call // alias->isRuleBased(), and if they get true, they call alias->parse() // outside the mutex, then reg->reget() inside the mutex again. A real // mess, but it gets things working for ICU 3.0. [alan]. // Note: It's possible that in between the caller calling // alias->parse() and reg->reget(), that another thread will have // called reg->reget(), and the entry will already have been fixed up. // We have to detect this so we don't stomp over existing entry // data members and potentially leak memory (u.data and compoundFilter). 
if (entry->entryType == TransliteratorEntry::RULES_FORWARD || entry->entryType == TransliteratorEntry::RULES_REVERSE || entry->entryType == TransliteratorEntry::LOCALE_RULES) { if (parser.idBlockVector.isEmpty() && parser.dataVector.isEmpty()) { entry->u.data = 0; entry->entryType = TransliteratorEntry::ALIAS; entry->stringArg = UNICODE_STRING_SIMPLE("Any-nullptr"); } else if (parser.idBlockVector.isEmpty() && parser.dataVector.size() == 1) { entry->u.data = (TransliterationRuleData*)parser.dataVector.orphanElementAt(0); entry->entryType = TransliteratorEntry::RBT_DATA; } else if (parser.idBlockVector.size() == 1 && parser.dataVector.isEmpty()) { entry->stringArg = *(UnicodeString*)(parser.idBlockVector.elementAt(0)); entry->compoundFilter = parser.orphanCompoundFilter(); entry->entryType = TransliteratorEntry::ALIAS; } else { entry->entryType = TransliteratorEntry::COMPOUND_RBT; entry->compoundFilter = parser.orphanCompoundFilter(); entry->u.dataVector = new UVector(status); // TODO ICU-21701: missing check for nullptr and failed status. // Unclear how best to bail out. 
entry->stringArg.remove(); int32_t limit = parser.idBlockVector.size(); if (parser.dataVector.size() > limit) limit = parser.dataVector.size(); for (int32_t i = 0; i < limit; i++) { if (i < parser.idBlockVector.size()) { UnicodeString* idBlock = (UnicodeString*)parser.idBlockVector.elementAt(i); if (!idBlock->isEmpty()) entry->stringArg += *idBlock; } if (!parser.dataVector.isEmpty()) { TransliterationRuleData* data = (TransliterationRuleData*)parser.dataVector.orphanElementAt(0); entry->u.dataVector->addElement(data, status); if (U_FAILURE(status)) { delete data; } entry->stringArg += (char16_t)0xffff; // use U+FFFF to mark position of RBTs in ID block } } } } Transliterator *t = instantiateEntry(ID, entry, aliasReturn, status); return t; } void TransliteratorRegistry::put(Transliterator* adoptedProto, UBool visible, UErrorCode& ec) { TransliteratorEntry *entry = new TransliteratorEntry(); if (entry == nullptr) { ec = U_MEMORY_ALLOCATION_ERROR; return; } entry->adoptPrototype(adoptedProto); registerEntry(adoptedProto->getID(), entry, visible); } void TransliteratorRegistry::put(const UnicodeString& ID, Transliterator::Factory factory, Transliterator::Token context, UBool visible, UErrorCode& ec) { TransliteratorEntry *entry = new TransliteratorEntry(); if (entry == nullptr) { ec = U_MEMORY_ALLOCATION_ERROR; return; } entry->setFactory(factory, context); registerEntry(ID, entry, visible); } void TransliteratorRegistry::put(const UnicodeString& ID, const UnicodeString& resourceName, UTransDirection dir, UBool readonlyResourceAlias, UBool visible, UErrorCode& ec) { TransliteratorEntry *entry = new TransliteratorEntry(); if (entry == nullptr) { ec = U_MEMORY_ALLOCATION_ERROR; return; } entry->entryType = (dir == UTRANS_FORWARD) ? 
TransliteratorEntry::RULES_FORWARD : TransliteratorEntry::RULES_REVERSE; if (readonlyResourceAlias) { entry->stringArg.setTo(true, resourceName.getBuffer(), -1); } else { entry->stringArg = resourceName; } registerEntry(ID, entry, visible); } void TransliteratorRegistry::put(const UnicodeString& ID, const UnicodeString& alias, UBool readonlyAliasAlias, UBool visible, UErrorCode& /*ec*/) { TransliteratorEntry *entry = new TransliteratorEntry(); // Null pointer check if (entry != nullptr) { entry->entryType = TransliteratorEntry::ALIAS; if (readonlyAliasAlias) { entry->stringArg.setTo(true, alias.getBuffer(), -1); } else { entry->stringArg = alias; } registerEntry(ID, entry, visible); } } void TransliteratorRegistry::remove(const UnicodeString& ID) { UnicodeString source, target, variant; UBool sawSource; TransliteratorIDParser::IDtoSTV(ID, source, target, variant, sawSource); // Only need to do this if ID.indexOf('-') < 0 UnicodeString id; TransliteratorIDParser::STVtoID(source, target, variant, id); registry.remove(id); removeSTV(source, target, variant); availableIDs.remove(id); } //---------------------------------------------------------------------- // class TransliteratorRegistry: Public ID and spec management //---------------------------------------------------------------------- /** * == OBSOLETE - remove in ICU 3.4 == * Return the number of IDs currently registered with the system. * To retrieve the actual IDs, call getAvailableID(i) with * i from 0 to countAvailableIDs() - 1. */ int32_t TransliteratorRegistry::countAvailableIDs() const { return availableIDs.count(); } /** * == OBSOLETE - remove in ICU 3.4 == * Return the index-th available ID. index must be between 0 * and countAvailableIDs() - 1, inclusive. If index is out of * range, the result of getAvailableID(0) is returned. 
*/ const UnicodeString& TransliteratorRegistry::getAvailableID(int32_t index) const { if (index < 0 || index >= availableIDs.count()) { index = 0; } int32_t pos = UHASH_FIRST; const UHashElement *e = nullptr; while (index-- >= 0) { e = availableIDs.nextElement(pos); if (e == nullptr) { break; } } if (e != nullptr) { return *(UnicodeString*) e->key.pointer; } // If the code reaches here, the hash table was likely modified during iteration. // Return an statically initialized empty string due to reference return type. static UnicodeString empty; return empty; } StringEnumeration* TransliteratorRegistry::getAvailableIDs() const { return new Enumeration(*this); } int32_t TransliteratorRegistry::countAvailableSources() const { return specDAG.count(); } UnicodeString& TransliteratorRegistry::getAvailableSource(int32_t index, UnicodeString& result) const { int32_t pos = UHASH_FIRST; const UHashElement *e = 0; while (index-- >= 0) { e = specDAG.nextElement(pos); if (e == 0) { break; } } if (e == 0) { result.truncate(0); } else { result = *(UnicodeString*) e->key.pointer; } return result; } int32_t TransliteratorRegistry::countAvailableTargets(const UnicodeString& source) const { Hashtable *targets = (Hashtable*) specDAG.get(source); return (targets == 0) ? 
0 : targets->count(); } UnicodeString& TransliteratorRegistry::getAvailableTarget(int32_t index, const UnicodeString& source, UnicodeString& result) const { Hashtable *targets = (Hashtable*) specDAG.get(source); if (targets == 0) { result.truncate(0); // invalid source return result; } int32_t pos = UHASH_FIRST; const UHashElement *e = 0; while (index-- >= 0) { e = targets->nextElement(pos); if (e == 0) { break; } } if (e == 0) { result.truncate(0); // invalid index } else { result = *(UnicodeString*) e->key.pointer; } return result; } int32_t TransliteratorRegistry::countAvailableVariants(const UnicodeString& source, const UnicodeString& target) const { Hashtable *targets = (Hashtable*) specDAG.get(source); if (targets == 0) { return 0; } uint32_t varMask = targets->geti(target); int32_t varCount = 0; while (varMask > 0) { if (varMask & 1) { varCount++; } varMask >>= 1; } return varCount; } UnicodeString& TransliteratorRegistry::getAvailableVariant(int32_t index, const UnicodeString& source, const UnicodeString& target, UnicodeString& result) const { Hashtable *targets = (Hashtable*) specDAG.get(source); if (targets == 0) { result.truncate(0); // invalid source return result; } uint32_t varMask = targets->geti(target); int32_t varCount = 0; int32_t varListIndex = 0; while (varMask > 0) { if (varMask & 1) { if (varCount == index) { UnicodeString *v = (UnicodeString*) variantList.elementAt(varListIndex); if (v != nullptr) { result = *v; return result; } break; } varCount++; } varMask >>= 1; varListIndex++; } result.truncate(0); // invalid target or index return result; } //---------------------------------------------------------------------- // class TransliteratorRegistry::Enumeration //---------------------------------------------------------------------- TransliteratorRegistry::Enumeration::Enumeration(const TransliteratorRegistry& _reg) : pos(UHASH_FIRST), size(_reg.availableIDs.count()), reg(_reg) { } TransliteratorRegistry::Enumeration::~Enumeration() { } 
int32_t TransliteratorRegistry::Enumeration::count(UErrorCode& /*status*/) const { return size; } const UnicodeString* TransliteratorRegistry::Enumeration::snext(UErrorCode& status) { // This is sloppy but safe -- if we get out of sync with the underlying // registry, we will still return legal strings, but they might not // correspond to the snapshot at construction time. So there could be // duplicate IDs or omitted IDs if insertions or deletions occur in one // thread while another is iterating. To be more rigorous, add a timestamp, // which is incremented with any modification, and validate this iterator // against the timestamp at construction time. This probably isn't worth // doing as long as there is some possibility of removing this code in favor // of some new code based on Doug's service framework. if (U_FAILURE(status)) { return nullptr; } int32_t n = reg.availableIDs.count(); if (n != size) { status = U_ENUM_OUT_OF_SYNC_ERROR; return nullptr; } const UHashElement* element = reg.availableIDs.nextElement(pos); if (element == nullptr) { // If the code reaches this point, it means that it's out of sync // or the caller keeps asking for snext(). return nullptr; } // Copy the string! This avoids lifetime problems. unistr = *(const UnicodeString*) element->key.pointer; return &unistr; } void TransliteratorRegistry::Enumeration::reset(UErrorCode& /*status*/) { pos = UHASH_FIRST; size = reg.availableIDs.count(); } UOBJECT_DEFINE_RTTI_IMPLEMENTATION(TransliteratorRegistry::Enumeration) //---------------------------------------------------------------------- // class TransliteratorRegistry: internal //---------------------------------------------------------------------- /** * Convenience method. Calls 6-arg registerEntry(). 
*/ void TransliteratorRegistry::registerEntry(const UnicodeString& source, const UnicodeString& target, const UnicodeString& variant, TransliteratorEntry* adopted, UBool visible) { UnicodeString ID; UnicodeString s(source); if (s.length() == 0) { s.setTo(true, ANY, 3); } TransliteratorIDParser::STVtoID(source, target, variant, ID); registerEntry(ID, s, target, variant, adopted, visible); } /** * Convenience method. Calls 6-arg registerEntry(). */ void TransliteratorRegistry::registerEntry(const UnicodeString& ID, TransliteratorEntry* adopted, UBool visible) { UnicodeString source, target, variant; UBool sawSource; TransliteratorIDParser::IDtoSTV(ID, source, target, variant, sawSource); // Only need to do this if ID.indexOf('-') < 0 UnicodeString id; TransliteratorIDParser::STVtoID(source, target, variant, id); registerEntry(id, source, target, variant, adopted, visible); } /** * Register an entry object (adopted) with the given ID, source, * target, and variant strings. */ void TransliteratorRegistry::registerEntry(const UnicodeString& ID, const UnicodeString& source, const UnicodeString& target, const UnicodeString& variant, TransliteratorEntry* adopted, UBool visible) { UErrorCode status = U_ZERO_ERROR; registry.put(ID, adopted, status); if (visible) { registerSTV(source, target, variant); if (!availableIDs.containsKey(ID)) { availableIDs.puti(ID, /* unused value */ 1, status); } } else { removeSTV(source, target, variant); availableIDs.remove(ID); } } /** * Register a source-target/variant in the specDAG. Variant may be * empty, but source and target must not be. 
*/ void TransliteratorRegistry::registerSTV(const UnicodeString& source, const UnicodeString& target, const UnicodeString& variant) { // assert(source.length() > 0); // assert(target.length() > 0); UErrorCode status = U_ZERO_ERROR; Hashtable *targets = (Hashtable*) specDAG.get(source); if (targets == 0) { int32_t size = 3; if (source.compare(ANY,3) == 0) { size = ANY_TARGETS_INIT_SIZE; } else if (source.compare(LAT,3) == 0) { size = LAT_TARGETS_INIT_SIZE; } targets = new Hashtable(true, size, status); if (U_FAILURE(status) || targets == nullptr) { return; } specDAG.put(source, targets, status); } int32_t variantListIndex = variantList.indexOf((void*) &variant, 0); if (variantListIndex < 0) { if (variantList.size() >= VARIANT_LIST_MAX_SIZE) { // can't handle any more variants return; } UnicodeString *variantEntry = new UnicodeString(variant); if (variantEntry != nullptr) { variantList.adoptElement(variantEntry, status); if (U_SUCCESS(status)) { variantListIndex = variantList.size() - 1; } } if (variantListIndex < 0) { return; } } uint32_t addMask = 1 << variantListIndex; uint32_t varMask = targets->geti(target); targets->puti(target, varMask | addMask, status); } /** * Remove a source-target/variant from the specDAG. 
*/ void TransliteratorRegistry::removeSTV(const UnicodeString& source, const UnicodeString& target, const UnicodeString& variant) { // assert(source.length() > 0); // assert(target.length() > 0); UErrorCode status = U_ZERO_ERROR; Hashtable *targets = (Hashtable*) specDAG.get(source); if (targets == nullptr) { return; // should never happen for valid s-t/v } uint32_t varMask = targets->geti(target); if (varMask == 0) { return; // should never happen for valid s-t/v } int32_t variantListIndex = variantList.indexOf((void*) &variant, 0); if (variantListIndex < 0) { return; // should never happen for valid s-t/v } int32_t remMask = 1 << variantListIndex; varMask &= (~remMask); if (varMask != 0) { targets->puti(target, varMask, status); } else { targets->remove(target); // should delete variants if (targets->count() == 0) { specDAG.remove(source); // should delete targets } } } /** * Attempt to find a source-target/variant in the dynamic registry * store. Return 0 on failure. * * Caller does NOT own returned object. */ TransliteratorEntry* TransliteratorRegistry::findInDynamicStore(const TransliteratorSpec& src, const TransliteratorSpec& trg, const UnicodeString& variant) const { UnicodeString ID; TransliteratorIDParser::STVtoID(src, trg, variant, ID); TransliteratorEntry *e = (TransliteratorEntry*) registry.get(ID); DEBUG_useEntry(e); return e; } /** * Attempt to find a source-target/variant in the static locale * resource store. Do not perform fallback. Return 0 on failure. * * On success, create a new entry object, register it in the dynamic * store, and return a pointer to it, but do not make it public -- * just because someone requested something, we do not expand the * available ID list (or spec DAG). * * Caller does NOT own returned object. 
*/ TransliteratorEntry* TransliteratorRegistry::findInStaticStore(const TransliteratorSpec& src, const TransliteratorSpec& trg, const UnicodeString& variant) { TransliteratorEntry* entry = 0; if (src.isLocale()) { entry = findInBundle(src, trg, variant, UTRANS_FORWARD); } else if (trg.isLocale()) { entry = findInBundle(trg, src, variant, UTRANS_REVERSE); } // If we found an entry, store it in the Hashtable for next // time. if (entry != 0) { registerEntry(src.getTop(), trg.getTop(), variant, entry, false); } return entry; } // As of 2.0, resource bundle keys cannot contain '_' static const char16_t TRANSLITERATE_TO[] = {84,114,97,110,115,108,105,116,101,114,97,116,101,84,111,0}; // "TransliterateTo" static const char16_t TRANSLITERATE_FROM[] = {84,114,97,110,115,108,105,116,101,114,97,116,101,70,114,111,109,0}; // "TransliterateFrom" static const char16_t TRANSLITERATE[] = {84,114,97,110,115,108,105,116,101,114,97,116,101,0}; // "Transliterate" /** * Attempt to find an entry in a single resource bundle. This is * a one-sided lookup. findInStaticStore() performs up to two such * lookups, one for the source, and one for the target. * * Do not perform fallback. Return 0 on failure. * * On success, create a new Entry object, populate it, and return it. * The caller owns the returned object. */ TransliteratorEntry* TransliteratorRegistry::findInBundle(const TransliteratorSpec& specToOpen, const TransliteratorSpec& specToFind, const UnicodeString& variant, UTransDirection direction) { UnicodeString utag; UnicodeString resStr; int32_t pass; for (pass=0; pass<2; ++pass) { utag.truncate(0); // First try either TransliteratorTo_xxx or // TransliterateFrom_xxx, then try the bidirectional // Transliterate_xxx. This precedence order is arbitrary // but must be consistent and documented. if (pass == 0) { utag.append(direction == UTRANS_FORWARD ? 
TRANSLITERATE_TO : TRANSLITERATE_FROM, -1); } else { utag.append(TRANSLITERATE, -1); } UnicodeString s(specToFind.get()); utag.append(s.toUpper("")); UErrorCode status = U_ZERO_ERROR; ResourceBundle subres(specToOpen.getBundle().get( CharString().appendInvariantChars(utag, status).data(), status)); if (U_FAILURE(status) || status == U_USING_DEFAULT_WARNING) { continue; } s.truncate(0); if (specToOpen.get() != LocaleUtility::initNameFromLocale(subres.getLocale(), s)) { continue; } if (variant.length() != 0) { status = U_ZERO_ERROR; resStr = subres.getStringEx( CharString().appendInvariantChars(variant, status).data(), status); if (U_SUCCESS(status)) { // Exit loop successfully break; } } else { // Variant is empty, which means match the first variant listed. status = U_ZERO_ERROR; resStr = subres.getStringEx(1, status); if (U_SUCCESS(status)) { // Exit loop successfully break; } } } if (pass==2) { // Failed return nullptr; } // We have succeeded in loading a string from the locale // resources. Create a new registry entry to hold it and return it. TransliteratorEntry *entry = new TransliteratorEntry(); if (entry != 0) { // The direction is always forward for the // TransliterateTo_xxx and TransliterateFrom_xxx // items; those are unidirectional forward rules. // For the bidirectional Transliterate_xxx items, // the direction is the value passed in to this // function. int32_t dir = (pass == 0) ? UTRANS_FORWARD : direction; entry->entryType = TransliteratorEntry::LOCALE_RULES; entry->stringArg = resStr; entry->intArg = dir; } return entry; } /** * Convenience method. Calls 3-arg find(). */ TransliteratorEntry* TransliteratorRegistry::find(const UnicodeString& ID) { UnicodeString source, target, variant; UBool sawSource; TransliteratorIDParser::IDtoSTV(ID, source, target, variant, sawSource); return find(source, target, variant); } /** * Top-level find method. Attempt to find a source-target/variant in * either the dynamic or the static (locale resource) store. 
Perform * fallback. * * Lookup sequence for ss_SS_SSS-tt_TT_TTT/v: * * ss_SS_SSS-tt_TT_TTT/v -- in hashtable * ss_SS_SSS-tt_TT_TTT/v -- in ss_SS_SSS (no fallback) * * repeat with t = tt_TT_TTT, tt_TT, tt, and tscript * * ss_SS_SSS-t/ * * ss_SS-t/ * * ss-t/ * * sscript-t/ * * * Here * matches the first variant listed. * * Caller does NOT own returned object. Return 0 on failure. */ TransliteratorEntry* TransliteratorRegistry::find(UnicodeString& source, UnicodeString& target, UnicodeString& variant) { TransliteratorSpec src(source); TransliteratorSpec trg(target); TransliteratorEntry* entry; // Seek exact match in hashtable. Temporary fix for ICU 4.6. // TODO: The general logic for finding a matching transliterator needs to be reviewed. // ICU ticket #8089 UnicodeString ID; TransliteratorIDParser::STVtoID(source, target, variant, ID); entry = (TransliteratorEntry*) registry.get(ID); if (entry != 0) { // std::string ss; // std::cout << ID.toUTF8String(ss) << std::endl; return entry; } if (variant.length() != 0) { // Seek exact match in hashtable entry = findInDynamicStore(src, trg, variant); if (entry != 0) { return entry; } // Seek exact match in locale resources entry = findInStaticStore(src, trg, variant); if (entry != 0) { return entry; } } for (;;) { src.reset(); for (;;) { // Seek match in hashtable entry = findInDynamicStore(src, trg, NO_VARIANT); if (entry != 0) { return entry; } // Seek match in locale resources entry = findInStaticStore(src, trg, NO_VARIANT); if (entry != 0) { return entry; } if (!src.hasFallback()) { break; } src.next(); } if (!trg.hasFallback()) { break; } trg.next(); } return 0; } /** * Given an Entry object, instantiate it. Caller owns result. Return * 0 on failure. * * Return a non-empty aliasReturn value if the ID points to an alias. * We cannot instantiate it ourselves because the alias may contain * filters or compounds, which we do not understand. Caller should * make aliasReturn empty before calling. 
* * The entry object is assumed to reside in the dynamic store. It may be * modified. */ Transliterator* TransliteratorRegistry::instantiateEntry(const UnicodeString& ID, TransliteratorEntry *entry, TransliteratorAlias* &aliasReturn, UErrorCode& status) { Transliterator *t = 0; U_ASSERT(aliasReturn == 0); switch (entry->entryType) { case TransliteratorEntry::RBT_DATA: t = new RuleBasedTransliterator(ID, entry->u.data); if (t == 0) { status = U_MEMORY_ALLOCATION_ERROR; } return t; case TransliteratorEntry::PROTOTYPE: t = entry->u.prototype->clone(); if (t == 0) { status = U_MEMORY_ALLOCATION_ERROR; } return t; case TransliteratorEntry::ALIAS: aliasReturn = new TransliteratorAlias(entry->stringArg, entry->compoundFilter); if (aliasReturn == 0) { status = U_MEMORY_ALLOCATION_ERROR; } return 0; case TransliteratorEntry::FACTORY: t = entry->u.factory.function(ID, entry->u.factory.context); if (t == 0) { status = U_MEMORY_ALLOCATION_ERROR; } return t; case TransliteratorEntry::COMPOUND_RBT: { UVector* rbts = new UVector(uprv_deleteUObject, nullptr, entry->u.dataVector->size(), status); // Check for null pointer if (rbts == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; return nullptr; } int32_t passNumber = 1; for (int32_t i = 0; U_SUCCESS(status) && i < entry->u.dataVector->size(); i++) { // TODO: Should passNumber be turned into a decimal-string representation (1 -> "1")? 
Transliterator* tl = new RuleBasedTransliterator(UnicodeString(CompoundTransliterator::PASS_STRING) + UnicodeString(passNumber++), (TransliterationRuleData*)(entry->u.dataVector->elementAt(i)), false); if (tl == 0) status = U_MEMORY_ALLOCATION_ERROR; else rbts->adoptElement(tl, status); } if (U_FAILURE(status)) { delete rbts; return 0; } rbts->setDeleter(nullptr); aliasReturn = new TransliteratorAlias(ID, entry->stringArg, rbts, entry->compoundFilter); } if (aliasReturn == 0) { status = U_MEMORY_ALLOCATION_ERROR; } return 0; case TransliteratorEntry::LOCALE_RULES: aliasReturn = new TransliteratorAlias(ID, entry->stringArg, (UTransDirection) entry->intArg); if (aliasReturn == 0) { status = U_MEMORY_ALLOCATION_ERROR; } return 0; case TransliteratorEntry::RULES_FORWARD: case TransliteratorEntry::RULES_REVERSE: // Process the rule data into a TransliteratorRuleData object, // and possibly also into an ::id header and/or footer. Then // we modify the registry with the parsed data and retry. { TransliteratorParser parser(status); // We use the file name, taken from another resource bundle // 2-d array at static init time, as a locale language. We're // just using the locale mechanism to map through to a file // name; this in no way represents an actual locale. //CharString ch(entry->stringArg); //UResourceBundle *bundle = ures_openDirect(0, ch, &status); UnicodeString rules = entry->stringArg; //ures_close(bundle); //if (U_FAILURE(status)) { // We have a failure of some kind. Remove the ID from the // registry so we don't keep trying. NOTE: This will throw off // anyone who is, at the moment, trying to iterate over the // available IDs. That's acceptable since we should never // really get here except under installation, configuration, // or unrecoverable run time memory failures. // remove(ID); //} else { // If the status indicates a failure, then we don't have any // rules -- there is probably an installation error. 
The list // in the root locale should correspond to all the installed // transliterators; if it lists something that's not // installed, we'll get an error from ResourceBundle. aliasReturn = new TransliteratorAlias(ID, rules, ((entry->entryType == TransliteratorEntry::RULES_REVERSE) ? UTRANS_REVERSE : UTRANS_FORWARD)); if (aliasReturn == 0) { status = U_MEMORY_ALLOCATION_ERROR; } //} } return 0; default: UPRV_UNREACHABLE_EXIT; // can't get here } } U_NAMESPACE_END #endif /* #if !UCONFIG_NO_TRANSLITERATION */ //eof stringi/src/icu74/i18n/ulistformatter.cpp0000644000176200001440000001175114700200761020024 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ***************************************************************************************** * Copyright (C) 2015, International Business Machines * Corporation and others. All Rights Reserved. ***************************************************************************************** */ #include "unicode/utypes.h" #if !UCONFIG_NO_FORMATTING #include "unicode/ulistformatter.h" #include "unicode/listformatter.h" #include "unicode/localpointer.h" #include "cmemory.h" #include "formattedval_impl.h" U_NAMESPACE_USE U_CAPI UListFormatter* U_EXPORT2 ulistfmt_open(const char* locale, UErrorCode* status) { if (U_FAILURE(*status)) { return nullptr; } LocalPointer listfmt(ListFormatter::createInstance(Locale(locale), *status)); if (U_FAILURE(*status)) { return nullptr; } return (UListFormatter*)listfmt.orphan(); } U_CAPI UListFormatter* U_EXPORT2 ulistfmt_openForType(const char* locale, UListFormatterType type, UListFormatterWidth width, UErrorCode* status) { if (U_FAILURE(*status)) { return nullptr; } LocalPointer listfmt(ListFormatter::createInstance(Locale(locale), type, width, *status)); if (U_FAILURE(*status)) { return nullptr; } return (UListFormatter*)listfmt.orphan(); } U_CAPI void U_EXPORT2 ulistfmt_close(UListFormatter *listfmt) { delete 
(ListFormatter*)listfmt; } // Magic number: FLST in ASCII UPRV_FORMATTED_VALUE_CAPI_AUTO_IMPL( FormattedList, UFormattedList, UFormattedListImpl, UFormattedListApiHelper, ulistfmt, 0x464C5354) static UnicodeString* getUnicodeStrings( const char16_t* const strings[], const int32_t* stringLengths, int32_t stringCount, UnicodeString* length4StackBuffer, LocalArray& maybeOwner, UErrorCode& status) { U_ASSERT(U_SUCCESS(status)); if (stringCount < 0 || (strings == nullptr && stringCount > 0)) { status = U_ILLEGAL_ARGUMENT_ERROR; return nullptr; } UnicodeString* ustrings = length4StackBuffer; if (stringCount > 4) { maybeOwner.adoptInsteadAndCheckErrorCode(new UnicodeString[stringCount], status); if (U_FAILURE(status)) { return nullptr; } ustrings = maybeOwner.getAlias(); } if (stringLengths == nullptr) { for (int32_t stringIndex = 0; stringIndex < stringCount; stringIndex++) { ustrings[stringIndex].setTo(true, strings[stringIndex], -1); } } else { for (int32_t stringIndex = 0; stringIndex < stringCount; stringIndex++) { ustrings[stringIndex].setTo(stringLengths[stringIndex] < 0, strings[stringIndex], stringLengths[stringIndex]); } } return ustrings; } U_CAPI int32_t U_EXPORT2 ulistfmt_format(const UListFormatter* listfmt, const char16_t* const strings[], const int32_t * stringLengths, int32_t stringCount, char16_t* result, int32_t resultCapacity, UErrorCode* status) { if (U_FAILURE(*status)) { return -1; } if ((result == nullptr) ? 
resultCapacity != 0 : resultCapacity < 0) { *status = U_ILLEGAL_ARGUMENT_ERROR; return -1; } UnicodeString length4StackBuffer[4]; LocalArray maybeOwner; UnicodeString* ustrings = getUnicodeStrings( strings, stringLengths, stringCount, length4StackBuffer, maybeOwner, *status); if (U_FAILURE(*status)) { return -1; } UnicodeString res; if (result != nullptr) { // nullptr destination for pure preflighting: empty dummy string // otherwise, alias the destination buffer (copied from udat_format) res.setTo(result, 0, resultCapacity); } reinterpret_cast(listfmt)->format( ustrings, stringCount, res, *status ); return res.extract(result, resultCapacity, *status); } U_CAPI void U_EXPORT2 ulistfmt_formatStringsToResult( const UListFormatter* listfmt, const char16_t* const strings[], const int32_t * stringLengths, int32_t stringCount, UFormattedList* uresult, UErrorCode* status) { auto* result = UFormattedListApiHelper::validate(uresult, *status); if (U_FAILURE(*status)) { return; } UnicodeString length4StackBuffer[4]; LocalArray maybeOwner; UnicodeString* ustrings = getUnicodeStrings( strings, stringLengths, stringCount, length4StackBuffer, maybeOwner, *status); if (U_FAILURE(*status)) { return; } result->fImpl = reinterpret_cast(listfmt) ->formatStringsToValue(ustrings, stringCount, *status); } #endif /* #if !UCONFIG_NO_FORMATTING */ stringi/src/icu74/i18n/islamcal.cpp0000644000176200001440000012114714700200761016526 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ****************************************************************************** * Copyright (C) 2003-2015, International Business Machines Corporation * and others. All Rights Reserved. 
****************************************************************************** * * File ISLAMCAL.H * * Modification History: * * Date Name Description * 10/14/2003 srl ported from java IslamicCalendar ***************************************************************************** */ #include "islamcal.h" #if !UCONFIG_NO_FORMATTING #include "umutex.h" #include #include "gregoimp.h" // Math #include "astro.h" // CalendarAstronomer #include "uhash.h" #include "ucln_in.h" #include "uassert.h" static const UDate HIJRA_MILLIS = -42521587200000.0; // 7/16/622 AD 00:00 // Debugging #ifdef U_DEBUG_ISLAMCAL # include # include static void debug_islamcal_loc(const char *f, int32_t l) { fprintf(stderr, "%s:%d: ", f, l); } static void debug_islamcal_msg(const char *pat, ...) { va_list ap; va_start(ap, pat); vfprintf(stderr, pat, ap); fflush(stderr); } // must use double parens, i.e.: U_DEBUG_ISLAMCAL_MSG(("four is: %d",4)); #define U_DEBUG_ISLAMCAL_MSG(x) {debug_islamcal_loc(__FILE__,__LINE__);debug_islamcal_msg x;} #else #define U_DEBUG_ISLAMCAL_MSG(x) #endif // --- The cache -- // cache of months static icu::CalendarCache *gMonthCache = nullptr; static icu::CalendarAstronomer *gIslamicCalendarAstro = nullptr; U_CDECL_BEGIN static UBool calendar_islamic_cleanup() { if (gMonthCache) { delete gMonthCache; gMonthCache = nullptr; } if (gIslamicCalendarAstro) { delete gIslamicCalendarAstro; gIslamicCalendarAstro = nullptr; } return true; } U_CDECL_END U_NAMESPACE_BEGIN // Implementation of the IslamicCalendar class /** * Friday EPOC */ static const int32_t CIVIL_EPOC = 1948440; // CE 622 July 16 Friday (Julian calendar) / CE 622 July 19 (Gregorian calendar) /** * Thursday EPOC */ static const int32_t ASTRONOMICAL_EPOC = 1948439; // CE 622 July 15 Thursday (Julian calendar) static const int32_t UMALQURA_YEAR_START = 1300; static const int32_t UMALQURA_YEAR_END = 1600; static const int UMALQURA_MONTHLENGTH[] = { //* 1300 -1302 */ "1010 1010 1010", "1101 0101 0100", "1110 1100 1001", 
0x0AAA, 0x0D54, 0x0EC9, //* 1303 -1307 */ "0110 1101 0100", "0110 1110 1010", "0011 0110 1100", "1010 1010 1101", "0101 0101 0101", 0x06D4, 0x06EA, 0x036C, 0x0AAD, 0x0555, //* 1308 -1312 */ "0110 1010 1001", "0111 1001 0010", "1011 1010 1001", "0101 1101 0100", "1010 1101 1010", 0x06A9, 0x0792, 0x0BA9, 0x05D4, 0x0ADA, //* 1313 -1317 */ "0101 0101 1100", "1101 0010 1101", "0110 1001 0101", "0111 0100 1010", "1011 0101 0100", 0x055C, 0x0D2D, 0x0695, 0x074A, 0x0B54, //* 1318 -1322 */ "1011 0110 1010", "0101 1010 1101", "0100 1010 1110", "1010 0100 1111", "0101 0001 0111", 0x0B6A, 0x05AD, 0x04AE, 0x0A4F, 0x0517, //* 1323 -1327 */ "0110 1000 1011", "0110 1010 0101", "1010 1101 0101", "0010 1101 0110", "1001 0101 1011", 0x068B, 0x06A5, 0x0AD5, 0x02D6, 0x095B, //* 1328 -1332 */ "0100 1001 1101", "1010 0100 1101", "1101 0010 0110", "1101 1001 0101", "0101 1010 1100", 0x049D, 0x0A4D, 0x0D26, 0x0D95, 0x05AC, //* 1333 -1337 */ "1001 1011 0110", "0010 1011 1010", "1010 0101 1011", "0101 0010 1011", "1010 1001 0101", 0x09B6, 0x02BA, 0x0A5B, 0x052B, 0x0A95, //* 1338 -1342 */ "0110 1100 1010", "1010 1110 1001", "0010 1111 0100", "1001 0111 0110", "0010 1011 0110", 0x06CA, 0x0AE9, 0x02F4, 0x0976, 0x02B6, //* 1343 -1347 */ "1001 0101 0110", "1010 1100 1010", "1011 1010 0100", "1011 1101 0010", "0101 1101 1001", 0x0956, 0x0ACA, 0x0BA4, 0x0BD2, 0x05D9, //* 1348 -1352 */ "0010 1101 1100", "1001 0110 1101", "0101 0100 1101", "1010 1010 0101", "1011 0101 0010", 0x02DC, 0x096D, 0x054D, 0x0AA5, 0x0B52, //* 1353 -1357 */ "1011 1010 0101", "0101 1011 0100", "1001 1011 0110", "0101 0101 0111", "0010 1001 0111", 0x0BA5, 0x05B4, 0x09B6, 0x0557, 0x0297, //* 1358 -1362 */ "0101 0100 1011", "0110 1010 0011", "0111 0101 0010", "1011 0110 0101", "0101 0110 1010", 0x054B, 0x06A3, 0x0752, 0x0B65, 0x056A, //* 1363 -1367 */ "1010 1010 1011", "0101 0010 1011", "1100 1001 0101", "1101 0100 1010", "1101 1010 0101", 0x0AAB, 0x052B, 0x0C95, 0x0D4A, 0x0DA5, //* 1368 -1372 */ "0101 1100 1010", "1010 1101 
0110", "1001 0101 0111", "0100 1010 1011", "1001 0100 1011", 0x05CA, 0x0AD6, 0x0957, 0x04AB, 0x094B, //* 1373 -1377 */ "1010 1010 0101", "1011 0101 0010", "1011 0110 1010", "0101 0111 0101", "0010 0111 0110", 0x0AA5, 0x0B52, 0x0B6A, 0x0575, 0x0276, //* 1378 -1382 */ "1000 1011 0111", "0100 0101 1011", "0101 0101 0101", "0101 1010 1001", "0101 1011 0100", 0x08B7, 0x045B, 0x0555, 0x05A9, 0x05B4, //* 1383 -1387 */ "1001 1101 1010", "0100 1101 1101", "0010 0110 1110", "1001 0011 0110", "1010 1010 1010", 0x09DA, 0x04DD, 0x026E, 0x0936, 0x0AAA, //* 1388 -1392 */ "1101 0101 0100", "1101 1011 0010", "0101 1101 0101", "0010 1101 1010", "1001 0101 1011", 0x0D54, 0x0DB2, 0x05D5, 0x02DA, 0x095B, //* 1393 -1397 */ "0100 1010 1011", "1010 0101 0101", "1011 0100 1001", "1011 0110 0100", "1011 0111 0001", 0x04AB, 0x0A55, 0x0B49, 0x0B64, 0x0B71, //* 1398 -1402 */ "0101 1011 0100", "1010 1011 0101", "1010 0101 0101", "1101 0010 0101", "1110 1001 0010", 0x05B4, 0x0AB5, 0x0A55, 0x0D25, 0x0E92, //* 1403 -1407 */ "1110 1100 1001", "0110 1101 0100", "1010 1110 1001", "1001 0110 1011", "0100 1010 1011", 0x0EC9, 0x06D4, 0x0AE9, 0x096B, 0x04AB, //* 1408 -1412 */ "1010 1001 0011", "1101 0100 1001", "1101 1010 0100", "1101 1011 0010", "1010 1011 1001", 0x0A93, 0x0D49, 0x0DA4, 0x0DB2, 0x0AB9, //* 1413 -1417 */ "0100 1011 1010", "1010 0101 1011", "0101 0010 1011", "1010 1001 0101", "1011 0010 1010", 0x04BA, 0x0A5B, 0x052B, 0x0A95, 0x0B2A, //* 1418 -1422 */ "1011 0101 0101", "0101 0101 1100", "0100 1011 1101", "0010 0011 1101", "1001 0001 1101", 0x0B55, 0x055C, 0x04BD, 0x023D, 0x091D, //* 1423 -1427 */ "1010 1001 0101", "1011 0100 1010", "1011 0101 1010", "0101 0110 1101", "0010 1011 0110", 0x0A95, 0x0B4A, 0x0B5A, 0x056D, 0x02B6, //* 1428 -1432 */ "1001 0011 1011", "0100 1001 1011", "0110 0101 0101", "0110 1010 1001", "0111 0101 0100", 0x093B, 0x049B, 0x0655, 0x06A9, 0x0754, //* 1433 -1437 */ "1011 0110 1010", "0101 0110 1100", "1010 1010 1101", "0101 0101 0101", "1011 0010 1001", 0x0B6A, 
0x056C, 0x0AAD, 0x0555, 0x0B29, //* 1438 -1442 */ "1011 1001 0010", "1011 1010 1001", "0101 1101 0100", "1010 1101 1010", "0101 0101 1010", 0x0B92, 0x0BA9, 0x05D4, 0x0ADA, 0x055A, //* 1443 -1447 */ "1010 1010 1011", "0101 1001 0101", "0111 0100 1001", "0111 0110 0100", "1011 1010 1010", 0x0AAB, 0x0595, 0x0749, 0x0764, 0x0BAA, //* 1448 -1452 */ "0101 1011 0101", "0010 1011 0110", "1010 0101 0110", "1110 0100 1101", "1011 0010 0101", 0x05B5, 0x02B6, 0x0A56, 0x0E4D, 0x0B25, //* 1453 -1457 */ "1011 0101 0010", "1011 0110 1010", "0101 1010 1101", "0010 1010 1110", "1001 0010 1111", 0x0B52, 0x0B6A, 0x05AD, 0x02AE, 0x092F, //* 1458 -1462 */ "0100 1001 0111", "0110 0100 1011", "0110 1010 0101", "0110 1010 1100", "1010 1101 0110", 0x0497, 0x064B, 0x06A5, 0x06AC, 0x0AD6, //* 1463 -1467 */ "0101 0101 1101", "0100 1001 1101", "1010 0100 1101", "1101 0001 0110", "1101 1001 0101", 0x055D, 0x049D, 0x0A4D, 0x0D16, 0x0D95, //* 1468 -1472 */ "0101 1010 1010", "0101 1011 0101", "0010 1101 1010", "1001 0101 1011", "0100 1010 1101", 0x05AA, 0x05B5, 0x02DA, 0x095B, 0x04AD, //* 1473 -1477 */ "0101 1001 0101", "0110 1100 1010", "0110 1110 0100", "1010 1110 1010", "0100 1111 0101", 0x0595, 0x06CA, 0x06E4, 0x0AEA, 0x04F5, //* 1478 -1482 */ "0010 1011 0110", "1001 0101 0110", "1010 1010 1010", "1011 0101 0100", "1011 1101 0010", 0x02B6, 0x0956, 0x0AAA, 0x0B54, 0x0BD2, //* 1483 -1487 */ "0101 1101 1001", "0010 1110 1010", "1001 0110 1101", "0100 1010 1101", "1010 1001 0101", 0x05D9, 0x02EA, 0x096D, 0x04AD, 0x0A95, //* 1488 -1492 */ "1011 0100 1010", "1011 1010 0101", "0101 1011 0010", "1001 1011 0101", "0100 1101 0110", 0x0B4A, 0x0BA5, 0x05B2, 0x09B5, 0x04D6, //* 1493 -1497 */ "1010 1001 0111", "0101 0100 0111", "0110 1001 0011", "0111 0100 1001", "1011 0101 0101", 0x0A97, 0x0547, 0x0693, 0x0749, 0x0B55, //* 1498 -1508 */ "0101 0110 1010", "1010 0110 1011", "0101 0010 1011", "1010 1000 1011", "1101 0100 0110", "1101 1010 0011", "0101 1100 1010", "1010 1101 0110", "0100 1101 1011", "0010 0110 
1011", "1001 0100 1011", 0x056A, 0x0A6B, 0x052B, 0x0A8B, 0x0D46, 0x0DA3, 0x05CA, 0x0AD6, 0x04DB, 0x026B, 0x094B, //* 1509 -1519 */ "1010 1010 0101", "1011 0101 0010", "1011 0110 1001", "0101 0111 0101", "0001 0111 0110", "1000 1011 0111", "0010 0101 1011", "0101 0010 1011", "0101 0110 0101", "0101 1011 0100", "1001 1101 1010", 0x0AA5, 0x0B52, 0x0B69, 0x0575, 0x0176, 0x08B7, 0x025B, 0x052B, 0x0565, 0x05B4, 0x09DA, //* 1520 -1530 */ "0100 1110 1101", "0001 0110 1101", "1000 1011 0110", "1010 1010 0110", "1101 0101 0010", "1101 1010 1001", "0101 1101 0100", "1010 1101 1010", "1001 0101 1011", "0100 1010 1011", "0110 0101 0011", 0x04ED, 0x016D, 0x08B6, 0x0AA6, 0x0D52, 0x0DA9, 0x05D4, 0x0ADA, 0x095B, 0x04AB, 0x0653, //* 1531 -1541 */ "0111 0010 1001", "0111 0110 0010", "1011 1010 1001", "0101 1011 0010", "1010 1011 0101", "0101 0101 0101", "1011 0010 0101", "1101 1001 0010", "1110 1100 1001", "0110 1101 0010", "1010 1110 1001", 0x0729, 0x0762, 0x0BA9, 0x05B2, 0x0AB5, 0x0555, 0x0B25, 0x0D92, 0x0EC9, 0x06D2, 0x0AE9, //* 1542 -1552 */ "0101 0110 1011", "0100 1010 1011", "1010 0101 0101", "1101 0010 1001", "1101 0101 0100", "1101 1010 1010", "1001 1011 0101", "0100 1011 1010", "1010 0011 1011", "0100 1001 1011", "1010 0100 1101", 0x056B, 0x04AB, 0x0A55, 0x0D29, 0x0D54, 0x0DAA, 0x09B5, 0x04BA, 0x0A3B, 0x049B, 0x0A4D, //* 1553 -1563 */ "1010 1010 1010", "1010 1101 0101", "0010 1101 1010", "1001 0101 1101", "0100 0101 1110", "1010 0010 1110", "1100 1001 1010", "1101 0101 0101", "0110 1011 0010", "0110 1011 1001", "0100 1011 1010", 0x0AAA, 0x0AD5, 0x02DA, 0x095D, 0x045E, 0x0A2E, 0x0C9A, 0x0D55, 0x06B2, 0x06B9, 0x04BA, //* 1564 -1574 */ "1010 0101 1101", "0101 0010 1101", "1010 1001 0101", "1011 0101 0010", "1011 1010 1000", "1011 1011 0100", "0101 1011 1001", "0010 1101 1010", "1001 0101 1010", "1011 0100 1010", "1101 1010 0100", 0x0A5D, 0x052D, 0x0A95, 0x0B52, 0x0BA8, 0x0BB4, 0x05B9, 0x02DA, 0x095A, 0x0B4A, 0x0DA4, //* 1575 -1585 */ "1110 1101 0001", "0110 1110 1000", "1011 
0110 1010", "0101 0110 1101", "0101 0011 0101", "0110 1001 0101", "1101 0100 1010", "1101 1010 1000", "1101 1101 0100", "0110 1101 1010", "0101 0101 1011", 0x0ED1, 0x06E8, 0x0B6A, 0x056D, 0x0535, 0x0695, 0x0D4A, 0x0DA8, 0x0DD4, 0x06DA, 0x055B, //* 1586 -1596 */ "0010 1001 1101", "0110 0010 1011", "1011 0001 0101", "1011 0100 1010", "1011 1001 0101", "0101 1010 1010", "1010 1010 1110", "1001 0010 1110", "1100 1000 1111", "0101 0010 0111", "0110 1001 0101", 0x029D, 0x062B, 0x0B15, 0x0B4A, 0x0B95, 0x05AA, 0x0AAE, 0x092E, 0x0C8F, 0x0527, 0x0695, //* 1597 -1600 */ "0110 1010 1010", "1010 1101 0110", "0101 0101 1101", "0010 1001 1101", }; 0x06AA, 0x0AD6, 0x055D, 0x029D }; int32_t getUmalqura_MonthLength(int32_t y, int32_t m) { int32_t mask = (int32_t) (0x01 << (11 - m)); // set mask for bit corresponding to month if((UMALQURA_MONTHLENGTH[y] & mask) == 0 ) return 29; else return 30; } //------------------------------------------------------------------------- // Constructors... //------------------------------------------------------------------------- const char *IslamicCalendar::getType() const { return "islamic"; } IslamicCalendar* IslamicCalendar::clone() const { return new IslamicCalendar(*this); } IslamicCalendar::IslamicCalendar(const Locale& aLocale, UErrorCode& success) : Calendar(TimeZone::forLocaleOrDefault(aLocale), aLocale, success) { setTimeInMillis(getNow(), success); // Call this again now that the vtable is set up properly. } IslamicCalendar::~IslamicCalendar() { } //------------------------------------------------------------------------- // Minimum / Maximum access functions //------------------------------------------------------------------------- // Note: Current IslamicCalendar implementation does not work // well with negative years. // TODO: In some cases the current ICU Islamic calendar implementation shows // a month as having 31 days. 
// Since date parsing now uses range checks based
// on the table below, we need to change the range for last day of month to
// include 31 as a workaround until the implementation is fixed.
//
// One row per calendar field, one column per limit type; handleGetLimit()
// below indexes this table as LIMITS[field][limitType].
static const int32_t LIMITS[UCAL_FIELD_COUNT][4] = {
    // Minimum  Greatest    Least  Maximum
    //           Minimum  Maximum
    {        0,        0,        0,        0}, // ERA
    {        1,        1,  5000000,  5000000}, // YEAR
    {        0,        0,       11,       11}, // MONTH
    {        1,        1,       50,       51}, // WEEK_OF_YEAR
    {/*N/A*/-1,/*N/A*/-1,/*N/A*/-1,/*N/A*/-1}, // WEEK_OF_MONTH
    {        1,        1,       29,       31}, // DAY_OF_MONTH - 31 to workaround for cal implementation bug, should be 30
    {        1,        1,      354,      355}, // DAY_OF_YEAR
    {/*N/A*/-1,/*N/A*/-1,/*N/A*/-1,/*N/A*/-1}, // DAY_OF_WEEK
    {       -1,       -1,        5,        5}, // DAY_OF_WEEK_IN_MONTH
    {/*N/A*/-1,/*N/A*/-1,/*N/A*/-1,/*N/A*/-1}, // AM_PM
    {/*N/A*/-1,/*N/A*/-1,/*N/A*/-1,/*N/A*/-1}, // HOUR
    {/*N/A*/-1,/*N/A*/-1,/*N/A*/-1,/*N/A*/-1}, // HOUR_OF_DAY
    {/*N/A*/-1,/*N/A*/-1,/*N/A*/-1,/*N/A*/-1}, // MINUTE
    {/*N/A*/-1,/*N/A*/-1,/*N/A*/-1,/*N/A*/-1}, // SECOND
    {/*N/A*/-1,/*N/A*/-1,/*N/A*/-1,/*N/A*/-1}, // MILLISECOND
    {/*N/A*/-1,/*N/A*/-1,/*N/A*/-1,/*N/A*/-1}, // ZONE_OFFSET
    {/*N/A*/-1,/*N/A*/-1,/*N/A*/-1,/*N/A*/-1}, // DST_OFFSET
    {        1,        1,  5000000,  5000000}, // YEAR_WOY
    {/*N/A*/-1,/*N/A*/-1,/*N/A*/-1,/*N/A*/-1}, // DOW_LOCAL
    {        1,        1,  5000000,  5000000}, // EXTENDED_YEAR
    {/*N/A*/-1,/*N/A*/-1,/*N/A*/-1,/*N/A*/-1}, // JULIAN_DAY
    {/*N/A*/-1,/*N/A*/-1,/*N/A*/-1,/*N/A*/-1}, // MILLISECONDS_IN_DAY
    {/*N/A*/-1,/*N/A*/-1,/*N/A*/-1,/*N/A*/-1}, // IS_LEAP_MONTH
    {        0,        0,       11,       11}, // ORDINAL_MONTH
};

/**
 * Look up a field limit in the LIMITS table.
 *
 * NOTE(review): the table is indexed directly with no range check on
 * either argument -- presumably callers only pass valid enum values;
 * confirm against the Calendar base class.
 *
 * @param field     The calendar field (row of LIMITS)
 * @param limitType Which limit is wanted (column of LIMITS)
 * @return The table entry; -1 marks rows flagged N/A in the table
 * @draft ICU 2.4
 */
int32_t IslamicCalendar::handleGetLimit(UCalendarDateFields field, ELimitType limitType) const {
    return LIMITS[field][limitType];
}

//-------------------------------------------------------------------------
// Assorted calculation utilities
//

// we could compress this down more if we need to
//
// Per the row comments, one correction value (-1, 0 or 1) per year from
// 1300 through 1600, ten years per row.  The code consuming this table is
// not in this part of the file.
static const int8_t umAlQuraYrStartEstimateFix[] = {
     0,  0, -1,  0, -1,  0,  0,  0,  0,  0, // 1300..
    -1,  0,  0,  0,  0,  0,  0,  0, -1,  0, // 1310..
     1,  0,  1,  1,  0,  0,  0,  0,  1,  0, // 1320..
     0,  0,  0,  0,  0,  0,  1,  0,  0,  0, // 1330..
     0,  0,  1,  0,  0, -1, -1,  0,  0,  0, // 1340..
     1,  0,  0, -1,  0,  0,  0,  1,  1,  0, // 1350..
     0,  0,  0,  0,  0,  0,  0, -1,  0,  0, // 1360..
     0,  1,  1,  0,  0, -1,  0,  1,  0,  1, // 1370..
     1,  0,  0, -1,  0,  1,  0,  0,  0, -1, // 1380..
     0,  1,  0,  1,  0,  0,  0, -1,  0,  0, // 1390..
     0,  0, -1, -1,  0, -1,  0,  1,  0,  0, // 1400..
     0, -1,  0,  0,  0,  1,  0,  0,  0,  0, // 1410..
     0,  1,  0,  0, -1, -1,  0,  0,  0,  1, // 1420..
     0,  0, -1, -1,  0, -1,  0,  0, -1, -1, // 1430..
     0, -1,  0, -1,  0,  0, -1, -1,  0,  0, // 1440..
     0,  0,  0,  0, -1,  0,  1,  0,  1,  1, // 1450..
     0,  0, -1,  0,  1,  0,  0,  0,  0,  0, // 1460..
     1,  0,  1,  0,  0,  0, -1,  0,  1,  0, // 1470..
     0, -1, -1,  0,  0,  0,  1,  0,  0,  0, // 1480..
     0,  0,  0,  0,  1,  0,  0,  0,  0,  0, // 1490..
     1,  0,  0, -1,  0,  0,  0,  1,  1,  0, // 1500..
     0, -1,  0,  1,  0,  1,  1,  0,  0,  0, // 1510..
     0,  1,  0,  0,  0, -1,  0,  0,  0,  1, // 1520..
     0,  0,  0, -1,  0,  0,  0,  0,  0, -1, // 1530..
     0, -1,  0,  1,  0,  0,  0, -1,  0,  1, // 1540..
     0,  1,  0,  0,  0,  0,  0,  1,  0,  0, // 1550..
    -1,  0,  0,  0,  0,  1,  0,  0,  0, -1, // 1560..
     0,  0,  0,  0, -1, -1,  0, -1,  0,  1, // 1570..
     0,  0, -1, -1,  0,  0,  1,  1,  0,  0, // 1580..
    -1,  0,  0,  0,  0,  1,  0,  0,  0,  0, // 1590..
     1 // 1600
};

/**
* Determine whether a year is a leap year in the Islamic civil calendar.
*
* @param year The hijri year
* @return true when (14 + 11 * year) % 30 is below 11
*/
UBool IslamicCalendar::civilLeapYear(int32_t year)
{
    return (14 + 11 * year) % 30 < 11;
}

/**
* Return the day # on which the given year starts.  Days are counted
* from the Hijri epoch, origin 0.
*
* Implemented as the start of the year's first month: the year is
* converted to a month count since the epoch, 12*(year-1).
*/
int32_t IslamicCalendar::yearStart(int32_t year) const{
    return trueMonthStart(12*(year-1));
}

/**
* Return the day # on which the given month starts.  Days are counted
* from the Hijri epoch, origin 0.
*
* @param year  The hijri year
* @param month The hijri month, 0-based (assumed to be in range 0..11)
*/
int32_t IslamicCalendar::monthStart(int32_t year, int32_t month) const {
    return trueMonthStart(12*(year-1) + month);
}

/**
* Find the day number on which a particular month of the true/lunar
* Islamic calendar starts.
* * @param month The month in question, origin 0 from the Hijri epoch * * @return The day number on which the given month starts. */ int32_t IslamicCalendar::trueMonthStart(int32_t month) const { UErrorCode status = U_ZERO_ERROR; int32_t start = CalendarCache::get(&gMonthCache, month, status); if (start==0) { // Make a guess at when the month started, using the average length UDate origin = HIJRA_MILLIS + uprv_floor(month * CalendarAstronomer::SYNODIC_MONTH) * kOneDay; // moonAge will fail due to memory allocation error double age = moonAge(origin, status); if (U_FAILURE(status)) { goto trueMonthStartEnd; } if (age >= 0) { // The month has already started do { origin -= kOneDay; age = moonAge(origin, status); if (U_FAILURE(status)) { goto trueMonthStartEnd; } } while (age >= 0); } else { // Preceding month has not ended yet. do { origin += kOneDay; age = moonAge(origin, status); if (U_FAILURE(status)) { goto trueMonthStartEnd; } } while (age < 0); } start = (int32_t)(ClockMath::floorDivide( (int64_t)((int64_t)origin - HIJRA_MILLIS), (int64_t)kOneDay) + 1); CalendarCache::put(&gMonthCache, month, start, status); } trueMonthStartEnd : if(U_FAILURE(status)) { start = 0; } return start; } /** * Return the "age" of the moon at the given time; this is the difference * in ecliptic latitude between the moon and the sun. This method simply * calls CalendarAstronomer.moonAge, converts to degrees, * and adjusts the result to be in the range [-180, 180]. * * @param time The time at which the moon's age is desired, * in millis since 1/1/1970. 
*/ double IslamicCalendar::moonAge(UDate time, UErrorCode &status) { double age = 0; static UMutex astroLock; // pod bay door lock umtx_lock(&astroLock); if(gIslamicCalendarAstro == nullptr) { gIslamicCalendarAstro = new CalendarAstronomer(); if (gIslamicCalendarAstro == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; return age; } ucln_i18n_registerCleanup(UCLN_I18N_ISLAMIC_CALENDAR, calendar_islamic_cleanup); } gIslamicCalendarAstro->setTime(time); age = gIslamicCalendarAstro->getMoonAge(); umtx_unlock(&astroLock); // Convert to degrees and normalize... age = age * 180 / CalendarAstronomer::PI; if (age > 180) { age = age - 360; } return age; } //---------------------------------------------------------------------- // Calendar framework //---------------------------------------------------------------------- /** * Return the length (in days) of the given month. * * @param year The hijri year * @param year The hijri month, 0-based * @draft ICU 2.4 */ int32_t IslamicCalendar::handleGetMonthLength(int32_t extendedYear, int32_t month) const { month = 12*(extendedYear-1) + month; return trueMonthStart(month+1) - trueMonthStart(month) ; } /** * Return the number of days in the given Islamic year * @draft ICU 2.4 */ int32_t IslamicCalendar::handleGetYearLength(int32_t extendedYear) const { int32_t month = 12*(extendedYear-1); return (trueMonthStart(month + 12) - trueMonthStart(month)); } //------------------------------------------------------------------------- // Functions for converting from field values to milliseconds.... //------------------------------------------------------------------------- // Return JD of start of given month/year // Calendar says: // Get the Julian day of the day BEFORE the start of this year. // If useMonth is true, get the day before the start of the month. 
// Hence the -1 /** * @draft ICU 2.4 */ int32_t IslamicCalendar::handleComputeMonthStart(int32_t eyear, int32_t month, UBool /* useMonth */) const { // This may be called by Calendar::handleComputeJulianDay with months out of the range // 0..11. Need to handle that here since monthStart requires months in the range 0.11. if (month > 11) { eyear += (month / 12); month %= 12; } else if (month < 0) { month++; eyear += (month / 12) - 1; month = (month % 12) + 11; } return monthStart(eyear, month) + getEpoc() - 1; } //------------------------------------------------------------------------- // Functions for converting from milliseconds to field values //------------------------------------------------------------------------- /** * @draft ICU 2.4 */ int32_t IslamicCalendar::handleGetExtendedYear() { int32_t year; if (newerField(UCAL_EXTENDED_YEAR, UCAL_YEAR) == UCAL_EXTENDED_YEAR) { year = internalGet(UCAL_EXTENDED_YEAR, 1); // Default to year 1 } else { year = internalGet(UCAL_YEAR, 1); // Default to year 1 } return year; } /** * Override Calendar to compute several fields specific to the Islamic * calendar system. These are: * *

  • ERA *
  • YEAR *
  • MONTH *
  • DAY_OF_MONTH *
  • DAY_OF_YEAR *
  • EXTENDED_YEAR
* * The DAY_OF_WEEK and DOW_LOCAL fields are already set when this * method is called. The getGregorianXxx() methods return Gregorian * calendar equivalents for the given Julian day. * @draft ICU 2.4 */ void IslamicCalendar::handleComputeFields(int32_t julianDay, UErrorCode &status) { if (U_FAILURE(status)) return; int32_t days = julianDay - getEpoc(); // Guess at the number of elapsed full months since the epoch int32_t month = (int32_t)uprv_floor((double)days / CalendarAstronomer::SYNODIC_MONTH); int32_t startDate = (int32_t)uprv_floor(month * CalendarAstronomer::SYNODIC_MONTH); double age = moonAge(internalGetTime(), status); if (U_FAILURE(status)) { status = U_MEMORY_ALLOCATION_ERROR; return; } if ( days - startDate >= 25 && age > 0) { // If we're near the end of the month, assume next month and search backwards month++; } // Find out the last time that the new moon was actually visible at this longitude // This returns midnight the night that the moon was visible at sunset. while ((startDate = trueMonthStart(month)) > days) { // If it was after the date in question, back up a month and try again month--; } int32_t year = month >= 0 ? ((month / 12) + 1) : ((month + 1 ) / 12); month = ((month % 12) + 12 ) % 12; int32_t dayOfMonth = (days - monthStart(year, month)) + 1; // Now figure out the day of the year. 
int32_t dayOfYear = (days - monthStart(year, 0)) + 1; internalSet(UCAL_ERA, 0); internalSet(UCAL_YEAR, year); internalSet(UCAL_EXTENDED_YEAR, year); internalSet(UCAL_MONTH, month); internalSet(UCAL_ORDINAL_MONTH, month); internalSet(UCAL_DAY_OF_MONTH, dayOfMonth); internalSet(UCAL_DAY_OF_YEAR, dayOfYear); } int32_t IslamicCalendar::getEpoc() const { return CIVIL_EPOC; } static int32_t gregoYearFromIslamicStart(int32_t year) { // ad hoc conversion, improve under #10752 // rough est for now, ok for grego 1846-2138, // otherwise occasionally wrong (for 3% of years) int cycle, offset, shift = 0; if (year >= 1397) { cycle = (year - 1397) / 67; offset = (year - 1397) % 67; shift = 2*cycle + ((offset >= 33)? 1: 0); } else { cycle = (year - 1396) / 67 - 1; offset = -(year - 1396) % 67; shift = 2*cycle + ((offset <= 33)? 1: 0); } return year + 579 - shift; } int32_t IslamicCalendar::getRelatedYear(UErrorCode &status) const { int32_t year = get(UCAL_EXTENDED_YEAR, status); if (U_FAILURE(status)) { return 0; } return gregoYearFromIslamicStart(year); } static int32_t firstIslamicStartYearFromGrego(int32_t year) { // ad hoc conversion, improve under #10752 // rough est for now, ok for grego 1846-2138, // otherwise occasionally wrong (for 3% of years) int cycle, offset, shift = 0; if (year >= 1977) { cycle = (year - 1977) / 65; offset = (year - 1977) % 65; shift = 2*cycle + ((offset >= 32)? 1: 0); } else { cycle = (year - 1976) / 65 - 1; offset = -(year - 1976) % 65; shift = 2*cycle + ((offset <= 32)? 1: 0); } return year - 579 + shift; } void IslamicCalendar::setRelatedYear(int32_t year) { set(UCAL_EXTENDED_YEAR, firstIslamicStartYearFromGrego(year)); } /** * The system maintains a static default century start date and Year. They are * initialized the first time they are used. Once the system default century date * and year are set, they do not change. 
*/ static UDate gSystemDefaultCenturyStart = DBL_MIN; static int32_t gSystemDefaultCenturyStartYear = -1; static icu::UInitOnce gSystemDefaultCenturyInit {}; UBool IslamicCalendar::haveDefaultCentury() const { return true; } UDate IslamicCalendar::defaultCenturyStart() const { // lazy-evaluate systemDefaultCenturyStart umtx_initOnce(gSystemDefaultCenturyInit, &initializeSystemDefaultCentury); return gSystemDefaultCenturyStart; } int32_t IslamicCalendar::defaultCenturyStartYear() const { // lazy-evaluate systemDefaultCenturyStartYear umtx_initOnce(gSystemDefaultCenturyInit, &initializeSystemDefaultCentury); return gSystemDefaultCenturyStartYear; } bool IslamicCalendar::inTemporalLeapYear(UErrorCode &status) const { int32_t days = getActualMaximum(UCAL_DAY_OF_YEAR, status); if (U_FAILURE(status)) return false; return days == 355; } U_CFUNC void U_CALLCONV IslamicCalendar::initializeSystemDefaultCentury() { // initialize systemDefaultCentury and systemDefaultCenturyYear based // on the current time. They'll be set to 80 years before // the current time. UErrorCode status = U_ZERO_ERROR; IslamicCalendar calendar(Locale("@calendar=islamic-civil"),status); if (U_SUCCESS(status)) { calendar.setTime(Calendar::getNow(), status); calendar.add(UCAL_YEAR, -80, status); gSystemDefaultCenturyStart = calendar.getTime(status); gSystemDefaultCenturyStartYear = calendar.get(UCAL_YEAR, status); } // We have no recourse upon failure unless we want to propagate the failure // out. 
} /***************************************************************************** * IslamicCivilCalendar *****************************************************************************/ IslamicCivilCalendar::IslamicCivilCalendar(const Locale& aLocale, UErrorCode& success) : IslamicCalendar(aLocale, success) { } IslamicCivilCalendar::~IslamicCivilCalendar() { } const char *IslamicCivilCalendar::getType() const { return "islamic-civil"; } IslamicCivilCalendar* IslamicCivilCalendar::clone() const { return new IslamicCivilCalendar(*this); } /** * Return the day # on which the given year starts. Days are counted * from the Hijri epoch, origin 0. */ int32_t IslamicCivilCalendar::yearStart(int32_t year) const{ return static_cast( (year-1)*354 + ClockMath::floorDivide((3+11*static_cast(year)), static_cast(30))); } /** * Return the day # on which the given month starts. Days are counted * from the Hijri epoch, origin 0. * * @param year The hijri year * @param month The hijri month, 0-based (assumed to be in range 0..11) */ int32_t IslamicCivilCalendar::monthStart(int32_t year, int32_t month) const { // This does not handle months out of the range 0..11 return static_cast( uprv_ceil(29.5*month) + (year-1)*354 + static_cast(ClockMath::floorDivide( 3+11*static_cast(year), static_cast(30)))); } /** * Return the length (in days) of the given month. * * @param year The hijri year * @param year The hijri month, 0-based * @draft ICU 2.4 */ int32_t IslamicCivilCalendar::handleGetMonthLength(int32_t extendedYear, int32_t month) const { int32_t length = 29 + (month+1) % 2; if (month == DHU_AL_HIJJAH && civilLeapYear(extendedYear)) { length++; } return length; } /** * Return the number of days in the given Islamic year * @draft ICU 2.4 */ int32_t IslamicCivilCalendar::handleGetYearLength(int32_t extendedYear) const { return 354 + (civilLeapYear(extendedYear) ? 1 : 0); } /** * Override Calendar to compute several fields specific to the Islamic * calendar system. These are: * *
  • ERA *
  • YEAR *
  • MONTH *
  • DAY_OF_MONTH *
  • DAY_OF_YEAR *
  • EXTENDED_YEAR
* * The DAY_OF_WEEK and DOW_LOCAL fields are already set when this * method is called. The getGregorianXxx() methods return Gregorian * calendar equivalents for the given Julian day. * @draft ICU 2.4 */ void IslamicCivilCalendar::handleComputeFields(int32_t julianDay, UErrorCode &status) { if (U_FAILURE(status)) return; int32_t days = julianDay - getEpoc(); // Use the civil calendar approximation, which is just arithmetic int32_t year = static_cast( ClockMath::floorDivide(30 * static_cast(days) + 10646, static_cast(10631))); int32_t month = static_cast( uprv_ceil((days - 29 - yearStart(year)) / 29.5 )); month = month<11?month:11; int32_t dayOfMonth = (days - monthStart(year, month)) + 1; // Now figure out the day of the year. int32_t dayOfYear = (days - monthStart(year, 0)) + 1; internalSet(UCAL_ERA, 0); internalSet(UCAL_YEAR, year); internalSet(UCAL_EXTENDED_YEAR, year); internalSet(UCAL_MONTH, month); internalSet(UCAL_ORDINAL_MONTH, month); internalSet(UCAL_DAY_OF_MONTH, dayOfMonth); internalSet(UCAL_DAY_OF_YEAR, dayOfYear); } /***************************************************************************** * IslamicTBLACalendar *****************************************************************************/ IslamicTBLACalendar::IslamicTBLACalendar(const Locale& aLocale, UErrorCode& success) : IslamicCivilCalendar(aLocale, success) { } IslamicTBLACalendar::~IslamicTBLACalendar() { } const char *IslamicTBLACalendar::getType() const { return "islamic-tbla"; } IslamicTBLACalendar* IslamicTBLACalendar::clone() const { return new IslamicTBLACalendar(*this); } int32_t IslamicTBLACalendar::getEpoc() const { return ASTRONOMICAL_EPOC; } /***************************************************************************** * IslamicUmalquraCalendar *****************************************************************************/ IslamicUmalquraCalendar::IslamicUmalquraCalendar(const Locale& aLocale, UErrorCode& success) : IslamicCalendar(aLocale, success) { } 
IslamicUmalquraCalendar::~IslamicUmalquraCalendar() { } const char *IslamicUmalquraCalendar::getType() const { return "islamic-umalqura"; } IslamicUmalquraCalendar* IslamicUmalquraCalendar::clone() const { return new IslamicUmalquraCalendar(*this); } /** * Return the day # on which the given year starts. Days are counted * from the Hijri epoch, origin 0. */ int32_t IslamicUmalquraCalendar::yearStart(int32_t year) const { if (year < UMALQURA_YEAR_START || year > UMALQURA_YEAR_END) { return static_cast( (year-1)*354 + ClockMath::floorDivide((3+11*static_cast(year)), static_cast(30))); } year -= UMALQURA_YEAR_START; // rounded least-squares fit of the dates previously calculated from UMALQURA_MONTHLENGTH iteration int32_t yrStartLinearEstimate = static_cast( (354.36720 * (double)year) + 460322.05 + 0.5); // need a slight correction to some return yrStartLinearEstimate + umAlQuraYrStartEstimateFix[year]; } /** * Return the day # on which the given month starts. Days are counted * from the Hijri epoch, origin 0. * * @param year The hijri year * @param month The hijri month, 0-based (assumed to be in range 0..11) */ int32_t IslamicUmalquraCalendar::monthStart(int32_t year, int32_t month) const { int32_t ms = yearStart(year); for(int i=0; i< month; i++){ ms+= handleGetMonthLength(year, i); } return ms; } /** * Return the length (in days) of the given month. 
* * @param year The hijri year * @param year The hijri month, 0-based */ int32_t IslamicUmalquraCalendar::handleGetMonthLength(int32_t extendedYear, int32_t month) const { int32_t length = 0; if (extendedYearUMALQURA_YEAR_END) { length = 29 + (month+1) % 2; if (month == DHU_AL_HIJJAH && civilLeapYear(extendedYear)) { length++; } return length; } return getUmalqura_MonthLength(extendedYear - UMALQURA_YEAR_START, month); } /** * Return the number of days in the given Islamic year * @draft ICU 2.4 */ int32_t IslamicUmalquraCalendar::handleGetYearLength(int32_t extendedYear) const { if (extendedYearUMALQURA_YEAR_END) { return 354 + (civilLeapYear(extendedYear) ? 1 : 0); } int len = 0; for(int i=0; i<12; i++) { len += handleGetMonthLength(extendedYear, i); } return len; } /** * Override Calendar to compute several fields specific to the Islamic * calendar system. These are: * *
  • ERA *
  • YEAR *
  • MONTH *
  • DAY_OF_MONTH *
  • DAY_OF_YEAR *
  • EXTENDED_YEAR
* * The DAY_OF_WEEK and DOW_LOCAL fields are already set when this * method is called. The getGregorianXxx() methods return Gregorian * calendar equivalents for the given Julian day. * @draft ICU 2.4 */ void IslamicUmalquraCalendar::handleComputeFields(int32_t julianDay, UErrorCode &status) { if (U_FAILURE(status)) return; int32_t year, month, dayOfMonth, dayOfYear; int32_t days = julianDay - getEpoc(); int32_t umalquraStartdays = yearStart(UMALQURA_YEAR_START) ; if (days < umalquraStartdays) { //Use Civil calculation year = (int32_t)ClockMath::floorDivide( (30 * (int64_t)days + 10646) , (int64_t)10631.0 ); month = (int32_t)uprv_ceil((days - 29 - yearStart(year)) / 29.5 ); month = month < 11 ? month : 11; } else { int y =UMALQURA_YEAR_START-1, m =0; long d = 1; while (d > 0) { y++; d = days - yearStart(y) +1; if (d == handleGetYearLength(y)) { m=11; break; } if (d < handleGetYearLength(y)){ int monthLen = handleGetMonthLength(y, m); m=0; while(d > monthLen){ d -= monthLen; m++; monthLen = handleGetMonthLength(y, m); } break; } } year = y; month = m; } dayOfMonth = (days - monthStart(year, month)) + 1; // Now figure out the day of the year. 
dayOfYear = (days - monthStart(year, 0)) + 1; internalSet(UCAL_ERA, 0); internalSet(UCAL_YEAR, year); internalSet(UCAL_EXTENDED_YEAR, year); internalSet(UCAL_MONTH, month); internalSet(UCAL_ORDINAL_MONTH, month); internalSet(UCAL_DAY_OF_MONTH, dayOfMonth); internalSet(UCAL_DAY_OF_YEAR, dayOfYear); } /***************************************************************************** * IslamicRGSACalendar *****************************************************************************/ IslamicRGSACalendar::IslamicRGSACalendar(const Locale& aLocale, UErrorCode& success) : IslamicCalendar(aLocale, success) { } IslamicRGSACalendar::~IslamicRGSACalendar() { } const char *IslamicRGSACalendar::getType() const { return "islamic-rgsa"; } IslamicRGSACalendar* IslamicRGSACalendar::clone() const { return new IslamicRGSACalendar(*this); } UOBJECT_DEFINE_RTTI_IMPLEMENTATION(IslamicCalendar) UOBJECT_DEFINE_RTTI_IMPLEMENTATION(IslamicCivilCalendar) UOBJECT_DEFINE_RTTI_IMPLEMENTATION(IslamicUmalquraCalendar) UOBJECT_DEFINE_RTTI_IMPLEMENTATION(IslamicTBLACalendar) UOBJECT_DEFINE_RTTI_IMPLEMENTATION(IslamicRGSACalendar) U_NAMESPACE_END #endif stringi/src/icu74/i18n/numparse_validators.cpp0000644000176200001440000000454014700200761021020 0ustar liggesusers// © 2018 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html #include "unicode/utypes.h" #if !UCONFIG_NO_FORMATTING // Allow implicit conversion from char16_t* to UnicodeString for this file: // Helpful in toString methods and elsewhere. #define UNISTR_FROM_STRING_EXPLICIT #include "numparse_types.h" #include "numparse_validators.h" #include "static_unicode_sets.h" using namespace icu; using namespace icu::numparse; using namespace icu::numparse::impl; void RequireAffixValidator::postProcess(ParsedNumber& result) const { if (result.prefix.isBogus() || result.suffix.isBogus()) { // We saw a prefix or a suffix but not both. Fail the parse. 
result.flags |= FLAG_FAIL; } } UnicodeString RequireAffixValidator::toString() const { return u""; } void RequireCurrencyValidator::postProcess(ParsedNumber& result) const { if (result.currencyCode[0] == 0) { result.flags |= FLAG_FAIL; } } UnicodeString RequireCurrencyValidator::toString() const { return u""; } RequireDecimalSeparatorValidator::RequireDecimalSeparatorValidator(bool patternHasDecimalSeparator) : fPatternHasDecimalSeparator(patternHasDecimalSeparator) { } void RequireDecimalSeparatorValidator::postProcess(ParsedNumber& result) const { bool parseHasDecimalSeparator = 0 != (result.flags & FLAG_HAS_DECIMAL_SEPARATOR); if (parseHasDecimalSeparator != fPatternHasDecimalSeparator) { result.flags |= FLAG_FAIL; } } UnicodeString RequireDecimalSeparatorValidator::toString() const { return u""; } void RequireNumberValidator::postProcess(ParsedNumber& result) const { // Require that a number is matched. if (!result.seenNumber()) { result.flags |= FLAG_FAIL; } } UnicodeString RequireNumberValidator::toString() const { return u""; } MultiplierParseHandler::MultiplierParseHandler(::icu::number::Scale multiplier) : fMultiplier(std::move(multiplier)) {} void MultiplierParseHandler::postProcess(ParsedNumber& result) const { if (!result.quantity.bogus) { fMultiplier.applyReciprocalTo(result.quantity); // NOTE: It is okay if the multiplier was negative. } } UnicodeString MultiplierParseHandler::toString() const { return u""; } #endif /* #if !UCONFIG_NO_FORMATTING */ stringi/src/icu74/i18n/number_affixutils.h0000644000176200001440000002146714700200761020140 0ustar liggesusers// © 2017 and later: Unicode, Inc. and others. 
// License & terms of use: http://www.unicode.org/copyright.html #include "unicode/utypes.h" #if !UCONFIG_NO_FORMATTING #ifndef __NUMBER_AFFIXUTILS_H__ #define __NUMBER_AFFIXUTILS_H__ #include #include "number_types.h" #include "unicode/stringpiece.h" #include "unicode/unistr.h" #include "formatted_string_builder.h" #include "unicode/uniset.h" U_NAMESPACE_BEGIN namespace number { namespace impl { enum AffixPatternState { STATE_BASE = 0, STATE_FIRST_QUOTE = 1, STATE_INSIDE_QUOTE = 2, STATE_AFTER_QUOTE = 3, STATE_FIRST_CURR = 4, STATE_SECOND_CURR = 5, STATE_THIRD_CURR = 6, STATE_FOURTH_CURR = 7, STATE_FIFTH_CURR = 8, STATE_OVERFLOW_CURR = 9 }; // enum AffixPatternType defined in internals.h struct AffixTag { int32_t offset; UChar32 codePoint; AffixPatternState state; AffixPatternType type; AffixTag() : offset(0), state(STATE_BASE) {} AffixTag(int32_t offset) : offset(offset) {} AffixTag(int32_t offset, UChar32 codePoint, AffixPatternState state, AffixPatternType type) : offset(offset), codePoint(codePoint), state(state), type(type) {} }; class TokenConsumer { public: virtual ~TokenConsumer(); virtual void consumeToken(AffixPatternType type, UChar32 cp, UErrorCode& status) = 0; }; // Exported as U_I18N_API because it is a base class for other exported types class U_I18N_API SymbolProvider { public: virtual ~SymbolProvider(); // TODO: Could this be more efficient if it returned by reference? virtual UnicodeString getSymbol(AffixPatternType type) const = 0; }; /** * Performs manipulations on affix patterns: the prefix and suffix strings associated with a decimal * format pattern. For example: * * * * * * * *
Affix PatternExample Unescaped (Formatted) String
abcabc
ab-ab−
ab'-'ab-
ab''ab'
* * To manually iterate over tokens in a literal string, use the following pattern, which is designed * to be efficient. * *
 * long tag = 0L;
 * while (AffixPatternUtils.hasNext(tag, patternString)) {
 *   tag = AffixPatternUtils.nextToken(tag, patternString);
 *   int typeOrCp = AffixPatternUtils.getTypeOrCp(tag);
 *   switch (typeOrCp) {
 *     case AffixPatternUtils.TYPE_MINUS_SIGN:
 *       // Current token is a minus sign.
 *       break;
 *     case AffixPatternUtils.TYPE_PLUS_SIGN:
 *       // Current token is a plus sign.
 *       break;
 *     case AffixPatternUtils.TYPE_PERCENT:
 *       // Current token is a percent sign.
 *       break;
 *     // ... other types ...
 *     default:
 *       // Current token is an arbitrary code point.
 *       // The variable typeOrCp is the code point.
 *       break;
 *   }
 * }
 * 
*/
class U_I18N_API AffixUtils {

  public:

    /**
     * Estimates the number of code points present in an unescaped version of the affix pattern string
     * (one that would be returned by {@link #unescape}), assuming that all interpolated symbols
     * consume one code point and that currencies consume as many code points as their symbol width.
     * Used for computing padding width.
     *
     * @param patternString The original string whose width will be estimated.
     * @return The length of the unescaped string.
     */
    static int32_t estimateLength(const UnicodeString& patternString, UErrorCode& status);

    /**
     * Takes a string and escapes (quotes) characters that have special meaning in the affix pattern
     * syntax. This function does not reverse-lookup symbols.
     *
     * Example input: "-$x"; example output: "'-'$x"
     *
     * @param input The string to be escaped.
     * @return The resulting UnicodeString.
     */
    static UnicodeString escape(const UnicodeString& input);

    /** Maps an affix pattern token type to a Field. */
    static Field getFieldForType(AffixPatternType type);

    /**
     * Executes the unescape state machine. Replaces the unquoted characters "-", "+", "%", "‰", and
     * "¤" with the corresponding symbols provided by the {@link SymbolProvider}, and inserts the
     * result into the FormattedStringBuilder at the requested location.
     *
     * Example input: "'-'¤x"; example output: "-$x"
     *
     * @param affixPattern The original string to be unescaped.
     * @param output The FormattedStringBuilder to mutate with the result.
     * @param position The index into the FormattedStringBuilder to insert the string.
     * @param provider An object to generate locale symbols.
     */
    static int32_t unescape(const UnicodeString& affixPattern, FormattedStringBuilder& output,
                            int32_t position, const SymbolProvider& provider, Field field,
                            UErrorCode& status);

    /**
     * Same as {@link #unescape}, but only calculates the code point count.  More efficient than {@link #unescape}
     * if you only need the length but not the string itself.
     *
     * @param affixPattern The original string to be unescaped.
     * @param provider An object to generate locale symbols.
     * @return The same return value as if you called {@link #unescape}.
     */
    static int32_t unescapedCodePointCount(const UnicodeString& affixPattern,
                                           const SymbolProvider& provider, UErrorCode& status);

    /**
     * Checks whether the given affix pattern contains at least one token of the given type, which is
     * one of the constants "TYPE_" in {@link AffixPatternUtils}.
     *
     * @param affixPattern The affix pattern to check.
     * @param type The token type.
     * @return true if the affix pattern contains the given token type; false otherwise.
     */
    static bool containsType(const UnicodeString& affixPattern, AffixPatternType type, UErrorCode& status);

    /**
     * Checks whether the specified affix pattern has any unquoted currency symbols ("¤").
     *
     * @param affixPattern The string to check for currency symbols.
     * @return true if the literal has at least one unquoted currency symbol; false otherwise.
     */
    static bool hasCurrencySymbols(const UnicodeString& affixPattern, UErrorCode& status);

    /**
     * Replaces all occurrences of tokens with the given type with the given replacement char.
     *
     * @param affixPattern The source affix pattern (does not get modified).
     * @param type The token type.
     * @param replacementChar The char to substitute in place of chars of the given token type.
     * @return A string containing the new affix pattern.
     */
    static UnicodeString replaceType(const UnicodeString& affixPattern, AffixPatternType type,
                                     char16_t replacementChar, UErrorCode& status);

    /**
     * Returns whether the given affix pattern contains only symbols and ignorables as defined by the
     * given ignorables set.
     */
    static bool containsOnlySymbolsAndIgnorables(const UnicodeString& affixPattern,
                                                 const UnicodeSet& ignorables, UErrorCode& status);

    /**
     * Iterates over the affix pattern, calling the TokenConsumer for each token.
     */
    static void iterateWithConsumer(const UnicodeString& affixPattern, TokenConsumer& consumer,
                                    UErrorCode& status);

    /**
     * Returns the next token from the affix pattern.
     *
     * NOTE(review): the parameter and return descriptions below speak of a
     * bitmask and of -1, but this function takes and returns an AffixTag
     * struct; the wording presumably predates the struct -- confirm how the
     * end of the pattern is signalled.
     *
     * @param tag A bitmask used for keeping track of state from token to token. The initial value
     *     should be 0L.
     * @param patternString The affix pattern.
     * @return The bitmask tag to pass to the next call of this method to retrieve the following token
     *     (never negative), or -1 if there were no more tokens in the affix pattern.
     * @see #hasNext
     */
    static AffixTag nextToken(AffixTag tag, const UnicodeString& patternString, UErrorCode& status);

    /**
     * Returns whether the affix pattern string has any more tokens to be retrieved from a call to
     * {@link #nextToken}.
     *
     * @param tag The bitmask tag of the previous token, as returned by {@link #nextToken}.
     * @param string The affix pattern.
     * @return true if there are more tokens to consume; false otherwise.
     */
    static bool hasNext(const AffixTag& tag, const UnicodeString& string);

  private:
    /**
     * Encodes the given values into a tag struct.
     * The order of the arguments is consistent with Java, but the order of the stored
     * fields is not necessarily the same.
*/ static inline AffixTag makeTag(int32_t offset, AffixPatternType type, AffixPatternState state, UChar32 cp) { return {offset, cp, state, type}; } }; } // namespace impl } // namespace number U_NAMESPACE_END #endif //__NUMBER_AFFIXUTILS_H__ #endif /* #if !UCONFIG_NO_FORMATTING */ stringi/src/icu74/i18n/dtrule.cpp0000644000176200001440000000756714700200761016251 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ******************************************************************************* * Copyright (C) 2007-2012, International Business Machines Corporation and * others. All Rights Reserved. ******************************************************************************* */ #include "utypeinfo.h" // for 'typeid' to work #include "unicode/utypes.h" #if !UCONFIG_NO_FORMATTING #include "unicode/dtrule.h" U_NAMESPACE_BEGIN UOBJECT_DEFINE_RTTI_IMPLEMENTATION(DateTimeRule) DateTimeRule::DateTimeRule(int32_t month, int32_t dayOfMonth, int32_t millisInDay, TimeRuleType timeType) : fMonth(month), fDayOfMonth(dayOfMonth), fDayOfWeek(0), fWeekInMonth(0), fMillisInDay(millisInDay), fDateRuleType(DateTimeRule::DOM), fTimeRuleType(timeType) { } DateTimeRule::DateTimeRule(int32_t month, int32_t weekInMonth, int32_t dayOfWeek, int32_t millisInDay, TimeRuleType timeType) : fMonth(month), fDayOfMonth(0), fDayOfWeek(dayOfWeek), fWeekInMonth(weekInMonth), fMillisInDay(millisInDay), fDateRuleType(DateTimeRule::DOW), fTimeRuleType(timeType) { } DateTimeRule::DateTimeRule(int32_t month, int32_t dayOfMonth, int32_t dayOfWeek, UBool after, int32_t millisInDay, TimeRuleType timeType) : UObject(), fMonth(month), fDayOfMonth(dayOfMonth), fDayOfWeek(dayOfWeek), fWeekInMonth(0), fMillisInDay(millisInDay), fTimeRuleType(timeType) { if (after) { fDateRuleType = DateTimeRule::DOW_GEQ_DOM; } else { fDateRuleType = DateTimeRule::DOW_LEQ_DOM; } } DateTimeRule::DateTimeRule(const DateTimeRule& source) : 
UObject(source), fMonth(source.fMonth), fDayOfMonth(source.fDayOfMonth), fDayOfWeek(source.fDayOfWeek), fWeekInMonth(source.fWeekInMonth), fMillisInDay(source.fMillisInDay), fDateRuleType(source.fDateRuleType), fTimeRuleType(source.fTimeRuleType) { } DateTimeRule::~DateTimeRule() { } DateTimeRule* DateTimeRule::clone() const { return new DateTimeRule(*this); } DateTimeRule& DateTimeRule::operator=(const DateTimeRule& right) { if (this != &right) { fMonth = right.fMonth; fDayOfMonth = right.fDayOfMonth; fDayOfWeek = right.fDayOfWeek; fWeekInMonth = right.fWeekInMonth; fMillisInDay = right.fMillisInDay; fDateRuleType = right.fDateRuleType; fTimeRuleType = right.fTimeRuleType; } return *this; } bool DateTimeRule::operator==(const DateTimeRule& that) const { return ((this == &that) || (typeid(*this) == typeid(that) && fMonth == that.fMonth && fDayOfMonth == that.fDayOfMonth && fDayOfWeek == that.fDayOfWeek && fWeekInMonth == that.fWeekInMonth && fMillisInDay == that.fMillisInDay && fDateRuleType == that.fDateRuleType && fTimeRuleType == that.fTimeRuleType)); } bool DateTimeRule::operator!=(const DateTimeRule& that) const { return !operator==(that); } DateTimeRule::DateRuleType DateTimeRule::getDateRuleType() const { return fDateRuleType; } DateTimeRule::TimeRuleType DateTimeRule::getTimeRuleType() const { return fTimeRuleType; } int32_t DateTimeRule::getRuleMonth() const { return fMonth; } int32_t DateTimeRule::getRuleDayOfMonth() const { return fDayOfMonth; } int32_t DateTimeRule::getRuleDayOfWeek() const { return fDayOfWeek; } int32_t DateTimeRule::getRuleWeekInMonth() const { return fWeekInMonth; } int32_t DateTimeRule::getRuleMillisInDay() const { return fMillisInDay; } U_NAMESPACE_END #endif /* #if !UCONFIG_NO_FORMATTING */ //eof stringi/src/icu74/i18n/region_impl.h0000644000176200001440000000266514700200761016715 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others. 
// License & terms of use: http://www.unicode.org/copyright.html /* ******************************************************************************* * Copyright (C) 2013, International Business Machines Corporation and * * others. All Rights Reserved. * ******************************************************************************* * * File REGION_IMPL.H * ******************************************************************************* */ #ifndef __REGION_IMPL_H__ #define __REGION_IMPL_H__ #include "unicode/utypes.h" #if !UCONFIG_NO_FORMATTING #include "uvector.h" #include "unicode/strenum.h" U_NAMESPACE_BEGIN class RegionNameEnumeration : public StringEnumeration { public: /** * Construct an string enumeration over the supplied name list. * Makes a copy of the supplied input name list; does not retain a reference to the original. */ RegionNameEnumeration(UVector *nameList, UErrorCode& status); virtual ~RegionNameEnumeration(); static UClassID U_EXPORT2 getStaticClassID(); virtual UClassID getDynamicClassID() const override; virtual const UnicodeString* snext(UErrorCode& status) override; virtual void reset(UErrorCode& status) override; virtual int32_t count(UErrorCode& status) const override; private: int32_t pos; UVector *fRegionNames; }; U_NAMESPACE_END #endif #endif stringi/src/icu74/i18n/ethpccal.h0000644000176200001440000002550314700200761016170 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ******************************************************************************* * Copyright (C) 2003 - 2013, International Business Machines Corporation and * * others. All Rights Reserved. * ******************************************************************************* */ #ifndef ETHPCCAL_H #define ETHPCCAL_H #include "unicode/utypes.h" #if !UCONFIG_NO_FORMATTING #include "unicode/calendar.h" #include "cecal.h" U_NAMESPACE_BEGIN /** * Implement the Ethiopic calendar system. 
* @internal */ class EthiopicCalendar : public CECalendar { public: /** * Useful constants for EthiopicCalendar. * @internal */ enum EMonths { /** * Constant for መስከረም, the 1st month of the Ethiopic year. */ MESKEREM, /** * Constant for ጥቅምት, the 2nd month of the Ethiopic year. */ TEKEMT, /** * Constant for ኅዳር, the 3rd month of the Ethiopic year. */ HEDAR, /** * Constant for ታኅሣሥ, the 4th month of the Ethiopic year. */ TAHSAS, /** * Constant for ጥር, the 5th month of the Ethiopic year. */ TER, /** * Constant for የካቲት, the 6th month of the Ethiopic year. */ YEKATIT, /** * Constant for መጋቢት, the 7th month of the Ethiopic year. */ MEGABIT, /** * Constant for ሚያዝያ, the 8th month of the Ethiopic year. */ MIAZIA, /** * Constant for ግንቦት, the 9th month of the Ethiopic year. */ GENBOT, /** * Constant for ሰኔ, the 10th month of the Ethiopic year. */ SENE, /** * Constant for ሐምሌ, the 11th month of the Ethiopic year. */ HAMLE, /** * Constant for ነሐሴ, the 12th month of the Ethiopic year. */ NEHASSE, /** * Constant for ጳጉሜን, the 13th month of the Ethiopic year. */ PAGUMEN }; enum EEras { AMETE_ALEM, // Before the epoch AMETE_MIHRET // After the epoch }; /** * Constructs a EthiopicCalendar based on the current time in the default time zone * with the given locale. * * @param aLocale The given locale. * @param success Indicates the status of EthiopicCalendar object construction. * Returns U_ZERO_ERROR if constructed successfully. * @param type Whether this Ethiopic calendar use Amete Mihrret (default) or * only use Amete Alem for all the time. * @internal */ EthiopicCalendar(const Locale& aLocale, UErrorCode& success); /** * Copy Constructor * @internal */ EthiopicCalendar(const EthiopicCalendar& other) = default; /** * Destructor. * @internal */ virtual ~EthiopicCalendar(); /** * Create and return a polymorphic copy of this calendar. * @return return a polymorphic copy of this calendar. 
* @internal */ virtual EthiopicCalendar* clone() const override; /** * Return the calendar type, "ethiopic" * @return calendar type * @internal */ virtual const char * getType() const override; /** * @return The related Gregorian year; will be obtained by modifying the value * obtained by get from UCAL_EXTENDED_YEAR field * @internal */ virtual int32_t getRelatedYear(UErrorCode &status) const override; /** * @param year The related Gregorian year to set; will be modified as necessary then * set in UCAL_EXTENDED_YEAR field * @internal */ virtual void setRelatedYear(int32_t year) override; protected: //------------------------------------------------------------------------- // Calendar framework //------------------------------------------------------------------------- /** * Return the extended year defined by the current fields. * This calendar uses both AMETE_ALEM and AMETE_MIHRET. * * EXTENDED_YEAR ERA YEAR * 0 AMETE_ALEM 5500 * 1 AMETE_MIHRET 1 * @internal */ virtual int32_t handleGetExtendedYear() override; /** * Compute fields from the JD * @internal */ virtual void handleComputeFields(int32_t julianDay, UErrorCode &status) override; /** * Returns the date of the start of the default century * @return start of century - in milliseconds since epoch, 1970 * @internal */ virtual UDate defaultCenturyStart() const override; /** * Returns the year in which the default century begins * @internal */ virtual int32_t defaultCenturyStartYear() const override; /** * Return the date offset from Julian * @internal */ virtual int32_t getJDEpochOffset() const override; public: /** * Override Calendar Returns a unique class ID POLYMORPHICALLY. Pure virtual * override. This method is to implement a simple version of RTTI, since not all C++ * compilers support genuine RTTI. Polymorphic operator==() and clone() methods call * this method. * * @return The class ID for this object. All objects of a given class have the * same class ID. 
/**
 * Implement the Ethiopic Amete Alem calendar system.
 * Declared as a subclass of EthiopicCalendar; it overrides the type name,
 * the related-year accessors and the extended-year/field computation hooks.
 * @internal
 */
class EthiopicAmeteAlemCalendar : public EthiopicCalendar {

public:
    /**
     * Constructs an EthiopicAmeteAlemCalendar based on the current time in the default time zone
     * with the given locale.
     *
     * @param aLocale  The given locale.
     * @param success  Indicates the status of EthiopicAmeteAlemCalendar object construction.
     *                 Returns U_ZERO_ERROR if constructed successfully.
     * @internal
     */
    EthiopicAmeteAlemCalendar(const Locale& aLocale, UErrorCode& success);

    /**
     * Copy constructor (compiler-generated, member-wise copy).
     * @internal
     */
    EthiopicAmeteAlemCalendar(const EthiopicAmeteAlemCalendar& other) = default;

    /**
     * Destructor.
     * @internal
     */
    virtual ~EthiopicAmeteAlemCalendar();

    /**
     * Create and return a polymorphic copy of this calendar.
     * @return a polymorphic copy of this calendar.
     * @internal
     */
    virtual EthiopicAmeteAlemCalendar* clone() const override;

    /**
     * Return the calendar type, "ethiopic-amete-alem".
     * @return calendar type
     * @internal
     */
    virtual const char * getType() const override;

    /**
     * Override Calendar. Returns a unique class ID POLYMORPHICALLY. Pure virtual
     * override. This method is to implement a simple version of RTTI, since not all C++
     * compilers support genuine RTTI. Polymorphic operator==() and clone() methods call
     * this method.
     *
     * @return   The class ID for this object. All objects of a given class have the
     *           same class ID. Objects of other classes have different class IDs.
     * @internal
     */
    virtual UClassID getDynamicClassID() const override;

    /**
     * Return the class ID for this class. This is useful only for comparing to a return
     * value from getDynamicClassID(). For example:
     *
     *      Base* polymorphic_pointer = createPolymorphicObject();
     *      if (polymorphic_pointer->getDynamicClassID() ==
     *          Derived::getStaticClassID()) ...
     *
     * @return   The class ID for all objects of this class.
     * @internal
     */
    U_I18N_API static UClassID U_EXPORT2 getStaticClassID();

    /**
     * @return      The related Gregorian year; will be obtained by modifying the value
     *              obtained by get from UCAL_EXTENDED_YEAR field
     * @internal
     */
    virtual int32_t getRelatedYear(UErrorCode &status) const override;

    /**
     * @param year  The related Gregorian year to set; will be modified as necessary then
     *              set in UCAL_EXTENDED_YEAR field
     * @internal
     */
    virtual void setRelatedYear(int32_t year) override;

protected:
    //-------------------------------------------------------------------------
    // Calendar framework
    //-------------------------------------------------------------------------

    /**
     * Return the extended year defined by the current fields.
     * This calendar uses only AMETE_ALEM for the era.
     *
     *   EXTENDED_YEAR       ERA         YEAR
     *             0       AMETE_ALEM    5500
     *             1       AMETE_ALEM    5501
     * @internal
     */
    virtual int32_t handleGetExtendedYear() override;

    /**
     * Compute fields from the Julian day (JD).
     * @param julianDay  the Julian day to compute the fields from
     * @param status     receives any error code
     * @internal
     */
    virtual void handleComputeFields(int32_t julianDay, UErrorCode &status) override;

    /**
     * Calculate the limit for a specified type of limit and field.
     * @param field      the calendar field whose limit is requested
     * @param limitType  which kind of limit is requested
     * @internal
     */
    virtual int32_t handleGetLimit(UCalendarDateFields field, ELimitType limitType) const override;

    /**
     * Returns the year in which the default century begins.
     * @internal
     */
    virtual int32_t defaultCenturyStartYear() const override;
};

 * ISO8601Calendar is a subclass of GregorianCalendar * in which the first day of the week is Monday and the minimal number of days * in the first week of a year or month is four. *

class ISO8601Calendar : public GregorianCalendar {
 public:
  //-------------------------------------------------------------------------
  // Constructors...
  //-------------------------------------------------------------------------

  /**
   * Constructs an ISO8601Calendar based on the current time in the default time zone
   * with the given locale.
   *
   * @param aLocale  The given locale.
   * @param success  Indicates the status of ISO8601Calendar object construction.
   *                 Returns U_ZERO_ERROR if constructed successfully.
   * @internal
   */
  ISO8601Calendar(const Locale& aLocale, UErrorCode &success);

  /**
   * Copy constructor (compiler-generated, member-wise copy).
   * @internal
   */
  ISO8601Calendar(const ISO8601Calendar& other) = default;

  /**
   * Destructor.
   * @internal
   */
  virtual ~ISO8601Calendar();

  /**
   * Clone.
   * @return a polymorphic copy of this calendar.
   * @internal
   */
  virtual ISO8601Calendar* clone() const override;

  // UObject stuff
 public:
  /**
   * @return   The class ID for this object. All objects of a given class have the
   *           same class ID. Objects of other classes have different class IDs.
   * @internal
   */
  virtual UClassID getDynamicClassID() const override;

  /**
   * Return the class ID for this class. This is useful only for comparing to a return
   * value from getDynamicClassID(). For example:
   *
   *      Base* polymorphic_pointer = createPolymorphicObject();
   *      if (polymorphic_pointer->getDynamicClassID() ==
   *          Derived::getStaticClassID()) ...
   *
   * @return   The class ID for all objects of this class.
   * @internal
   */
  U_I18N_API static UClassID U_EXPORT2 getStaticClassID();

  /**
   * Return the calendar type, "iso8601".
   *
   * @return calendar type
   * @internal
   */
  virtual const char * getType() const override;

 private:

  ISO8601Calendar(); // default constructor not implemented
};
* @param rhs the object to be compared with. * @return true if the given Format objects are semantically equal. */ virtual bool operator==(const NFSubstitution& rhs) const; /** * Return true if the given Format objects are semantically unequal. * Objects of different subclasses are considered unequal. * @param rhs the object to be compared with. * @return true if the given Format objects are semantically unequal. */ bool operator!=(const NFSubstitution& rhs) const { return !operator==(rhs); } /** * Sets the substitution's divisor. Used by NFRule.setBaseValue(). * A no-op for all substitutions except multiplier and modulus * substitutions. * @param radix The radix of the divisor * @param exponent The exponent of the divisor */ virtual void setDivisor(int32_t radix, int16_t exponent, UErrorCode& status); /** * Replaces result with the string describing the substitution. * @param result Output param which will receive the string. */ virtual void toString(UnicodeString& result) const; void setDecimalFormatSymbols(const DecimalFormatSymbols &newSymbols, UErrorCode& status); //----------------------------------------------------------------------- // formatting //----------------------------------------------------------------------- /** * Performs a mathematical operation on the number, formats it using * either ruleSet or decimalFormat, and inserts the result into * toInsertInto. * @param number The number being formatted. * @param toInsertInto The string we insert the result into * @param pos The position in toInsertInto where the owning rule's * rule text begins (this value is added to this substitution's * position to determine exactly where to insert the new text) */ virtual void doSubstitution(int64_t number, UnicodeString& toInsertInto, int32_t pos, int32_t recursionCount, UErrorCode& status) const; /** * Performs a mathematical operation on the number, formats it using * either ruleSet or decimalFormat, and inserts the result into * toInsertInto. 
* @param number The number being formatted. * @param toInsertInto The string we insert the result into * @param pos The position in toInsertInto where the owning rule's * rule text begins (this value is added to this substitution's * position to determine exactly where to insert the new text) */ virtual void doSubstitution(double number, UnicodeString& toInsertInto, int32_t pos, int32_t recursionCount, UErrorCode& status) const; protected: /** * Subclasses override this function to perform some kind of * mathematical operation on the number. The result of this operation * is formatted using the rule set or DecimalFormat that this * substitution refers to, and the result is inserted into the result * string. * @param The number being formatted * @return The result of performing the opreration on the number */ virtual int64_t transformNumber(int64_t number) const = 0; /** * Subclasses override this function to perform some kind of * mathematical operation on the number. The result of this operation * is formatted using the rule set or DecimalFormat that this * substitution refers to, and the result is inserted into the result * string. * @param The number being formatted * @return The result of performing the opreration on the number */ virtual double transformNumber(double number) const = 0; public: //----------------------------------------------------------------------- // parsing //----------------------------------------------------------------------- /** * Parses a string using the rule set or DecimalFormat belonging * to this substitution. If there's a match, a mathematical * operation (the inverse of the one used in formatting) is * performed on the result of the parse and the value passed in * and returned as the result. The parse position is updated to * point to the first unmatched character in the string. * @param text The string to parse * @param parsePosition On entry, ignored, but assumed to be 0. 
* On exit, this is updated to point to the first unmatched * character (or 0 if the substitution didn't match) * @param baseValue A partial parse result that should be * combined with the result of this parse * @param upperBound When searching the rule set for a rule * matching the string passed in, only rules with base values * lower than this are considered * @param lenientParse If true and matching against rules fails, * the substitution will also try matching the text against * numerals using a default-costructed NumberFormat. If false, * no extra work is done. (This value is false whenever the * formatter isn't in lenient-parse mode, but is also false * under some conditions even when the formatter _is_ in * lenient-parse mode.) * @return If there's a match, this is the result of composing * baseValue with whatever was returned from matching the * characters. This will be either a Long or a Double. If there's * no match this is new Long(0) (not null), and parsePosition * is left unchanged. */ virtual UBool doParse(const UnicodeString& text, ParsePosition& parsePosition, double baseValue, double upperBound, UBool lenientParse, uint32_t nonNumericalExecutedRuleMask, Formattable& result) const; /** * Derives a new value from the two values passed in. The two values * are typically either the base values of two rules (the one containing * the substitution and the one matching the substitution) or partial * parse results derived in some other way. The operation is generally * the inverse of the operation performed by transformNumber(). * @param newRuleValue The value produced by matching this substitution * @param oldRuleValue The value that was passed to the substitution * by the rule that owns it * @return A third value derived from the other two, representing a * partial parse result */ virtual double composeRuleValue(double newRuleValue, double oldRuleValue) const = 0; /** * Calculates an upper bound when searching for a rule that matches * this substitution. 
Rules with base values greater than or equal * to upperBound are not considered. * @param oldUpperBound The current upper-bound setting. The new * upper bound can't be any higher. * @return the upper bound when searching for a rule that matches * this substitution. */ virtual double calcUpperBound(double oldUpperBound) const = 0; //----------------------------------------------------------------------- // simple accessors //----------------------------------------------------------------------- /** * Returns the substitution's position in the rule that owns it. * @return The substitution's position in the rule that owns it. */ int32_t getPos() const { return pos; } /** * Returns the character used in the textual representation of * substitutions of this type. Used by toString(). * @return This substitution's token character. */ virtual char16_t tokenChar() const = 0; /** * Returns true if this is a modulus substitution. (We didn't do this * with instanceof partially because it causes source files to * proliferate and partially because we have to port this to C++.) * @return true if this object is an instance of ModulusSubstitution */ virtual UBool isModulusSubstitution() const; private: NFSubstitution(const NFSubstitution &other) = delete; // forbid copying of this class NFSubstitution &operator=(const NFSubstitution &other) = delete; // forbid copying of this class public: static UClassID getStaticClassID(); virtual UClassID getDynamicClassID() const override; }; U_NAMESPACE_END /* U_HAVE_RBNF */ #endif // NFSUBS_H #endif stringi/src/icu74/i18n/collationiterator.h0000644000176200001440000002575514770521233020163 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ******************************************************************************* * Copyright (C) 2010-2014, International Business Machines * Corporation and others. All Rights Reserved. 
******************************************************************************* * collationiterator.h * * created on: 2010oct27 * created by: Markus W. Scherer */ #ifndef __COLLATIONITERATOR_H__ #define __COLLATIONITERATOR_H__ #include "unicode/utypes.h" #if !UCONFIG_NO_COLLATION #include "cmemory.h" #include "collation.h" #include "collationdata.h" U_NAMESPACE_BEGIN class SkippedState; class UCharsTrie; class UVector32; /* Large enough for CEs of most short strings. */ #define CEBUFFER_INITIAL_CAPACITY 40 // Export an explicit template instantiation of the MaybeStackArray that // is used as a data member of CEBuffer. // // When building DLLs for Windows this is required even though // no direct access to the MaybeStackArray leaks out of the i18n library. // // See digitlst.h, pluralaffix.h, datefmt.h, and others for similar examples. // #if U_PF_WINDOWS <= U_PLATFORM && U_PLATFORM <= U_PF_CYGWIN template class U_I18N_API MaybeStackArray; #endif /** * Collation element iterator and abstract character iterator. * * When a method returns a code point value, it must be in 0..10FFFF, * except it can be negative as a sentinel value. */ class U_I18N_API CollationIterator : public UObject { private: class U_I18N_API CEBuffer { private: /** Large enough for CEs of most short strings. */ static const int32_t INITIAL_CAPACITY = CEBUFFER_INITIAL_CAPACITY; public: CEBuffer() : length(0) {} ~CEBuffer(); inline void append(int64_t ce, UErrorCode &errorCode) { if(length < INITIAL_CAPACITY || ensureAppendCapacity(1, errorCode)) { buffer[length++] = ce; } } inline void appendUnsafe(int64_t ce) { buffer[length++] = ce; } UBool ensureAppendCapacity(int32_t appCap, UErrorCode &errorCode); inline UBool incLength(UErrorCode &errorCode) { // Use INITIAL_CAPACITY for a very simple fastpath. // (Rather than buffer.getCapacity().) 
if(length < INITIAL_CAPACITY || ensureAppendCapacity(1, errorCode)) { ++length; return true; } else { return false; } } inline int64_t set(int32_t i, int64_t ce) { return buffer[i] = ce; } inline int64_t get(int32_t i) const { return buffer[i]; } const int64_t *getCEs() const { return buffer.getAlias(); } int32_t length; private: CEBuffer(const CEBuffer &) = delete; void operator=(const CEBuffer &) = delete; MaybeStackArray buffer; }; public: CollationIterator(const CollationData *d, UBool numeric) : trie(d->trie), data(d), cesIndex(0), skipped(nullptr), numCpFwd(-1), isNumeric(numeric) {} virtual ~CollationIterator(); virtual bool operator==(const CollationIterator &other) const; inline bool operator!=(const CollationIterator &other) const { return !operator==(other); } /** * Resets the iterator state and sets the position to the specified offset. * Subclasses must implement, and must call the parent class method, * or CollationIterator::reset(). */ virtual void resetToOffset(int32_t newOffset) = 0; virtual int32_t getOffset() const = 0; /** * Returns the next collation element. */ inline int64_t nextCE(UErrorCode &errorCode) { if(cesIndex < ceBuffer.length) { // Return the next buffered CE. return ceBuffer.get(cesIndex++); } // assert cesIndex == ceBuffer.length; if(!ceBuffer.incLength(errorCode)) { return Collation::NO_CE; } UChar32 c; uint32_t ce32 = handleNextCE32(c, errorCode); uint32_t t = ce32 & 0xff; if(t < Collation::SPECIAL_CE32_LOW_BYTE) { // Forced-inline of isSpecialCE32(ce32). // Normal CE from the main data. // Forced-inline of ceFromSimpleCE32(ce32). return ceBuffer.set(cesIndex++, ((int64_t)(ce32 & 0xffff0000) << 32) | ((ce32 & 0xff00) << 16) | (t << 8)); } const CollationData *d; // The compiler should be able to optimize the previous and the following // comparisons of t with the same constant. 
if(t == Collation::SPECIAL_CE32_LOW_BYTE) { if(c < 0) { return ceBuffer.set(cesIndex++, Collation::NO_CE); } d = data->base; ce32 = d->getCE32(c); t = ce32 & 0xff; if(t < Collation::SPECIAL_CE32_LOW_BYTE) { // Normal CE from the base data. return ceBuffer.set(cesIndex++, ((int64_t)(ce32 & 0xffff0000) << 32) | ((ce32 & 0xff00) << 16) | (t << 8)); } } else { d = data; } if(t == Collation::LONG_PRIMARY_CE32_LOW_BYTE) { // Forced-inline of ceFromLongPrimaryCE32(ce32). return ceBuffer.set(cesIndex++, ((int64_t)(ce32 - t) << 32) | Collation::COMMON_SEC_AND_TER_CE); } return nextCEFromCE32(d, c, ce32, errorCode); } /** * Fetches all CEs. * @return getCEsLength() */ int32_t fetchCEs(UErrorCode &errorCode); /** * Overwrites the current CE (the last one returned by nextCE()). */ void setCurrentCE(int64_t ce) { // assert cesIndex > 0; ceBuffer.set(cesIndex - 1, ce); } /** * Returns the previous collation element. */ int64_t previousCE(UVector32 &offsets, UErrorCode &errorCode); inline int32_t getCEsLength() const { return ceBuffer.length; } inline int64_t getCE(int32_t i) const { return ceBuffer.get(i); } const int64_t *getCEs() const { return ceBuffer.getCEs(); } void clearCEs() { cesIndex = ceBuffer.length = 0; } void clearCEsIfNoneRemaining() { if(cesIndex == ceBuffer.length) { clearCEs(); } } /** * Returns the next code point (with post-increment). * Public for identical-level comparison and for testing. */ virtual UChar32 nextCodePoint(UErrorCode &errorCode) = 0; /** * Returns the previous code point (with pre-decrement). * Public for identical-level comparison and for testing. 
*/ virtual UChar32 previousCodePoint(UErrorCode &errorCode) = 0; protected: CollationIterator(const CollationIterator &other); CollationIterator() { } // MG FIX void __CollationIterator_init(const CollationData *d, UBool numeric) { // MG FIX trie = d->trie; data = d; cesIndex = 0; skipped = nullptr; numCpFwd = -1; isNumeric = numeric; } void reset(); /** * Returns the next code point and its local CE32 value. * Returns Collation::FALLBACK_CE32 at the end of the text (c<0) * or when c's CE32 value is to be looked up in the base data (fallback). * * The code point is used for fallbacks, context and implicit weights. * It is ignored when the returned CE32 is not special (e.g., FFFD_CE32). */ virtual uint32_t handleNextCE32(UChar32 &c, UErrorCode &errorCode); /** * Called when handleNextCE32() returns a LEAD_SURROGATE_TAG for a lead surrogate code unit. * Returns the trail surrogate in that case and advances past it, * if a trail surrogate follows the lead surrogate. * Otherwise returns any other code unit and does not advance. */ virtual char16_t handleGetTrailSurrogate(); /** * Called when handleNextCE32() returns with c==0, to see whether it is a NUL terminator. * (Not needed in Java.) */ virtual UBool foundNULTerminator(); /** * @return false if surrogate code points U+D800..U+DFFF * map to their own implicit primary weights (for UTF-16), * or true if they map to CE(U+FFFD) (for UTF-8) */ virtual UBool forbidSurrogateCodePoints() const; virtual void forwardNumCodePoints(int32_t num, UErrorCode &errorCode) = 0; virtual void backwardNumCodePoints(int32_t num, UErrorCode &errorCode) = 0; /** * Returns the CE32 from the data trie. * Normally the same as data->getCE32(), but overridden in the builder. * Call this only when the faster data->getCE32() cannot be used. 
*/ virtual uint32_t getDataCE32(UChar32 c) const; virtual uint32_t getCE32FromBuilderData(uint32_t ce32, UErrorCode &errorCode); void appendCEsFromCE32(const CollationData *d, UChar32 c, uint32_t ce32, UBool forward, UErrorCode &errorCode); // Main lookup trie of the data object. const UTrie2 *trie; const CollationData *data; private: int64_t nextCEFromCE32(const CollationData *d, UChar32 c, uint32_t ce32, UErrorCode &errorCode); uint32_t getCE32FromPrefix(const CollationData *d, uint32_t ce32, UErrorCode &errorCode); UChar32 nextSkippedCodePoint(UErrorCode &errorCode); void backwardNumSkipped(int32_t n, UErrorCode &errorCode); uint32_t nextCE32FromContraction( const CollationData *d, uint32_t contractionCE32, const char16_t *p, uint32_t ce32, UChar32 c, UErrorCode &errorCode); uint32_t nextCE32FromDiscontiguousContraction( const CollationData *d, UCharsTrie &suffixes, uint32_t ce32, int32_t lookAhead, UChar32 c, UErrorCode &errorCode); /** * Returns the previous CE when data->isUnsafeBackward(c, isNumeric). */ int64_t previousCEUnsafe(UChar32 c, UVector32 &offsets, UErrorCode &errorCode); /** * Turns a string of digits (bytes 0..9) * into a sequence of CEs that will sort in numeric order. * * Starts from this ce32's digit value and consumes the following/preceding digits. * The digits string must not be empty and must not have leading zeros. */ void appendNumericCEs(uint32_t ce32, UBool forward, UErrorCode &errorCode); /** * Turns 1..254 digits into a sequence of CEs. * Called by appendNumericCEs() for each segment of at most 254 digits. */ void appendNumericSegmentCEs(const char *digits, int32_t length, UErrorCode &errorCode); CEBuffer ceBuffer; int32_t cesIndex; SkippedState *skipped; // Number of code points to read forward, or -1. // Used as a forward iteration limit in previousCEUnsafe(). int32_t numCpFwd; // Numeric collation (CollationSettings::NUMERIC). 
UBool isNumeric; }; U_NAMESPACE_END #endif // !UCONFIG_NO_COLLATION #endif // __COLLATIONITERATOR_H__ stringi/src/icu74/i18n/fmtableimp.h0000644000176200001440000000203114700200761016514 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ******************************************************************************* * Copyright (C) 2010-2014, International Business Machines Corporation and * * others. All Rights Reserved. * ******************************************************************************* */ #ifndef FMTABLEIMP_H #define FMTABLEIMP_H #include "number_decimalquantity.h" #if !UCONFIG_NO_FORMATTING U_NAMESPACE_BEGIN /** * Maximum int64_t value that can be stored in a double without chancing losing precision. * IEEE doubles have 53 bits of mantissa, 10 bits exponent, 1 bit sign. * IBM Mainframes have 56 bits of mantissa, 7 bits of base 16 exponent, 1 bit sign. * Define this constant to the smallest value from those for supported platforms. * @internal */ static const int64_t MAX_INT64_IN_DOUBLE = 0x001FFFFFFFFFFFFFLL; U_NAMESPACE_END #endif // #if !UCONFIG_NO_FORMATTING #endif stringi/src/icu74/i18n/units_complexconverter.cpp0000644000176200001440000002670014700200761021561 0ustar liggesusers// © 2020 and later: Unicode, Inc. and others. 
// License & terms of use: http://www.unicode.org/copyright.html #include "unicode/utypes.h" #if !UCONFIG_NO_FORMATTING #include #include "cmemory.h" #include "number_decimalquantity.h" #include "number_roundingutils.h" #include "putilimp.h" #include "uarrsort.h" #include "uassert.h" #include "unicode/fmtable.h" #include "unicode/localpointer.h" #include "unicode/measunit.h" #include "unicode/measure.h" #include "units_complexconverter.h" #include "units_converter.h" U_NAMESPACE_BEGIN namespace units { ComplexUnitsConverter::ComplexUnitsConverter(const MeasureUnitImpl &targetUnit, const ConversionRates &ratesInfo, UErrorCode &status) : units_(targetUnit.extractIndividualUnitsWithIndices(status)) { if (U_FAILURE(status)) { return; } U_ASSERT(units_.length() != 0); // Just borrowing a pointer to the instance MeasureUnitImpl *biggestUnit = &units_[0]->unitImpl; for (int32_t i = 1; i < units_.length(); i++) { if (UnitsConverter::compareTwoUnits(units_[i]->unitImpl, *biggestUnit, ratesInfo, status) > 0 && U_SUCCESS(status)) { biggestUnit = &units_[i]->unitImpl; } if (U_FAILURE(status)) { return; } } this->init(*biggestUnit, ratesInfo, status); } ComplexUnitsConverter::ComplexUnitsConverter(StringPiece inputUnitIdentifier, StringPiece outputUnitsIdentifier, UErrorCode &status) { if (U_FAILURE(status)) { return; } MeasureUnitImpl inputUnit = MeasureUnitImpl::forIdentifier(inputUnitIdentifier, status); MeasureUnitImpl outputUnits = MeasureUnitImpl::forIdentifier(outputUnitsIdentifier, status); this->units_ = outputUnits.extractIndividualUnitsWithIndices(status); U_ASSERT(units_.length() != 0); this->init(inputUnit, ConversionRates(status), status); } ComplexUnitsConverter::ComplexUnitsConverter(const MeasureUnitImpl &inputUnit, const MeasureUnitImpl &outputUnits, const ConversionRates &ratesInfo, UErrorCode &status) : units_(outputUnits.extractIndividualUnitsWithIndices(status)) { if (U_FAILURE(status)) { return; } U_ASSERT(units_.length() != 0); this->init(inputUnit, 
ratesInfo, status); } void ComplexUnitsConverter::init(const MeasureUnitImpl &inputUnit, const ConversionRates &ratesInfo, UErrorCode &status) { // Sorts units in descending order. Therefore, we return -1 if // the left is bigger than right and so on. auto descendingCompareUnits = [](const void *context, const void *left, const void *right) { UErrorCode status = U_ZERO_ERROR; const auto *leftPointer = static_cast(left); const auto *rightPointer = static_cast(right); // Multiply by -1 to sort in descending order return (-1) * UnitsConverter::compareTwoUnits((**leftPointer).unitImpl, // (**rightPointer).unitImpl, // *static_cast(context), // status); }; uprv_sortArray(units_.getAlias(), // units_.length(), // sizeof units_[0], /* NOTE: we have already asserted that the units_ is not empty.*/ // descendingCompareUnits, // &ratesInfo, // false, // &status // ); // In case the `outputUnits` are `UMEASURE_UNIT_MIXED` such as `foot+inch`. In this case we need more // converters to convert from the `inputUnit` to the first unit in the `outputUnits`. Then, a // converter from the first unit in the `outputUnits` to the second unit and so on. // For Example: // - inputUnit is `meter` // - outputUnits is `foot+inch` // - Therefore, we need to have two converters: // 1. a converter from `meter` to `foot` // 2. a converter from `foot` to `inch` // - Therefore, if the input is `2 meter`: // 1. convert `meter` to `foot` --> 2 meter to 6.56168 feet // 2. convert the residual of 6.56168 feet (0.56168) to inches, which will be (6.74016 // inches) // 3. 
then, the final result will be (6 feet and 6.74016 inches) for (int i = 0, n = units_.length(); i < n; i++) { if (i == 0) { // first element unitsConverters_.emplaceBackAndCheckErrorCode(status, inputUnit, units_[i]->unitImpl, ratesInfo, status); } else { unitsConverters_.emplaceBackAndCheckErrorCode(status, units_[i - 1]->unitImpl, units_[i]->unitImpl, ratesInfo, status); } if (U_FAILURE(status)) { return; } } } UBool ComplexUnitsConverter::greaterThanOrEqual(double quantity, double limit) const { U_ASSERT(unitsConverters_.length() > 0); // First converter converts to the biggest quantity. double newQuantity = unitsConverters_[0]->convert(quantity); return newQuantity >= limit; } MaybeStackVector ComplexUnitsConverter::convert(double quantity, icu::number::impl::RoundingImpl *rounder, UErrorCode &status) const { // TODO: return an error for "foot-and-foot"? MaybeStackVector result; int sign = 1; if (quantity < 0 && unitsConverters_.length() > 1) { quantity *= -1; sign = -1; } // For N converters: // - the first converter converts from the input unit to the largest unit, // - the following N-2 converters convert to bigger units for which we want integers, // - the Nth converter (index N-1) converts to the smallest unit, for which // we keep a double. MaybeStackArray intValues(unitsConverters_.length() - 1, status); if (U_FAILURE(status)) { return result; } uprv_memset(intValues.getAlias(), 0, (unitsConverters_.length() - 1) * sizeof(int64_t)); for (int i = 0, n = unitsConverters_.length(); i < n; ++i) { quantity = (*unitsConverters_[i]).convert(quantity); if (i < n - 1) { // If quantity is at the limits of double's precision from an // integer value, we take that integer value. int64_t flooredQuantity; if (uprv_isNaN(quantity)) { // With clang on Linux: floor does not support NaN, resulting in // a giant negative number. For now, we produce "0 feet, NaN // inches". TODO(icu-units#131): revisit desired output. 
flooredQuantity = 0; } else { flooredQuantity = static_cast(floor(quantity * (1 + DBL_EPSILON))); } intValues[i] = flooredQuantity; // Keep the residual of the quantity. // For example: `3.6 feet`, keep only `0.6 feet` double remainder = quantity - flooredQuantity; if (remainder < 0) { // Because we nudged flooredQuantity up by eps, remainder may be // negative: we must treat such a remainder as zero. quantity = 0; } else { quantity = remainder; } } } applyRounder(intValues, quantity, rounder, status); // Initialize empty result. We use a MaybeStackArray directly so we can // assign pointers - for this privilege we have to take care of cleanup. MaybeStackArray tmpResult(unitsConverters_.length(), status); if (U_FAILURE(status)) { return result; } // Package values into temporary Measure instances in tmpResult: for (int i = 0, n = unitsConverters_.length(); i < n; ++i) { if (i < n - 1) { Formattable formattableQuantity(intValues[i] * sign); // Measure takes ownership of the MeasureUnit* MeasureUnit *type = new MeasureUnit(units_[i]->unitImpl.copy(status).build(status)); tmpResult[units_[i]->index] = new Measure(formattableQuantity, type, status); } else { // LAST ELEMENT Formattable formattableQuantity(quantity * sign); // Measure takes ownership of the MeasureUnit* MeasureUnit *type = new MeasureUnit(units_[i]->unitImpl.copy(status).build(status)); tmpResult[units_[i]->index] = new Measure(formattableQuantity, type, status); } } // Transfer values into result and return: for(int32_t i = 0, n = unitsConverters_.length(); i < n; ++i) { U_ASSERT(tmpResult[i] != nullptr); result.emplaceBackAndCheckErrorCode(status, *tmpResult[i]); delete tmpResult[i]; } return result; } void ComplexUnitsConverter::applyRounder(MaybeStackArray &intValues, double &quantity, icu::number::impl::RoundingImpl *rounder, UErrorCode &status) const { if (uprv_isInfinite(quantity) || uprv_isNaN(quantity)) { // Inf and NaN can't be rounded, and calculating `carry` below is known // to fail on 
Gentoo on HPPA and OpenSUSE on riscv64. Nothing to do. return; } if (rounder == nullptr) { // Nothing to do for the quantity. return; } number::impl::DecimalQuantity decimalQuantity; decimalQuantity.setToDouble(quantity); rounder->apply(decimalQuantity, status); if (U_FAILURE(status)) { return; } quantity = decimalQuantity.toDouble(); int32_t lastIndex = unitsConverters_.length() - 1; if (lastIndex == 0) { // Only one element, no need to bubble up the carry return; } // Check if there's a carry, and bubble it back up the resulting intValues. int64_t carry = static_cast(floor(unitsConverters_[lastIndex]->convertInverse(quantity) * (1 + DBL_EPSILON))); if (carry <= 0) { return; } quantity -= unitsConverters_[lastIndex]->convert(static_cast(carry)); intValues[lastIndex - 1] += carry; // We don't use the first converter: that one is for the input unit for (int32_t j = lastIndex - 1; j > 0; j--) { carry = static_cast(floor(unitsConverters_[j]->convertInverse(static_cast(intValues[j])) * (1 + DBL_EPSILON))); if (carry <= 0) { return; } intValues[j] -= static_cast(round(unitsConverters_[j]->convert(static_cast(carry)))); intValues[j - 1] += carry; } } } // namespace units U_NAMESPACE_END #endif /* #if !UCONFIG_NO_FORMATTING */ stringi/src/icu74/i18n/number_compact.cpp0000644000176200001440000003363114700200761017737 0ustar liggesusers// © 2017 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html #include "unicode/utypes.h" #if !UCONFIG_NO_FORMATTING #include "unicode/ustring.h" #include "unicode/ures.h" #include "cstring.h" #include "charstr.h" #include "resource.h" #include "number_compact.h" #include "number_microprops.h" #include "uresimp.h" using namespace icu; using namespace icu::number; using namespace icu::number::impl; namespace { // A dummy object used when a "0" compact decimal entry is encountered. This is necessary // in order to prevent falling back to root. Object equality ("==") is intended. 
const char16_t *USE_FALLBACK = u""; /** Produces a string like "NumberElements/latn/patternsShort/decimalFormat". */ void getResourceBundleKey(const char *nsName, CompactStyle compactStyle, CompactType compactType, CharString &sb, UErrorCode &status) { sb.clear(); sb.append("NumberElements/", status); sb.append(nsName, status); sb.append(compactStyle == CompactStyle::UNUM_SHORT ? "/patternsShort" : "/patternsLong", status); sb.append(compactType == CompactType::TYPE_DECIMAL ? "/decimalFormat" : "/currencyFormat", status); } int32_t getIndex(int32_t magnitude, StandardPlural::Form plural) { return magnitude * StandardPlural::COUNT + plural; } int32_t countZeros(const char16_t *patternString, int32_t patternLength) { // NOTE: This strategy for computing the number of zeros is a hack for efficiency. // It could break if there are any 0s that aren't part of the main pattern. int32_t numZeros = 0; for (int32_t i = 0; i < patternLength; i++) { if (patternString[i] == u'0') { numZeros++; } else if (numZeros > 0) { break; // zeros should always be contiguous } } return numZeros; } } // namespace // NOTE: patterns and multipliers both get zero-initialized. CompactData::CompactData() : patterns(), multipliers(), largestMagnitude(0), isEmpty(true) { } void CompactData::populate(const Locale &locale, const char *nsName, CompactStyle compactStyle, CompactType compactType, UErrorCode &status) { CompactDataSink sink(*this); LocalUResourceBundlePointer rb(ures_open(nullptr, locale.getName(), &status)); if (U_FAILURE(status)) { return; } bool nsIsLatn = strcmp(nsName, "latn") == 0; bool compactIsShort = compactStyle == CompactStyle::UNUM_SHORT; // Fall back to latn numbering system and/or short compact style. 
CharString resourceKey; getResourceBundleKey(nsName, compactStyle, compactType, resourceKey, status); UErrorCode localStatus = U_ZERO_ERROR; ures_getAllItemsWithFallback(rb.getAlias(), resourceKey.data(), sink, localStatus); if (isEmpty && !nsIsLatn) { getResourceBundleKey("latn", compactStyle, compactType, resourceKey, status); localStatus = U_ZERO_ERROR; ures_getAllItemsWithFallback(rb.getAlias(), resourceKey.data(), sink, localStatus); } if (isEmpty && !compactIsShort) { getResourceBundleKey(nsName, CompactStyle::UNUM_SHORT, compactType, resourceKey, status); localStatus = U_ZERO_ERROR; ures_getAllItemsWithFallback(rb.getAlias(), resourceKey.data(), sink, localStatus); } if (isEmpty && !nsIsLatn && !compactIsShort) { getResourceBundleKey("latn", CompactStyle::UNUM_SHORT, compactType, resourceKey, status); localStatus = U_ZERO_ERROR; ures_getAllItemsWithFallback(rb.getAlias(), resourceKey.data(), sink, localStatus); } // The last fallback should be guaranteed to return data. if (isEmpty) { status = U_INTERNAL_PROGRAM_ERROR; } } int32_t CompactData::getMultiplier(int32_t magnitude) const { if (magnitude < 0) { return 0; } if (magnitude > largestMagnitude) { magnitude = largestMagnitude; } return multipliers[magnitude]; } const char16_t *CompactData::getPattern( int32_t magnitude, const PluralRules *rules, const DecimalQuantity &dq) const { if (magnitude < 0) { return nullptr; } if (magnitude > largestMagnitude) { magnitude = largestMagnitude; } const char16_t *patternString = nullptr; if (dq.hasIntegerValue()) { int64_t i = dq.toLong(true); if (i == 0) { patternString = patterns[getIndex(magnitude, StandardPlural::Form::EQ_0)]; } else if (i == 1) { patternString = patterns[getIndex(magnitude, StandardPlural::Form::EQ_1)]; } if (patternString != nullptr) { return patternString; } } StandardPlural::Form plural = utils::getStandardPlural(rules, dq); patternString = patterns[getIndex(magnitude, plural)]; if (patternString == nullptr && plural != StandardPlural::OTHER) 
{ // Fall back to "other" plural variant patternString = patterns[getIndex(magnitude, StandardPlural::OTHER)]; } if (patternString == USE_FALLBACK) { // == is intended // Return null if USE_FALLBACK is present patternString = nullptr; } return patternString; } void CompactData::getUniquePatterns(UVector &output, UErrorCode &status) const { U_ASSERT(output.isEmpty()); // NOTE: In C++, this is done more manually with a UVector. // In Java, we can take advantage of JDK HashSet. for (auto pattern : patterns) { if (pattern == nullptr || pattern == USE_FALLBACK) { continue; } // Insert pattern into the UVector if the UVector does not already contain the pattern. // Search the UVector from the end since identical patterns are likely to be adjacent. for (int32_t i = output.size() - 1; i >= 0; i--) { if (u_strcmp(pattern, static_cast(output[i])) == 0) { goto continue_outer; } } // The string was not found; add it to the UVector. // Note: must cast off const from pattern to store it in a UVector, which expects (void *) output.addElement(const_cast(pattern), status); continue_outer: continue; } } void CompactData::CompactDataSink::put(const char *key, ResourceValue &value, UBool /*noFallback*/, UErrorCode &status) { // traverse into the table of powers of ten ResourceTable powersOfTenTable = value.getTable(status); if (U_FAILURE(status)) { return; } for (int i3 = 0; powersOfTenTable.getKeyAndValue(i3, key, value); ++i3) { // Assumes that the keys are always of the form "10000" where the magnitude is the // length of the key minus one. We only support magnitudes less than COMPACT_MAX_DIGITS; // ignore entries that have greater magnitude. 
auto magnitude = static_cast (strlen(key) - 1); U_ASSERT(magnitude < COMPACT_MAX_DIGITS); // debug assert if (magnitude >= COMPACT_MAX_DIGITS) { // skip in production continue; } int8_t multiplier = data.multipliers[magnitude]; // Iterate over the plural variants ("one", "other", etc) ResourceTable pluralVariantsTable = value.getTable(status); if (U_FAILURE(status)) { return; } for (int i4 = 0; pluralVariantsTable.getKeyAndValue(i4, key, value); ++i4) { // Skip this magnitude/plural if we already have it from a child locale. // Note: This also skips USE_FALLBACK entries. StandardPlural::Form plural = StandardPlural::fromString(key, status); if (U_FAILURE(status)) { return; } if (data.patterns[getIndex(magnitude, plural)] != nullptr) { continue; } // The value "0" means that we need to use the default pattern and not fall back // to parent locales. Example locale where this is relevant: 'it'. int32_t patternLength; const char16_t *patternString = value.getString(patternLength, status); if (U_FAILURE(status)) { return; } if (u_strcmp(patternString, u"0") == 0) { patternString = USE_FALLBACK; patternLength = 0; } // Save the pattern string. We will parse it lazily. data.patterns[getIndex(magnitude, plural)] = patternString; // If necessary, compute the multiplier: the difference between the magnitude // and the number of zeros in the pattern. if (multiplier == 0) { int32_t numZeros = countZeros(patternString, patternLength); if (numZeros > 0) { // numZeros==0 in certain cases, like Somali "Kun" multiplier = static_cast (numZeros - magnitude - 1); } } } // Save the multiplier. 
if (data.multipliers[magnitude] == 0) { data.multipliers[magnitude] = multiplier; if (magnitude > data.largestMagnitude) { data.largestMagnitude = magnitude; } data.isEmpty = false; } else { U_ASSERT(data.multipliers[magnitude] == multiplier); } } } /////////////////////////////////////////////////////////// /// END OF CompactData.java; BEGIN CompactNotation.java /// /////////////////////////////////////////////////////////// CompactHandler::CompactHandler( CompactStyle compactStyle, const Locale &locale, const char *nsName, CompactType compactType, const PluralRules *rules, MutablePatternModifier *buildReference, bool safe, const MicroPropsGenerator *parent, UErrorCode &status) : rules(rules), parent(parent), safe(safe) { data.populate(locale, nsName, compactStyle, compactType, status); if (safe) { // Safe code path precomputeAllModifiers(*buildReference, status); } else { // Unsafe code path // Store the MutablePatternModifier reference. unsafePatternModifier = buildReference; } } CompactHandler::~CompactHandler() { for (int32_t i = 0; i < precomputedModsLength; i++) { delete precomputedMods[i].mod; } } void CompactHandler::precomputeAllModifiers(MutablePatternModifier &buildReference, UErrorCode &status) { if (U_FAILURE(status)) { return; } // Initial capacity of 12 for 0K, 00K, 000K, ...M, ...B, and ...T UVector allPatterns(12, status); if (U_FAILURE(status)) { return; } data.getUniquePatterns(allPatterns, status); if (U_FAILURE(status)) { return; } // C++ only: ensure that precomputedMods has room. 
precomputedModsLength = allPatterns.size(); if (precomputedMods.getCapacity() < precomputedModsLength) { precomputedMods.resize(allPatterns.size(), status); if (U_FAILURE(status)) { return; } } for (int32_t i = 0; i < precomputedModsLength; i++) { auto patternString = static_cast(allPatterns[i]); UnicodeString hello(patternString); CompactModInfo &info = precomputedMods[i]; ParsedPatternInfo patternInfo; PatternParser::parseToPatternInfo(UnicodeString(patternString), patternInfo, status); if (U_FAILURE(status)) { return; } buildReference.setPatternInfo(&patternInfo, {UFIELD_CATEGORY_NUMBER, UNUM_COMPACT_FIELD}); info.mod = buildReference.createImmutable(status); if (U_FAILURE(status)) { return; } info.patternString = patternString; } } void CompactHandler::processQuantity(DecimalQuantity &quantity, MicroProps µs, UErrorCode &status) const { parent->processQuantity(quantity, micros, status); if (U_FAILURE(status)) { return; } // Treat zero, NaN, and infinity as if they had magnitude 0 int32_t magnitude; int32_t multiplier = 0; if (quantity.isZeroish()) { magnitude = 0; micros.rounder.apply(quantity, status); } else { // TODO: Revisit chooseMultiplierAndApply multiplier = micros.rounder.chooseMultiplierAndApply(quantity, data, status); magnitude = quantity.isZeroish() ? 0 : quantity.getMagnitude(); magnitude -= multiplier; } const char16_t *patternString = data.getPattern(magnitude, rules, quantity); if (patternString == nullptr) { // Use the default (non-compact) modifier. // No need to take any action. } else if (safe) { // Safe code path. // Java uses a hash set here for O(1) lookup. C++ uses a linear search. // TODO: Benchmark this and maybe change to a binary search or hash table. int32_t i = 0; for (; i < precomputedModsLength; i++) { const CompactModInfo &info = precomputedMods[i]; if (u_strcmp(patternString, info.patternString) == 0) { info.mod->applyToMicros(micros, quantity, status); break; } } // It should be guaranteed that we found the entry. 
U_ASSERT(i < precomputedModsLength); } else { // Unsafe code path. // Overwrite the PatternInfo in the existing modMiddle. // C++ Note: Use unsafePatternInfo for proper lifecycle. ParsedPatternInfo &patternInfo = const_cast(this)->unsafePatternInfo; PatternParser::parseToPatternInfo(UnicodeString(patternString), patternInfo, status); unsafePatternModifier->setPatternInfo( &unsafePatternInfo, {UFIELD_CATEGORY_NUMBER, UNUM_COMPACT_FIELD}); unsafePatternModifier->setNumberProperties(quantity.signum(), StandardPlural::Form::COUNT); micros.modMiddle = unsafePatternModifier; } // Change the exponent only after we select appropriate plural form // for formatting purposes so that we preserve expected formatted // string behavior. quantity.adjustExponent(-1 * multiplier); // We already performed rounding. Do not perform it again. micros.rounder = RoundingImpl::passThrough(); } #endif /* #if !UCONFIG_NO_FORMATTING */ stringi/src/icu74/i18n/inputext.cpp0000644000176200001440000001063414700200761016617 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ********************************************************************** * Copyright (C) 2005-2016, International Business Machines * Corporation and others. All Rights Reserved. ********************************************************************** */ #include "unicode/utypes.h" #if !UCONFIG_NO_CONVERSION #include "inputext.h" #include "cmemory.h" #include "cstring.h" #include U_NAMESPACE_BEGIN #define BUFFER_SIZE 8192 #define NEW_ARRAY(type,count) (type *) uprv_malloc((count) * sizeof(type)) #define DELETE_ARRAY(array) uprv_free((void *) (array)) InputText::InputText(UErrorCode &status) : fInputBytes(NEW_ARRAY(uint8_t, BUFFER_SIZE)), // The text to be checked. Markup will have been // removed if appropriate. fByteStats(NEW_ARRAY(int16_t, 256)), // byte frequency statistics for the input text. // Value is percent, not absolute. 
fDeclaredEncoding(0), fRawInput(0), fRawLength(0) { if (fInputBytes == nullptr || fByteStats == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; } } InputText::~InputText() { DELETE_ARRAY(fDeclaredEncoding); DELETE_ARRAY(fByteStats); DELETE_ARRAY(fInputBytes); } void InputText::setText(const char *in, int32_t len) { fInputLen = 0; fC1Bytes = false; fRawInput = (const uint8_t *) in; fRawLength = len == -1? (int32_t)uprv_strlen(in) : len; } void InputText::setDeclaredEncoding(const char* encoding, int32_t len) { if(encoding) { if (len == -1) { len = (int32_t)uprv_strlen(encoding); } len += 1; // to make place for the \0 at the end. uprv_free(fDeclaredEncoding); fDeclaredEncoding = NEW_ARRAY(char, len); uprv_strncpy(fDeclaredEncoding, encoding, len); } } UBool InputText::isSet() const { return fRawInput != nullptr; } /** * MungeInput - after getting a set of raw input data to be analyzed, preprocess * it by removing what appears to be html markup. * * @internal */ void InputText::MungeInput(UBool fStripTags) { int srci = 0; int dsti = 0; uint8_t b; bool inMarkup = false; int32_t openTags = 0; int32_t badTags = 0; // // html / xml markup stripping. // quick and dirty, not 100% accurate, but hopefully good enough, statistically. // discard everything within < brackets > // Count how many total '<' and illegal (nested) '<' occur, so we can make some // guess as to whether the input was actually marked up at all. // TODO: Think about how this interacts with EBCDIC charsets that are detected. if (fStripTags) { for (srci = 0; srci < fRawLength && dsti < BUFFER_SIZE; srci += 1) { b = fRawInput[srci]; if (b == (uint8_t)0x3C) { /* Check for the ASCII '<' */ if (inMarkup) { badTags += 1; } inMarkup = true; openTags += 1; } if (! 
inMarkup) { fInputBytes[dsti++] = b; } if (b == (uint8_t)0x3E) { /* Check for the ASCII '>' */ inMarkup = false; } } fInputLen = dsti; } // // If it looks like this input wasn't marked up, or if it looks like it's // essentially nothing but markup abandon the markup stripping. // Detection will have to work on the unstripped input. // if (openTags<5 || openTags/5 < badTags || (fInputLen < 100 && fRawLength>600)) { int32_t limit = fRawLength; if (limit > BUFFER_SIZE) { limit = BUFFER_SIZE; } for (srci=0; srciname == _name; } void format(int64_t number, UnicodeString& toAppendTo, int32_t pos, int32_t recursionCount, UErrorCode& status) const; void format(double number, UnicodeString& toAppendTo, int32_t pos, int32_t recursionCount, UErrorCode& status) const; UBool parse(const UnicodeString& text, ParsePosition& pos, double upperBound, uint32_t nonNumericalExecutedRuleMask, Formattable& result) const; void appendRules(UnicodeString& result) const; // toString void setDecimalFormatSymbols(const DecimalFormatSymbols &newSymbols, UErrorCode& status); const RuleBasedNumberFormat *getOwner() const { return owner; } private: const NFRule * findNormalRule(int64_t number) const; const NFRule * findDoubleRule(double number) const; const NFRule * findFractionRuleSetRule(double number) const; friend class NFSubstitution; private: UnicodeString name; NFRuleList rules; NFRule *nonNumericalRules[6]; RuleBasedNumberFormat *owner; NFRuleList fractionRules; UBool fIsFractionRuleSet; UBool fIsPublic; UBool fIsParseable; NFRuleSet(const NFRuleSet &other); // forbid copying of this class NFRuleSet &operator=(const NFRuleSet &other); // forbid copying of this class }; // utilities from old llong.h // convert mantissa portion of double to int64 int64_t util64_fromDouble(double d); // raise radix to the power exponent, only non-negative exponents // Arithmetic is performed in unsigned space since overflow in // signed space is undefined behavior. 
uint64_t util64_pow(uint32_t radix, uint16_t exponent); // convert n to digit string in buffer, return length of string uint32_t util64_tou(int64_t n, char16_t* buffer, uint32_t buflen, uint32_t radix = 10, UBool raw = false); #ifdef RBNF_DEBUG int64_t util64_utoi(const char16_t* str, uint32_t radix = 10); uint32_t util64_toa(int64_t n, char* buffer, uint32_t buflen, uint32_t radix = 10, UBool raw = false); int64_t util64_atoi(const char* str, uint32_t radix); #endif U_NAMESPACE_END /* U_HAVE_RBNF */ #endif // NFRS_H #endif stringi/src/icu74/i18n/numparse_impl.cpp0000644000176200001440000003317414700200761017616 0ustar liggesusers// © 2018 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html #include "unicode/utypes.h" #if !UCONFIG_NO_FORMATTING // Allow implicit conversion from char16_t* to UnicodeString for this file: // Helpful in toString methods and elsewhere. #define UNISTR_FROM_STRING_EXPLICIT #include #include #include "number_types.h" #include "number_patternstring.h" #include "numparse_types.h" #include "numparse_impl.h" #include "numparse_symbols.h" #include "numparse_decimal.h" #include "unicode/numberformatter.h" #include "cstr.h" #include "number_mapper.h" #include "static_unicode_sets.h" using namespace icu; using namespace icu::number; using namespace icu::number::impl; using namespace icu::numparse; using namespace icu::numparse::impl; NumberParseMatcher::~NumberParseMatcher() = default; NumberParserImpl* NumberParserImpl::createSimpleParser(const Locale& locale, const UnicodeString& patternString, parse_flags_t parseFlags, UErrorCode& status) { LocalPointer parser(new NumberParserImpl(parseFlags)); DecimalFormatSymbols symbols(locale, status); parser->fLocalMatchers.ignorables = {parseFlags}; IgnorablesMatcher& ignorables = parser->fLocalMatchers.ignorables; DecimalFormatSymbols dfs(locale, status); dfs.setSymbol(DecimalFormatSymbols::kCurrencySymbol, u"IU$"); 
dfs.setSymbol(DecimalFormatSymbols::kIntlCurrencySymbol, u"ICU"); CurrencySymbols currencySymbols({u"ICU", status}, locale, dfs, status); ParsedPatternInfo patternInfo; PatternParser::parseToPatternInfo(patternString, patternInfo, status); // The following statements set up the affix matchers. AffixTokenMatcherSetupData affixSetupData = { currencySymbols, symbols, ignorables, locale, parseFlags}; parser->fLocalMatchers.affixTokenMatcherWarehouse = {&affixSetupData}; parser->fLocalMatchers.affixMatcherWarehouse = {&parser->fLocalMatchers.affixTokenMatcherWarehouse}; parser->fLocalMatchers.affixMatcherWarehouse.createAffixMatchers( patternInfo, *parser, ignorables, parseFlags, status); Grouper grouper = Grouper::forStrategy(UNUM_GROUPING_AUTO); grouper.setLocaleData(patternInfo, locale); parser->addMatcher(parser->fLocalMatchers.ignorables); parser->addMatcher(parser->fLocalMatchers.decimal = {symbols, grouper, parseFlags}); parser->addMatcher(parser->fLocalMatchers.minusSign = {symbols, false}); parser->addMatcher(parser->fLocalMatchers.plusSign = {symbols, false}); parser->addMatcher(parser->fLocalMatchers.percent = {symbols}); parser->addMatcher(parser->fLocalMatchers.permille = {symbols}); parser->addMatcher(parser->fLocalMatchers.nan = {symbols}); parser->addMatcher(parser->fLocalMatchers.infinity = {symbols}); parser->addMatcher(parser->fLocalMatchers.padding = {u"@"}); parser->addMatcher(parser->fLocalMatchers.scientific = {symbols, grouper}); parser->addMatcher(parser->fLocalMatchers.currency = {currencySymbols, symbols, parseFlags, status}); parser->addMatcher(parser->fLocalValidators.number = {}); parser->freeze(); return parser.orphan(); } NumberParserImpl* NumberParserImpl::createParserFromProperties(const number::impl::DecimalFormatProperties& properties, const DecimalFormatSymbols& symbols, bool parseCurrency, UErrorCode& status) { Locale locale = symbols.getLocale(); AutoAffixPatternProvider affixProvider(properties, status); if (U_FAILURE(status)) { 
return nullptr; } CurrencyUnit currency = resolveCurrency(properties, locale, status); CurrencySymbols currencySymbols(currency, locale, symbols, status); bool isStrict = properties.parseMode.getOrDefault(PARSE_MODE_STRICT) == PARSE_MODE_STRICT; Grouper grouper = Grouper::forProperties(properties); int parseFlags = 0; if (U_FAILURE(status)) { return nullptr; } if (!properties.parseCaseSensitive) { parseFlags |= PARSE_FLAG_IGNORE_CASE; } if (properties.parseIntegerOnly) { parseFlags |= PARSE_FLAG_INTEGER_ONLY; } if (properties.signAlwaysShown) { parseFlags |= PARSE_FLAG_PLUS_SIGN_ALLOWED; } if (isStrict) { parseFlags |= PARSE_FLAG_STRICT_GROUPING_SIZE; parseFlags |= PARSE_FLAG_STRICT_SEPARATORS; parseFlags |= PARSE_FLAG_USE_FULL_AFFIXES; parseFlags |= PARSE_FLAG_EXACT_AFFIX; parseFlags |= PARSE_FLAG_STRICT_IGNORABLES; } else { parseFlags |= PARSE_FLAG_INCLUDE_UNPAIRED_AFFIXES; } if (grouper.getPrimary() <= 0) { parseFlags |= PARSE_FLAG_GROUPING_DISABLED; } if (parseCurrency || affixProvider.get().hasCurrencySign()) { parseFlags |= PARSE_FLAG_MONETARY_SEPARATORS; } if (!parseCurrency) { parseFlags |= PARSE_FLAG_NO_FOREIGN_CURRENCY; } LocalPointer parser(new NumberParserImpl(parseFlags)); parser->fLocalMatchers.ignorables = {parseFlags}; IgnorablesMatcher& ignorables = parser->fLocalMatchers.ignorables; ////////////////////// /// AFFIX MATCHERS /// ////////////////////// // The following statements set up the affix matchers. 
AffixTokenMatcherSetupData affixSetupData = { currencySymbols, symbols, ignorables, locale, parseFlags}; parser->fLocalMatchers.affixTokenMatcherWarehouse = {&affixSetupData}; parser->fLocalMatchers.affixMatcherWarehouse = {&parser->fLocalMatchers.affixTokenMatcherWarehouse}; parser->fLocalMatchers.affixMatcherWarehouse.createAffixMatchers( affixProvider.get(), *parser, ignorables, parseFlags, status); //////////////////////// /// CURRENCY MATCHER /// //////////////////////// if (parseCurrency || affixProvider.get().hasCurrencySign()) { parser->addMatcher(parser->fLocalMatchers.currency = {currencySymbols, symbols, parseFlags, status}); } /////////////// /// PERCENT /// /////////////// // ICU-TC meeting, April 11, 2018: accept percent/permille only if it is in the pattern, // and to maintain regressive behavior, divide by 100 even if no percent sign is present. if (!isStrict && affixProvider.get().containsSymbolType(AffixPatternType::TYPE_PERCENT, status)) { parser->addMatcher(parser->fLocalMatchers.percent = {symbols}); } if (!isStrict && affixProvider.get().containsSymbolType(AffixPatternType::TYPE_PERMILLE, status)) { parser->addMatcher(parser->fLocalMatchers.permille = {symbols}); } /////////////////////////////// /// OTHER STANDARD MATCHERS /// /////////////////////////////// if (!isStrict) { parser->addMatcher(parser->fLocalMatchers.plusSign = {symbols, false}); parser->addMatcher(parser->fLocalMatchers.minusSign = {symbols, false}); } parser->addMatcher(parser->fLocalMatchers.nan = {symbols}); parser->addMatcher(parser->fLocalMatchers.infinity = {symbols}); UnicodeString padString = properties.padString; if (!padString.isBogus() && !ignorables.getSet()->contains(padString)) { parser->addMatcher(parser->fLocalMatchers.padding = {padString}); } parser->addMatcher(parser->fLocalMatchers.ignorables); parser->addMatcher(parser->fLocalMatchers.decimal = {symbols, grouper, parseFlags}); // NOTE: parseNoExponent doesn't disable scientific parsing if we have a 
scientific formatter if (!properties.parseNoExponent || properties.minimumExponentDigits > 0) { parser->addMatcher(parser->fLocalMatchers.scientific = {symbols, grouper}); } ////////////////// /// VALIDATORS /// ////////////////// parser->addMatcher(parser->fLocalValidators.number = {}); if (isStrict) { parser->addMatcher(parser->fLocalValidators.affix = {}); } if (parseCurrency) { parser->addMatcher(parser->fLocalValidators.currency = {}); } if (properties.decimalPatternMatchRequired) { bool patternHasDecimalSeparator = properties.decimalSeparatorAlwaysShown || properties.maximumFractionDigits != 0; parser->addMatcher(parser->fLocalValidators.decimalSeparator = {patternHasDecimalSeparator}); } // The multiplier takes care of scaling percentages. Scale multiplier = scaleFromProperties(properties); if (multiplier.isValid()) { parser->addMatcher(parser->fLocalValidators.multiplier = {multiplier}); } parser->freeze(); return parser.orphan(); } NumberParserImpl::NumberParserImpl(parse_flags_t parseFlags) : fParseFlags(parseFlags) { } NumberParserImpl::~NumberParserImpl() { fNumMatchers = 0; } void NumberParserImpl::addMatcher(NumberParseMatcher& matcher) { if (fNumMatchers + 1 > fMatchers.getCapacity()) { fMatchers.resize(fNumMatchers * 2, fNumMatchers); } fMatchers[fNumMatchers] = &matcher; fNumMatchers++; } void NumberParserImpl::freeze() { fFrozen = true; } parse_flags_t NumberParserImpl::getParseFlags() const { return fParseFlags; } void NumberParserImpl::parse(const UnicodeString& input, bool greedy, ParsedNumber& result, UErrorCode& status) const { return parse(input, 0, greedy, result, status); } void NumberParserImpl::parse(const UnicodeString& input, int32_t start, bool greedy, ParsedNumber& result, UErrorCode& status) const { if (U_FAILURE(status)) { return; } U_ASSERT(fFrozen); // TODO: Check start >= 0 and start < input.length() StringSegment segment(input, 0 != (fParseFlags & PARSE_FLAG_IGNORE_CASE)); segment.adjustOffset(start); if (greedy) { 
parseGreedy(segment, result, status); } else if (0 != (fParseFlags & PARSE_FLAG_ALLOW_INFINITE_RECURSION)) { // Start at 1 so that recursionLevels never gets to 0 parseLongestRecursive(segment, result, 1, status); } else { // Arbitrary recursion safety limit: 100 levels. parseLongestRecursive(segment, result, -100, status); } for (int32_t i = 0; i < fNumMatchers; i++) { fMatchers[i]->postProcess(result); } result.postProcess(); } void NumberParserImpl::parseGreedy(StringSegment& segment, ParsedNumber& result, UErrorCode& status) const { // Note: this method is not recursive in order to avoid stack overflow. for (int i = 0; i smokeTest(segment)) { // Matcher failed smoke test: try the next one i++; continue; } int32_t initialOffset = segment.getOffset(); matcher->match(segment, result, status); if (U_FAILURE(status)) { return; } if (segment.getOffset() != initialOffset) { // Greedy heuristic: accept the match and loop back i = 0; continue; } else { // Matcher did not match: try the next one i++; continue; } UPRV_UNREACHABLE_EXIT; } // NOTE: If we get here, the greedy parse completed without consuming the entire string. } void NumberParserImpl::parseLongestRecursive(StringSegment& segment, ParsedNumber& result, int32_t recursionLevels, UErrorCode& status) const { // Base Case if (segment.length() == 0) { return; } // Safety against stack overflow if (recursionLevels == 0) { return; } // TODO: Give a nice way for the matcher to reset the ParsedNumber? ParsedNumber initial(result); ParsedNumber candidate; int initialOffset = segment.getOffset(); for (int32_t i = 0; i < fNumMatchers; i++) { const NumberParseMatcher* matcher = fMatchers[i]; if (!matcher->smokeTest(segment)) { continue; } // In a non-greedy parse, we attempt all possible matches and pick the best. for (int32_t charsToConsume = 0; charsToConsume < segment.length();) { charsToConsume += U16_LENGTH(segment.codePointAt(charsToConsume)); // Run the matcher on a segment of the current length. 
candidate = initial; segment.setLength(charsToConsume); bool maybeMore = matcher->match(segment, candidate, status); segment.resetLength(); if (U_FAILURE(status)) { return; } // If the entire segment was consumed, recurse. if (segment.getOffset() - initialOffset == charsToConsume) { parseLongestRecursive(segment, candidate, recursionLevels + 1, status); if (U_FAILURE(status)) { return; } if (candidate.isBetterThan(result)) { result = candidate; } } // Since the segment can be re-used, reset the offset. // This does not have an effect if the matcher did not consume any chars. segment.setOffset(initialOffset); // Unless the matcher wants to see the next char, continue to the next matcher. if (!maybeMore) { break; } } } } UnicodeString NumberParserImpl::toString() const { UnicodeString result(u"toString()); } result.append(u" ]>", -1); return result; } #endif /* #if !UCONFIG_NO_FORMATTING */ stringi/src/icu74/i18n/uspoof_conf.h0000644000176200001440000001134514700200761016724 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ****************************************************************************** * * Copyright (C) 2008-2016, International Business Machines * Corporation and others. All Rights Reserved. * ****************************************************************************** * file name: uspoof_conf.h * encoding: UTF-8 * tab size: 8 (not used) * indentation:4 * * created on: 2009Jan05 * created by: Andy Heninger * * Internal classes for compiling confusable data into its binary (runtime) form. 
*/ #ifndef __USPOOF_BUILDCONF_H__ #define __USPOOF_BUILDCONF_H__ #include "unicode/utypes.h" #if !UCONFIG_NO_NORMALIZATION #if !UCONFIG_NO_REGULAR_EXPRESSIONS #include "unicode/uregex.h" #include "uhash.h" #include "uspoof_impl.h" U_NAMESPACE_BEGIN // SPUString // Holds a string that is the result of one of the mappings defined // by the confusable mapping data (confusables.txt from Unicode.org) // Instances of SPUString exist during the compilation process only. struct SPUString : public UMemory { LocalPointer fStr; // The actual string. int32_t fCharOrStrTableIndex; // Index into the final runtime data for this // string (or, for length 1, the single string char // itself, there being no string table entry for it.) SPUString(LocalPointer s); ~SPUString(); }; // String Pool A utility class for holding the strings that are the result of // the spoof mappings. These strings will utimately end up in the // run-time String Table. // This is sort of like a sorted set of strings, except that ICU's anemic // built-in collections don't support those, so it is implemented with a // combination of a uhash and a UVector. class SPUStringPool : public UMemory { public: SPUStringPool(UErrorCode &status); ~SPUStringPool(); // Add a string. Return the string from the table. // If the input parameter string is already in the table, delete the // input parameter and return the existing string. SPUString *addString(UnicodeString *src, UErrorCode &status); // Get the n-th string in the collection. SPUString *getByIndex(int32_t i); // Sort the contents; affects the ordering of getByIndex(). void sort(UErrorCode &status); int32_t size(); private: UVector *fVec; // Elements are SPUString * UHashtable *fHash; // Key: UnicodeString Value: SPUString }; // class ConfusabledataBuilder // An instance of this class exists while the confusable data is being built from source. // It encapsulates the intermediate data structures that are used for building. 
// It exports one static function, to do a confusable data build. class ConfusabledataBuilder : public UMemory { private: SpoofImpl *fSpoofImpl; char16_t *fInput; UHashtable *fTable; UnicodeSet *fKeySet; // A set of all keys (UChar32s) that go into the four mapping tables. // The binary data is first assembled into the following four collections, then // copied to its final raw-memory destination. UVector *fKeyVec; UVector *fValueVec; UnicodeString *fStringTable; SPUStringPool *stringPool; URegularExpression *fParseLine; URegularExpression *fParseHexNum; int32_t fLineNum; ConfusabledataBuilder(SpoofImpl *spImpl, UErrorCode &status); ~ConfusabledataBuilder(); void build(const char * confusables, int32_t confusablesLen, UErrorCode &status); // Add an entry to the key and value tables being built // input: data from SLTable, MATable, etc. // output: entry added to fKeyVec and fValueVec void addKeyEntry(UChar32 keyChar, // The key character UHashtable *table, // The table, one of SATable, MATable, etc. int32_t tableFlag, // One of USPOOF_SA_TABLE_FLAG, etc. UErrorCode &status); // From an index into fKeyVec & fValueVec // get a UnicodeString with the corresponding mapping. UnicodeString getMapping(int32_t index); // Populate the final binary output data array with the compiled data. void outputData(UErrorCode &status); public: static void buildConfusableData(SpoofImpl *spImpl, const char * confusables, int32_t confusablesLen, int32_t *errorType, UParseError *pe, UErrorCode &status); }; U_NAMESPACE_END #endif #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS #endif // __USPOOF_BUILDCONF_H__ stringi/src/icu74/i18n/collationtailoring.cpp0000644000176200001440000000656614700200761020645 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ******************************************************************************* * Copyright (C) 2013-2015, International Business Machines * Corporation and others. 
All Rights Reserved. ******************************************************************************* * collationtailoring.cpp * * created on: 2013mar12 * created by: Markus W. Scherer */ #include "unicode/utypes.h" #if !UCONFIG_NO_COLLATION #include "unicode/udata.h" #include "unicode/unistr.h" #include "unicode/ures.h" #include "unicode/uversion.h" #include "unicode/uvernum.h" #include "cmemory.h" #include "collationdata.h" #include "collationsettings.h" #include "collationtailoring.h" #include "normalizer2impl.h" #include "uassert.h" #include "uhash.h" #include "umutex.h" #include "utrie2.h" U_NAMESPACE_BEGIN CollationTailoring::CollationTailoring(const CollationSettings *baseSettings) : data(nullptr), settings(baseSettings), actualLocale(""), ownedData(nullptr), builder(nullptr), memory(nullptr), bundle(nullptr), trie(nullptr), unsafeBackwardSet(nullptr), maxExpansions(nullptr) { if(baseSettings != nullptr) { U_ASSERT(baseSettings->reorderCodesLength == 0); U_ASSERT(baseSettings->reorderTable == nullptr); U_ASSERT(baseSettings->minHighNoReorder == 0); } else { settings = new CollationSettings(); } if(settings != nullptr) { settings->addRef(); } rules.getTerminatedBuffer(); // ensure NUL-termination version[0] = version[1] = version[2] = version[3] = 0; maxExpansionsInitOnce.reset(); } CollationTailoring::~CollationTailoring() { SharedObject::clearPtr(settings); delete ownedData; delete builder; udata_close(memory); ures_close(bundle); utrie2_close(trie); delete unsafeBackwardSet; uhash_close(maxExpansions); maxExpansionsInitOnce.reset(); } UBool CollationTailoring::ensureOwnedData(UErrorCode &errorCode) { if(U_FAILURE(errorCode)) { return false; } if(ownedData == nullptr) { const Normalizer2Impl *nfcImpl = Normalizer2Factory::getNFCImpl(errorCode); if(U_FAILURE(errorCode)) { return false; } ownedData = new CollationData(*nfcImpl); if(ownedData == nullptr) { errorCode = U_MEMORY_ALLOCATION_ERROR; return false; } } data = ownedData; return true; } void 
CollationTailoring::makeBaseVersion(const UVersionInfo ucaVersion, UVersionInfo version) { version[0] = UCOL_BUILDER_VERSION; version[1] = (ucaVersion[0] << 3) + ucaVersion[1]; version[2] = ucaVersion[2] << 6; version[3] = 0; } void CollationTailoring::setVersion(const UVersionInfo baseVersion, const UVersionInfo rulesVersion) { version[0] = UCOL_BUILDER_VERSION; version[1] = baseVersion[1]; version[2] = (baseVersion[2] & 0xc0) + ((rulesVersion[0] + (rulesVersion[0] >> 6)) & 0x3f); version[3] = (rulesVersion[1] << 3) + (rulesVersion[1] >> 5) + rulesVersion[2] + (rulesVersion[3] << 4) + (rulesVersion[3] >> 4); } int32_t CollationTailoring::getUCAVersion() const { return ((int32_t)version[1] << 4) | (version[2] >> 6); } CollationCacheEntry::~CollationCacheEntry() { SharedObject::clearPtr(tailoring); } U_NAMESPACE_END #endif // !UCONFIG_NO_COLLATION stringi/src/icu74/i18n/number_decimfmtprops.h0000644000176200001440000001436614700200761020636 0ustar liggesusers// © 2017 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html #include "unicode/utypes.h" #if !UCONFIG_NO_FORMATTING #ifndef __NUMBER_DECIMFMTPROPS_H__ #define __NUMBER_DECIMFMTPROPS_H__ #include "unicode/unistr.h" #include #include "unicode/plurrule.h" #include "unicode/currpinf.h" #include "unicode/unum.h" #include "unicode/localpointer.h" #include "number_types.h" U_NAMESPACE_BEGIN // Export an explicit template instantiation of the LocalPointer that is used as a // data member of CurrencyPluralInfoWrapper. // (When building DLLs for Windows this is required.) 
#if U_PF_WINDOWS <= U_PLATFORM && U_PLATFORM <= U_PF_CYGWIN #if defined(_MSC_VER) // Ignore warning 4661 as LocalPointerBase does not use operator== or operator!= #pragma warning(push) #pragma warning(disable: 4661) #endif template class U_I18N_API LocalPointerBase; template class U_I18N_API LocalPointer; #if defined(_MSC_VER) #pragma warning(pop) #endif #endif namespace number { namespace impl { // Exported as U_I18N_API because it is a public member field of exported DecimalFormatProperties // Using this wrapper is rather unfortunate, but is needed on Windows platforms in order to allow // for DLL-exporting a fully specified template instantiation. class U_I18N_API CurrencyPluralInfoWrapper { public: LocalPointer fPtr; CurrencyPluralInfoWrapper() = default; CurrencyPluralInfoWrapper(const CurrencyPluralInfoWrapper& other) { if (!other.fPtr.isNull()) { fPtr.adoptInstead(new CurrencyPluralInfo(*other.fPtr)); } } CurrencyPluralInfoWrapper& operator=(const CurrencyPluralInfoWrapper& other) { if (this != &other && // self-assignment: no-op !other.fPtr.isNull()) { fPtr.adoptInstead(new CurrencyPluralInfo(*other.fPtr)); } return *this; } }; /** Controls the set of rules for parsing a string from the old DecimalFormat API. */ enum ParseMode { /** * Lenient mode should be used if you want to accept malformed user input. It will use heuristics * to attempt to parse through typographical errors in the string. */ PARSE_MODE_LENIENT, /** * Strict mode should be used if you want to require that the input is well-formed. More * specifically, it differs from lenient mode in the following ways: * *

    *
  • Grouping widths must match the grouping settings. For example, "12,3,45" will fail if the * grouping width is 3, as in the pattern "#,##0". *
  • The string must contain a complete prefix and suffix. For example, if the pattern is * "{#};(#)", then "{123}" or "(123)" would match, but "{123", "123}", and "123" would all fail. * (The latter strings would be accepted in lenient mode.) *
  • Whitespace may not appear at arbitrary places in the string. In lenient mode, whitespace * is allowed to occur arbitrarily before and after prefixes and exponent separators. *
  • Leading grouping separators are not allowed, as in ",123". *
  • Minus and plus signs can only appear if specified in the pattern. In lenient mode, a plus * or minus sign can always precede a number. *
  • The set of characters that can be interpreted as a decimal or grouping separator is * smaller. *
  • If currency parsing is enabled, currencies must only appear where * specified in either the current pattern string or in a valid pattern string for the current * locale. For example, if the pattern is "¤0.00", then "$1.23" would match, but "1.23$" would * fail to match. *
*/ PARSE_MODE_STRICT, }; // Exported as U_I18N_API because it is needed for the unit test PatternStringTest struct U_I18N_API DecimalFormatProperties : public UMemory { public: NullableValue compactStyle; NullableValue currency; CurrencyPluralInfoWrapper currencyPluralInfo; NullableValue currencyUsage; bool decimalPatternMatchRequired; bool decimalSeparatorAlwaysShown; bool exponentSignAlwaysShown; bool currencyAsDecimal; bool formatFailIfMoreThanMaxDigits; // ICU4C-only int32_t formatWidth; int32_t groupingSize; bool groupingUsed; int32_t magnitudeMultiplier; // internal field like multiplierScale but separate to avoid conflict int32_t maximumFractionDigits; int32_t maximumIntegerDigits; int32_t maximumSignificantDigits; int32_t minimumExponentDigits; int32_t minimumFractionDigits; int32_t minimumGroupingDigits; int32_t minimumIntegerDigits; int32_t minimumSignificantDigits; int32_t multiplier; int32_t multiplierScale; // ICU4C-only UnicodeString negativePrefix; UnicodeString negativePrefixPattern; UnicodeString negativeSuffix; UnicodeString negativeSuffixPattern; NullableValue padPosition; UnicodeString padString; bool parseCaseSensitive; bool parseIntegerOnly; NullableValue parseMode; bool parseNoExponent; bool parseToBigDecimal; // TODO: Not needed in ICU4C? UNumberFormatAttributeValue parseAllInput; // ICU4C-only //PluralRules pluralRules; UnicodeString positivePrefix; UnicodeString positivePrefixPattern; UnicodeString positiveSuffix; UnicodeString positiveSuffixPattern; double roundingIncrement; NullableValue roundingMode; int32_t secondaryGroupingSize; bool signAlwaysShown; DecimalFormatProperties(); inline bool operator==(const DecimalFormatProperties& other) const { return _equals(other, false); } void clear(); /** * Checks for equality to the default DecimalFormatProperties, but ignores the prescribed set of * options for fast-path formatting. */ bool equalsDefaultExceptFastFormat() const; /** * Returns the default DecimalFormatProperties instance. 
*/ static const DecimalFormatProperties& getDefault(); private: bool _equals(const DecimalFormatProperties& other, bool ignoreForFastFormat) const; }; } // namespace impl } // namespace number U_NAMESPACE_END #endif //__NUMBER_DECIMFMTPROPS_H__ #endif /* #if !UCONFIG_NO_FORMATTING */ stringi/src/icu74/i18n/csr2022.h0000644000176200001440000000476714700200761015513 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ********************************************************************** * Copyright (C) 2005-2015, International Business Machines * Corporation and others. All Rights Reserved. ********************************************************************** */ #ifndef __CSR2022_H #define __CSR2022_H #include "unicode/utypes.h" #if !UCONFIG_NO_CONVERSION #include "csrecog.h" U_NAMESPACE_BEGIN class CharsetMatch; /** * class CharsetRecog_2022 part of the ICU charset detection implementation. * This is a superclass for the individual detectors for * each of the detectable members of the ISO 2022 family * of encodings. * * The separate classes are nested within this class. * * @internal */ class CharsetRecog_2022 : public CharsetRecognizer { public: virtual ~CharsetRecog_2022() = 0; protected: /** * Matching function shared among the 2022 detectors JP, CN and KR * Counts up the number of legal an unrecognized escape sequences in * the sample of text, and computes a score based on the total number & * the proportion that fit the encoding. * * * @param text the byte buffer containing text to analyse * @param textLen the size of the text in the byte. * @param escapeSequences the byte escape sequences to test for. * @return match quality, in the range of 0-100. 
*/ int32_t match_2022(const uint8_t *text, int32_t textLen, const uint8_t escapeSequences[][5], int32_t escapeSequences_length) const; }; class CharsetRecog_2022JP :public CharsetRecog_2022 { public: virtual ~CharsetRecog_2022JP(); const char *getName() const override; UBool match(InputText *textIn, CharsetMatch *results) const override; }; #if !UCONFIG_ONLY_HTML_CONVERSION class CharsetRecog_2022KR :public CharsetRecog_2022 { public: virtual ~CharsetRecog_2022KR(); const char *getName() const override; UBool match(InputText *textIn, CharsetMatch *results) const override; }; class CharsetRecog_2022CN :public CharsetRecog_2022 { public: virtual ~CharsetRecog_2022CN(); const char* getName() const override; UBool match(InputText *textIn, CharsetMatch *results) const override; }; #endif U_NAMESPACE_END #endif #endif /* __CSR2022_H */ stringi/src/icu74/i18n/plurfmt.cpp0000644000176200001440000005043314700200761016431 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ******************************************************************************* * Copyright (C) 2009-2015, International Business Machines Corporation and * others. All Rights Reserved. 
******************************************************************************* * * File PLURFMT.CPP ******************************************************************************* */ #include "unicode/decimfmt.h" #include "unicode/messagepattern.h" #include "unicode/plurfmt.h" #include "unicode/plurrule.h" #include "unicode/utypes.h" #include "cmemory.h" #include "messageimpl.h" #include "nfrule.h" #include "plurrule_impl.h" #include "uassert.h" #include "uhash.h" #include "number_decimalquantity.h" #include "number_utils.h" #include "number_utypes.h" #if !UCONFIG_NO_FORMATTING U_NAMESPACE_BEGIN using number::impl::DecimalQuantity; static const char16_t OTHER_STRING[] = { 0x6F, 0x74, 0x68, 0x65, 0x72, 0 // "other" }; UOBJECT_DEFINE_RTTI_IMPLEMENTATION(PluralFormat) PluralFormat::PluralFormat(UErrorCode& status) : locale(Locale::getDefault()), msgPattern(status), numberFormat(nullptr), offset(0) { init(nullptr, UPLURAL_TYPE_CARDINAL, status); } PluralFormat::PluralFormat(const Locale& loc, UErrorCode& status) : locale(loc), msgPattern(status), numberFormat(nullptr), offset(0) { init(nullptr, UPLURAL_TYPE_CARDINAL, status); } PluralFormat::PluralFormat(const PluralRules& rules, UErrorCode& status) : locale(Locale::getDefault()), msgPattern(status), numberFormat(nullptr), offset(0) { init(&rules, UPLURAL_TYPE_COUNT, status); } PluralFormat::PluralFormat(const Locale& loc, const PluralRules& rules, UErrorCode& status) : locale(loc), msgPattern(status), numberFormat(nullptr), offset(0) { init(&rules, UPLURAL_TYPE_COUNT, status); } PluralFormat::PluralFormat(const Locale& loc, UPluralType type, UErrorCode& status) : locale(loc), msgPattern(status), numberFormat(nullptr), offset(0) { init(nullptr, type, status); } PluralFormat::PluralFormat(const UnicodeString& pat, UErrorCode& status) : locale(Locale::getDefault()), msgPattern(status), numberFormat(nullptr), offset(0) { init(nullptr, UPLURAL_TYPE_CARDINAL, status); applyPattern(pat, status); } 
PluralFormat::PluralFormat(const Locale& loc, const UnicodeString& pat, UErrorCode& status) : locale(loc), msgPattern(status), numberFormat(nullptr), offset(0) { init(nullptr, UPLURAL_TYPE_CARDINAL, status); applyPattern(pat, status); } PluralFormat::PluralFormat(const PluralRules& rules, const UnicodeString& pat, UErrorCode& status) : locale(Locale::getDefault()), msgPattern(status), numberFormat(nullptr), offset(0) { init(&rules, UPLURAL_TYPE_COUNT, status); applyPattern(pat, status); } PluralFormat::PluralFormat(const Locale& loc, const PluralRules& rules, const UnicodeString& pat, UErrorCode& status) : locale(loc), msgPattern(status), numberFormat(nullptr), offset(0) { init(&rules, UPLURAL_TYPE_COUNT, status); applyPattern(pat, status); } PluralFormat::PluralFormat(const Locale& loc, UPluralType type, const UnicodeString& pat, UErrorCode& status) : locale(loc), msgPattern(status), numberFormat(nullptr), offset(0) { init(nullptr, type, status); applyPattern(pat, status); } PluralFormat::PluralFormat(const PluralFormat& other) : Format(other), locale(other.locale), msgPattern(other.msgPattern), numberFormat(nullptr), offset(other.offset) { copyObjects(other); } void PluralFormat::copyObjects(const PluralFormat& other) { UErrorCode status = U_ZERO_ERROR; if (numberFormat != nullptr) { delete numberFormat; } if (pluralRulesWrapper.pluralRules != nullptr) { delete pluralRulesWrapper.pluralRules; } if (other.numberFormat == nullptr) { numberFormat = NumberFormat::createInstance(locale, status); } else { numberFormat = other.numberFormat->clone(); } if (other.pluralRulesWrapper.pluralRules == nullptr) { pluralRulesWrapper.pluralRules = PluralRules::forLocale(locale, status); } else { pluralRulesWrapper.pluralRules = other.pluralRulesWrapper.pluralRules->clone(); } } PluralFormat::~PluralFormat() { delete numberFormat; } void PluralFormat::init(const PluralRules* rules, UPluralType type, UErrorCode& status) { if (U_FAILURE(status)) { return; } if (rules==nullptr) { 
pluralRulesWrapper.pluralRules = PluralRules::forLocale(locale, type, status); } else { pluralRulesWrapper.pluralRules = rules->clone(); if (pluralRulesWrapper.pluralRules == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; return; } } numberFormat= NumberFormat::createInstance(locale, status); } void PluralFormat::applyPattern(const UnicodeString& newPattern, UErrorCode& status) { msgPattern.parsePluralStyle(newPattern, nullptr, status); if (U_FAILURE(status)) { msgPattern.clear(); offset = 0; return; } offset = msgPattern.getPluralOffset(0); } UnicodeString& PluralFormat::format(const Formattable& obj, UnicodeString& appendTo, FieldPosition& pos, UErrorCode& status) const { if (U_FAILURE(status)) return appendTo; if (obj.isNumeric()) { return format(obj, obj.getDouble(), appendTo, pos, status); } else { status = U_ILLEGAL_ARGUMENT_ERROR; return appendTo; } } UnicodeString PluralFormat::format(int32_t number, UErrorCode& status) const { FieldPosition fpos(FieldPosition::DONT_CARE); UnicodeString result; return format(Formattable(number), number, result, fpos, status); } UnicodeString PluralFormat::format(double number, UErrorCode& status) const { FieldPosition fpos(FieldPosition::DONT_CARE); UnicodeString result; return format(Formattable(number), number, result, fpos, status); } UnicodeString& PluralFormat::format(int32_t number, UnicodeString& appendTo, FieldPosition& pos, UErrorCode& status) const { return format(Formattable(number), (double)number, appendTo, pos, status); } UnicodeString& PluralFormat::format(double number, UnicodeString& appendTo, FieldPosition& pos, UErrorCode& status) const { return format(Formattable(number), (double)number, appendTo, pos, status); } UnicodeString& PluralFormat::format(const Formattable& numberObject, double number, UnicodeString& appendTo, FieldPosition& pos, UErrorCode& status) const { if (U_FAILURE(status)) { return appendTo; } if (msgPattern.countParts() == 0) { return numberFormat->format(numberObject, appendTo, pos, 
status); } // Get the appropriate sub-message. // Select it based on the formatted number-offset. double numberMinusOffset = number - offset; // Call NumberFormatter to get both the DecimalQuantity and the string. // This call site needs to use more internal APIs than the Java equivalent. number::impl::UFormattedNumberData data; if (offset == 0) { // could be BigDecimal etc. numberObject.populateDecimalQuantity(data.quantity, status); } else { data.quantity.setToDouble(numberMinusOffset); } UnicodeString numberString; auto *decFmt = dynamic_cast(numberFormat); if(decFmt != nullptr) { const number::LocalizedNumberFormatter* lnf = decFmt->toNumberFormatter(status); if (U_FAILURE(status)) { return appendTo; } lnf->formatImpl(&data, status); // mutates &data if (U_FAILURE(status)) { return appendTo; } numberString = data.getStringRef().toUnicodeString(); } else { if (offset == 0) { numberFormat->format(numberObject, numberString, status); } else { numberFormat->format(numberMinusOffset, numberString, status); } } int32_t partIndex = findSubMessage(msgPattern, 0, pluralRulesWrapper, &data.quantity, number, status); if (U_FAILURE(status)) { return appendTo; } // Replace syntactic # signs in the top level of this sub-message // (not in nested arguments) with the formatted number-offset. 
const UnicodeString& pattern = msgPattern.getPatternString(); int32_t prevIndex = msgPattern.getPart(partIndex).getLimit(); for (;;) { const MessagePattern::Part& part = msgPattern.getPart(++partIndex); const UMessagePatternPartType type = part.getType(); int32_t index = part.getIndex(); if (type == UMSGPAT_PART_TYPE_MSG_LIMIT) { return appendTo.append(pattern, prevIndex, index - prevIndex); } else if ((type == UMSGPAT_PART_TYPE_REPLACE_NUMBER) || (type == UMSGPAT_PART_TYPE_SKIP_SYNTAX && MessageImpl::jdkAposMode(msgPattern))) { appendTo.append(pattern, prevIndex, index - prevIndex); if (type == UMSGPAT_PART_TYPE_REPLACE_NUMBER) { appendTo.append(numberString); } prevIndex = part.getLimit(); } else if (type == UMSGPAT_PART_TYPE_ARG_START) { appendTo.append(pattern, prevIndex, index - prevIndex); prevIndex = index; partIndex = msgPattern.getLimitPartIndex(partIndex); index = msgPattern.getPart(partIndex).getLimit(); MessageImpl::appendReducedApostrophes(pattern, prevIndex, index, appendTo); prevIndex = index; } } } UnicodeString& PluralFormat::toPattern(UnicodeString& appendTo) { if (0 == msgPattern.countParts()) { appendTo.setToBogus(); } else { appendTo.append(msgPattern.getPatternString()); } return appendTo; } void PluralFormat::setLocale(const Locale& loc, UErrorCode& status) { if (U_FAILURE(status)) { return; } locale = loc; msgPattern.clear(); delete numberFormat; offset = 0; numberFormat = nullptr; pluralRulesWrapper.reset(); init(nullptr, UPLURAL_TYPE_CARDINAL, status); } void PluralFormat::setNumberFormat(const NumberFormat* format, UErrorCode& status) { if (U_FAILURE(status)) { return; } NumberFormat* nf = format->clone(); if (nf != nullptr) { delete numberFormat; numberFormat = nf; } else { status = U_MEMORY_ALLOCATION_ERROR; } } PluralFormat* PluralFormat::clone() const { return new PluralFormat(*this); } PluralFormat& PluralFormat::operator=(const PluralFormat& other) { if (this != &other) { locale = other.locale; msgPattern = other.msgPattern; offset 
= other.offset; copyObjects(other); } return *this; } bool PluralFormat::operator==(const Format& other) const { if (this == &other) { return true; } if (!Format::operator==(other)) { return false; } const PluralFormat& o = (const PluralFormat&)other; return locale == o.locale && msgPattern == o.msgPattern && // implies same offset (numberFormat == nullptr) == (o.numberFormat == nullptr) && (numberFormat == nullptr || *numberFormat == *o.numberFormat) && (pluralRulesWrapper.pluralRules == nullptr) == (o.pluralRulesWrapper.pluralRules == nullptr) && (pluralRulesWrapper.pluralRules == nullptr || *pluralRulesWrapper.pluralRules == *o.pluralRulesWrapper.pluralRules); } bool PluralFormat::operator!=(const Format& other) const { return !operator==(other); } void PluralFormat::parseObject(const UnicodeString& /*source*/, Formattable& /*result*/, ParsePosition& pos) const { // Parsing not supported. pos.setErrorIndex(pos.getIndex()); } int32_t PluralFormat::findSubMessage(const MessagePattern& pattern, int32_t partIndex, const PluralSelector& selector, void *context, double number, UErrorCode& ec) { if (U_FAILURE(ec)) { return 0; } int32_t count=pattern.countParts(); double offset; const MessagePattern::Part* part=&pattern.getPart(partIndex); if (MessagePattern::Part::hasNumericValue(part->getType())) { offset=pattern.getNumericValue(*part); ++partIndex; } else { offset=0; } // The keyword is empty until we need to match against a non-explicit, not-"other" value. // Then we get the keyword from the selector. // (In other words, we never call the selector if we match against an explicit value, // or if the only non-explicit keyword is "other".) UnicodeString keyword; UnicodeString other(false, OTHER_STRING, 5); // When we find a match, we set msgStart>0 and also set this boolean to true // to avoid matching the keyword again (duplicates are allowed) // while we continue to look for an explicit-value match. 
UBool haveKeywordMatch=false; // msgStart is 0 until we find any appropriate sub-message. // We remember the first "other" sub-message if we have not seen any // appropriate sub-message before. // We remember the first matching-keyword sub-message if we have not seen // one of those before. // (The parser allows [does not check for] duplicate keywords. // We just have to make sure to take the first one.) // We avoid matching the keyword twice by also setting haveKeywordMatch=true // at the first keyword match. // We keep going until we find an explicit-value match or reach the end of the plural style. int32_t msgStart=0; // Iterate over (ARG_SELECTOR [ARG_INT|ARG_DOUBLE] message) tuples // until ARG_LIMIT or end of plural-only pattern. do { part=&pattern.getPart(partIndex++); const UMessagePatternPartType type = part->getType(); if(type==UMSGPAT_PART_TYPE_ARG_LIMIT) { break; } U_ASSERT (type==UMSGPAT_PART_TYPE_ARG_SELECTOR); // part is an ARG_SELECTOR followed by an optional explicit value, and then a message if(MessagePattern::Part::hasNumericValue(pattern.getPartType(partIndex))) { // explicit value like "=2" part=&pattern.getPart(partIndex++); if(number==pattern.getNumericValue(*part)) { // matches explicit value return partIndex; } } else if(!haveKeywordMatch) { // plural keyword like "few" or "other" // Compare "other" first and call the selector if this is not "other". if(pattern.partSubstringMatches(*part, other)) { if(msgStart==0) { msgStart=partIndex; if(0 == keyword.compare(other)) { // This is the first "other" sub-message, // and the selected keyword is also "other". // Do not match "other" again. haveKeywordMatch=true; } } } else { if(keyword.isEmpty()) { keyword=selector.select(context, number-offset, ec); if(msgStart!=0 && (0 == keyword.compare(other))) { // We have already seen an "other" sub-message. // Do not match "other" again. haveKeywordMatch=true; // Skip keyword matching but do getLimitPartIndex(). 
} } if(!haveKeywordMatch && pattern.partSubstringMatches(*part, keyword)) { // keyword matches msgStart=partIndex; // Do not match this keyword again. haveKeywordMatch=true; } } } partIndex=pattern.getLimitPartIndex(partIndex); } while(++partIndexgetType() != UMSGPAT_PART_TYPE_ARG_SELECTOR) { // Bad format continue; } const MessagePattern::Part* partStart = &msgPattern.getPart(partIndex++); if (partStart->getType() != UMSGPAT_PART_TYPE_MSG_START) { // Bad format continue; } const MessagePattern::Part* partLimit = &msgPattern.getPart(partIndex++); if (partLimit->getType() != UMSGPAT_PART_TYPE_MSG_LIMIT) { // Bad format continue; } UnicodeString currArg = pattern.tempSubString(partStart->getLimit(), partLimit->getIndex() - partStart->getLimit()); if (rbnfLenientScanner != nullptr) { // Check if non-lenient rule finds the text before call lenient parsing int32_t tempIndex = source.indexOf(currArg, startingAt); if (tempIndex >= 0) { currMatchIndex = tempIndex; } else { // If lenient parsing is turned ON, we've got some time consuming parsing ahead of us. int32_t length = -1; currMatchIndex = rbnfLenientScanner->findTextLenient(source, currArg, startingAt, &length); } } else { currMatchIndex = source.indexOf(currArg, startingAt); } if (currMatchIndex >= 0 && currMatchIndex >= matchedIndex && currArg.length() > matchedWord.length()) { matchedIndex = currMatchIndex; matchedWord = currArg; keyword = pattern.tempSubString(partStart->getLimit(), partLimit->getIndex() - partStart->getLimit()); } } if (matchedIndex >= 0) { pos.setBeginIndex(matchedIndex); pos.setEndIndex(matchedIndex + matchedWord.length()); result.setString(keyword); return; } // Not found! 
pos.setBeginIndex(-1); pos.setEndIndex(-1); } PluralFormat::PluralSelector::~PluralSelector() {} PluralFormat::PluralSelectorAdapter::~PluralSelectorAdapter() { delete pluralRules; } UnicodeString PluralFormat::PluralSelectorAdapter::select(void *context, double number, UErrorCode& /*ec*/) const { (void)number; // unused except in the assertion IFixedDecimal *dec=static_cast(context); return pluralRules->select(*dec); } void PluralFormat::PluralSelectorAdapter::reset() { delete pluralRules; pluralRules = nullptr; } U_NAMESPACE_END #endif /* #if !UCONFIG_NO_FORMATTING */ //eof stringi/src/icu74/i18n/numparse_currency.cpp0000644000176200001440000001604614700200761020506 0ustar liggesusers// © 2018 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html #include "unicode/utypes.h" #if !UCONFIG_NO_FORMATTING // Allow implicit conversion from char16_t* to UnicodeString for this file: // Helpful in toString methods and elsewhere. #define UNISTR_FROM_STRING_EXPLICIT #include "numparse_types.h" #include "numparse_currency.h" #include "ucurrimp.h" #include "unicode/errorcode.h" #include "numparse_utils.h" #include "string_segment.h" using namespace icu; using namespace icu::numparse; using namespace icu::numparse::impl; CombinedCurrencyMatcher::CombinedCurrencyMatcher(const CurrencySymbols& currencySymbols, const DecimalFormatSymbols& dfs, parse_flags_t parseFlags, UErrorCode& status) : fCurrency1(currencySymbols.getCurrencySymbol(status)), fCurrency2(currencySymbols.getIntlCurrencySymbol(status)), fUseFullCurrencyData(0 == (parseFlags & PARSE_FLAG_NO_FOREIGN_CURRENCY)), afterPrefixInsert(dfs.getPatternForCurrencySpacing(UNUM_CURRENCY_INSERT, false, status)), beforeSuffixInsert(dfs.getPatternForCurrencySpacing(UNUM_CURRENCY_INSERT, true, status)), fLocaleName(dfs.getLocale().getName(), -1, status) { utils::copyCurrencyCode(fCurrencyCode, currencySymbols.getIsoCode()); // Pre-load the long names for the current locale and 
currency // if we are parsing without the full currency data. if (!fUseFullCurrencyData) { for (int32_t i=0; i(i); fLocalLongNames[i] = currencySymbols.getPluralName(plural, status); } } // TODO: Figure out how to make this faster and re-enable. // Computing the "lead code points" set for fastpathing is too slow to use in production. // See https://unicode-org.atlassian.net/browse/ICU-13584 // // Compute the full set of characters that could be the first in a currency to allow for // // efficient smoke test. // fLeadCodePoints.add(fCurrency1.char32At(0)); // fLeadCodePoints.add(fCurrency2.char32At(0)); // fLeadCodePoints.add(beforeSuffixInsert.char32At(0)); // uprv_currencyLeads(fLocaleName.data(), fLeadCodePoints, status); // // Always apply case mapping closure for currencies // fLeadCodePoints.closeOver(USET_ADD_CASE_MAPPINGS); // fLeadCodePoints.freeze(); } bool CombinedCurrencyMatcher::match(StringSegment& segment, ParsedNumber& result, UErrorCode& status) const { if (result.currencyCode[0] != 0) { return false; } // Try to match a currency spacing separator. int32_t initialOffset = segment.getOffset(); bool maybeMore = false; if (result.seenNumber() && !beforeSuffixInsert.isEmpty()) { int32_t overlap = segment.getCommonPrefixLength(beforeSuffixInsert); if (overlap == beforeSuffixInsert.length()) { segment.adjustOffset(overlap); // Note: let currency spacing be a weak match. Don't update chars consumed. } maybeMore = maybeMore || overlap == segment.length(); } // Match the currency string, and reset if we didn't find one. maybeMore = maybeMore || matchCurrency(segment, result, status); if (result.currencyCode[0] == 0) { segment.setOffset(initialOffset); return maybeMore; } // Try to match a currency spacing separator. 
if (!result.seenNumber() && !afterPrefixInsert.isEmpty()) { int32_t overlap = segment.getCommonPrefixLength(afterPrefixInsert); if (overlap == afterPrefixInsert.length()) { segment.adjustOffset(overlap); // Note: let currency spacing be a weak match. Don't update chars consumed. } maybeMore = maybeMore || overlap == segment.length(); } return maybeMore; } bool CombinedCurrencyMatcher::matchCurrency(StringSegment& segment, ParsedNumber& result, UErrorCode& status) const { bool maybeMore = false; int32_t overlap1; if (!fCurrency1.isEmpty()) { overlap1 = segment.getCaseSensitivePrefixLength(fCurrency1); } else { overlap1 = -1; } maybeMore = maybeMore || overlap1 == segment.length(); if (overlap1 == fCurrency1.length()) { utils::copyCurrencyCode(result.currencyCode, fCurrencyCode); segment.adjustOffset(overlap1); result.setCharsConsumed(segment); return maybeMore; } int32_t overlap2; if (!fCurrency2.isEmpty()) { // ISO codes should be accepted case-insensitive. // https://unicode-org.atlassian.net/browse/ICU-13696 overlap2 = segment.getCommonPrefixLength(fCurrency2); } else { overlap2 = -1; } maybeMore = maybeMore || overlap2 == segment.length(); if (overlap2 == fCurrency2.length()) { utils::copyCurrencyCode(result.currencyCode, fCurrencyCode); segment.adjustOffset(overlap2); result.setCharsConsumed(segment); return maybeMore; } if (fUseFullCurrencyData) { // Use the full currency data. // NOTE: This call site should be improved with #13584. const UnicodeString segmentString = segment.toTempUnicodeString(); // Try to parse the currency ParsePosition ppos(0); int32_t partialMatchLen = 0; uprv_parseCurrency( fLocaleName.data(), segmentString, ppos, UCURR_SYMBOL_NAME, // checks for both UCURR_SYMBOL_NAME and UCURR_LONG_NAME &partialMatchLen, result.currencyCode, status); maybeMore = maybeMore || partialMatchLen == segment.length(); if (U_SUCCESS(status) && ppos.getIndex() != 0) { // Complete match. // NOTE: The currency code should already be saved in the ParsedNumber. 
segment.adjustOffset(ppos.getIndex()); result.setCharsConsumed(segment); return maybeMore; } } else { // Use the locale long names. int32_t longestFullMatch = 0; for (int32_t i=0; i longestFullMatch) { longestFullMatch = name.length(); } maybeMore = maybeMore || overlap > 0; } if (longestFullMatch > 0) { utils::copyCurrencyCode(result.currencyCode, fCurrencyCode); segment.adjustOffset(longestFullMatch); result.setCharsConsumed(segment); return maybeMore; } } // No match found. return maybeMore; } bool CombinedCurrencyMatcher::smokeTest(const StringSegment&) const { // TODO: See constructor return true; //return segment.startsWith(fLeadCodePoints); } UnicodeString CombinedCurrencyMatcher::toString() const { return u""; } #endif /* #if !UCONFIG_NO_FORMATTING */ stringi/src/icu74/i18n/brktrans.h0000644000176200001440000000576114700200761016237 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ********************************************************************** * Copyright (C) 2008-2015, International Business Machines * Corporation and others. All Rights Reserved. ********************************************************************** * Date Name Description * 05/11/2008 Andy Heninger Ported from Java ********************************************************************** */ #ifndef BRKTRANS_H #define BRKTRANS_H #include "unicode/utypes.h" #if !UCONFIG_NO_TRANSLITERATION && !UCONFIG_NO_BREAK_ITERATION #include "unicode/translit.h" #include "unicode/localpointer.h" U_NAMESPACE_BEGIN class UVector32; /** * A transliterator that pInserts the specified characters at word breaks. * To restrict it to particular characters, use a filter. * TODO: this is an internal class, and only temporary. * Remove it once we have \b notation in Transliterator. */ class BreakTransliterator : public Transliterator { public: /** * Constructs a transliterator. 
* @param adoptedFilter the filter for this transliterator. */ BreakTransliterator(UnicodeFilter* adoptedFilter = 0); /** * Destructor. */ virtual ~BreakTransliterator(); /** * Copy constructor. */ BreakTransliterator(const BreakTransliterator&); /** * Transliterator API. * @return A copy of the object. */ virtual BreakTransliterator* clone() const override; virtual const UnicodeString &getInsertion() const; virtual void setInsertion(const UnicodeString &insertion); /** * ICU "poor man's RTTI", returns a UClassID for the actual class. */ virtual UClassID getDynamicClassID() const override; /** * ICU "poor man's RTTI", returns a UClassID for this class. */ U_I18N_API static UClassID U_EXPORT2 getStaticClassID(); protected: /** * Implements {@link Transliterator#handleTransliterate}. * @param text the buffer holding transliterated and * untransliterated text * @param offset the start and limit of the text, the position * of the cursor, and the start and limit of transliteration. * @param incremental if true, assume more text may be coming after * pos.contextLimit. Otherwise, assume the text is complete. */ virtual void handleTransliterate(Replaceable& text, UTransPosition& offset, UBool isIncremental) const override; private: LocalPointer cachedBI; LocalPointer cachedBoundaries; UnicodeString fInsertion; static UnicodeString replaceableAsString(Replaceable &r); /** * Assignment operator. */ BreakTransliterator& operator=(const BreakTransliterator&); }; U_NAMESPACE_END #endif /* #if !UCONFIG_NO_TRANSLITERATION */ #endif stringi/src/icu74/i18n/dangical.h0000644000176200001440000000713314700200761016146 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ***************************************************************************** * Copyright (C) 2013, International Business Machines Corporation * and others. All Rights Reserved. 
***************************************************************************** * * File DANGICAL.H ***************************************************************************** */ #ifndef DANGICAL_H #define DANGICAL_H #include "unicode/utypes.h" #if !UCONFIG_NO_FORMATTING #include "unicode/calendar.h" #include "unicode/timezone.h" #include "chnsecal.h" U_NAMESPACE_BEGIN /** *

 * DangiCalendar is a concrete subclass of {@link Calendar}
 * that implements a traditional Korean lunisolar calendar.

* *

 * DangiCalendar usually should be instantiated using
 * {@link com.ibm.icu.util.Calendar#getInstance(ULocale)} passing in a ULocale
 * with the tag "@calendar=dangi".

* * @internal */ class DangiCalendar : public ChineseCalendar { public: //------------------------------------------------------------------------- // Constructors... //------------------------------------------------------------------------- /** * Constructs a DangiCalendar based on the current time in the default time zone * with the given locale. * * @param aLocale The given locale. * @param success Indicates the status of DangiCalendar object construction. * Returns U_ZERO_ERROR if constructed successfully. * @internal */ DangiCalendar(const Locale& aLocale, UErrorCode &success); /** * Copy Constructor * @internal */ DangiCalendar(const DangiCalendar& other); /** * Destructor. * @internal */ virtual ~DangiCalendar(); /** * Clone. * @internal */ virtual DangiCalendar* clone() const override; //---------------------------------------------------------------------- // Internal methods & astronomical calculations //---------------------------------------------------------------------- /** * @return The related Gregorian year; will be obtained by modifying the value * obtained by get from UCAL_EXTENDED_YEAR field * @internal */ virtual int32_t getRelatedYear(UErrorCode &status) const override; /** * @param year The related Gregorian year to set; will be modified as necessary then * set in UCAL_EXTENDED_YEAR field * @internal */ virtual void setRelatedYear(int32_t year) override; private: const TimeZone* getDangiCalZoneAstroCalc(UErrorCode &status) const; // UObject stuff public: /** * @return The class ID for this object. All objects of a given class have the * same class ID. Objects of other classes have different class IDs. * @internal */ virtual UClassID getDynamicClassID() const override; /** * Return the class ID for this class. This is useful only for comparing to a return * value from getDynamicClassID(). For example: * * Base* polymorphic_pointer = createPolymorphicObject(); * if (polymorphic_pointer->getDynamicClassID() == * Derived::getStaticClassID()) ... 
* * @return The class ID for all objects of this class. * @internal */ U_I18N_API static UClassID U_EXPORT2 getStaticClassID(); /** * return the calendar type, "dangi". * * @return calendar type * @internal */ const char * getType() const override; private: DangiCalendar(); // default constructor not implemented }; U_NAMESPACE_END #endif #endif stringi/src/icu74/i18n/uregex.cpp0000644000176200001440000020700514700200761016236 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ******************************************************************************* * Copyright (C) 2004-2015, International Business Machines * Corporation and others. All Rights Reserved. ******************************************************************************* * file name: uregex.cpp */ #include "unicode/utypes.h" #if !UCONFIG_NO_REGULAR_EXPRESSIONS #include "unicode/regex.h" #include "unicode/uregex.h" #include "unicode/unistr.h" #include "unicode/ustring.h" #include "unicode/uchar.h" #include "unicode/uobject.h" #include "unicode/utf16.h" #include "cmemory.h" #include "uassert.h" #include "uhash.h" #include "umutex.h" #include "uvectr32.h" #include "regextxt.h" U_NAMESPACE_BEGIN #define REMAINING_CAPACITY(idx,len) ((((len)-(idx))>0)?((len)-(idx)):0) struct RegularExpression: public UMemory { public: RegularExpression(); ~RegularExpression(); int32_t fMagic; RegexPattern *fPat; u_atomic_int32_t *fPatRefCount; char16_t *fPatString; int32_t fPatStringLen; RegexMatcher *fMatcher; const char16_t *fText; // Text from setText() int32_t fTextLength; // Length provided by user with setText(), which // may be -1. 
UBool fOwnsText; }; static const int32_t REXP_MAGIC = 0x72657870; // "rexp" in ASCII RegularExpression::RegularExpression() { fMagic = REXP_MAGIC; fPat = nullptr; fPatRefCount = nullptr; fPatString = nullptr; fPatStringLen = 0; fMatcher = nullptr; fText = nullptr; fTextLength = 0; fOwnsText = false; } RegularExpression::~RegularExpression() { delete fMatcher; fMatcher = nullptr; if (fPatRefCount!=nullptr && umtx_atomic_dec(fPatRefCount)==0) { delete fPat; uprv_free(fPatString); uprv_free((void *)fPatRefCount); } if (fOwnsText && fText!=nullptr) { uprv_free((void *)fText); } fMagic = 0; } U_NAMESPACE_END U_NAMESPACE_USE //---------------------------------------------------------------------------------------- // // validateRE Do boilerplate style checks on API function parameters. // Return true if they look OK. //---------------------------------------------------------------------------------------- static UBool validateRE(const RegularExpression *re, UBool requiresText, UErrorCode *status) { if (U_FAILURE(*status)) { return false; } if (re == nullptr || re->fMagic != REXP_MAGIC) { *status = U_ILLEGAL_ARGUMENT_ERROR; return false; } // !!! 
Not sure how to update this with the new UText backing, which is stored in re->fMatcher anyway if (requiresText && re->fText == nullptr && !re->fOwnsText) { *status = U_REGEX_INVALID_STATE; return false; } return true; } //---------------------------------------------------------------------------------------- // // uregex_open // //---------------------------------------------------------------------------------------- U_CAPI URegularExpression * U_EXPORT2 uregex_open( const char16_t *pattern, int32_t patternLength, uint32_t flags, UParseError *pe, UErrorCode *status) { if (U_FAILURE(*status)) { return nullptr; } if (pattern == nullptr || patternLength < -1 || patternLength == 0) { *status = U_ILLEGAL_ARGUMENT_ERROR; return nullptr; } int32_t actualPatLen = patternLength; if (actualPatLen == -1) { actualPatLen = u_strlen(pattern); } RegularExpression *re = new RegularExpression; u_atomic_int32_t *refC = (u_atomic_int32_t *)uprv_malloc(sizeof(int32_t)); char16_t *patBuf = (char16_t *)uprv_malloc(sizeof(char16_t)*(actualPatLen+1)); if (re == nullptr || refC == nullptr || patBuf == nullptr) { *status = U_MEMORY_ALLOCATION_ERROR; delete re; uprv_free((void *)refC); uprv_free(patBuf); return nullptr; } re->fPatRefCount = refC; *re->fPatRefCount = 1; // // Make a copy of the pattern string, so we can return it later if asked. // For compiling the pattern, we will use a UText wrapper around // this local copy, to avoid making even more copies. 
// re->fPatString = patBuf; re->fPatStringLen = patternLength; u_memcpy(patBuf, pattern, actualPatLen); patBuf[actualPatLen] = 0; UText patText = UTEXT_INITIALIZER; utext_openUChars(&patText, patBuf, patternLength, status); // // Compile the pattern // if (pe != nullptr) { re->fPat = RegexPattern::compile(&patText, flags, *pe, *status); } else { re->fPat = RegexPattern::compile(&patText, flags, *status); } utext_close(&patText); if (U_FAILURE(*status)) { goto ErrorExit; } // // Create the matcher object // re->fMatcher = re->fPat->matcher(*status); if (U_SUCCESS(*status)) { return (URegularExpression*)re; } ErrorExit: delete re; return nullptr; } //---------------------------------------------------------------------------------------- // // uregex_openUText // //---------------------------------------------------------------------------------------- U_CAPI URegularExpression * U_EXPORT2 uregex_openUText(UText *pattern, uint32_t flags, UParseError *pe, UErrorCode *status) { if (U_FAILURE(*status)) { return nullptr; } if (pattern == nullptr) { *status = U_ILLEGAL_ARGUMENT_ERROR; return nullptr; } int64_t patternNativeLength = utext_nativeLength(pattern); if (patternNativeLength == 0) { *status = U_ILLEGAL_ARGUMENT_ERROR; return nullptr; } RegularExpression *re = new RegularExpression; UErrorCode lengthStatus = U_ZERO_ERROR; int32_t pattern16Length = utext_extract(pattern, 0, patternNativeLength, nullptr, 0, &lengthStatus); u_atomic_int32_t *refC = (u_atomic_int32_t *)uprv_malloc(sizeof(int32_t)); char16_t *patBuf = (char16_t *)uprv_malloc(sizeof(char16_t)*(pattern16Length+1)); if (re == nullptr || refC == nullptr || patBuf == nullptr) { *status = U_MEMORY_ALLOCATION_ERROR; delete re; uprv_free((void *)refC); uprv_free(patBuf); return nullptr; } re->fPatRefCount = refC; *re->fPatRefCount = 1; // // Make a copy of the pattern string, so we can return it later if asked. 
// For compiling the pattern, we will use a read-only UText wrapper // around this local copy, to avoid making even more copies. // re->fPatString = patBuf; re->fPatStringLen = pattern16Length; utext_extract(pattern, 0, patternNativeLength, patBuf, pattern16Length+1, status); UText patText = UTEXT_INITIALIZER; utext_openUChars(&patText, patBuf, pattern16Length, status); // // Compile the pattern // if (pe != nullptr) { re->fPat = RegexPattern::compile(&patText, flags, *pe, *status); } else { re->fPat = RegexPattern::compile(&patText, flags, *status); } utext_close(&patText); if (U_FAILURE(*status)) { goto ErrorExit; } // // Create the matcher object // re->fMatcher = re->fPat->matcher(*status); if (U_SUCCESS(*status)) { return (URegularExpression*)re; } ErrorExit: delete re; return nullptr; } //---------------------------------------------------------------------------------------- // // uregex_close // //---------------------------------------------------------------------------------------- U_CAPI void U_EXPORT2 uregex_close(URegularExpression *re2) { RegularExpression *re = (RegularExpression*)re2; UErrorCode status = U_ZERO_ERROR; if (validateRE(re, false, &status) == false) { return; } delete re; } //---------------------------------------------------------------------------------------- // // uregex_clone // //---------------------------------------------------------------------------------------- U_CAPI URegularExpression * U_EXPORT2 uregex_clone(const URegularExpression *source2, UErrorCode *status) { RegularExpression *source = (RegularExpression*)source2; if (validateRE(source, false, status) == false) { return nullptr; } RegularExpression *clone = new RegularExpression; if (clone == nullptr) { *status = U_MEMORY_ALLOCATION_ERROR; return nullptr; } clone->fMatcher = source->fPat->matcher(*status); if (U_FAILURE(*status)) { delete clone; return nullptr; } clone->fPat = source->fPat; clone->fPatRefCount = source->fPatRefCount; clone->fPatString = 
source->fPatString; clone->fPatStringLen = source->fPatStringLen; umtx_atomic_inc(source->fPatRefCount); // Note: fText is not cloned. return (URegularExpression*)clone; } //------------------------------------------------------------------------------ // // uregex_pattern // //------------------------------------------------------------------------------ U_CAPI const char16_t * U_EXPORT2 uregex_pattern(const URegularExpression *regexp2, int32_t *patLength, UErrorCode *status) { RegularExpression *regexp = (RegularExpression*)regexp2; if (validateRE(regexp, false, status) == false) { return nullptr; } if (patLength != nullptr) { *patLength = regexp->fPatStringLen; } return regexp->fPatString; } //------------------------------------------------------------------------------ // // uregex_patternUText // //------------------------------------------------------------------------------ U_CAPI UText * U_EXPORT2 uregex_patternUText(const URegularExpression *regexp2, UErrorCode *status) { RegularExpression *regexp = (RegularExpression*)regexp2; return regexp->fPat->patternText(*status); } //------------------------------------------------------------------------------ // // uregex_flags // //------------------------------------------------------------------------------ U_CAPI int32_t U_EXPORT2 uregex_flags(const URegularExpression *regexp2, UErrorCode *status) { RegularExpression *regexp = (RegularExpression*)regexp2; if (validateRE(regexp, false, status) == false) { return 0; } int32_t flags = regexp->fPat->flags(); return flags; } //------------------------------------------------------------------------------ // // uregex_setText // //------------------------------------------------------------------------------ U_CAPI void U_EXPORT2 uregex_setText(URegularExpression *regexp2, const char16_t *text, int32_t textLength, UErrorCode *status) { RegularExpression *regexp = (RegularExpression*)regexp2; if (validateRE(regexp, false, status) == false) { return; } if (text == 
nullptr || textLength < -1) { *status = U_ILLEGAL_ARGUMENT_ERROR; return; } if (regexp->fOwnsText && regexp->fText != nullptr) { uprv_free((void *)regexp->fText); } regexp->fText = text; regexp->fTextLength = textLength; regexp->fOwnsText = false; UText input = UTEXT_INITIALIZER; utext_openUChars(&input, text, textLength, status); regexp->fMatcher->reset(&input); utext_close(&input); // reset() made a shallow clone, so we don't need this copy } //------------------------------------------------------------------------------ // // uregex_setUText // //------------------------------------------------------------------------------ U_CAPI void U_EXPORT2 uregex_setUText(URegularExpression *regexp2, UText *text, UErrorCode *status) { RegularExpression *regexp = (RegularExpression*)regexp2; if (validateRE(regexp, false, status) == false) { return; } if (text == nullptr) { *status = U_ILLEGAL_ARGUMENT_ERROR; return; } if (regexp->fOwnsText && regexp->fText != nullptr) { uprv_free((void *)regexp->fText); } regexp->fText = nullptr; // only fill it in on request regexp->fTextLength = -1; regexp->fOwnsText = true; regexp->fMatcher->reset(text); } //------------------------------------------------------------------------------ // // uregex_getText // //------------------------------------------------------------------------------ U_CAPI const char16_t * U_EXPORT2 uregex_getText(URegularExpression *regexp2, int32_t *textLength, UErrorCode *status) { RegularExpression *regexp = (RegularExpression*)regexp2; if (validateRE(regexp, false, status) == false) { return nullptr; } if (regexp->fText == nullptr) { // need to fill in the text UText *inputText = regexp->fMatcher->inputText(); int64_t inputNativeLength = utext_nativeLength(inputText); if (UTEXT_FULL_TEXT_IN_CHUNK(inputText, inputNativeLength)) { regexp->fText = inputText->chunkContents; regexp->fTextLength = (int32_t)inputNativeLength; regexp->fOwnsText = false; // because the UText owns it } else { UErrorCode lengthStatus = 
U_ZERO_ERROR; regexp->fTextLength = utext_extract(inputText, 0, inputNativeLength, nullptr, 0, &lengthStatus); // buffer overflow error char16_t *inputChars = (char16_t *)uprv_malloc(sizeof(char16_t)*(regexp->fTextLength+1)); utext_extract(inputText, 0, inputNativeLength, inputChars, regexp->fTextLength+1, status); regexp->fText = inputChars; regexp->fOwnsText = true; // should already be set but just in case } } if (textLength != nullptr) { *textLength = regexp->fTextLength; } return regexp->fText; } //------------------------------------------------------------------------------ // // uregex_getUText // //------------------------------------------------------------------------------ U_CAPI UText * U_EXPORT2 uregex_getUText(URegularExpression *regexp2, UText *dest, UErrorCode *status) { RegularExpression *regexp = (RegularExpression*)regexp2; if (validateRE(regexp, false, status) == false) { return dest; } return regexp->fMatcher->getInput(dest, *status); } //------------------------------------------------------------------------------ // // uregex_refreshUText // //------------------------------------------------------------------------------ U_CAPI void U_EXPORT2 uregex_refreshUText(URegularExpression *regexp2, UText *text, UErrorCode *status) { RegularExpression *regexp = (RegularExpression*)regexp2; if (validateRE(regexp, false, status) == false) { return; } regexp->fMatcher->refreshInputText(text, *status); } //------------------------------------------------------------------------------ // // uregex_matches // //------------------------------------------------------------------------------ U_CAPI UBool U_EXPORT2 uregex_matches(URegularExpression *regexp2, int32_t startIndex, UErrorCode *status) { return uregex_matches64( regexp2, (int64_t)startIndex, status); } U_CAPI UBool U_EXPORT2 uregex_matches64(URegularExpression *regexp2, int64_t startIndex, UErrorCode *status) { RegularExpression *regexp = (RegularExpression*)regexp2; UBool result = false; if 
(validateRE(regexp, true, status) == false) { return result; } if (startIndex == -1) { result = regexp->fMatcher->matches(*status); } else { result = regexp->fMatcher->matches(startIndex, *status); } return result; } //------------------------------------------------------------------------------ // // uregex_lookingAt // //------------------------------------------------------------------------------ U_CAPI UBool U_EXPORT2 uregex_lookingAt(URegularExpression *regexp2, int32_t startIndex, UErrorCode *status) { return uregex_lookingAt64( regexp2, (int64_t)startIndex, status); } U_CAPI UBool U_EXPORT2 uregex_lookingAt64(URegularExpression *regexp2, int64_t startIndex, UErrorCode *status) { RegularExpression *regexp = (RegularExpression*)regexp2; UBool result = false; if (validateRE(regexp, true, status) == false) { return result; } if (startIndex == -1) { result = regexp->fMatcher->lookingAt(*status); } else { result = regexp->fMatcher->lookingAt(startIndex, *status); } return result; } //------------------------------------------------------------------------------ // // uregex_find // //------------------------------------------------------------------------------ U_CAPI UBool U_EXPORT2 uregex_find(URegularExpression *regexp2, int32_t startIndex, UErrorCode *status) { return uregex_find64( regexp2, (int64_t)startIndex, status); } U_CAPI UBool U_EXPORT2 uregex_find64(URegularExpression *regexp2, int64_t startIndex, UErrorCode *status) { RegularExpression *regexp = (RegularExpression*)regexp2; UBool result = false; if (validateRE(regexp, true, status) == false) { return result; } if (startIndex == -1) { regexp->fMatcher->resetPreserveRegion(); result = regexp->fMatcher->find(*status); } else { result = regexp->fMatcher->find(startIndex, *status); } return result; } //------------------------------------------------------------------------------ // // uregex_findNext // //------------------------------------------------------------------------------ U_CAPI UBool 
U_EXPORT2 uregex_findNext(URegularExpression *regexp2, UErrorCode *status) { RegularExpression *regexp = (RegularExpression*)regexp2; if (validateRE(regexp, true, status) == false) { return false; } UBool result = regexp->fMatcher->find(*status); return result; } //------------------------------------------------------------------------------ // // uregex_groupCount // //------------------------------------------------------------------------------ U_CAPI int32_t U_EXPORT2 uregex_groupCount(URegularExpression *regexp2, UErrorCode *status) { RegularExpression *regexp = (RegularExpression*)regexp2; if (validateRE(regexp, false, status) == false) { return 0; } int32_t result = regexp->fMatcher->groupCount(); return result; } //------------------------------------------------------------------------------ // // uregex_groupNumberFromName // //------------------------------------------------------------------------------ int32_t uregex_groupNumberFromName(URegularExpression *regexp2, const char16_t *groupName, int32_t nameLength, UErrorCode *status) { RegularExpression *regexp = (RegularExpression*)regexp2; if (validateRE(regexp, false, status) == false) { return 0; } int32_t result = regexp->fPat->groupNumberFromName(UnicodeString(groupName, nameLength), *status); return result; } int32_t uregex_groupNumberFromCName(URegularExpression *regexp2, const char *groupName, int32_t nameLength, UErrorCode *status) { RegularExpression *regexp = (RegularExpression*)regexp2; if (validateRE(regexp, false, status) == false) { return 0; } return regexp->fPat->groupNumberFromName(groupName, nameLength, *status); } //------------------------------------------------------------------------------ // // uregex_group // //------------------------------------------------------------------------------ U_CAPI int32_t U_EXPORT2 uregex_group(URegularExpression *regexp2, int32_t groupNum, char16_t *dest, int32_t destCapacity, UErrorCode *status) { RegularExpression *regexp = 
(RegularExpression*)regexp2; if (validateRE(regexp, true, status) == false) { return 0; } if (destCapacity < 0 || (destCapacity > 0 && dest == nullptr)) { *status = U_ILLEGAL_ARGUMENT_ERROR; return 0; } if (destCapacity == 0 || regexp->fText != nullptr) { // If preflighting or if we already have the text as UChars, // this is a little cheaper than extracting from the UText // // Pick up the range of characters from the matcher // int32_t startIx = regexp->fMatcher->start(groupNum, *status); int32_t endIx = regexp->fMatcher->end (groupNum, *status); if (U_FAILURE(*status)) { return 0; } // // Trim length based on buffer capacity // int32_t fullLength = endIx - startIx; int32_t copyLength = fullLength; if (copyLength < destCapacity) { dest[copyLength] = 0; } else if (copyLength == destCapacity) { *status = U_STRING_NOT_TERMINATED_WARNING; } else { copyLength = destCapacity; *status = U_BUFFER_OVERFLOW_ERROR; } // // Copy capture group to user's buffer // if (copyLength > 0) { u_memcpy(dest, ®exp->fText[startIx], copyLength); } return fullLength; } else { int64_t start = regexp->fMatcher->start64(groupNum, *status); int64_t limit = regexp->fMatcher->end64(groupNum, *status); if (U_FAILURE(*status)) { return 0; } // Note edge cases: // Group didn't match: start == end == -1. UText trims to 0, UText gives zero length result. // Zero Length Match: start == end. 
int32_t length = utext_extract(regexp->fMatcher->inputText(), start, limit, dest, destCapacity, status); return length; } } //------------------------------------------------------------------------------ // // uregex_groupUText // //------------------------------------------------------------------------------ U_CAPI UText * U_EXPORT2 uregex_groupUText(URegularExpression *regexp2, int32_t groupNum, UText *dest, int64_t *groupLength, UErrorCode *status) { RegularExpression *regexp = (RegularExpression*)regexp2; if (validateRE(regexp, true, status) == false) { UErrorCode emptyTextStatus = U_ZERO_ERROR; return (dest ? dest : utext_openUChars(nullptr, nullptr, 0, &emptyTextStatus)); } return regexp->fMatcher->group(groupNum, dest, *groupLength, *status); } //------------------------------------------------------------------------------ // // uregex_start // //------------------------------------------------------------------------------ U_CAPI int32_t U_EXPORT2 uregex_start(URegularExpression *regexp2, int32_t groupNum, UErrorCode *status) { return (int32_t)uregex_start64( regexp2, groupNum, status); } U_CAPI int64_t U_EXPORT2 uregex_start64(URegularExpression *regexp2, int32_t groupNum, UErrorCode *status) { RegularExpression *regexp = (RegularExpression*)regexp2; if (validateRE(regexp, true, status) == false) { return 0; } int64_t result = regexp->fMatcher->start64(groupNum, *status); return result; } //------------------------------------------------------------------------------ // // uregex_end // //------------------------------------------------------------------------------ U_CAPI int32_t U_EXPORT2 uregex_end(URegularExpression *regexp2, int32_t groupNum, UErrorCode *status) { return (int32_t)uregex_end64( regexp2, groupNum, status); } U_CAPI int64_t U_EXPORT2 uregex_end64(URegularExpression *regexp2, int32_t groupNum, UErrorCode *status) { RegularExpression *regexp = (RegularExpression*)regexp2; if (validateRE(regexp, true, status) == false) { return 0; } 
int64_t result = regexp->fMatcher->end64(groupNum, *status); return result; } //------------------------------------------------------------------------------ // // uregex_reset // //------------------------------------------------------------------------------ U_CAPI void U_EXPORT2 uregex_reset(URegularExpression *regexp2, int32_t index, UErrorCode *status) { uregex_reset64( regexp2, (int64_t)index, status); } U_CAPI void U_EXPORT2 uregex_reset64(URegularExpression *regexp2, int64_t index, UErrorCode *status) { RegularExpression *regexp = (RegularExpression*)regexp2; if (validateRE(regexp, true, status) == false) { return; } regexp->fMatcher->reset(index, *status); } //------------------------------------------------------------------------------ // // uregex_setRegion // //------------------------------------------------------------------------------ U_CAPI void U_EXPORT2 uregex_setRegion(URegularExpression *regexp2, int32_t regionStart, int32_t regionLimit, UErrorCode *status) { uregex_setRegion64( regexp2, (int64_t)regionStart, (int64_t)regionLimit, status); } U_CAPI void U_EXPORT2 uregex_setRegion64(URegularExpression *regexp2, int64_t regionStart, int64_t regionLimit, UErrorCode *status) { RegularExpression *regexp = (RegularExpression*)regexp2; if (validateRE(regexp, true, status) == false) { return; } regexp->fMatcher->region(regionStart, regionLimit, *status); } //------------------------------------------------------------------------------ // // uregex_setRegionAndStart // //------------------------------------------------------------------------------ U_CAPI void U_EXPORT2 uregex_setRegionAndStart(URegularExpression *regexp2, int64_t regionStart, int64_t regionLimit, int64_t startIndex, UErrorCode *status) { RegularExpression *regexp = (RegularExpression*)regexp2; if (validateRE(regexp, true, status) == false) { return; } regexp->fMatcher->region(regionStart, regionLimit, startIndex, *status); } 
//------------------------------------------------------------------------------ // // uregex_regionStart // //------------------------------------------------------------------------------ U_CAPI int32_t U_EXPORT2 uregex_regionStart(const URegularExpression *regexp2, UErrorCode *status) { return (int32_t)uregex_regionStart64(regexp2, status); } U_CAPI int64_t U_EXPORT2 uregex_regionStart64(const URegularExpression *regexp2, UErrorCode *status) { RegularExpression *regexp = (RegularExpression*)regexp2; if (validateRE(regexp, true, status) == false) { return 0; } return regexp->fMatcher->regionStart(); } //------------------------------------------------------------------------------ // // uregex_regionEnd // //------------------------------------------------------------------------------ U_CAPI int32_t U_EXPORT2 uregex_regionEnd(const URegularExpression *regexp2, UErrorCode *status) { return (int32_t)uregex_regionEnd64(regexp2, status); } U_CAPI int64_t U_EXPORT2 uregex_regionEnd64(const URegularExpression *regexp2, UErrorCode *status) { RegularExpression *regexp = (RegularExpression*)regexp2; if (validateRE(regexp, true, status) == false) { return 0; } return regexp->fMatcher->regionEnd(); } //------------------------------------------------------------------------------ // // uregex_hasTransparentBounds // //------------------------------------------------------------------------------ U_CAPI UBool U_EXPORT2 uregex_hasTransparentBounds(const URegularExpression *regexp2, UErrorCode *status) { RegularExpression *regexp = (RegularExpression*)regexp2; if (validateRE(regexp, false, status) == false) { return false; } return regexp->fMatcher->hasTransparentBounds(); } //------------------------------------------------------------------------------ // // uregex_useTransparentBounds // //------------------------------------------------------------------------------ U_CAPI void U_EXPORT2 uregex_useTransparentBounds(URegularExpression *regexp2, UBool b, UErrorCode *status) 
{ RegularExpression *regexp = (RegularExpression*)regexp2; if (validateRE(regexp, false, status) == false) { return; } regexp->fMatcher->useTransparentBounds(b); } //------------------------------------------------------------------------------ // // uregex_hasAnchoringBounds // //------------------------------------------------------------------------------ U_CAPI UBool U_EXPORT2 uregex_hasAnchoringBounds(const URegularExpression *regexp2, UErrorCode *status) { RegularExpression *regexp = (RegularExpression*)regexp2; if (validateRE(regexp, false, status) == false) { return false; } return regexp->fMatcher->hasAnchoringBounds(); } //------------------------------------------------------------------------------ // // uregex_useAnchoringBounds // //------------------------------------------------------------------------------ U_CAPI void U_EXPORT2 uregex_useAnchoringBounds(URegularExpression *regexp2, UBool b, UErrorCode *status) { RegularExpression *regexp = (RegularExpression*)regexp2; if (validateRE(regexp, false, status) == false) { return; } regexp->fMatcher->useAnchoringBounds(b); } //------------------------------------------------------------------------------ // // uregex_hitEnd // //------------------------------------------------------------------------------ U_CAPI UBool U_EXPORT2 uregex_hitEnd(const URegularExpression *regexp2, UErrorCode *status) { RegularExpression *regexp = (RegularExpression*)regexp2; if (validateRE(regexp, true, status) == false) { return false; } return regexp->fMatcher->hitEnd(); } //------------------------------------------------------------------------------ // // uregex_requireEnd // //------------------------------------------------------------------------------ U_CAPI UBool U_EXPORT2 uregex_requireEnd(const URegularExpression *regexp2, UErrorCode *status) { RegularExpression *regexp = (RegularExpression*)regexp2; if (validateRE(regexp, true, status) == false) { return false; } return regexp->fMatcher->requireEnd(); } 
//------------------------------------------------------------------------------ // // uregex_setTimeLimit // //------------------------------------------------------------------------------ U_CAPI void U_EXPORT2 uregex_setTimeLimit(URegularExpression *regexp2, int32_t limit, UErrorCode *status) { RegularExpression *regexp = (RegularExpression*)regexp2; if (validateRE(regexp, false, status)) { regexp->fMatcher->setTimeLimit(limit, *status); } } //------------------------------------------------------------------------------ // // uregex_getTimeLimit // //------------------------------------------------------------------------------ U_CAPI int32_t U_EXPORT2 uregex_getTimeLimit(const URegularExpression *regexp2, UErrorCode *status) { int32_t retVal = 0; RegularExpression *regexp = (RegularExpression*)regexp2; if (validateRE(regexp, false, status)) { retVal = regexp->fMatcher->getTimeLimit(); } return retVal; } //------------------------------------------------------------------------------ // // uregex_setStackLimit // //------------------------------------------------------------------------------ U_CAPI void U_EXPORT2 uregex_setStackLimit(URegularExpression *regexp2, int32_t limit, UErrorCode *status) { RegularExpression *regexp = (RegularExpression*)regexp2; if (validateRE(regexp, false, status)) { regexp->fMatcher->setStackLimit(limit, *status); } } //------------------------------------------------------------------------------ // // uregex_getStackLimit // //------------------------------------------------------------------------------ U_CAPI int32_t U_EXPORT2 uregex_getStackLimit(const URegularExpression *regexp2, UErrorCode *status) { int32_t retVal = 0; RegularExpression *regexp = (RegularExpression*)regexp2; if (validateRE(regexp, false, status)) { retVal = regexp->fMatcher->getStackLimit(); } return retVal; } //------------------------------------------------------------------------------ // // uregex_setMatchCallback // 
//------------------------------------------------------------------------------ U_CAPI void U_EXPORT2 uregex_setMatchCallback(URegularExpression *regexp2, URegexMatchCallback *callback, const void *context, UErrorCode *status) { RegularExpression *regexp = (RegularExpression*)regexp2; if (validateRE(regexp, false, status)) { regexp->fMatcher->setMatchCallback(callback, context, *status); } } //------------------------------------------------------------------------------ // // uregex_getMatchCallback // //------------------------------------------------------------------------------ U_CAPI void U_EXPORT2 uregex_getMatchCallback(const URegularExpression *regexp2, URegexMatchCallback **callback, const void **context, UErrorCode *status) { RegularExpression *regexp = (RegularExpression*)regexp2; if (validateRE(regexp, false, status)) { regexp->fMatcher->getMatchCallback(*callback, *context, *status); } } //------------------------------------------------------------------------------ // // uregex_setMatchProgressCallback // //------------------------------------------------------------------------------ U_CAPI void U_EXPORT2 uregex_setFindProgressCallback(URegularExpression *regexp2, URegexFindProgressCallback *callback, const void *context, UErrorCode *status) { RegularExpression *regexp = (RegularExpression*)regexp2; if (validateRE(regexp, false, status)) { regexp->fMatcher->setFindProgressCallback(callback, context, *status); } } //------------------------------------------------------------------------------ // // uregex_getMatchCallback // //------------------------------------------------------------------------------ U_CAPI void U_EXPORT2 uregex_getFindProgressCallback(const URegularExpression *regexp2, URegexFindProgressCallback **callback, const void **context, UErrorCode *status) { RegularExpression *regexp = (RegularExpression*)regexp2; if (validateRE(regexp, false, status)) { regexp->fMatcher->getFindProgressCallback(*callback, *context, *status); } } 
//------------------------------------------------------------------------------ // // uregex_replaceAll // //------------------------------------------------------------------------------ U_CAPI int32_t U_EXPORT2 uregex_replaceAll(URegularExpression *regexp2, const char16_t *replacementText, int32_t replacementLength, char16_t *destBuf, int32_t destCapacity, UErrorCode *status) { RegularExpression *regexp = (RegularExpression*)regexp2; if (validateRE(regexp, true, status) == false) { return 0; } if (replacementText == nullptr || replacementLength < -1 || (destBuf == nullptr && destCapacity > 0) || destCapacity < 0) { *status = U_ILLEGAL_ARGUMENT_ERROR; return 0; } int32_t len = 0; uregex_reset(regexp2, 0, status); // Note: Separate error code variables for findNext() and appendReplacement() // are used so that destination buffer overflow errors // in appendReplacement won't stop findNext() from working. // appendReplacement() and appendTail() special case incoming buffer // overflow errors, continuing to return the correct length. UErrorCode findStatus = *status; while (uregex_findNext(regexp2, &findStatus)) { len += uregex_appendReplacement(regexp2, replacementText, replacementLength, &destBuf, &destCapacity, status); } len += uregex_appendTail(regexp2, &destBuf, &destCapacity, status); if (U_FAILURE(findStatus)) { // If anything went wrong with the findNext(), make that error trump // whatever may have happened with the append() operations. // Errors in findNext() are not expected. 
*status = findStatus; } return len; } //------------------------------------------------------------------------------ // // uregex_replaceAllUText // //------------------------------------------------------------------------------ U_CAPI UText * U_EXPORT2 uregex_replaceAllUText(URegularExpression *regexp2, UText *replacementText, UText *dest, UErrorCode *status) { RegularExpression *regexp = (RegularExpression*)regexp2; if (validateRE(regexp, true, status) == false) { return 0; } if (replacementText == nullptr) { *status = U_ILLEGAL_ARGUMENT_ERROR; return 0; } dest = regexp->fMatcher->replaceAll(replacementText, dest, *status); return dest; } //------------------------------------------------------------------------------ // // uregex_replaceFirst // //------------------------------------------------------------------------------ U_CAPI int32_t U_EXPORT2 uregex_replaceFirst(URegularExpression *regexp2, const char16_t *replacementText, int32_t replacementLength, char16_t *destBuf, int32_t destCapacity, UErrorCode *status) { RegularExpression *regexp = (RegularExpression*)regexp2; if (validateRE(regexp, true, status) == false) { return 0; } if (replacementText == nullptr || replacementLength < -1 || (destBuf == nullptr && destCapacity > 0) || destCapacity < 0) { *status = U_ILLEGAL_ARGUMENT_ERROR; return 0; } int32_t len = 0; UBool findSucceeded; uregex_reset(regexp2, 0, status); findSucceeded = uregex_find(regexp2, 0, status); if (findSucceeded) { len = uregex_appendReplacement(regexp2, replacementText, replacementLength, &destBuf, &destCapacity, status); } len += uregex_appendTail(regexp2, &destBuf, &destCapacity, status); return len; } //------------------------------------------------------------------------------ // // uregex_replaceFirstUText // //------------------------------------------------------------------------------ U_CAPI UText * U_EXPORT2 uregex_replaceFirstUText(URegularExpression *regexp2, UText *replacementText, UText *dest, UErrorCode *status) { 
RegularExpression *regexp = (RegularExpression*)regexp2; if (validateRE(regexp, true, status) == false) { return 0; } if (replacementText == nullptr) { *status = U_ILLEGAL_ARGUMENT_ERROR; return 0; } dest = regexp->fMatcher->replaceFirst(replacementText, dest, *status); return dest; } //------------------------------------------------------------------------------ // // uregex_appendReplacement // //------------------------------------------------------------------------------ U_NAMESPACE_BEGIN // // Dummy class, because these functions need to be friends of class RegexMatcher, // and stand-alone C functions don't work as friends // class RegexCImpl { public: inline static int32_t appendReplacement(RegularExpression *regexp, const char16_t *replacementText, int32_t replacementLength, char16_t **destBuf, int32_t *destCapacity, UErrorCode *status); inline static int32_t appendTail(RegularExpression *regexp, char16_t **destBuf, int32_t *destCapacity, UErrorCode *status); inline static int32_t split(RegularExpression *regexp, char16_t *destBuf, int32_t destCapacity, int32_t *requiredCapacity, char16_t *destFields[], int32_t destFieldsCapacity, UErrorCode *status); }; U_NAMESPACE_END static const char16_t BACKSLASH = 0x5c; static const char16_t DOLLARSIGN = 0x24; static const char16_t LEFTBRACKET = 0x7b; static const char16_t RIGHTBRACKET = 0x7d; // // Move a character to an output buffer, with bounds checking on the index. // Index advances even if capacity is exceeded, for preflight size computations. // This little sequence is used a LOT. // static inline void appendToBuf(char16_t c, int32_t *idx, char16_t *buf, int32_t bufCapacity) { if (*idx < bufCapacity) { buf[*idx] = c; } (*idx)++; } // // appendReplacement, the actual implementation. 
// int32_t RegexCImpl::appendReplacement(RegularExpression *regexp, const char16_t *replacementText, int32_t replacementLength, char16_t **destBuf, int32_t *destCapacity, UErrorCode *status) { // If we come in with a buffer overflow error, don't suppress the operation. // A series of appendReplacements, appendTail need to correctly preflight // the buffer size when an overflow happens somewhere in the middle. UBool pendingBufferOverflow = false; if (*status == U_BUFFER_OVERFLOW_ERROR && destCapacity != nullptr && *destCapacity == 0) { pendingBufferOverflow = true; *status = U_ZERO_ERROR; } // // Validate all parameters // if (validateRE(regexp, true, status) == false) { return 0; } if (replacementText == nullptr || replacementLength < -1 || destCapacity == nullptr || destBuf == nullptr || (*destBuf == nullptr && *destCapacity > 0) || *destCapacity < 0) { *status = U_ILLEGAL_ARGUMENT_ERROR; return 0; } RegexMatcher *m = regexp->fMatcher; if (m->fMatch == false) { *status = U_REGEX_INVALID_STATE; return 0; } char16_t *dest = *destBuf; int32_t capacity = *destCapacity; int32_t destIdx = 0; int32_t i; // If it wasn't supplied by the caller, get the length of the replacement text. // TODO: slightly smarter logic in the copy loop could watch for the NUL on // the fly and avoid this step. if (replacementLength == -1) { replacementLength = u_strlen(replacementText); } // Copy input string from the end of previous match to start of current match if (regexp->fText != nullptr) { int32_t matchStart; int32_t lastMatchEnd; if (UTEXT_USES_U16(m->fInputText)) { lastMatchEnd = (int32_t)m->fLastMatchEnd; matchStart = (int32_t)m->fMatchStart; } else { // !!!: Would like a better way to do this! 
UErrorCode tempStatus = U_ZERO_ERROR; lastMatchEnd = utext_extract(m->fInputText, 0, m->fLastMatchEnd, nullptr, 0, &tempStatus); tempStatus = U_ZERO_ERROR; matchStart = lastMatchEnd + utext_extract(m->fInputText, m->fLastMatchEnd, m->fMatchStart, nullptr, 0, &tempStatus); } for (i=lastMatchEnd; ifText[i], &destIdx, dest, capacity); } } else { UErrorCode possibleOverflowError = U_ZERO_ERROR; // ignore destIdx += utext_extract(m->fInputText, m->fLastMatchEnd, m->fMatchStart, dest==nullptr?nullptr:&dest[destIdx], REMAINING_CAPACITY(destIdx, capacity), &possibleOverflowError); } U_ASSERT(destIdx >= 0); // scan the replacement text, looking for substitutions ($n) and \escapes. int32_t replIdx = 0; while (replIdx < replacementLength && U_SUCCESS(*status)) { char16_t c = replacementText[replIdx]; replIdx++; if (c != DOLLARSIGN && c != BACKSLASH) { // Common case, no substitution, no escaping, // just copy the char to the dest buf. appendToBuf(c, &destIdx, dest, capacity); continue; } if (c == BACKSLASH) { // Backslash Escape. Copy the following char out without further checks. // Note: Surrogate pairs don't need any special handling // The second half wont be a '$' or a '\', and // will move to the dest normally on the next // loop iteration. if (replIdx >= replacementLength) { break; } c = replacementText[replIdx]; if (c==0x55/*U*/ || c==0x75/*u*/) { // We have a \udddd or \Udddddddd escape sequence. UChar32 escapedChar = u_unescapeAt(uregex_ucstr_unescape_charAt, &replIdx, // Index is updated by unescapeAt replacementLength, // Length of replacement text (void *)replacementText); if (escapedChar != (UChar32)0xFFFFFFFF) { if (escapedChar <= 0xffff) { appendToBuf((char16_t)escapedChar, &destIdx, dest, capacity); } else { appendToBuf(U16_LEAD(escapedChar), &destIdx, dest, capacity); appendToBuf(U16_TRAIL(escapedChar), &destIdx, dest, capacity); } continue; } // Note: if the \u escape was invalid, just fall through and // treat it as a plain \ escape. 
} // Plain backslash escape. Just put out the escaped character. appendToBuf(c, &destIdx, dest, capacity); replIdx++; continue; } // We've got a $. Pick up the following capture group name or number. // For numbers, consume only digits that produce a valid capture group for the pattern. int32_t groupNum = 0; U_ASSERT(c == DOLLARSIGN); UChar32 c32 = -1; if (replIdx < replacementLength) { U16_GET(replacementText, 0, replIdx, replacementLength, c32); } if (u_isdigit(c32)) { int32_t numDigits = 0; int32_t numCaptureGroups = m->fPattern->fGroupMap->size(); for (;;) { if (replIdx >= replacementLength) { break; } U16_GET(replacementText, 0, replIdx, replacementLength, c32); if (u_isdigit(c32) == false) { break; } int32_t digitVal = u_charDigitValue(c32); if (groupNum * 10 + digitVal <= numCaptureGroups) { groupNum = groupNum * 10 + digitVal; U16_FWD_1(replacementText, replIdx, replacementLength); numDigits++; } else { if (numDigits == 0) { *status = U_INDEX_OUTOFBOUNDS_ERROR; } break; } } } else if (c32 == LEFTBRACKET) { // Scan for Named Capture Group, ${name}. UnicodeString groupName; U16_FWD_1(replacementText, replIdx, replacementLength); while (U_SUCCESS(*status) && c32 != RIGHTBRACKET) { if (replIdx >= replacementLength) { *status = U_REGEX_INVALID_CAPTURE_GROUP_NAME; break; } U16_NEXT(replacementText, replIdx, replacementLength, c32); if ((c32 >= 0x41 && c32 <= 0x5a) || // A..Z (c32 >= 0x61 && c32 <= 0x7a) || // a..z (c32 >= 0x31 && c32 <= 0x39)) { // 0..9 groupName.append(c32); } else if (c32 == RIGHTBRACKET) { groupNum = regexp->fPat->fNamedCaptureMap ? uhash_geti(regexp->fPat->fNamedCaptureMap, &groupName) : 0; if (groupNum == 0) { // Name not defined by pattern. *status = U_REGEX_INVALID_CAPTURE_GROUP_NAME; } } else { // Character was something other than a name char or a closing '}' *status = U_REGEX_INVALID_CAPTURE_GROUP_NAME; } } } else { // $ not followed by {name} or digits. 
*status = U_REGEX_INVALID_CAPTURE_GROUP_NAME; } // Finally, append the capture group data to the destination. if (U_SUCCESS(*status)) { destIdx += uregex_group((URegularExpression*)regexp, groupNum, dest==nullptr?nullptr:&dest[destIdx], REMAINING_CAPACITY(destIdx, capacity), status); if (*status == U_BUFFER_OVERFLOW_ERROR) { // Ignore buffer overflow when extracting the group. We need to // continue on to get full size of the untruncated result. We will // raise our own buffer overflow error at the end. *status = U_ZERO_ERROR; } } if (U_FAILURE(*status)) { // bad group number or name. break; } } // // Nul Terminate the dest buffer if possible. // Set the appropriate buffer overflow or not terminated error, if needed. // if (destIdx < capacity) { dest[destIdx] = 0; } else if (U_SUCCESS(*status)) { if (destIdx == *destCapacity) { *status = U_STRING_NOT_TERMINATED_WARNING; } else { *status = U_BUFFER_OVERFLOW_ERROR; } } // // Return an updated dest buffer and capacity to the caller. // if (destIdx > 0 && *destCapacity > 0) { if (destIdx < capacity) { *destBuf += destIdx; *destCapacity -= destIdx; } else { *destBuf += capacity; *destCapacity = 0; } } // If we came in with a buffer overflow, make sure we go out with one also. 
// (A zero length match right at the end of the previous match could // make this function succeed even though a previous call had overflowed the buf) if (pendingBufferOverflow && U_SUCCESS(*status)) { *status = U_BUFFER_OVERFLOW_ERROR; } return destIdx; } // // appendReplacement the actual API function, // U_CAPI int32_t U_EXPORT2 uregex_appendReplacement(URegularExpression *regexp2, const char16_t *replacementText, int32_t replacementLength, char16_t **destBuf, int32_t *destCapacity, UErrorCode *status) { RegularExpression *regexp = (RegularExpression*)regexp2; return RegexCImpl::appendReplacement( regexp, replacementText, replacementLength,destBuf, destCapacity, status); } // // uregex_appendReplacementUText...can just use the normal C++ method // U_CAPI void U_EXPORT2 uregex_appendReplacementUText(URegularExpression *regexp2, UText *replText, UText *dest, UErrorCode *status) { RegularExpression *regexp = (RegularExpression*)regexp2; regexp->fMatcher->appendReplacement(dest, replText, *status); } //------------------------------------------------------------------------------ // // uregex_appendTail // //------------------------------------------------------------------------------ int32_t RegexCImpl::appendTail(RegularExpression *regexp, char16_t **destBuf, int32_t *destCapacity, UErrorCode *status) { // If we come in with a buffer overflow error, don't suppress the operation. // A series of appendReplacements, appendTail need to correctly preflight // the buffer size when an overflow happens somewhere in the middle. 
UBool pendingBufferOverflow = false; if (*status == U_BUFFER_OVERFLOW_ERROR && destCapacity != nullptr && *destCapacity == 0) { pendingBufferOverflow = true; *status = U_ZERO_ERROR; } if (validateRE(regexp, true, status) == false) { return 0; } if (destCapacity == nullptr || destBuf == nullptr || (*destBuf == nullptr && *destCapacity > 0) || *destCapacity < 0) { *status = U_ILLEGAL_ARGUMENT_ERROR; return 0; } RegexMatcher *m = regexp->fMatcher; int32_t destIdx = 0; int32_t destCap = *destCapacity; char16_t *dest = *destBuf; if (regexp->fText != nullptr) { int32_t srcIdx; int64_t nativeIdx = (m->fMatch ? m->fMatchEnd : m->fLastMatchEnd); if (nativeIdx == -1) { srcIdx = 0; } else if (UTEXT_USES_U16(m->fInputText)) { srcIdx = (int32_t)nativeIdx; } else { UErrorCode newStatus = U_ZERO_ERROR; srcIdx = utext_extract(m->fInputText, 0, nativeIdx, nullptr, 0, &newStatus); } for (;;) { U_ASSERT(destIdx >= 0); if (srcIdx == regexp->fTextLength) { break; } char16_t c = regexp->fText[srcIdx]; if (c == 0 && regexp->fTextLength == -1) { regexp->fTextLength = srcIdx; break; } if (destIdx < destCap) { dest[destIdx] = c; } else { // We've overflowed the dest buffer. // If the total input string length is known, we can // compute the total buffer size needed without scanning through the string. if (regexp->fTextLength > 0) { destIdx += (regexp->fTextLength - srcIdx); break; } } srcIdx++; destIdx++; } } else { int64_t srcIdx; if (m->fMatch) { // The most recent call to find() succeeded. srcIdx = m->fMatchEnd; } else { // The last call to find() on this matcher failed(). // Look back to the end of the last find() that succeeded for src index. srcIdx = m->fLastMatchEnd; if (srcIdx == -1) { // There has been no successful match with this matcher. // We want to copy the whole string. 
srcIdx = 0; } } destIdx = utext_extract(m->fInputText, srcIdx, m->fInputLength, dest, destCap, status); } // // NUL terminate the output string, if possible, otherwise issue the // appropriate error or warning. // if (destIdx < destCap) { dest[destIdx] = 0; } else if (destIdx == destCap) { *status = U_STRING_NOT_TERMINATED_WARNING; } else { *status = U_BUFFER_OVERFLOW_ERROR; } // // Update the user's buffer ptr and capacity vars to reflect the // amount used. // if (destIdx < destCap) { *destBuf += destIdx; *destCapacity -= destIdx; } else if (*destBuf != nullptr) { *destBuf += destCap; *destCapacity = 0; } if (pendingBufferOverflow && U_SUCCESS(*status)) { *status = U_BUFFER_OVERFLOW_ERROR; } return destIdx; } // // appendTail the actual API function // U_CAPI int32_t U_EXPORT2 uregex_appendTail(URegularExpression *regexp2, char16_t **destBuf, int32_t *destCapacity, UErrorCode *status) { RegularExpression *regexp = (RegularExpression*)regexp2; return RegexCImpl::appendTail(regexp, destBuf, destCapacity, status); } // // uregex_appendTailUText...can just use the normal C++ method // U_CAPI UText * U_EXPORT2 uregex_appendTailUText(URegularExpression *regexp2, UText *dest, UErrorCode *status) { RegularExpression *regexp = (RegularExpression*)regexp2; return regexp->fMatcher->appendTail(dest, *status); } //------------------------------------------------------------------------------ // // copyString Internal utility to copy a string to an output buffer, // while managing buffer overflow and preflight size // computation. NUL termination is added to destination, // and the NUL is counted in the output size. // //------------------------------------------------------------------------------ #if 0 static void copyString(char16_t *destBuffer, // Destination buffer. int32_t destCapacity, // Total capacity of dest buffer int32_t *destIndex, // Index into dest buffer. Updated on return. // Update not clipped to destCapacity. 
const char16_t *srcPtr, // Pointer to source string int32_t srcLen) // Source string len. { int32_t si; int32_t di = *destIndex; char16_t c; for (si=0; sifMatcher->reset(); UText *inputText = regexp->fMatcher->fInputText; int64_t nextOutputStringStart = 0; int64_t inputLen = regexp->fMatcher->fInputLength; if (inputLen == 0) { return 0; } // // Loop through the input text, searching for the delimiter pattern // int32_t i; // Index of the field being processed. int32_t destIdx = 0; // Next available position in destBuf; int32_t numCaptureGroups = regexp->fMatcher->groupCount(); UErrorCode tStatus = U_ZERO_ERROR; // Want to ignore any buffer overflow errors so that the strings are still counted for (i=0; ; i++) { if (i>=destFieldsCapacity-1) { // There are one or zero output strings left. // Fill the last output string with whatever is left from the input, then exit the loop. // ( i will be == destFieldsCapacity if we filled the output array while processing // capture groups of the delimiter expression, in which case we will discard the // last capture group saved in favor of the unprocessed remainder of the // input string.) if (inputLen > nextOutputStringStart) { if (i != destFieldsCapacity-1) { // No fields are left. Recycle the last one for holding the trailing part of // the input string. i = destFieldsCapacity-1; destIdx = (int32_t)(destFields[i] - destFields[0]); } destFields[i] = (destBuf == nullptr) ? nullptr : &destBuf[destIdx]; destIdx += 1 + utext_extract(inputText, nextOutputStringStart, inputLen, destFields[i], REMAINING_CAPACITY(destIdx, destCapacity), status); } break; } if (regexp->fMatcher->find()) { // We found another delimiter. Move everything from where we started looking // up until the start of the delimiter into the next output string. destFields[i] = (destBuf == nullptr) ? 
nullptr : &destBuf[destIdx]; destIdx += 1 + utext_extract(inputText, nextOutputStringStart, regexp->fMatcher->fMatchStart, destFields[i], REMAINING_CAPACITY(destIdx, destCapacity), &tStatus); if (tStatus == U_BUFFER_OVERFLOW_ERROR) { tStatus = U_ZERO_ERROR; } else { *status = tStatus; } nextOutputStringStart = regexp->fMatcher->fMatchEnd; // If the delimiter pattern has capturing parentheses, the captured // text goes out into the next n destination strings. int32_t groupNum; for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) { // If we've run out of output string slots, bail out. if (i==destFieldsCapacity-1) { break; } i++; // Set up to extract the capture group contents into the dest buffer. destFields[i] = &destBuf[destIdx]; tStatus = U_ZERO_ERROR; int32_t t = uregex_group((URegularExpression*)regexp, groupNum, destFields[i], REMAINING_CAPACITY(destIdx, destCapacity), &tStatus); destIdx += t + 1; // Record the space used in the output string buffer. // +1 for the NUL that terminates the string. if (tStatus == U_BUFFER_OVERFLOW_ERROR) { tStatus = U_ZERO_ERROR; } else { *status = tStatus; } } if (nextOutputStringStart == inputLen) { // The delimiter was at the end of the string. // Output an empty string, and then we are done. if (destIdx < destCapacity) { destBuf[destIdx] = 0; } if (i < destFieldsCapacity-1) { ++i; } if (destIdx < destCapacity) { destFields[i] = destBuf + destIdx; } ++destIdx; break; } } else { // We ran off the end of the input while looking for the next delimiter. // All the remaining text goes into the current output string. destFields[i] = (destBuf == nullptr) ? 
nullptr : &destBuf[destIdx]; destIdx += 1 + utext_extract(inputText, nextOutputStringStart, inputLen, destFields[i], REMAINING_CAPACITY(destIdx, destCapacity), status); break; } } // Zero out any unused portion of the destFields array int j; for (j=i+1; j destCapacity) { *status = U_BUFFER_OVERFLOW_ERROR; } return i+1; } // // uregex_split The actual API function // U_CAPI int32_t U_EXPORT2 uregex_split(URegularExpression *regexp2, char16_t *destBuf, int32_t destCapacity, int32_t *requiredCapacity, char16_t *destFields[], int32_t destFieldsCapacity, UErrorCode *status) { RegularExpression *regexp = (RegularExpression*)regexp2; if (validateRE(regexp, true, status) == false) { return 0; } if ((destBuf == nullptr && destCapacity > 0) || destCapacity < 0 || destFields == nullptr || destFieldsCapacity < 1 ) { *status = U_ILLEGAL_ARGUMENT_ERROR; return 0; } return RegexCImpl::split(regexp, destBuf, destCapacity, requiredCapacity, destFields, destFieldsCapacity, status); } // // uregex_splitUText...can just use the normal C++ method // U_CAPI int32_t U_EXPORT2 uregex_splitUText(URegularExpression *regexp2, UText *destFields[], int32_t destFieldsCapacity, UErrorCode *status) { RegularExpression *regexp = (RegularExpression*)regexp2; return regexp->fMatcher->split(regexp->fMatcher->inputText(), destFields, destFieldsCapacity, *status); } #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS stringi/src/icu74/i18n/brktrans.cpp0000644000176200001440000001405714700200761016570 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ********************************************************************** * Copyright (C) 2008-2015, International Business Machines * Corporation and others. All Rights Reserved. 
********************************************************************** * Date Name Description * 05/11/2008 Andy Heninger Port from Java ********************************************************************** */ #include #include "unicode/utypes.h" #if !UCONFIG_NO_TRANSLITERATION && !UCONFIG_NO_BREAK_ITERATION #include "unicode/brkiter.h" #include "unicode/localpointer.h" #include "unicode/uchar.h" #include "unicode/unifilt.h" #include "unicode/uniset.h" #include "brktrans.h" #include "cmemory.h" #include "mutex.h" #include "uprops.h" #include "uinvchar.h" #include "util.h" #include "uvectr32.h" U_NAMESPACE_BEGIN UOBJECT_DEFINE_RTTI_IMPLEMENTATION(BreakTransliterator) static const char16_t SPACE = 32; // ' ' /** * Constructs a transliterator with the default delimiters '{' and * '}'. */ BreakTransliterator::BreakTransliterator(UnicodeFilter* adoptedFilter) : Transliterator(UNICODE_STRING("Any-BreakInternal", 17), adoptedFilter), cachedBI(nullptr), cachedBoundaries(nullptr), fInsertion(SPACE) { } /** * Destructor. */ BreakTransliterator::~BreakTransliterator() { } /** * Copy constructor. */ BreakTransliterator::BreakTransliterator(const BreakTransliterator& o) : Transliterator(o), cachedBI(nullptr), cachedBoundaries(nullptr), fInsertion(o.fInsertion) { } /** * Transliterator API. */ BreakTransliterator* BreakTransliterator::clone() const { return new BreakTransliterator(*this); } /** * Implements {@link Transliterator#handleTransliterate}. 
*/ void BreakTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets, UBool isIncremental ) const { UErrorCode status = U_ZERO_ERROR; LocalPointer bi; LocalPointer boundaries; { Mutex m; BreakTransliterator *nonConstThis = const_cast(this); boundaries = std::move(nonConstThis->cachedBoundaries); bi = std::move(nonConstThis->cachedBI); } if (bi.isNull()) { bi.adoptInstead(BreakIterator::createWordInstance(Locale::getEnglish(), status)); } if (boundaries.isNull()) { boundaries.adoptInstead(new UVector32(status)); } if (bi.isNull() || boundaries.isNull() || U_FAILURE(status)) { return; } boundaries->removeAllElements(); UnicodeString sText = replaceableAsString(text); bi->setText(sText); bi->preceding(offsets.start); // To make things much easier, we will stack the boundaries, and then insert at the end. // generally, we won't need too many, since we will be filtered. int32_t boundary; for(boundary = bi->next(); boundary != UBRK_DONE && boundary < offsets.limit; boundary = bi->next()) { if (boundary == 0) continue; // HACK: Check to see that preceding item was a letter UChar32 cp = sText.char32At(boundary-1); int type = u_charType(cp); //System.out.println(Integer.toString(cp,16) + " (before): " + type); if ((U_MASK(type) & (U_GC_L_MASK | U_GC_M_MASK)) == 0) continue; cp = sText.char32At(boundary); type = u_charType(cp); //System.out.println(Integer.toString(cp,16) + " (after): " + type); if ((U_MASK(type) & (U_GC_L_MASK | U_GC_M_MASK)) == 0) continue; boundaries->addElement(boundary, status); // printf("Boundary at %d\n", boundary); } int delta = 0; int lastBoundary = 0; if (boundaries->size() != 0) { // if we found something, adjust delta = boundaries->size() * fInsertion.length(); lastBoundary = boundaries->lastElementi(); // we do this from the end backwards, so that we don't have to keep updating. 
while (boundaries->size() > 0) { boundary = boundaries->popi(); text.handleReplaceBetween(boundary, boundary, fInsertion); } } // Now fix up the return values offsets.contextLimit += delta; offsets.limit += delta; offsets.start = isIncremental ? lastBoundary + delta : offsets.limit; // Return break iterator & boundaries vector to the cache. { Mutex m; BreakTransliterator *nonConstThis = const_cast(this); if (nonConstThis->cachedBI.isNull()) { nonConstThis->cachedBI = std::move(bi); } if (nonConstThis->cachedBoundaries.isNull()) { nonConstThis->cachedBoundaries = std::move(boundaries); } } // TODO: do something with U_FAILURE(status); // (need to look at transliterators overall, not just here.) } // // getInsertion() // const UnicodeString &BreakTransliterator::getInsertion() const { return fInsertion; } // // setInsertion() // void BreakTransliterator::setInsertion(const UnicodeString &insertion) { this->fInsertion = insertion; } // // replaceableAsString Hack to let break iterators work // on the replaceable text from transliterators. // In practice, the only real Replaceable type that we // will be seeing is UnicodeString, so this function // will normally be efficient. // UnicodeString BreakTransliterator::replaceableAsString(Replaceable &r) { UnicodeString s; UnicodeString *rs = dynamic_cast(&r); if (rs != nullptr) { s = *rs; } else { r.extractBetween(0, r.length(), s); } return s; } U_NAMESPACE_END #endif /* #if !UCONFIG_NO_TRANSLITERATION */ stringi/src/icu74/i18n/ucln_in.h0000644000176200001440000000431114700200761016026 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ****************************************************************************** * Copyright (C) 2001-2016, International Business Machines * Corporation and others. All Rights Reserved. 
******************************************************************************
* file name:  ucln_in.h
* encoding:   UTF-8
* tab size:   8 (not used)
* indentation:4
*
* created on: 2001July05
* created by: George Rhoten
*/

#ifndef __UCLN_IN_H__
#define __UCLN_IN_H__

#include "unicode/utypes.h"
#include "ucln.h"

/*
Please keep the order of the enums declared in the same order
as the functions are supposed to be called.
It's usually best to have child dependencies called first.

UCLN_I18N_START (-1) and UCLN_I18N_COUNT bracket the list; every other
enumerator takes the next consecutive value, so new entries must be added
between those two.
*/
typedef enum ECleanupI18NType {
    UCLN_I18N_START = -1,
    UCLN_I18N_UNIT_EXTRAS,
    UCLN_I18N_NUMBER_SKELETONS,
    UCLN_I18N_CURRENCY_SPACING,
    UCLN_I18N_SPOOF,
    UCLN_I18N_SPOOFDATA,
    UCLN_I18N_TRANSLITERATOR,
    UCLN_I18N_REGEX,
    UCLN_I18N_JAPANESE_CALENDAR,
    UCLN_I18N_ISLAMIC_CALENDAR,
    UCLN_I18N_CHINESE_CALENDAR,
    UCLN_I18N_HEBREW_CALENDAR,
    UCLN_I18N_ASTRO_CALENDAR,
    UCLN_I18N_DANGI_CALENDAR,
    UCLN_I18N_CALENDAR,
    UCLN_I18N_TIMEZONEFORMAT,
    UCLN_I18N_TZDBTIMEZONENAMES,
    UCLN_I18N_TIMEZONEGENERICNAMES,
    UCLN_I18N_TIMEZONENAMES,
    UCLN_I18N_ZONEMETA,
    UCLN_I18N_TIMEZONE,
    UCLN_I18N_DIGITLIST,
    UCLN_I18N_DECFMT,
    UCLN_I18N_NUMFMT,
    UCLN_I18N_ALLOWED_HOUR_FORMATS,
    UCLN_I18N_DAYPERIODRULES,
    UCLN_I18N_SMPDTFMT,
    UCLN_I18N_USEARCH,
    UCLN_I18N_COLLATOR,
    UCLN_I18N_UCOL_RES,
    UCLN_I18N_CSDET,
    UCLN_I18N_COLLATION_ROOT,
    UCLN_I18N_GENDERINFO,
    UCLN_I18N_CDFINFO,
    UCLN_I18N_REGION,
    UCLN_I18N_LIST_FORMATTER,
    UCLN_I18N_NUMSYS,
    UCLN_I18N_COUNT /* This must be last */
} ECleanupI18NType;

/* Main library cleanup registration function. */
/* See common/ucln.h for details on adding a cleanup function. */
/* Note: the global mutex must not be held when calling this function. */
U_CFUNC void U_EXPORT2 ucln_i18n_registerCleanup(ECleanupI18NType type,
                                                 cleanupFunc *func);

#endif
stringi/src/icu74/i18n/measure.cpp0000644000176200001440000000374214700200761016402 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html /* ********************************************************************** * Copyright (c) 2004-2014, International Business Machines * Corporation and others. All Rights Reserved. ********************************************************************** * Author: Alan Liu * Created: April 26, 2004 * Since: ICU 3.0 ********************************************************************** */ #include "utypeinfo.h" // for 'typeid' to work #include "unicode/utypes.h" #if !UCONFIG_NO_FORMATTING #include "unicode/measure.h" #include "unicode/measunit.h" U_NAMESPACE_BEGIN UOBJECT_DEFINE_RTTI_IMPLEMENTATION(Measure) Measure::Measure() : unit(nullptr) {} Measure::Measure(const Formattable& _number, MeasureUnit* adoptedUnit, UErrorCode& ec) : number(_number), unit(adoptedUnit) { if (U_SUCCESS(ec) && (!number.isNumeric() || adoptedUnit == 0)) { ec = U_ILLEGAL_ARGUMENT_ERROR; } } Measure::Measure(const Measure& other) : UObject(other), unit(nullptr) { *this = other; } Measure& Measure::operator=(const Measure& other) { if (this != &other) { delete unit; number = other.number; if (other.unit != nullptr) { unit = other.unit->clone(); } else { unit = nullptr; } } return *this; } Measure *Measure::clone() const { return new Measure(*this); } Measure::~Measure() { delete unit; } bool Measure::operator==(const UObject& other) const { if (this == &other) { // Same object, equal return true; } if (typeid(*this) != typeid(other)) { // Different types, not equal return false; } const Measure &m = static_cast(other); return number == m.number && ((unit == nullptr) == (m.unit == nullptr)) && (unit == nullptr || *unit == *m.unit); } U_NAMESPACE_END #endif // !UCONFIG_NO_FORMATTING stringi/src/icu74/i18n/double-conversion.h0000644000176200001440000000426314700200761020042 0ustar liggesusers// © 2018 and later: Unicode, Inc. and others. 
// License & terms of use: http://www.unicode.org/copyright.html // // From the double-conversion library. Original license: // // Copyright 2012 the V8 project authors. All rights reserved. // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are // met: // // * Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // * Redistributions in binary form must reproduce the above // copyright notice, this list of conditions and the following // disclaimer in the documentation and/or other materials provided // with the distribution. // * Neither the name of Google Inc. nor the names of its // contributors may be used to endorse or promote products derived // from this software without specific prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // ICU PATCH: ifdef around UCONFIG_NO_FORMATTING #include "unicode/utypes.h" #if !UCONFIG_NO_FORMATTING #ifndef DOUBLE_CONVERSION_DOUBLE_CONVERSION_H_ #define DOUBLE_CONVERSION_DOUBLE_CONVERSION_H_ // ICU PATCH: Customize header file paths for ICU. 
#include "double-conversion-string-to-double.h" #include "double-conversion-double-to-string.h" #endif // DOUBLE_CONVERSION_DOUBLE_CONVERSION_H_ #endif // ICU PATCH: close #if !UCONFIG_NO_FORMATTING stringi/src/icu74/i18n/tzfmt.cpp0000644000176200001440000030013214700200761016076 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ******************************************************************************* * Copyright (C) 2011-2015, International Business Machines Corporation and * others. All Rights Reserved. ******************************************************************************* */ #include "unicode/utypes.h" #if !UCONFIG_NO_FORMATTING #include "unicode/calendar.h" #include "unicode/tzfmt.h" #include "unicode/numsys.h" #include "unicode/strenum.h" #include "unicode/uchar.h" #include "unicode/udat.h" #include "unicode/ustring.h" #include "unicode/utf16.h" #include "bytesinkutil.h" #include "charstr.h" #include "tzgnames.h" #include "cmemory.h" #include "cstring.h" #include "putilimp.h" #include "uassert.h" #include "ucln_in.h" #include "ulocimp.h" #include "umutex.h" #include "uresimp.h" #include "ureslocs.h" #include "uvector.h" #include "zonemeta.h" #include "tznames_impl.h" // TextTrieMap #include "patternprops.h" U_NAMESPACE_BEGIN // Bit flags used by the parse method. // The order must match UTimeZoneFormatStyle enum. 
#define ISO_Z_STYLE_FLAG 0x0080 #define ISO_LOCAL_STYLE_FLAG 0x0100 static const int16_t STYLE_PARSE_FLAGS[] = { 0x0001, // UTZFMT_STYLE_GENERIC_LOCATION, 0x0002, // UTZFMT_STYLE_GENERIC_LONG, 0x0004, // UTZFMT_STYLE_GENERIC_SHORT, 0x0008, // UTZFMT_STYLE_SPECIFIC_LONG, 0x0010, // UTZFMT_STYLE_SPECIFIC_SHORT, 0x0020, // UTZFMT_STYLE_LOCALIZED_GMT, 0x0040, // UTZFMT_STYLE_LOCALIZED_GMT_SHORT, ISO_Z_STYLE_FLAG, // UTZFMT_STYLE_ISO_BASIC_SHORT, ISO_LOCAL_STYLE_FLAG, // UTZFMT_STYLE_ISO_BASIC_LOCAL_SHORT, ISO_Z_STYLE_FLAG, // UTZFMT_STYLE_ISO_BASIC_FIXED, ISO_LOCAL_STYLE_FLAG, // UTZFMT_STYLE_ISO_BASIC_LOCAL_FIXED, ISO_Z_STYLE_FLAG, // UTZFMT_STYLE_ISO_BASIC_FULL, ISO_LOCAL_STYLE_FLAG, // UTZFMT_STYLE_ISO_BASIC_LOCAL_FULL, ISO_Z_STYLE_FLAG, // UTZFMT_STYLE_ISO_EXTENDED_FIXED, ISO_LOCAL_STYLE_FLAG, // UTZFMT_STYLE_ISO_EXTENDED_LOCAL_FIXED, ISO_Z_STYLE_FLAG, // UTZFMT_STYLE_ISO_EXTENDED_FULL, ISO_LOCAL_STYLE_FLAG, // UTZFMT_STYLE_ISO_EXTENDED_LOCAL_FULL, 0x0200, // UTZFMT_STYLE_ZONE_ID, 0x0400, // UTZFMT_STYLE_ZONE_ID_SHORT, 0x0800 // UTZFMT_STYLE_EXEMPLAR_LOCATION }; static const char gZoneStringsTag[] = "zoneStrings"; static const char gGmtFormatTag[]= "gmtFormat"; static const char gGmtZeroFormatTag[] = "gmtZeroFormat"; static const char gHourFormatTag[]= "hourFormat"; static const char16_t TZID_GMT[] = {0x0045, 0x0074, 0x0063, 0x002F, 0x0047, 0x004D, 0x0054, 0}; // Etc/GMT static const char16_t UNKNOWN_ZONE_ID[] = { 0x0045, 0x0074, 0x0063, 0x002F, 0x0055, 0x006E, 0x006B, 0x006E, 0x006F, 0x0077, 0x006E, 0}; // Etc/Unknown static const char16_t UNKNOWN_SHORT_ZONE_ID[] = {0x0075, 0x006E, 0x006B, 0}; // unk static const char16_t UNKNOWN_LOCATION[] = {0x0055, 0x006E, 0x006B, 0x006E, 0x006F, 0x0077, 0x006E, 0}; // Unknown static const char16_t DEFAULT_GMT_PATTERN[] = {0x0047, 0x004D, 0x0054, 0x007B, 0x0030, 0x007D, 0}; // GMT{0} //static const char16_t DEFAULT_GMT_ZERO[] = {0x0047, 0x004D, 0x0054, 0}; // GMT static const char16_t DEFAULT_GMT_POSITIVE_HM[] = {0x002B, 
0x0048, 0x003A, 0x006D, 0x006D, 0}; // +H:mm static const char16_t DEFAULT_GMT_POSITIVE_HMS[] = {0x002B, 0x0048, 0x003A, 0x006D, 0x006D, 0x003A, 0x0073, 0x0073, 0}; // +H:mm:ss static const char16_t DEFAULT_GMT_NEGATIVE_HM[] = {0x002D, 0x0048, 0x003A, 0x006D, 0x006D, 0}; // -H:mm static const char16_t DEFAULT_GMT_NEGATIVE_HMS[] = {0x002D, 0x0048, 0x003A, 0x006D, 0x006D, 0x003A, 0x0073, 0x0073, 0}; // -H:mm:ss static const char16_t DEFAULT_GMT_POSITIVE_H[] = {0x002B, 0x0048, 0}; // +H static const char16_t DEFAULT_GMT_NEGATIVE_H[] = {0x002D, 0x0048, 0}; // -H static const UChar32 DEFAULT_GMT_DIGITS[] = { 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, 0x0038, 0x0039 }; static const char16_t DEFAULT_GMT_OFFSET_SEP = 0x003A; // ':' static const char16_t ARG0[] = {0x007B, 0x0030, 0x007D}; // "{0}" static const int32_t ARG0_LEN = 3; static const char16_t DEFAULT_GMT_OFFSET_MINUTE_PATTERN[] = {0x006D, 0x006D, 0}; // "mm" static const char16_t DEFAULT_GMT_OFFSET_SECOND_PATTERN[] = {0x0073, 0x0073, 0}; // "ss" static const char16_t ALT_GMT_STRINGS[][4] = { {0x0047, 0x004D, 0x0054, 0}, // GMT {0x0055, 0x0054, 0x0043, 0}, // UTC {0x0055, 0x0054, 0, 0}, // UT {0, 0, 0, 0} }; // Order of GMT offset pattern parsing, *_HMS must be evaluated first // because *_HM is most likely a substring of *_HMS static const int32_t PARSE_GMT_OFFSET_TYPES[] = { UTZFMT_PAT_POSITIVE_HMS, UTZFMT_PAT_NEGATIVE_HMS, UTZFMT_PAT_POSITIVE_HM, UTZFMT_PAT_NEGATIVE_HM, UTZFMT_PAT_POSITIVE_H, UTZFMT_PAT_NEGATIVE_H, -1 }; static const char16_t SINGLEQUOTE = 0x0027; static const char16_t PLUS = 0x002B; static const char16_t MINUS = 0x002D; static const char16_t ISO8601_UTC = 0x005A; // 'Z' static const char16_t ISO8601_SEP = 0x003A; // ':' static const int32_t MILLIS_PER_HOUR = 60 * 60 * 1000; static const int32_t MILLIS_PER_MINUTE = 60 * 1000; static const int32_t MILLIS_PER_SECOND = 1000; // Maximum offset (exclusive) in millisecond supported by offset formats static int32_t MAX_OFFSET = 24 
* MILLIS_PER_HOUR; // Maximum values for GMT offset fields static const int32_t MAX_OFFSET_HOUR = 23; static const int32_t MAX_OFFSET_MINUTE = 59; static const int32_t MAX_OFFSET_SECOND = 59; static const int32_t UNKNOWN_OFFSET = 0x7FFFFFFF; static const int32_t ALL_SIMPLE_NAME_TYPES = UTZNM_LONG_STANDARD | UTZNM_LONG_DAYLIGHT | UTZNM_SHORT_STANDARD | UTZNM_SHORT_DAYLIGHT | UTZNM_EXEMPLAR_LOCATION; static const int32_t ALL_GENERIC_NAME_TYPES = UTZGNM_LOCATION | UTZGNM_LONG | UTZGNM_SHORT; #define DIGIT_VAL(c) (0x0030 <= (c) && (c) <= 0x0039 ? (c) - 0x0030 : -1) #define MAX_OFFSET_DIGITS 6 // Time Zone ID/Short ID trie static TextTrieMap *gZoneIdTrie = nullptr; static icu::UInitOnce gZoneIdTrieInitOnce {}; static TextTrieMap *gShortZoneIdTrie = nullptr; static icu::UInitOnce gShortZoneIdTrieInitOnce {}; static UMutex gLock; U_CDECL_BEGIN /** * Cleanup callback func */ static UBool U_CALLCONV tzfmt_cleanup() { if (gZoneIdTrie != nullptr) { delete gZoneIdTrie; } gZoneIdTrie = nullptr; gZoneIdTrieInitOnce.reset(); if (gShortZoneIdTrie != nullptr) { delete gShortZoneIdTrie; } gShortZoneIdTrie = nullptr; gShortZoneIdTrieInitOnce.reset(); return true; } U_CDECL_END // ------------------------------------------------------------------ // GMTOffsetField // // This class represents a localized GMT offset pattern // item and used by TimeZoneFormat // ------------------------------------------------------------------ class GMTOffsetField : public UMemory { public: enum FieldType { TEXT = 0, HOUR = 1, MINUTE = 2, SECOND = 4 }; virtual ~GMTOffsetField(); static GMTOffsetField* createText(const UnicodeString& text, UErrorCode& status); static GMTOffsetField* createTimeField(FieldType type, uint8_t width, UErrorCode& status); static UBool isValid(FieldType type, int32_t width); static FieldType getTypeByLetter(char16_t ch); FieldType getType() const; uint8_t getWidth() const; const char16_t* getPatternText() const; private: char16_t* fText; FieldType fType; uint8_t fWidth; 
GMTOffsetField(); }; GMTOffsetField::GMTOffsetField() : fText(nullptr), fType(TEXT), fWidth(0) { } GMTOffsetField::~GMTOffsetField() { if (fText) { uprv_free(fText); } } GMTOffsetField* GMTOffsetField::createText(const UnicodeString& text, UErrorCode& status) { if (U_FAILURE(status)) { return nullptr; } GMTOffsetField* result = new GMTOffsetField(); if (result == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; return nullptr; } int32_t len = text.length(); result->fText = (char16_t*)uprv_malloc((len + 1) * sizeof(char16_t)); if (result->fText == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; delete result; return nullptr; } u_strncpy(result->fText, text.getBuffer(), len); result->fText[len] = 0; result->fType = TEXT; return result; } GMTOffsetField* GMTOffsetField::createTimeField(FieldType type, uint8_t width, UErrorCode& status) { U_ASSERT(type != TEXT); if (U_FAILURE(status)) { return nullptr; } GMTOffsetField* result = new GMTOffsetField(); if (result == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; return nullptr; } result->fType = type; result->fWidth = width; return result; } UBool GMTOffsetField::isValid(FieldType type, int32_t width) { switch (type) { case HOUR: return (width == 1 || width == 2); case MINUTE: case SECOND: return (width == 2); default: UPRV_UNREACHABLE_EXIT; } return (width > 0); } GMTOffsetField::FieldType GMTOffsetField::getTypeByLetter(char16_t ch) { if (ch == 0x0048 /* H */) { return HOUR; } else if (ch == 0x006D /* m */) { return MINUTE; } else if (ch == 0x0073 /* s */) { return SECOND; } return TEXT; } inline GMTOffsetField::FieldType GMTOffsetField::getType() const { return fType; } inline uint8_t GMTOffsetField::getWidth() const { return fWidth; } inline const char16_t* GMTOffsetField::getPatternText() const { return fText; } U_CDECL_BEGIN static void U_CALLCONV deleteGMTOffsetField(void *obj) { delete static_cast(obj); } U_CDECL_END // ------------------------------------------------------------------ // TimeZoneFormat // 
------------------------------------------------------------------ UOBJECT_DEFINE_RTTI_IMPLEMENTATION(TimeZoneFormat) TimeZoneFormat::TimeZoneFormat(const Locale& locale, UErrorCode& status) : fLocale(locale), fTimeZoneNames(nullptr), fTimeZoneGenericNames(nullptr), fDefParseOptionFlags(0), fTZDBTimeZoneNames(nullptr) { for (int32_t i = 0; i < UTZFMT_PAT_COUNT; i++) { fGMTOffsetPatternItems[i] = nullptr; } const char* region = fLocale.getCountry(); int32_t regionLen = static_cast(uprv_strlen(region)); if (regionLen == 0) { CharString loc; { CharStringByteSink sink(&loc); ulocimp_addLikelySubtags(fLocale.getName(), sink, &status); } regionLen = uloc_getCountry(loc.data(), fTargetRegion, sizeof(fTargetRegion), &status); if (U_SUCCESS(status)) { fTargetRegion[regionLen] = 0; } else { return; } } else if (regionLen < (int32_t)sizeof(fTargetRegion)) { uprv_strcpy(fTargetRegion, region); } else { fTargetRegion[0] = 0; } fTimeZoneNames = TimeZoneNames::createInstance(locale, status); // fTimeZoneGenericNames is lazily instantiated if (U_FAILURE(status)) { return; } const char16_t* gmtPattern = nullptr; const char16_t* hourFormats = nullptr; UResourceBundle *zoneBundle = ures_open(U_ICUDATA_ZONE, locale.getName(), &status); UResourceBundle *zoneStringsArray = ures_getByKeyWithFallback(zoneBundle, gZoneStringsTag, nullptr, &status); if (U_SUCCESS(status)) { const char16_t* resStr; int32_t len; resStr = ures_getStringByKeyWithFallback(zoneStringsArray, gGmtFormatTag, &len, &status); if (len > 0) { gmtPattern = resStr; } resStr = ures_getStringByKeyWithFallback(zoneStringsArray, gGmtZeroFormatTag, &len, &status); if (len > 0) { fGMTZeroFormat.setTo(true, resStr, len); } resStr = ures_getStringByKeyWithFallback(zoneStringsArray, gHourFormatTag, &len, &status); if (len > 0) { hourFormats = resStr; } ures_close(zoneStringsArray); ures_close(zoneBundle); } if (gmtPattern == nullptr) { gmtPattern = DEFAULT_GMT_PATTERN; } initGMTPattern(UnicodeString(true, gmtPattern, -1), status); 
UBool useDefaultOffsetPatterns = true; if (hourFormats) { char16_t *sep = u_strchr(hourFormats, (char16_t)0x003B /* ';' */); if (sep != nullptr) { UErrorCode tmpStatus = U_ZERO_ERROR; fGMTOffsetPatterns[UTZFMT_PAT_POSITIVE_HM].setTo(false, hourFormats, (int32_t)(sep - hourFormats)); fGMTOffsetPatterns[UTZFMT_PAT_NEGATIVE_HM].setTo(true, sep + 1, -1); expandOffsetPattern(fGMTOffsetPatterns[UTZFMT_PAT_POSITIVE_HM], fGMTOffsetPatterns[UTZFMT_PAT_POSITIVE_HMS], tmpStatus); expandOffsetPattern(fGMTOffsetPatterns[UTZFMT_PAT_NEGATIVE_HM], fGMTOffsetPatterns[UTZFMT_PAT_NEGATIVE_HMS], tmpStatus); truncateOffsetPattern(fGMTOffsetPatterns[UTZFMT_PAT_POSITIVE_HM], fGMTOffsetPatterns[UTZFMT_PAT_POSITIVE_H], tmpStatus); truncateOffsetPattern(fGMTOffsetPatterns[UTZFMT_PAT_NEGATIVE_HM], fGMTOffsetPatterns[UTZFMT_PAT_NEGATIVE_H], tmpStatus); if (U_SUCCESS(tmpStatus)) { useDefaultOffsetPatterns = false; } } } if (useDefaultOffsetPatterns) { fGMTOffsetPatterns[UTZFMT_PAT_POSITIVE_H].setTo(true, DEFAULT_GMT_POSITIVE_H, -1); fGMTOffsetPatterns[UTZFMT_PAT_POSITIVE_HM].setTo(true, DEFAULT_GMT_POSITIVE_HM, -1); fGMTOffsetPatterns[UTZFMT_PAT_POSITIVE_HMS].setTo(true, DEFAULT_GMT_POSITIVE_HMS, -1); fGMTOffsetPatterns[UTZFMT_PAT_NEGATIVE_H].setTo(true, DEFAULT_GMT_NEGATIVE_H, -1); fGMTOffsetPatterns[UTZFMT_PAT_NEGATIVE_HM].setTo(true, DEFAULT_GMT_NEGATIVE_HM, -1); fGMTOffsetPatterns[UTZFMT_PAT_NEGATIVE_HMS].setTo(true, DEFAULT_GMT_NEGATIVE_HMS, -1); } initGMTOffsetPatterns(status); NumberingSystem* ns = NumberingSystem::createInstance(locale, status); UBool useDefDigits = true; if (ns && !ns->isAlgorithmic()) { UnicodeString digits = ns->getDescription(); useDefDigits = !toCodePoints(digits, fGMTOffsetDigits, 10); } if (useDefDigits) { uprv_memcpy(fGMTOffsetDigits, DEFAULT_GMT_DIGITS, sizeof(UChar32) * 10); } delete ns; } TimeZoneFormat::TimeZoneFormat(const TimeZoneFormat& other) : Format(other), fTimeZoneNames(nullptr), fTimeZoneGenericNames(nullptr), fTZDBTimeZoneNames(nullptr) { for 
(int32_t i = 0; i < UTZFMT_PAT_COUNT; i++) { fGMTOffsetPatternItems[i] = nullptr; } *this = other; } TimeZoneFormat::~TimeZoneFormat() { delete fTimeZoneNames; delete fTimeZoneGenericNames; delete fTZDBTimeZoneNames; for (int32_t i = 0; i < UTZFMT_PAT_COUNT; i++) { delete fGMTOffsetPatternItems[i]; } } TimeZoneFormat& TimeZoneFormat::operator=(const TimeZoneFormat& other) { if (this == &other) { return *this; } delete fTimeZoneNames; delete fTimeZoneGenericNames; fTimeZoneGenericNames = nullptr; delete fTZDBTimeZoneNames; fTZDBTimeZoneNames = nullptr; fLocale = other.fLocale; uprv_memcpy(fTargetRegion, other.fTargetRegion, sizeof(fTargetRegion)); fTimeZoneNames = other.fTimeZoneNames->clone(); if (other.fTimeZoneGenericNames) { // TODO: this test has dubious thread safety. fTimeZoneGenericNames = other.fTimeZoneGenericNames->clone(); } fGMTPattern = other.fGMTPattern; fGMTPatternPrefix = other.fGMTPatternPrefix; fGMTPatternSuffix = other.fGMTPatternSuffix; UErrorCode status = U_ZERO_ERROR; for (int32_t i = 0; i < UTZFMT_PAT_COUNT; i++) { fGMTOffsetPatterns[i] = other.fGMTOffsetPatterns[i]; delete fGMTOffsetPatternItems[i]; fGMTOffsetPatternItems[i] = nullptr; } initGMTOffsetPatterns(status); U_ASSERT(U_SUCCESS(status)); fGMTZeroFormat = other.fGMTZeroFormat; uprv_memcpy(fGMTOffsetDigits, other.fGMTOffsetDigits, sizeof(fGMTOffsetDigits)); fDefParseOptionFlags = other.fDefParseOptionFlags; return *this; } bool TimeZoneFormat::operator==(const Format& other) const { TimeZoneFormat* tzfmt = (TimeZoneFormat*)&other; bool isEqual = fLocale == tzfmt->fLocale && fGMTPattern == tzfmt->fGMTPattern && fGMTZeroFormat == tzfmt->fGMTZeroFormat && *fTimeZoneNames == *tzfmt->fTimeZoneNames; for (int32_t i = 0; i < UTZFMT_PAT_COUNT && isEqual; i++) { isEqual = fGMTOffsetPatterns[i] == tzfmt->fGMTOffsetPatterns[i]; } for (int32_t i = 0; i < 10 && isEqual; i++) { isEqual = fGMTOffsetDigits[i] == tzfmt->fGMTOffsetDigits[i]; } // TODO // Check fTimeZoneGenericNames. 
For now, // if fTimeZoneNames is same, fTimeZoneGenericNames should // be also equivalent. return isEqual; } TimeZoneFormat* TimeZoneFormat::clone() const { return new TimeZoneFormat(*this); } TimeZoneFormat* U_EXPORT2 TimeZoneFormat::createInstance(const Locale& locale, UErrorCode& status) { TimeZoneFormat* tzfmt = new TimeZoneFormat(locale, status); if (U_SUCCESS(status)) { return tzfmt; } delete tzfmt; return nullptr; } // ------------------------------------------------------------------ // Setter and Getter const TimeZoneNames* TimeZoneFormat::getTimeZoneNames() const { return (const TimeZoneNames*)fTimeZoneNames; } void TimeZoneFormat::adoptTimeZoneNames(TimeZoneNames *tznames) { delete fTimeZoneNames; fTimeZoneNames = tznames; // TODO - We should also update fTimeZoneGenericNames } void TimeZoneFormat::setTimeZoneNames(const TimeZoneNames &tznames) { delete fTimeZoneNames; fTimeZoneNames = tznames.clone(); // TODO - We should also update fTimeZoneGenericNames } void TimeZoneFormat::setDefaultParseOptions(uint32_t flags) { fDefParseOptionFlags = flags; } uint32_t TimeZoneFormat::getDefaultParseOptions() const { return fDefParseOptionFlags; } UnicodeString& TimeZoneFormat::getGMTPattern(UnicodeString& pattern) const { return pattern.setTo(fGMTPattern); } void TimeZoneFormat::setGMTPattern(const UnicodeString& pattern, UErrorCode& status) { initGMTPattern(pattern, status); } UnicodeString& TimeZoneFormat::getGMTOffsetPattern(UTimeZoneFormatGMTOffsetPatternType type, UnicodeString& pattern) const { return pattern.setTo(fGMTOffsetPatterns[type]); } void TimeZoneFormat::setGMTOffsetPattern(UTimeZoneFormatGMTOffsetPatternType type, const UnicodeString& pattern, UErrorCode& status) { if (U_FAILURE(status)) { return; } if (pattern == fGMTOffsetPatterns[type]) { // No need to reset return; } OffsetFields required = FIELDS_HM; switch (type) { case UTZFMT_PAT_POSITIVE_H: case UTZFMT_PAT_NEGATIVE_H: required = FIELDS_H; break; case UTZFMT_PAT_POSITIVE_HM: case 
UTZFMT_PAT_NEGATIVE_HM: required = FIELDS_HM; break; case UTZFMT_PAT_POSITIVE_HMS: case UTZFMT_PAT_NEGATIVE_HMS: required = FIELDS_HMS; break; default: UPRV_UNREACHABLE_EXIT; } UVector* patternItems = parseOffsetPattern(pattern, required, status); if (patternItems == nullptr) { return; } fGMTOffsetPatterns[type].setTo(pattern); delete fGMTOffsetPatternItems[type]; fGMTOffsetPatternItems[type] = patternItems; checkAbuttingHoursAndMinutes(); } UnicodeString& TimeZoneFormat::getGMTOffsetDigits(UnicodeString& digits) const { digits.remove(); for (int32_t i = 0; i < 10; i++) { digits.append(fGMTOffsetDigits[i]); } return digits; } void TimeZoneFormat::setGMTOffsetDigits(const UnicodeString& digits, UErrorCode& status) { if (U_FAILURE(status)) { return; } UChar32 digitArray[10]; if (!toCodePoints(digits, digitArray, 10)) { status = U_ILLEGAL_ARGUMENT_ERROR; return; } uprv_memcpy(fGMTOffsetDigits, digitArray, sizeof(UChar32)*10); } UnicodeString& TimeZoneFormat::getGMTZeroFormat(UnicodeString& gmtZeroFormat) const { return gmtZeroFormat.setTo(fGMTZeroFormat); } void TimeZoneFormat::setGMTZeroFormat(const UnicodeString& gmtZeroFormat, UErrorCode& status) { if (U_SUCCESS(status)) { if (gmtZeroFormat.isEmpty()) { status = U_ILLEGAL_ARGUMENT_ERROR; } else if (gmtZeroFormat != fGMTZeroFormat) { fGMTZeroFormat.setTo(gmtZeroFormat); } } } // ------------------------------------------------------------------ // Format and Parse UnicodeString& TimeZoneFormat::format(UTimeZoneFormatStyle style, const TimeZone& tz, UDate date, UnicodeString& name, UTimeZoneFormatTimeType* timeType /* = nullptr */) const { if (timeType) { *timeType = UTZFMT_TIME_TYPE_UNKNOWN; } UBool noOffsetFormatFallback = false; switch (style) { case UTZFMT_STYLE_GENERIC_LOCATION: formatGeneric(tz, UTZGNM_LOCATION, date, name); break; case UTZFMT_STYLE_GENERIC_LONG: formatGeneric(tz, UTZGNM_LONG, date, name); break; case UTZFMT_STYLE_GENERIC_SHORT: formatGeneric(tz, UTZGNM_SHORT, date, name); break; case 
UTZFMT_STYLE_SPECIFIC_LONG: formatSpecific(tz, UTZNM_LONG_STANDARD, UTZNM_LONG_DAYLIGHT, date, name, timeType); break; case UTZFMT_STYLE_SPECIFIC_SHORT: formatSpecific(tz, UTZNM_SHORT_STANDARD, UTZNM_SHORT_DAYLIGHT, date, name, timeType); break; case UTZFMT_STYLE_ZONE_ID: tz.getID(name); noOffsetFormatFallback = true; break; case UTZFMT_STYLE_ZONE_ID_SHORT: { const char16_t* shortID = ZoneMeta::getShortID(tz); if (shortID == nullptr) { shortID = UNKNOWN_SHORT_ZONE_ID; } name.setTo(shortID, -1); } noOffsetFormatFallback = true; break; case UTZFMT_STYLE_EXEMPLAR_LOCATION: formatExemplarLocation(tz, name); noOffsetFormatFallback = true; break; default: // will be handled below break; } if (name.isEmpty() && !noOffsetFormatFallback) { UErrorCode status = U_ZERO_ERROR; int32_t rawOffset, dstOffset; tz.getOffset(date, false, rawOffset, dstOffset, status); int32_t offset = rawOffset + dstOffset; if (U_SUCCESS(status)) { switch (style) { case UTZFMT_STYLE_GENERIC_LOCATION: case UTZFMT_STYLE_GENERIC_LONG: case UTZFMT_STYLE_SPECIFIC_LONG: case UTZFMT_STYLE_LOCALIZED_GMT: formatOffsetLocalizedGMT(offset, name, status); break; case UTZFMT_STYLE_GENERIC_SHORT: case UTZFMT_STYLE_SPECIFIC_SHORT: case UTZFMT_STYLE_LOCALIZED_GMT_SHORT: formatOffsetShortLocalizedGMT(offset, name, status); break; case UTZFMT_STYLE_ISO_BASIC_SHORT: formatOffsetISO8601Basic(offset, true, true, true, name, status); break; case UTZFMT_STYLE_ISO_BASIC_LOCAL_SHORT: formatOffsetISO8601Basic(offset, false, true, true, name, status); break; case UTZFMT_STYLE_ISO_BASIC_FIXED: formatOffsetISO8601Basic(offset, true, false, true, name, status); break; case UTZFMT_STYLE_ISO_BASIC_LOCAL_FIXED: formatOffsetISO8601Basic(offset, false, false, true, name, status); break; case UTZFMT_STYLE_ISO_EXTENDED_FIXED: formatOffsetISO8601Extended(offset, true, false, true, name, status); break; case UTZFMT_STYLE_ISO_EXTENDED_LOCAL_FIXED: formatOffsetISO8601Extended(offset, false, false, true, name, status); break; case 
UTZFMT_STYLE_ISO_BASIC_FULL: formatOffsetISO8601Basic(offset, true, false, false, name, status); break; case UTZFMT_STYLE_ISO_BASIC_LOCAL_FULL: formatOffsetISO8601Basic(offset, false, false, false, name, status); break; case UTZFMT_STYLE_ISO_EXTENDED_FULL: formatOffsetISO8601Extended(offset, true, false, false, name, status); break; case UTZFMT_STYLE_ISO_EXTENDED_LOCAL_FULL: formatOffsetISO8601Extended(offset, false, false, false, name, status); break; default: // UTZFMT_STYLE_ZONE_ID, UTZFMT_STYLE_ZONE_ID_SHORT, UTZFMT_STYLE_EXEMPLAR_LOCATION break; } if (timeType) { *timeType = (dstOffset != 0) ? UTZFMT_TIME_TYPE_DAYLIGHT : UTZFMT_TIME_TYPE_STANDARD; } } } return name; } UnicodeString& TimeZoneFormat::format(const Formattable& obj, UnicodeString& appendTo, FieldPosition& pos, UErrorCode& status) const { if (U_FAILURE(status)) { return appendTo; } UDate date = Calendar::getNow(); if (obj.getType() == Formattable::kObject) { const UObject* formatObj = obj.getObject(); const TimeZone* tz = dynamic_cast(formatObj); if (tz == nullptr) { const Calendar* cal = dynamic_cast(formatObj); if (cal != nullptr) { tz = &cal->getTimeZone(); date = cal->getTime(status); } } if (tz != nullptr) { int32_t rawOffset, dstOffset; tz->getOffset(date, false, rawOffset, dstOffset, status); char16_t buf[ZONE_NAME_U16_MAX]; UnicodeString result(buf, 0, UPRV_LENGTHOF(buf)); formatOffsetLocalizedGMT(rawOffset + dstOffset, result, status); if (U_SUCCESS(status)) { appendTo.append(result); if (pos.getField() == UDAT_TIMEZONE_FIELD) { pos.setBeginIndex(0); pos.setEndIndex(result.length()); } } } } return appendTo; } TimeZone* TimeZoneFormat::parse(UTimeZoneFormatStyle style, const UnicodeString& text, ParsePosition& pos, UTimeZoneFormatTimeType* timeType /*= nullptr*/) const { return parse(style, text, pos, getDefaultParseOptions(), timeType); } TimeZone* TimeZoneFormat::parse(UTimeZoneFormatStyle style, const UnicodeString& text, ParsePosition& pos, int32_t parseOptions, 
UTimeZoneFormatTimeType* timeType /* = nullptr */) const { if (timeType) { *timeType = UTZFMT_TIME_TYPE_UNKNOWN; } int32_t startIdx = pos.getIndex(); int32_t maxPos = text.length(); int32_t offset; // Styles using localized GMT format as fallback UBool fallbackLocalizedGMT = (style == UTZFMT_STYLE_SPECIFIC_LONG || style == UTZFMT_STYLE_GENERIC_LONG || style == UTZFMT_STYLE_GENERIC_LOCATION); UBool fallbackShortLocalizedGMT = (style == UTZFMT_STYLE_SPECIFIC_SHORT || style == UTZFMT_STYLE_GENERIC_SHORT); int32_t evaluated = 0; // bit flags representing already evaluated styles ParsePosition tmpPos(startIdx); int32_t parsedOffset = UNKNOWN_OFFSET; // stores successfully parsed offset for later use int32_t parsedPos = -1; // stores successfully parsed offset position for later use // Try localized GMT format first if necessary if (fallbackLocalizedGMT || fallbackShortLocalizedGMT) { UBool hasDigitOffset = false; offset = parseOffsetLocalizedGMT(text, tmpPos, fallbackShortLocalizedGMT, &hasDigitOffset); if (tmpPos.getErrorIndex() == -1) { // Even when the input text was successfully parsed as a localized GMT format text, // we may still need to evaluate the specified style if - // 1) GMT zero format was used, and // 2) The input text was not completely processed if (tmpPos.getIndex() == maxPos || hasDigitOffset) { pos.setIndex(tmpPos.getIndex()); return createTimeZoneForOffset(offset); } parsedOffset = offset; parsedPos = tmpPos.getIndex(); } // Note: For now, no distinction between long/short localized GMT format in the parser. // This might be changed in future. // evaluated |= (fallbackLocalizedGMT ? 
STYLE_PARSE_FLAGS[UTZFMT_STYLE_LOCALIZED_GMT] : STYLE_PARSE_FLAGS[UTZFMT_STYLE_LOCALIZED_GMT_SHORT]); evaluated |= STYLE_PARSE_FLAGS[UTZFMT_STYLE_LOCALIZED_GMT] | STYLE_PARSE_FLAGS[UTZFMT_STYLE_LOCALIZED_GMT_SHORT]; } UErrorCode status = U_ZERO_ERROR; char16_t tzIDBuf[32]; UnicodeString tzID(tzIDBuf, 0, UPRV_LENGTHOF(tzIDBuf)); UBool parseTZDBAbbrev = ((parseOptions & UTZFMT_PARSE_OPTION_TZ_DATABASE_ABBREVIATIONS) != 0); // Try the specified style switch (style) { case UTZFMT_STYLE_LOCALIZED_GMT: { tmpPos.setIndex(startIdx); tmpPos.setErrorIndex(-1); offset = parseOffsetLocalizedGMT(text, tmpPos); if (tmpPos.getErrorIndex() == -1) { pos.setIndex(tmpPos.getIndex()); return createTimeZoneForOffset(offset); } // Note: For now, no distinction between long/short localized GMT format in the parser. // This might be changed in future. evaluated |= STYLE_PARSE_FLAGS[UTZFMT_STYLE_LOCALIZED_GMT_SHORT]; break; } case UTZFMT_STYLE_LOCALIZED_GMT_SHORT: { tmpPos.setIndex(startIdx); tmpPos.setErrorIndex(-1); offset = parseOffsetShortLocalizedGMT(text, tmpPos); if (tmpPos.getErrorIndex() == -1) { pos.setIndex(tmpPos.getIndex()); return createTimeZoneForOffset(offset); } // Note: For now, no distinction between long/short localized GMT format in the parser. // This might be changed in future. 
evaluated |= STYLE_PARSE_FLAGS[UTZFMT_STYLE_LOCALIZED_GMT]; break; } case UTZFMT_STYLE_ISO_BASIC_SHORT: case UTZFMT_STYLE_ISO_BASIC_FIXED: case UTZFMT_STYLE_ISO_BASIC_FULL: case UTZFMT_STYLE_ISO_EXTENDED_FIXED: case UTZFMT_STYLE_ISO_EXTENDED_FULL: { tmpPos.setIndex(startIdx); tmpPos.setErrorIndex(-1); offset = parseOffsetISO8601(text, tmpPos); if (tmpPos.getErrorIndex() == -1) { pos.setIndex(tmpPos.getIndex()); return createTimeZoneForOffset(offset); } break; } case UTZFMT_STYLE_ISO_BASIC_LOCAL_SHORT: case UTZFMT_STYLE_ISO_BASIC_LOCAL_FIXED: case UTZFMT_STYLE_ISO_BASIC_LOCAL_FULL: case UTZFMT_STYLE_ISO_EXTENDED_LOCAL_FIXED: case UTZFMT_STYLE_ISO_EXTENDED_LOCAL_FULL: { tmpPos.setIndex(startIdx); tmpPos.setErrorIndex(-1); // Exclude the case of UTC Indicator "Z" here UBool hasDigitOffset = false; offset = parseOffsetISO8601(text, tmpPos, false, &hasDigitOffset); if (tmpPos.getErrorIndex() == -1 && hasDigitOffset) { pos.setIndex(tmpPos.getIndex()); return createTimeZoneForOffset(offset); } break; } case UTZFMT_STYLE_SPECIFIC_LONG: case UTZFMT_STYLE_SPECIFIC_SHORT: { // Specific styles int32_t nameTypes = 0; if (style == UTZFMT_STYLE_SPECIFIC_LONG) { nameTypes = (UTZNM_LONG_STANDARD | UTZNM_LONG_DAYLIGHT); } else { U_ASSERT(style == UTZFMT_STYLE_SPECIFIC_SHORT); nameTypes = (UTZNM_SHORT_STANDARD | UTZNM_SHORT_DAYLIGHT); } LocalPointer specificMatches(fTimeZoneNames->find(text, startIdx, nameTypes, status)); if (U_FAILURE(status)) { pos.setErrorIndex(startIdx); return nullptr; } if (!specificMatches.isNull()) { int32_t matchIdx = -1; int32_t matchPos = -1; for (int32_t i = 0; i < specificMatches->size(); i++) { matchPos = startIdx + specificMatches->getMatchLengthAt(i); if (matchPos > parsedPos) { matchIdx = i; parsedPos = matchPos; } } if (matchIdx >= 0) { if (timeType) { *timeType = getTimeType(specificMatches->getNameTypeAt(matchIdx)); } pos.setIndex(matchPos); getTimeZoneID(specificMatches.getAlias(), matchIdx, tzID); U_ASSERT(!tzID.isEmpty()); return 
TimeZone::createTimeZone(tzID); } } if (parseTZDBAbbrev && style == UTZFMT_STYLE_SPECIFIC_SHORT) { U_ASSERT((nameTypes & UTZNM_SHORT_STANDARD) != 0); U_ASSERT((nameTypes & UTZNM_SHORT_DAYLIGHT) != 0); const TZDBTimeZoneNames *tzdbTimeZoneNames = getTZDBTimeZoneNames(status); if (U_SUCCESS(status)) { LocalPointer tzdbNameMatches( tzdbTimeZoneNames->find(text, startIdx, nameTypes, status)); if (U_FAILURE(status)) { pos.setErrorIndex(startIdx); return nullptr; } if (!tzdbNameMatches.isNull()) { int32_t matchIdx = -1; int32_t matchPos = -1; for (int32_t i = 0; i < tzdbNameMatches->size(); i++) { matchPos = startIdx + tzdbNameMatches->getMatchLengthAt(i); if (matchPos > parsedPos) { matchIdx = i; parsedPos = matchPos; } } if (matchIdx >= 0) { if (timeType) { *timeType = getTimeType(tzdbNameMatches->getNameTypeAt(matchIdx)); } pos.setIndex(matchPos); getTimeZoneID(tzdbNameMatches.getAlias(), matchIdx, tzID); U_ASSERT(!tzID.isEmpty()); return TimeZone::createTimeZone(tzID); } } } } break; } case UTZFMT_STYLE_GENERIC_LONG: case UTZFMT_STYLE_GENERIC_SHORT: case UTZFMT_STYLE_GENERIC_LOCATION: { int32_t genericNameTypes = 0; switch (style) { case UTZFMT_STYLE_GENERIC_LOCATION: genericNameTypes = UTZGNM_LOCATION; break; case UTZFMT_STYLE_GENERIC_LONG: genericNameTypes = UTZGNM_LONG | UTZGNM_LOCATION; break; case UTZFMT_STYLE_GENERIC_SHORT: genericNameTypes = UTZGNM_SHORT | UTZGNM_LOCATION; break; default: UPRV_UNREACHABLE_EXIT; } int32_t len = 0; UTimeZoneFormatTimeType tt = UTZFMT_TIME_TYPE_UNKNOWN; const TimeZoneGenericNames *gnames = getTimeZoneGenericNames(status); if (U_SUCCESS(status)) { len = gnames->findBestMatch(text, startIdx, genericNameTypes, tzID, tt, status); } if (U_FAILURE(status)) { pos.setErrorIndex(startIdx); return nullptr; } if (len > 0) { // Found a match if (timeType) { *timeType = tt; } pos.setIndex(startIdx + len); U_ASSERT(!tzID.isEmpty()); return TimeZone::createTimeZone(tzID); } break; } case UTZFMT_STYLE_ZONE_ID: { tmpPos.setIndex(startIdx); 
tmpPos.setErrorIndex(-1); parseZoneID(text, tmpPos, tzID); if (tmpPos.getErrorIndex() == -1) { pos.setIndex(tmpPos.getIndex()); return TimeZone::createTimeZone(tzID); } break; } case UTZFMT_STYLE_ZONE_ID_SHORT: { tmpPos.setIndex(startIdx); tmpPos.setErrorIndex(-1); parseShortZoneID(text, tmpPos, tzID); if (tmpPos.getErrorIndex() == -1) { pos.setIndex(tmpPos.getIndex()); return TimeZone::createTimeZone(tzID); } break; } case UTZFMT_STYLE_EXEMPLAR_LOCATION: { tmpPos.setIndex(startIdx); tmpPos.setErrorIndex(-1); parseExemplarLocation(text, tmpPos, tzID); if (tmpPos.getErrorIndex() == -1) { pos.setIndex(tmpPos.getIndex()); return TimeZone::createTimeZone(tzID); } break; } } evaluated |= STYLE_PARSE_FLAGS[style]; if (parsedPos > startIdx) { // When the specified style is one of SPECIFIC_XXX or GENERIC_XXX, we tried to parse the input // as localized GMT format earlier. If parsedOffset is positive, it means it was successfully // parsed as localized GMT format, but offset digits were not detected (more specifically, GMT // zero format). Then, it tried to find a match within the set of display names, but could not // find a match. At this point, we can safely assume the input text contains the localized // GMT format. U_ASSERT(parsedOffset != UNKNOWN_OFFSET); pos.setIndex(parsedPos); return createTimeZoneForOffset(parsedOffset); } // Failed to parse the input text as the time zone format in the specified style. // Check the longest match among other styles below. 
char16_t parsedIDBuf[32]; UnicodeString parsedID(parsedIDBuf, 0, UPRV_LENGTHOF(parsedIDBuf)); UTimeZoneFormatTimeType parsedTimeType = UTZFMT_TIME_TYPE_UNKNOWN; U_ASSERT(parsedPos < 0); U_ASSERT(parsedOffset == UNKNOWN_OFFSET); // ISO 8601 if (parsedPos < maxPos && ((evaluated & ISO_Z_STYLE_FLAG) == 0 || (evaluated & ISO_LOCAL_STYLE_FLAG) == 0)) { tmpPos.setIndex(startIdx); tmpPos.setErrorIndex(-1); UBool hasDigitOffset = false; offset = parseOffsetISO8601(text, tmpPos, false, &hasDigitOffset); if (tmpPos.getErrorIndex() == -1) { if (tmpPos.getIndex() == maxPos || hasDigitOffset) { pos.setIndex(tmpPos.getIndex()); return createTimeZoneForOffset(offset); } // Note: When ISO 8601 format contains offset digits, it should not // collide with other formats. However, ISO 8601 UTC format "Z" (single letter) // may collide with other names. In this case, we need to evaluate other names. if (parsedPos < tmpPos.getIndex()) { parsedOffset = offset; parsedID.setToBogus(); parsedTimeType = UTZFMT_TIME_TYPE_UNKNOWN; parsedPos = tmpPos.getIndex(); U_ASSERT(parsedPos == startIdx + 1); // only when "Z" is used } } } // Localized GMT format if (parsedPos < maxPos && (evaluated & STYLE_PARSE_FLAGS[UTZFMT_STYLE_LOCALIZED_GMT]) == 0) { tmpPos.setIndex(startIdx); tmpPos.setErrorIndex(-1); UBool hasDigitOffset = false; offset = parseOffsetLocalizedGMT(text, tmpPos, false, &hasDigitOffset); if (tmpPos.getErrorIndex() == -1) { if (tmpPos.getIndex() == maxPos || hasDigitOffset) { pos.setIndex(tmpPos.getIndex()); return createTimeZoneForOffset(offset); } // Evaluate other names - see the comment earlier in this method. 
if (parsedPos < tmpPos.getIndex()) { parsedOffset = offset; parsedID.setToBogus(); parsedTimeType = UTZFMT_TIME_TYPE_UNKNOWN; parsedPos = tmpPos.getIndex(); } } } if (parsedPos < maxPos && (evaluated & STYLE_PARSE_FLAGS[UTZFMT_STYLE_LOCALIZED_GMT_SHORT]) == 0) { tmpPos.setIndex(startIdx); tmpPos.setErrorIndex(-1); UBool hasDigitOffset = false; offset = parseOffsetLocalizedGMT(text, tmpPos, true, &hasDigitOffset); if (tmpPos.getErrorIndex() == -1) { if (tmpPos.getIndex() == maxPos || hasDigitOffset) { pos.setIndex(tmpPos.getIndex()); return createTimeZoneForOffset(offset); } // Evaluate other names - see the comment earlier in this method. if (parsedPos < tmpPos.getIndex()) { parsedOffset = offset; parsedID.setToBogus(); parsedTimeType = UTZFMT_TIME_TYPE_UNKNOWN; parsedPos = tmpPos.getIndex(); } } } // When ParseOption.ALL_STYLES is available, we also try to look all possible display names and IDs. // For example, when style is GENERIC_LONG, "EST" (SPECIFIC_SHORT) is never // used for America/New_York. With parseAllStyles true, this code parses "EST" // as America/New_York. // Note: Adding all possible names into the trie used by the implementation is quite heavy operation, // which we want to avoid normally (note that we cache the trie, so this is applicable to the // first time only as long as the cache does not expire). 
if (parseOptions & UTZFMT_PARSE_OPTION_ALL_STYLES) { // Try all specific names and exemplar location names if (parsedPos < maxPos) { LocalPointer specificMatches(fTimeZoneNames->find(text, startIdx, ALL_SIMPLE_NAME_TYPES, status)); if (U_FAILURE(status)) { pos.setErrorIndex(startIdx); return nullptr; } int32_t specificMatchIdx = -1; int32_t matchPos = -1; if (!specificMatches.isNull()) { for (int32_t i = 0; i < specificMatches->size(); i++) { if (startIdx + specificMatches->getMatchLengthAt(i) > matchPos) { specificMatchIdx = i; matchPos = startIdx + specificMatches->getMatchLengthAt(i); } } } if (parsedPos < matchPos) { U_ASSERT(specificMatchIdx >= 0); parsedPos = matchPos; getTimeZoneID(specificMatches.getAlias(), specificMatchIdx, parsedID); parsedTimeType = getTimeType(specificMatches->getNameTypeAt(specificMatchIdx)); parsedOffset = UNKNOWN_OFFSET; } } if (parseTZDBAbbrev && parsedPos < maxPos && (evaluated & STYLE_PARSE_FLAGS[UTZFMT_STYLE_SPECIFIC_SHORT]) == 0) { const TZDBTimeZoneNames *tzdbTimeZoneNames = getTZDBTimeZoneNames(status); if (U_SUCCESS(status)) { LocalPointer tzdbNameMatches( tzdbTimeZoneNames->find(text, startIdx, ALL_SIMPLE_NAME_TYPES, status)); if (U_FAILURE(status)) { pos.setErrorIndex(startIdx); return nullptr; } int32_t tzdbNameMatchIdx = -1; int32_t matchPos = -1; if (!tzdbNameMatches.isNull()) { for (int32_t i = 0; i < tzdbNameMatches->size(); i++) { if (startIdx + tzdbNameMatches->getMatchLengthAt(i) > matchPos) { tzdbNameMatchIdx = i; matchPos = startIdx + tzdbNameMatches->getMatchLengthAt(i); } } } if (parsedPos < matchPos) { U_ASSERT(tzdbNameMatchIdx >= 0); parsedPos = matchPos; getTimeZoneID(tzdbNameMatches.getAlias(), tzdbNameMatchIdx, parsedID); parsedTimeType = getTimeType(tzdbNameMatches->getNameTypeAt(tzdbNameMatchIdx)); parsedOffset = UNKNOWN_OFFSET; } } } // Try generic names if (parsedPos < maxPos) { int32_t genMatchLen = -1; UTimeZoneFormatTimeType tt = UTZFMT_TIME_TYPE_UNKNOWN; const TimeZoneGenericNames *gnames = 
getTimeZoneGenericNames(status); if (U_SUCCESS(status)) { genMatchLen = gnames->findBestMatch(text, startIdx, ALL_GENERIC_NAME_TYPES, tzID, tt, status); } if (U_FAILURE(status)) { pos.setErrorIndex(startIdx); return nullptr; } if (genMatchLen > 0 && parsedPos < startIdx + genMatchLen) { parsedPos = startIdx + genMatchLen; parsedID.setTo(tzID); parsedTimeType = tt; parsedOffset = UNKNOWN_OFFSET; } } // Try time zone ID if (parsedPos < maxPos && (evaluated & STYLE_PARSE_FLAGS[UTZFMT_STYLE_ZONE_ID]) == 0) { tmpPos.setIndex(startIdx); tmpPos.setErrorIndex(-1); parseZoneID(text, tmpPos, tzID); if (tmpPos.getErrorIndex() == -1 && parsedPos < tmpPos.getIndex()) { parsedPos = tmpPos.getIndex(); parsedID.setTo(tzID); parsedTimeType = UTZFMT_TIME_TYPE_UNKNOWN; parsedOffset = UNKNOWN_OFFSET; } } // Try short time zone ID if (parsedPos < maxPos && (evaluated & STYLE_PARSE_FLAGS[UTZFMT_STYLE_ZONE_ID]) == 0) { tmpPos.setIndex(startIdx); tmpPos.setErrorIndex(-1); parseShortZoneID(text, tmpPos, tzID); if (tmpPos.getErrorIndex() == -1 && parsedPos < tmpPos.getIndex()) { parsedPos = tmpPos.getIndex(); parsedID.setTo(tzID); parsedTimeType = UTZFMT_TIME_TYPE_UNKNOWN; parsedOffset = UNKNOWN_OFFSET; } } } if (parsedPos > startIdx) { // Parsed successfully TimeZone* parsedTZ; if (parsedID.length() > 0) { parsedTZ = TimeZone::createTimeZone(parsedID); } else { U_ASSERT(parsedOffset != UNKNOWN_OFFSET); parsedTZ = createTimeZoneForOffset(parsedOffset); } if (timeType) { *timeType = parsedTimeType; } pos.setIndex(parsedPos); return parsedTZ; } pos.setErrorIndex(startIdx); return nullptr; } void TimeZoneFormat::parseObject(const UnicodeString& source, Formattable& result, ParsePosition& parse_pos) const { result.adoptObject(parse(UTZFMT_STYLE_GENERIC_LOCATION, source, parse_pos, UTZFMT_PARSE_OPTION_ALL_STYLES)); } // ------------------------------------------------------------------ // Private zone name format/parse implementation UnicodeString& TimeZoneFormat::formatGeneric(const TimeZone& 
tz, int32_t genType, UDate date, UnicodeString& name) const { UErrorCode status = U_ZERO_ERROR; const TimeZoneGenericNames* gnames = getTimeZoneGenericNames(status); if (U_FAILURE(status)) { name.setToBogus(); return name; } if (genType == UTZGNM_LOCATION) { const char16_t* canonicalID = ZoneMeta::getCanonicalCLDRID(tz); if (canonicalID == nullptr) { name.setToBogus(); return name; } return gnames->getGenericLocationName(UnicodeString(true, canonicalID, -1), name); } return gnames->getDisplayName(tz, (UTimeZoneGenericNameType)genType, date, name); } UnicodeString& TimeZoneFormat::formatSpecific(const TimeZone& tz, UTimeZoneNameType stdType, UTimeZoneNameType dstType, UDate date, UnicodeString& name, UTimeZoneFormatTimeType *timeType) const { if (fTimeZoneNames == nullptr) { name.setToBogus(); return name; } UErrorCode status = U_ZERO_ERROR; UBool isDaylight = tz.inDaylightTime(date, status); const char16_t* canonicalID = ZoneMeta::getCanonicalCLDRID(tz); if (U_FAILURE(status) || canonicalID == nullptr) { name.setToBogus(); return name; } if (isDaylight) { fTimeZoneNames->getDisplayName(UnicodeString(true, canonicalID, -1), dstType, date, name); } else { fTimeZoneNames->getDisplayName(UnicodeString(true, canonicalID, -1), stdType, date, name); } if (timeType && !name.isEmpty()) { *timeType = isDaylight ? 
UTZFMT_TIME_TYPE_DAYLIGHT : UTZFMT_TIME_TYPE_STANDARD; } return name; } const TimeZoneGenericNames* TimeZoneFormat::getTimeZoneGenericNames(UErrorCode& status) const { if (U_FAILURE(status)) { return nullptr; } umtx_lock(&gLock); if (fTimeZoneGenericNames == nullptr) { TimeZoneFormat *nonConstThis = const_cast(this); nonConstThis->fTimeZoneGenericNames = TimeZoneGenericNames::createInstance(fLocale, status); } umtx_unlock(&gLock); return fTimeZoneGenericNames; } const TZDBTimeZoneNames* TimeZoneFormat::getTZDBTimeZoneNames(UErrorCode& status) const { if (U_FAILURE(status)) { return nullptr; } umtx_lock(&gLock); if (fTZDBTimeZoneNames == nullptr) { TZDBTimeZoneNames *tzdbNames = new TZDBTimeZoneNames(fLocale); if (tzdbNames == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; } else { TimeZoneFormat *nonConstThis = const_cast(this); nonConstThis->fTZDBTimeZoneNames = tzdbNames; } } umtx_unlock(&gLock); return fTZDBTimeZoneNames; } UnicodeString& TimeZoneFormat::formatExemplarLocation(const TimeZone& tz, UnicodeString& name) const { char16_t locationBuf[ZONE_NAME_U16_MAX]; UnicodeString location(locationBuf, 0, UPRV_LENGTHOF(locationBuf)); const char16_t* canonicalID = ZoneMeta::getCanonicalCLDRID(tz); if (canonicalID) { fTimeZoneNames->getExemplarLocationName(UnicodeString(true, canonicalID, -1), location); } if (location.length() > 0) { name.setTo(location); } else { // Use "unknown" location fTimeZoneNames->getExemplarLocationName(UnicodeString(true, UNKNOWN_ZONE_ID, -1), location); if (location.length() > 0) { name.setTo(location); } else { // last resort name.setTo(UNKNOWN_LOCATION, -1); } } return name; } // ------------------------------------------------------------------ // Zone offset format and parse UnicodeString& TimeZoneFormat::formatOffsetISO8601Basic(int32_t offset, UBool useUtcIndicator, UBool isShort, UBool ignoreSeconds, UnicodeString& result, UErrorCode& status) const { return formatOffsetISO8601(offset, true, useUtcIndicator, isShort, ignoreSeconds, 
result, status); } UnicodeString& TimeZoneFormat::formatOffsetISO8601Extended(int32_t offset, UBool useUtcIndicator, UBool isShort, UBool ignoreSeconds, UnicodeString& result, UErrorCode& status) const { return formatOffsetISO8601(offset, false, useUtcIndicator, isShort, ignoreSeconds, result, status); } UnicodeString& TimeZoneFormat::formatOffsetLocalizedGMT(int32_t offset, UnicodeString& result, UErrorCode& status) const { return formatOffsetLocalizedGMT(offset, false, result, status); } UnicodeString& TimeZoneFormat::formatOffsetShortLocalizedGMT(int32_t offset, UnicodeString& result, UErrorCode& status) const { return formatOffsetLocalizedGMT(offset, true, result, status); } int32_t TimeZoneFormat::parseOffsetISO8601(const UnicodeString& text, ParsePosition& pos) const { return parseOffsetISO8601(text, pos, false); } int32_t TimeZoneFormat::parseOffsetLocalizedGMT(const UnicodeString& text, ParsePosition& pos) const { return parseOffsetLocalizedGMT(text, pos, false, nullptr); } int32_t TimeZoneFormat::parseOffsetShortLocalizedGMT(const UnicodeString& text, ParsePosition& pos) const { return parseOffsetLocalizedGMT(text, pos, true, nullptr); } // ------------------------------------------------------------------ // Private zone offset format/parse implementation UnicodeString& TimeZoneFormat::formatOffsetISO8601(int32_t offset, UBool isBasic, UBool useUtcIndicator, UBool isShort, UBool ignoreSeconds, UnicodeString& result, UErrorCode& status) const { if (U_FAILURE(status)) { result.setToBogus(); return result; } int32_t absOffset = offset < 0 ? -offset : offset; if (useUtcIndicator && (absOffset < MILLIS_PER_SECOND || (ignoreSeconds && absOffset < MILLIS_PER_MINUTE))) { result.setTo(ISO8601_UTC); return result; } OffsetFields minFields = isShort ? FIELDS_H : FIELDS_HM; OffsetFields maxFields = ignoreSeconds ? FIELDS_HM : FIELDS_HMS; char16_t sep = isBasic ? 0 : ISO8601_SEP; // Note: FIELDS_HMS as maxFields is a CLDR/ICU extension. 
ISO 8601 specification does // not support seconds field. if (absOffset >= MAX_OFFSET) { result.setToBogus(); status = U_ILLEGAL_ARGUMENT_ERROR; return result; } int fields[3]; fields[0] = absOffset / MILLIS_PER_HOUR; absOffset = absOffset % MILLIS_PER_HOUR; fields[1] = absOffset / MILLIS_PER_MINUTE; absOffset = absOffset % MILLIS_PER_MINUTE; fields[2] = absOffset / MILLIS_PER_SECOND; U_ASSERT(fields[0] >= 0 && fields[0] <= MAX_OFFSET_HOUR); U_ASSERT(fields[1] >= 0 && fields[1] <= MAX_OFFSET_MINUTE); U_ASSERT(fields[2] >= 0 && fields[2] <= MAX_OFFSET_SECOND); int32_t lastIdx = maxFields; while (lastIdx > minFields) { if (fields[lastIdx] != 0) { break; } lastIdx--; } char16_t sign = PLUS; if (offset < 0) { // if all output fields are 0s, do not use negative sign for (int32_t idx = 0; idx <= lastIdx; idx++) { if (fields[idx] != 0) { sign = MINUS; break; } } } result.setTo(sign); for (int32_t idx = 0; idx <= lastIdx; idx++) { if (sep && idx != 0) { result.append(sep); } result.append((char16_t)(0x0030 + fields[idx]/10)); result.append((char16_t)(0x0030 + fields[idx]%10)); } return result; } UnicodeString& TimeZoneFormat::formatOffsetLocalizedGMT(int32_t offset, UBool isShort, UnicodeString& result, UErrorCode& status) const { if (U_FAILURE(status)) { result.setToBogus(); return result; } if (offset <= -MAX_OFFSET || offset >= MAX_OFFSET) { result.setToBogus(); status = U_ILLEGAL_ARGUMENT_ERROR; return result; } if (offset == 0) { result.setTo(fGMTZeroFormat); return result; } UBool positive = true; if (offset < 0) { offset = -offset; positive = false; } int32_t offsetH = offset / MILLIS_PER_HOUR; offset = offset % MILLIS_PER_HOUR; int32_t offsetM = offset / MILLIS_PER_MINUTE; offset = offset % MILLIS_PER_MINUTE; int32_t offsetS = offset / MILLIS_PER_SECOND; U_ASSERT(offsetH <= MAX_OFFSET_HOUR && offsetM <= MAX_OFFSET_MINUTE && offsetS <= MAX_OFFSET_SECOND); const UVector* offsetPatternItems = nullptr; if (positive) { if (offsetS != 0) { offsetPatternItems = 
fGMTOffsetPatternItems[UTZFMT_PAT_POSITIVE_HMS]; } else if (offsetM != 0 || !isShort) { offsetPatternItems = fGMTOffsetPatternItems[UTZFMT_PAT_POSITIVE_HM]; } else { offsetPatternItems = fGMTOffsetPatternItems[UTZFMT_PAT_POSITIVE_H]; } } else { if (offsetS != 0) { offsetPatternItems = fGMTOffsetPatternItems[UTZFMT_PAT_NEGATIVE_HMS]; } else if (offsetM != 0 || !isShort) { offsetPatternItems = fGMTOffsetPatternItems[UTZFMT_PAT_NEGATIVE_HM]; } else { offsetPatternItems = fGMTOffsetPatternItems[UTZFMT_PAT_NEGATIVE_H]; } } U_ASSERT(offsetPatternItems != nullptr); // Building the GMT format string result.setTo(fGMTPatternPrefix); for (int32_t i = 0; i < offsetPatternItems->size(); i++) { const GMTOffsetField* item = (GMTOffsetField*)offsetPatternItems->elementAt(i); GMTOffsetField::FieldType type = item->getType(); switch (type) { case GMTOffsetField::TEXT: result.append(item->getPatternText(), -1); break; case GMTOffsetField::HOUR: appendOffsetDigits(result, offsetH, (isShort ? 1 : 2)); break; case GMTOffsetField::MINUTE: appendOffsetDigits(result, offsetM, 2); break; case GMTOffsetField::SECOND: appendOffsetDigits(result, offsetS, 2); break; } } result.append(fGMTPatternSuffix); return result; } int32_t TimeZoneFormat::parseOffsetISO8601(const UnicodeString& text, ParsePosition& pos, UBool extendedOnly, UBool* hasDigitOffset /* = nullptr */) const { if (hasDigitOffset) { *hasDigitOffset = false; } int32_t start = pos.getIndex(); if (start >= text.length()) { pos.setErrorIndex(start); return 0; } char16_t firstChar = text.charAt(start); if (firstChar == ISO8601_UTC || firstChar == (char16_t)(ISO8601_UTC + 0x20)) { // "Z" (or "z") - indicates UTC pos.setIndex(start + 1); return 0; } int32_t sign = 1; if (firstChar == PLUS) { sign = 1; } else if (firstChar == MINUS) { sign = -1; } else { // Not an ISO 8601 offset string pos.setErrorIndex(start); return 0; } ParsePosition posOffset(start + 1); int32_t offset = parseAsciiOffsetFields(text, posOffset, ISO8601_SEP, FIELDS_H, 
FIELDS_HMS); if (posOffset.getErrorIndex() == -1 && !extendedOnly && (posOffset.getIndex() - start <= 3)) { // If the text is successfully parsed as extended format with the options above, it can be also parsed // as basic format. For example, "0230" can be parsed as offset 2:00 (only first digits are valid for // extended format), but it can be parsed as offset 2:30 with basic format. We use longer result. ParsePosition posBasic(start + 1); int32_t tmpOffset = parseAbuttingAsciiOffsetFields(text, posBasic, FIELDS_H, FIELDS_HMS, false); if (posBasic.getErrorIndex() == -1 && posBasic.getIndex() > posOffset.getIndex()) { offset = tmpOffset; posOffset.setIndex(posBasic.getIndex()); } } if (posOffset.getErrorIndex() != -1) { pos.setErrorIndex(start); return 0; } pos.setIndex(posOffset.getIndex()); if (hasDigitOffset) { *hasDigitOffset = true; } return sign * offset; } int32_t TimeZoneFormat::parseOffsetLocalizedGMT(const UnicodeString& text, ParsePosition& pos, UBool isShort, UBool* hasDigitOffset) const { int32_t start = pos.getIndex(); int32_t offset = 0; int32_t parsedLength = 0; if (hasDigitOffset) { *hasDigitOffset = false; } offset = parseOffsetLocalizedGMTPattern(text, start, isShort, parsedLength); // For now, parseOffsetLocalizedGMTPattern handles both long and short // formats, no matter isShort is true or false. This might be changed in future // when strict parsing is necessary, or different set of patterns are used for // short/long formats. 
#if 0 if (parsedLength == 0) { offset = parseOffsetLocalizedGMTPattern(text, start, !isShort, parsedLength); } #endif if (parsedLength > 0) { if (hasDigitOffset) { *hasDigitOffset = true; } pos.setIndex(start + parsedLength); return offset; } // Try the default patterns offset = parseOffsetDefaultLocalizedGMT(text, start, parsedLength); if (parsedLength > 0) { if (hasDigitOffset) { *hasDigitOffset = true; } pos.setIndex(start + parsedLength); return offset; } // Check if this is a GMT zero format if (text.caseCompare(start, fGMTZeroFormat.length(), fGMTZeroFormat, 0) == 0) { pos.setIndex(start + fGMTZeroFormat.length()); return 0; } // Check if this is a default GMT zero format for (int32_t i = 0; ALT_GMT_STRINGS[i][0] != 0; i++) { const char16_t* defGMTZero = ALT_GMT_STRINGS[i]; int32_t defGMTZeroLen = u_strlen(defGMTZero); if (text.caseCompare(start, defGMTZeroLen, defGMTZero, 0) == 0) { pos.setIndex(start + defGMTZeroLen); return 0; } } // Nothing matched pos.setErrorIndex(start); return 0; } int32_t TimeZoneFormat::parseOffsetLocalizedGMTPattern(const UnicodeString& text, int32_t start, UBool /*isShort*/, int32_t& parsedLen) const { int32_t idx = start; int32_t offset = 0; UBool parsed = false; do { // Prefix part int32_t len = fGMTPatternPrefix.length(); if (len > 0 && text.caseCompare(idx, len, fGMTPatternPrefix, 0) != 0) { // prefix match failed break; } idx += len; // Offset part offset = parseOffsetFields(text, idx, false, len); if (len == 0) { // offset field match failed break; } idx += len; len = fGMTPatternSuffix.length(); if (len > 0 && text.caseCompare(idx, len, fGMTPatternSuffix, 0) != 0) { // no suffix match break; } idx += len; parsed = true; } while (false); parsedLen = parsed ? 
idx - start : 0; return offset; } int32_t TimeZoneFormat::parseOffsetFields(const UnicodeString& text, int32_t start, UBool /*isShort*/, int32_t& parsedLen) const { int32_t outLen = 0; int32_t offset = 0; int32_t sign = 1; parsedLen = 0; int32_t offsetH, offsetM, offsetS; offsetH = offsetM = offsetS = 0; for (int32_t patidx = 0; PARSE_GMT_OFFSET_TYPES[patidx] >= 0; patidx++) { int32_t gmtPatType = PARSE_GMT_OFFSET_TYPES[patidx]; UVector* items = fGMTOffsetPatternItems[gmtPatType]; U_ASSERT(items != nullptr); outLen = parseOffsetFieldsWithPattern(text, start, items, false, offsetH, offsetM, offsetS); if (outLen > 0) { sign = (gmtPatType == UTZFMT_PAT_POSITIVE_H || gmtPatType == UTZFMT_PAT_POSITIVE_HM || gmtPatType == UTZFMT_PAT_POSITIVE_HMS) ? 1 : -1; break; } } if (outLen > 0 && fAbuttingOffsetHoursAndMinutes) { // When hours field is sabutting minutes field, // the parse result above may not be appropriate. // For example, "01020" is parsed as 01:02: above, // but it should be parsed as 00:10:20. int32_t tmpLen = 0; int32_t tmpSign = 1; int32_t tmpH = 0; int32_t tmpM = 0; int32_t tmpS = 0; for (int32_t patidx = 0; PARSE_GMT_OFFSET_TYPES[patidx] >= 0; patidx++) { int32_t gmtPatType = PARSE_GMT_OFFSET_TYPES[patidx]; UVector* items = fGMTOffsetPatternItems[gmtPatType]; U_ASSERT(items != nullptr); // forcing parse to use single hour digit tmpLen = parseOffsetFieldsWithPattern(text, start, items, true, tmpH, tmpM, tmpS); if (tmpLen > 0) { tmpSign = (gmtPatType == UTZFMT_PAT_POSITIVE_H || gmtPatType == UTZFMT_PAT_POSITIVE_HM || gmtPatType == UTZFMT_PAT_POSITIVE_HMS) ? 
1 : -1; break; } } if (tmpLen > outLen) { // Better parse result with single hour digit outLen = tmpLen; sign = tmpSign; offsetH = tmpH; offsetM = tmpM; offsetS = tmpS; } } if (outLen > 0) { offset = ((((offsetH * 60) + offsetM) * 60) + offsetS) * 1000 * sign; parsedLen = outLen; } return offset; } int32_t TimeZoneFormat::parseOffsetFieldsWithPattern(const UnicodeString& text, int32_t start, UVector* patternItems, UBool forceSingleHourDigit, int32_t& hour, int32_t& min, int32_t& sec) const { UBool failed = false; int32_t offsetH, offsetM, offsetS; offsetH = offsetM = offsetS = 0; int32_t idx = start; for (int32_t i = 0; i < patternItems->size(); i++) { int32_t len = 0; const GMTOffsetField* field = (const GMTOffsetField*)patternItems->elementAt(i); GMTOffsetField::FieldType fieldType = field->getType(); if (fieldType == GMTOffsetField::TEXT) { const char16_t* patStr = field->getPatternText(); len = u_strlen(patStr); if (i == 0) { // When TimeZoneFormat parse() is called from SimpleDateFormat, // leading space characters might be truncated. If the first pattern text // starts with such character (e.g. Bidi control), then we need to // skip the leading space characters. if (idx < text.length() && !PatternProps::isWhiteSpace(text.char32At(idx))) { while (len > 0) { UChar32 ch; int32_t chLen; U16_GET(patStr, 0, 0, len, ch); if (PatternProps::isWhiteSpace(ch)) { chLen = U16_LENGTH(ch); len -= chLen; patStr += chLen; } else { break; } } } } if (text.caseCompare(idx, len, patStr, 0) != 0) { failed = true; break; } idx += len; } else { if (fieldType == GMTOffsetField::HOUR) { uint8_t maxDigits = forceSingleHourDigit ? 
1 : 2; offsetH = parseOffsetFieldWithLocalizedDigits(text, idx, 1, maxDigits, 0, MAX_OFFSET_HOUR, len); } else if (fieldType == GMTOffsetField::MINUTE) { offsetM = parseOffsetFieldWithLocalizedDigits(text, idx, 2, 2, 0, MAX_OFFSET_MINUTE, len); } else if (fieldType == GMTOffsetField::SECOND) { offsetS = parseOffsetFieldWithLocalizedDigits(text, idx, 2, 2, 0, MAX_OFFSET_SECOND, len); } if (len == 0) { failed = true; break; } idx += len; } } if (failed) { hour = min = sec = 0; return 0; } hour = offsetH; min = offsetM; sec = offsetS; return idx - start; } int32_t TimeZoneFormat::parseAbuttingOffsetFields(const UnicodeString& text, int32_t start, int32_t& parsedLen) const { int32_t digits[MAX_OFFSET_DIGITS]; int32_t parsed[MAX_OFFSET_DIGITS]; // accumulative offsets // Parse digits into int[] int32_t idx = start; int32_t len = 0; int32_t numDigits = 0; for (int32_t i = 0; i < MAX_OFFSET_DIGITS; i++) { digits[i] = parseSingleLocalizedDigit(text, idx, len); if (digits[i] < 0) { break; } idx += len; parsed[i] = idx - start; numDigits++; } if (numDigits == 0) { parsedLen = 0; return 0; } int32_t offset = 0; while (numDigits > 0) { int32_t hour = 0; int32_t min = 0; int32_t sec = 0; U_ASSERT(numDigits > 0 && numDigits <= MAX_OFFSET_DIGITS); switch (numDigits) { case 1: // H hour = digits[0]; break; case 2: // HH hour = digits[0] * 10 + digits[1]; break; case 3: // Hmm hour = digits[0]; min = digits[1] * 10 + digits[2]; break; case 4: // HHmm hour = digits[0] * 10 + digits[1]; min = digits[2] * 10 + digits[3]; break; case 5: // Hmmss hour = digits[0]; min = digits[1] * 10 + digits[2]; sec = digits[3] * 10 + digits[4]; break; case 6: // HHmmss hour = digits[0] * 10 + digits[1]; min = digits[2] * 10 + digits[3]; sec = digits[4] * 10 + digits[5]; break; } if (hour <= MAX_OFFSET_HOUR && min <= MAX_OFFSET_MINUTE && sec <= MAX_OFFSET_SECOND) { // found a valid combination offset = hour * MILLIS_PER_HOUR + min * MILLIS_PER_MINUTE + sec * MILLIS_PER_SECOND; parsedLen = 
parsed[numDigits - 1]; break; } numDigits--; } return offset; } int32_t TimeZoneFormat::parseOffsetDefaultLocalizedGMT(const UnicodeString& text, int start, int32_t& parsedLen) const { int32_t idx = start; int32_t offset = 0; int32_t parsed = 0; do { // check global default GMT alternatives int32_t gmtLen = 0; for (int32_t i = 0; ALT_GMT_STRINGS[i][0] != 0; i++) { const char16_t* gmt = ALT_GMT_STRINGS[i]; int32_t len = u_strlen(gmt); if (text.caseCompare(start, len, gmt, 0) == 0) { gmtLen = len; break; } } if (gmtLen == 0) { break; } idx += gmtLen; // offset needs a sign char and a digit at minimum if (idx + 1 >= text.length()) { break; } // parse sign int32_t sign = 1; char16_t c = text.charAt(idx); if (c == PLUS) { sign = 1; } else if (c == MINUS) { sign = -1; } else { break; } idx++; // offset part // try the default pattern with the separator first int32_t lenWithSep = 0; int32_t offsetWithSep = parseDefaultOffsetFields(text, idx, DEFAULT_GMT_OFFSET_SEP, lenWithSep); if (lenWithSep == text.length() - idx) { // maximum match offset = offsetWithSep * sign; idx += lenWithSep; } else { // try abutting field pattern int32_t lenAbut = 0; int32_t offsetAbut = parseAbuttingOffsetFields(text, idx, lenAbut); if (lenWithSep > lenAbut) { offset = offsetWithSep * sign; idx += lenWithSep; } else { offset = offsetAbut * sign; idx += lenAbut; } } parsed = idx - start; } while (false); parsedLen = parsed; return offset; } int32_t TimeZoneFormat::parseDefaultOffsetFields(const UnicodeString& text, int32_t start, char16_t separator, int32_t& parsedLen) const { int32_t max = text.length(); int32_t idx = start; int32_t len = 0; int32_t hour = 0, min = 0, sec = 0; parsedLen = 0; do { hour = parseOffsetFieldWithLocalizedDigits(text, idx, 1, 2, 0, MAX_OFFSET_HOUR, len); if (len == 0) { break; } idx += len; if (idx + 1 < max && text.charAt(idx) == separator) { min = parseOffsetFieldWithLocalizedDigits(text, idx + 1, 2, 2, 0, MAX_OFFSET_MINUTE, len); if (len == 0) { break; } idx += (1 + 
len); if (idx + 1 < max && text.charAt(idx) == separator) { sec = parseOffsetFieldWithLocalizedDigits(text, idx + 1, 2, 2, 0, MAX_OFFSET_SECOND, len); if (len == 0) { break; } idx += (1 + len); } } } while (false); if (idx == start) { return 0; } parsedLen = idx - start; return hour * MILLIS_PER_HOUR + min * MILLIS_PER_MINUTE + sec * MILLIS_PER_SECOND; } int32_t TimeZoneFormat::parseOffsetFieldWithLocalizedDigits(const UnicodeString& text, int32_t start, uint8_t minDigits, uint8_t maxDigits, uint16_t minVal, uint16_t maxVal, int32_t& parsedLen) const { parsedLen = 0; int32_t decVal = 0; int32_t numDigits = 0; int32_t idx = start; int32_t digitLen = 0; while (idx < text.length() && numDigits < maxDigits) { int32_t digit = parseSingleLocalizedDigit(text, idx, digitLen); if (digit < 0) { break; } int32_t tmpVal = decVal * 10 + digit; if (tmpVal > maxVal) { break; } decVal = tmpVal; numDigits++; idx += digitLen; } // Note: maxVal is checked in the while loop if (numDigits < minDigits || decVal < minVal) { decVal = -1; numDigits = 0; } else { parsedLen = idx - start; } return decVal; } int32_t TimeZoneFormat::parseSingleLocalizedDigit(const UnicodeString& text, int32_t start, int32_t& len) const { int32_t digit = -1; len = 0; if (start < text.length()) { UChar32 cp = text.char32At(start); // First, try digits configured for this instance for (int32_t i = 0; i < 10; i++) { if (cp == fGMTOffsetDigits[i]) { digit = i; break; } } // If failed, check if this is a Unicode digit if (digit < 0) { int32_t tmp = u_charDigitValue(cp); digit = (tmp >= 0 && tmp <= 9) ? 
tmp : -1; } if (digit >= 0) { int32_t next = text.moveIndex32(start, 1); len = next - start; } } return digit; } UnicodeString& TimeZoneFormat::formatOffsetWithAsciiDigits(int32_t offset, char16_t sep, OffsetFields minFields, OffsetFields maxFields, UnicodeString& result) { U_ASSERT(maxFields >= minFields); U_ASSERT(offset > -MAX_OFFSET && offset < MAX_OFFSET); char16_t sign = PLUS; if (offset < 0) { sign = MINUS; offset = -offset; } result.setTo(sign); int fields[3]; fields[0] = offset / MILLIS_PER_HOUR; offset = offset % MILLIS_PER_HOUR; fields[1] = offset / MILLIS_PER_MINUTE; offset = offset % MILLIS_PER_MINUTE; fields[2] = offset / MILLIS_PER_SECOND; U_ASSERT(fields[0] >= 0 && fields[0] <= MAX_OFFSET_HOUR); U_ASSERT(fields[1] >= 0 && fields[1] <= MAX_OFFSET_MINUTE); U_ASSERT(fields[2] >= 0 && fields[2] <= MAX_OFFSET_SECOND); int32_t lastIdx = maxFields; while (lastIdx > minFields) { if (fields[lastIdx] != 0) { break; } lastIdx--; } for (int32_t idx = 0; idx <= lastIdx; idx++) { if (sep && idx != 0) { result.append(sep); } result.append((char16_t)(0x0030 + fields[idx]/10)); result.append((char16_t)(0x0030 + fields[idx]%10)); } return result; } int32_t TimeZoneFormat::parseAbuttingAsciiOffsetFields(const UnicodeString& text, ParsePosition& pos, OffsetFields minFields, OffsetFields maxFields, UBool fixedHourWidth) { int32_t start = pos.getIndex(); int32_t minDigits = 2 * (minFields + 1) - (fixedHourWidth ? 0 : 1); int32_t maxDigits = 2 * (maxFields + 1); U_ASSERT(maxDigits <= MAX_OFFSET_DIGITS); int32_t digits[MAX_OFFSET_DIGITS] = {}; int32_t numDigits = 0; int32_t idx = start; while (numDigits < maxDigits && idx < text.length()) { char16_t uch = text.charAt(idx); int32_t digit = DIGIT_VAL(uch); if (digit < 0) { break; } digits[numDigits] = digit; numDigits++; idx++; } if (fixedHourWidth && (numDigits & 1)) { // Fixed digits, so the number of digits must be even number. Truncating. 
numDigits--; } if (numDigits < minDigits) { pos.setErrorIndex(start); return 0; } int32_t hour = 0, min = 0, sec = 0; UBool bParsed = false; while (numDigits >= minDigits) { switch (numDigits) { case 1: //H hour = digits[0]; break; case 2: //HH hour = digits[0] * 10 + digits[1]; break; case 3: //Hmm hour = digits[0]; min = digits[1] * 10 + digits[2]; break; case 4: //HHmm hour = digits[0] * 10 + digits[1]; min = digits[2] * 10 + digits[3]; break; case 5: //Hmmss hour = digits[0]; min = digits[1] * 10 + digits[2]; sec = digits[3] * 10 + digits[4]; break; case 6: //HHmmss hour = digits[0] * 10 + digits[1]; min = digits[2] * 10 + digits[3]; sec = digits[4] * 10 + digits[5]; break; } if (hour <= MAX_OFFSET_HOUR && min <= MAX_OFFSET_MINUTE && sec <= MAX_OFFSET_SECOND) { // Successfully parsed bParsed = true; break; } // Truncating numDigits -= (fixedHourWidth ? 2 : 1); hour = min = sec = 0; } if (!bParsed) { pos.setErrorIndex(start); return 0; } pos.setIndex(start + numDigits); return ((((hour * 60) + min) * 60) + sec) * 1000; } int32_t TimeZoneFormat::parseAsciiOffsetFields(const UnicodeString& text, ParsePosition& pos, char16_t sep, OffsetFields minFields, OffsetFields maxFields) { int32_t start = pos.getIndex(); int32_t fieldVal[] = {0, 0, 0}; int32_t fieldLen[] = {0, -1, -1}; for (int32_t idx = start, fieldIdx = 0; idx < text.length() && fieldIdx <= maxFields; idx++) { char16_t c = text.charAt(idx); if (c == sep) { if (fieldIdx == 0) { if (fieldLen[0] == 0) { // no hours field break; } // 1 digit hour, move to next field } else { if (fieldLen[fieldIdx] != -1) { // premature minute or seconds field break; } fieldLen[fieldIdx] = 0; } continue; } else if (fieldLen[fieldIdx] == -1) { // no separator after 2 digit field break; } int32_t digit = DIGIT_VAL(c); if (digit < 0) { // not a digit break; } fieldVal[fieldIdx] = fieldVal[fieldIdx] * 10 + digit; fieldLen[fieldIdx]++; if (fieldLen[fieldIdx] >= 2) { // parsed 2 digits, move to next field fieldIdx++; } } int32_t 
offset = 0; int32_t parsedLen = 0; int32_t parsedFields = -1; do { // hour if (fieldLen[0] == 0) { break; } if (fieldVal[0] > MAX_OFFSET_HOUR) { offset = (fieldVal[0] / 10) * MILLIS_PER_HOUR; parsedFields = FIELDS_H; parsedLen = 1; break; } offset = fieldVal[0] * MILLIS_PER_HOUR; parsedLen = fieldLen[0]; parsedFields = FIELDS_H; // minute if (fieldLen[1] != 2 || fieldVal[1] > MAX_OFFSET_MINUTE) { break; } offset += fieldVal[1] * MILLIS_PER_MINUTE; parsedLen += (1 + fieldLen[1]); parsedFields = FIELDS_HM; // second if (fieldLen[2] != 2 || fieldVal[2] > MAX_OFFSET_SECOND) { break; } offset += fieldVal[2] * MILLIS_PER_SECOND; parsedLen += (1 + fieldLen[2]); parsedFields = FIELDS_HMS; } while (false); if (parsedFields < minFields) { pos.setErrorIndex(start); return 0; } pos.setIndex(start + parsedLen); return offset; } void TimeZoneFormat::appendOffsetDigits(UnicodeString& buf, int32_t n, uint8_t minDigits) const { U_ASSERT(n >= 0 && n < 60); int32_t numDigits = n >= 10 ? 2 : 1; for (int32_t i = 0; i < minDigits - numDigits; i++) { buf.append(fGMTOffsetDigits[0]); } if (numDigits == 2) { buf.append(fGMTOffsetDigits[n / 10]); } buf.append(fGMTOffsetDigits[n % 10]); } // ------------------------------------------------------------------ // Private misc void TimeZoneFormat::initGMTPattern(const UnicodeString& gmtPattern, UErrorCode& status) { if (U_FAILURE(status)) { return; } // This implementation not perfect, but sufficient practically. 
int32_t idx = gmtPattern.indexOf(ARG0, ARG0_LEN, 0); if (idx < 0) { status = U_ILLEGAL_ARGUMENT_ERROR; return; } fGMTPattern.setTo(gmtPattern); unquote(gmtPattern.tempSubString(0, idx), fGMTPatternPrefix); unquote(gmtPattern.tempSubString(idx + ARG0_LEN), fGMTPatternSuffix); } UnicodeString& TimeZoneFormat::unquote(const UnicodeString& pattern, UnicodeString& result) { if (pattern.indexOf(SINGLEQUOTE) < 0) { result.setTo(pattern); return result; } result.remove(); UBool isPrevQuote = false; UBool inQuote = false; for (int32_t i = 0; i < pattern.length(); i++) { char16_t c = pattern.charAt(i); if (c == SINGLEQUOTE) { if (isPrevQuote) { result.append(c); isPrevQuote = false; } else { isPrevQuote = true; } inQuote = !inQuote; } else { isPrevQuote = false; result.append(c); } } return result; } UVector* TimeZoneFormat::parseOffsetPattern(const UnicodeString& pattern, OffsetFields required, UErrorCode& status) { if (U_FAILURE(status)) { return nullptr; } UVector* result = new UVector(deleteGMTOffsetField, nullptr, status); if (result == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; return nullptr; } int32_t checkBits = 0; UBool isPrevQuote = false; UBool inQuote = false; char16_t textBuf[32]; UnicodeString text(textBuf, 0, UPRV_LENGTHOF(textBuf)); GMTOffsetField::FieldType itemType = GMTOffsetField::TEXT; int32_t itemLength = 1; for (int32_t i = 0; i < pattern.length(); i++) { char16_t ch = pattern.charAt(i); if (ch == SINGLEQUOTE) { if (isPrevQuote) { text.append(SINGLEQUOTE); isPrevQuote = false; } else { isPrevQuote = true; if (itemType != GMTOffsetField::TEXT) { if (GMTOffsetField::isValid(itemType, itemLength)) { GMTOffsetField* fld = GMTOffsetField::createTimeField(itemType, static_cast(itemLength), status); result->adoptElement(fld, status); if (U_FAILURE(status)) { break; } } else { status = U_ILLEGAL_ARGUMENT_ERROR; break; } itemType = GMTOffsetField::TEXT; } } inQuote = !inQuote; } else { isPrevQuote = false; if (inQuote) { text.append(ch); } else { 
GMTOffsetField::FieldType tmpType = GMTOffsetField::getTypeByLetter(ch); if (tmpType != GMTOffsetField::TEXT) { // an offset time pattern character if (tmpType == itemType) { itemLength++; } else { if (itemType == GMTOffsetField::TEXT) { if (text.length() > 0) { GMTOffsetField* textfld = GMTOffsetField::createText(text, status); result->adoptElement(textfld, status); if (U_FAILURE(status)) { break; } text.remove(); } } else { if (GMTOffsetField::isValid(itemType, itemLength)) { GMTOffsetField* fld = GMTOffsetField::createTimeField(itemType, static_cast(itemLength), status); result->adoptElement(fld, status); if (U_FAILURE(status)) { break; } } else { status = U_ILLEGAL_ARGUMENT_ERROR; break; } } itemType = tmpType; itemLength = 1; checkBits |= tmpType; } } else { // a string literal if (itemType != GMTOffsetField::TEXT) { if (GMTOffsetField::isValid(itemType, itemLength)) { GMTOffsetField* fld = GMTOffsetField::createTimeField(itemType, static_cast(itemLength), status); result->adoptElement(fld, status); if (U_FAILURE(status)) { break; } } else { status = U_ILLEGAL_ARGUMENT_ERROR; break; } itemType = GMTOffsetField::TEXT; } text.append(ch); } } } } // handle last item if (U_SUCCESS(status)) { if (itemType == GMTOffsetField::TEXT) { if (text.length() > 0) { GMTOffsetField* tfld = GMTOffsetField::createText(text, status); result->adoptElement(tfld, status); } } else { if (GMTOffsetField::isValid(itemType, itemLength)) { GMTOffsetField* fld = GMTOffsetField::createTimeField(itemType, static_cast(itemLength), status); result->adoptElement(fld, status); } else { status = U_ILLEGAL_ARGUMENT_ERROR; } } // Check all required fields are set if (U_SUCCESS(status)) { int32_t reqBits = 0; switch (required) { case FIELDS_H: reqBits = GMTOffsetField::HOUR; break; case FIELDS_HM: reqBits = GMTOffsetField::HOUR | GMTOffsetField::MINUTE; break; case FIELDS_HMS: reqBits = GMTOffsetField::HOUR | GMTOffsetField::MINUTE | GMTOffsetField::SECOND; break; } if (checkBits == reqBits) { // 
all required fields are set, no extra fields return result; } } } // error delete result; return nullptr; } UnicodeString& TimeZoneFormat::expandOffsetPattern(const UnicodeString& offsetHM, UnicodeString& result, UErrorCode& status) { result.setToBogus(); if (U_FAILURE(status)) { return result; } U_ASSERT(u_strlen(DEFAULT_GMT_OFFSET_MINUTE_PATTERN) == 2); int32_t idx_mm = offsetHM.indexOf(DEFAULT_GMT_OFFSET_MINUTE_PATTERN, 2, 0); if (idx_mm < 0) { // Bad time zone hour pattern data status = U_ILLEGAL_ARGUMENT_ERROR; return result; } UnicodeString sep; int32_t idx_H = offsetHM.tempSubString(0, idx_mm).lastIndexOf((char16_t)0x0048 /* H */); if (idx_H >= 0) { sep = offsetHM.tempSubString(idx_H + 1, idx_mm - (idx_H + 1)); } result.setTo(offsetHM.tempSubString(0, idx_mm + 2)); result.append(sep); result.append(DEFAULT_GMT_OFFSET_SECOND_PATTERN, -1); result.append(offsetHM.tempSubString(idx_mm + 2)); return result; } UnicodeString& TimeZoneFormat::truncateOffsetPattern(const UnicodeString& offsetHM, UnicodeString& result, UErrorCode& status) { result.setToBogus(); if (U_FAILURE(status)) { return result; } U_ASSERT(u_strlen(DEFAULT_GMT_OFFSET_MINUTE_PATTERN) == 2); int32_t idx_mm = offsetHM.indexOf(DEFAULT_GMT_OFFSET_MINUTE_PATTERN, 2, 0); if (idx_mm < 0) { // Bad time zone hour pattern data status = U_ILLEGAL_ARGUMENT_ERROR; return result; } char16_t HH[] = {0x0048, 0x0048}; int32_t idx_HH = offsetHM.tempSubString(0, idx_mm).lastIndexOf(HH, 2, 0); if (idx_HH >= 0) { return result.setTo(offsetHM.tempSubString(0, idx_HH + 2)); } int32_t idx_H = offsetHM.tempSubString(0, idx_mm).lastIndexOf((char16_t)0x0048, 0); if (idx_H >= 0) { return result.setTo(offsetHM.tempSubString(0, idx_H + 1)); } // Bad time zone hour pattern data status = U_ILLEGAL_ARGUMENT_ERROR; return result; } void TimeZoneFormat::initGMTOffsetPatterns(UErrorCode& status) { for (int32_t type = 0; type < UTZFMT_PAT_COUNT; type++) { switch (type) { case UTZFMT_PAT_POSITIVE_H: case UTZFMT_PAT_NEGATIVE_H: 
fGMTOffsetPatternItems[type] = parseOffsetPattern(fGMTOffsetPatterns[type], FIELDS_H, status); break; case UTZFMT_PAT_POSITIVE_HM: case UTZFMT_PAT_NEGATIVE_HM: fGMTOffsetPatternItems[type] = parseOffsetPattern(fGMTOffsetPatterns[type], FIELDS_HM, status); break; case UTZFMT_PAT_POSITIVE_HMS: case UTZFMT_PAT_NEGATIVE_HMS: fGMTOffsetPatternItems[type] = parseOffsetPattern(fGMTOffsetPatterns[type], FIELDS_HMS, status); break; } } if (U_FAILURE(status)) { return; } checkAbuttingHoursAndMinutes(); } void TimeZoneFormat::checkAbuttingHoursAndMinutes() { fAbuttingOffsetHoursAndMinutes= false; for (int32_t type = 0; type < UTZFMT_PAT_COUNT; type++) { UBool afterH = false; UVector *items = fGMTOffsetPatternItems[type]; for (int32_t i = 0; i < items->size(); i++) { const GMTOffsetField* item = (GMTOffsetField*)items->elementAt(i); GMTOffsetField::FieldType fieldType = item->getType(); if (fieldType != GMTOffsetField::TEXT) { if (afterH) { fAbuttingOffsetHoursAndMinutes = true; break; } else if (fieldType == GMTOffsetField::HOUR) { afterH = true; } } else if (afterH) { break; } } if (fAbuttingOffsetHoursAndMinutes) { break; } } } UBool TimeZoneFormat::toCodePoints(const UnicodeString& str, UChar32* codeArray, int32_t size) { int32_t count = str.countChar32(); if (count != size) { return false; } for (int32_t idx = 0, start = 0; idx < size; idx++) { codeArray[idx] = str.char32At(start); start = str.moveIndex32(start, 1); } return true; } TimeZone* TimeZoneFormat::createTimeZoneForOffset(int32_t offset) const { if (offset == 0) { // when offset is 0, we should use "Etc/GMT" return TimeZone::createTimeZone(UnicodeString(true, TZID_GMT, -1)); } return ZoneMeta::createCustomTimeZone(offset); } UTimeZoneFormatTimeType TimeZoneFormat::getTimeType(UTimeZoneNameType nameType) { switch (nameType) { case UTZNM_LONG_STANDARD: case UTZNM_SHORT_STANDARD: return UTZFMT_TIME_TYPE_STANDARD; case UTZNM_LONG_DAYLIGHT: case UTZNM_SHORT_DAYLIGHT: return UTZFMT_TIME_TYPE_DAYLIGHT; default: return 
UTZFMT_TIME_TYPE_UNKNOWN; } } UnicodeString& TimeZoneFormat::getTimeZoneID(const TimeZoneNames::MatchInfoCollection* matches, int32_t idx, UnicodeString& tzID) const { if (!matches->getTimeZoneIDAt(idx, tzID)) { char16_t mzIDBuf[32]; UnicodeString mzID(mzIDBuf, 0, UPRV_LENGTHOF(mzIDBuf)); if (matches->getMetaZoneIDAt(idx, mzID)) { fTimeZoneNames->getReferenceZoneID(mzID, fTargetRegion, tzID); } } return tzID; } class ZoneIdMatchHandler : public TextTrieMapSearchResultHandler { public: ZoneIdMatchHandler(); virtual ~ZoneIdMatchHandler(); UBool handleMatch(int32_t matchLength, const CharacterNode *node, UErrorCode &status) override; const char16_t* getID(); int32_t getMatchLen(); private: int32_t fLen; const char16_t* fID; }; ZoneIdMatchHandler::ZoneIdMatchHandler() : fLen(0), fID(nullptr) { } ZoneIdMatchHandler::~ZoneIdMatchHandler() { } UBool ZoneIdMatchHandler::handleMatch(int32_t matchLength, const CharacterNode *node, UErrorCode &status) { if (U_FAILURE(status)) { return false; } if (node->hasValues()) { const char16_t* id = (const char16_t*)node->getValue(0); if (id != nullptr) { if (fLen < matchLength) { fID = id; fLen = matchLength; } } } return true; } const char16_t* ZoneIdMatchHandler::getID() { return fID; } int32_t ZoneIdMatchHandler::getMatchLen() { return fLen; } static void U_CALLCONV initZoneIdTrie(UErrorCode &status) { U_ASSERT(gZoneIdTrie == nullptr); ucln_i18n_registerCleanup(UCLN_I18N_TIMEZONEFORMAT, tzfmt_cleanup); gZoneIdTrie = new TextTrieMap(true, nullptr); // No deleter, because values are pooled by ZoneMeta if (gZoneIdTrie == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; return; } StringEnumeration *tzenum = TimeZone::createEnumeration(status); if (U_SUCCESS(status)) { const UnicodeString *id; while ((id = tzenum->snext(status)) != nullptr) { const char16_t* uid = ZoneMeta::findTimeZoneID(*id); if (uid) { gZoneIdTrie->put(uid, const_cast(uid), status); } } delete tzenum; } } UnicodeString& TimeZoneFormat::parseZoneID(const UnicodeString& 
text, ParsePosition& pos, UnicodeString& tzID) const { UErrorCode status = U_ZERO_ERROR; umtx_initOnce(gZoneIdTrieInitOnce, &initZoneIdTrie, status); int32_t start = pos.getIndex(); int32_t len = 0; tzID.setToBogus(); if (U_SUCCESS(status)) { LocalPointer handler(new ZoneIdMatchHandler()); gZoneIdTrie->search(text, start, handler.getAlias(), status); len = handler->getMatchLen(); if (len > 0) { tzID.setTo(handler->getID(), -1); } } if (len > 0) { pos.setIndex(start + len); } else { pos.setErrorIndex(start); } return tzID; } static void U_CALLCONV initShortZoneIdTrie(UErrorCode &status) { U_ASSERT(gShortZoneIdTrie == nullptr); ucln_i18n_registerCleanup(UCLN_I18N_TIMEZONEFORMAT, tzfmt_cleanup); StringEnumeration *tzenum = TimeZone::createTimeZoneIDEnumeration(UCAL_ZONE_TYPE_CANONICAL, nullptr, nullptr, status); if (U_SUCCESS(status)) { gShortZoneIdTrie = new TextTrieMap(true, nullptr); // No deleter, because values are pooled by ZoneMeta if (gShortZoneIdTrie == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; } else { const UnicodeString *id; while ((id = tzenum->snext(status)) != nullptr) { const char16_t* uID = ZoneMeta::findTimeZoneID(*id); const char16_t* shortID = ZoneMeta::getShortID(*id); if (shortID && uID) { gShortZoneIdTrie->put(shortID, const_cast(uID), status); } } } } delete tzenum; } UnicodeString& TimeZoneFormat::parseShortZoneID(const UnicodeString& text, ParsePosition& pos, UnicodeString& tzID) const { UErrorCode status = U_ZERO_ERROR; umtx_initOnce(gShortZoneIdTrieInitOnce, &initShortZoneIdTrie, status); int32_t start = pos.getIndex(); int32_t len = 0; tzID.setToBogus(); if (U_SUCCESS(status)) { LocalPointer handler(new ZoneIdMatchHandler()); gShortZoneIdTrie->search(text, start, handler.getAlias(), status); len = handler->getMatchLen(); if (len > 0) { tzID.setTo(handler->getID(), -1); } } if (len > 0) { pos.setIndex(start + len); } else { pos.setErrorIndex(start); } return tzID; } UnicodeString& TimeZoneFormat::parseExemplarLocation(const 
UnicodeString& text, ParsePosition& pos, UnicodeString& tzID) const { int32_t startIdx = pos.getIndex(); int32_t parsedPos = -1; tzID.setToBogus(); UErrorCode status = U_ZERO_ERROR; LocalPointer exemplarMatches(fTimeZoneNames->find(text, startIdx, UTZNM_EXEMPLAR_LOCATION, status)); if (U_FAILURE(status)) { pos.setErrorIndex(startIdx); return tzID; } int32_t matchIdx = -1; if (!exemplarMatches.isNull()) { for (int32_t i = 0; i < exemplarMatches->size(); i++) { if (startIdx + exemplarMatches->getMatchLengthAt(i) > parsedPos) { matchIdx = i; parsedPos = startIdx + exemplarMatches->getMatchLengthAt(i); } } if (parsedPos > 0) { pos.setIndex(parsedPos); getTimeZoneID(exemplarMatches.getAlias(), matchIdx, tzID); } } if (tzID.length() == 0) { pos.setErrorIndex(startIdx); } return tzID; } U_NAMESPACE_END #endif stringi/src/icu74/i18n/collationruleparser.h0000644000176200001440000001441114700200761020472 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ******************************************************************************* * Copyright (C) 2013-2014, International Business Machines * Corporation and others. All Rights Reserved. ******************************************************************************* * collationruleparser.h * * created on: 2013apr10 * created by: Markus W. Scherer */ #ifndef __COLLATIONRULEPARSER_H__ #define __COLLATIONRULEPARSER_H__ #include "unicode/utypes.h" #if !UCONFIG_NO_COLLATION #include "unicode/ucol.h" #include "unicode/uniset.h" #include "unicode/unistr.h" struct UParseError; U_NAMESPACE_BEGIN struct CollationData; struct CollationTailoring; class Locale; class Normalizer2; struct CollationSettings; class U_I18N_API CollationRuleParser : public UMemory { public: /** Special reset positions. 
*/ enum Position { FIRST_TERTIARY_IGNORABLE, LAST_TERTIARY_IGNORABLE, FIRST_SECONDARY_IGNORABLE, LAST_SECONDARY_IGNORABLE, FIRST_PRIMARY_IGNORABLE, LAST_PRIMARY_IGNORABLE, FIRST_VARIABLE, LAST_VARIABLE, FIRST_REGULAR, LAST_REGULAR, FIRST_IMPLICIT, LAST_IMPLICIT, FIRST_TRAILING, LAST_TRAILING }; /** * First character of contractions that encode special reset positions. * U+FFFE cannot be tailored via rule syntax. * * The second contraction character is POS_BASE + Position. */ static const char16_t POS_LEAD = 0xfffe; /** * Base for the second character of contractions that encode special reset positions. * Braille characters U+28xx are printable and normalization-inert. * @see POS_LEAD */ static const char16_t POS_BASE = 0x2800; class U_I18N_API Sink : public UObject { public: virtual ~Sink(); /** * Adds a reset. * strength=UCOL_IDENTICAL for &str. * strength=UCOL_PRIMARY/UCOL_SECONDARY/UCOL_TERTIARY for &[before n]str where n=1/2/3. */ virtual void addReset(int32_t strength, const UnicodeString &str, const char *&errorReason, UErrorCode &errorCode) = 0; /** * Adds a relation with strength and prefix | str / extension. */ virtual void addRelation(int32_t strength, const UnicodeString &prefix, const UnicodeString &str, const UnicodeString &extension, const char *&errorReason, UErrorCode &errorCode) = 0; virtual void suppressContractions(const UnicodeSet &set, const char *&errorReason, UErrorCode &errorCode); virtual void optimize(const UnicodeSet &set, const char *&errorReason, UErrorCode &errorCode); }; class U_I18N_API Importer : public UObject { public: virtual ~Importer(); virtual void getRules( const char *localeID, const char *collationType, UnicodeString &rules, const char *&errorReason, UErrorCode &errorCode) = 0; }; /** * Constructor. * The Sink must be set before parsing. * The Importer can be set, otherwise [import locale] syntax is not supported. 
*/ CollationRuleParser(const CollationData *base, UErrorCode &errorCode); ~CollationRuleParser(); /** * Sets the pointer to a Sink object. * The pointer is aliased: Pointer copy without cloning or taking ownership. */ void setSink(Sink *sinkAlias) { sink = sinkAlias; } /** * Sets the pointer to an Importer object. * The pointer is aliased: Pointer copy without cloning or taking ownership. */ void setImporter(Importer *importerAlias) { importer = importerAlias; } void parse(const UnicodeString &ruleString, CollationSettings &outSettings, UParseError *outParseError, UErrorCode &errorCode); const char *getErrorReason() const { return errorReason; } /** * Gets a script or reorder code from its string representation. * @return the script/reorder code, or * -1 if not recognized */ static int32_t getReorderCode(const char *word); private: /** UCOL_PRIMARY=0 .. UCOL_IDENTICAL=15 */ static const int32_t STRENGTH_MASK = 0xf; static const int32_t STARRED_FLAG = 0x10; static const int32_t OFFSET_SHIFT = 8; void parse(const UnicodeString &ruleString, UErrorCode &errorCode); void parseRuleChain(UErrorCode &errorCode); int32_t parseResetAndPosition(UErrorCode &errorCode); int32_t parseRelationOperator(UErrorCode &errorCode); void parseRelationStrings(int32_t strength, int32_t i, UErrorCode &errorCode); void parseStarredCharacters(int32_t strength, int32_t i, UErrorCode &errorCode); int32_t parseTailoringString(int32_t i, UnicodeString &raw, UErrorCode &errorCode); int32_t parseString(int32_t i, UnicodeString &raw, UErrorCode &errorCode); /** * Sets str to a contraction of U+FFFE and (U+2800 + Position). 
* @return rule index after the special reset position */ int32_t parseSpecialPosition(int32_t i, UnicodeString &str, UErrorCode &errorCode); void parseSetting(UErrorCode &errorCode); void parseReordering(const UnicodeString &raw, UErrorCode &errorCode); static UColAttributeValue getOnOffValue(const UnicodeString &s); int32_t parseUnicodeSet(int32_t i, UnicodeSet &set, UErrorCode &errorCode); int32_t readWords(int32_t i, UnicodeString &raw) const; int32_t skipComment(int32_t i) const; void setParseError(const char *reason, UErrorCode &errorCode); void setErrorContext(); /** * ASCII [:P:] and [:S:]: * [\u0021-\u002F \u003A-\u0040 \u005B-\u0060 \u007B-\u007E] */ static UBool isSyntaxChar(UChar32 c); int32_t skipWhiteSpace(int32_t i) const; const Normalizer2 &nfd, &nfc; const UnicodeString *rules; const CollationData *const baseData; CollationSettings *settings; UParseError *parseError; const char *errorReason; Sink *sink; Importer *importer; int32_t ruleIndex; }; U_NAMESPACE_END #endif // !UCONFIG_NO_COLLATION #endif // __COLLATIONRULEPARSER_H__ stringi/src/icu74/i18n/currfmt.cpp0000644000176200001440000000335114700200761016417 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ********************************************************************** * Copyright (c) 2004-2014 International Business Machines * Corporation and others. All Rights Reserved. 
********************************************************************** * Author: Alan Liu * Created: April 20, 2004 * Since: ICU 3.0 ********************************************************************** */ #include "unicode/utypes.h" #if !UCONFIG_NO_FORMATTING #include "currfmt.h" #include "unicode/numfmt.h" #include "unicode/curramt.h" U_NAMESPACE_BEGIN CurrencyFormat::CurrencyFormat(const Locale& locale, UErrorCode& ec) : MeasureFormat(locale, UMEASFMT_WIDTH_WIDE, ec) { } CurrencyFormat::CurrencyFormat(const CurrencyFormat& other) : MeasureFormat(other) { } CurrencyFormat::~CurrencyFormat() { } CurrencyFormat* CurrencyFormat::clone() const { return new CurrencyFormat(*this); } UnicodeString& CurrencyFormat::format(const Formattable& obj, UnicodeString& appendTo, FieldPosition& pos, UErrorCode& ec) const { return getCurrencyFormatInternal().format(obj, appendTo, pos, ec); } void CurrencyFormat::parseObject(const UnicodeString& source, Formattable& result, ParsePosition& pos) const { CurrencyAmount* currAmt = getCurrencyFormatInternal().parseCurrency(source, pos); if (currAmt != nullptr) { result.adoptObject(currAmt); } } UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CurrencyFormat) U_NAMESPACE_END #endif /* #if !UCONFIG_NO_FORMATTING */ stringi/src/icu74/i18n/nfsubs.cpp0000644000176200001440000015233414700200761016243 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ****************************************************************************** * Copyright (C) 1997-2015, International Business Machines * Corporation and others. All Rights Reserved. 
****************************************************************************** * file name: nfsubs.cpp * encoding: UTF-8 * tab size: 8 (not used) * indentation:4 * * Modification history * Date Name Comments * 10/11/2001 Doug Ported from ICU4J */ #include #include "utypeinfo.h" // for 'typeid' to work #include "nfsubs.h" #include "fmtableimp.h" #include "putilimp.h" #include "number_decimalquantity.h" #if U_HAVE_RBNF static const char16_t gLessThan = 0x003c; static const char16_t gEquals = 0x003d; static const char16_t gGreaterThan = 0x003e; static const char16_t gPercent = 0x0025; static const char16_t gPound = 0x0023; static const char16_t gZero = 0x0030; static const char16_t gSpace = 0x0020; static const char16_t gEqualsEquals[] = { 0x3D, 0x3D, 0 }; /* "==" */ static const char16_t gGreaterGreaterGreaterThan[] = { 0x3E, 0x3E, 0x3E, 0 }; /* ">>>" */ static const char16_t gGreaterGreaterThan[] = { 0x3E, 0x3E, 0 }; /* ">>" */ U_NAMESPACE_BEGIN using number::impl::DecimalQuantity; class SameValueSubstitution : public NFSubstitution { public: SameValueSubstitution(int32_t pos, const NFRuleSet* ruleset, const UnicodeString& description, UErrorCode& status); virtual ~SameValueSubstitution(); virtual int64_t transformNumber(int64_t number) const override { return number; } virtual double transformNumber(double number) const override { return number; } virtual double composeRuleValue(double newRuleValue, double /*oldRuleValue*/) const override { return newRuleValue; } virtual double calcUpperBound(double oldUpperBound) const override { return oldUpperBound; } virtual char16_t tokenChar() const override { return (char16_t)0x003d; } // '=' public: static UClassID getStaticClassID(); virtual UClassID getDynamicClassID() const override; }; SameValueSubstitution::~SameValueSubstitution() {} class MultiplierSubstitution : public NFSubstitution { int64_t divisor; public: MultiplierSubstitution(int32_t _pos, const NFRule *rule, const NFRuleSet* _ruleSet, const UnicodeString& 
description, UErrorCode& status) : NFSubstitution(_pos, _ruleSet, description, status), divisor(rule->getDivisor()) { if (divisor == 0) { status = U_PARSE_ERROR; } } virtual ~MultiplierSubstitution(); virtual void setDivisor(int32_t radix, int16_t exponent, UErrorCode& status) override { divisor = util64_pow(radix, exponent); if(divisor == 0) { status = U_PARSE_ERROR; } } virtual bool operator==(const NFSubstitution& rhs) const override; virtual int64_t transformNumber(int64_t number) const override { return number / divisor; } virtual double transformNumber(double number) const override { bool doFloor = getRuleSet() != nullptr; if (!doFloor) { // This is a HACK that partially addresses ICU-22313. The original code wanted us to do // floor() on the result if we were passing it to another rule set, but not if we were passing // it to a DecimalFormat. But the DurationRules rule set has multiplier substitutions where // we DO want to do the floor() operation. What we REALLY want is to do floor() any time // the owning rule also has a ModulusSubsitution, but we don't have access to that information // here, so instead we're doing a floor() any time the DecimalFormat has maxFracDigits equal to // 0. This seems to work with our existing rule sets, but could be a problem in the future, // but the "real" fix for DurationRules isn't worth doing, since we're deprecating DurationRules // anyway. This is enough to keep it from being egregiously wrong, without obvious side // effects. 
--rtg 8/16/23 const DecimalFormat* decimalFormat = getNumberFormat(); if (decimalFormat == nullptr || decimalFormat->getMaximumFractionDigits() == 0) { doFloor = true; } } if (doFloor) { return uprv_floor(number / divisor); } else { return number / divisor; } } virtual double composeRuleValue(double newRuleValue, double /*oldRuleValue*/) const override { return newRuleValue * divisor; } virtual double calcUpperBound(double /*oldUpperBound*/) const override { return static_cast(divisor); } virtual char16_t tokenChar() const override { return (char16_t)0x003c; } // '<' public: static UClassID getStaticClassID(); virtual UClassID getDynamicClassID() const override; }; MultiplierSubstitution::~MultiplierSubstitution() {} class ModulusSubstitution : public NFSubstitution { int64_t divisor; const NFRule* ruleToUse; public: ModulusSubstitution(int32_t pos, const NFRule* rule, const NFRule* rulePredecessor, const NFRuleSet* ruleSet, const UnicodeString& description, UErrorCode& status); virtual ~ModulusSubstitution(); virtual void setDivisor(int32_t radix, int16_t exponent, UErrorCode& status) override { divisor = util64_pow(radix, exponent); if (divisor == 0) { status = U_PARSE_ERROR; } } virtual bool operator==(const NFSubstitution& rhs) const override; virtual void doSubstitution(int64_t number, UnicodeString& toInsertInto, int32_t pos, int32_t recursionCount, UErrorCode& status) const override; virtual void doSubstitution(double number, UnicodeString& toInsertInto, int32_t pos, int32_t recursionCount, UErrorCode& status) const override; virtual int64_t transformNumber(int64_t number) const override { return number % divisor; } virtual double transformNumber(double number) const override { return uprv_fmod(number, static_cast(divisor)); } virtual UBool doParse(const UnicodeString& text, ParsePosition& parsePosition, double baseValue, double upperBound, UBool lenientParse, uint32_t nonNumericalExecutedRuleMask, Formattable& result) const override; virtual double 
composeRuleValue(double newRuleValue, double oldRuleValue) const override { return oldRuleValue - uprv_fmod(oldRuleValue, static_cast(divisor)) + newRuleValue; } virtual double calcUpperBound(double /*oldUpperBound*/) const override { return static_cast(divisor); } virtual UBool isModulusSubstitution() const override { return true; } virtual char16_t tokenChar() const override { return (char16_t)0x003e; } // '>' virtual void toString(UnicodeString& result) const override; public: static UClassID getStaticClassID(); virtual UClassID getDynamicClassID() const override; }; ModulusSubstitution::~ModulusSubstitution() {} class IntegralPartSubstitution : public NFSubstitution { public: IntegralPartSubstitution(int32_t _pos, const NFRuleSet* _ruleSet, const UnicodeString& description, UErrorCode& status) : NFSubstitution(_pos, _ruleSet, description, status) {} virtual ~IntegralPartSubstitution(); virtual int64_t transformNumber(int64_t number) const override { return number; } virtual double transformNumber(double number) const override { return uprv_floor(number); } virtual double composeRuleValue(double newRuleValue, double oldRuleValue) const override { return newRuleValue + oldRuleValue; } virtual double calcUpperBound(double /*oldUpperBound*/) const override { return DBL_MAX; } virtual char16_t tokenChar() const override { return (char16_t)0x003c; } // '<' public: static UClassID getStaticClassID(); virtual UClassID getDynamicClassID() const override; }; IntegralPartSubstitution::~IntegralPartSubstitution() {} class FractionalPartSubstitution : public NFSubstitution { UBool byDigits; UBool useSpaces; enum { kMaxDecimalDigits = 8 }; public: FractionalPartSubstitution(int32_t pos, const NFRuleSet* ruleSet, const UnicodeString& description, UErrorCode& status); virtual ~FractionalPartSubstitution(); virtual bool operator==(const NFSubstitution& rhs) const override; virtual void doSubstitution(double number, UnicodeString& toInsertInto, int32_t pos, int32_t 
recursionCount, UErrorCode& status) const override; virtual void doSubstitution(int64_t /*number*/, UnicodeString& /*toInsertInto*/, int32_t /*_pos*/, int32_t /*recursionCount*/, UErrorCode& /*status*/) const override {} virtual int64_t transformNumber(int64_t /*number*/) const override { return 0; } virtual double transformNumber(double number) const override { return number - uprv_floor(number); } virtual UBool doParse(const UnicodeString& text, ParsePosition& parsePosition, double baseValue, double upperBound, UBool lenientParse, uint32_t nonNumericalExecutedRuleMask, Formattable& result) const override; virtual double composeRuleValue(double newRuleValue, double oldRuleValue) const override { return newRuleValue + oldRuleValue; } virtual double calcUpperBound(double /*oldUpperBound*/) const override { return 0.0; } virtual char16_t tokenChar() const override { return (char16_t)0x003e; } // '>' public: static UClassID getStaticClassID(); virtual UClassID getDynamicClassID() const override; }; FractionalPartSubstitution::~FractionalPartSubstitution() {} class AbsoluteValueSubstitution : public NFSubstitution { public: AbsoluteValueSubstitution(int32_t _pos, const NFRuleSet* _ruleSet, const UnicodeString& description, UErrorCode& status) : NFSubstitution(_pos, _ruleSet, description, status) {} virtual ~AbsoluteValueSubstitution(); virtual int64_t transformNumber(int64_t number) const override { return number >= 0 ? 
number : -number; } virtual double transformNumber(double number) const override { return uprv_fabs(number); } virtual double composeRuleValue(double newRuleValue, double /*oldRuleValue*/) const override { return -newRuleValue; } virtual double calcUpperBound(double /*oldUpperBound*/) const override { return DBL_MAX; } virtual char16_t tokenChar() const override { return (char16_t)0x003e; } // '>' public: static UClassID getStaticClassID(); virtual UClassID getDynamicClassID() const override; }; AbsoluteValueSubstitution::~AbsoluteValueSubstitution() {} class NumeratorSubstitution : public NFSubstitution { double denominator; int64_t ldenominator; UBool withZeros; public: static inline UnicodeString fixdesc(const UnicodeString& desc) { if (desc.endsWith(LTLT, 2)) { UnicodeString result(desc, 0, desc.length()-1); return result; } return desc; } NumeratorSubstitution(int32_t _pos, double _denominator, NFRuleSet* _ruleSet, const UnicodeString& description, UErrorCode& status) : NFSubstitution(_pos, _ruleSet, fixdesc(description), status), denominator(_denominator) { ldenominator = util64_fromDouble(denominator); withZeros = description.endsWith(LTLT, 2); } virtual ~NumeratorSubstitution(); virtual bool operator==(const NFSubstitution& rhs) const override; virtual int64_t transformNumber(int64_t number) const override { return number * ldenominator; } virtual double transformNumber(double number) const override { return uprv_round(number * denominator); } virtual void doSubstitution(int64_t /*number*/, UnicodeString& /*toInsertInto*/, int32_t /*_pos*/, int32_t /*recursionCount*/, UErrorCode& /*status*/) const override {} virtual void doSubstitution(double number, UnicodeString& toInsertInto, int32_t pos, int32_t recursionCount, UErrorCode& status) const override; virtual UBool doParse(const UnicodeString& text, ParsePosition& parsePosition, double baseValue, double upperBound, UBool /*lenientParse*/, uint32_t nonNumericalExecutedRuleMask, Formattable& result) const 
override; virtual double composeRuleValue(double newRuleValue, double oldRuleValue) const override { return newRuleValue / oldRuleValue; } virtual double calcUpperBound(double /*oldUpperBound*/) const override { return denominator; } virtual char16_t tokenChar() const override { return (char16_t)0x003c; } // '<' private: static const char16_t LTLT[2]; public: static UClassID getStaticClassID(); virtual UClassID getDynamicClassID() const override; }; NumeratorSubstitution::~NumeratorSubstitution() {} NFSubstitution* NFSubstitution::makeSubstitution(int32_t pos, const NFRule* rule, const NFRule* predecessor, const NFRuleSet* ruleSet, const RuleBasedNumberFormat* formatter, const UnicodeString& description, UErrorCode& status) { // if the description is empty, return a NullSubstitution if (description.length() == 0) { return nullptr; } switch (description.charAt(0)) { // if the description begins with '<'... case gLessThan: // throw an exception if the rule is a negative number // rule if (rule->getBaseValue() == NFRule::kNegativeNumberRule) { // throw new IllegalArgumentException("<< not allowed in negative-number rule"); status = U_PARSE_ERROR; return nullptr; } // if the rule is a fraction rule, return an // IntegralPartSubstitution else if (rule->getBaseValue() == NFRule::kImproperFractionRule || rule->getBaseValue() == NFRule::kProperFractionRule || rule->getBaseValue() == NFRule::kDefaultRule) { return new IntegralPartSubstitution(pos, ruleSet, description, status); } // if the rule set containing the rule is a fraction // rule set, return a NumeratorSubstitution else if (ruleSet->isFractionRuleSet()) { return new NumeratorSubstitution(pos, (double)rule->getBaseValue(), formatter->getDefaultRuleSet(), description, status); } // otherwise, return a MultiplierSubstitution else { return new MultiplierSubstitution(pos, rule, ruleSet, description, status); } // if the description begins with '>'... 
case gGreaterThan: // if the rule is a negative-number rule, return // an AbsoluteValueSubstitution if (rule->getBaseValue() == NFRule::kNegativeNumberRule) { return new AbsoluteValueSubstitution(pos, ruleSet, description, status); } // if the rule is a fraction rule, return a // FractionalPartSubstitution else if (rule->getBaseValue() == NFRule::kImproperFractionRule || rule->getBaseValue() == NFRule::kProperFractionRule || rule->getBaseValue() == NFRule::kDefaultRule) { return new FractionalPartSubstitution(pos, ruleSet, description, status); } // if the rule set owning the rule is a fraction rule set, // throw an exception else if (ruleSet->isFractionRuleSet()) { // throw new IllegalArgumentException(">> not allowed in fraction rule set"); status = U_PARSE_ERROR; return nullptr; } // otherwise, return a ModulusSubstitution else { return new ModulusSubstitution(pos, rule, predecessor, ruleSet, description, status); } // if the description begins with '=', always return a // SameValueSubstitution case gEquals: return new SameValueSubstitution(pos, ruleSet, description, status); // and if it's anything else, throw an exception default: // throw new IllegalArgumentException("Illegal substitution character"); status = U_PARSE_ERROR; } return nullptr; } NFSubstitution::NFSubstitution(int32_t _pos, const NFRuleSet* _ruleSet, const UnicodeString& description, UErrorCode& status) : pos(_pos), ruleSet(nullptr), numberFormat(nullptr) { // the description should begin and end with the same character. // If it doesn't that's a syntax error. 
Otherwise, // makeSubstitution() was the only thing that needed to know // about these characters, so strip them off UnicodeString workingDescription(description); if (description.length() >= 2 && description.charAt(0) == description.charAt(description.length() - 1)) { workingDescription.remove(description.length() - 1, 1); workingDescription.remove(0, 1); } else if (description.length() != 0) { // throw new IllegalArgumentException("Illegal substitution syntax"); status = U_PARSE_ERROR; return; } if (workingDescription.length() == 0) { // if the description was just two paired token characters // (i.e., "<<" or ">>"), it uses the rule set it belongs to to // format its result this->ruleSet = _ruleSet; } else if (workingDescription.charAt(0) == gPercent) { // if the description contains a rule set name, that's the rule // set we use to format the result: get a reference to the // names rule set this->ruleSet = _ruleSet->getOwner()->findRuleSet(workingDescription, status); } else if (workingDescription.charAt(0) == gPound || workingDescription.charAt(0) ==gZero) { // if the description begins with 0 or #, treat it as a // DecimalFormat pattern, and initialize a DecimalFormat with // that pattern (then set it to use the DecimalFormatSymbols // belonging to our formatter) const DecimalFormatSymbols* sym = _ruleSet->getOwner()->getDecimalFormatSymbols(); if (!sym) { status = U_MISSING_RESOURCE_ERROR; return; } DecimalFormat *tempNumberFormat = new DecimalFormat(workingDescription, *sym, status); /* test for nullptr */ if (!tempNumberFormat) { status = U_MEMORY_ALLOCATION_ERROR; return; } if (U_FAILURE(status)) { delete tempNumberFormat; return; } this->numberFormat = tempNumberFormat; } else if (workingDescription.charAt(0) == gGreaterThan) { // if the description is ">>>", this substitution bypasses the // usual rule-search process and always uses the rule that precedes // it in its own rule set's rule list (this is used for place-value // notations: formats where you 
want to see a particular part of // a number even when it's 0) // this causes problems when >>> is used in a frationalPartSubstitution // this->ruleSet = nullptr; this->ruleSet = _ruleSet; this->numberFormat = nullptr; } else { // and of the description is none of these things, it's a syntax error // throw new IllegalArgumentException("Illegal substitution syntax"); status = U_PARSE_ERROR; } } NFSubstitution::~NFSubstitution() { delete numberFormat; numberFormat = nullptr; } /** * Set's the substitution's divisor. Used by NFRule.setBaseValue(). * A no-op for all substitutions except multiplier and modulus * substitutions. * @param radix The radix of the divisor * @param exponent The exponent of the divisor */ void NFSubstitution::setDivisor(int32_t /*radix*/, int16_t /*exponent*/, UErrorCode& /*status*/) { // a no-op for all substitutions except multiplier and modulus substitutions } void NFSubstitution::setDecimalFormatSymbols(const DecimalFormatSymbols &newSymbols, UErrorCode& /*status*/) { if (numberFormat != nullptr) { numberFormat->setDecimalFormatSymbols(newSymbols); } } //----------------------------------------------------------------------- // boilerplate //----------------------------------------------------------------------- UOBJECT_DEFINE_RTTI_IMPLEMENTATION(NFSubstitution) /** * Compares two substitutions for equality * @param The substitution to compare this one to * @return true if the two substitutions are functionally equivalent */ bool NFSubstitution::operator==(const NFSubstitution& rhs) const { // compare class and all of the fields all substitutions have // in common // this should be called by subclasses before their own equality tests return typeid(*this) == typeid(rhs) && pos == rhs.pos && (ruleSet == nullptr) == (rhs.ruleSet == nullptr) // && ruleSet == rhs.ruleSet causes circularity, other checks to make instead? && (numberFormat == nullptr ? 
(rhs.numberFormat == nullptr) : (*numberFormat == *rhs.numberFormat)); } /** * Returns a textual description of the substitution * @return A textual description of the substitution. This might * not be identical to the description it was created from, but * it'll produce the same result. */ void NFSubstitution::toString(UnicodeString& text) const { // use tokenChar() to get the character at the beginning and // end of the substitutin token. In between them will go // either the name of the rule set it uses, or the pattern of // the DecimalFormat it uses text.remove(); text.append(tokenChar()); UnicodeString temp; if (ruleSet != nullptr) { ruleSet->getName(temp); } else if (numberFormat != nullptr) { numberFormat->toPattern(temp); } text.append(temp); text.append(tokenChar()); } //----------------------------------------------------------------------- // formatting //----------------------------------------------------------------------- /** * Performs a mathematical operation on the number, formats it using * either ruleSet or decimalFormat, and inserts the result into * toInsertInto. * @param number The number being formatted. 
* @param toInsertInto The string we insert the result into * @param pos The position in toInsertInto where the owning rule's * rule text begins (this value is added to this substitution's * position to determine exactly where to insert the new text) */ void NFSubstitution::doSubstitution(int64_t number, UnicodeString& toInsertInto, int32_t _pos, int32_t recursionCount, UErrorCode& status) const { if (ruleSet != nullptr) { // Perform a transformation on the number that is dependent // on the type of substitution this is, then just call its // rule set's format() method to format the result ruleSet->format(transformNumber(number), toInsertInto, _pos + this->pos, recursionCount, status); } else if (numberFormat != nullptr) { if (number <= MAX_INT64_IN_DOUBLE) { // or perform the transformation on the number (preserving // the result's fractional part if the formatter it set // to show it), then use that formatter's format() method // to format the result double numberToFormat = transformNumber((double)number); if (numberFormat->getMaximumFractionDigits() == 0) { numberToFormat = uprv_floor(numberToFormat); } UnicodeString temp; numberFormat->format(numberToFormat, temp, status); toInsertInto.insert(_pos + this->pos, temp); } else { // We have gone beyond double precision. Something has to give. // We're favoring accuracy of the large number over potential rules // that round like a CompactDecimalFormat, which is not a common use case. // // Perform a transformation on the number that is dependent // on the type of substitution this is, then just call its // rule set's format() method to format the result int64_t numberToFormat = transformNumber(number); UnicodeString temp; numberFormat->format(numberToFormat, temp, status); toInsertInto.insert(_pos + this->pos, temp); } } } /** * Performs a mathematical operation on the number, formats it using * either ruleSet or decimalFormat, and inserts the result into * toInsertInto. * @param number The number being formatted. 
* @param toInsertInto The string we insert the result into * @param pos The position in toInsertInto where the owning rule's * rule text begins (this value is added to this substitution's * position to determine exactly where to insert the new text) */ void NFSubstitution::doSubstitution(double number, UnicodeString& toInsertInto, int32_t _pos, int32_t recursionCount, UErrorCode& status) const { // perform a transformation on the number being formatted that // is dependent on the type of substitution this is double numberToFormat = transformNumber(number); if (uprv_isInfinite(numberToFormat)) { // This is probably a minus rule. Combine it with an infinite rule. const NFRule *infiniteRule = ruleSet->findDoubleRule(uprv_getInfinity()); infiniteRule->doFormat(numberToFormat, toInsertInto, _pos + this->pos, recursionCount, status); return; } // if the result is an integer, from here on out we work in integer // space (saving time and memory and preserving accuracy) if (numberToFormat == uprv_floor(numberToFormat) && ruleSet != nullptr) { ruleSet->format(util64_fromDouble(numberToFormat), toInsertInto, _pos + this->pos, recursionCount, status); // if the result isn't an integer, then call either our rule set's // format() method or our DecimalFormat's format() method to // format the result } else { if (ruleSet != nullptr) { ruleSet->format(numberToFormat, toInsertInto, _pos + this->pos, recursionCount, status); } else if (numberFormat != nullptr) { UnicodeString temp; numberFormat->format(numberToFormat, temp); toInsertInto.insert(_pos + this->pos, temp); } } } //----------------------------------------------------------------------- // parsing //----------------------------------------------------------------------- #ifdef RBNF_DEBUG #include #endif /** * Parses a string using the rule set or DecimalFormat belonging * to this substitution. 
If there's a match, a mathematical
 * operation (the inverse of the one used in formatting) is
 * performed on the result of the parse and the value passed in
 * and returned as the result.  The parse position is updated to
 * point to the first unmatched character in the string.
 * @param text The string to parse
 * @param parsePosition On entry, ignored, but assumed to be 0.
 * On exit, this is updated to point to the first unmatched
 * character (or 0 if the substitution didn't match)
 * @param baseValue A partial parse result that should be
 * combined with the result of this parse
 * @param upperBound When searching the rule set for a rule
 * matching the string passed in, only rules with base values
 * lower than this are considered
 * @param lenientParse If true and matching against rules fails,
 * the substitution will also try matching the text against
 * numerals using a default-constructed NumberFormat.  If false,
 * no extra work is done.  (This value is false whenever the
 * formatter isn't in lenient-parse mode, but is also false
 * under some conditions even when the formatter _is_ in
 * lenient-parse mode.)
 * @return If there's a match, this is the result of composing
 * baseValue with whatever was returned from matching the
 * characters.  This will be either a Long or a Double.  If there's
 * no match this is new Long(0) (not null), and parsePosition
 * is left unchanged.
*/ UBool NFSubstitution::doParse(const UnicodeString& text, ParsePosition& parsePosition, double baseValue, double upperBound, UBool lenientParse, uint32_t nonNumericalExecutedRuleMask, Formattable& result) const { #ifdef RBNF_DEBUG fprintf(stderr, " %x bv: %g ub: %g\n", this, baseValue, upperBound); #endif // figure out the highest base value a rule can have and match // the text being parsed (this varies according to the type of // substitutions: multiplier, modulus, and numerator substitutions // restrict the search to rules with base values lower than their // own; same-value substitutions leave the upper bound wherever // it was, and the others allow any rule to match upperBound = calcUpperBound(upperBound); // use our rule set to parse the text. If that fails and // lenient parsing is enabled (this is always false if the // formatter's lenient-parsing mode is off, but it may also // be false even when the formatter's lenient-parse mode is // on), then also try parsing the text using a default- // constructed NumberFormat if (ruleSet != nullptr) { ruleSet->parse(text, parsePosition, upperBound, nonNumericalExecutedRuleMask, result); if (lenientParse && !ruleSet->isFractionRuleSet() && parsePosition.getIndex() == 0) { UErrorCode status = U_ZERO_ERROR; NumberFormat* fmt = NumberFormat::createInstance(status); if (U_SUCCESS(status)) { fmt->parse(text, result, parsePosition); } delete fmt; } // ...or use our DecimalFormat to parse the text } else if (numberFormat != nullptr) { numberFormat->parse(text, result, parsePosition); } // if the parse was successful, we've already advanced the caller's // parse position (this is the one function that doesn't have one // of its own). 
Derive a parse result and return it as a Long, // if possible, or a Double if (parsePosition.getIndex() != 0) { UErrorCode status = U_ZERO_ERROR; double tempResult = result.getDouble(status); // composeRuleValue() produces a full parse result from // the partial parse result passed to this function from // the caller (this is either the owning rule's base value // or the partial result obtained from composing the // owning rule's base value with its other substitution's // parse result) and the partial parse result obtained by // matching the substitution (which will be the same value // the caller would get by parsing just this part of the // text with RuleBasedNumberFormat.parse() ). How the two // values are used to derive the full parse result depends // on the types of substitutions: For a regular rule, the // ultimate result is its multiplier substitution's result // times the rule's divisor (or the rule's base value) plus // the modulus substitution's result (which will actually // supersede part of the rule's base value). For a negative- // number rule, the result is the negative of its substitution's // result. For a fraction rule, it's the sum of its two // substitution results. For a rule in a fraction rule set, // it's the numerator substitution's result divided by // the rule's base value. Results from same-value substitutions // propagate back upard, and null substitutions don't affect // the result. tempResult = composeRuleValue(tempResult, baseValue); result.setDouble(tempResult); return true; // if the parse was UNsuccessful, return 0 } else { result.setLong(0); return false; } } /** * Returns true if this is a modulus substitution. (We didn't do this * with instanceof partially because it causes source files to * proliferate and partially because we have to port this to C++.) 
* @return true if this object is an instance of ModulusSubstitution */ UBool NFSubstitution::isModulusSubstitution() const { return false; } //=================================================================== // SameValueSubstitution //=================================================================== /** * A substitution that passes the value passed to it through unchanged. * Represented by == in rule descriptions. */ SameValueSubstitution::SameValueSubstitution(int32_t _pos, const NFRuleSet* _ruleSet, const UnicodeString& description, UErrorCode& status) : NFSubstitution(_pos, _ruleSet, description, status) { if (0 == description.compare(gEqualsEquals, 2)) { // throw new IllegalArgumentException("== is not a legal token"); status = U_PARSE_ERROR; } } UOBJECT_DEFINE_RTTI_IMPLEMENTATION(SameValueSubstitution) //=================================================================== // MultiplierSubstitution //=================================================================== UOBJECT_DEFINE_RTTI_IMPLEMENTATION(MultiplierSubstitution) bool MultiplierSubstitution::operator==(const NFSubstitution& rhs) const { return NFSubstitution::operator==(rhs) && divisor == ((const MultiplierSubstitution*)&rhs)->divisor; } //=================================================================== // ModulusSubstitution //=================================================================== /** * A substitution that divides the number being formatted by the its rule's * divisor and formats the remainder. Represented by ">>" in a * regular rule. 
*/ ModulusSubstitution::ModulusSubstitution(int32_t _pos, const NFRule* rule, const NFRule* predecessor, const NFRuleSet* _ruleSet, const UnicodeString& description, UErrorCode& status) : NFSubstitution(_pos, _ruleSet, description, status) , divisor(rule->getDivisor()) , ruleToUse(nullptr) { // the owning rule's divisor controls the behavior of this // substitution: rather than keeping a backpointer to the rule, // we keep a copy of the divisor if (divisor == 0) { status = U_PARSE_ERROR; } if (0 == description.compare(gGreaterGreaterGreaterThan, 3)) { // the >>> token doesn't alter how this substitution calculates the // values it uses for formatting and parsing, but it changes // what's done with that value after it's obtained: >>> short- // circuits the rule-search process and goes straight to the // specified rule to format the substitution value ruleToUse = predecessor; } } UOBJECT_DEFINE_RTTI_IMPLEMENTATION(ModulusSubstitution) bool ModulusSubstitution::operator==(const NFSubstitution& rhs) const { return NFSubstitution::operator==(rhs) && divisor == ((const ModulusSubstitution*)&rhs)->divisor && ruleToUse == ((const ModulusSubstitution*)&rhs)->ruleToUse; } //----------------------------------------------------------------------- // formatting //----------------------------------------------------------------------- /** * If this is a >>> substitution, use ruleToUse to fill in * the substitution. Otherwise, just use the superclass function. 
* @param number The number being formatted * @toInsertInto The string to insert the result of this substitution * into * @param pos The position of the rule text in toInsertInto */ void ModulusSubstitution::doSubstitution(int64_t number, UnicodeString& toInsertInto, int32_t _pos, int32_t recursionCount, UErrorCode& status) const { // if this isn't a >>> substitution, just use the inherited version // of this function (which uses either a rule set or a DecimalFormat // to format its substitution value) if (ruleToUse == nullptr) { NFSubstitution::doSubstitution(number, toInsertInto, _pos, recursionCount, status); // a >>> substitution goes straight to a particular rule to // format the substitution value } else { int64_t numberToFormat = transformNumber(number); ruleToUse->doFormat(numberToFormat, toInsertInto, _pos + getPos(), recursionCount, status); } } /** * If this is a >>> substitution, use ruleToUse to fill in * the substitution. Otherwise, just use the superclass function. * @param number The number being formatted * @toInsertInto The string to insert the result of this substitution * into * @param pos The position of the rule text in toInsertInto */ void ModulusSubstitution::doSubstitution(double number, UnicodeString& toInsertInto, int32_t _pos, int32_t recursionCount, UErrorCode& status) const { // if this isn't a >>> substitution, just use the inherited version // of this function (which uses either a rule set or a DecimalFormat // to format its substitution value) if (ruleToUse == nullptr) { NFSubstitution::doSubstitution(number, toInsertInto, _pos, recursionCount, status); // a >>> substitution goes straight to a particular rule to // format the substitution value } else { double numberToFormat = transformNumber(number); ruleToUse->doFormat(numberToFormat, toInsertInto, _pos + getPos(), recursionCount, status); } } //----------------------------------------------------------------------- // parsing 
//----------------------------------------------------------------------- /** * If this is a >>> substitution, match only against ruleToUse. * Otherwise, use the superclass function. * @param text The string to parse * @param parsePosition Ignored on entry, updated on exit to point to * the first unmatched character. * @param baseValue The partial parse result prior to calling this * routine. */ UBool ModulusSubstitution::doParse(const UnicodeString& text, ParsePosition& parsePosition, double baseValue, double upperBound, UBool lenientParse, uint32_t nonNumericalExecutedRuleMask, Formattable& result) const { // if this isn't a >>> substitution, we can just use the // inherited parse() routine to do the parsing if (ruleToUse == nullptr) { return NFSubstitution::doParse(text, parsePosition, baseValue, upperBound, lenientParse, nonNumericalExecutedRuleMask, result); // but if it IS a >>> substitution, we have to do it here: we // use the specific rule's doParse() method, and then we have to // do some of the other work of NFRuleSet.parse() } else { ruleToUse->doParse(text, parsePosition, false, upperBound, nonNumericalExecutedRuleMask, result); if (parsePosition.getIndex() != 0) { UErrorCode status = U_ZERO_ERROR; double tempResult = result.getDouble(status); tempResult = composeRuleValue(tempResult, baseValue); result.setDouble(tempResult); } return true; } } /** * Returns a textual description of the substitution * @return A textual description of the substitution. This might * not be identical to the description it was created from, but * it'll produce the same result. */ void ModulusSubstitution::toString(UnicodeString& text) const { // use tokenChar() to get the character at the beginning and // end of the substitutin token. In between them will go // either the name of the rule set it uses, or the pattern of // the DecimalFormat it uses if ( ruleToUse != nullptr ) { // Must have been a >>> substitution. 
text.remove(); text.append(tokenChar()); text.append(tokenChar()); text.append(tokenChar()); } else { // Otherwise just use the super-class function. NFSubstitution::toString(text); } } //=================================================================== // IntegralPartSubstitution //=================================================================== UOBJECT_DEFINE_RTTI_IMPLEMENTATION(IntegralPartSubstitution) //=================================================================== // FractionalPartSubstitution //=================================================================== /** * Constructs a FractionalPartSubstitution. This object keeps a flag * telling whether it should format by digits or not. In addition, * it marks the rule set it calls (if any) as a fraction rule set. */ FractionalPartSubstitution::FractionalPartSubstitution(int32_t _pos, const NFRuleSet* _ruleSet, const UnicodeString& description, UErrorCode& status) : NFSubstitution(_pos, _ruleSet, description, status) , byDigits(false) , useSpaces(true) { // akk, ruleSet can change in superclass constructor if (0 == description.compare(gGreaterGreaterThan, 2) || 0 == description.compare(gGreaterGreaterGreaterThan, 3) || _ruleSet == getRuleSet()) { byDigits = true; if (0 == description.compare(gGreaterGreaterGreaterThan, 3)) { useSpaces = false; } } else { // cast away const ((NFRuleSet*)getRuleSet())->makeIntoFractionRuleSet(); } } //----------------------------------------------------------------------- // formatting //----------------------------------------------------------------------- /** * If in "by digits" mode, fills in the substitution one decimal digit * at a time using the rule set containing this substitution. * Otherwise, uses the superclass function. 
* @param number The number being formatted * @param toInsertInto The string to insert the result of formatting * the substitution into * @param pos The position of the owning rule's rule text in * toInsertInto */ void FractionalPartSubstitution::doSubstitution(double number, UnicodeString& toInsertInto, int32_t _pos, int32_t recursionCount, UErrorCode& status) const { // if we're not in "byDigits" mode, just use the inherited // doSubstitution() routine if (!byDigits) { NFSubstitution::doSubstitution(number, toInsertInto, _pos, recursionCount, status); // if we're in "byDigits" mode, transform the value into an integer // by moving the decimal point eight places to the right and // pulling digits off the right one at a time, formatting each digit // as an integer using this substitution's owning rule set // (this is slower, but more accurate, than doing it from the // other end) } else { // int32_t numberToFormat = (int32_t)uprv_round(transformNumber(number) * uprv_pow(10, kMaxDecimalDigits)); // // this flag keeps us from formatting trailing zeros. It starts // // out false because we're pulling from the right, and switches // // to true the first time we encounter a non-zero digit // UBool doZeros = false; // for (int32_t i = 0; i < kMaxDecimalDigits; i++) { // int64_t digit = numberToFormat % 10; // if (digit != 0 || doZeros) { // if (doZeros && useSpaces) { // toInsertInto.insert(_pos + getPos(), gSpace); // } // doZeros = true; // getRuleSet()->format(digit, toInsertInto, _pos + getPos()); // } // numberToFormat /= 10; // } DecimalQuantity dl; dl.setToDouble(number); dl.roundToMagnitude(-20, UNUM_ROUND_HALFEVEN, status); // round to 20 fraction digits. UBool pad = false; for (int32_t didx = dl.getLowerDisplayMagnitude(); didx<0; didx++) { // Loop iterates over fraction digits, starting with the LSD. // include both real digits from the number, and zeros // to the left of the MSD but to the right of the decimal point. 
if (pad && useSpaces) { toInsertInto.insert(_pos + getPos(), gSpace); } else { pad = true; } int64_t digit = dl.getDigit(didx); getRuleSet()->format(digit, toInsertInto, _pos + getPos(), recursionCount, status); } if (!pad) { // hack around lack of precision in digitlist. if we would end up with // "foo point" make sure we add a " zero" to the end. getRuleSet()->format((int64_t)0, toInsertInto, _pos + getPos(), recursionCount, status); } } } //----------------------------------------------------------------------- // parsing //----------------------------------------------------------------------- /** * If in "by digits" mode, parses the string as if it were a string * of individual digits; otherwise, uses the superclass function. * @param text The string to parse * @param parsePosition Ignored on entry, but updated on exit to point * to the first unmatched character * @param baseValue The partial parse result prior to entering this * function * @param upperBound Only consider rules with base values lower than * this when filling in the substitution * @param lenientParse If true, try matching the text as numerals if * matching as words doesn't work * @return If the match was successful, the current partial parse * result; otherwise new Long(0). The result is either a Long or * a Double. 
*/ UBool FractionalPartSubstitution::doParse(const UnicodeString& text, ParsePosition& parsePosition, double baseValue, double /*upperBound*/, UBool lenientParse, uint32_t nonNumericalExecutedRuleMask, Formattable& resVal) const { // if we're not in byDigits mode, we can just use the inherited // doParse() if (!byDigits) { return NFSubstitution::doParse(text, parsePosition, baseValue, 0, lenientParse, nonNumericalExecutedRuleMask, resVal); // if we ARE in byDigits mode, parse the text one digit at a time // using this substitution's owning rule set (we do this by setting // upperBound to 10 when calling doParse() ) until we reach // nonmatching text } else { UnicodeString workText(text); ParsePosition workPos(1); double result = 0; int32_t digit; // double p10 = 0.1; DecimalQuantity dl; int32_t totalDigits = 0; NumberFormat* fmt = nullptr; while (workText.length() > 0 && workPos.getIndex() != 0) { workPos.setIndex(0); Formattable temp; getRuleSet()->parse(workText, workPos, 10, nonNumericalExecutedRuleMask, temp); UErrorCode status = U_ZERO_ERROR; digit = temp.getLong(status); // digit = temp.getType() == Formattable::kLong ? 
// temp.getLong() : // (int32_t)temp.getDouble(); if (lenientParse && workPos.getIndex() == 0) { if (!fmt) { status = U_ZERO_ERROR; fmt = NumberFormat::createInstance(status); if (U_FAILURE(status)) { delete fmt; fmt = nullptr; } } if (fmt) { fmt->parse(workText, temp, workPos); digit = temp.getLong(status); } } if (workPos.getIndex() != 0) { dl.appendDigit(static_cast(digit), 0, true); totalDigits++; // result += digit * p10; // p10 /= 10; parsePosition.setIndex(parsePosition.getIndex() + workPos.getIndex()); workText.removeBetween(0, workPos.getIndex()); while (workText.length() > 0 && workText.charAt(0) == gSpace) { workText.removeBetween(0, 1); parsePosition.setIndex(parsePosition.getIndex() + 1); } } } delete fmt; dl.adjustMagnitude(-totalDigits); result = dl.toDouble(); result = composeRuleValue(result, baseValue); resVal.setDouble(result); return true; } } bool FractionalPartSubstitution::operator==(const NFSubstitution& rhs) const { return NFSubstitution::operator==(rhs) && ((const FractionalPartSubstitution*)&rhs)->byDigits == byDigits; } UOBJECT_DEFINE_RTTI_IMPLEMENTATION(FractionalPartSubstitution) //=================================================================== // AbsoluteValueSubstitution //=================================================================== UOBJECT_DEFINE_RTTI_IMPLEMENTATION(AbsoluteValueSubstitution) //=================================================================== // NumeratorSubstitution //=================================================================== void NumeratorSubstitution::doSubstitution(double number, UnicodeString& toInsertInto, int32_t apos, int32_t recursionCount, UErrorCode& status) const { // perform a transformation on the number being formatted that // is dependent on the type of substitution this is double numberToFormat = transformNumber(number); int64_t longNF = util64_fromDouble(numberToFormat); const NFRuleSet* aruleSet = getRuleSet(); if (withZeros && aruleSet != nullptr) { // if there are leading 
zeros in the decimal expansion then emit them int64_t nf =longNF; int32_t len = toInsertInto.length(); while ((nf *= 10) < denominator) { toInsertInto.insert(apos + getPos(), gSpace); aruleSet->format((int64_t)0, toInsertInto, apos + getPos(), recursionCount, status); } apos += toInsertInto.length() - len; } // if the result is an integer, from here on out we work in integer // space (saving time and memory and preserving accuracy) if (numberToFormat == longNF && aruleSet != nullptr) { aruleSet->format(longNF, toInsertInto, apos + getPos(), recursionCount, status); // if the result isn't an integer, then call either our rule set's // format() method or our DecimalFormat's format() method to // format the result } else { if (aruleSet != nullptr) { aruleSet->format(numberToFormat, toInsertInto, apos + getPos(), recursionCount, status); } else { UnicodeString temp; getNumberFormat()->format(numberToFormat, temp, status); toInsertInto.insert(apos + getPos(), temp); } } } UBool NumeratorSubstitution::doParse(const UnicodeString& text, ParsePosition& parsePosition, double baseValue, double upperBound, UBool /*lenientParse*/, uint32_t nonNumericalExecutedRuleMask, Formattable& result) const { // we don't have to do anything special to do the parsing here, // but we have to turn lenient parsing off-- if we leave it on, // it SERIOUSLY messes up the algorithm // if withZeros is true, we need to count the zeros // and use that to adjust the parse result UErrorCode status = U_ZERO_ERROR; int32_t zeroCount = 0; UnicodeString workText(text); if (withZeros) { ParsePosition workPos(1); Formattable temp; while (workText.length() > 0 && workPos.getIndex() != 0) { workPos.setIndex(0); getRuleSet()->parse(workText, workPos, 1, nonNumericalExecutedRuleMask, temp); // parse zero or nothing at all if (workPos.getIndex() == 0) { // we failed, either there were no more zeros, or the number was formatted with digits // either way, we're done break; } ++zeroCount; 
parsePosition.setIndex(parsePosition.getIndex() + workPos.getIndex()); workText.remove(0, workPos.getIndex()); while (workText.length() > 0 && workText.charAt(0) == gSpace) { workText.remove(0, 1); parsePosition.setIndex(parsePosition.getIndex() + 1); } } workText = text; workText.remove(0, (int32_t)parsePosition.getIndex()); parsePosition.setIndex(0); } // we've parsed off the zeros, now let's parse the rest from our current position NFSubstitution::doParse(workText, parsePosition, withZeros ? 1 : baseValue, upperBound, false, nonNumericalExecutedRuleMask, result); if (withZeros) { // any base value will do in this case. is there a way to // force this to not bother trying all the base values? // compute the 'effective' base and prescale the value down int64_t n = result.getLong(status); // force conversion! int64_t d = 1; while (d <= n) { d *= 10; } // now add the zeros while (zeroCount > 0) { d *= 10; --zeroCount; } // d is now our true denominator result.setDouble((double)n/(double)d); } return true; } bool NumeratorSubstitution::operator==(const NFSubstitution& rhs) const { return NFSubstitution::operator==(rhs) && denominator == ((const NumeratorSubstitution*)&rhs)->denominator; } UOBJECT_DEFINE_RTTI_IMPLEMENTATION(NumeratorSubstitution) const char16_t NumeratorSubstitution::LTLT[] = { 0x003c, 0x003c }; U_NAMESPACE_END /* U_HAVE_RBNF */ #endif stringi/src/icu74/i18n/regexcmp.cpp0000644000176200001440000054407114700200761016560 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html // // file: regexcmp.cpp // // Copyright (C) 2002-2016 International Business Machines Corporation and others. // All Rights Reserved. // // This file contains the ICU regular expression compiler, which is responsible // for processing a regular expression pattern into the compiled form that // is used by the match finding engine. 
// #include "unicode/utypes.h" #if !UCONFIG_NO_REGULAR_EXPRESSIONS #include "unicode/ustring.h" #include "unicode/unistr.h" #include "unicode/uniset.h" #include "unicode/uchar.h" #include "unicode/uchriter.h" #include "unicode/parsepos.h" #include "unicode/parseerr.h" #include "unicode/regex.h" #include "unicode/utf.h" #include "unicode/utf16.h" #include "patternprops.h" #include "putilimp.h" #include "cmemory.h" #include "cstr.h" #include "cstring.h" #include "uvectr32.h" #include "uvectr64.h" #include "uassert.h" #include "uinvchar.h" #include "regeximp.h" #include "regexcst.h" // Contains state table for the regex pattern parser. // generated by a Perl script. #include "regexcmp.h" #include "regexst.h" #include "regextxt.h" U_NAMESPACE_BEGIN //------------------------------------------------------------------------------ // // Constructor. // //------------------------------------------------------------------------------ RegexCompile::RegexCompile(RegexPattern *rxp, UErrorCode &status) : fParenStack(status), fSetStack(uprv_deleteUObject, nullptr, status), fSetOpStack(status) { // Lazy init of all shared global sets (needed for init()'s empty text) RegexStaticSets::initGlobals(&status); fStatus = &status; fRXPat = rxp; fScanIndex = 0; fLastChar = -1; fPeekChar = -1; fLineNum = 1; fCharNum = 0; fQuoteMode = false; fInBackslashQuote = false; fModeFlags = fRXPat->fFlags | 0x80000000; fEOLComments = true; fMatchOpenParen = -1; fMatchCloseParen = -1; fCaptureName = nullptr; fLastSetLiteral = U_SENTINEL; if (U_SUCCESS(status) && U_FAILURE(rxp->fDeferredStatus)) { status = rxp->fDeferredStatus; } } static const char16_t chAmp = 0x26; // '&' static const char16_t chDash = 0x2d; // '-' //------------------------------------------------------------------------------ // // Destructor // //------------------------------------------------------------------------------ RegexCompile::~RegexCompile() { delete fCaptureName; // Normally will be nullptr, but can exist if pattern 
// compilation stops with a syntax error. } static inline void addCategory(UnicodeSet *set, int32_t value, UErrorCode& ec) { set->addAll(UnicodeSet().applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, value, ec)); } //------------------------------------------------------------------------------ // // Compile regex pattern. The state machine for rexexp pattern parsing is here. // The state tables are hand-written in the file regexcst.txt, // and converted to the form used here by a perl // script regexcst.pl // //------------------------------------------------------------------------------ void RegexCompile::compile( const UnicodeString &pat, // Source pat to be compiled. UParseError &pp, // Error position info UErrorCode &e) // Error Code { fRXPat->fPatternString = new UnicodeString(pat); UText patternText = UTEXT_INITIALIZER; utext_openConstUnicodeString(&patternText, fRXPat->fPatternString, &e); if (U_SUCCESS(e)) { compile(&patternText, pp, e); utext_close(&patternText); } } // // compile, UText mode // All the work is actually done here. // void RegexCompile::compile( UText *pat, // Source pat to be compiled. UParseError &pp, // Error position info UErrorCode &e) // Error Code { fStatus = &e; fParseErr = &pp; fStackPtr = 0; fStack[fStackPtr] = 0; if (U_FAILURE(*fStatus)) { return; } // There should be no pattern stuff in the RegexPattern object. They can not be reused. U_ASSERT(fRXPat->fPattern == nullptr || utext_nativeLength(fRXPat->fPattern) == 0); // Prepare the RegexPattern object to receive the compiled pattern. fRXPat->fPattern = utext_clone(fRXPat->fPattern, pat, false, true, fStatus); if (U_FAILURE(*fStatus)) { return; } // Initialize the pattern scanning state machine fPatternLength = utext_nativeLength(pat); uint16_t state = 1; const RegexTableEl *tableEl; // UREGEX_LITERAL force entire pattern to be treated as a literal string. if (fModeFlags & UREGEX_LITERAL) { fQuoteMode = true; } nextChar(fC); // Fetch the first char from the pattern string. 
// // Main loop for the regex pattern parsing state machine. // Runs once per state transition. // Each time through optionally performs, depending on the state table, // - an advance to the the next pattern char // - an action to be performed. // - pushing or popping a state to/from the local state return stack. // file regexcst.txt is the source for the state table. The logic behind // recongizing the pattern syntax is there, not here. // for (;;) { // Bail out if anything has gone wrong. // Regex pattern parsing stops on the first error encountered. if (U_FAILURE(*fStatus)) { break; } U_ASSERT(state != 0); // Find the state table element that matches the input char from the pattern, or the // class of the input character. Start with the first table row for this // state, then linearly scan forward until we find a row that matches the // character. The last row for each state always matches all characters, so // the search will stop there, if not before. // tableEl = &gRuleParseStateTable[state]; REGEX_SCAN_DEBUG_PRINTF(("char, line, col = (\'%c\', %d, %d) state=%s ", fC.fChar, fLineNum, fCharNum, RegexStateNames[state])); for (;;) { // loop through table rows belonging to this state, looking for one // that matches the current input char. REGEX_SCAN_DEBUG_PRINTF((".")); if (tableEl->fCharClass < 127 && fC.fQuoted == false && tableEl->fCharClass == fC.fChar) { // Table row specified an individual character, not a set, and // the input character is not quoted, and // the input character matched it. break; } if (tableEl->fCharClass == 255) { // Table row specified default, match anything character class. break; } if (tableEl->fCharClass == 254 && fC.fQuoted) { // Table row specified "quoted" and the char was quoted. break; } if (tableEl->fCharClass == 253 && fC.fChar == (UChar32)-1) { // Table row specified eof and we hit eof on the input. 
break; } if (tableEl->fCharClass >= 128 && tableEl->fCharClass < 240 && // Table specs a char class && fC.fQuoted == false && // char is not escaped && fC.fChar != (UChar32)-1) { // char is not EOF U_ASSERT(tableEl->fCharClass <= 137); if (RegexStaticSets::gStaticSets->fRuleSets[tableEl->fCharClass-128].contains(fC.fChar)) { // Table row specified a character class, or set of characters, // and the current char matches it. break; } } // No match on this row, advance to the next row for this state, tableEl++; } REGEX_SCAN_DEBUG_PRINTF(("\n")); // // We've found the row of the state table that matches the current input // character from the rules string. // Perform any action specified by this row in the state table. if (doParseActions(tableEl->fAction) == false) { // Break out of the state machine loop if the // the action signalled some kind of error, or // the action was to exit, occurs on normal end-of-rules-input. break; } if (tableEl->fPushState != 0) { fStackPtr++; if (fStackPtr >= kStackSize) { error(U_REGEX_INTERNAL_ERROR); REGEX_SCAN_DEBUG_PRINTF(("RegexCompile::parse() - state stack overflow.\n")); fStackPtr--; } fStack[fStackPtr] = tableEl->fPushState; } // // NextChar. This is where characters are actually fetched from the pattern. // Happens under control of the 'n' tag in the state table. // if (tableEl->fNextChar) { nextChar(fC); } // Get the next state from the table entry, or from the // state stack if the next state was specified as "pop". if (tableEl->fNextState != 255) { state = tableEl->fNextState; } else { state = fStack[fStackPtr]; fStackPtr--; if (fStackPtr < 0) { // state stack underflow // This will occur if the user pattern has mis-matched parentheses, // with extra close parens. // fStackPtr++; error(U_REGEX_MISMATCHED_PAREN); } } } if (U_FAILURE(*fStatus)) { // Bail out if the pattern had errors. return; } // // The pattern has now been read and processed, and the compiled code generated. 
// // // The pattern's fFrameSize so far has accumulated the requirements for // storage for capture parentheses, counters, etc. that are encountered // in the pattern. Add space for the two variables that are always // present in the saved state: the input string position (int64_t) and // the position in the compiled pattern. // allocateStackData(RESTACKFRAME_HDRCOUNT); // // Optimization pass 1: NOPs, back-references, and case-folding // stripNOPs(); // // Get bounds for the minimum and maximum length of a string that this // pattern can match. Used to avoid looking for matches in strings that // are too short. // fRXPat->fMinMatchLen = minMatchLength(3, fRXPat->fCompiledPat->size()-1); // // Optimization pass 2: match start type // matchStartType(); // // Set up fast latin-1 range sets // int32_t numSets = fRXPat->fSets->size(); fRXPat->fSets8 = new Regex8BitSet[numSets]; // Null pointer check. if (fRXPat->fSets8 == nullptr) { e = *fStatus = U_MEMORY_ALLOCATION_ERROR; return; } int32_t i; for (i=0; ifSets->elementAt(i); fRXPat->fSets8[i].init(s); } } //------------------------------------------------------------------------------ // // doParseAction Do some action during regex pattern parsing. // Called by the parse state machine. // // Generation of the match engine PCode happens here, or // in functions called from the parse actions defined here. // // //------------------------------------------------------------------------------ UBool RegexCompile::doParseActions(int32_t action) { UBool returnVal = true; switch ((Regex_PatternParseAction)action) { case doPatStart: // Start of pattern compiles to: //0 SAVE 2 Fall back to position of FAIL //1 jmp 3 //2 FAIL Stop if we ever reach here. //3 NOP Dummy, so start of pattern looks the same as // the start of an ( grouping. 
//4 NOP Resreved, will be replaced by a save if there are // OR | operators at the top level appendOp(URX_STATE_SAVE, 2); appendOp(URX_JMP, 3); appendOp(URX_FAIL, 0); // Standard open nonCapture paren action emits the two NOPs and // sets up the paren stack frame. doParseActions(doOpenNonCaptureParen); break; case doPatFinish: // We've scanned to the end of the pattern // The end of pattern compiles to: // URX_END // which will stop the runtime match engine. // Encountering end of pattern also behaves like a close paren, // and forces fixups of the State Save at the beginning of the compiled pattern // and of any OR operations at the top level. // handleCloseParen(); if (fParenStack.size() > 0) { // Missing close paren in pattern. error(U_REGEX_MISMATCHED_PAREN); } // add the END operation to the compiled pattern. appendOp(URX_END, 0); // Terminate the pattern compilation state machine. returnVal = false; break; case doOrOperator: // Scanning a '|', as in (A|B) { // Generate code for any pending literals preceding the '|' fixLiterals(false); // Insert a SAVE operation at the start of the pattern section preceding // this OR at this level. This SAVE will branch the match forward // to the right hand side of the OR in the event that the left hand // side fails to match and backtracks. Locate the position for the // save from the location on the top of the parentheses stack. int32_t savePosition = fParenStack.popi(); int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(savePosition); U_ASSERT(URX_TYPE(op) == URX_NOP); // original contents of reserved location op = buildOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+1); fRXPat->fCompiledPat->setElementAt(op, savePosition); // Append an JMP operation into the compiled pattern. The operand for // the JMP will eventually be the location following the ')' for the // group. This will be patched in later, when the ')' is encountered. 
appendOp(URX_JMP, 0); // Push the position of the newly added JMP op onto the parentheses stack. // This registers if for fixup when this block's close paren is encountered. fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // Append a NOP to the compiled pattern. This is the slot reserved // for a SAVE in the event that there is yet another '|' following // this one. appendOp(URX_NOP, 0); fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); } break; case doBeginNamedCapture: // Scanning (?append(fC.fChar); break; case doBadNamedCapture: error(U_REGEX_INVALID_CAPTURE_GROUP_NAME); break; case doOpenCaptureParen: // Open Capturing Paren, possibly named. // Compile to a // - NOP, which later may be replaced by a save-state if the // parenthesized group gets a * quantifier, followed by // - START_CAPTURE n where n is stack frame offset to the capture group variables. // - NOP, which may later be replaced by a save-state if there // is an '|' alternation within the parens. // // Each capture group gets three slots in the save stack frame: // 0: Capture Group start position (in input string being matched.) // 1: Capture Group end position. // 2: Start of Match-in-progress. // The first two locations are for a completed capture group, and are // referred to by back references and the like. // The third location stores the capture start position when an START_CAPTURE is // encountered. This will be promoted to a completed capture when (and if) the corresponding // END_CAPTURE is encountered. { fixLiterals(); appendOp(URX_NOP, 0); int32_t varsLoc = allocateStackData(3); // Reserve three slots in match stack frame. appendOp(URX_START_CAPTURE, varsLoc); appendOp(URX_NOP, 0); // On the Parentheses stack, start a new frame and add the positions // of the two NOPs. Depending on what follows in the pattern, the // NOPs may be changed to SAVE_STATE or JMP ops, with a target // address of the end of the parenthesized group. 
fParenStack.push(fModeFlags, *fStatus); // Match mode state fParenStack.push(capturing, *fStatus); // Frame type. fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus); // The first NOP location fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP loc // Save the mapping from group number to stack frame variable position. fRXPat->fGroupMap->addElement(varsLoc, *fStatus); // If this is a named capture group, add the name->group number mapping. if (fCaptureName != nullptr) { if (!fRXPat->initNamedCaptureMap()) { if (U_SUCCESS(*fStatus)) { error(fRXPat->fDeferredStatus); } break; } int32_t groupNumber = fRXPat->fGroupMap->size(); int32_t previousMapping = uhash_puti(fRXPat->fNamedCaptureMap, fCaptureName, groupNumber, fStatus); fCaptureName = nullptr; // hash table takes ownership of the name (key) string. if (previousMapping > 0 && U_SUCCESS(*fStatus)) { error(U_REGEX_INVALID_CAPTURE_GROUP_NAME); } } } break; case doOpenNonCaptureParen: // Open non-caputuring (grouping only) Paren. // Compile to a // - NOP, which later may be replaced by a save-state if the // parenthesized group gets a * quantifier, followed by // - NOP, which may later be replaced by a save-state if there // is an '|' alternation within the parens. { fixLiterals(); appendOp(URX_NOP, 0); appendOp(URX_NOP, 0); // On the Parentheses stack, start a new frame and add the positions // of the two NOPs. fParenStack.push(fModeFlags, *fStatus); // Match mode state fParenStack.push(plain, *fStatus); // Begin a new frame. fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP location fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP loc } break; case doOpenAtomicParen: // Open Atomic Paren. 
(?> // Compile to a // - NOP, which later may be replaced if the parenthesized group // has a quantifier, followed by // - STO_SP save state stack position, so it can be restored at the ")" // - NOP, which may later be replaced by a save-state if there // is an '|' alternation within the parens. { fixLiterals(); appendOp(URX_NOP, 0); int32_t varLoc = allocateData(1); // Reserve a data location for saving the state stack ptr. appendOp(URX_STO_SP, varLoc); appendOp(URX_NOP, 0); // On the Parentheses stack, start a new frame and add the positions // of the two NOPs. Depending on what follows in the pattern, the // NOPs may be changed to SAVE_STATE or JMP ops, with a target // address of the end of the parenthesized group. fParenStack.push(fModeFlags, *fStatus); // Match mode state fParenStack.push(atomic, *fStatus); // Frame type. fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus); // The first NOP fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP } break; case doOpenLookAhead: // Positive Look-ahead (?= stuff ) // // Note: Addition of transparent input regions, with the need to // restore the original regions when failing out of a lookahead // block, complicated this sequence. Some combined opcodes // might make sense - or might not, lookahead aren't that common. // // Caution: min match length optimization knows about this // sequence; don't change without making updates there too. // // Compiles to // 1 LA_START dataLoc Saves SP, Input Pos, Active input region. // 2. STATE_SAVE 4 on failure of lookahead, goto 4 // 3 JMP 6 continue ... // // 4. LA_END Look Ahead failed. Restore regions. // 5. BACKTRACK and back track again. // // 6. NOP reserved for use by quantifiers on the block. // Look-ahead can't have quantifiers, but paren stack // compile time conventions require the slot anyhow. // 7. NOP may be replaced if there is are '|' ops in the block. // 8. code for parenthesized stuff. // 9. 
LA_END // // Four data slots are reserved, for saving state on entry to the look-around // 0: stack pointer on entry. // 1: input position on entry. // 2: fActiveStart, the active bounds start on entry. // 3: fActiveLimit, the active bounds limit on entry. { fixLiterals(); int32_t dataLoc = allocateData(4); appendOp(URX_LA_START, dataLoc); appendOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+ 2); appendOp(URX_JMP, fRXPat->fCompiledPat->size()+ 3); appendOp(URX_LA_END, dataLoc); appendOp(URX_BACKTRACK, 0); appendOp(URX_NOP, 0); appendOp(URX_NOP, 0); // On the Parentheses stack, start a new frame and add the positions // of the NOPs. fParenStack.push(fModeFlags, *fStatus); // Match mode state fParenStack.push(lookAhead, *fStatus); // Frame type. fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP location fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP location } break; case doOpenLookAheadNeg: // Negated Lookahead. (?! stuff ) // Compiles to // 1. LA_START dataloc // 2. SAVE_STATE 7 // Fail within look-ahead block restores to this state, // // which continues with the match. // 3. NOP // Std. Open Paren sequence, for possible '|' // 4. code for parenthesized stuff. // 5. LA_END // Cut back stack, remove saved state from step 2. // 6. BACKTRACK // code in block succeeded, so neg. lookahead fails. // 7. END_LA // Restore match region, in case look-ahead was using // an alternate (transparent) region. // Four data slots are reserved, for saving state on entry to the look-around // 0: stack pointer on entry. // 1: input position on entry. // 2: fActiveStart, the active bounds start on entry. // 3: fActiveLimit, the active bounds limit on entry. { fixLiterals(); int32_t dataLoc = allocateData(4); appendOp(URX_LA_START, dataLoc); appendOp(URX_STATE_SAVE, 0); // dest address will be patched later. appendOp(URX_NOP, 0); // On the Parentheses stack, start a new frame and add the positions // of the StateSave and NOP. 
fParenStack.push(fModeFlags, *fStatus); // Match mode state fParenStack.push(negLookAhead, *fStatus); // Frame type fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The STATE_SAVE location fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP location // Instructions #5 - #7 will be added when the ')' is encountered. } break; case doOpenLookBehind: { // Compile a (?<= look-behind open paren. // // Compiles to // 0 URX_LB_START dataLoc // 1 URX_LB_CONT dataLoc // 2 MinMatchLen // 3 MaxMatchLen // 4 URX_NOP Standard '(' boilerplate. // 5 URX_NOP Reserved slot for use with '|' ops within (block). // 6 // 7 URX_LB_END dataLoc # Check match len, restore input len // 8 URX_LA_END dataLoc # Restore stack, input pos // // Allocate a block of matcher data, to contain (when running a match) // 0: Stack ptr on entry // 1: Input Index on entry // 2: fActiveStart, the active bounds start on entry. // 3: fActiveLimit, the active bounds limit on entry. // 4: Start index of match current match attempt. // The first four items must match the layout of data for LA_START / LA_END // Generate match code for any pending literals. fixLiterals(); // Allocate data space int32_t dataLoc = allocateData(5); // Emit URX_LB_START appendOp(URX_LB_START, dataLoc); // Emit URX_LB_CONT appendOp(URX_LB_CONT, dataLoc); appendOp(URX_RESERVED_OP, 0); // MinMatchLength. To be filled later. appendOp(URX_RESERVED_OP, 0); // MaxMatchLength. To be filled later. // Emit the NOPs appendOp(URX_NOP, 0); appendOp(URX_NOP, 0); // On the Parentheses stack, start a new frame and add the positions // of the URX_LB_CONT and the NOP. 
fParenStack.push(fModeFlags, *fStatus); // Match mode state fParenStack.push(lookBehind, *fStatus); // Frame type fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP location fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The 2nd NOP location // The final two instructions will be added when the ')' is encountered. } break; case doOpenLookBehindNeg: { // Compile a (? // 8 URX_LBN_END dataLoc # Check match len, cause a FAIL // 9 ... // // Allocate a block of matcher data, to contain (when running a match) // 0: Stack ptr on entry // 1: Input Index on entry // 2: fActiveStart, the active bounds start on entry. // 3: fActiveLimit, the active bounds limit on entry. // 4: Start index of match current match attempt. // The first four items must match the layout of data for LA_START / LA_END // Generate match code for any pending literals. fixLiterals(); // Allocate data space int32_t dataLoc = allocateData(5); // Emit URX_LB_START appendOp(URX_LB_START, dataLoc); // Emit URX_LBN_CONT appendOp(URX_LBN_CONT, dataLoc); appendOp(URX_RESERVED_OP, 0); // MinMatchLength. To be filled later. appendOp(URX_RESERVED_OP, 0); // MaxMatchLength. To be filled later. appendOp(URX_RESERVED_OP, 0); // Continue Loc. To be filled later. // Emit the NOPs appendOp(URX_NOP, 0); appendOp(URX_NOP, 0); // On the Parentheses stack, start a new frame and add the positions // of the URX_LB_CONT and the NOP. fParenStack.push(fModeFlags, *fStatus); // Match mode state fParenStack.push(lookBehindN, *fStatus); // Frame type fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP location fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The 2nd NOP location // The final two instructions will be added when the ')' is encountered. } break; case doConditionalExpr: // Conditionals such as (?(1)a:b) case doPerlInline: // Perl inline-conditionals. (?{perl code}a|b) We're not perl, no way to do them. 
error(U_REGEX_UNIMPLEMENTED); break; case doCloseParen: handleCloseParen(); if (fParenStack.size() <= 0) { // Extra close paren, or missing open paren. error(U_REGEX_MISMATCHED_PAREN); } break; case doNOP: break; case doBadOpenParenType: case doRuleError: error(U_REGEX_RULE_SYNTAX); break; case doMismatchedParenErr: error(U_REGEX_MISMATCHED_PAREN); break; case doPlus: // Normal '+' compiles to // 1. stuff to be repeated (already built) // 2. jmp-sav 1 // 3. ... // // Or, if the item to be repeated can match a zero length string, // 1. STO_INP_LOC data-loc // 2. body of stuff to be repeated // 3. JMP_SAV_X 2 // 4. ... // // Or, if the item to be repeated is simple // 1. Item to be repeated. // 2. LOOP_SR_I set number (assuming repeated item is a set ref) // 3. LOOP_C stack location { int32_t topLoc = blockTopLoc(false); // location of item #1 int32_t frameLoc; // Check for simple constructs, which may get special optimized code. if (topLoc == fRXPat->fCompiledPat->size() - 1) { int32_t repeatedOp = (int32_t)fRXPat->fCompiledPat->elementAti(topLoc); if (URX_TYPE(repeatedOp) == URX_SETREF) { // Emit optimized code for [char set]+ appendOp(URX_LOOP_SR_I, URX_VAL(repeatedOp)); frameLoc = allocateStackData(1); appendOp(URX_LOOP_C, frameLoc); break; } if (URX_TYPE(repeatedOp) == URX_DOTANY || URX_TYPE(repeatedOp) == URX_DOTANY_ALL || URX_TYPE(repeatedOp) == URX_DOTANY_UNIX) { // Emit Optimized code for .+ operations. int32_t loopOpI = buildOp(URX_LOOP_DOT_I, 0); if (URX_TYPE(repeatedOp) == URX_DOTANY_ALL) { // URX_LOOP_DOT_I operand is a flag indicating ". matches any" mode. loopOpI |= 1; } if (fModeFlags & UREGEX_UNIX_LINES) { loopOpI |= 2; } appendOp(loopOpI); frameLoc = allocateStackData(1); appendOp(URX_LOOP_C, frameLoc); break; } } // General case. // Check for minimum match length of zero, which requires // extra loop-breaking code. if (minMatchLength(topLoc, fRXPat->fCompiledPat->size()-1) == 0) { // Zero length match is possible. 
// Emit the code sequence that can handle it. insertOp(topLoc); frameLoc = allocateStackData(1); int32_t op = buildOp(URX_STO_INP_LOC, frameLoc); fRXPat->fCompiledPat->setElementAt(op, topLoc); appendOp(URX_JMP_SAV_X, topLoc+1); } else { // Simpler code when the repeated body must match something non-empty appendOp(URX_JMP_SAV, topLoc); } } break; case doNGPlus: // Non-greedy '+?' compiles to // 1. stuff to be repeated (already built) // 2. state-save 1 // 3. ... { int32_t topLoc = blockTopLoc(false); appendOp(URX_STATE_SAVE, topLoc); } break; case doOpt: // Normal (greedy) ? quantifier. // Compiles to // 1. state save 3 // 2. body of optional block // 3. ... // Insert the state save into the compiled pattern, and we're done. { int32_t saveStateLoc = blockTopLoc(true); int32_t saveStateOp = buildOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size()); fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc); } break; case doNGOpt: // Non-greedy ?? quantifier // compiles to // 1. jmp 4 // 2. body of optional block // 3 jmp 5 // 4. state save 2 // 5 ... // This code is less than ideal, with two jmps instead of one, because we can only // insert one instruction at the top of the block being iterated. { int32_t jmp1_loc = blockTopLoc(true); int32_t jmp2_loc = fRXPat->fCompiledPat->size(); int32_t jmp1_op = buildOp(URX_JMP, jmp2_loc+1); fRXPat->fCompiledPat->setElementAt(jmp1_op, jmp1_loc); appendOp(URX_JMP, jmp2_loc+2); appendOp(URX_STATE_SAVE, jmp1_loc+1); } break; case doStar: // Normal (greedy) * quantifier. // Compiles to // 1. STATE_SAVE 4 // 2. body of stuff being iterated over // 3. JMP_SAV 2 // 4. ... // // Or, if the body is a simple [Set], // 1. LOOP_SR_I set number // 2. LOOP_C stack location // ... // // Or if this is a .* // 1. LOOP_DOT_I (. matches all mode flag) // 2. LOOP_C stack location // // Or, if the body can match a zero-length string, to inhibit infinite loops, // 1. STATE_SAVE 5 // 2. STO_INP_LOC data-loc // 3. body of stuff // 4. 
JMP_SAV_X 2 // 5. ... { // location of item #1, the STATE_SAVE int32_t topLoc = blockTopLoc(false); int32_t dataLoc = -1; // Check for simple *, where the construct being repeated // compiled to single opcode, and might be optimizable. if (topLoc == fRXPat->fCompiledPat->size() - 1) { int32_t repeatedOp = (int32_t)fRXPat->fCompiledPat->elementAti(topLoc); if (URX_TYPE(repeatedOp) == URX_SETREF) { // Emit optimized code for a [char set]* int32_t loopOpI = buildOp(URX_LOOP_SR_I, URX_VAL(repeatedOp)); fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc); dataLoc = allocateStackData(1); appendOp(URX_LOOP_C, dataLoc); break; } if (URX_TYPE(repeatedOp) == URX_DOTANY || URX_TYPE(repeatedOp) == URX_DOTANY_ALL || URX_TYPE(repeatedOp) == URX_DOTANY_UNIX) { // Emit Optimized code for .* operations. int32_t loopOpI = buildOp(URX_LOOP_DOT_I, 0); if (URX_TYPE(repeatedOp) == URX_DOTANY_ALL) { // URX_LOOP_DOT_I operand is a flag indicating . matches any mode. loopOpI |= 1; } if ((fModeFlags & UREGEX_UNIX_LINES) != 0) { loopOpI |= 2; } fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc); dataLoc = allocateStackData(1); appendOp(URX_LOOP_C, dataLoc); break; } } // Emit general case code for this * // The optimizations did not apply. int32_t saveStateLoc = blockTopLoc(true); int32_t jmpOp = buildOp(URX_JMP_SAV, saveStateLoc+1); // Check for minimum match length of zero, which requires // extra loop-breaking code. if (minMatchLength(saveStateLoc, fRXPat->fCompiledPat->size()-1) == 0) { insertOp(saveStateLoc); dataLoc = allocateStackData(1); int32_t op = buildOp(URX_STO_INP_LOC, dataLoc); fRXPat->fCompiledPat->setElementAt(op, saveStateLoc+1); jmpOp = buildOp(URX_JMP_SAV_X, saveStateLoc+2); } // Locate the position in the compiled pattern where the match will continue // after completing the *. (4 or 5 in the comment above) int32_t continueLoc = fRXPat->fCompiledPat->size()+1; // Put together the save state op and store it into the compiled code. 
int32_t saveStateOp = buildOp(URX_STATE_SAVE, continueLoc); fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc); // Append the URX_JMP_SAV or URX_JMPX operation to the compiled pattern. appendOp(jmpOp); } break; case doNGStar: // Non-greedy *? quantifier // compiles to // 1. JMP 3 // 2. body of stuff being iterated over // 3. STATE_SAVE 2 // 4 ... { int32_t jmpLoc = blockTopLoc(true); // loc 1. int32_t saveLoc = fRXPat->fCompiledPat->size(); // loc 3. int32_t jmpOp = buildOp(URX_JMP, saveLoc); fRXPat->fCompiledPat->setElementAt(jmpOp, jmpLoc); appendOp(URX_STATE_SAVE, jmpLoc+1); } break; case doIntervalInit: // The '{' opening an interval quantifier was just scanned. // Init the counter variables that will accumulate the values as the digits // are scanned. fIntervalLow = 0; fIntervalUpper = -1; break; case doIntevalLowerDigit: // Scanned a digit from the lower value of an {lower,upper} interval { int32_t digitValue = u_charDigitValue(fC.fChar); U_ASSERT(digitValue >= 0); int64_t val = (int64_t)fIntervalLow*10 + digitValue; if (val > INT32_MAX) { error(U_REGEX_NUMBER_TOO_BIG); } else { fIntervalLow = (int32_t)val; } } break; case doIntervalUpperDigit: // Scanned a digit from the upper value of an {lower,upper} interval { if (fIntervalUpper < 0) { fIntervalUpper = 0; } int32_t digitValue = u_charDigitValue(fC.fChar); U_ASSERT(digitValue >= 0); int64_t val = (int64_t)fIntervalUpper*10 + digitValue; if (val > INT32_MAX) { error(U_REGEX_NUMBER_TOO_BIG); } else { fIntervalUpper = (int32_t)val; } } break; case doIntervalSame: // Scanned a single value interval like {27}. Upper = Lower. fIntervalUpper = fIntervalLow; break; case doInterval: // Finished scanning a normal {lower,upper} interval. Generate the code for it. if (compileInlineInterval() == false) { compileInterval(URX_CTR_INIT, URX_CTR_LOOP); } break; case doPossessiveInterval: // Finished scanning a Possessive {lower,upper}+ interval. Generate the code for it. 
{ // Remember the loc for the top of the block being looped over. // (Can not reserve a slot in the compiled pattern at this time, because // compileInterval needs to reserve also, and blockTopLoc can only reserve // once per block.) int32_t topLoc = blockTopLoc(false); // Produce normal looping code. compileInterval(URX_CTR_INIT, URX_CTR_LOOP); // Surround the just-emitted normal looping code with a STO_SP ... LD_SP // just as if the loop was inclosed in atomic parentheses. // First the STO_SP before the start of the loop insertOp(topLoc); int32_t varLoc = allocateData(1); // Reserve a data location for saving the int32_t op = buildOp(URX_STO_SP, varLoc); fRXPat->fCompiledPat->setElementAt(op, topLoc); int32_t loopOp = (int32_t)fRXPat->fCompiledPat->popi(); U_ASSERT(URX_TYPE(loopOp) == URX_CTR_LOOP && URX_VAL(loopOp) == topLoc); loopOp++; // point LoopOp after the just-inserted STO_SP fRXPat->fCompiledPat->push(loopOp, *fStatus); // Then the LD_SP after the end of the loop appendOp(URX_LD_SP, varLoc); } break; case doNGInterval: // Finished scanning a non-greedy {lower,upper}? interval. Generate the code for it. compileInterval(URX_CTR_INIT_NG, URX_CTR_LOOP_NG); break; case doIntervalError: error(U_REGEX_BAD_INTERVAL); break; case doLiteralChar: // We've just scanned a "normal" character from the pattern, literalChar(fC.fChar); break; case doEscapedLiteralChar: // We've just scanned an backslashed escaped character with no // special meaning. It represents itself. if ((fModeFlags & UREGEX_ERROR_ON_UNKNOWN_ESCAPES) != 0 && ((fC.fChar >= 0x41 && fC.fChar<= 0x5A) || // in [A-Z] (fC.fChar >= 0x61 && fC.fChar <= 0x7a))) { // in [a-z] error(U_REGEX_BAD_ESCAPE_SEQUENCE); } literalChar(fC.fChar); break; case doDotAny: // scanned a ".", match any single character. 
{ fixLiterals(false); if (fModeFlags & UREGEX_DOTALL) { appendOp(URX_DOTANY_ALL, 0); } else if (fModeFlags & UREGEX_UNIX_LINES) { appendOp(URX_DOTANY_UNIX, 0); } else { appendOp(URX_DOTANY, 0); } } break; case doCaret: { fixLiterals(false); if ( (fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) { appendOp(URX_CARET, 0); } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) { appendOp(URX_CARET_M, 0); } else if ((fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) { appendOp(URX_CARET, 0); // Only testing true start of input. } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) { appendOp(URX_CARET_M_UNIX, 0); } } break; case doDollar: { fixLiterals(false); if ( (fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) { appendOp(URX_DOLLAR, 0); } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) { appendOp(URX_DOLLAR_M, 0); } else if ((fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) { appendOp(URX_DOLLAR_D, 0); } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) { appendOp(URX_DOLLAR_MD, 0); } } break; case doBackslashA: fixLiterals(false); appendOp(URX_CARET, 0); break; case doBackslashB: { #if UCONFIG_NO_BREAK_ITERATION==1 if (fModeFlags & UREGEX_UWORD) { error(U_UNSUPPORTED_ERROR); } #endif fixLiterals(false); int32_t op = (fModeFlags & UREGEX_UWORD)? URX_BACKSLASH_BU : URX_BACKSLASH_B; appendOp(op, 1); } break; case doBackslashb: { #if UCONFIG_NO_BREAK_ITERATION==1 if (fModeFlags & UREGEX_UWORD) { error(U_UNSUPPORTED_ERROR); } #endif fixLiterals(false); int32_t op = (fModeFlags & UREGEX_UWORD)? 
URX_BACKSLASH_BU : URX_BACKSLASH_B; appendOp(op, 0); } break; case doBackslashD: fixLiterals(false); appendOp(URX_BACKSLASH_D, 1); break; case doBackslashd: fixLiterals(false); appendOp(URX_BACKSLASH_D, 0); break; case doBackslashG: fixLiterals(false); appendOp(URX_BACKSLASH_G, 0); break; case doBackslashH: fixLiterals(false); appendOp(URX_BACKSLASH_H, 1); break; case doBackslashh: fixLiterals(false); appendOp(URX_BACKSLASH_H, 0); break; case doBackslashR: fixLiterals(false); appendOp(URX_BACKSLASH_R, 0); break; case doBackslashS: fixLiterals(false); appendOp(URX_STAT_SETREF_N, URX_ISSPACE_SET); break; case doBackslashs: fixLiterals(false); appendOp(URX_STATIC_SETREF, URX_ISSPACE_SET); break; case doBackslashV: fixLiterals(false); appendOp(URX_BACKSLASH_V, 1); break; case doBackslashv: fixLiterals(false); appendOp(URX_BACKSLASH_V, 0); break; case doBackslashW: fixLiterals(false); appendOp(URX_STAT_SETREF_N, URX_ISWORD_SET); break; case doBackslashw: fixLiterals(false); appendOp(URX_STATIC_SETREF, URX_ISWORD_SET); break; case doBackslashX: #if UCONFIG_NO_BREAK_ITERATION==1 // Grapheme Cluster Boundary requires ICU break iteration. error(U_UNSUPPORTED_ERROR); #endif fixLiterals(false); appendOp(URX_BACKSLASH_X, 0); break; case doBackslashZ: fixLiterals(false); appendOp(URX_DOLLAR, 0); break; case doBackslashz: fixLiterals(false); appendOp(URX_BACKSLASH_Z, 0); break; case doEscapeError: error(U_REGEX_BAD_ESCAPE_SEQUENCE); break; case doExit: fixLiterals(false); returnVal = false; break; case doProperty: { fixLiterals(false); UnicodeSet *theSet = scanProp(); compileSet(theSet); } break; case doNamedChar: { UChar32 c = scanNamedChar(); literalChar(c); } break; case doBackRef: // BackReference. Somewhat unusual in that the front-end can not completely parse // the regular expression, because the number of digits to be consumed // depends on the number of capture groups that have been defined. So // we have to do it here instead. 
{ int32_t numCaptureGroups = fRXPat->fGroupMap->size(); int32_t groupNum = 0; UChar32 c = fC.fChar; for (;;) { // Loop once per digit, for max allowed number of digits in a back reference. int32_t digit = u_charDigitValue(c); groupNum = groupNum * 10 + digit; if (groupNum >= numCaptureGroups) { break; } c = peekCharLL(); if (RegexStaticSets::gStaticSets->fRuleDigitsAlias->contains(c) == false) { break; } nextCharLL(); } // Scan of the back reference in the source regexp is complete. Now generate // the compiled code for it. // Because capture groups can be forward-referenced by back-references, // we fill the operand with the capture group number. At the end // of compilation, it will be changed to the variable's location. U_ASSERT(groupNum > 0); // Shouldn't happen. '\0' begins an octal escape sequence, // and shouldn't enter this code path at all. fixLiterals(false); if (fModeFlags & UREGEX_CASE_INSENSITIVE) { appendOp(URX_BACKREF_I, groupNum); } else { appendOp(URX_BACKREF, groupNum); } } break; case doBeginNamedBackRef: U_ASSERT(fCaptureName == nullptr); fCaptureName = new UnicodeString; if (fCaptureName == nullptr) { error(U_MEMORY_ALLOCATION_ERROR); } break; case doContinueNamedBackRef: fCaptureName->append(fC.fChar); break; case doCompleteNamedBackRef: { int32_t groupNumber = fRXPat->fNamedCaptureMap ? uhash_geti(fRXPat->fNamedCaptureMap, fCaptureName) : 0; if (groupNumber == 0) { // Group name has not been defined. // Could be a forward reference. If we choose to support them at some // future time, extra mechanism will be required at this point. error(U_REGEX_INVALID_CAPTURE_GROUP_NAME); } else { // Given the number, handle identically to a \n numbered back reference. 
// See comments above, under doBackRef fixLiterals(false); if (fModeFlags & UREGEX_CASE_INSENSITIVE) { appendOp(URX_BACKREF_I, groupNumber); } else { appendOp(URX_BACKREF, groupNumber); } } delete fCaptureName; fCaptureName = nullptr; break; } case doPossessivePlus: // Possessive ++ quantifier. // Compiles to // 1. STO_SP // 2. body of stuff being iterated over // 3. STATE_SAVE 5 // 4. JMP 2 // 5. LD_SP // 6. ... // // Note: TODO: This is pretty inefficient. A mass of saved state is built up // then unconditionally discarded. Perhaps introduce a new opcode. Ticket 6056 // { // Emit the STO_SP int32_t topLoc = blockTopLoc(true); int32_t stoLoc = allocateData(1); // Reserve the data location for storing save stack ptr. int32_t op = buildOp(URX_STO_SP, stoLoc); fRXPat->fCompiledPat->setElementAt(op, topLoc); // Emit the STATE_SAVE appendOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+2); // Emit the JMP appendOp(URX_JMP, topLoc+1); // Emit the LD_SP appendOp(URX_LD_SP, stoLoc); } break; case doPossessiveStar: // Possessive *+ quantifier. // Compiles to // 1. STO_SP loc // 2. STATE_SAVE 5 // 3. body of stuff being iterated over // 4. JMP 2 // 5. LD_SP loc // 6 ... // TODO: do something to cut back the state stack each time through the loop. { // Reserve two slots at the top of the block. int32_t topLoc = blockTopLoc(true); insertOp(topLoc); // emit STO_SP loc int32_t stoLoc = allocateData(1); // Reserve the data location for storing save stack ptr. int32_t op = buildOp(URX_STO_SP, stoLoc); fRXPat->fCompiledPat->setElementAt(op, topLoc); // Emit the SAVE_STATE 5 int32_t L7 = fRXPat->fCompiledPat->size()+1; op = buildOp(URX_STATE_SAVE, L7); fRXPat->fCompiledPat->setElementAt(op, topLoc+1); // Append the JMP operation. appendOp(URX_JMP, topLoc+1); // Emit the LD_SP loc appendOp(URX_LD_SP, stoLoc); } break; case doPossessiveOpt: // Possessive ?+ quantifier. // Compiles to // 1. STO_SP loc // 2. SAVE_STATE 5 // 3. body of optional block // 4. LD_SP loc // 5. ... 
// { // Reserve two slots at the top of the block. int32_t topLoc = blockTopLoc(true); insertOp(topLoc); // Emit the STO_SP int32_t stoLoc = allocateData(1); // Reserve the data location for storing save stack ptr. int32_t op = buildOp(URX_STO_SP, stoLoc); fRXPat->fCompiledPat->setElementAt(op, topLoc); // Emit the SAVE_STATE int32_t continueLoc = fRXPat->fCompiledPat->size()+1; op = buildOp(URX_STATE_SAVE, continueLoc); fRXPat->fCompiledPat->setElementAt(op, topLoc+1); // Emit the LD_SP appendOp(URX_LD_SP, stoLoc); } break; case doBeginMatchMode: fNewModeFlags = fModeFlags; fSetModeFlag = true; break; case doMatchMode: // (?i) and similar { int32_t bit = 0; switch (fC.fChar) { case 0x69: /* 'i' */ bit = UREGEX_CASE_INSENSITIVE; break; case 0x64: /* 'd' */ bit = UREGEX_UNIX_LINES; break; case 0x6d: /* 'm' */ bit = UREGEX_MULTILINE; break; case 0x73: /* 's' */ bit = UREGEX_DOTALL; break; case 0x75: /* 'u' */ bit = 0; /* Unicode casing */ break; case 0x77: /* 'w' */ bit = UREGEX_UWORD; break; case 0x78: /* 'x' */ bit = UREGEX_COMMENTS; break; case 0x2d: /* '-' */ fSetModeFlag = false; break; default: UPRV_UNREACHABLE_EXIT; // Should never happen. Other chars are filtered out // by the scanner. } if (fSetModeFlag) { fNewModeFlags |= bit; } else { fNewModeFlags &= ~bit; } } break; case doSetMatchMode: // Emit code to match any pending literals, using the not-yet changed match mode. fixLiterals(); // We've got a (?i) or similar. The match mode is being changed, but // the change is not scoped to a parenthesized block. U_ASSERT(fNewModeFlags < 0); fModeFlags = fNewModeFlags; break; case doMatchModeParen: // We've got a (?i: or similar. Begin a parenthesized block, save old // mode flags so they can be restored at the close of the block. 
// // Compile to a // - NOP, which later may be replaced by a save-state if the // parenthesized group gets a * quantifier, followed by // - NOP, which may later be replaced by a save-state if there // is an '|' alternation within the parens. { fixLiterals(false); appendOp(URX_NOP, 0); appendOp(URX_NOP, 0); // On the Parentheses stack, start a new frame and add the positions // of the two NOPs (a normal non-capturing () frame, except for the // saving of the original mode flags.) fParenStack.push(fModeFlags, *fStatus); fParenStack.push(flags, *fStatus); // Frame Marker fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP // Set the current mode flags to the new values. U_ASSERT(fNewModeFlags < 0); fModeFlags = fNewModeFlags; } break; case doBadModeFlag: error(U_REGEX_INVALID_FLAG); break; case doSuppressComments: // We have just scanned a '(?'. We now need to prevent the character scanner from // treating a '#' as a to-the-end-of-line comment. // (This Perl compatibility just gets uglier and uglier to do...) fEOLComments = false; break; case doSetAddAmp: { UnicodeSet *set = (UnicodeSet *)fSetStack.peek(); set->add(chAmp); } break; case doSetAddDash: { UnicodeSet *set = (UnicodeSet *)fSetStack.peek(); set->add(chDash); } break; case doSetBackslash_s: { UnicodeSet *set = (UnicodeSet *)fSetStack.peek(); set->addAll(RegexStaticSets::gStaticSets->fPropSets[URX_ISSPACE_SET]); break; } case doSetBackslash_S: { UnicodeSet *set = (UnicodeSet *)fSetStack.peek(); UnicodeSet SSet; SSet.addAll(RegexStaticSets::gStaticSets->fPropSets[URX_ISSPACE_SET]).complement(); set->addAll(SSet); break; } case doSetBackslash_d: { UnicodeSet *set = (UnicodeSet *)fSetStack.peek(); // TODO - make a static set, ticket 6058. 
addCategory(set, U_GC_ND_MASK, *fStatus); break; } case doSetBackslash_D: { UnicodeSet *set = (UnicodeSet *)fSetStack.peek(); UnicodeSet digits; // TODO - make a static set, ticket 6058. digits.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ND_MASK, *fStatus); digits.complement(); set->addAll(digits); break; } case doSetBackslash_h: { UnicodeSet *set = (UnicodeSet *)fSetStack.peek(); UnicodeSet h; h.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MASK, *fStatus); h.add((UChar32)9); // Tab set->addAll(h); break; } case doSetBackslash_H: { UnicodeSet *set = (UnicodeSet *)fSetStack.peek(); UnicodeSet h; h.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MASK, *fStatus); h.add((UChar32)9); // Tab h.complement(); set->addAll(h); break; } case doSetBackslash_v: { UnicodeSet *set = (UnicodeSet *)fSetStack.peek(); set->add((UChar32)0x0a, (UChar32)0x0d); // add range set->add((UChar32)0x85); set->add((UChar32)0x2028, (UChar32)0x2029); break; } case doSetBackslash_V: { UnicodeSet *set = (UnicodeSet *)fSetStack.peek(); UnicodeSet v; v.add((UChar32)0x0a, (UChar32)0x0d); // add range v.add((UChar32)0x85); v.add((UChar32)0x2028, (UChar32)0x2029); v.complement(); set->addAll(v); break; } case doSetBackslash_w: { UnicodeSet *set = (UnicodeSet *)fSetStack.peek(); set->addAll(RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET]); break; } case doSetBackslash_W: { UnicodeSet *set = (UnicodeSet *)fSetStack.peek(); UnicodeSet SSet; SSet.addAll(RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET]).complement(); set->addAll(SSet); break; } case doSetBegin: { fixLiterals(false); LocalPointer lpSet(new UnicodeSet(), *fStatus); fSetStack.push(lpSet.orphan(), *fStatus); fSetOpStack.push(setStart, *fStatus); if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) { fSetOpStack.push(setCaseClose, *fStatus); } break; } case doSetBeginDifference1: // We have scanned something like [[abc]-[ // Set up a new UnicodeSet for the set beginning with the just-scanned '[' // 
Push a Difference operator, which will cause the new set to be subtracted from what // went before once it is created. setPushOp(setDifference1); fSetOpStack.push(setStart, *fStatus); if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) { fSetOpStack.push(setCaseClose, *fStatus); } break; case doSetBeginIntersection1: // We have scanned something like [[abc]&[ // Need both the '&' operator and the open '[' operator. setPushOp(setIntersection1); fSetOpStack.push(setStart, *fStatus); if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) { fSetOpStack.push(setCaseClose, *fStatus); } break; case doSetBeginUnion: // We have scanned something like [[abc][ // Need to handle the union operation explicitly [[abc] | [ setPushOp(setUnion); fSetOpStack.push(setStart, *fStatus); if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) { fSetOpStack.push(setCaseClose, *fStatus); } break; case doSetDifference2: // We have scanned something like [abc-- // Consider this to unambiguously be a set difference operator. setPushOp(setDifference2); break; case doSetEnd: // Have encountered the ']' that closes a set. // Force the evaluation of any pending operations within this set, // leave the completed set on the top of the set stack. setEval(setEnd); U_ASSERT(fSetOpStack.peeki()==setStart); fSetOpStack.popi(); break; case doSetFinish: { // Finished a complete set expression, including all nested sets. // The close bracket has already triggered clearing out pending set operators, // the operator stack should be empty and the operand stack should have just // one entry, the result set. U_ASSERT(fSetOpStack.empty()); UnicodeSet *theSet = (UnicodeSet *)fSetStack.pop(); U_ASSERT(fSetStack.empty()); compileSet(theSet); break; } case doSetIntersection2: // Have scanned something like [abc&& setPushOp(setIntersection2); break; case doSetLiteral: // Union the just-scanned literal character into the set being built. 
// This operation is the highest precedence set operation, so we can always do // it immediately, without waiting to see what follows. It is necessary to perform // any pending '-' or '&' operation first, because these have the same precedence // as union-ing in a literal' { setEval(setUnion); UnicodeSet *s = (UnicodeSet *)fSetStack.peek(); s->add(fC.fChar); fLastSetLiteral = fC.fChar; break; } case doSetLiteralEscaped: // A back-slash escaped literal character was encountered. // Processing is the same as with setLiteral, above, with the addition of // the optional check for errors on escaped ASCII letters. { if ((fModeFlags & UREGEX_ERROR_ON_UNKNOWN_ESCAPES) != 0 && ((fC.fChar >= 0x41 && fC.fChar<= 0x5A) || // in [A-Z] (fC.fChar >= 0x61 && fC.fChar <= 0x7a))) { // in [a-z] error(U_REGEX_BAD_ESCAPE_SEQUENCE); } setEval(setUnion); UnicodeSet *s = (UnicodeSet *)fSetStack.peek(); s->add(fC.fChar); fLastSetLiteral = fC.fChar; break; } case doSetNamedChar: // Scanning a \N{UNICODE CHARACTER NAME} // Aside from the source of the character, the processing is identical to doSetLiteral, // above. { UChar32 c = scanNamedChar(); setEval(setUnion); UnicodeSet *s = (UnicodeSet *)fSetStack.peek(); s->add(c); fLastSetLiteral = c; break; } case doSetNamedRange: // We have scanned literal-\N{CHAR NAME}. Add the range to the set. // The left character is already in the set, and is saved in fLastSetLiteral. // The right side needs to be picked up, the scan is at the 'N'. // Lower Limit > Upper limit being an error matches both Java // and ICU UnicodeSet behavior. { UChar32 c = scanNamedChar(); if (U_SUCCESS(*fStatus) && (fLastSetLiteral == U_SENTINEL || fLastSetLiteral > c)) { error(U_REGEX_INVALID_RANGE); } UnicodeSet *s = (UnicodeSet *)fSetStack.peek(); s->add(fLastSetLiteral, c); fLastSetLiteral = c; break; } case doSetNegate: // Scanned a '^' at the start of a set. // Push the negation operator onto the set op stack. 
// A twist for case-insensitive matching: // the case closure operation must happen _before_ negation. // But the case closure operation will already be on the stack if it's required. // This requires checking for case closure, and swapping the stack order // if it is present. { int32_t tosOp = fSetOpStack.peeki(); if (tosOp == setCaseClose) { fSetOpStack.popi(); fSetOpStack.push(setNegation, *fStatus); fSetOpStack.push(setCaseClose, *fStatus); } else { fSetOpStack.push(setNegation, *fStatus); } } break; case doSetNoCloseError: error(U_REGEX_MISSING_CLOSE_BRACKET); break; case doSetOpError: error(U_REGEX_RULE_SYNTAX); // -- or && at the end of a set. Illegal. break; case doSetPosixProp: { UnicodeSet *s = scanPosixProp(); if (s != nullptr) { UnicodeSet *tos = (UnicodeSet *)fSetStack.peek(); tos->addAll(*s); delete s; } // else error. scanProp() reported the error status already. } break; case doSetProp: // Scanned a \p \P within [brackets]. { UnicodeSet *s = scanProp(); if (s != nullptr) { UnicodeSet *tos = (UnicodeSet *)fSetStack.peek(); tos->addAll(*s); delete s; } // else error. scanProp() reported the error status already. } break; case doSetRange: // We have scanned literal-literal. Add the range to the set. // The left character is already in the set, and is saved in fLastSetLiteral. // The right side is the current character. // Lower Limit > Upper limit being an error matches both Java // and ICU UnicodeSet behavior. { if (fLastSetLiteral == U_SENTINEL || fLastSetLiteral > fC.fChar) { error(U_REGEX_INVALID_RANGE); } UnicodeSet *s = (UnicodeSet *)fSetStack.peek(); s->add(fLastSetLiteral, fC.fChar); break; } default: UPRV_UNREACHABLE_EXIT; } if (U_FAILURE(*fStatus)) { returnVal = false; } return returnVal; } //------------------------------------------------------------------------------ // // literalChar We've encountered a literal character from the pattern, // or an escape sequence that reduces to a character. 
// Add it to the string containing all literal chars/strings from // the pattern. // //------------------------------------------------------------------------------ void RegexCompile::literalChar(UChar32 c) { fLiteralChars.append(c); } //------------------------------------------------------------------------------ // // fixLiterals When compiling something that can follow a literal // string in a pattern, emit the code to match the // accumulated literal string. // // Optionally, split the last char of the string off into // a single "ONE_CHAR" operation, so that quantifiers can // apply to that char alone. Example: abc* // The * must apply to the 'c' only. // //------------------------------------------------------------------------------ void RegexCompile::fixLiterals(UBool split) { // If no literal characters have been scanned but not yet had code generated // for them, nothing needs to be done. if (fLiteralChars.length() == 0) { return; } int32_t indexOfLastCodePoint = fLiteralChars.moveIndex32(fLiteralChars.length(), -1); UChar32 lastCodePoint = fLiteralChars.char32At(indexOfLastCodePoint); // Split: We need to ensure that the last item in the compiled pattern // refers only to the last literal scanned in the pattern, so that // quantifiers (*, +, etc.) affect only it, and not a longer string. // Split before case folding for case insensitive matches. if (split) { fLiteralChars.truncate(indexOfLastCodePoint); fixLiterals(false); // Recursive call, emit code to match the first part of the string. // Note that the truncated literal string may be empty, in which case // nothing will be emitted. literalChar(lastCodePoint); // Re-add the last code point as if it were a new literal. fixLiterals(false); // Second recursive call, code for the final code point. return; } // If we are doing case-insensitive matching, case fold the string. This may expand // the string, e.g. 
the German sharp-s turns into "ss" if (fModeFlags & UREGEX_CASE_INSENSITIVE) { fLiteralChars.foldCase(); indexOfLastCodePoint = fLiteralChars.moveIndex32(fLiteralChars.length(), -1); lastCodePoint = fLiteralChars.char32At(indexOfLastCodePoint); } if (indexOfLastCodePoint == 0) { // Single character, emit a URX_ONECHAR op to match it. if ((fModeFlags & UREGEX_CASE_INSENSITIVE) && u_hasBinaryProperty(lastCodePoint, UCHAR_CASE_SENSITIVE)) { appendOp(URX_ONECHAR_I, lastCodePoint); } else { appendOp(URX_ONECHAR, lastCodePoint); } } else { // Two or more chars, emit a URX_STRING to match them. if (fLiteralChars.length() > 0x00ffffff || fRXPat->fLiteralText.length() > 0x00ffffff) { error(U_REGEX_PATTERN_TOO_BIG); } if (fModeFlags & UREGEX_CASE_INSENSITIVE) { appendOp(URX_STRING_I, fRXPat->fLiteralText.length()); } else { // TODO here: add optimization to split case sensitive strings of length two // into two single char ops, for efficiency. appendOp(URX_STRING, fRXPat->fLiteralText.length()); } appendOp(URX_STRING_LEN, fLiteralChars.length()); // Add this string into the accumulated strings of the compiled pattern. fRXPat->fLiteralText.append(fLiteralChars); } fLiteralChars.remove(); } int32_t RegexCompile::buildOp(int32_t type, int32_t val) { if (U_FAILURE(*fStatus)) { return 0; } if (type < 0 || type > 255) { UPRV_UNREACHABLE_EXIT; } if (val > 0x00ffffff) { UPRV_UNREACHABLE_EXIT; } if (val < 0) { if (!(type == URX_RESERVED_OP_N || type == URX_RESERVED_OP)) { UPRV_UNREACHABLE_EXIT; } if (URX_TYPE(val) != 0xff) { UPRV_UNREACHABLE_EXIT; } type = URX_RESERVED_OP_N; } return (type << 24) | val; } //------------------------------------------------------------------------------ // // appendOp() Append a new instruction onto the compiled pattern // Includes error checking, limiting the size of the // pattern to lengths that can be represented in the // 24 bit operand field of an instruction. 
// //------------------------------------------------------------------------------ void RegexCompile::appendOp(int32_t op) { if (U_FAILURE(*fStatus)) { return; } fRXPat->fCompiledPat->addElement(op, *fStatus); if ((fRXPat->fCompiledPat->size() > 0x00fffff0) && U_SUCCESS(*fStatus)) { error(U_REGEX_PATTERN_TOO_BIG); } } void RegexCompile::appendOp(int32_t type, int32_t val) { appendOp(buildOp(type, val)); } //------------------------------------------------------------------------------ // // insertOp() Insert a slot for a new opcode into the already // compiled pattern code. // // Fill the slot with a NOP. Our caller will replace it // with what they really wanted. // //------------------------------------------------------------------------------ void RegexCompile::insertOp(int32_t where) { UVector64 *code = fRXPat->fCompiledPat; U_ASSERT(where>0 && where < code->size()); int32_t nop = buildOp(URX_NOP, 0); code->insertElementAt(nop, where, *fStatus); // Walk through the pattern, looking for any ops with targets that // were moved down by the insert. Fix them. int32_t loc; for (loc=0; locsize(); loc++) { int32_t op = (int32_t)code->elementAti(loc); int32_t opType = URX_TYPE(op); int32_t opValue = URX_VAL(op); if ((opType == URX_JMP || opType == URX_JMPX || opType == URX_STATE_SAVE || opType == URX_CTR_LOOP || opType == URX_CTR_LOOP_NG || opType == URX_JMP_SAV || opType == URX_JMP_SAV_X || opType == URX_RELOC_OPRND) && opValue > where) { // Target location for this opcode is after the insertion point and // needs to be incremented to adjust for the insertion. opValue++; op = buildOp(opType, opValue); code->setElementAt(op, loc); } } // Now fix up the parentheses stack. All positive values in it are locations in // the compiled pattern. (Negative values are frame boundaries, and don't need fixing.) 
for (loc=0; locsize()); if (x>where) { x++; fParenStack.setElementAt(x, loc); } } if (fMatchCloseParen > where) { fMatchCloseParen++; } if (fMatchOpenParen > where) { fMatchOpenParen++; } } //------------------------------------------------------------------------------ // // allocateData() Allocate storage in the matcher's static data area. // Return the index for the newly allocated data. // The storage won't actually exist until we are running a match // operation, but the storage indexes are inserted into various // opcodes while compiling the pattern. // //------------------------------------------------------------------------------ int32_t RegexCompile::allocateData(int32_t size) { if (U_FAILURE(*fStatus)) { return 0; } if (size <= 0 || size > 0x100 || fRXPat->fDataSize < 0) { error(U_REGEX_INTERNAL_ERROR); return 0; } int32_t dataIndex = fRXPat->fDataSize; fRXPat->fDataSize += size; if (fRXPat->fDataSize >= 0x00fffff0) { error(U_REGEX_INTERNAL_ERROR); } return dataIndex; } //------------------------------------------------------------------------------ // // allocateStackData() Allocate space in the back-tracking stack frame. // Return the index for the newly allocated data. // The frame indexes are inserted into various // opcodes while compiling the pattern, meaning that frame // size must be restricted to the size that will fit // as an operand (24 bits). 
// //------------------------------------------------------------------------------ int32_t RegexCompile::allocateStackData(int32_t size) { if (U_FAILURE(*fStatus)) { return 0; } if (size <= 0 || size > 0x100 || fRXPat->fFrameSize < 0) { error(U_REGEX_INTERNAL_ERROR); return 0; } int32_t dataIndex = fRXPat->fFrameSize; fRXPat->fFrameSize += size; if (fRXPat->fFrameSize >= 0x00fffff0) { error(U_REGEX_PATTERN_TOO_BIG); } return dataIndex; } //------------------------------------------------------------------------------ // // blockTopLoc() Find or create a location in the compiled pattern // at the start of the operation or block that has // just been compiled. Needed when a quantifier (* or // whatever) appears, and we need to add an operation // at the start of the thing being quantified. // // (Parenthesized Blocks) have a slot with a NOP that // is reserved for this purpose. .* or similar don't // and a slot needs to be added. // // parameter reserveLoc : true - ensure that there is space to add an opcode // at the returned location. // false - just return the address, // do not reserve a location there. // //------------------------------------------------------------------------------ int32_t RegexCompile::blockTopLoc(UBool reserveLoc) { int32_t theLoc; fixLiterals(true); // Emit code for any pending literals. // If last item was a string, emit separate op for the its last char. if (fRXPat->fCompiledPat->size() == fMatchCloseParen) { // The item just processed is a parenthesized block. theLoc = fMatchOpenParen; // A slot is already reserved for us. U_ASSERT(theLoc > 0); U_ASSERT(URX_TYPE(((uint32_t)fRXPat->fCompiledPat->elementAti(theLoc))) == URX_NOP); } else { // Item just compiled is a single thing, a ".", or a single char, a string or a set reference. // No slot for STATE_SAVE was pre-reserved in the compiled code. // We need to make space now. 
theLoc = fRXPat->fCompiledPat->size()-1; int32_t opAtTheLoc = (int32_t)fRXPat->fCompiledPat->elementAti(theLoc); if (URX_TYPE(opAtTheLoc) == URX_STRING_LEN) { // Strings take two opcode, we want the position of the first one. // We can have a string at this point if a single character case-folded to two. theLoc--; } if (reserveLoc) { int32_t nop = buildOp(URX_NOP, 0); fRXPat->fCompiledPat->insertElementAt(nop, theLoc, *fStatus); } } return theLoc; } //------------------------------------------------------------------------------ // // handleCloseParen When compiling a close paren, we need to go back // and fix up any JMP or SAVE operations within the // parenthesized block that need to target the end // of the block. The locations of these are kept on // the paretheses stack. // // This function is called both when encountering a // real ) and at the end of the pattern. // //------------------------------------------------------------------------------ void RegexCompile::handleCloseParen() { int32_t patIdx; int32_t patOp; if (fParenStack.size() <= 0) { error(U_REGEX_MISMATCHED_PAREN); return; } // Emit code for any pending literals. fixLiterals(false); // Fixup any operations within the just-closed parenthesized group // that need to reference the end of the (block). // (The first one popped from the stack is an unused slot for // alternation (OR) state save, but applying the fixup to it does no harm.) for (;;) { patIdx = fParenStack.popi(); if (patIdx < 0) { // value < 0 flags the start of the frame on the paren stack. break; } U_ASSERT(patIdx>0 && patIdx <= fRXPat->fCompiledPat->size()); patOp = (int32_t)fRXPat->fCompiledPat->elementAti(patIdx); U_ASSERT(URX_VAL(patOp) == 0); // Branch target for JMP should not be set. patOp |= fRXPat->fCompiledPat->size(); // Set it now. 
fRXPat->fCompiledPat->setElementAt(patOp, patIdx); fMatchOpenParen = patIdx; } // At the close of any parenthesized block, restore the match mode flags to // the value they had at the open paren. Saved value is // at the top of the paren stack. fModeFlags = fParenStack.popi(); U_ASSERT(fModeFlags < 0); // DO any additional fixups, depending on the specific kind of // parentesized grouping this is switch (patIdx) { case plain: case flags: // No additional fixups required. // (Grouping-only parentheses) break; case capturing: // Capturing Parentheses. // Insert a End Capture op into the pattern. // The frame offset of the variables for this cg is obtained from the // start capture op and put it into the end-capture op. { int32_t captureOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen+1); U_ASSERT(URX_TYPE(captureOp) == URX_START_CAPTURE); int32_t frameVarLocation = URX_VAL(captureOp); appendOp(URX_END_CAPTURE, frameVarLocation); } break; case atomic: // Atomic Parenthesis. // Insert a LD_SP operation to restore the state stack to the position // it was when the atomic parens were entered. { int32_t stoOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen+1); U_ASSERT(URX_TYPE(stoOp) == URX_STO_SP); int32_t stoLoc = URX_VAL(stoOp); appendOp(URX_LD_SP, stoLoc); } break; case lookAhead: { int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-5); U_ASSERT(URX_TYPE(startOp) == URX_LA_START); int32_t dataLoc = URX_VAL(startOp); appendOp(URX_LA_END, dataLoc); } break; case negLookAhead: { // See comment at doOpenLookAheadNeg int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-1); U_ASSERT(URX_TYPE(startOp) == URX_LA_START); int32_t dataLoc = URX_VAL(startOp); appendOp(URX_LA_END, dataLoc); appendOp(URX_BACKTRACK, 0); appendOp(URX_LA_END, dataLoc); // Patch the URX_SAVE near the top of the block. // The destination of the SAVE is the final LA_END that was just added. 
int32_t saveOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen); U_ASSERT(URX_TYPE(saveOp) == URX_STATE_SAVE); int32_t dest = fRXPat->fCompiledPat->size()-1; saveOp = buildOp(URX_STATE_SAVE, dest); fRXPat->fCompiledPat->setElementAt(saveOp, fMatchOpenParen); } break; case lookBehind: { // See comment at doOpenLookBehind. // Append the URX_LB_END and URX_LA_END to the compiled pattern. int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-4); U_ASSERT(URX_TYPE(startOp) == URX_LB_START); int32_t dataLoc = URX_VAL(startOp); appendOp(URX_LB_END, dataLoc); appendOp(URX_LA_END, dataLoc); // Determine the min and max bounds for the length of the // string that the pattern can match. // An unbounded upper limit is an error. int32_t patEnd = fRXPat->fCompiledPat->size() - 1; int32_t minML = minMatchLength(fMatchOpenParen, patEnd); int32_t maxML = maxMatchLength(fMatchOpenParen, patEnd); if (URX_TYPE(maxML) != 0) { error(U_REGEX_LOOK_BEHIND_LIMIT); break; } if (maxML == INT32_MAX) { error(U_REGEX_LOOK_BEHIND_LIMIT); break; } if (minML == INT32_MAX) { // This condition happens when no match is possible, such as with a // [set] expression containing no elements. // In principle, the generated code to evaluate the expression could be deleted, // but it's probably not worth the complication. minML = 0; } U_ASSERT(minML <= maxML); // Insert the min and max match len bounds into the URX_LB_CONT op that // appears at the top of the look-behind block, at location fMatchOpenParen+1 fRXPat->fCompiledPat->setElementAt(minML, fMatchOpenParen-2); fRXPat->fCompiledPat->setElementAt(maxML, fMatchOpenParen-1); } break; case lookBehindN: { // See comment at doOpenLookBehindNeg. // Append the URX_LBN_END to the compiled pattern. 
int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-5); U_ASSERT(URX_TYPE(startOp) == URX_LB_START); int32_t dataLoc = URX_VAL(startOp); appendOp(URX_LBN_END, dataLoc); // Determine the min and max bounds for the length of the // string that the pattern can match. // An unbounded upper limit is an error. int32_t patEnd = fRXPat->fCompiledPat->size() - 1; int32_t minML = minMatchLength(fMatchOpenParen, patEnd); int32_t maxML = maxMatchLength(fMatchOpenParen, patEnd); if (URX_TYPE(maxML) != 0) { error(U_REGEX_LOOK_BEHIND_LIMIT); break; } if (maxML == INT32_MAX) { error(U_REGEX_LOOK_BEHIND_LIMIT); break; } if (minML == INT32_MAX) { // This condition happens when no match is possible, such as with a // [set] expression containing no elements. // In principle, the generated code to evaluate the expression could be deleted, // but it's probably not worth the complication. minML = 0; } U_ASSERT(minML <= maxML); // Insert the min and max match len bounds into the URX_LB_CONT op that // appears at the top of the look-behind block, at location fMatchOpenParen+1 fRXPat->fCompiledPat->setElementAt(minML, fMatchOpenParen-3); fRXPat->fCompiledPat->setElementAt(maxML, fMatchOpenParen-2); // Insert the pattern location to continue at after a successful match // as the last operand of the URX_LBN_CONT int32_t op = buildOp(URX_RELOC_OPRND, fRXPat->fCompiledPat->size()); fRXPat->fCompiledPat->setElementAt(op, fMatchOpenParen-1); } break; default: UPRV_UNREACHABLE_EXIT; } // remember the next location in the compiled pattern. // The compilation of Quantifiers will look at this to see whether its looping // over a parenthesized block or a single item fMatchCloseParen = fRXPat->fCompiledPat->size(); } //------------------------------------------------------------------------------ // // compileSet Compile the pattern operations for a reference to a // UnicodeSet. 
// //------------------------------------------------------------------------------ void RegexCompile::compileSet(UnicodeSet *theSet) { if (theSet == nullptr) { return; } // Remove any strings from the set. // There shouldn't be any, but just in case. // (Case Closure can add them; if we had a simple case closure available that // ignored strings, that would be better.) theSet->removeAllStrings(); int32_t setSize = theSet->size(); switch (setSize) { case 0: { // Set of no elements. Always fails to match. appendOp(URX_BACKTRACK, 0); delete theSet; } break; case 1: { // The set contains only a single code point. Put it into // the compiled pattern as a single char operation rather // than a set, and discard the set itself. literalChar(theSet->charAt(0)); delete theSet; } break; default: { // The set contains two or more chars. (the normal case) // Put it into the compiled pattern as a set. theSet->freeze(); int32_t setNumber = fRXPat->fSets->size(); fRXPat->fSets->addElement(theSet, *fStatus); if (U_SUCCESS(*fStatus)) { appendOp(URX_SETREF, setNumber); } else { delete theSet; } } } } //------------------------------------------------------------------------------ // // compileInterval Generate the code for a {min, max} style interval quantifier. // Except for the specific opcodes used, the code is the same // for all three types (greedy, non-greedy, possessive) of // intervals. The opcodes are supplied as parameters. // (There are two sets of opcodes - greedy & possessive use the // same ones, while non-greedy has it's own.) // // The code for interval loops has this form: // 0 CTR_INIT counter loc (in stack frame) // 1 5 patt address of CTR_LOOP at bottom of block // 2 min count // 3 max count (-1 for unbounded) // 4 ... 
block to be iterated over // 5 CTR_LOOP // // In //------------------------------------------------------------------------------ void RegexCompile::compileInterval(int32_t InitOp, int32_t LoopOp) { // The CTR_INIT op at the top of the block with the {n,m} quantifier takes // four slots in the compiled code. Reserve them. int32_t topOfBlock = blockTopLoc(true); insertOp(topOfBlock); insertOp(topOfBlock); insertOp(topOfBlock); // The operands for the CTR_INIT opcode include the index in the matcher data // of the counter. Allocate it now. There are two data items // counterLoc --> Loop counter // +1 --> Input index (for breaking non-progressing loops) // (Only present if unbounded upper limit on loop) int32_t dataSize = fIntervalUpper < 0 ? 2 : 1; int32_t counterLoc = allocateStackData(dataSize); int32_t op = buildOp(InitOp, counterLoc); fRXPat->fCompiledPat->setElementAt(op, topOfBlock); // The second operand of CTR_INIT is the location following the end of the loop. // Must put in as a URX_RELOC_OPRND so that the value will be adjusted if the // compilation of something later on causes the code to grow and the target // position to move. int32_t loopEnd = fRXPat->fCompiledPat->size(); op = buildOp(URX_RELOC_OPRND, loopEnd); fRXPat->fCompiledPat->setElementAt(op, topOfBlock+1); // Followed by the min and max counts. fRXPat->fCompiledPat->setElementAt(fIntervalLow, topOfBlock+2); fRXPat->fCompiledPat->setElementAt(fIntervalUpper, topOfBlock+3); // Append the CTR_LOOP op. The operand is the location of the CTR_INIT op. // Goes at end of the block being looped over, so just append to the code so far. 
appendOp(LoopOp, topOfBlock); if ((fIntervalLow & 0xff000000) != 0 || (fIntervalUpper > 0 && (fIntervalUpper & 0xff000000) != 0)) { error(U_REGEX_NUMBER_TOO_BIG); } if (fIntervalLow > fIntervalUpper && fIntervalUpper != -1) { error(U_REGEX_MAX_LT_MIN); } } UBool RegexCompile::compileInlineInterval() { if (fIntervalUpper > 10 || fIntervalUpper < fIntervalLow) { // Too big to inline. Fail, which will cause looping code to be generated. // (Upper < Lower picks up unbounded upper and errors, both.) return false; } int32_t topOfBlock = blockTopLoc(false); if (fIntervalUpper == 0) { // Pathological case. Attempt no matches, as if the block doesn't exist. // Discard the generated code for the block. // If the block included parens, discard the info pertaining to them as well. fRXPat->fCompiledPat->setSize(topOfBlock); if (fMatchOpenParen >= topOfBlock) { fMatchOpenParen = -1; } if (fMatchCloseParen >= topOfBlock) { fMatchCloseParen = -1; } return true; } if (topOfBlock != fRXPat->fCompiledPat->size()-1 && fIntervalUpper != 1) { // The thing being repeated is not a single op, but some // more complex block. Do it as a loop, not inlines. // Note that things "repeated" a max of once are handled as inline, because // the one copy of the code already generated is just fine. return false; } // Pick up the opcode that is to be repeated // int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(topOfBlock); // Compute the pattern location where the inline sequence // will end, and set up the state save op that will be needed. // int32_t endOfSequenceLoc = fRXPat->fCompiledPat->size()-1 + fIntervalUpper + (fIntervalUpper-fIntervalLow); int32_t saveOp = buildOp(URX_STATE_SAVE, endOfSequenceLoc); if (fIntervalLow == 0) { insertOp(topOfBlock); fRXPat->fCompiledPat->setElementAt(saveOp, topOfBlock); } // Loop, emitting the op for the thing being repeated each time. 
// Loop starts at 1 because one instance of the op already exists in the pattern, // it was put there when it was originally encountered. int32_t i; for (i=1; i= fIntervalLow) { appendOp(saveOp); } appendOp(op); } return true; } //------------------------------------------------------------------------------ // // caseInsensitiveStart given a single code point from a pattern string, determine the // set of characters that could potentially begin a case-insensitive // match of a string beginning with that character, using full Unicode // case insensitive matching. // // This is used in optimizing find(). // // closeOver(USET_CASE_INSENSITIVE) does most of what is needed, but // misses cases like this: // A string from the pattern begins with 'ss' (although all we know // in this context is that it begins with 's') // The pattern could match a string beginning with a German sharp-s // // To the ordinary case closure for a character c, we add all other // characters cx where the case closure of cx includes a string form that begins // with the original character c. // // This function could be made smarter. The full pattern string is available // and it would be possible to verify that the extra characters being added // to the starting set fully match, rather than having just a first-char of the // folded form match. // //------------------------------------------------------------------------------ void RegexCompile::findCaseInsensitiveStarters(UChar32 c, UnicodeSet *starterChars) { // Machine Generated below. // It may need updating with new versions of Unicode. // Intltest test RegexTest::TestCaseInsensitiveStarters will fail if an update is needed. // The update tool is here: // https://github.com/unicode-org/icu/tree/main/tools/unicode/c/genregexcasing // Machine Generated Data. Do not hand edit. 
static const UChar32 RECaseFixCodePoints[] = { 0x61, 0x66, 0x68, 0x69, 0x6a, 0x73, 0x74, 0x77, 0x79, 0x2bc, 0x3ac, 0x3ae, 0x3b1, 0x3b7, 0x3b9, 0x3c1, 0x3c5, 0x3c9, 0x3ce, 0x565, 0x574, 0x57e, 0x1f00, 0x1f01, 0x1f02, 0x1f03, 0x1f04, 0x1f05, 0x1f06, 0x1f07, 0x1f20, 0x1f21, 0x1f22, 0x1f23, 0x1f24, 0x1f25, 0x1f26, 0x1f27, 0x1f60, 0x1f61, 0x1f62, 0x1f63, 0x1f64, 0x1f65, 0x1f66, 0x1f67, 0x1f70, 0x1f74, 0x1f7c, 0x110000}; static const int16_t RECaseFixStringOffsets[] = { 0x0, 0x1, 0x6, 0x7, 0x8, 0x9, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x17, 0x1b, 0x20, 0x21, 0x2a, 0x2e, 0x2f, 0x30, 0x34, 0x35, 0x37, 0x39, 0x3b, 0x3d, 0x3f, 0x41, 0x43, 0x45, 0x47, 0x49, 0x4b, 0x4d, 0x4f, 0x51, 0x53, 0x55, 0x57, 0x59, 0x5b, 0x5d, 0x5f, 0x61, 0x63, 0x65, 0x66, 0x67, 0}; static const int16_t RECaseFixCounts[] = { 0x1, 0x5, 0x1, 0x1, 0x1, 0x4, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x4, 0x4, 0x5, 0x1, 0x9, 0x4, 0x1, 0x1, 0x4, 0x1, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x1, 0x1, 0x1, 0}; static const char16_t RECaseFixData[] = { 0x1e9a, 0xfb00, 0xfb01, 0xfb02, 0xfb03, 0xfb04, 0x1e96, 0x130, 0x1f0, 0xdf, 0x1e9e, 0xfb05, 0xfb06, 0x1e97, 0x1e98, 0x1e99, 0x149, 0x1fb4, 0x1fc4, 0x1fb3, 0x1fb6, 0x1fb7, 0x1fbc, 0x1fc3, 0x1fc6, 0x1fc7, 0x1fcc, 0x390, 0x1fd2, 0x1fd3, 0x1fd6, 0x1fd7, 0x1fe4, 0x3b0, 0x1f50, 0x1f52, 0x1f54, 0x1f56, 0x1fe2, 0x1fe3, 0x1fe6, 0x1fe7, 0x1ff3, 0x1ff6, 0x1ff7, 0x1ffc, 0x1ff4, 0x587, 0xfb13, 0xfb14, 0xfb15, 0xfb17, 0xfb16, 0x1f80, 0x1f88, 0x1f81, 0x1f89, 0x1f82, 0x1f8a, 0x1f83, 0x1f8b, 0x1f84, 0x1f8c, 0x1f85, 0x1f8d, 0x1f86, 0x1f8e, 0x1f87, 0x1f8f, 0x1f90, 0x1f98, 0x1f91, 0x1f99, 0x1f92, 0x1f9a, 0x1f93, 0x1f9b, 0x1f94, 0x1f9c, 0x1f95, 0x1f9d, 0x1f96, 0x1f9e, 0x1f97, 0x1f9f, 0x1fa0, 0x1fa8, 0x1fa1, 0x1fa9, 0x1fa2, 0x1faa, 0x1fa3, 0x1fab, 0x1fa4, 0x1fac, 0x1fa5, 0x1fad, 0x1fa6, 0x1fae, 0x1fa7, 0x1faf, 0x1fb2, 0x1fc2, 0x1ff2, 0}; // End of machine generated data. 
if (c < UCHAR_MIN_VALUE || c > UCHAR_MAX_VALUE) { // This function should never be called with an invalid input character. UPRV_UNREACHABLE_EXIT; } else if (u_hasBinaryProperty(c, UCHAR_CASE_SENSITIVE)) { UChar32 caseFoldedC = u_foldCase(c, U_FOLD_CASE_DEFAULT); starterChars->set(caseFoldedC, caseFoldedC); int32_t i; for (i=0; RECaseFixCodePoints[i]add(cpToAdd); } } starterChars->closeOver(USET_CASE_INSENSITIVE); starterChars->removeAllStrings(); } else { // Not a cased character. Just return it alone. starterChars->set(c, c); } } // Increment with overflow check. // val and delta will both be positive. static int32_t safeIncrement(int32_t val, int32_t delta) { if (INT32_MAX - val > delta) { return val + delta; } else { return INT32_MAX; } } //------------------------------------------------------------------------------ // // matchStartType Determine how a match can start. // Used to optimize find() operations. // // Operation is very similar to minMatchLength(). Walk the compiled // pattern, keeping an on-going minimum-match-length. For any // op where the min match coming in is zero, add that ops possible // starting matches to the possible starts for the overall pattern. // //------------------------------------------------------------------------------ void RegexCompile::matchStartType() { if (U_FAILURE(*fStatus)) { return; } int32_t loc; // Location in the pattern of the current op being processed. int32_t op; // The op being processed int32_t opType; // The opcode type of the op int32_t currentLen = 0; // Minimum length of a match to this point (loc) in the pattern int32_t numInitialStrings = 0; // Number of strings encountered that could match at start. UBool atStart = true; // True if no part of the pattern yet encountered // could have advanced the position in a match. // (Maximum match length so far == 0) // forwardedLength is a vector holding minimum-match-length values that // are propagated forward in the pattern by JMP or STATE_SAVE operations. 
// It must be one longer than the pattern being checked because some ops // will jmp to a end-of-block+1 location from within a block, and we must // count those when checking the block. int32_t end = fRXPat->fCompiledPat->size(); UVector32 forwardedLength(end+1, *fStatus); forwardedLength.setSize(end+1); for (loc=3; locfCompiledPat->elementAti(loc); opType = URX_TYPE(op); // The loop is advancing linearly through the pattern. // If the op we are now at was the destination of a branch in the pattern, // and that path has a shorter minimum length than the current accumulated value, // replace the current accumulated value. if (forwardedLength.elementAti(loc) < currentLen) { currentLen = forwardedLength.elementAti(loc); U_ASSERT(currentLen>=0 && currentLen < INT32_MAX); } switch (opType) { // Ops that don't change the total length matched case URX_RESERVED_OP: case URX_END: case URX_FAIL: case URX_STRING_LEN: case URX_NOP: case URX_START_CAPTURE: case URX_END_CAPTURE: case URX_BACKSLASH_B: case URX_BACKSLASH_BU: case URX_BACKSLASH_G: case URX_BACKSLASH_Z: case URX_DOLLAR: case URX_DOLLAR_M: case URX_DOLLAR_D: case URX_DOLLAR_MD: case URX_RELOC_OPRND: case URX_STO_INP_LOC: case URX_BACKREF: // BackRef. Must assume that it might be a zero length match case URX_BACKREF_I: case URX_STO_SP: // Setup for atomic or possessive blocks. Doesn't change what can match. case URX_LD_SP: break; case URX_CARET: if (atStart) { fRXPat->fStartType = START_START; } break; case URX_CARET_M: case URX_CARET_M_UNIX: if (atStart) { fRXPat->fStartType = START_LINE; } break; case URX_ONECHAR: if (currentLen == 0) { // This character could appear at the start of a match. // Add it to the set of possible starting characters. 
fRXPat->fInitialChars->add(URX_VAL(op)); numInitialStrings += 2; } currentLen = safeIncrement(currentLen, 1); atStart = false; break; case URX_SETREF: if (currentLen == 0) { int32_t sn = URX_VAL(op); U_ASSERT(sn > 0 && sn < fRXPat->fSets->size()); const UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(sn); fRXPat->fInitialChars->addAll(*s); numInitialStrings += 2; } currentLen = safeIncrement(currentLen, 1); atStart = false; break; case URX_LOOP_SR_I: // [Set]*, like a SETREF, above, in what it can match, // but may not match at all, so currentLen is not incremented. if (currentLen == 0) { int32_t sn = URX_VAL(op); U_ASSERT(sn > 0 && sn < fRXPat->fSets->size()); const UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(sn); fRXPat->fInitialChars->addAll(*s); numInitialStrings += 2; } atStart = false; break; case URX_LOOP_DOT_I: if (currentLen == 0) { // .* at the start of a pattern. // Any character can begin the match. fRXPat->fInitialChars->clear(); fRXPat->fInitialChars->complement(); numInitialStrings += 2; } atStart = false; break; case URX_STATIC_SETREF: if (currentLen == 0) { int32_t sn = URX_VAL(op); U_ASSERT(sn>0 && snfPropSets[sn]; fRXPat->fInitialChars->addAll(s); numInitialStrings += 2; } currentLen = safeIncrement(currentLen, 1); atStart = false; break; case URX_STAT_SETREF_N: if (currentLen == 0) { int32_t sn = URX_VAL(op); UnicodeSet sc; sc.addAll(RegexStaticSets::gStaticSets->fPropSets[sn]).complement(); fRXPat->fInitialChars->addAll(sc); numInitialStrings += 2; } currentLen = safeIncrement(currentLen, 1); atStart = false; break; case URX_BACKSLASH_D: // Digit Char if (currentLen == 0) { UnicodeSet s; s.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ND_MASK, *fStatus); if (URX_VAL(op) != 0) { s.complement(); } fRXPat->fInitialChars->addAll(s); numInitialStrings += 2; } currentLen = safeIncrement(currentLen, 1); atStart = false; break; case URX_BACKSLASH_H: // Horiz white space if (currentLen == 0) { UnicodeSet s; 
s.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MASK, *fStatus); s.add((UChar32)9); // Tab if (URX_VAL(op) != 0) { s.complement(); } fRXPat->fInitialChars->addAll(s); numInitialStrings += 2; } currentLen = safeIncrement(currentLen, 1); atStart = false; break; case URX_BACKSLASH_R: // Any line ending sequence case URX_BACKSLASH_V: // Any line ending code point, with optional negation if (currentLen == 0) { UnicodeSet s; s.add((UChar32)0x0a, (UChar32)0x0d); // add range s.add((UChar32)0x85); s.add((UChar32)0x2028, (UChar32)0x2029); if (URX_VAL(op) != 0) { // Complement option applies to URX_BACKSLASH_V only. s.complement(); } fRXPat->fInitialChars->addAll(s); numInitialStrings += 2; } currentLen = safeIncrement(currentLen, 1); atStart = false; break; case URX_ONECHAR_I: // Case Insensitive Single Character. if (currentLen == 0) { UChar32 c = URX_VAL(op); if (u_hasBinaryProperty(c, UCHAR_CASE_SENSITIVE)) { UnicodeSet starters(c, c); starters.closeOver(USET_CASE_INSENSITIVE); // findCaseInsensitiveStarters(c, &starters); // For ONECHAR_I, no need to worry about text chars that expand on folding into strings. // The expanded folding can't match the pattern. fRXPat->fInitialChars->addAll(starters); } else { // Char has no case variants. Just add it as-is to the // set of possible starting chars. fRXPat->fInitialChars->add(c); } numInitialStrings += 2; } currentLen = safeIncrement(currentLen, 1); atStart = false; break; case URX_BACKSLASH_X: // Grapheme Cluster. Minimum is 1, max unbounded. case URX_DOTANY_ALL: // . matches one or two. case URX_DOTANY: case URX_DOTANY_UNIX: if (currentLen == 0) { // These constructs are all bad news when they appear at the start // of a match. Any character can begin the match. fRXPat->fInitialChars->clear(); fRXPat->fInitialChars->complement(); numInitialStrings += 2; } currentLen = safeIncrement(currentLen, 1); atStart = false; break; case URX_JMPX: loc++; // Except for extra operand on URX_JMPX, same as URX_JMP. 
U_FALLTHROUGH; case URX_JMP: { int32_t jmpDest = URX_VAL(op); if (jmpDest < loc) { // Loop of some kind. Can safely ignore, the worst that will happen // is that we understate the true minimum length currentLen = forwardedLength.elementAti(loc+1); } else { // Forward jump. Propagate the current min length to the target loc of the jump. U_ASSERT(jmpDest <= end+1); if (forwardedLength.elementAti(jmpDest) > currentLen) { forwardedLength.setElementAt(currentLen, jmpDest); } } } atStart = false; break; case URX_JMP_SAV: case URX_JMP_SAV_X: // Combo of state save to the next loc, + jmp backwards. // Net effect on min. length computation is nothing. atStart = false; break; case URX_BACKTRACK: // Fails are kind of like a branch, except that the min length was // propagated already, by the state save. currentLen = forwardedLength.elementAti(loc+1); atStart = false; break; case URX_STATE_SAVE: { // State Save, for forward jumps, propagate the current minimum. // of the state save. int32_t jmpDest = URX_VAL(op); if (jmpDest > loc) { if (currentLen < forwardedLength.elementAti(jmpDest)) { forwardedLength.setElementAt(currentLen, jmpDest); } } } atStart = false; break; case URX_STRING: { loc++; int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc); int32_t stringLen = URX_VAL(stringLenOp); U_ASSERT(URX_TYPE(stringLenOp) == URX_STRING_LEN); U_ASSERT(stringLenOp >= 2); if (currentLen == 0) { // Add the starting character of this string to the set of possible starting // characters for this pattern. int32_t stringStartIdx = URX_VAL(op); UChar32 c = fRXPat->fLiteralText.char32At(stringStartIdx); fRXPat->fInitialChars->add(c); // Remember this string. After the entire pattern has been checked, // if nothing else is identified that can start a match, we'll use it. 
numInitialStrings++; fRXPat->fInitialStringIdx = stringStartIdx; fRXPat->fInitialStringLen = stringLen; } currentLen = safeIncrement(currentLen, stringLen); atStart = false; } break; case URX_STRING_I: { // Case-insensitive string. Unlike exact-match strings, we won't // attempt a string search for possible match positions. But we // do update the set of possible starting characters. loc++; int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc); int32_t stringLen = URX_VAL(stringLenOp); U_ASSERT(URX_TYPE(stringLenOp) == URX_STRING_LEN); U_ASSERT(stringLenOp >= 2); if (currentLen == 0) { // Add the starting character of this string to the set of possible starting // characters for this pattern. int32_t stringStartIdx = URX_VAL(op); UChar32 c = fRXPat->fLiteralText.char32At(stringStartIdx); UnicodeSet s; findCaseInsensitiveStarters(c, &s); fRXPat->fInitialChars->addAll(s); numInitialStrings += 2; // Matching on an initial string not possible. } currentLen = safeIncrement(currentLen, stringLen); atStart = false; } break; case URX_CTR_INIT: case URX_CTR_INIT_NG: { // Loop Init Ops. These don't change the min length, but they are 4 word ops // so location must be updated accordingly. // Loop Init Ops. // If the min loop count == 0 // move loc forwards to the end of the loop, skipping over the body. // If the min count is > 0, // continue normal processing of the body of the loop. int32_t loopEndLoc = (int32_t)fRXPat->fCompiledPat->elementAti(loc+1); loopEndLoc = URX_VAL(loopEndLoc); int32_t minLoopCount = (int32_t)fRXPat->fCompiledPat->elementAti(loc+2); if (minLoopCount == 0) { // Min Loop Count of 0, treat like a forward branch and // move the current minimum length up to the target // (end of loop) location. 
U_ASSERT(loopEndLoc <= end+1); if (forwardedLength.elementAti(loopEndLoc) > currentLen) { forwardedLength.setElementAt(currentLen, loopEndLoc); } } loc+=3; // Skips over operands of CTR_INIT } atStart = false; break; case URX_CTR_LOOP: case URX_CTR_LOOP_NG: // Loop ops. // The jump is conditional, backwards only. atStart = false; break; case URX_LOOP_C: // More loop ops. These state-save to themselves. // don't change the minimum match atStart = false; break; case URX_LA_START: case URX_LB_START: { // Look-around. Scan forward until the matching look-ahead end, // without processing the look-around block. This is overly pessimistic. // Keep track of the nesting depth of look-around blocks. Boilerplate code for // lookahead contains two LA_END instructions, so count goes up by two // for each LA_START. int32_t depth = (opType == URX_LA_START? 2: 1); for (;;) { loc++; op = (int32_t)fRXPat->fCompiledPat->elementAti(loc); if (URX_TYPE(op) == URX_LA_START) { depth+=2; } if (URX_TYPE(op) == URX_LB_START) { depth++; } if (URX_TYPE(op) == URX_LA_END || URX_TYPE(op)==URX_LBN_END) { depth--; if (depth == 0) { break; } } if (URX_TYPE(op) == URX_STATE_SAVE) { // Need this because neg lookahead blocks will FAIL to outside // of the block. int32_t jmpDest = URX_VAL(op); if (jmpDest > loc) { if (currentLen < forwardedLength.elementAti(jmpDest)) { forwardedLength.setElementAt(currentLen, jmpDest); } } } U_ASSERT(loc <= end); } } break; case URX_LA_END: case URX_LB_CONT: case URX_LB_END: case URX_LBN_CONT: case URX_LBN_END: UPRV_UNREACHABLE_EXIT; // Shouldn't get here. These ops should be // consumed by the scan in URX_LA_START and LB_START default: UPRV_UNREACHABLE_EXIT; } } // We have finished walking through the ops. Check whether some forward jump // propagated a shorter length to location end+1. 
if (forwardedLength.elementAti(end+1) < currentLen) { currentLen = forwardedLength.elementAti(end+1); } fRXPat->fInitialChars8->init(fRXPat->fInitialChars); // Sort out what we should check for when looking for candidate match start positions. // In order of preference, // 1. Start of input text buffer. // 2. A literal string. // 3. Start of line in multi-line mode. // 4. A single literal character. // 5. A character from a set of characters. // if (fRXPat->fStartType == START_START) { // Match only at the start of an input text string. // start type is already set. We're done. } else if (numInitialStrings == 1 && fRXPat->fMinMatchLen > 0) { // Match beginning only with a literal string. UChar32 c = fRXPat->fLiteralText.char32At(fRXPat->fInitialStringIdx); U_ASSERT(fRXPat->fInitialChars->contains(c)); fRXPat->fStartType = START_STRING; fRXPat->fInitialChar = c; } else if (fRXPat->fStartType == START_LINE) { // Match at start of line in Multi-Line mode. // Nothing to do here; everything is already set. } else if (fRXPat->fMinMatchLen == 0) { // Zero length match possible. We could start anywhere. fRXPat->fStartType = START_NO_INFO; } else if (fRXPat->fInitialChars->size() == 1) { // All matches begin with the same char. fRXPat->fStartType = START_CHAR; fRXPat->fInitialChar = fRXPat->fInitialChars->charAt(0); U_ASSERT(fRXPat->fInitialChar != (UChar32)-1); } else if (fRXPat->fInitialChars->contains((UChar32)0, (UChar32)0x10ffff) == false && fRXPat->fMinMatchLen > 0) { // Matches start with a set of character smaller than the set of all chars. fRXPat->fStartType = START_SET; } else { // Matches can start with anything fRXPat->fStartType = START_NO_INFO; } return; } //------------------------------------------------------------------------------ // // minMatchLength Calculate the length of the shortest string that could // match the specified pattern. // Length is in 16 bit code units, not code points. // // The calculated length may not be exact. 
The returned // value may be shorter than the actual minimum; it must // never be longer. // // start and end are the range of p-code operations to be // examined. The endpoints are included in the range. // //------------------------------------------------------------------------------ int32_t RegexCompile::minMatchLength(int32_t start, int32_t end) { if (U_FAILURE(*fStatus)) { return 0; } U_ASSERT(start <= end); U_ASSERT(end < fRXPat->fCompiledPat->size()); int32_t loc; int32_t op; int32_t opType; int32_t currentLen = 0; // forwardedLength is a vector holding minimum-match-length values that // are propagated forward in the pattern by JMP or STATE_SAVE operations. // It must be one longer than the pattern being checked because some ops // will jmp to a end-of-block+1 location from within a block, and we must // count those when checking the block. UVector32 forwardedLength(end+2, *fStatus); forwardedLength.setSize(end+2); for (loc=start; loc<=end+1; loc++) { forwardedLength.setElementAt(INT32_MAX, loc); } for (loc = start; loc<=end; loc++) { op = (int32_t)fRXPat->fCompiledPat->elementAti(loc); opType = URX_TYPE(op); // The loop is advancing linearly through the pattern. // If the op we are now at was the destination of a branch in the pattern, // and that path has a shorter minimum length than the current accumulated value, // replace the current accumulated value. // U_ASSERT(currentLen>=0 && currentLen < INT32_MAX); // MinLength == INT32_MAX for some // no-match-possible cases. 
if (forwardedLength.elementAti(loc) < currentLen) { currentLen = forwardedLength.elementAti(loc); U_ASSERT(currentLen>=0 && currentLen < INT32_MAX); } switch (opType) { // Ops that don't change the total length matched case URX_RESERVED_OP: case URX_END: case URX_STRING_LEN: case URX_NOP: case URX_START_CAPTURE: case URX_END_CAPTURE: case URX_BACKSLASH_B: case URX_BACKSLASH_BU: case URX_BACKSLASH_G: case URX_BACKSLASH_Z: case URX_CARET: case URX_DOLLAR: case URX_DOLLAR_M: case URX_DOLLAR_D: case URX_DOLLAR_MD: case URX_RELOC_OPRND: case URX_STO_INP_LOC: case URX_CARET_M: case URX_CARET_M_UNIX: case URX_BACKREF: // BackRef. Must assume that it might be a zero length match case URX_BACKREF_I: case URX_STO_SP: // Setup for atomic or possessive blocks. Doesn't change what can match. case URX_LD_SP: case URX_JMP_SAV: case URX_JMP_SAV_X: break; // Ops that match a minimum of one character (one or two 16 bit code units.) // case URX_ONECHAR: case URX_STATIC_SETREF: case URX_STAT_SETREF_N: case URX_SETREF: case URX_BACKSLASH_D: case URX_BACKSLASH_H: case URX_BACKSLASH_R: case URX_BACKSLASH_V: case URX_ONECHAR_I: case URX_BACKSLASH_X: // Grapheme Cluster. Minimum is 1, max unbounded. case URX_DOTANY_ALL: // . matches one or two. case URX_DOTANY: case URX_DOTANY_UNIX: currentLen = safeIncrement(currentLen, 1); break; case URX_JMPX: loc++; // URX_JMPX has an extra operand, ignored here, // otherwise processed identically to URX_JMP. U_FALLTHROUGH; case URX_JMP: { int32_t jmpDest = URX_VAL(op); if (jmpDest < loc) { // Loop of some kind. Can safely ignore, the worst that will happen // is that we understate the true minimum length currentLen = forwardedLength.elementAti(loc+1); } else { // Forward jump. Propagate the current min length to the target loc of the jump. 
U_ASSERT(jmpDest <= end+1); if (forwardedLength.elementAti(jmpDest) > currentLen) { forwardedLength.setElementAt(currentLen, jmpDest); } } } break; case URX_BACKTRACK: { // Back-tracks are kind of like a branch, except that the min length was // propagated already, by the state save. currentLen = forwardedLength.elementAti(loc+1); } break; case URX_STATE_SAVE: { // State Save, for forward jumps, propagate the current minimum. // of the state save. int32_t jmpDest = URX_VAL(op); if (jmpDest > loc) { if (currentLen < forwardedLength.elementAti(jmpDest)) { forwardedLength.setElementAt(currentLen, jmpDest); } } } break; case URX_STRING: { loc++; int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc); currentLen = safeIncrement(currentLen, URX_VAL(stringLenOp)); } break; case URX_STRING_I: { loc++; // TODO: with full case folding, matching input text may be shorter than // the string we have here. More smarts could put some bounds on it. // Assume a min length of one for now. A min length of zero causes // optimization failures for a pattern like "string"+ // currentLen += URX_VAL(stringLenOp); currentLen = safeIncrement(currentLen, 1); } break; case URX_CTR_INIT: case URX_CTR_INIT_NG: { // Loop Init Ops. // If the min loop count == 0 // move loc forwards to the end of the loop, skipping over the body. // If the min count is > 0, // continue normal processing of the body of the loop. int32_t loopEndLoc = (int32_t)fRXPat->fCompiledPat->elementAti(loc+1); loopEndLoc = URX_VAL(loopEndLoc); int32_t minLoopCount = (int32_t)fRXPat->fCompiledPat->elementAti(loc+2); if (minLoopCount == 0) { loc = loopEndLoc; } else { loc+=3; // Skips over operands of CTR_INIT } } break; case URX_CTR_LOOP: case URX_CTR_LOOP_NG: // Loop ops. // The jump is conditional, backwards only. break; case URX_LOOP_SR_I: case URX_LOOP_DOT_I: case URX_LOOP_C: // More loop ops. These state-save to themselves. // don't change the minimum match - could match nothing at all. 
break; case URX_LA_START: case URX_LB_START: { // Look-around. Scan forward until the matching look-ahead end, // without processing the look-around block. This is overly pessimistic for look-ahead, // it assumes that the look-ahead match might be zero-length. // TODO: Positive lookahead could recursively do the block, then continue // with the longer of the block or the value coming in. Ticket 6060 int32_t depth = (opType == URX_LA_START? 2: 1); for (;;) { loc++; op = (int32_t)fRXPat->fCompiledPat->elementAti(loc); if (URX_TYPE(op) == URX_LA_START) { // The boilerplate for look-ahead includes two LA_END instructions, // Depth will be decremented by each one when it is seen. depth += 2; } if (URX_TYPE(op) == URX_LB_START) { depth++; } if (URX_TYPE(op) == URX_LA_END) { depth--; if (depth == 0) { break; } } if (URX_TYPE(op)==URX_LBN_END) { depth--; if (depth == 0) { break; } } if (URX_TYPE(op) == URX_STATE_SAVE) { // Need this because neg lookahead blocks will FAIL to outside // of the block. int32_t jmpDest = URX_VAL(op); if (jmpDest > loc) { if (currentLen < forwardedLength.elementAti(jmpDest)) { forwardedLength.setElementAt(currentLen, jmpDest); } } } U_ASSERT(loc <= end); } } break; case URX_LA_END: case URX_LB_CONT: case URX_LB_END: case URX_LBN_CONT: case URX_LBN_END: // Only come here if the matching URX_LA_START or URX_LB_START was not in the // range being sized, which happens when measuring size of look-behind blocks. break; default: UPRV_UNREACHABLE_EXIT; } } // We have finished walking through the ops. Check whether some forward jump // propagated a shorter length to location end+1. if (forwardedLength.elementAti(end+1) < currentLen) { currentLen = forwardedLength.elementAti(end+1); U_ASSERT(currentLen>=0 && currentLen < INT32_MAX); } return currentLen; } //------------------------------------------------------------------------------ // // maxMatchLength Calculate the length of the longest string that could // match the specified pattern. 
//                     Length is in 16 bit code units, not code points.
//
//                     The calculated length may not be exact.  The returned
//                     value may be longer than the actual maximum; it must
//                     never be shorter.
//
//                     start, end: the range of the pattern to check.
//                     end is inclusive.
//
//                     Returns INT32_MAX when the length is unbounded, and 0 if
//                     *fStatus already indicates a failure.
//
//------------------------------------------------------------------------------
int32_t RegexCompile::maxMatchLength(int32_t start, int32_t end) {
    if (U_FAILURE(*fStatus)) {
        return 0;
    }
    U_ASSERT(start <= end);
    U_ASSERT(end < fRXPat->fCompiledPat->size());

    int32_t loc;
    int32_t op;
    int32_t opType;
    int32_t currentLen = 0;

    // forwardedLength holds maximum-match-length values propagated forward in
    //   the pattern by forward JMP or STATE_SAVE operations.
    UVector32 forwardedLength(end+1, *fStatus);
    forwardedLength.setSize(end+1);

    for (loc=start; loc<=end; loc++) {
        forwardedLength.setElementAt(0, loc);
    }

    for (loc = start; loc<=end; loc++) {
        op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
        opType = URX_TYPE(op);

        // The loop is advancing linearly through the pattern.
        // If the op we are now at was the destination of a branch in the pattern,
        // and that path has a longer maximum length than the current accumulated value,
        // replace the current accumulated value.
        if (forwardedLength.elementAti(loc) > currentLen) {
            currentLen = forwardedLength.elementAti(loc);
        }

        switch (opType) {
            // Ops that don't change the total length matched
        case URX_RESERVED_OP:
        case URX_END:
        case URX_STRING_LEN:
        case URX_NOP:
        case URX_START_CAPTURE:
        case URX_END_CAPTURE:
        case URX_BACKSLASH_B:
        case URX_BACKSLASH_BU:
        case URX_BACKSLASH_G:
        case URX_BACKSLASH_Z:
        case URX_CARET:
        case URX_DOLLAR:
        case URX_DOLLAR_M:
        case URX_DOLLAR_D:
        case URX_DOLLAR_MD:
        case URX_RELOC_OPRND:
        case URX_STO_INP_LOC:
        case URX_CARET_M:
        case URX_CARET_M_UNIX:

        case URX_STO_SP:          // Setup for atomic or possessive blocks.  Doesn't change what can match.
        case URX_LD_SP:

        case URX_LB_END:
        case URX_LB_CONT:
        case URX_LBN_CONT:
        case URX_LBN_END:
            break;

            // Ops that cause an unbounded increase in the length
            //   of a matched string, or that increase it in a hard to characterize way.
            //   Call the max length unbounded, and stop further checking.
        case URX_BACKREF:         // BackRef.  Must assume that it might be a zero length match
        case URX_BACKREF_I:
        case URX_BACKSLASH_X:     // Grapheme Cluster.  Minimum is 1, max unbounded.
            currentLen = INT32_MAX;
            break;

            // Ops that match a max of one character (possibly two 16 bit code units.)
            //
        case URX_STATIC_SETREF:
        case URX_STAT_SETREF_N:
        case URX_SETREF:
        case URX_BACKSLASH_D:
        case URX_BACKSLASH_H:
        case URX_BACKSLASH_R:
        case URX_BACKSLASH_V:
        case URX_ONECHAR_I:
        case URX_DOTANY_ALL:
        case URX_DOTANY:
        case URX_DOTANY_UNIX:
            currentLen = safeIncrement(currentLen, 2);
            break;

            // Single literal character.  Increase current max length by one or two,
            //       depending on whether the char is in the supplementary range.
            //       Supplementary code points start at U+10000 (inclusive), and the
            //       result must never understate the maximum.
        case URX_ONECHAR:
            currentLen = safeIncrement(currentLen, 1);
            if (URX_VAL(op) >= 0x10000) {
                currentLen = safeIncrement(currentLen, 1);
            }
            break;

            // Jumps.
            //
        case URX_JMP:
        case URX_JMPX:
        case URX_JMP_SAV:
        case URX_JMP_SAV_X:
            {
                int32_t jmpDest = URX_VAL(op);
                if (jmpDest < loc) {
                    // Loop of some kind.  Max match length is unbounded.
                    currentLen = INT32_MAX;
                } else {
                    // Forward jump.  Propagate the current max length to the target loc of the jump.
                    if (forwardedLength.elementAti(jmpDest) < currentLen) {
                        forwardedLength.setElementAt(currentLen, jmpDest);
                    }
                    currentLen = 0;
                }
            }
            break;

        case URX_BACKTRACK:
            // back-tracks are kind of like a branch, except that the max length was
            //   propagated already, by the state save.
            currentLen = forwardedLength.elementAti(loc+1);
            break;

        case URX_STATE_SAVE:
            {
                // State Save, for forward jumps, propagate the current maximum
                //               of the state save.
                //             For backwards jumps, they create a loop, maximum
                //               match length is unbounded.
                int32_t jmpDest = URX_VAL(op);
                if (jmpDest > loc) {
                    if (currentLen > forwardedLength.elementAti(jmpDest)) {
                        forwardedLength.setElementAt(currentLen, jmpDest);
                    }
                } else {
                    currentLen = INT32_MAX;
                }
            }
            break;

        case URX_STRING:
            {
                loc++;
                int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
                currentLen = safeIncrement(currentLen, URX_VAL(stringLenOp));
                break;
            }

        case URX_STRING_I:
            // TODO:  This code assumes that any user string that matches will be no longer
            //        than our compiled string, with case insensitive matching.
            //        Our compiled string has been case-folded already.
            //
            //        Any matching user string will have no more code points than our
            //        compiled (folded) string.  Folding may add code points, but
            //        not remove them.
            //
            //        There is a potential problem if a supplemental code point
            //        case-folds to a BMP code point.  In this case our compiled string
            //        could be shorter (in code units) than a matching user string.
            //
            //        At this time (Unicode 6.1) there are no such characters, and this case
            //        is not being handled.  A test, intltest regex/Bug9283, will fail if
            //        any problematic characters are added to Unicode.
            //
            //        If this happens, we can make a set of the BMP chars that the
            //        troublesome supplementals fold to, scan our string, and bump the
            //        currentLen one extra for each that is found.
            //
            {
                loc++;
                int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
                currentLen = safeIncrement(currentLen, URX_VAL(stringLenOp));
            }
            break;

        case URX_CTR_INIT:
        case URX_CTR_INIT_NG:
            // For Loops, recursively call this function on the pattern for the loop body,
            //   then multiply the result by the maximum loop count.
            {
                int32_t loopEndLoc = URX_VAL(fRXPat->fCompiledPat->elementAti(loc+1));
                if (loopEndLoc == loc+4) {
                    // Loop has an empty body. No affect on max match length.
                    // Continue processing with code after the loop end.
                    loc = loopEndLoc;
                    break;
                }

                int32_t maxLoopCount = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(loc+3));
                if (maxLoopCount == -1) {
                    // Unbounded Loop. No upper bound on match length.
                    currentLen = INT32_MAX;
                    break;
                }

                U_ASSERT(loopEndLoc >= loc+4);
                int64_t blockLen = maxMatchLength(loc+4, loopEndLoc-1);  // Recursive call.
                // 64-bit arithmetic so that blockLen * maxLoopCount cannot overflow.
                int64_t updatedLen = (int64_t)currentLen + blockLen * maxLoopCount;
                if (updatedLen >= INT32_MAX) {
                    currentLen = INT32_MAX;
                    break;
                }
                currentLen = (int32_t)updatedLen;
                loc = loopEndLoc;
                break;
            }

        case URX_CTR_LOOP:
        case URX_CTR_LOOP_NG:
            // These opcodes will be skipped over by code for URX_CTR_INIT.
            // We shouldn't encounter them here.
            UPRV_UNREACHABLE_EXIT;

        case URX_LOOP_SR_I:
        case URX_LOOP_DOT_I:
        case URX_LOOP_C:
            // For anything to do with loops, make the match length unbounded.
            currentLen = INT32_MAX;
            break;

        case URX_LA_START:
        case URX_LA_END:
            // Look-ahead.  Just ignore, treat the look-ahead block as if
            // it were normal pattern.  Gives a too-long match length,
            //  but good enough for now.
            break;

            // End of look-ahead ops should always be consumed by the processing at
            //  the URX_LA_START op.
            // UPRV_UNREACHABLE_EXIT;

        case URX_LB_START:
            {
                // Look-behind.  Scan forward until the matching look-around end,
                //   without processing the look-behind block.
                int32_t dataLoc = URX_VAL(op);
                for (loc = loc + 1; loc <= end; ++loc) {
                    op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
                    int32_t scannedType = URX_TYPE(op);
                    if ((scannedType == URX_LA_END || scannedType == URX_LBN_END) && (URX_VAL(op) == dataLoc)) {
                        break;
                    }
                }
                U_ASSERT(loc <= end);
            }
            break;

        default:
            UPRV_UNREACHABLE_EXIT;
        }

        if (currentLen == INT32_MAX) {
            // The maximum length is unbounded.
            // Stop further processing of the pattern.
            break;
        }
    }

    return currentLen;
}


//------------------------------------------------------------------------------
//
//   stripNOPs    Remove any NOP operations from the compiled pattern code.
// Extra NOPs are inserted for some constructs during the initial // code generation to provide locations that may be patched later. // Many end up unneeded, and are removed by this function. // // In order to minimize the number of passes through the pattern, // back-reference fixup is also performed here (adjusting // back-reference operands to point to the correct frame offsets). // //------------------------------------------------------------------------------ void RegexCompile::stripNOPs() { if (U_FAILURE(*fStatus)) { return; } int32_t end = fRXPat->fCompiledPat->size(); UVector32 deltas(end, *fStatus); // Make a first pass over the code, computing the amount that things // will be offset at each location in the original code. int32_t loc; int32_t d = 0; for (loc=0; locfCompiledPat->elementAti(loc); if (URX_TYPE(op) == URX_NOP) { d++; } } UnicodeString caseStringBuffer; // Make a second pass over the code, removing the NOPs by moving following // code up, and patching operands that refer to code locations that // are being moved. The array of offsets from the first step is used // to compute the new operand values. int32_t src; int32_t dst = 0; for (src=0; srcfCompiledPat->elementAti(src); int32_t opType = URX_TYPE(op); switch (opType) { case URX_NOP: break; case URX_STATE_SAVE: case URX_JMP: case URX_CTR_LOOP: case URX_CTR_LOOP_NG: case URX_RELOC_OPRND: case URX_JMPX: case URX_JMP_SAV: case URX_JMP_SAV_X: // These are instructions with operands that refer to code locations. 
{ int32_t operandAddress = URX_VAL(op); U_ASSERT(operandAddress>=0 && operandAddressfCompiledPat->setElementAt(op, dst); dst++; break; } case URX_BACKREF: case URX_BACKREF_I: { int32_t where = URX_VAL(op); if (where > fRXPat->fGroupMap->size()) { error(U_REGEX_INVALID_BACK_REF); break; } where = fRXPat->fGroupMap->elementAti(where-1); op = buildOp(opType, where); fRXPat->fCompiledPat->setElementAt(op, dst); dst++; fRXPat->fNeedsAltInput = true; break; } case URX_RESERVED_OP: case URX_RESERVED_OP_N: case URX_BACKTRACK: case URX_END: case URX_ONECHAR: case URX_STRING: case URX_STRING_LEN: case URX_START_CAPTURE: case URX_END_CAPTURE: case URX_STATIC_SETREF: case URX_STAT_SETREF_N: case URX_SETREF: case URX_DOTANY: case URX_FAIL: case URX_BACKSLASH_B: case URX_BACKSLASH_BU: case URX_BACKSLASH_G: case URX_BACKSLASH_X: case URX_BACKSLASH_Z: case URX_DOTANY_ALL: case URX_BACKSLASH_D: case URX_CARET: case URX_DOLLAR: case URX_CTR_INIT: case URX_CTR_INIT_NG: case URX_DOTANY_UNIX: case URX_STO_SP: case URX_LD_SP: case URX_STO_INP_LOC: case URX_LA_START: case URX_LA_END: case URX_ONECHAR_I: case URX_STRING_I: case URX_DOLLAR_M: case URX_CARET_M: case URX_CARET_M_UNIX: case URX_LB_START: case URX_LB_CONT: case URX_LB_END: case URX_LBN_CONT: case URX_LBN_END: case URX_LOOP_SR_I: case URX_LOOP_DOT_I: case URX_LOOP_C: case URX_DOLLAR_D: case URX_DOLLAR_MD: case URX_BACKSLASH_H: case URX_BACKSLASH_R: case URX_BACKSLASH_V: // These instructions are unaltered by the relocation. fRXPat->fCompiledPat->setElementAt(op, dst); dst++; break; default: // Some op is unaccounted for. UPRV_UNREACHABLE_EXIT; } } fRXPat->fCompiledPat->setSize(dst); } //------------------------------------------------------------------------------ // // Error Report a rule parse error. // Only report it if no previous error has been recorded. 
// //------------------------------------------------------------------------------ void RegexCompile::error(UErrorCode e) { if (U_SUCCESS(*fStatus) || e == U_MEMORY_ALLOCATION_ERROR) { *fStatus = e; // Hmm. fParseErr (UParseError) line & offset fields are int32_t in public // API (see common/unicode/parseerr.h), while fLineNum and fCharNum are // int64_t. If the values of the latter are out of range for the former, // set them to the appropriate "field not supported" values. if (fLineNum > 0x7FFFFFFF) { fParseErr->line = 0; fParseErr->offset = -1; } else if (fCharNum > 0x7FFFFFFF) { fParseErr->line = (int32_t)fLineNum; fParseErr->offset = -1; } else { fParseErr->line = (int32_t)fLineNum; fParseErr->offset = (int32_t)fCharNum; } UErrorCode status = U_ZERO_ERROR; // throwaway status for extracting context // Fill in the context. // Note: extractBetween() pins supplied indices to the string bounds. uprv_memset(fParseErr->preContext, 0, sizeof(fParseErr->preContext)); uprv_memset(fParseErr->postContext, 0, sizeof(fParseErr->postContext)); utext_extract(fRXPat->fPattern, fScanIndex-U_PARSE_CONTEXT_LEN+1, fScanIndex, fParseErr->preContext, U_PARSE_CONTEXT_LEN, &status); utext_extract(fRXPat->fPattern, fScanIndex, fScanIndex+U_PARSE_CONTEXT_LEN-1, fParseErr->postContext, U_PARSE_CONTEXT_LEN, &status); } } // // Assorted Unicode character constants. // Numeric because there is no portable way to enter them as literals. // (Think EBCDIC). // static const char16_t chCR = 0x0d; // New lines, for terminating comments. static const char16_t chLF = 0x0a; // Line Feed static const char16_t chPound = 0x23; // '#', introduces a comment. 
static const char16_t chDigit0 = 0x30; // '0' static const char16_t chDigit7 = 0x37; // '9' static const char16_t chColon = 0x3A; // ':' static const char16_t chE = 0x45; // 'E' static const char16_t chQ = 0x51; // 'Q' //static const char16_t chN = 0x4E; // 'N' static const char16_t chP = 0x50; // 'P' static const char16_t chBackSlash = 0x5c; // '\' introduces a char escape //static const char16_t chLBracket = 0x5b; // '[' static const char16_t chRBracket = 0x5d; // ']' static const char16_t chUp = 0x5e; // '^' static const char16_t chLowerP = 0x70; static const char16_t chLBrace = 0x7b; // '{' static const char16_t chRBrace = 0x7d; // '}' static const char16_t chNEL = 0x85; // NEL newline variant static const char16_t chLS = 0x2028; // Unicode Line Separator //------------------------------------------------------------------------------ // // nextCharLL Low Level Next Char from the regex pattern. // Get a char from the string, keep track of input position // for error reporting. // //------------------------------------------------------------------------------ UChar32 RegexCompile::nextCharLL() { UChar32 ch; if (fPeekChar != -1) { ch = fPeekChar; fPeekChar = -1; return ch; } // assume we're already in the right place ch = UTEXT_NEXT32(fRXPat->fPattern); if (ch == U_SENTINEL) { return ch; } if (ch == chCR || ch == chNEL || ch == chLS || (ch == chLF && fLastChar != chCR)) { // Character is starting a new line. Bump up the line number, and // reset the column to 0. fLineNum++; fCharNum=0; } else { // Character is not starting a new line. Except in the case of a // LF following a CR, increment the column position. if (ch != chLF) { fCharNum++; } } fLastChar = ch; return ch; } //------------------------------------------------------------------------------ // // peekCharLL Low Level Character Scanning, sneak a peek at the next // character without actually getting it. 
// //------------------------------------------------------------------------------ UChar32 RegexCompile::peekCharLL() { if (fPeekChar == -1) { fPeekChar = nextCharLL(); } return fPeekChar; } //------------------------------------------------------------------------------ // // nextChar for pattern scanning. At this level, we handle stripping // out comments and processing some backslash character escapes. // The rest of the pattern grammar is handled at the next level up. // //------------------------------------------------------------------------------ void RegexCompile::nextChar(RegexPatternChar &c) { tailRecursion: fScanIndex = UTEXT_GETNATIVEINDEX(fRXPat->fPattern); c.fChar = nextCharLL(); c.fQuoted = false; if (fQuoteMode) { c.fQuoted = true; if ((c.fChar==chBackSlash && peekCharLL()==chE && ((fModeFlags & UREGEX_LITERAL) == 0)) || c.fChar == (UChar32)-1) { fQuoteMode = false; // Exit quote mode, nextCharLL(); // discard the E // nextChar(c); // recurse to get the real next char goto tailRecursion; // Note: fuzz testing produced testcases that // resulted in stack overflow here. } } else if (fInBackslashQuote) { // The current character immediately follows a '\' // Don't check for any further escapes, just return it as-is. // Don't set c.fQuoted, because that would prevent the state machine from // dispatching on the character. fInBackslashQuote = false; } else { // We are not in a \Q quoted region \E of the source. // if (fModeFlags & UREGEX_COMMENTS) { // // We are in free-spacing and comments mode. // Scan through any white space and comments, until we // reach a significant character or the end of input. for (;;) { if (c.fChar == (UChar32)-1) { break; // End of Input } if (c.fChar == chPound && fEOLComments) { // Start of a comment. 
Consume the rest of it, until EOF or a new line for (;;) { c.fChar = nextCharLL(); if (c.fChar == (UChar32)-1 || // EOF c.fChar == chCR || c.fChar == chLF || c.fChar == chNEL || c.fChar == chLS) { break; } } } // TODO: check what Java & Perl do with non-ASCII white spaces. Ticket 6061. if (PatternProps::isWhiteSpace(c.fChar) == false) { break; } c.fChar = nextCharLL(); } } // // check for backslash escaped characters. // if (c.fChar == chBackSlash) { int64_t pos = UTEXT_GETNATIVEINDEX(fRXPat->fPattern); if (RegexStaticSets::gStaticSets->fUnescapeCharSet.contains(peekCharLL())) { // // A '\' sequence that is handled by ICU's standard unescapeAt function. // Includes \uxxxx, \n, \r, many others. // Return the single equivalent character. // nextCharLL(); // get & discard the peeked char. c.fQuoted = true; if (UTEXT_FULL_TEXT_IN_CHUNK(fRXPat->fPattern, fPatternLength)) { int32_t endIndex = (int32_t)pos; c.fChar = u_unescapeAt(uregex_ucstr_unescape_charAt, &endIndex, (int32_t)fPatternLength, (void *)fRXPat->fPattern->chunkContents); if (endIndex == pos) { error(U_REGEX_BAD_ESCAPE_SEQUENCE); } fCharNum += endIndex - pos; UTEXT_SETNATIVEINDEX(fRXPat->fPattern, endIndex); } else { int32_t offset = 0; struct URegexUTextUnescapeCharContext context = U_REGEX_UTEXT_UNESCAPE_CONTEXT(fRXPat->fPattern); UTEXT_SETNATIVEINDEX(fRXPat->fPattern, pos); c.fChar = u_unescapeAt(uregex_utext_unescape_charAt, &offset, INT32_MAX, &context); if (offset == 0) { error(U_REGEX_BAD_ESCAPE_SEQUENCE); } else if (context.lastOffset == offset) { UTEXT_PREVIOUS32(fRXPat->fPattern); } else if (context.lastOffset != offset-1) { utext_moveIndex32(fRXPat->fPattern, offset - context.lastOffset - 1); } fCharNum += offset; } } else if (peekCharLL() == chDigit0) { // Octal Escape, using Java Regexp Conventions // which are \0 followed by 1-3 octal digits. // Different from ICU Unescape handling of Octal, which does not // require the leading 0. 
// Java also has the convention of only consuming 2 octal digits if // the three digit number would be > 0xff // c.fChar = 0; nextCharLL(); // Consume the initial 0. int index; for (index=0; index<3; index++) { int32_t ch = peekCharLL(); if (chchDigit7) { if (index==0) { // \0 is not followed by any octal digits. error(U_REGEX_BAD_ESCAPE_SEQUENCE); } break; } c.fChar <<= 3; c.fChar += ch&7; if (c.fChar <= 255) { nextCharLL(); } else { // The last digit made the number too big. Forget we saw it. c.fChar >>= 3; } } c.fQuoted = true; } else if (peekCharLL() == chQ) { // "\Q" enter quote mode, which will continue until "\E" fQuoteMode = true; nextCharLL(); // discard the 'Q'. // nextChar(c); // recurse to get the real next char. goto tailRecursion; // Note: fuzz testing produced test cases that // resulted in stack overflow here. } else { // We are in a '\' escape that will be handled by the state table scanner. // Just return the backslash, but remember that the following char is to // be taken literally. fInBackslashQuote = true; } } } // re-enable # to end-of-line comments, in case they were disabled. // They are disabled by the parser upon seeing '(?', but this lasts for // the fetching of the next character only. fEOLComments = true; // putc(c.fChar, stdout); } //------------------------------------------------------------------------------ // // scanNamedChar // Get a UChar32 from a \N{UNICODE CHARACTER NAME} in the pattern. // // The scan position will be at the 'N'. 
On return // the scan position should be just after the '}' // // Return the UChar32 // //------------------------------------------------------------------------------ UChar32 RegexCompile::scanNamedChar() { if (U_FAILURE(*fStatus)) { return 0; } nextChar(fC); if (fC.fChar != chLBrace) { error(U_REGEX_PROPERTY_SYNTAX); return 0; } UnicodeString charName; for (;;) { nextChar(fC); if (fC.fChar == chRBrace) { break; } if (fC.fChar == -1) { error(U_REGEX_PROPERTY_SYNTAX); return 0; } charName.append(fC.fChar); } char name[100]; if (!uprv_isInvariantUString(charName.getBuffer(), charName.length()) || (uint32_t)charName.length()>=sizeof(name)) { // All Unicode character names have only invariant characters. // The API to get a character, given a name, accepts only char *, forcing us to convert, // which requires this error check error(U_REGEX_PROPERTY_SYNTAX); return 0; } charName.extract(0, charName.length(), name, sizeof(name), US_INV); UChar32 theChar = u_charFromName(U_UNICODE_CHAR_NAME, name, fStatus); if (U_FAILURE(*fStatus)) { error(U_REGEX_PROPERTY_SYNTAX); } nextChar(fC); // Continue overall regex pattern processing with char after the '}' return theChar; } //------------------------------------------------------------------------------ // // scanProp Construct a UnicodeSet from the text at the current scan // position, which will be of the form \p{whaterver} // // The scan position will be at the 'p' or 'P'. On return // the scan position should be just after the '}' // // Return a UnicodeSet, constructed from the \P pattern, // or nullptr if the pattern is invalid. // //------------------------------------------------------------------------------ UnicodeSet *RegexCompile::scanProp() { UnicodeSet *uset = nullptr; if (U_FAILURE(*fStatus)) { return nullptr; } (void)chLowerP; // Suppress compiler unused variable warning. 
U_ASSERT(fC.fChar == chLowerP || fC.fChar == chP); UBool negated = (fC.fChar == chP); UnicodeString propertyName; nextChar(fC); if (fC.fChar != chLBrace) { error(U_REGEX_PROPERTY_SYNTAX); return nullptr; } for (;;) { nextChar(fC); if (fC.fChar == chRBrace) { break; } if (fC.fChar == -1) { // Hit the end of the input string without finding the closing '}' error(U_REGEX_PROPERTY_SYNTAX); return nullptr; } propertyName.append(fC.fChar); } uset = createSetForProperty(propertyName, negated); nextChar(fC); // Move input scan to position following the closing '}' return uset; } //------------------------------------------------------------------------------ // // scanPosixProp Construct a UnicodeSet from the text at the current scan // position, which is expected be of the form [:property expression:] // // The scan position will be at the opening ':'. On return // the scan position must be on the closing ']' // // Return a UnicodeSet constructed from the pattern, // or nullptr if this is not a valid POSIX-style set expression. // If not a property expression, restore the initial scan position // (to the opening ':') // // Note: the opening '[:' is not sufficient to guarantee that // this is a [:property:] expression. // [:'+=,] is a perfectly good ordinary set expression that // happens to include ':' as one of its characters. // //------------------------------------------------------------------------------ UnicodeSet *RegexCompile::scanPosixProp() { UnicodeSet *uset = nullptr; if (U_FAILURE(*fStatus)) { return nullptr; } U_ASSERT(fC.fChar == chColon); // Save the scanner state. // TODO: move this into the scanner, with the state encapsulated in some way. 
Ticket 6062 int64_t savedScanIndex = fScanIndex; int64_t savedNextIndex = UTEXT_GETNATIVEINDEX(fRXPat->fPattern); UBool savedQuoteMode = fQuoteMode; UBool savedInBackslashQuote = fInBackslashQuote; UBool savedEOLComments = fEOLComments; int64_t savedLineNum = fLineNum; int64_t savedCharNum = fCharNum; UChar32 savedLastChar = fLastChar; UChar32 savedPeekChar = fPeekChar; RegexPatternChar savedfC = fC; // Scan for a closing ]. A little tricky because there are some perverse // edge cases possible. "[:abc\Qdef:] \E]" is a valid non-property expression, // ending on the second closing ]. UnicodeString propName; UBool negated = false; // Check for and consume the '^' in a negated POSIX property, e.g. [:^Letter:] nextChar(fC); if (fC.fChar == chUp) { negated = true; nextChar(fC); } // Scan for the closing ":]", collecting the property name along the way. UBool sawPropSetTerminator = false; for (;;) { propName.append(fC.fChar); nextChar(fC); if (fC.fQuoted || fC.fChar == -1) { // Escaped characters or end of input - either says this isn't a [:Property:] break; } if (fC.fChar == chColon) { nextChar(fC); if (fC.fChar == chRBracket) { sawPropSetTerminator = true; } break; } } if (sawPropSetTerminator) { uset = createSetForProperty(propName, negated); } else { // No closing ":]". // Restore the original scan position. // The main scanner will retry the input as a normal set expression, // not a [:Property:] expression. fScanIndex = savedScanIndex; fQuoteMode = savedQuoteMode; fInBackslashQuote = savedInBackslashQuote; fEOLComments = savedEOLComments; fLineNum = savedLineNum; fCharNum = savedCharNum; fLastChar = savedLastChar; fPeekChar = savedPeekChar; fC = savedfC; UTEXT_SETNATIVEINDEX(fRXPat->fPattern, savedNextIndex); } return uset; } static inline void addIdentifierIgnorable(UnicodeSet *set, UErrorCode& ec) { set->add(0, 8).add(0x0e, 0x1b).add(0x7f, 0x9f); addCategory(set, U_GC_CF_MASK, ec); } // // Create a Unicode Set from a Unicode Property expression. 
// This is common code underlying both \p{...} and [:...:] expressions. // Includes trying the Java "properties" that aren't supported as // normal ICU UnicodeSet properties // UnicodeSet *RegexCompile::createSetForProperty(const UnicodeString &propName, UBool negated) { if (U_FAILURE(*fStatus)) { return nullptr; } LocalPointer set; UErrorCode status = U_ZERO_ERROR; do { // non-loop, exists to allow breaks from the block. // // First try the property as we received it // UnicodeString setExpr; uint32_t usetFlags = 0; setExpr.append(u"[\\p{", -1); setExpr.append(propName); setExpr.append(u"}]", -1); if (fModeFlags & UREGEX_CASE_INSENSITIVE) { usetFlags |= USET_CASE_INSENSITIVE; } set.adoptInsteadAndCheckErrorCode(new UnicodeSet(setExpr, usetFlags, nullptr, status), status); if (U_SUCCESS(status) || status == U_MEMORY_ALLOCATION_ERROR) { break; } // // The incoming property wasn't directly recognized by ICU. // Check [:word:] and [:all:]. These are not recognized as a properties by ICU UnicodeSet. // Java accepts 'word' with mixed case. // Java accepts 'all' only in all lower case. status = U_ZERO_ERROR; if (propName.caseCompare(u"word", -1, 0) == 0) { set.adoptInsteadAndCheckErrorCode( RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET].cloneAsThawed(), status); break; } if (propName.compare(u"all", -1) == 0) { set.adoptInsteadAndCheckErrorCode(new UnicodeSet(0, 0x10ffff), status); break; } // Do Java InBlock expressions // UnicodeString mPropName = propName; if (mPropName.startsWith(u"In", 2) && mPropName.length() >= 3) { status = U_ZERO_ERROR; set.adoptInsteadAndCheckErrorCode(new UnicodeSet(), status); if (U_FAILURE(status)) { break; } UnicodeString blockName(mPropName, 2); // Property with the leading "In" removed. set->applyPropertyAlias(UnicodeString(u"Block"), blockName, status); break; } // Check for the Java form "IsBooleanPropertyValue", which we will recast // as "BooleanPropertyValue". 
The property value can be either a // a General Category or a Script Name. if (propName.startsWith(u"Is", 2) && propName.length()>=3) { mPropName.remove(0, 2); // Strip the "Is" if (mPropName.indexOf(u'=') >= 0) { // Reject any "Is..." property expression containing an '=', that is, // any non-binary property expression. status = U_REGEX_PROPERTY_SYNTAX; break; } if (mPropName.caseCompare(u"assigned", -1, 0) == 0) { mPropName.setTo(u"unassigned", -1); negated = !negated; } else if (mPropName.caseCompare(u"TitleCase", -1, 0) == 0) { mPropName.setTo(u"Titlecase_Letter", -1); } mPropName.insert(0, u"[\\p{", -1); mPropName.append(u"}]", -1); set.adoptInsteadAndCheckErrorCode(new UnicodeSet(mPropName, *fStatus), status); if (U_SUCCESS(status) && !set->isEmpty() && (usetFlags & USET_CASE_INSENSITIVE)) { set->closeOver(USET_CASE_INSENSITIVE); } break; } if (propName.startsWith(u"java", -1)) { status = U_ZERO_ERROR; set.adoptInsteadAndCheckErrorCode(new UnicodeSet(), status); if (U_FAILURE(status)) { break; } // // Try the various Java specific properties. 
// These all begin with "java" // if (propName.compare(u"javaDefined", -1) == 0) { addCategory(set.getAlias(), U_GC_CN_MASK, status); set->complement(); } else if (propName.compare(u"javaDigit", -1) == 0) { addCategory(set.getAlias(), U_GC_ND_MASK, status); } else if (propName.compare(u"javaIdentifierIgnorable", -1) == 0) { addIdentifierIgnorable(set.getAlias(), status); } else if (propName.compare(u"javaISOControl", -1) == 0) { set->add(0, 0x1F).add(0x7F, 0x9F); } else if (propName.compare(u"javaJavaIdentifierPart", -1) == 0) { addCategory(set.getAlias(), U_GC_L_MASK, status); addCategory(set.getAlias(), U_GC_SC_MASK, status); addCategory(set.getAlias(), U_GC_PC_MASK, status); addCategory(set.getAlias(), U_GC_ND_MASK, status); addCategory(set.getAlias(), U_GC_NL_MASK, status); addCategory(set.getAlias(), U_GC_MC_MASK, status); addCategory(set.getAlias(), U_GC_MN_MASK, status); addIdentifierIgnorable(set.getAlias(), status); } else if (propName.compare(u"javaJavaIdentifierStart", -1) == 0) { addCategory(set.getAlias(), U_GC_L_MASK, status); addCategory(set.getAlias(), U_GC_NL_MASK, status); addCategory(set.getAlias(), U_GC_SC_MASK, status); addCategory(set.getAlias(), U_GC_PC_MASK, status); } else if (propName.compare(u"javaLetter", -1) == 0) { addCategory(set.getAlias(), U_GC_L_MASK, status); } else if (propName.compare(u"javaLetterOrDigit", -1) == 0) { addCategory(set.getAlias(), U_GC_L_MASK, status); addCategory(set.getAlias(), U_GC_ND_MASK, status); } else if (propName.compare(u"javaLowerCase", -1) == 0) { addCategory(set.getAlias(), U_GC_LL_MASK, status); } else if (propName.compare(u"javaMirrored", -1) == 0) { set->applyIntPropertyValue(UCHAR_BIDI_MIRRORED, 1, status); } else if (propName.compare(u"javaSpaceChar", -1) == 0) { addCategory(set.getAlias(), U_GC_Z_MASK, status); } else if (propName.compare(u"javaSupplementaryCodePoint", -1) == 0) { set->add(0x10000, UnicodeSet::MAX_VALUE); } else if (propName.compare(u"javaTitleCase", -1) == 0) { 
addCategory(set.getAlias(), U_GC_LT_MASK, status); } else if (propName.compare(u"javaUnicodeIdentifierStart", -1) == 0) { addCategory(set.getAlias(), U_GC_L_MASK, status); addCategory(set.getAlias(), U_GC_NL_MASK, status); } else if (propName.compare(u"javaUnicodeIdentifierPart", -1) == 0) { addCategory(set.getAlias(), U_GC_L_MASK, status); addCategory(set.getAlias(), U_GC_PC_MASK, status); addCategory(set.getAlias(), U_GC_ND_MASK, status); addCategory(set.getAlias(), U_GC_NL_MASK, status); addCategory(set.getAlias(), U_GC_MC_MASK, status); addCategory(set.getAlias(), U_GC_MN_MASK, status); addIdentifierIgnorable(set.getAlias(), status); } else if (propName.compare(u"javaUpperCase", -1) == 0) { addCategory(set.getAlias(), U_GC_LU_MASK, status); } else if (propName.compare(u"javaValidCodePoint", -1) == 0) { set->add(0, UnicodeSet::MAX_VALUE); } else if (propName.compare(u"javaWhitespace", -1) == 0) { addCategory(set.getAlias(), U_GC_Z_MASK, status); set->removeAll(UnicodeSet().add(0xa0).add(0x2007).add(0x202f)); set->add(9, 0x0d).add(0x1c, 0x1f); } else { status = U_REGEX_PROPERTY_SYNTAX; } if (U_SUCCESS(status) && !set->isEmpty() && (usetFlags & USET_CASE_INSENSITIVE)) { set->closeOver(USET_CASE_INSENSITIVE); } break; } // Unrecognized property. ICU didn't like it as it was, and none of the Java compatibility // extensions matched it. status = U_REGEX_PROPERTY_SYNTAX; } while (false); // End of do loop block. Code above breaks out of the block on success or hard failure. if (U_SUCCESS(status)) { // ICU 70 adds emoji properties of strings, but as long as Java does not say how to // deal with properties of strings and character classes with strings, we ignore them. // Just in case something downstream might stumble over the strings, // we remove them from the set. // Note that when we support strings, the complement of a property (as with \P) // should be implemented as .complement().removeAllStrings() (code point complement). 
set->removeAllStrings(); U_ASSERT(set.isValid()); if (negated) { set->complement(); } return set.orphan(); } else { if (status == U_ILLEGAL_ARGUMENT_ERROR) { status = U_REGEX_PROPERTY_SYNTAX; } error(status); return nullptr; } } // // SetEval Part of the evaluation of [set expressions]. // Perform any pending (stacked) operations with precedence // equal or greater to that of the next operator encountered // in the expression. // void RegexCompile::setEval(int32_t nextOp) { UnicodeSet *rightOperand = nullptr; UnicodeSet *leftOperand = nullptr; for (;;) { U_ASSERT(fSetOpStack.empty()==false); int32_t pendingSetOperation = fSetOpStack.peeki(); if ((pendingSetOperation&0xffff0000) < (nextOp&0xffff0000)) { break; } fSetOpStack.popi(); U_ASSERT(fSetStack.empty() == false); rightOperand = (UnicodeSet *)fSetStack.peek(); // ICU 70 adds emoji properties of strings, but createSetForProperty() removes all strings // (see comments there). // We also do not yet support string literals in character classes, // so there should not be any strings. // Note that when we support strings, the complement of a set (as with ^ or \P) // should be implemented as .complement().removeAllStrings() (code point complement). U_ASSERT(!rightOperand->hasStrings()); switch (pendingSetOperation) { case setNegation: rightOperand->complement(); break; case setCaseClose: // TODO: need a simple close function. 
namespace {

// Bit values stored in DecimalQuantity::flags.
int8_t NEGATIVE_FLAG = 1;
int8_t INFINITY_FLAG = 2;
int8_t NAN_FLAG = 4;

/**
 * Helper function for safe subtraction (no overflow).
 *
 * Returns a - b, saturating at INT32_MAX / INT32_MIN when the exact
 * difference does not fit in an int32_t.
 */
inline int32_t safeSubtract(int32_t a, int32_t b) {
    // Note: In C++, signed integer overflow is undefined behavior, so the
    // subtraction is carried out on unsigned values, which wrap around.
    int32_t diff = static_cast<int32_t>(static_cast<uint32_t>(a) - static_cast<uint32_t>(b));
    // Subtracting a negative number must not make the result smaller,
    // and subtracting a positive number must not make it larger.
    if (b < 0 && diff < a) { return INT32_MAX; }
    if (b > 0 && diff > a) { return INT32_MIN; }
    return diff;
}

// Powers of ten 1e0 .. 1e21, indexed by exponent.
static double DOUBLE_MULTIPLIERS[] = {
        1e0,
        1e1,
        1e2,
        1e3,
        1e4,
        1e5,
        1e6,
        1e7,
        1e8,
        1e9,
        1e10,
        1e11,
        1e12,
        1e13,
        1e14,
        1e15,
        1e16,
        1e17,
        1e18,
        1e19,
        1e20,
        1e21};

}  // namespace
if (minInt < lReqPos) { minInt = lReqPos; } // Save values into internal state lReqPos = minInt; } void DecimalQuantity::setMinFraction(int32_t minFrac) { // Validation should happen outside of DecimalQuantity, e.g., in the Precision class. U_ASSERT(minFrac >= 0); // Save values into internal state // Negation is safe for minFrac/maxFrac because -Integer.MAX_VALUE > Integer.MIN_VALUE rReqPos = -minFrac; } void DecimalQuantity::applyMaxInteger(int32_t maxInt) { // Validation should happen outside of DecimalQuantity, e.g., in the Precision class. U_ASSERT(maxInt >= 0); if (precision == 0) { return; } if (maxInt <= scale) { setBcdToZero(); return; } int32_t magnitude = getMagnitude(); if (maxInt <= magnitude) { popFromLeft(magnitude - maxInt + 1); compact(); } } uint64_t DecimalQuantity::getPositionFingerprint() const { uint64_t fingerprint = 0; fingerprint ^= (lReqPos << 16); fingerprint ^= (static_cast(rReqPos) << 32); return fingerprint; } void DecimalQuantity::roundToIncrement( uint64_t increment, digits_t magnitude, RoundingMode roundingMode, UErrorCode& status) { // Do not call this method with an increment having only a 1 or a 5 digit! // Use a more efficient call to either roundToMagnitude() or roundToNickel(). // Check a few popular rounding increments; a more thorough check is in Java. U_ASSERT(increment != 1); U_ASSERT(increment != 5); DecimalQuantity incrementDQ; incrementDQ.setToLong(increment); incrementDQ.adjustMagnitude(magnitude); DecNum incrementDN; incrementDQ.toDecNum(incrementDN, status); if (U_FAILURE(status)) { return; } // Divide this DecimalQuantity by the increment, round, then multiply back. 
divideBy(incrementDN, status); if (U_FAILURE(status)) { return; } roundToMagnitude(0, roundingMode, status); if (U_FAILURE(status)) { return; } multiplyBy(incrementDN, status); if (U_FAILURE(status)) { return; } } void DecimalQuantity::multiplyBy(const DecNum& multiplicand, UErrorCode& status) { if (isZeroish()) { return; } // Convert to DecNum, multiply, and convert back. DecNum decnum; toDecNum(decnum, status); if (U_FAILURE(status)) { return; } decnum.multiplyBy(multiplicand, status); if (U_FAILURE(status)) { return; } setToDecNum(decnum, status); } void DecimalQuantity::divideBy(const DecNum& divisor, UErrorCode& status) { if (isZeroish()) { return; } // Convert to DecNum, multiply, and convert back. DecNum decnum; toDecNum(decnum, status); if (U_FAILURE(status)) { return; } decnum.divideBy(divisor, status); if (U_FAILURE(status)) { return; } setToDecNum(decnum, status); } void DecimalQuantity::negate() { flags ^= NEGATIVE_FLAG; } int32_t DecimalQuantity::getMagnitude() const { U_ASSERT(precision != 0); return scale + precision - 1; } bool DecimalQuantity::adjustMagnitude(int32_t delta) { if (precision != 0) { // i.e., scale += delta; origDelta += delta bool overflow = uprv_add32_overflow(scale, delta, &scale); overflow = uprv_add32_overflow(origDelta, delta, &origDelta) || overflow; // Make sure that precision + scale won't overflow, either int32_t dummy; overflow = overflow || uprv_add32_overflow(scale, precision, &dummy); return overflow; } return false; } int32_t DecimalQuantity::adjustToZeroScale() { int32_t retval = scale; scale = 0; return retval; } double DecimalQuantity::getPluralOperand(PluralOperand operand) const { // If this assertion fails, you need to call roundToInfinity() or some other rounding method. // See the comment at the top of this file explaining the "isApproximate" field. U_ASSERT(!isApproximate); switch (operand) { case PLURAL_OPERAND_I: // Invert the negative sign if necessary return static_cast(isNegative() ? 
-toLong(true) : toLong(true)); case PLURAL_OPERAND_F: return static_cast(toFractionLong(true)); case PLURAL_OPERAND_T: return static_cast(toFractionLong(false)); case PLURAL_OPERAND_V: return fractionCount(); case PLURAL_OPERAND_W: return fractionCountWithoutTrailingZeros(); case PLURAL_OPERAND_E: return static_cast(getExponent()); case PLURAL_OPERAND_C: // Plural operand `c` is currently an alias for `e`. return static_cast(getExponent()); default: return std::abs(toDouble()); } } int32_t DecimalQuantity::getExponent() const { return exponent; } void DecimalQuantity::adjustExponent(int delta) { exponent = exponent + delta; } void DecimalQuantity::resetExponent() { adjustMagnitude(exponent); exponent = 0; } bool DecimalQuantity::hasIntegerValue() const { return scale >= 0; } int32_t DecimalQuantity::getUpperDisplayMagnitude() const { // If this assertion fails, you need to call roundToInfinity() or some other rounding method. // See the comment in the header file explaining the "isApproximate" field. U_ASSERT(!isApproximate); int32_t magnitude = scale + precision; int32_t result = (lReqPos > magnitude) ? lReqPos : magnitude; return result - 1; } int32_t DecimalQuantity::getLowerDisplayMagnitude() const { // If this assertion fails, you need to call roundToInfinity() or some other rounding method. // See the comment in the header file explaining the "isApproximate" field. U_ASSERT(!isApproximate); int32_t magnitude = scale; int32_t result = (rReqPos < magnitude) ? rReqPos : magnitude; return result; } int8_t DecimalQuantity::getDigit(int32_t magnitude) const { // If this assertion fails, you need to call roundToInfinity() or some other rounding method. // See the comment at the top of this file explaining the "isApproximate" field. U_ASSERT(!isApproximate); return getDigitPos(magnitude - scale); } int32_t DecimalQuantity::fractionCount() const { int32_t fractionCountWithExponent = -getLowerDisplayMagnitude() - exponent; return fractionCountWithExponent > 0 ? 
fractionCountWithExponent : 0; } int32_t DecimalQuantity::fractionCountWithoutTrailingZeros() const { int32_t fractionCountWithExponent = -scale - exponent; return fractionCountWithExponent > 0 ? fractionCountWithExponent : 0; // max(-fractionCountWithExponent, 0) } bool DecimalQuantity::isNegative() const { return (flags & NEGATIVE_FLAG) != 0; } Signum DecimalQuantity::signum() const { bool isZero = (isZeroish() && !isInfinite()); bool isNeg = isNegative(); if (isZero && isNeg) { return SIGNUM_NEG_ZERO; } else if (isZero) { return SIGNUM_POS_ZERO; } else if (isNeg) { return SIGNUM_NEG; } else { return SIGNUM_POS; } } bool DecimalQuantity::isInfinite() const { return (flags & INFINITY_FLAG) != 0; } bool DecimalQuantity::isNaN() const { return (flags & NAN_FLAG) != 0; } bool DecimalQuantity::isZeroish() const { return precision == 0; } DecimalQuantity &DecimalQuantity::setToInt(int32_t n) { setBcdToZero(); flags = 0; if (n == INT32_MIN) { flags |= NEGATIVE_FLAG; // leave as INT32_MIN; handled below in _setToInt() } else if (n < 0) { flags |= NEGATIVE_FLAG; n = -n; } if (n != 0) { _setToInt(n); compact(); } return *this; } void DecimalQuantity::_setToInt(int32_t n) { if (n == INT32_MIN) { readLongToBcd(-static_cast(n)); } else { readIntToBcd(n); } } DecimalQuantity &DecimalQuantity::setToLong(int64_t n) { setBcdToZero(); flags = 0; if (n < 0 && n > INT64_MIN) { flags |= NEGATIVE_FLAG; n = -n; } if (n != 0) { _setToLong(n); compact(); } return *this; } void DecimalQuantity::_setToLong(int64_t n) { if (n == INT64_MIN) { DecNum decnum; UErrorCode localStatus = U_ZERO_ERROR; decnum.setTo("9.223372036854775808E+18", localStatus); if (U_FAILURE(localStatus)) { return; } // unexpected flags |= NEGATIVE_FLAG; readDecNumberToBcd(decnum); } else if (n <= INT32_MAX) { readIntToBcd(static_cast(n)); } else { readLongToBcd(n); } } DecimalQuantity &DecimalQuantity::setToDouble(double n) { setBcdToZero(); flags = 0; // signbit() from handles +0.0 vs -0.0 if (std::signbit(n)) { flags 
|= NEGATIVE_FLAG; n = -n; } if (std::isnan(n) != 0) { flags |= NAN_FLAG; } else if (std::isfinite(n) == 0) { flags |= INFINITY_FLAG; } else if (n != 0) { _setToDoubleFast(n); compact(); } return *this; } void DecimalQuantity::_setToDoubleFast(double n) { isApproximate = true; origDouble = n; origDelta = 0; // Make sure the double is an IEEE 754 double. If not, fall back to the slow path right now. // TODO: Make a fast path for other types of doubles. if (!std::numeric_limits::is_iec559) { convertToAccurateDouble(); return; } // To get the bits from the double, use memcpy, which takes care of endianness. uint64_t ieeeBits; uprv_memcpy(&ieeeBits, &n, sizeof(n)); int32_t exponent = static_cast((ieeeBits & 0x7ff0000000000000L) >> 52) - 0x3ff; // Not all integers can be represented exactly for exponent > 52 if (exponent <= 52 && static_cast(n) == n) { _setToLong(static_cast(n)); return; } if (exponent == -1023 || exponent == 1024) { // The extreme values of exponent are special; use slow path. convertToAccurateDouble(); return; } // 3.3219... is log2(10) auto fracLength = static_cast ((52 - exponent) / 3.32192809488736234787031942948939017586); if (fracLength >= 0) { int32_t i = fracLength; // 1e22 is the largest exact double. for (; i >= 22; i -= 22) n *= 1e22; n *= DOUBLE_MULTIPLIERS[i]; } else { int32_t i = fracLength; // 1e22 is the largest exact double. for (; i <= -22; i += 22) n /= 1e22; n /= DOUBLE_MULTIPLIERS[-i]; } auto result = static_cast(uprv_round(n)); if (result != 0) { _setToLong(result); scale -= fracLength; } } void DecimalQuantity::convertToAccurateDouble() { U_ASSERT(origDouble != 0); int32_t delta = origDelta; // Call the slow oracle function (Double.toString in Java, DoubleToAscii in C++). 
char buffer[DoubleToStringConverter::kBase10MaximalLength + 1]; bool sign; // unused; always positive int32_t length; int32_t point; DoubleToStringConverter::DoubleToAscii( origDouble, DoubleToStringConverter::DtoaMode::SHORTEST, 0, buffer, sizeof(buffer), &sign, &length, &point ); setBcdToZero(); readDoubleConversionToBcd(buffer, length, point); scale += delta; explicitExactDouble = true; } DecimalQuantity &DecimalQuantity::setToDecNumber(StringPiece n, UErrorCode& status) { setBcdToZero(); flags = 0; // Compute the decNumber representation DecNum decnum; decnum.setTo(n, status); _setToDecNum(decnum, status); return *this; } DecimalQuantity& DecimalQuantity::setToDecNum(const DecNum& decnum, UErrorCode& status) { setBcdToZero(); flags = 0; _setToDecNum(decnum, status); return *this; } void DecimalQuantity::_setToDecNum(const DecNum& decnum, UErrorCode& status) { if (U_FAILURE(status)) { return; } if (decnum.isNegative()) { flags |= NEGATIVE_FLAG; } if (decnum.isNaN()) { flags |= NAN_FLAG; } else if (decnum.isInfinity()) { flags |= INFINITY_FLAG; } else if (!decnum.isZero()) { readDecNumberToBcd(decnum); compact(); } } DecimalQuantity DecimalQuantity::fromExponentString(UnicodeString num, UErrorCode& status) { if (num.indexOf(u'e') >= 0 || num.indexOf(u'c') >= 0 || num.indexOf(u'E') >= 0 || num.indexOf(u'C') >= 0) { int32_t ePos = num.lastIndexOf('e'); if (ePos < 0) { ePos = num.lastIndexOf('c'); } if (ePos < 0) { ePos = num.lastIndexOf('E'); } if (ePos < 0) { ePos = num.lastIndexOf('C'); } int32_t expNumPos = ePos + 1; UnicodeString exponentStr = num.tempSubString(expNumPos, num.length() - expNumPos); // parse exponentStr into exponent, but note that parseAsciiInteger doesn't handle the minus sign bool isExpStrNeg = num[expNumPos] == u'-'; int32_t exponentParsePos = isExpStrNeg ? 1 : 0; int32_t exponent = ICU_Utility::parseAsciiInteger(exponentStr, exponentParsePos); exponent = isExpStrNeg ? 
-exponent : exponent; // Compute the decNumber representation UnicodeString fractionStr = num.tempSubString(0, ePos); CharString fracCharStr = CharString(); fracCharStr.appendInvariantChars(fractionStr, status); DecNum decnum; decnum.setTo(fracCharStr.toStringPiece(), status); // Clear and set this DecimalQuantity instance DecimalQuantity dq; dq.setToDecNum(decnum, status); int32_t numFracDigit = getVisibleFractionCount(fractionStr); dq.setMinFraction(numFracDigit); dq.adjustExponent(exponent); return dq; } else { DecimalQuantity dq; int numFracDigit = getVisibleFractionCount(num); CharString numCharStr = CharString(); numCharStr.appendInvariantChars(num, status); dq.setToDecNumber(numCharStr.toStringPiece(), status); dq.setMinFraction(numFracDigit); return dq; } } int32_t DecimalQuantity::getVisibleFractionCount(UnicodeString value) { int decimalPos = value.indexOf('.') + 1; if (decimalPos == 0) { return 0; } else { return value.length() - decimalPos; } } int64_t DecimalQuantity::toLong(bool truncateIfOverflow) const { // NOTE: Call sites should be guarded by fitsInLong(), like this: // if (dq.fitsInLong()) { /* use dq.toLong() */ } else { /* use some fallback */ } // Fallback behavior upon truncateIfOverflow is to truncate at 17 digits. 
uint64_t result = 0L; int32_t upperMagnitude = exponent + scale + precision - 1; if (truncateIfOverflow) { upperMagnitude = std::min(upperMagnitude, 17); } for (int32_t magnitude = upperMagnitude; magnitude >= 0; magnitude--) { result = result * 10 + getDigitPos(magnitude - scale - exponent); } if (isNegative()) { return static_cast(0LL - result); // i.e., -result } return static_cast(result); } uint64_t DecimalQuantity::toFractionLong(bool includeTrailingZeros) const { uint64_t result = 0L; int32_t magnitude = -1 - exponent; int32_t lowerMagnitude = scale; if (includeTrailingZeros) { lowerMagnitude = std::min(lowerMagnitude, rReqPos); } for (; magnitude >= lowerMagnitude && result <= 1e18L; magnitude--) { result = result * 10 + getDigitPos(magnitude - scale); } // Remove trailing zeros; this can happen during integer overflow cases. if (!includeTrailingZeros) { while (result > 0 && (result % 10) == 0) { result /= 10; } } return result; } bool DecimalQuantity::fitsInLong(bool ignoreFraction) const { if (isInfinite() || isNaN()) { return false; } if (isZeroish()) { return true; } if (exponent + scale < 0 && !ignoreFraction) { return false; } int magnitude = getMagnitude(); if (magnitude < 18) { return true; } if (magnitude > 18) { return false; } // Hard case: the magnitude is 10^18. // The largest int64 is: 9,223,372,036,854,775,807 for (int p = 0; p < precision; p++) { int8_t digit = getDigit(18 - p); static int8_t INT64_BCD[] = { 9, 2, 2, 3, 3, 7, 2, 0, 3, 6, 8, 5, 4, 7, 7, 5, 8, 0, 8 }; if (digit < INT64_BCD[p]) { return true; } else if (digit > INT64_BCD[p]) { return false; } } // Exactly equal to max long plus one. return isNegative(); } double DecimalQuantity::toDouble() const { // If this assertion fails, you need to call roundToInfinity() or some other rounding method. // See the comment in the header file explaining the "isApproximate" field. U_ASSERT(!isApproximate); if (isNaN()) { return NAN; } else if (isInfinite()) { return isNegative() ? 
-INFINITY : INFINITY; } // We are processing well-formed input, so we don't need any special options to StringToDoubleConverter. StringToDoubleConverter converter(0, 0, 0, "", ""); UnicodeString numberString = this->toScientificString(); int32_t count; return converter.StringToDouble( reinterpret_cast(numberString.getBuffer()), numberString.length(), &count); } DecNum& DecimalQuantity::toDecNum(DecNum& output, UErrorCode& status) const { // Special handling for zero if (precision == 0) { output.setTo("0", status); return output; } // Use the BCD constructor. We need to do a little bit of work to convert, though. // The decNumber constructor expects most-significant first, but we store least-significant first. MaybeStackArray ubcd(precision, status); if (U_FAILURE(status)) { return output; } for (int32_t m = 0; m < precision; m++) { ubcd[precision - m - 1] = static_cast(getDigitPos(m)); } output.setTo(ubcd.getAlias(), precision, scale, isNegative(), status); return output; } void DecimalQuantity::truncate() { if (scale < 0) { shiftRight(-scale); scale = 0; compact(); } } void DecimalQuantity::roundToNickel(int32_t magnitude, RoundingMode roundingMode, UErrorCode& status) { roundToMagnitude(magnitude, roundingMode, true, status); } void DecimalQuantity::roundToMagnitude(int32_t magnitude, RoundingMode roundingMode, UErrorCode& status) { roundToMagnitude(magnitude, roundingMode, false, status); } void DecimalQuantity::roundToMagnitude(int32_t magnitude, RoundingMode roundingMode, bool nickel, UErrorCode& status) { // The position in the BCD at which rounding will be performed; digits to the right of position // will be rounded away. int position = safeSubtract(magnitude, scale); // "trailing" = least significant digit to the left of rounding int8_t trailingDigit = getDigitPos(position); if (position <= 0 && !isApproximate && (!nickel || trailingDigit == 0 || trailingDigit == 5)) { // All digits are to the left of the rounding magnitude. 
} else if (precision == 0) { // No rounding for zero. } else { // Perform rounding logic. // "leading" = most significant digit to the right of rounding int8_t leadingDigit = getDigitPos(safeSubtract(position, 1)); // Compute which section of the number we are in. // EDGE means we are at the bottom or top edge, like 1.000 or 1.999 (used by doubles) // LOWER means we are between the bottom edge and the midpoint, like 1.391 // MIDPOINT means we are exactly in the middle, like 1.500 // UPPER means we are between the midpoint and the top edge, like 1.916 roundingutils::Section section; if (!isApproximate) { if (nickel && trailingDigit != 2 && trailingDigit != 7) { // Nickel rounding, and not at .02x or .07x if (trailingDigit < 2) { // .00, .01 => down to .00 section = roundingutils::SECTION_LOWER; } else if (trailingDigit < 5) { // .03, .04 => up to .05 section = roundingutils::SECTION_UPPER; } else if (trailingDigit < 7) { // .05, .06 => down to .05 section = roundingutils::SECTION_LOWER; } else { // .08, .09 => up to .10 section = roundingutils::SECTION_UPPER; } } else if (leadingDigit < 5) { // Includes nickel rounding .020-.024 and .070-.074 section = roundingutils::SECTION_LOWER; } else if (leadingDigit > 5) { // Includes nickel rounding .026-.029 and .076-.079 section = roundingutils::SECTION_UPPER; } else { // Includes nickel rounding .025 and .075 section = roundingutils::SECTION_MIDPOINT; for (int p = safeSubtract(position, 2); p >= 0; p--) { if (getDigitPos(p) != 0) { section = roundingutils::SECTION_UPPER; break; } } } } else { int32_t p = safeSubtract(position, 2); int32_t minP = uprv_max(0, precision - 14); if (leadingDigit == 0 && (!nickel || trailingDigit == 0 || trailingDigit == 5)) { section = roundingutils::SECTION_LOWER_EDGE; for (; p >= minP; p--) { if (getDigitPos(p) != 0) { section = roundingutils::SECTION_LOWER; break; } } } else if (leadingDigit == 4 && (!nickel || trailingDigit == 2 || trailingDigit == 7)) { section = 
roundingutils::SECTION_MIDPOINT; for (; p >= minP; p--) { if (getDigitPos(p) != 9) { section = roundingutils::SECTION_LOWER; break; } } } else if (leadingDigit == 5 && (!nickel || trailingDigit == 2 || trailingDigit == 7)) { section = roundingutils::SECTION_MIDPOINT; for (; p >= minP; p--) { if (getDigitPos(p) != 0) { section = roundingutils::SECTION_UPPER; break; } } } else if (leadingDigit == 9 && (!nickel || trailingDigit == 4 || trailingDigit == 9)) { section = roundingutils::SECTION_UPPER_EDGE; for (; p >= minP; p--) { if (getDigitPos(p) != 9) { section = roundingutils::SECTION_UPPER; break; } } } else if (nickel && trailingDigit != 2 && trailingDigit != 7) { // Nickel rounding, and not at .02x or .07x if (trailingDigit < 2) { // .00, .01 => down to .00 section = roundingutils::SECTION_LOWER; } else if (trailingDigit < 5) { // .03, .04 => up to .05 section = roundingutils::SECTION_UPPER; } else if (trailingDigit < 7) { // .05, .06 => down to .05 section = roundingutils::SECTION_LOWER; } else { // .08, .09 => up to .10 section = roundingutils::SECTION_UPPER; } } else if (leadingDigit < 5) { // Includes nickel rounding .020-.024 and .070-.074 section = roundingutils::SECTION_LOWER; } else { // Includes nickel rounding .026-.029 and .076-.079 section = roundingutils::SECTION_UPPER; } bool roundsAtMidpoint = roundingutils::roundsAtMidpoint(roundingMode); if (safeSubtract(position, 1) < precision - 14 || (roundsAtMidpoint && section == roundingutils::SECTION_MIDPOINT) || (!roundsAtMidpoint && section < 0 /* i.e. at upper or lower edge */)) { // Oops! This means that we have to get the exact representation of the double, // because the zone of uncertainty is along the rounding boundary. convertToAccurateDouble(); roundToMagnitude(magnitude, roundingMode, nickel, status); // start over return; } // Turn off the approximate double flag, since the value is now confirmed to be exact. 
isApproximate = false; origDouble = 0.0; origDelta = 0; if (position <= 0 && (!nickel || trailingDigit == 0 || trailingDigit == 5)) { // All digits are to the left of the rounding magnitude. return; } // Good to continue rounding. if (section == -1) { section = roundingutils::SECTION_LOWER; } if (section == -2) { section = roundingutils::SECTION_UPPER; } } // Nickel rounding "half even" goes to the nearest whole (away from the 5). bool isEven = nickel ? (trailingDigit < 2 || trailingDigit > 7 || (trailingDigit == 2 && section != roundingutils::SECTION_UPPER) || (trailingDigit == 7 && section == roundingutils::SECTION_UPPER)) : (trailingDigit % 2) == 0; bool roundDown = roundingutils::getRoundingDirection(isEven, isNegative(), section, roundingMode, status); if (U_FAILURE(status)) { return; } // Perform truncation if (position >= precision) { U_ASSERT(trailingDigit == 0); setBcdToZero(); scale = magnitude; } else { shiftRight(position); } if (nickel) { if (trailingDigit < 5 && roundDown) { setDigitPos(0, 0); compact(); return; } else if (trailingDigit >= 5 && !roundDown) { setDigitPos(0, 9); trailingDigit = 9; // do not return: use the bubbling logic below } else { setDigitPos(0, 5); // If the quantity was set to 0, we may need to restore a digit. if (precision == 0) { precision = 1; } // compact not necessary: digit at position 0 is nonzero return; } } // Bubble the result to the higher digits if (!roundDown) { if (trailingDigit == 9) { int bubblePos = 0; // Note: in the long implementation, the most digits BCD can have at this point is // 15, so bubblePos <= 15 and getDigitPos(bubblePos) is safe. 
for (; getDigitPos(bubblePos) == 9; bubblePos++) {} shiftRight(bubblePos); // shift off the trailing 9s } int8_t digit0 = getDigitPos(0); U_ASSERT(digit0 != 9); setDigitPos(0, static_cast(digit0 + 1)); precision += 1; // in case an extra digit got added } compact(); } } void DecimalQuantity::roundToInfinity() { if (isApproximate) { convertToAccurateDouble(); } } void DecimalQuantity::appendDigit(int8_t value, int32_t leadingZeros, bool appendAsInteger) { U_ASSERT(leadingZeros >= 0); // Zero requires special handling to maintain the invariant that the least-significant digit // in the BCD is nonzero. if (value == 0) { if (appendAsInteger && precision != 0) { scale += leadingZeros + 1; } return; } // Deal with trailing zeros if (scale > 0) { leadingZeros += scale; if (appendAsInteger) { scale = 0; } } // Append digit shiftLeft(leadingZeros + 1); setDigitPos(0, value); // Fix scale if in integer mode if (appendAsInteger) { scale += leadingZeros + 1; } } UnicodeString DecimalQuantity::toPlainString() const { U_ASSERT(!isApproximate); UnicodeString sb; if (isNegative()) { sb.append(u'-'); } if (precision == 0) { sb.append(u'0'); return sb; } int32_t upper = scale + precision + exponent - 1; int32_t lower = scale + exponent; if (upper < lReqPos - 1) { upper = lReqPos - 1; } if (lower > rReqPos) { lower = rReqPos; } int32_t p = upper; if (p < 0) { sb.append(u'0'); } for (; p >= 0; p--) { sb.append(u'0' + getDigitPos(p - scale - exponent)); } if (lower < 0) { sb.append(u'.'); } for(; p >= lower; p--) { sb.append(u'0' + getDigitPos(p - scale - exponent)); } return sb; } UnicodeString DecimalQuantity::toExponentString() const { U_ASSERT(!isApproximate); UnicodeString sb; if (isNegative()) { sb.append(u'-'); } int32_t upper = scale + precision - 1; int32_t lower = scale; if (upper < lReqPos - 1) { upper = lReqPos - 1; } if (lower > rReqPos) { lower = rReqPos; } int32_t p = upper; if (p < 0) { sb.append(u'0'); } for (; p >= 0; p--) { sb.append(u'0' + getDigitPos(p - scale)); } 
if (lower < 0) { sb.append(u'.'); } for(; p >= lower; p--) { sb.append(u'0' + getDigitPos(p - scale)); } if (exponent != 0) { sb.append(u'c'); ICU_Utility::appendNumber(sb, exponent); } return sb; } UnicodeString DecimalQuantity::toScientificString() const { U_ASSERT(!isApproximate); UnicodeString result; if (isNegative()) { result.append(u'-'); } if (precision == 0) { result.append(u"0E+0", -1); return result; } int32_t upperPos = precision - 1; int32_t lowerPos = 0; int32_t p = upperPos; result.append(u'0' + getDigitPos(p)); if ((--p) >= lowerPos) { result.append(u'.'); for (; p >= lowerPos; p--) { result.append(u'0' + getDigitPos(p)); } } result.append(u'E'); int32_t _scale = upperPos + scale + exponent; if (_scale == INT32_MIN) { result.append({u"-2147483648", -1}); return result; } else if (_scale < 0) { _scale *= -1; result.append(u'-'); } else { result.append(u'+'); } if (_scale == 0) { result.append(u'0'); } int32_t insertIndex = result.length(); while (_scale > 0) { std::div_t res = std::div(_scale, 10); result.insert(insertIndex, u'0' + res.rem); _scale = res.quot; } return result; } //////////////////////////////////////////////////// /// End of DecimalQuantity_AbstractBCD.java /// /// Start of DecimalQuantity_DualStorageBCD.java /// //////////////////////////////////////////////////// int8_t DecimalQuantity::getDigitPos(int32_t position) const { if (usingBytes) { if (position < 0 || position >= precision) { return 0; } return fBCD.bcdBytes.ptr[position]; } else { if (position < 0 || position >= 16) { return 0; } return (int8_t) ((fBCD.bcdLong >> (position * 4)) & 0xf); } } void DecimalQuantity::setDigitPos(int32_t position, int8_t value) { U_ASSERT(position >= 0); if (usingBytes) { ensureCapacity(position + 1); fBCD.bcdBytes.ptr[position] = value; } else if (position >= 16) { switchStorage(); ensureCapacity(position + 1); fBCD.bcdBytes.ptr[position] = value; } else { int shift = position * 4; fBCD.bcdLong = (fBCD.bcdLong & ~(0xfL << shift)) | ((long) 
value << shift); } } void DecimalQuantity::shiftLeft(int32_t numDigits) { if (!usingBytes && precision + numDigits > 16) { switchStorage(); } if (usingBytes) { ensureCapacity(precision + numDigits); uprv_memmove(fBCD.bcdBytes.ptr + numDigits, fBCD.bcdBytes.ptr, precision); uprv_memset(fBCD.bcdBytes.ptr, 0, numDigits); } else { fBCD.bcdLong <<= (numDigits * 4); } scale -= numDigits; precision += numDigits; } void DecimalQuantity::shiftRight(int32_t numDigits) { if (usingBytes) { int i = 0; for (; i < precision - numDigits; i++) { fBCD.bcdBytes.ptr[i] = fBCD.bcdBytes.ptr[i + numDigits]; } for (; i < precision; i++) { fBCD.bcdBytes.ptr[i] = 0; } } else { fBCD.bcdLong >>= (numDigits * 4); } scale += numDigits; precision -= numDigits; } void DecimalQuantity::popFromLeft(int32_t numDigits) { U_ASSERT(numDigits <= precision); if (usingBytes) { int i = precision - 1; for (; i >= precision - numDigits; i--) { fBCD.bcdBytes.ptr[i] = 0; } } else { fBCD.bcdLong &= (static_cast(1) << ((precision - numDigits) * 4)) - 1; } precision -= numDigits; } void DecimalQuantity::setBcdToZero() { if (usingBytes) { uprv_free(fBCD.bcdBytes.ptr); fBCD.bcdBytes.ptr = nullptr; usingBytes = false; } fBCD.bcdLong = 0L; scale = 0; precision = 0; isApproximate = false; origDouble = 0; origDelta = 0; exponent = 0; } void DecimalQuantity::readIntToBcd(int32_t n) { U_ASSERT(n != 0); // ints always fit inside the long implementation. 
uint64_t result = 0L; int i = 16; for (; n != 0; n /= 10, i--) { result = (result >> 4) + ((static_cast(n) % 10) << 60); } U_ASSERT(!usingBytes); fBCD.bcdLong = result >> (i * 4); scale = 0; precision = 16 - i; } void DecimalQuantity::readLongToBcd(int64_t n) { U_ASSERT(n != 0); if (n >= 10000000000000000L) { ensureCapacity(); int i = 0; for (; n != 0L; n /= 10L, i++) { fBCD.bcdBytes.ptr[i] = static_cast(n % 10); } U_ASSERT(usingBytes); scale = 0; precision = i; } else { uint64_t result = 0L; int i = 16; for (; n != 0L; n /= 10L, i--) { result = (result >> 4) + ((n % 10) << 60); } U_ASSERT(i >= 0); U_ASSERT(!usingBytes); fBCD.bcdLong = result >> (i * 4); scale = 0; precision = 16 - i; } } void DecimalQuantity::readDecNumberToBcd(const DecNum& decnum) { const decNumber* dn = decnum.getRawDecNumber(); if (dn->digits > 16) { ensureCapacity(dn->digits); for (int32_t i = 0; i < dn->digits; i++) { fBCD.bcdBytes.ptr[i] = dn->lsu[i]; } } else { uint64_t result = 0L; for (int32_t i = 0; i < dn->digits; i++) { result |= static_cast(dn->lsu[i]) << (4 * i); } fBCD.bcdLong = result; } scale = dn->exponent; precision = dn->digits; } void DecimalQuantity::readDoubleConversionToBcd( const char* buffer, int32_t length, int32_t point) { // NOTE: Despite the fact that double-conversion's API is called // "DoubleToAscii", they actually use '0' (as opposed to u8'0'). 
if (length > 16) { ensureCapacity(length); for (int32_t i = 0; i < length; i++) { fBCD.bcdBytes.ptr[i] = buffer[length-i-1] - '0'; } } else { uint64_t result = 0L; for (int32_t i = 0; i < length; i++) { result |= static_cast(buffer[length-i-1] - '0') << (4 * i); } fBCD.bcdLong = result; } scale = point - length; precision = length; } void DecimalQuantity::compact() { if (usingBytes) { int32_t delta = 0; for (; delta < precision && fBCD.bcdBytes.ptr[delta] == 0; delta++); if (delta == precision) { // Number is zero setBcdToZero(); return; } else { // Remove trailing zeros shiftRight(delta); } // Compute precision int32_t leading = precision - 1; for (; leading >= 0 && fBCD.bcdBytes.ptr[leading] == 0; leading--); precision = leading + 1; // Switch storage mechanism if possible if (precision <= 16) { switchStorage(); } } else { if (fBCD.bcdLong == 0L) { // Number is zero setBcdToZero(); return; } // Compact the number (remove trailing zeros) // TODO: Use a more efficient algorithm here and below. There is a logarithmic one. int32_t delta = 0; for (; delta < precision && getDigitPos(delta) == 0; delta++); fBCD.bcdLong >>= delta * 4; scale += delta; // Compute precision int32_t leading = precision - 1; for (; leading >= 0 && getDigitPos(leading) == 0; leading--); precision = leading + 1; } } void DecimalQuantity::ensureCapacity() { ensureCapacity(40); } void DecimalQuantity::ensureCapacity(int32_t capacity) { if (capacity == 0) { return; } int32_t oldCapacity = usingBytes ? fBCD.bcdBytes.len : 0; if (!usingBytes) { // TODO: There is nothing being done to check for memory allocation failures. // TODO: Consider indexing by nybbles instead of bytes in C++, so that we can // make these arrays half the size. 
fBCD.bcdBytes.ptr = static_cast(uprv_malloc(capacity * sizeof(int8_t))); fBCD.bcdBytes.len = capacity; // Initialize the byte array to zeros (this is done automatically in Java) uprv_memset(fBCD.bcdBytes.ptr, 0, capacity * sizeof(int8_t)); } else if (oldCapacity < capacity) { auto bcd1 = static_cast(uprv_malloc(capacity * 2 * sizeof(int8_t))); uprv_memcpy(bcd1, fBCD.bcdBytes.ptr, oldCapacity * sizeof(int8_t)); // Initialize the rest of the byte array to zeros (this is done automatically in Java) uprv_memset(bcd1 + oldCapacity, 0, (capacity - oldCapacity) * sizeof(int8_t)); uprv_free(fBCD.bcdBytes.ptr); fBCD.bcdBytes.ptr = bcd1; fBCD.bcdBytes.len = capacity * 2; } usingBytes = true; } void DecimalQuantity::switchStorage() { if (usingBytes) { // Change from bytes to long uint64_t bcdLong = 0L; for (int i = precision - 1; i >= 0; i--) { bcdLong <<= 4; bcdLong |= fBCD.bcdBytes.ptr[i]; } uprv_free(fBCD.bcdBytes.ptr); fBCD.bcdBytes.ptr = nullptr; fBCD.bcdLong = bcdLong; usingBytes = false; } else { // Change from long to bytes // Copy the long into a local variable since it will get munged when we allocate the bytes uint64_t bcdLong = fBCD.bcdLong; ensureCapacity(); for (int i = 0; i < precision; i++) { fBCD.bcdBytes.ptr[i] = static_cast(bcdLong & 0xf); bcdLong >>= 4; } U_ASSERT(usingBytes); } } void DecimalQuantity::copyBcdFrom(const DecimalQuantity &other) { setBcdToZero(); if (other.usingBytes) { ensureCapacity(other.precision); uprv_memcpy(fBCD.bcdBytes.ptr, other.fBCD.bcdBytes.ptr, other.precision * sizeof(int8_t)); } else { fBCD.bcdLong = other.fBCD.bcdLong; } } void DecimalQuantity::moveBcdFrom(DecimalQuantity &other) { setBcdToZero(); if (other.usingBytes) { usingBytes = true; fBCD.bcdBytes.ptr = other.fBCD.bcdBytes.ptr; fBCD.bcdBytes.len = other.fBCD.bcdBytes.len; // Take ownership away from the old instance: other.fBCD.bcdBytes.ptr = nullptr; other.usingBytes = false; } else { fBCD.bcdLong = other.fBCD.bcdLong; } } const char16_t* DecimalQuantity::checkHealth() 
const { if (usingBytes) { if (precision == 0) { return u"Zero precision but we are in byte mode"; } int32_t capacity = fBCD.bcdBytes.len; if (precision > capacity) { return u"Precision exceeds length of byte array"; } if (getDigitPos(precision - 1) == 0) { return u"Most significant digit is zero in byte mode"; } if (getDigitPos(0) == 0) { return u"Least significant digit is zero in long mode"; } for (int i = 0; i < precision; i++) { if (getDigitPos(i) >= 10) { return u"Digit exceeding 10 in byte array"; } if (getDigitPos(i) < 0) { return u"Digit below 0 in byte array"; } } for (int i = precision; i < capacity; i++) { if (getDigitPos(i) != 0) { return u"Nonzero digits outside of range in byte array"; } } } else { if (precision == 0 && fBCD.bcdLong != 0) { return u"Value in bcdLong even though precision is zero"; } if (precision > 16) { return u"Precision exceeds length of long"; } if (precision != 0 && getDigitPos(precision - 1) == 0) { return u"Most significant digit is zero in long mode"; } if (precision != 0 && getDigitPos(0) == 0) { return u"Least significant digit is zero in long mode"; } for (int i = 0; i < precision; i++) { if (getDigitPos(i) >= 10) { return u"Digit exceeding 10 in long"; } if (getDigitPos(i) < 0) { return u"Digit below 0 in long (?!)"; } } for (int i = precision; i < 16; i++) { if (getDigitPos(i) != 0) { return u"Nonzero digits outside of range in long"; } } } // No error return nullptr; } bool DecimalQuantity::operator==(const DecimalQuantity& other) const { bool basicEquals = scale == other.scale && precision == other.precision && flags == other.flags && lReqPos == other.lReqPos && rReqPos == other.rReqPos && isApproximate == other.isApproximate; if (!basicEquals) { return false; } if (precision == 0) { return true; } else if (isApproximate) { return origDouble == other.origDouble && origDelta == other.origDelta; } else { for (int m = getUpperDisplayMagnitude(); m >= getLowerDisplayMagnitude(); m--) { if (getDigit(m) != other.getDigit(m)) 
{ return false; } } return true; } } UnicodeString DecimalQuantity::toString() const { UErrorCode localStatus = U_ZERO_ERROR; MaybeStackArray digits(precision + 1, localStatus); if (U_FAILURE(localStatus)) { return ICU_Utility::makeBogusString(); } for (int32_t i = 0; i < precision; i++) { digits[i] = getDigitPos(precision - i - 1) + '0'; } digits[precision] = 0; // terminate buffer char buffer8[100]; snprintf( buffer8, sizeof(buffer8), "", lReqPos, rReqPos, (usingBytes ? "bytes" : "long"), (isNegative() ? "-" : ""), (precision == 0 ? "0" : digits.getAlias()), "E", scale); return UnicodeString(buffer8, -1, US_INV); } #endif /* #if !UCONFIG_NO_FORMATTING */ stringi/src/icu74/i18n/decimfmt.cpp0000644000176200001440000021236014700200761016527 0ustar liggesusers// © 2018 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html #include "unicode/utypes.h" #if !UCONFIG_NO_FORMATTING // Allow implicit conversion from char16_t* to UnicodeString for this file: // Helpful in toString methods and elsewhere. #define UNISTR_FROM_STRING_EXPLICIT #include #include #include #include "unicode/errorcode.h" #include "unicode/decimfmt.h" #include "number_decimalquantity.h" #include "number_types.h" #include "numparse_impl.h" #include "number_mapper.h" #include "number_patternstring.h" #include "putilimp.h" #include "number_utils.h" #include "number_utypes.h" using namespace icu; using namespace icu::number; using namespace icu::number::impl; using namespace icu::numparse; using namespace icu::numparse::impl; using ERoundingMode = icu::DecimalFormat::ERoundingMode; using EPadPosition = icu::DecimalFormat::EPadPosition; // MSVC VS2015 warns C4805 when comparing bool with UBool, VS2017 no longer emits this warning. // TODO: Move this macro into a better place? 
#if U_PF_WINDOWS <= U_PLATFORM && U_PLATFORM <= U_PF_CYGWIN #define UBOOL_TO_BOOL(b) static_cast(b) #else #define UBOOL_TO_BOOL(b) b #endif UOBJECT_DEFINE_RTTI_IMPLEMENTATION(DecimalFormat) DecimalFormat::DecimalFormat(UErrorCode& status) : DecimalFormat(nullptr, status) { if (U_FAILURE(status)) { return; } // Use the default locale and decimal pattern. const char* localeName = Locale::getDefault().getName(); LocalPointer ns(NumberingSystem::createInstance(status)); UnicodeString patternString = utils::getPatternForStyle( localeName, ns->getName(), CLDR_PATTERN_STYLE_DECIMAL, status); setPropertiesFromPattern(patternString, IGNORE_ROUNDING_IF_CURRENCY, status); touch(status); } DecimalFormat::DecimalFormat(const UnicodeString& pattern, UErrorCode& status) : DecimalFormat(nullptr, status) { if (U_FAILURE(status)) { return; } setPropertiesFromPattern(pattern, IGNORE_ROUNDING_IF_CURRENCY, status); touch(status); } DecimalFormat::DecimalFormat(const UnicodeString& pattern, DecimalFormatSymbols* symbolsToAdopt, UErrorCode& status) : DecimalFormat(symbolsToAdopt, status) { if (U_FAILURE(status)) { return; } setPropertiesFromPattern(pattern, IGNORE_ROUNDING_IF_CURRENCY, status); touch(status); } DecimalFormat::DecimalFormat(const UnicodeString& pattern, DecimalFormatSymbols* symbolsToAdopt, UNumberFormatStyle style, UErrorCode& status) : DecimalFormat(symbolsToAdopt, status) { if (U_FAILURE(status)) { return; } // If choice is a currency type, ignore the rounding information. 
if (style == UNumberFormatStyle::UNUM_CURRENCY || style == UNumberFormatStyle::UNUM_CURRENCY_ISO || style == UNumberFormatStyle::UNUM_CURRENCY_ACCOUNTING || style == UNumberFormatStyle::UNUM_CASH_CURRENCY || style == UNumberFormatStyle::UNUM_CURRENCY_STANDARD || style == UNumberFormatStyle::UNUM_CURRENCY_PLURAL) { setPropertiesFromPattern(pattern, IGNORE_ROUNDING_ALWAYS, status); } else { setPropertiesFromPattern(pattern, IGNORE_ROUNDING_IF_CURRENCY, status); } // Note: in Java, CurrencyPluralInfo is set in NumberFormat.java, but in C++, it is not set there, // so we have to set it here. if (style == UNumberFormatStyle::UNUM_CURRENCY_PLURAL) { LocalPointer cpi( new CurrencyPluralInfo(fields->symbols->getLocale(), status), status); if (U_FAILURE(status)) { return; } fields->properties.currencyPluralInfo.fPtr.adoptInstead(cpi.orphan()); } touch(status); } DecimalFormat::DecimalFormat(const DecimalFormatSymbols* symbolsToAdopt, UErrorCode& status) { // we must take ownership of symbolsToAdopt, even in a failure case. 
LocalPointer adoptedSymbols(symbolsToAdopt); if (U_FAILURE(status)) { return; } fields = new DecimalFormatFields(); if (fields == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; return; } if (adoptedSymbols.isNull()) { fields->symbols.adoptInsteadAndCheckErrorCode(new DecimalFormatSymbols(status), status); } else { fields->symbols.adoptInsteadAndCheckErrorCode(adoptedSymbols.orphan(), status); } if (U_FAILURE(status)) { delete fields; fields = nullptr; } } #if UCONFIG_HAVE_PARSEALLINPUT void DecimalFormat::setParseAllInput(UNumberFormatAttributeValue value) { if (fields == nullptr) { return; } if (value == fields->properties.parseAllInput) { return; } fields->properties.parseAllInput = value; } #endif DecimalFormat& DecimalFormat::setAttribute(UNumberFormatAttribute attr, int32_t newValue, UErrorCode& status) { if (U_FAILURE(status)) { return *this; } if (fields == nullptr) { // We only get here if an OOM error happened during construction, copy construction, assignment, or modification. 
status = U_MEMORY_ALLOCATION_ERROR; return *this; } switch (attr) { case UNUM_LENIENT_PARSE: setLenient(newValue != 0); break; case UNUM_PARSE_INT_ONLY: setParseIntegerOnly(newValue != 0); break; case UNUM_GROUPING_USED: setGroupingUsed(newValue != 0); break; case UNUM_DECIMAL_ALWAYS_SHOWN: setDecimalSeparatorAlwaysShown(newValue != 0); break; case UNUM_MAX_INTEGER_DIGITS: setMaximumIntegerDigits(newValue); break; case UNUM_MIN_INTEGER_DIGITS: setMinimumIntegerDigits(newValue); break; case UNUM_INTEGER_DIGITS: setMinimumIntegerDigits(newValue); setMaximumIntegerDigits(newValue); break; case UNUM_MAX_FRACTION_DIGITS: setMaximumFractionDigits(newValue); break; case UNUM_MIN_FRACTION_DIGITS: setMinimumFractionDigits(newValue); break; case UNUM_FRACTION_DIGITS: setMinimumFractionDigits(newValue); setMaximumFractionDigits(newValue); break; case UNUM_SIGNIFICANT_DIGITS_USED: setSignificantDigitsUsed(newValue != 0); break; case UNUM_MAX_SIGNIFICANT_DIGITS: setMaximumSignificantDigits(newValue); break; case UNUM_MIN_SIGNIFICANT_DIGITS: setMinimumSignificantDigits(newValue); break; case UNUM_MULTIPLIER: setMultiplier(newValue); break; case UNUM_SCALE: setMultiplierScale(newValue); break; case UNUM_GROUPING_SIZE: setGroupingSize(newValue); break; case UNUM_ROUNDING_MODE: setRoundingMode((DecimalFormat::ERoundingMode) newValue); break; case UNUM_FORMAT_WIDTH: setFormatWidth(newValue); break; case UNUM_PADDING_POSITION: /** The position at which padding will take place. 
*/ setPadPosition((DecimalFormat::EPadPosition) newValue); break; case UNUM_SECONDARY_GROUPING_SIZE: setSecondaryGroupingSize(newValue); break; #if UCONFIG_HAVE_PARSEALLINPUT case UNUM_PARSE_ALL_INPUT: setParseAllInput((UNumberFormatAttributeValue) newValue); break; #endif case UNUM_PARSE_NO_EXPONENT: setParseNoExponent((UBool) newValue); break; case UNUM_PARSE_DECIMAL_MARK_REQUIRED: setDecimalPatternMatchRequired((UBool) newValue); break; case UNUM_CURRENCY_USAGE: setCurrencyUsage((UCurrencyUsage) newValue, &status); break; case UNUM_MINIMUM_GROUPING_DIGITS: setMinimumGroupingDigits(newValue); break; case UNUM_PARSE_CASE_SENSITIVE: setParseCaseSensitive(static_cast(newValue)); break; case UNUM_SIGN_ALWAYS_SHOWN: setSignAlwaysShown(static_cast(newValue)); break; case UNUM_FORMAT_FAIL_IF_MORE_THAN_MAX_DIGITS: setFormatFailIfMoreThanMaxDigits(static_cast(newValue)); break; default: status = U_UNSUPPORTED_ERROR; break; } return *this; } int32_t DecimalFormat::getAttribute(UNumberFormatAttribute attr, UErrorCode& status) const { if (U_FAILURE(status)) { return -1; } if (fields == nullptr) { // We only get here if an OOM error happened during construction, copy construction, assignment, or modification. status = U_MEMORY_ALLOCATION_ERROR; return -1; } switch (attr) { case UNUM_LENIENT_PARSE: return isLenient(); case UNUM_PARSE_INT_ONLY: return isParseIntegerOnly(); case UNUM_GROUPING_USED: return isGroupingUsed(); case UNUM_DECIMAL_ALWAYS_SHOWN: return isDecimalSeparatorAlwaysShown(); case UNUM_MAX_INTEGER_DIGITS: return getMaximumIntegerDigits(); case UNUM_MIN_INTEGER_DIGITS: return getMinimumIntegerDigits(); case UNUM_INTEGER_DIGITS: // TBD: what should this return? return getMinimumIntegerDigits(); case UNUM_MAX_FRACTION_DIGITS: return getMaximumFractionDigits(); case UNUM_MIN_FRACTION_DIGITS: return getMinimumFractionDigits(); case UNUM_FRACTION_DIGITS: // TBD: what should this return? 
return getMinimumFractionDigits(); case UNUM_SIGNIFICANT_DIGITS_USED: return areSignificantDigitsUsed(); case UNUM_MAX_SIGNIFICANT_DIGITS: return getMaximumSignificantDigits(); case UNUM_MIN_SIGNIFICANT_DIGITS: return getMinimumSignificantDigits(); case UNUM_MULTIPLIER: return getMultiplier(); case UNUM_SCALE: return getMultiplierScale(); case UNUM_GROUPING_SIZE: return getGroupingSize(); case UNUM_ROUNDING_MODE: return getRoundingMode(); case UNUM_FORMAT_WIDTH: return getFormatWidth(); case UNUM_PADDING_POSITION: return getPadPosition(); case UNUM_SECONDARY_GROUPING_SIZE: return getSecondaryGroupingSize(); case UNUM_PARSE_NO_EXPONENT: return isParseNoExponent(); case UNUM_PARSE_DECIMAL_MARK_REQUIRED: return isDecimalPatternMatchRequired(); case UNUM_CURRENCY_USAGE: return getCurrencyUsage(); case UNUM_MINIMUM_GROUPING_DIGITS: return getMinimumGroupingDigits(); case UNUM_PARSE_CASE_SENSITIVE: return isParseCaseSensitive(); case UNUM_SIGN_ALWAYS_SHOWN: return isSignAlwaysShown(); case UNUM_FORMAT_FAIL_IF_MORE_THAN_MAX_DIGITS: return isFormatFailIfMoreThanMaxDigits(); default: status = U_UNSUPPORTED_ERROR; break; } return -1; /* undefined */ } void DecimalFormat::setGroupingUsed(UBool enabled) { if (fields == nullptr) { return; } if (UBOOL_TO_BOOL(enabled) == fields->properties.groupingUsed) { return; } NumberFormat::setGroupingUsed(enabled); // to set field for compatibility fields->properties.groupingUsed = enabled; touchNoError(); } void DecimalFormat::setParseIntegerOnly(UBool value) { if (fields == nullptr) { return; } if (UBOOL_TO_BOOL(value) == fields->properties.parseIntegerOnly) { return; } NumberFormat::setParseIntegerOnly(value); // to set field for compatibility fields->properties.parseIntegerOnly = value; touchNoError(); } void DecimalFormat::setLenient(UBool enable) { if (fields == nullptr) { return; } ParseMode mode = enable ? 
PARSE_MODE_LENIENT : PARSE_MODE_STRICT; if (!fields->properties.parseMode.isNull() && mode == fields->properties.parseMode.getNoError()) { return; } NumberFormat::setLenient(enable); // to set field for compatibility fields->properties.parseMode = mode; touchNoError(); } DecimalFormat::DecimalFormat(const UnicodeString& pattern, DecimalFormatSymbols* symbolsToAdopt, UParseError&, UErrorCode& status) : DecimalFormat(symbolsToAdopt, status) { if (U_FAILURE(status)) { return; } // TODO: What is parseError for? setPropertiesFromPattern(pattern, IGNORE_ROUNDING_IF_CURRENCY, status); touch(status); } DecimalFormat::DecimalFormat(const UnicodeString& pattern, const DecimalFormatSymbols& symbols, UErrorCode& status) : DecimalFormat(nullptr, status) { if (U_FAILURE(status)) { return; } LocalPointer dfs(new DecimalFormatSymbols(symbols), status); if (U_FAILURE(status)) { // If we failed to allocate DecimalFormatSymbols, then release fields and its members. // We must have a fully complete fields object, we cannot have partially populated members. delete fields; fields = nullptr; status = U_MEMORY_ALLOCATION_ERROR; return; } fields->symbols.adoptInstead(dfs.orphan()); setPropertiesFromPattern(pattern, IGNORE_ROUNDING_IF_CURRENCY, status); touch(status); } DecimalFormat::DecimalFormat(const DecimalFormat& source) : NumberFormat(source) { // If the object that we are copying from is invalid, no point in going further. if (source.fields == nullptr) { return; } // Note: it is not safe to copy fields->formatter or fWarehouse directly because fields->formatter might have // dangling pointers to fields inside fWarehouse. The safe thing is to re-construct fields->formatter from // the property bag, despite being somewhat slower. fields = new DecimalFormatFields(source.fields->properties); if (fields == nullptr) { return; // no way to report an error. 
} UErrorCode status = U_ZERO_ERROR; fields->symbols.adoptInsteadAndCheckErrorCode(new DecimalFormatSymbols(*source.getDecimalFormatSymbols()), status); // In order to simplify error handling logic in the various getters/setters/etc, we do not allow // any partially populated DecimalFormatFields object. We must have a fully complete fields object // or else we set it to nullptr. if (U_FAILURE(status)) { delete fields; fields = nullptr; return; } touch(status); } DecimalFormat& DecimalFormat::operator=(const DecimalFormat& rhs) { // guard against self-assignment if (this == &rhs) { return *this; } // Make sure both objects are valid. if (fields == nullptr || rhs.fields == nullptr) { return *this; // unfortunately, no way to report an error. } fields->properties = rhs.fields->properties; fields->exportedProperties.clear(); UErrorCode status = U_ZERO_ERROR; LocalPointer dfs(new DecimalFormatSymbols(*rhs.getDecimalFormatSymbols()), status); if (U_FAILURE(status)) { // We failed to allocate DecimalFormatSymbols, release fields and its members. // We must have a fully complete fields object, we cannot have partially populated members. delete fields; fields = nullptr; return *this; } fields->symbols.adoptInstead(dfs.orphan()); touch(status); return *this; } DecimalFormat::~DecimalFormat() { if (fields == nullptr) { return; } delete fields->atomicParser.exchange(nullptr); delete fields->atomicCurrencyParser.exchange(nullptr); delete fields; } DecimalFormat* DecimalFormat::clone() const { // can only clone valid objects. if (fields == nullptr) { return nullptr; } LocalPointer df(new DecimalFormat(*this)); if (df.isValid() && df->fields != nullptr) { return df.orphan(); } return nullptr; } bool DecimalFormat::operator==(const Format& other) const { auto* otherDF = dynamic_cast(&other); if (otherDF == nullptr) { return false; } // If either object is in an invalid state, prevent dereferencing nullptr below. 
// Additionally, invalid objects should not be considered equal to anything. if (fields == nullptr || otherDF->fields == nullptr) { return false; } return fields->properties == otherDF->fields->properties && *getDecimalFormatSymbols() == *otherDF->getDecimalFormatSymbols(); } UnicodeString& DecimalFormat::format(double number, UnicodeString& appendTo, FieldPosition& pos) const { if (fields == nullptr) { appendTo.setToBogus(); return appendTo; } if (pos.getField() == FieldPosition::DONT_CARE && fastFormatDouble(number, appendTo)) { return appendTo; } UErrorCode localStatus = U_ZERO_ERROR; UFormattedNumberData output; output.quantity.setToDouble(number); fields->formatter.formatImpl(&output, localStatus); fieldPositionHelper(output, pos, appendTo.length(), localStatus); auto appendable = UnicodeStringAppendable(appendTo); output.appendTo(appendable, localStatus); return appendTo; } UnicodeString& DecimalFormat::format(double number, UnicodeString& appendTo, FieldPosition& pos, UErrorCode& status) const { if (U_FAILURE(status)) { return appendTo; // don't overwrite status if it's already a failure. } if (fields == nullptr) { // We only get here if an OOM error happened during construction, copy construction, assignment, or modification. status = U_MEMORY_ALLOCATION_ERROR; appendTo.setToBogus(); return appendTo; } if (pos.getField() == FieldPosition::DONT_CARE && fastFormatDouble(number, appendTo)) { return appendTo; } UFormattedNumberData output; output.quantity.setToDouble(number); fields->formatter.formatImpl(&output, status); fieldPositionHelper(output, pos, appendTo.length(), status); auto appendable = UnicodeStringAppendable(appendTo); output.appendTo(appendable, status); return appendTo; } UnicodeString& DecimalFormat::format(double number, UnicodeString& appendTo, FieldPositionIterator* posIter, UErrorCode& status) const { if (U_FAILURE(status)) { return appendTo; // don't overwrite status if it's already a failure. 
} if (fields == nullptr) { // We only get here if an OOM error happened during construction, copy construction, assignment, or modification. status = U_MEMORY_ALLOCATION_ERROR; appendTo.setToBogus(); return appendTo; } if (posIter == nullptr && fastFormatDouble(number, appendTo)) { return appendTo; } UFormattedNumberData output; output.quantity.setToDouble(number); fields->formatter.formatImpl(&output, status); fieldPositionIteratorHelper(output, posIter, appendTo.length(), status); auto appendable = UnicodeStringAppendable(appendTo); output.appendTo(appendable, status); return appendTo; } UnicodeString& DecimalFormat::format(int32_t number, UnicodeString& appendTo, FieldPosition& pos) const { return format(static_cast (number), appendTo, pos); } UnicodeString& DecimalFormat::format(int32_t number, UnicodeString& appendTo, FieldPosition& pos, UErrorCode& status) const { return format(static_cast (number), appendTo, pos, status); } UnicodeString& DecimalFormat::format(int32_t number, UnicodeString& appendTo, FieldPositionIterator* posIter, UErrorCode& status) const { return format(static_cast (number), appendTo, posIter, status); } UnicodeString& DecimalFormat::format(int64_t number, UnicodeString& appendTo, FieldPosition& pos) const { if (fields == nullptr) { appendTo.setToBogus(); return appendTo; } if (pos.getField() == FieldPosition::DONT_CARE && fastFormatInt64(number, appendTo)) { return appendTo; } UErrorCode localStatus = U_ZERO_ERROR; UFormattedNumberData output; output.quantity.setToLong(number); fields->formatter.formatImpl(&output, localStatus); fieldPositionHelper(output, pos, appendTo.length(), localStatus); auto appendable = UnicodeStringAppendable(appendTo); output.appendTo(appendable, localStatus); return appendTo; } UnicodeString& DecimalFormat::format(int64_t number, UnicodeString& appendTo, FieldPosition& pos, UErrorCode& status) const { if (U_FAILURE(status)) { return appendTo; // don't overwrite status if it's already a failure. 
} if (fields == nullptr) { // We only get here if an OOM error happened during construction, copy construction, assignment, or modification. status = U_MEMORY_ALLOCATION_ERROR; appendTo.setToBogus(); return appendTo; } if (pos.getField() == FieldPosition::DONT_CARE && fastFormatInt64(number, appendTo)) { return appendTo; } UFormattedNumberData output; output.quantity.setToLong(number); fields->formatter.formatImpl(&output, status); fieldPositionHelper(output, pos, appendTo.length(), status); auto appendable = UnicodeStringAppendable(appendTo); output.appendTo(appendable, status); return appendTo; } UnicodeString& DecimalFormat::format(int64_t number, UnicodeString& appendTo, FieldPositionIterator* posIter, UErrorCode& status) const { if (U_FAILURE(status)) { return appendTo; // don't overwrite status if it's already a failure. } if (fields == nullptr) { // We only get here if an OOM error happened during construction, copy construction, assignment, or modification. status = U_MEMORY_ALLOCATION_ERROR; appendTo.setToBogus(); return appendTo; } if (posIter == nullptr && fastFormatInt64(number, appendTo)) { return appendTo; } UFormattedNumberData output; output.quantity.setToLong(number); fields->formatter.formatImpl(&output, status); fieldPositionIteratorHelper(output, posIter, appendTo.length(), status); auto appendable = UnicodeStringAppendable(appendTo); output.appendTo(appendable, status); return appendTo; } UnicodeString& DecimalFormat::format(StringPiece number, UnicodeString& appendTo, FieldPositionIterator* posIter, UErrorCode& status) const { if (U_FAILURE(status)) { return appendTo; // don't overwrite status if it's already a failure. } if (fields == nullptr) { // We only get here if an OOM error happened during construction, copy construction, assignment, or modification. 
status = U_MEMORY_ALLOCATION_ERROR; appendTo.setToBogus(); return appendTo; } UFormattedNumberData output; output.quantity.setToDecNumber(number, status); fields->formatter.formatImpl(&output, status); fieldPositionIteratorHelper(output, posIter, appendTo.length(), status); auto appendable = UnicodeStringAppendable(appendTo); output.appendTo(appendable, status); return appendTo; } UnicodeString& DecimalFormat::format(const DecimalQuantity& number, UnicodeString& appendTo, FieldPositionIterator* posIter, UErrorCode& status) const { if (U_FAILURE(status)) { return appendTo; // don't overwrite status if it's already a failure. } if (fields == nullptr) { // We only get here if an OOM error happened during construction, copy construction, assignment, or modification. status = U_MEMORY_ALLOCATION_ERROR; appendTo.setToBogus(); return appendTo; } UFormattedNumberData output; output.quantity = number; fields->formatter.formatImpl(&output, status); fieldPositionIteratorHelper(output, posIter, appendTo.length(), status); auto appendable = UnicodeStringAppendable(appendTo); output.appendTo(appendable, status); return appendTo; } UnicodeString& DecimalFormat::format(const DecimalQuantity& number, UnicodeString& appendTo, FieldPosition& pos, UErrorCode& status) const { if (U_FAILURE(status)) { return appendTo; // don't overwrite status if it's already a failure. } if (fields == nullptr) { // We only get here if an OOM error happened during construction, copy construction, assignment, or modification. 
status = U_MEMORY_ALLOCATION_ERROR; appendTo.setToBogus(); return appendTo; } UFormattedNumberData output; output.quantity = number; fields->formatter.formatImpl(&output, status); fieldPositionHelper(output, pos, appendTo.length(), status); auto appendable = UnicodeStringAppendable(appendTo); output.appendTo(appendable, status); return appendTo; } void DecimalFormat::parse(const UnicodeString& text, Formattable& output, ParsePosition& parsePosition) const { if (fields == nullptr) { return; } if (parsePosition.getIndex() < 0 || parsePosition.getIndex() >= text.length()) { if (parsePosition.getIndex() == text.length()) { // If there is nothing to parse, it is an error parsePosition.setErrorIndex(parsePosition.getIndex()); } return; } ErrorCode status; ParsedNumber result; // Note: if this is a currency instance, currencies will be matched despite the fact that we are not in the // parseCurrency method (backwards compatibility) int32_t startIndex = parsePosition.getIndex(); const NumberParserImpl* parser = getParser(status); if (U_FAILURE(status)) { return; // unfortunately no way to report back the error. } parser->parse(text, startIndex, true, result, status); if (U_FAILURE(status)) { return; // unfortunately no way to report back the error. } // TODO: Do we need to check for fImpl->properties->parseAllInput (UCONFIG_HAVE_PARSEALLINPUT) here? 
if (result.success()) { parsePosition.setIndex(result.charEnd); result.populateFormattable(output, parser->getParseFlags()); } else { parsePosition.setErrorIndex(startIndex + result.charEnd); } } CurrencyAmount* DecimalFormat::parseCurrency(const UnicodeString& text, ParsePosition& parsePosition) const { if (fields == nullptr) { return nullptr; } if (parsePosition.getIndex() < 0 || parsePosition.getIndex() >= text.length()) { return nullptr; } ErrorCode status; ParsedNumber result; // Note: if this is a currency instance, currencies will be matched despite the fact that we are not in the // parseCurrency method (backwards compatibility) int32_t startIndex = parsePosition.getIndex(); const NumberParserImpl* parser = getCurrencyParser(status); if (U_FAILURE(status)) { return nullptr; } parser->parse(text, startIndex, true, result, status); if (U_FAILURE(status)) { return nullptr; } // TODO: Do we need to check for fImpl->properties->parseAllInput (UCONFIG_HAVE_PARSEALLINPUT) here? if (result.success()) { parsePosition.setIndex(result.charEnd); Formattable formattable; result.populateFormattable(formattable, parser->getParseFlags()); LocalPointer currencyAmount( new CurrencyAmount(formattable, result.currencyCode, status), status); if (U_FAILURE(status)) { return nullptr; } return currencyAmount.orphan(); } else { parsePosition.setErrorIndex(startIndex + result.charEnd); return nullptr; } } const DecimalFormatSymbols* DecimalFormat::getDecimalFormatSymbols() const { if (fields == nullptr) { return nullptr; } if (!fields->symbols.isNull()) { return fields->symbols.getAlias(); } else { return fields->formatter.getDecimalFormatSymbols(); } } void DecimalFormat::adoptDecimalFormatSymbols(DecimalFormatSymbols* symbolsToAdopt) { if (symbolsToAdopt == nullptr) { return; // do not allow caller to set fields->symbols to nullptr } // we must take ownership of symbolsToAdopt, even in a failure case. 
LocalPointer dfs(symbolsToAdopt); if (fields == nullptr) { return; } fields->symbols.adoptInstead(dfs.orphan()); touchNoError(); } void DecimalFormat::setDecimalFormatSymbols(const DecimalFormatSymbols& symbols) { if (fields == nullptr) { return; } UErrorCode status = U_ZERO_ERROR; LocalPointer dfs(new DecimalFormatSymbols(symbols), status); if (U_FAILURE(status)) { // We failed to allocate DecimalFormatSymbols, release fields and its members. // We must have a fully complete fields object, we cannot have partially populated members. delete fields; fields = nullptr; return; } fields->symbols.adoptInstead(dfs.orphan()); touchNoError(); } const CurrencyPluralInfo* DecimalFormat::getCurrencyPluralInfo() const { if (fields == nullptr) { return nullptr; } return fields->properties.currencyPluralInfo.fPtr.getAlias(); } void DecimalFormat::adoptCurrencyPluralInfo(CurrencyPluralInfo* toAdopt) { // TODO: should we guard against nullptr input, like in adoptDecimalFormatSymbols? // we must take ownership of toAdopt, even in a failure case. LocalPointer cpi(toAdopt); if (fields == nullptr) { return; } fields->properties.currencyPluralInfo.fPtr.adoptInstead(cpi.orphan()); touchNoError(); } void DecimalFormat::setCurrencyPluralInfo(const CurrencyPluralInfo& info) { if (fields == nullptr) { return; } if (fields->properties.currencyPluralInfo.fPtr.isNull()) { // Note: clone() can fail with OOM error, but we have no way to report it. 
:( fields->properties.currencyPluralInfo.fPtr.adoptInstead(info.clone()); } else { *fields->properties.currencyPluralInfo.fPtr = info; // copy-assignment operator } touchNoError(); } UnicodeString& DecimalFormat::getPositivePrefix(UnicodeString& result) const { if (fields == nullptr) { result.setToBogus(); return result; } UErrorCode status = U_ZERO_ERROR; fields->formatter.getAffixImpl(true, false, result, status); if (U_FAILURE(status)) { result.setToBogus(); } return result; } void DecimalFormat::setPositivePrefix(const UnicodeString& newValue) { if (fields == nullptr) { return; } if (newValue == fields->properties.positivePrefix) { return; } fields->properties.positivePrefix = newValue; touchNoError(); } UnicodeString& DecimalFormat::getNegativePrefix(UnicodeString& result) const { if (fields == nullptr) { result.setToBogus(); return result; } UErrorCode status = U_ZERO_ERROR; fields->formatter.getAffixImpl(true, true, result, status); if (U_FAILURE(status)) { result.setToBogus(); } return result; } void DecimalFormat::setNegativePrefix(const UnicodeString& newValue) { if (fields == nullptr) { return; } if (newValue == fields->properties.negativePrefix) { return; } fields->properties.negativePrefix = newValue; touchNoError(); } UnicodeString& DecimalFormat::getPositiveSuffix(UnicodeString& result) const { if (fields == nullptr) { result.setToBogus(); return result; } UErrorCode status = U_ZERO_ERROR; fields->formatter.getAffixImpl(false, false, result, status); if (U_FAILURE(status)) { result.setToBogus(); } return result; } void DecimalFormat::setPositiveSuffix(const UnicodeString& newValue) { if (fields == nullptr) { return; } if (newValue == fields->properties.positiveSuffix) { return; } fields->properties.positiveSuffix = newValue; touchNoError(); } UnicodeString& DecimalFormat::getNegativeSuffix(UnicodeString& result) const { if (fields == nullptr) { result.setToBogus(); return result; } UErrorCode status = U_ZERO_ERROR; 
fields->formatter.getAffixImpl(false, true, result, status); if (U_FAILURE(status)) { result.setToBogus(); } return result; } void DecimalFormat::setNegativeSuffix(const UnicodeString& newValue) { if (fields == nullptr) { return; } if (newValue == fields->properties.negativeSuffix) { return; } fields->properties.negativeSuffix = newValue; touchNoError(); } UBool DecimalFormat::isSignAlwaysShown() const { // Not much we can do to report an error. if (fields == nullptr) { return DecimalFormatProperties::getDefault().signAlwaysShown; } return fields->properties.signAlwaysShown; } void DecimalFormat::setSignAlwaysShown(UBool value) { if (fields == nullptr) { return; } if (UBOOL_TO_BOOL(value) == fields->properties.signAlwaysShown) { return; } fields->properties.signAlwaysShown = value; touchNoError(); } int32_t DecimalFormat::getMultiplier() const { const DecimalFormatProperties *dfp; // Not much we can do to report an error. if (fields == nullptr) { // Fallback to using the default instance of DecimalFormatProperties. dfp = &(DecimalFormatProperties::getDefault()); } else { dfp = &fields->properties; } if (dfp->multiplier != 1) { return dfp->multiplier; } else if (dfp->magnitudeMultiplier != 0) { return static_cast(uprv_pow10(dfp->magnitudeMultiplier)); } else { return 1; } } void DecimalFormat::setMultiplier(int32_t multiplier) { if (fields == nullptr) { return; } if (multiplier == 0) { multiplier = 1; // one being the benign default value for a multiplier. } // Try to convert to a magnitude multiplier first int delta = 0; int value = multiplier; while (value != 1) { delta++; int temp = value / 10; if (temp * 10 != value) { delta = -1; break; } value = temp; } if (delta != -1) { fields->properties.magnitudeMultiplier = delta; fields->properties.multiplier = 1; } else { fields->properties.magnitudeMultiplier = 0; fields->properties.multiplier = multiplier; } touchNoError(); } int32_t DecimalFormat::getMultiplierScale() const { // Not much we can do to report an error. 
if (fields == nullptr) { // Fallback to using the default instance of DecimalFormatProperties. return DecimalFormatProperties::getDefault().multiplierScale; } return fields->properties.multiplierScale; } void DecimalFormat::setMultiplierScale(int32_t newValue) { if (fields == nullptr) { return; } if (newValue == fields->properties.multiplierScale) { return; } fields->properties.multiplierScale = newValue; touchNoError(); } double DecimalFormat::getRoundingIncrement() const { // Not much we can do to report an error. if (fields == nullptr) { // Fallback to using the default instance of DecimalFormatProperties. return DecimalFormatProperties::getDefault().roundingIncrement; } return fields->exportedProperties.roundingIncrement; } void DecimalFormat::setRoundingIncrement(double newValue) { if (fields == nullptr) { return; } if (newValue == fields->properties.roundingIncrement) { return; } fields->properties.roundingIncrement = newValue; touchNoError(); } ERoundingMode DecimalFormat::getRoundingMode() const { // Not much we can do to report an error. if (fields == nullptr) { // Fallback to using the default instance of DecimalFormatProperties. return static_cast(DecimalFormatProperties::getDefault().roundingMode.getNoError()); } // UNumberFormatRoundingMode and ERoundingMode have the same values. return static_cast(fields->exportedProperties.roundingMode.getNoError()); } void DecimalFormat::setRoundingMode(ERoundingMode roundingMode) UPRV_NO_SANITIZE_UNDEFINED { if (fields == nullptr) { return; } auto uRoundingMode = static_cast(roundingMode); if (!fields->properties.roundingMode.isNull() && uRoundingMode == fields->properties.roundingMode.getNoError()) { return; } NumberFormat::setMaximumIntegerDigits(roundingMode); // to set field for compatibility fields->properties.roundingMode = uRoundingMode; touchNoError(); } int32_t DecimalFormat::getFormatWidth() const { // Not much we can do to report an error. 
if (fields == nullptr) { // Fallback to using the default instance of DecimalFormatProperties. return DecimalFormatProperties::getDefault().formatWidth; } return fields->properties.formatWidth; } void DecimalFormat::setFormatWidth(int32_t width) { if (fields == nullptr) { return; } if (width == fields->properties.formatWidth) { return; } fields->properties.formatWidth = width; touchNoError(); } UnicodeString DecimalFormat::getPadCharacterString() const { if (fields == nullptr || fields->properties.padString.isBogus()) { // Readonly-alias the static string kFallbackPaddingString return {true, kFallbackPaddingString, -1}; } else { return fields->properties.padString; } } void DecimalFormat::setPadCharacter(const UnicodeString& padChar) { if (fields == nullptr) { return; } if (padChar == fields->properties.padString) { return; } if (padChar.length() > 0) { fields->properties.padString = UnicodeString(padChar.char32At(0)); } else { fields->properties.padString.setToBogus(); } touchNoError(); } EPadPosition DecimalFormat::getPadPosition() const { if (fields == nullptr || fields->properties.padPosition.isNull()) { return EPadPosition::kPadBeforePrefix; } else { // UNumberFormatPadPosition and EPadPosition have the same values. return static_cast(fields->properties.padPosition.getNoError()); } } void DecimalFormat::setPadPosition(EPadPosition padPos) { if (fields == nullptr) { return; } auto uPadPos = static_cast(padPos); if (!fields->properties.padPosition.isNull() && uPadPos == fields->properties.padPosition.getNoError()) { return; } fields->properties.padPosition = uPadPos; touchNoError(); } UBool DecimalFormat::isScientificNotation() const { // Not much we can do to report an error. if (fields == nullptr) { // Fallback to using the default instance of DecimalFormatProperties. 
return (DecimalFormatProperties::getDefault().minimumExponentDigits != -1); } return (fields->properties.minimumExponentDigits != -1); } void DecimalFormat::setScientificNotation(UBool useScientific) { if (fields == nullptr) { return; } int32_t minExp = useScientific ? 1 : -1; if (fields->properties.minimumExponentDigits == minExp) { return; } if (useScientific) { fields->properties.minimumExponentDigits = 1; } else { fields->properties.minimumExponentDigits = -1; } touchNoError(); } int8_t DecimalFormat::getMinimumExponentDigits() const { // Not much we can do to report an error. if (fields == nullptr) { // Fallback to using the default instance of DecimalFormatProperties. return static_cast(DecimalFormatProperties::getDefault().minimumExponentDigits); } return static_cast(fields->properties.minimumExponentDigits); } void DecimalFormat::setMinimumExponentDigits(int8_t minExpDig) { if (fields == nullptr) { return; } if (minExpDig == fields->properties.minimumExponentDigits) { return; } fields->properties.minimumExponentDigits = minExpDig; touchNoError(); } UBool DecimalFormat::isExponentSignAlwaysShown() const { // Not much we can do to report an error. if (fields == nullptr) { // Fallback to using the default instance of DecimalFormatProperties. return DecimalFormatProperties::getDefault().exponentSignAlwaysShown; } return fields->properties.exponentSignAlwaysShown; } void DecimalFormat::setExponentSignAlwaysShown(UBool expSignAlways) { if (fields == nullptr) { return; } if (UBOOL_TO_BOOL(expSignAlways) == fields->properties.exponentSignAlwaysShown) { return; } fields->properties.exponentSignAlwaysShown = expSignAlways; touchNoError(); } int32_t DecimalFormat::getGroupingSize() const { int32_t groupingSize; // Not much we can do to report an error. if (fields == nullptr) { // Fallback to using the default instance of DecimalFormatProperties. 
groupingSize = DecimalFormatProperties::getDefault().groupingSize; } else { groupingSize = fields->properties.groupingSize; } if (groupingSize < 0) { return 0; } return groupingSize; } void DecimalFormat::setGroupingSize(int32_t newValue) { if (fields == nullptr) { return; } if (newValue == fields->properties.groupingSize) { return; } fields->properties.groupingSize = newValue; touchNoError(); } int32_t DecimalFormat::getSecondaryGroupingSize() const { int32_t grouping2; // Not much we can do to report an error. if (fields == nullptr) { // Fallback to using the default instance of DecimalFormatProperties. grouping2 = DecimalFormatProperties::getDefault().secondaryGroupingSize; } else { grouping2 = fields->properties.secondaryGroupingSize; } if (grouping2 < 0) { return 0; } return grouping2; } void DecimalFormat::setSecondaryGroupingSize(int32_t newValue) { if (fields == nullptr) { return; } if (newValue == fields->properties.secondaryGroupingSize) { return; } fields->properties.secondaryGroupingSize = newValue; touchNoError(); } int32_t DecimalFormat::getMinimumGroupingDigits() const { // Not much we can do to report an error. if (fields == nullptr) { // Fallback to using the default instance of DecimalFormatProperties. return DecimalFormatProperties::getDefault().minimumGroupingDigits; } return fields->properties.minimumGroupingDigits; } void DecimalFormat::setMinimumGroupingDigits(int32_t newValue) { if (fields == nullptr) { return; } if (newValue == fields->properties.minimumGroupingDigits) { return; } fields->properties.minimumGroupingDigits = newValue; touchNoError(); } UBool DecimalFormat::isDecimalSeparatorAlwaysShown() const { // Not much we can do to report an error. if (fields == nullptr) { // Fallback to using the default instance of DecimalFormatProperties. 
return DecimalFormatProperties::getDefault().decimalSeparatorAlwaysShown; } return fields->properties.decimalSeparatorAlwaysShown; } void DecimalFormat::setDecimalSeparatorAlwaysShown(UBool newValue) { if (fields == nullptr) { return; } if (UBOOL_TO_BOOL(newValue) == fields->properties.decimalSeparatorAlwaysShown) { return; } fields->properties.decimalSeparatorAlwaysShown = newValue; touchNoError(); } UBool DecimalFormat::isDecimalPatternMatchRequired() const { // Not much we can do to report an error. if (fields == nullptr) { // Fallback to using the default instance of DecimalFormatProperties. return DecimalFormatProperties::getDefault().decimalPatternMatchRequired; } return fields->properties.decimalPatternMatchRequired; } void DecimalFormat::setDecimalPatternMatchRequired(UBool newValue) { if (fields == nullptr) { return; } if (UBOOL_TO_BOOL(newValue) == fields->properties.decimalPatternMatchRequired) { return; } fields->properties.decimalPatternMatchRequired = newValue; touchNoError(); } UBool DecimalFormat::isParseNoExponent() const { // Not much we can do to report an error. if (fields == nullptr) { // Fallback to using the default instance of DecimalFormatProperties. return DecimalFormatProperties::getDefault().parseNoExponent; } return fields->properties.parseNoExponent; } void DecimalFormat::setParseNoExponent(UBool value) { if (fields == nullptr) { return; } if (UBOOL_TO_BOOL(value) == fields->properties.parseNoExponent) { return; } fields->properties.parseNoExponent = value; touchNoError(); } UBool DecimalFormat::isParseCaseSensitive() const { // Not much we can do to report an error. if (fields == nullptr) { // Fallback to using the default instance of DecimalFormatProperties. 
return DecimalFormatProperties::getDefault().parseCaseSensitive; } return fields->properties.parseCaseSensitive; } void DecimalFormat::setParseCaseSensitive(UBool value) { if (fields == nullptr) { return; } if (UBOOL_TO_BOOL(value) == fields->properties.parseCaseSensitive) { return; } fields->properties.parseCaseSensitive = value; touchNoError(); } UBool DecimalFormat::isFormatFailIfMoreThanMaxDigits() const { // Not much we can do to report an error. if (fields == nullptr) { // Fallback to using the default instance of DecimalFormatProperties. return DecimalFormatProperties::getDefault().formatFailIfMoreThanMaxDigits; } return fields->properties.formatFailIfMoreThanMaxDigits; } void DecimalFormat::setFormatFailIfMoreThanMaxDigits(UBool value) { if (fields == nullptr) { return; } if (UBOOL_TO_BOOL(value) == fields->properties.formatFailIfMoreThanMaxDigits) { return; } fields->properties.formatFailIfMoreThanMaxDigits = value; touchNoError(); } UnicodeString& DecimalFormat::toPattern(UnicodeString& result) const { if (fields == nullptr) { // We only get here if an OOM error happened during construction, copy construction, assignment, or modification. result.setToBogus(); return result; } // Pull some properties from exportedProperties and others from properties // to keep affix patterns intact. In particular, pull rounding properties // so that CurrencyUsage is reflected properly. // TODO: Consider putting this logic in number_patternstring.cpp instead. 
ErrorCode localStatus; DecimalFormatProperties tprops(fields->properties); bool useCurrency = ( !tprops.currency.isNull() || !tprops.currencyPluralInfo.fPtr.isNull() || !tprops.currencyUsage.isNull() || tprops.currencyAsDecimal || AffixUtils::hasCurrencySymbols(tprops.positivePrefixPattern, localStatus) || AffixUtils::hasCurrencySymbols(tprops.positiveSuffixPattern, localStatus) || AffixUtils::hasCurrencySymbols(tprops.negativePrefixPattern, localStatus) || AffixUtils::hasCurrencySymbols(tprops.negativeSuffixPattern, localStatus)); if (useCurrency) { tprops.minimumFractionDigits = fields->exportedProperties.minimumFractionDigits; tprops.maximumFractionDigits = fields->exportedProperties.maximumFractionDigits; tprops.roundingIncrement = fields->exportedProperties.roundingIncrement; } result = PatternStringUtils::propertiesToPatternString(tprops, localStatus); return result; } UnicodeString& DecimalFormat::toLocalizedPattern(UnicodeString& result) const { if (fields == nullptr) { // We only get here if an OOM error happened during construction, copy construction, assignment, or modification. result.setToBogus(); return result; } ErrorCode localStatus; result = toPattern(result); result = PatternStringUtils::convertLocalized(result, *getDecimalFormatSymbols(), true, localStatus); return result; } void DecimalFormat::applyPattern(const UnicodeString& pattern, UParseError&, UErrorCode& status) { // TODO: What is parseError for? applyPattern(pattern, status); } void DecimalFormat::applyPattern(const UnicodeString& pattern, UErrorCode& status) { // don't overwrite status if it's already a failure. if (U_FAILURE(status)) { return; } if (fields == nullptr) { // We only get here if an OOM error happened during construction, copy construction, assignment, or modification. 
status = U_MEMORY_ALLOCATION_ERROR; return; } setPropertiesFromPattern(pattern, IGNORE_ROUNDING_NEVER, status); touch(status); } void DecimalFormat::applyLocalizedPattern(const UnicodeString& localizedPattern, UParseError&, UErrorCode& status) { // TODO: What is parseError for? applyLocalizedPattern(localizedPattern, status); } void DecimalFormat::applyLocalizedPattern(const UnicodeString& localizedPattern, UErrorCode& status) { // don't overwrite status if it's already a failure. if (U_FAILURE(status)) { return; } if (fields == nullptr) { // We only get here if an OOM error happened during construction, copy construction, assignment, or modification. status = U_MEMORY_ALLOCATION_ERROR; return; } UnicodeString pattern = PatternStringUtils::convertLocalized( localizedPattern, *getDecimalFormatSymbols(), false, status); applyPattern(pattern, status); } void DecimalFormat::setMaximumIntegerDigits(int32_t newValue) { if (fields == nullptr) { return; } if (newValue == fields->properties.maximumIntegerDigits) { return; } // For backwards compatibility, conflicting min/max need to keep the most recent setting. int32_t min = fields->properties.minimumIntegerDigits; if (min >= 0 && min > newValue) { fields->properties.minimumIntegerDigits = newValue; } fields->properties.maximumIntegerDigits = newValue; touchNoError(); } void DecimalFormat::setMinimumIntegerDigits(int32_t newValue) { if (fields == nullptr) { return; } if (newValue == fields->properties.minimumIntegerDigits) { return; } // For backwards compatibility, conflicting min/max need to keep the most recent setting. 
int32_t max = fields->properties.maximumIntegerDigits; if (max >= 0 && max < newValue) { fields->properties.maximumIntegerDigits = newValue; } fields->properties.minimumIntegerDigits = newValue; touchNoError(); } void DecimalFormat::setMaximumFractionDigits(int32_t newValue) { if (fields == nullptr) { return; } if (newValue == fields->properties.maximumFractionDigits) { return; } // cap for backward compatibility, formerly 340, now 999 if (newValue > kMaxIntFracSig) { newValue = kMaxIntFracSig; } // For backwards compatibility, conflicting min/max need to keep the most recent setting. int32_t min = fields->properties.minimumFractionDigits; if (min >= 0 && min > newValue) { fields->properties.minimumFractionDigits = newValue; } fields->properties.maximumFractionDigits = newValue; touchNoError(); } void DecimalFormat::setMinimumFractionDigits(int32_t newValue) { if (fields == nullptr) { return; } if (newValue == fields->properties.minimumFractionDigits) { return; } // For backwards compatibility, conflicting min/max need to keep the most recent setting. int32_t max = fields->properties.maximumFractionDigits; if (max >= 0 && max < newValue) { fields->properties.maximumFractionDigits = newValue; } fields->properties.minimumFractionDigits = newValue; touchNoError(); } int32_t DecimalFormat::getMinimumSignificantDigits() const { // Not much we can do to report an error. if (fields == nullptr) { // Fallback to using the default instance of DecimalFormatProperties. return DecimalFormatProperties::getDefault().minimumSignificantDigits; } return fields->exportedProperties.minimumSignificantDigits; } int32_t DecimalFormat::getMaximumSignificantDigits() const { // Not much we can do to report an error. if (fields == nullptr) { // Fallback to using the default instance of DecimalFormatProperties. 
return DecimalFormatProperties::getDefault().maximumSignificantDigits; } return fields->exportedProperties.maximumSignificantDigits; } void DecimalFormat::setMinimumSignificantDigits(int32_t value) { if (fields == nullptr) { return; } if (value == fields->properties.minimumSignificantDigits) { return; } int32_t max = fields->properties.maximumSignificantDigits; if (max >= 0 && max < value) { fields->properties.maximumSignificantDigits = value; } fields->properties.minimumSignificantDigits = value; touchNoError(); } void DecimalFormat::setMaximumSignificantDigits(int32_t value) { if (fields == nullptr) { return; } if (value == fields->properties.maximumSignificantDigits) { return; } int32_t min = fields->properties.minimumSignificantDigits; if (min >= 0 && min > value) { fields->properties.minimumSignificantDigits = value; } fields->properties.maximumSignificantDigits = value; touchNoError(); } UBool DecimalFormat::areSignificantDigitsUsed() const { const DecimalFormatProperties* dfp; // Not much we can do to report an error. if (fields == nullptr) { // Fallback to using the default instance of DecimalFormatProperties. dfp = &(DecimalFormatProperties::getDefault()); } else { dfp = &fields->properties; } return dfp->minimumSignificantDigits != -1 || dfp->maximumSignificantDigits != -1; } void DecimalFormat::setSignificantDigitsUsed(UBool useSignificantDigits) { if (fields == nullptr) { return; } // These are the default values from the old implementation. if (useSignificantDigits) { if (fields->properties.minimumSignificantDigits != -1 || fields->properties.maximumSignificantDigits != -1) { return; } } else { if (fields->properties.minimumSignificantDigits == -1 && fields->properties.maximumSignificantDigits == -1) { return; } } int32_t minSig = useSignificantDigits ? 1 : -1; int32_t maxSig = useSignificantDigits ? 
6 : -1; fields->properties.minimumSignificantDigits = minSig; fields->properties.maximumSignificantDigits = maxSig; touchNoError(); } void DecimalFormat::setCurrency(const char16_t* theCurrency, UErrorCode& ec) { // don't overwrite ec if it's already a failure. if (U_FAILURE(ec)) { return; } if (fields == nullptr) { // We only get here if an OOM error happened during construction, copy construction, assignment, or modification. ec = U_MEMORY_ALLOCATION_ERROR; return; } CurrencyUnit currencyUnit(theCurrency, ec); if (U_FAILURE(ec)) { return; } if (!fields->properties.currency.isNull() && fields->properties.currency.getNoError() == currencyUnit) { return; } NumberFormat::setCurrency(theCurrency, ec); // to set field for compatibility fields->properties.currency = currencyUnit; // In Java, the DecimalFormatSymbols is mutable. Why not in C++? LocalPointer newSymbols(new DecimalFormatSymbols(*getDecimalFormatSymbols()), ec); newSymbols->setCurrency(currencyUnit.getISOCurrency(), ec); fields->symbols.adoptInsteadAndCheckErrorCode(newSymbols.orphan(), ec); touch(ec); } void DecimalFormat::setCurrency(const char16_t* theCurrency) { ErrorCode localStatus; setCurrency(theCurrency, localStatus); } void DecimalFormat::setCurrencyUsage(UCurrencyUsage newUsage, UErrorCode* ec) { // don't overwrite ec if it's already a failure. if (U_FAILURE(*ec)) { return; } if (fields == nullptr) { // We only get here if an OOM error happened during construction, copy construction, assignment, or modification. *ec = U_MEMORY_ALLOCATION_ERROR; return; } if (!fields->properties.currencyUsage.isNull() && newUsage == fields->properties.currencyUsage.getNoError()) { return; } fields->properties.currencyUsage = newUsage; touch(*ec); } UCurrencyUsage DecimalFormat::getCurrencyUsage() const { // CurrencyUsage is not exported, so we have to get it from the input property bag. // TODO: Should we export CurrencyUsage instead? 
if (fields == nullptr || fields->properties.currencyUsage.isNull()) { return UCURR_USAGE_STANDARD; } return fields->properties.currencyUsage.getNoError(); } void DecimalFormat::formatToDecimalQuantity(double number, DecimalQuantity& output, UErrorCode& status) const { // don't overwrite status if it's already a failure. if (U_FAILURE(status)) { return; } if (fields == nullptr) { // We only get here if an OOM error happened during construction, copy construction, assignment, or modification. status = U_MEMORY_ALLOCATION_ERROR; return; } fields->formatter.formatDouble(number, status).getDecimalQuantity(output, status); } void DecimalFormat::formatToDecimalQuantity(const Formattable& number, DecimalQuantity& output, UErrorCode& status) const { // don't overwrite status if it's already a failure. if (U_FAILURE(status)) { return; } if (fields == nullptr) { // We only get here if an OOM error happened during construction, copy construction, assignment, or modification. status = U_MEMORY_ALLOCATION_ERROR; return; } UFormattedNumberData obj; number.populateDecimalQuantity(obj.quantity, status); fields->formatter.formatImpl(&obj, status); output = std::move(obj.quantity); } const number::LocalizedNumberFormatter* DecimalFormat::toNumberFormatter(UErrorCode& status) const { // We sometimes need to return nullptr here (see ICU-20380) if (U_FAILURE(status)) { return nullptr; } if (fields == nullptr) { // We only get here if an OOM error happened during construction, copy construction, assignment, or modification. status = U_MEMORY_ALLOCATION_ERROR; return nullptr; } return &fields->formatter; } /** Rebuilds the formatter object from the property bag. */ void DecimalFormat::touch(UErrorCode& status) { if (U_FAILURE(status)) { return; } if (fields == nullptr) { // We only get here if an OOM error happened during construction, copy construction, assignment, or modification. // For regular construction, the caller should have checked the status variable for errors. 
// For copy construction, there is unfortunately nothing to report the error, so we need to guard against // this possible bad state here and set the status to an error. status = U_MEMORY_ALLOCATION_ERROR; return; } // In C++, fields->symbols (or, if it's null, the DecimalFormatSymbols owned by the underlying LocalizedNumberFormatter) // is the source of truth for the locale. const DecimalFormatSymbols* symbols = getDecimalFormatSymbols(); Locale locale = symbols->getLocale(); // Note: The formatter is relatively cheap to create, and we need it to populate fields->exportedProperties, // so automatically recompute it here. The parser is a bit more expensive and is not needed until the // parse method is called, so defer that until needed. // TODO: Only update the pieces that changed instead of re-computing the whole formatter? // Since memory has already been allocated for the formatter, we can move assign a stack-allocated object // and don't need to call new. (Which is slower and could possibly fail). // [Note that "symbols" above might point to the DecimalFormatSymbols object owned by fields->formatter. // That's okay, because NumberPropertyMapper::create() will clone it before fields->formatter's assignment // operator deletes it. But it does mean that "symbols" can't be counted on to be good after this line.] fields->formatter = NumberPropertyMapper::create( fields->properties, *symbols, fields->warehouse, fields->exportedProperties, status ).locale(locale); fields->symbols.adoptInstead(nullptr); // the fields->symbols property is only temporary, until we can copy it into a new LocalizedNumberFormatter // Do this after fields->exportedProperties are set up setupFastFormat(); // Delete the parsers if they were made previously delete fields->atomicParser.exchange(nullptr); delete fields->atomicCurrencyParser.exchange(nullptr); // In order for the getters to work, we need to populate some fields in NumberFormat. 
NumberFormat::setCurrency(fields->exportedProperties.currency.get(status).getISOCurrency(), status); NumberFormat::setMaximumIntegerDigits(fields->exportedProperties.maximumIntegerDigits); NumberFormat::setMinimumIntegerDigits(fields->exportedProperties.minimumIntegerDigits); NumberFormat::setMaximumFractionDigits(fields->exportedProperties.maximumFractionDigits); NumberFormat::setMinimumFractionDigits(fields->exportedProperties.minimumFractionDigits); // fImpl->properties, not fields->exportedProperties, since this information comes from the pattern: NumberFormat::setGroupingUsed(fields->properties.groupingUsed); } void DecimalFormat::touchNoError() { UErrorCode localStatus = U_ZERO_ERROR; touch(localStatus); } void DecimalFormat::setPropertiesFromPattern(const UnicodeString& pattern, int32_t ignoreRounding, UErrorCode& status) { if (U_SUCCESS(status)) { // Cast workaround to get around putting the enum in the public header file auto actualIgnoreRounding = static_cast(ignoreRounding); PatternParser::parseToExistingProperties(pattern, fields->properties, actualIgnoreRounding, status); } } const numparse::impl::NumberParserImpl* DecimalFormat::getParser(UErrorCode& status) const { // TODO: Move this into umutex.h? (similar logic also in numrange_fluent.cpp) // See ICU-20146 if (U_FAILURE(status)) { return nullptr; } // First try to get the pre-computed parser auto* ptr = fields->atomicParser.load(); if (ptr != nullptr) { return ptr; } // Try computing the parser on our own auto* temp = NumberParserImpl::createParserFromProperties(fields->properties, *getDecimalFormatSymbols(), false, status); if (U_FAILURE(status)) { return nullptr; } if (temp == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; return nullptr; } // Note: ptr starts as nullptr; during compare_exchange, // it is set to what is actually stored in the atomic // if another thread beat us to computing the parser object. 
auto* nonConstThis = const_cast(this); if (!nonConstThis->fields->atomicParser.compare_exchange_strong(ptr, temp)) { // Another thread beat us to computing the parser delete temp; return ptr; } else { // Our copy of the parser got stored in the atomic return temp; } } const numparse::impl::NumberParserImpl* DecimalFormat::getCurrencyParser(UErrorCode& status) const { if (U_FAILURE(status)) { return nullptr; } // First try to get the pre-computed parser auto* ptr = fields->atomicCurrencyParser.load(); if (ptr != nullptr) { return ptr; } // Try computing the parser on our own auto* temp = NumberParserImpl::createParserFromProperties(fields->properties, *getDecimalFormatSymbols(), true, status); if (temp == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; // although we may still dereference, call sites should be guarded } // Note: ptr starts as nullptr; during compare_exchange, it is set to what is actually stored in the // atomic if another thread beat us to computing the parser object. auto* nonConstThis = const_cast(this); if (!nonConstThis->fields->atomicCurrencyParser.compare_exchange_strong(ptr, temp)) { // Another thread beat us to computing the parser delete temp; return ptr; } else { // Our copy of the parser got stored in the atomic return temp; } } void DecimalFormat::fieldPositionHelper( const UFormattedNumberData& formatted, FieldPosition& fieldPosition, int32_t offset, UErrorCode& status) { if (U_FAILURE(status)) { return; } // always return first occurrence: fieldPosition.setBeginIndex(0); fieldPosition.setEndIndex(0); bool found = formatted.nextFieldPosition(fieldPosition, status); if (found && offset != 0) { FieldPositionOnlyHandler fpoh(fieldPosition); fpoh.shiftLast(offset); } } void DecimalFormat::fieldPositionIteratorHelper( const UFormattedNumberData& formatted, FieldPositionIterator* fpi, int32_t offset, UErrorCode& status) { if (U_SUCCESS(status) && (fpi != nullptr)) { FieldPositionIteratorHandler fpih(fpi, status); fpih.setShift(offset); 
formatted.getAllFieldPositions(fpih, status); } } // To debug fast-format, change void(x) to printf(x) #define trace(x) void(x) void DecimalFormat::setupFastFormat() { // Check the majority of properties: if (!fields->properties.equalsDefaultExceptFastFormat()) { trace("no fast format: equality\n"); fields->canUseFastFormat = false; return; } // Now check the remaining properties. // Nontrivial affixes: UBool trivialPP = fields->properties.positivePrefixPattern.isEmpty(); UBool trivialPS = fields->properties.positiveSuffixPattern.isEmpty(); UBool trivialNP = fields->properties.negativePrefixPattern.isBogus() || ( fields->properties.negativePrefixPattern.length() == 1 && fields->properties.negativePrefixPattern.charAt(0) == u'-'); UBool trivialNS = fields->properties.negativeSuffixPattern.isEmpty(); if (!trivialPP || !trivialPS || !trivialNP || !trivialNS) { trace("no fast format: affixes\n"); fields->canUseFastFormat = false; return; } const DecimalFormatSymbols* symbols = getDecimalFormatSymbols(); // Grouping (secondary grouping is forbidden in equalsDefaultExceptFastFormat): bool groupingUsed = fields->properties.groupingUsed; int32_t groupingSize = fields->properties.groupingSize; bool unusualGroupingSize = groupingSize > 0 && groupingSize != 3; const UnicodeString& groupingString = symbols->getConstSymbol(DecimalFormatSymbols::kGroupingSeparatorSymbol); if (groupingUsed && (unusualGroupingSize || groupingString.length() != 1)) { trace("no fast format: grouping\n"); fields->canUseFastFormat = false; return; } // Integer length: int32_t minInt = fields->exportedProperties.minimumIntegerDigits; int32_t maxInt = fields->exportedProperties.maximumIntegerDigits; // Fastpath supports up to only 10 digits (length of INT32_MIN) if (minInt > 10) { trace("no fast format: integer\n"); fields->canUseFastFormat = false; return; } // Fraction length (no fraction part allowed in fast path): int32_t minFrac = fields->exportedProperties.minimumFractionDigits; if (minFrac > 0) { 
trace("no fast format: fraction\n"); fields->canUseFastFormat = false; return; } // Other symbols: const UnicodeString& minusSignString = symbols->getConstSymbol(DecimalFormatSymbols::kMinusSignSymbol); UChar32 codePointZero = symbols->getCodePointZero(); if (minusSignString.length() != 1 || U16_LENGTH(codePointZero) != 1) { trace("no fast format: symbols\n"); fields->canUseFastFormat = false; return; } // Good to go! trace("can use fast format!\n"); fields->canUseFastFormat = true; fields->fastData.cpZero = static_cast(codePointZero); fields->fastData.cpGroupingSeparator = groupingUsed && groupingSize == 3 ? groupingString.charAt(0) : 0; fields->fastData.cpMinusSign = minusSignString.charAt(0); fields->fastData.minInt = (minInt < 0 || minInt > 127) ? 0 : static_cast(minInt); fields->fastData.maxInt = (maxInt < 0 || maxInt > 127) ? 127 : static_cast(maxInt); } bool DecimalFormat::fastFormatDouble(double input, UnicodeString& output) const { if (!fields->canUseFastFormat) { return false; } if (std::isnan(input) || uprv_trunc(input) != input || input <= INT32_MIN || input > INT32_MAX) { return false; } doFastFormatInt32(static_cast(input), std::signbit(input), output); return true; } bool DecimalFormat::fastFormatInt64(int64_t input, UnicodeString& output) const { if (!fields->canUseFastFormat) { return false; } if (input <= INT32_MIN || input > INT32_MAX) { return false; } doFastFormatInt32(static_cast(input), input < 0, output); return true; } void DecimalFormat::doFastFormatInt32(int32_t input, bool isNegative, UnicodeString& output) const { U_ASSERT(fields->canUseFastFormat); if (isNegative) { output.append(fields->fastData.cpMinusSign); U_ASSERT(input != INT32_MIN); // handled by callers input = -input; } // Cap at int32_t to make the buffer small and operations fast. 
// Longest string: "2,147,483,648" (13 chars in length) static constexpr int32_t localCapacity = 13; char16_t localBuffer[localCapacity]; char16_t* ptr = localBuffer + localCapacity; int8_t group = 0; int8_t minInt = (fields->fastData.minInt < 1)? 1: fields->fastData.minInt; for (int8_t i = 0; i < fields->fastData.maxInt && (input != 0 || i < minInt); i++) { if (group++ == 3 && fields->fastData.cpGroupingSeparator != 0) { *(--ptr) = fields->fastData.cpGroupingSeparator; group = 1; } std::div_t res = std::div(input, 10); *(--ptr) = static_cast(fields->fastData.cpZero + res.rem); input = res.quot; } int32_t len = localCapacity - static_cast(ptr - localBuffer); output.append(ptr, len); } #endif /* #if !UCONFIG_NO_FORMATTING */ stringi/src/icu74/i18n/tznames.cpp0000644000176200001440000004022414700200761016416 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ******************************************************************************* * Copyright (C) 2011-2015, International Business Machines Corporation and * * others. All Rights Reserved. 
* ******************************************************************************* */ #include "unicode/utypes.h" #if !UCONFIG_NO_FORMATTING #include "unicode/locid.h" #include "unicode/tznames.h" #include "unicode/uenum.h" #include "cmemory.h" #include "cstring.h" #include "mutex.h" #include "putilimp.h" #include "tznames_impl.h" #include "uassert.h" #include "ucln_in.h" #include "uhash.h" #include "umutex.h" #include "uvector.h" U_NAMESPACE_BEGIN // TimeZoneNames object cache handling static UMutex gTimeZoneNamesLock; static UHashtable *gTimeZoneNamesCache = nullptr; static UBool gTimeZoneNamesCacheInitialized = false; // Access count - incremented every time up to SWEEP_INTERVAL, // then reset to 0 static int32_t gAccessCount = 0; // Interval for calling the cache sweep function - every 100 times #define SWEEP_INTERVAL 100 // Cache expiration in millisecond. When a cached entry is no // longer referenced and exceeding this threshold since last // access time, then the cache entry will be deleted by the sweep // function. For now, 3 minutes. #define CACHE_EXPIRATION 180000.0 typedef struct TimeZoneNamesCacheEntry { TimeZoneNames* names; int32_t refCount; double lastAccess; } TimeZoneNamesCacheEntry; U_CDECL_BEGIN /** * Cleanup callback func */ static UBool U_CALLCONV timeZoneNames_cleanup() { if (gTimeZoneNamesCache != nullptr) { uhash_close(gTimeZoneNamesCache); gTimeZoneNamesCache = nullptr; } gTimeZoneNamesCacheInitialized = false; return true; } /** * Deleter for TimeZoneNamesCacheEntry */ static void U_CALLCONV deleteTimeZoneNamesCacheEntry(void *obj) { icu::TimeZoneNamesCacheEntry *entry = (icu::TimeZoneNamesCacheEntry*)obj; delete (icu::TimeZoneNamesImpl*) entry->names; uprv_free(entry); } U_CDECL_END /** * Function used for removing unreferrenced cache entries exceeding * the expiration time. This function must be called with in the mutex * block. 
*/ static void sweepCache() { int32_t pos = UHASH_FIRST; const UHashElement* elem; double now = (double)uprv_getUTCtime(); while ((elem = uhash_nextElement(gTimeZoneNamesCache, &pos)) != 0) { TimeZoneNamesCacheEntry *entry = (TimeZoneNamesCacheEntry *)elem->value.pointer; if (entry->refCount <= 0 && (now - entry->lastAccess) > CACHE_EXPIRATION) { // delete this entry uhash_removeElement(gTimeZoneNamesCache, elem); } } } // --------------------------------------------------- // TimeZoneNamesDelegate // --------------------------------------------------- class TimeZoneNamesDelegate : public TimeZoneNames { public: TimeZoneNamesDelegate(const Locale& locale, UErrorCode& status); virtual ~TimeZoneNamesDelegate(); virtual bool operator==(const TimeZoneNames& other) const override; virtual bool operator!=(const TimeZoneNames& other) const {return !operator==(other);} virtual TimeZoneNamesDelegate* clone() const override; StringEnumeration* getAvailableMetaZoneIDs(UErrorCode& status) const override; StringEnumeration* getAvailableMetaZoneIDs(const UnicodeString& tzID, UErrorCode& status) const override; UnicodeString& getMetaZoneID(const UnicodeString& tzID, UDate date, UnicodeString& mzID) const override; UnicodeString& getReferenceZoneID(const UnicodeString& mzID, const char* region, UnicodeString& tzID) const override; UnicodeString& getMetaZoneDisplayName(const UnicodeString& mzID, UTimeZoneNameType type, UnicodeString& name) const override; UnicodeString& getTimeZoneDisplayName(const UnicodeString& tzID, UTimeZoneNameType type, UnicodeString& name) const override; UnicodeString& getExemplarLocationName(const UnicodeString& tzID, UnicodeString& name) const override; void loadAllDisplayNames(UErrorCode& status) override; void getDisplayNames(const UnicodeString& tzID, const UTimeZoneNameType types[], int32_t numTypes, UDate date, UnicodeString dest[], UErrorCode& status) const override; MatchInfoCollection* find(const UnicodeString& text, int32_t start, uint32_t types, 
UErrorCode& status) const override; private: TimeZoneNamesDelegate(); TimeZoneNamesCacheEntry* fTZnamesCacheEntry; }; TimeZoneNamesDelegate::TimeZoneNamesDelegate() : fTZnamesCacheEntry(0) { } TimeZoneNamesDelegate::TimeZoneNamesDelegate(const Locale& locale, UErrorCode& status) { Mutex lock(&gTimeZoneNamesLock); if (!gTimeZoneNamesCacheInitialized) { // Create empty hashtable if it is not already initialized. gTimeZoneNamesCache = uhash_open(uhash_hashChars, uhash_compareChars, nullptr, &status); if (U_SUCCESS(status)) { uhash_setKeyDeleter(gTimeZoneNamesCache, uprv_free); uhash_setValueDeleter(gTimeZoneNamesCache, deleteTimeZoneNamesCacheEntry); gTimeZoneNamesCacheInitialized = true; ucln_i18n_registerCleanup(UCLN_I18N_TIMEZONENAMES, timeZoneNames_cleanup); } } if (U_FAILURE(status)) { return; } // Check the cache, if not available, create new one and cache TimeZoneNamesCacheEntry *cacheEntry = nullptr; const char *key = locale.getName(); cacheEntry = (TimeZoneNamesCacheEntry *)uhash_get(gTimeZoneNamesCache, key); if (cacheEntry == nullptr) { TimeZoneNames *tznames = nullptr; char *newKey = nullptr; tznames = new TimeZoneNamesImpl(locale, status); if (tznames == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; } if (U_SUCCESS(status)) { newKey = (char *)uprv_malloc(uprv_strlen(key) + 1); if (newKey == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; } else { uprv_strcpy(newKey, key); } } if (U_SUCCESS(status)) { cacheEntry = (TimeZoneNamesCacheEntry *)uprv_malloc(sizeof(TimeZoneNamesCacheEntry)); if (cacheEntry == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; } else { cacheEntry->names = tznames; cacheEntry->refCount = 1; cacheEntry->lastAccess = (double)uprv_getUTCtime(); uhash_put(gTimeZoneNamesCache, newKey, cacheEntry, &status); } } if (U_FAILURE(status)) { if (tznames != nullptr) { delete tznames; } if (newKey != nullptr) { uprv_free(newKey); } if (cacheEntry != nullptr) { uprv_free(cacheEntry); } cacheEntry = nullptr; } } else { // Update the reference count 
cacheEntry->refCount++; cacheEntry->lastAccess = (double)uprv_getUTCtime(); } gAccessCount++; if (gAccessCount >= SWEEP_INTERVAL) { // sweep sweepCache(); gAccessCount = 0; } fTZnamesCacheEntry = cacheEntry; } TimeZoneNamesDelegate::~TimeZoneNamesDelegate() { umtx_lock(&gTimeZoneNamesLock); { if (fTZnamesCacheEntry) { U_ASSERT(fTZnamesCacheEntry->refCount > 0); // Just decrement the reference count fTZnamesCacheEntry->refCount--; } } umtx_unlock(&gTimeZoneNamesLock); } bool TimeZoneNamesDelegate::operator==(const TimeZoneNames& other) const { if (this == &other) { return true; } // Just compare if the other object also use the same // cache entry const TimeZoneNamesDelegate* rhs = dynamic_cast(&other); if (rhs) { return fTZnamesCacheEntry == rhs->fTZnamesCacheEntry; } return false; } TimeZoneNamesDelegate* TimeZoneNamesDelegate::clone() const { TimeZoneNamesDelegate* other = new TimeZoneNamesDelegate(); if (other != nullptr) { umtx_lock(&gTimeZoneNamesLock); { // Just increment the reference count fTZnamesCacheEntry->refCount++; other->fTZnamesCacheEntry = fTZnamesCacheEntry; } umtx_unlock(&gTimeZoneNamesLock); } return other; } StringEnumeration* TimeZoneNamesDelegate::getAvailableMetaZoneIDs(UErrorCode& status) const { return fTZnamesCacheEntry->names->getAvailableMetaZoneIDs(status); } StringEnumeration* TimeZoneNamesDelegate::getAvailableMetaZoneIDs(const UnicodeString& tzID, UErrorCode& status) const { return fTZnamesCacheEntry->names->getAvailableMetaZoneIDs(tzID, status); } UnicodeString& TimeZoneNamesDelegate::getMetaZoneID(const UnicodeString& tzID, UDate date, UnicodeString& mzID) const { return fTZnamesCacheEntry->names->getMetaZoneID(tzID, date, mzID); } UnicodeString& TimeZoneNamesDelegate::getReferenceZoneID(const UnicodeString& mzID, const char* region, UnicodeString& tzID) const { return fTZnamesCacheEntry->names->getReferenceZoneID(mzID, region, tzID); } UnicodeString& TimeZoneNamesDelegate::getMetaZoneDisplayName(const UnicodeString& mzID, 
UTimeZoneNameType type, UnicodeString& name) const { return fTZnamesCacheEntry->names->getMetaZoneDisplayName(mzID, type, name); } UnicodeString& TimeZoneNamesDelegate::getTimeZoneDisplayName(const UnicodeString& tzID, UTimeZoneNameType type, UnicodeString& name) const { return fTZnamesCacheEntry->names->getTimeZoneDisplayName(tzID, type, name); } UnicodeString& TimeZoneNamesDelegate::getExemplarLocationName(const UnicodeString& tzID, UnicodeString& name) const { return fTZnamesCacheEntry->names->getExemplarLocationName(tzID, name); } void TimeZoneNamesDelegate::loadAllDisplayNames(UErrorCode& status) { fTZnamesCacheEntry->names->loadAllDisplayNames(status); } void TimeZoneNamesDelegate::getDisplayNames(const UnicodeString& tzID, const UTimeZoneNameType types[], int32_t numTypes, UDate date, UnicodeString dest[], UErrorCode& status) const { fTZnamesCacheEntry->names->getDisplayNames(tzID, types, numTypes, date, dest, status); } TimeZoneNames::MatchInfoCollection* TimeZoneNamesDelegate::find(const UnicodeString& text, int32_t start, uint32_t types, UErrorCode& status) const { return fTZnamesCacheEntry->names->find(text, start, types, status); } // --------------------------------------------------- // TimeZoneNames base class // --------------------------------------------------- TimeZoneNames::~TimeZoneNames() { } TimeZoneNames* TimeZoneNames::createInstance(const Locale& locale, UErrorCode& status) { TimeZoneNames *instance = nullptr; if (U_SUCCESS(status)) { instance = new TimeZoneNamesDelegate(locale, status); if (instance == nullptr && U_SUCCESS(status)) { status = U_MEMORY_ALLOCATION_ERROR; } } return instance; } TimeZoneNames* TimeZoneNames::createTZDBInstance(const Locale& locale, UErrorCode& status) { TimeZoneNames *instance = nullptr; if (U_SUCCESS(status)) { instance = new TZDBTimeZoneNames(locale); if (instance == nullptr && U_SUCCESS(status)) { status = U_MEMORY_ALLOCATION_ERROR; } } return instance; } UnicodeString& 
TimeZoneNames::getExemplarLocationName(const UnicodeString& tzID, UnicodeString& name) const { return TimeZoneNamesImpl::getDefaultExemplarLocationName(tzID, name); } UnicodeString& TimeZoneNames::getDisplayName(const UnicodeString& tzID, UTimeZoneNameType type, UDate date, UnicodeString& name) const { getTimeZoneDisplayName(tzID, type, name); if (name.isEmpty()) { char16_t mzIDBuf[32]; UnicodeString mzID(mzIDBuf, 0, UPRV_LENGTHOF(mzIDBuf)); getMetaZoneID(tzID, date, mzID); getMetaZoneDisplayName(mzID, type, name); } return name; } // Empty default implementation, to be overridden in tznames_impl.cpp. void TimeZoneNames::loadAllDisplayNames(UErrorCode& /*status*/) { } // A default, lightweight implementation of getDisplayNames. // Overridden in tznames_impl.cpp. void TimeZoneNames::getDisplayNames(const UnicodeString& tzID, const UTimeZoneNameType types[], int32_t numTypes, UDate date, UnicodeString dest[], UErrorCode& status) const { if (U_FAILURE(status)) { return; } if (tzID.isEmpty()) { return; } UnicodeString mzID; for (int i = 0; i < numTypes; i++) { getTimeZoneDisplayName(tzID, types[i], dest[i]); if (dest[i].isEmpty()) { if (mzID.isEmpty()) { getMetaZoneID(tzID, date, mzID); } getMetaZoneDisplayName(mzID, types[i], dest[i]); } } } struct MatchInfo : UMemory { UTimeZoneNameType nameType; UnicodeString id; int32_t matchLength; UBool isTZID; MatchInfo(UTimeZoneNameType nameType, int32_t matchLength, const UnicodeString* tzID, const UnicodeString* mzID) { this->nameType = nameType; this->matchLength = matchLength; if (tzID != nullptr) { this->id.setTo(*tzID); this->isTZID = true; } else { this->id.setTo(*mzID); this->isTZID = false; } } }; U_CDECL_BEGIN static void U_CALLCONV deleteMatchInfo(void *obj) { delete static_cast(obj); } U_CDECL_END // --------------------------------------------------- // MatchInfoCollection class // --------------------------------------------------- TimeZoneNames::MatchInfoCollection::MatchInfoCollection() : fMatches(nullptr) { } 
TimeZoneNames::MatchInfoCollection::~MatchInfoCollection() { if (fMatches != nullptr) { delete fMatches; } } void TimeZoneNames::MatchInfoCollection::addZone(UTimeZoneNameType nameType, int32_t matchLength, const UnicodeString& tzID, UErrorCode& status) { if (U_FAILURE(status)) { return; } LocalPointer matchInfo(new MatchInfo(nameType, matchLength, &tzID, nullptr), status); UVector *matchesVec = matches(status); if (U_FAILURE(status)) { return; } matchesVec->adoptElement(matchInfo.orphan(), status); } void TimeZoneNames::MatchInfoCollection::addMetaZone(UTimeZoneNameType nameType, int32_t matchLength, const UnicodeString& mzID, UErrorCode& status) { if (U_FAILURE(status)) { return; } LocalPointer matchInfo(new MatchInfo(nameType, matchLength, nullptr, &mzID), status); UVector *matchesVec = matches(status); if (U_FAILURE(status)) { return; } matchesVec->adoptElement(matchInfo.orphan(), status); } int32_t TimeZoneNames::MatchInfoCollection::size() const { if (fMatches == nullptr) { return 0; } return fMatches->size(); } UTimeZoneNameType TimeZoneNames::MatchInfoCollection::getNameTypeAt(int32_t idx) const { const MatchInfo* match = (const MatchInfo*)fMatches->elementAt(idx); if (match) { return match->nameType; } return UTZNM_UNKNOWN; } int32_t TimeZoneNames::MatchInfoCollection::getMatchLengthAt(int32_t idx) const { const MatchInfo* match = (const MatchInfo*)fMatches->elementAt(idx); if (match) { return match->matchLength; } return 0; } UBool TimeZoneNames::MatchInfoCollection::getTimeZoneIDAt(int32_t idx, UnicodeString& tzID) const { tzID.remove(); const MatchInfo* match = (const MatchInfo*)fMatches->elementAt(idx); if (match && match->isTZID) { tzID.setTo(match->id); return true; } return false; } UBool TimeZoneNames::MatchInfoCollection::getMetaZoneIDAt(int32_t idx, UnicodeString& mzID) const { mzID.remove(); const MatchInfo* match = (const MatchInfo*)fMatches->elementAt(idx); if (match && !match->isTZID) { mzID.setTo(match->id); return true; } return false; } 
UVector* TimeZoneNames::MatchInfoCollection::matches(UErrorCode& status) { if (U_FAILURE(status)) { return nullptr; } if (fMatches != nullptr) { return fMatches; } fMatches = new UVector(deleteMatchInfo, nullptr, status); if (fMatches == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; } else if (U_FAILURE(status)) { delete fMatches; fMatches = nullptr; } return fMatches; } U_NAMESPACE_END #endif stringi/src/icu74/i18n/esctrn.cpp0000644000176200001440000001524714700200761016242 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ********************************************************************** * Copyright (c) 2001-2011, International Business Machines * Corporation and others. All Rights Reserved. ********************************************************************** * Date Name Description * 11/19/2001 aliu Creation. ********************************************************************** */ #include "unicode/utypes.h" #if !UCONFIG_NO_TRANSLITERATION #include "unicode/utf16.h" #include "esctrn.h" #include "util.h" U_NAMESPACE_BEGIN static const char16_t UNIPRE[] = {85,43,0}; // "U+" static const char16_t BS_u[] = {92,117,0}; // "\\u" static const char16_t BS_U[] = {92,85,0}; // "\\U" static const char16_t XMLPRE[] = {38,35,120,0}; // "&#x" static const char16_t XML10PRE[] = {38,35,0}; // "&#" static const char16_t PERLPRE[] = {92,120,123,0}; // "\\x{" static const char16_t SEMI[] = {59,0}; // ";" static const char16_t RBRACE[] = {125,0}; // "}" UOBJECT_DEFINE_RTTI_IMPLEMENTATION(EscapeTransliterator) /** * Factory methods */ static Transliterator* _createEscUnicode(const UnicodeString& ID, Transliterator::Token /*context*/) { // Unicode: "U+10FFFF" hex, min=4, max=6 return new EscapeTransliterator(ID, UnicodeString(true, UNIPRE, 2), UnicodeString(), 16, 4, true, nullptr); } static Transliterator* _createEscJava(const UnicodeString& ID, Transliterator::Token /*context*/) { // Java: 
"\\uFFFF" hex, min=4, max=4 return new EscapeTransliterator(ID, UnicodeString(true, BS_u, 2), UnicodeString(), 16, 4, false, nullptr); } static Transliterator* _createEscC(const UnicodeString& ID, Transliterator::Token /*context*/) { // C: "\\uFFFF" hex, min=4, max=4; \\U0010FFFF hex, min=8, max=8 return new EscapeTransliterator(ID, UnicodeString(true, BS_u, 2), UnicodeString(), 16, 4, true, new EscapeTransliterator(UnicodeString(), UnicodeString(true, BS_U, 2), UnicodeString(), 16, 8, true, nullptr)); } static Transliterator* _createEscXML(const UnicodeString& ID, Transliterator::Token /*context*/) { // XML: "􏿿" hex, min=1, max=6 return new EscapeTransliterator(ID, UnicodeString(true, XMLPRE, 3), UnicodeString(SEMI[0]), 16, 1, true, nullptr); } static Transliterator* _createEscXML10(const UnicodeString& ID, Transliterator::Token /*context*/) { // XML10: "&1114111;" dec, min=1, max=7 (not really "Any-Hex") return new EscapeTransliterator(ID, UnicodeString(true, XML10PRE, 2), UnicodeString(SEMI[0]), 10, 1, true, nullptr); } static Transliterator* _createEscPerl(const UnicodeString& ID, Transliterator::Token /*context*/) { // Perl: "\\x{263A}" hex, min=1, max=6 return new EscapeTransliterator(ID, UnicodeString(true, PERLPRE, 3), UnicodeString(RBRACE[0]), 16, 1, true, nullptr); } /** * Registers standard variants with the system. Called by * Transliterator during initialization. 
*/ void EscapeTransliterator::registerIDs() { Token t = integerToken(0); Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-Hex/Unicode"), _createEscUnicode, t); Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-Hex/Java"), _createEscJava, t); Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-Hex/C"), _createEscC, t); Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-Hex/XML"), _createEscXML, t); Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-Hex/XML10"), _createEscXML10, t); Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-Hex/Perl"), _createEscPerl, t); Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-Hex"), _createEscJava, t); } /** * Constructs an escape transliterator with the given ID and * parameters. See the class member documentation for details. */ EscapeTransliterator::EscapeTransliterator(const UnicodeString& newID, const UnicodeString& _prefix, const UnicodeString& _suffix, int32_t _radix, int32_t _minDigits, UBool _grokSupplementals, EscapeTransliterator* adoptedSupplementalHandler) : Transliterator(newID, nullptr) { this->prefix = _prefix; this->suffix = _suffix; this->radix = _radix; this->minDigits = _minDigits; this->grokSupplementals = _grokSupplementals; this->supplementalHandler = adoptedSupplementalHandler; } /** * Copy constructor. */ EscapeTransliterator::EscapeTransliterator(const EscapeTransliterator& o) : Transliterator(o), prefix(o.prefix), suffix(o.suffix), radix(o.radix), minDigits(o.minDigits), grokSupplementals(o.grokSupplementals) { supplementalHandler = (o.supplementalHandler != 0) ? new EscapeTransliterator(*o.supplementalHandler) : nullptr; } EscapeTransliterator::~EscapeTransliterator() { delete supplementalHandler; } /** * Transliterator API. */ EscapeTransliterator* EscapeTransliterator::clone() const { return new EscapeTransliterator(*this); } /** * Implements {@link Transliterator#handleTransliterate}. 
*/ void EscapeTransliterator::handleTransliterate(Replaceable& text, UTransPosition& pos, UBool /*isIncremental*/) const { /* TODO: Verify that isIncremental can be ignored */ int32_t start = pos.start; int32_t limit = pos.limit; UnicodeString buf(prefix); int32_t prefixLen = prefix.length(); UBool redoPrefix = false; while (start < limit) { int32_t c = grokSupplementals ? text.char32At(start) : text.charAt(start); int32_t charLen = grokSupplementals ? U16_LENGTH(c) : 1; if ((c & 0xFFFF0000) != 0 && supplementalHandler != nullptr) { buf.truncate(0); buf.append(supplementalHandler->prefix); ICU_Utility::appendNumber(buf, c, supplementalHandler->radix, supplementalHandler->minDigits); buf.append(supplementalHandler->suffix); redoPrefix = true; } else { if (redoPrefix) { buf.truncate(0); buf.append(prefix); redoPrefix = false; } else { buf.truncate(prefixLen); } ICU_Utility::appendNumber(buf, c, radix, minDigits); buf.append(suffix); } text.handleReplaceBetween(start, start + charLen, buf); start += buf.length(); limit += buf.length() - charLen; } pos.contextLimit += limit - pos.limit; pos.limit = limit; pos.start = start; } U_NAMESPACE_END #endif /* #if !UCONFIG_NO_TRANSLITERATION */ //eof stringi/src/icu74/i18n/collunsafe.h0000644000176200001440000002013014700200761016527 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html // collunsafe.h // Copyright (C) 2015-2016, International Business Machines Corporation and others. // All Rights Reserved. // To be included by collationdatareader.cpp, and generated by gencolusb. // Machine generated, do not edit. 
#ifndef COLLUNSAFE_H #define COLLUNSAFE_H #include "unicode/utypes.h" #define COLLUNSAFE_ICU_VERSION "56.0.1" #define COLLUNSAFE_COLL_VERSION "9.64" #define COLLUNSAFE_SERIALIZE 1 static const int32_t unsafe_serializedCount = 850; static const uint16_t unsafe_serializedData[850] = { 0x8350, 0x01B8, 0x0034, 0x0035, 0x004C, 0x004D, 0x00A0, 0x00A1, // 8 0x0300, 0x034F, 0x0350, 0x0370, 0x03A9, 0x03AA, 0x03E2, 0x03E3, // 16 0x042F, 0x0430, 0x0483, 0x0488, 0x0531, 0x0532, 0x0591, 0x05BE, // 24 0x05BF, 0x05C0, 0x05C1, 0x05C3, 0x05C4, 0x05C6, 0x05C7, 0x05C8, // 32 0x05D0, 0x05D1, 0x0610, 0x061B, 0x0628, 0x0629, 0x064B, 0x0660, // 40 0x0670, 0x0671, 0x06D6, 0x06DD, 0x06DF, 0x06E5, 0x06E7, 0x06E9, // 48 0x06EA, 0x06EE, 0x0710, 0x0712, 0x0730, 0x074B, 0x078C, 0x078D, // 56 0x07D8, 0x07D9, 0x07EB, 0x07F4, 0x0800, 0x0801, 0x0816, 0x081A, // 64 0x081B, 0x0824, 0x0825, 0x0828, 0x0829, 0x082E, 0x0840, 0x0841, // 72 0x0859, 0x085C, 0x08E3, 0x0900, 0x0905, 0x0906, 0x093C, 0x093D, // 80 0x094D, 0x094E, 0x0951, 0x0955, 0x0995, 0x0996, 0x09BC, 0x09BD, // 88 0x09BE, 0x09BF, 0x09CD, 0x09CE, 0x09D7, 0x09D8, 0x0A15, 0x0A16, // 96 0x0A3C, 0x0A3D, 0x0A4D, 0x0A4E, 0x0A95, 0x0A96, 0x0ABC, 0x0ABD, // 104 0x0ACD, 0x0ACE, 0x0B15, 0x0B16, 0x0B3C, 0x0B3D, 0x0B3E, 0x0B3F, // 112 0x0B4D, 0x0B4E, 0x0B56, 0x0B58, 0x0B95, 0x0B96, 0x0BBE, 0x0BBF, // 120 0x0BCD, 0x0BCE, 0x0BD7, 0x0BD8, 0x0C15, 0x0C16, 0x0C4D, 0x0C4E, // 128 0x0C55, 0x0C57, 0x0C95, 0x0C96, 0x0CBC, 0x0CBD, 0x0CC2, 0x0CC3, // 136 0x0CCD, 0x0CCE, 0x0CD5, 0x0CD7, 0x0D15, 0x0D16, 0x0D3E, 0x0D3F, // 144 0x0D4D, 0x0D4E, 0x0D57, 0x0D58, 0x0D85, 0x0D86, 0x0DCA, 0x0DCB, // 152 0x0DCF, 0x0DD0, 0x0DDF, 0x0DE0, 0x0E01, 0x0E2F, 0x0E32, 0x0E33, // 160 0x0E38, 0x0E3B, 0x0E48, 0x0E4C, 0x0E81, 0x0E83, 0x0E84, 0x0E85, // 168 0x0E87, 0x0E89, 0x0E8A, 0x0E8B, 0x0E8D, 0x0E8E, 0x0E94, 0x0E98, // 176 0x0E99, 0x0EA0, 0x0EA1, 0x0EA4, 0x0EA5, 0x0EA6, 0x0EA7, 0x0EA8, // 184 0x0EAA, 0x0EAC, 0x0EAD, 0x0EAF, 0x0EB2, 0x0EB3, 0x0EB8, 0x0EBA, // 192 0x0EC8, 0x0ECC, 0x0EDC, 
0x0EE0, 0x0F18, 0x0F1A, 0x0F35, 0x0F36, // 200 0x0F37, 0x0F38, 0x0F39, 0x0F3A, 0x0F40, 0x0F41, 0x0F71, 0x0F76, // 208 0x0F7A, 0x0F7E, 0x0F80, 0x0F85, 0x0F86, 0x0F88, 0x0FC6, 0x0FC7, // 216 0x1000, 0x1001, 0x102E, 0x102F, 0x1037, 0x1038, 0x1039, 0x103B, // 224 0x108D, 0x108E, 0x10D3, 0x10D4, 0x12A0, 0x12A1, 0x135D, 0x1360, // 232 0x13C4, 0x13C5, 0x14C0, 0x14C1, 0x168F, 0x1690, 0x16A0, 0x16A1, // 240 0x1703, 0x1704, 0x1714, 0x1715, 0x1723, 0x1724, 0x1734, 0x1735, // 248 0x1743, 0x1744, 0x1763, 0x1764, 0x1780, 0x1781, 0x17D2, 0x17D3, // 256 0x17DD, 0x17DE, 0x1826, 0x1827, 0x18A9, 0x18AA, 0x1900, 0x1901, // 264 0x1939, 0x193C, 0x1950, 0x1951, 0x1980, 0x19AC, 0x1A00, 0x1A01, // 272 0x1A17, 0x1A19, 0x1A20, 0x1A21, 0x1A60, 0x1A61, 0x1A75, 0x1A7D, // 280 0x1A7F, 0x1A80, 0x1AB0, 0x1ABE, 0x1B05, 0x1B06, 0x1B34, 0x1B36, // 288 0x1B44, 0x1B45, 0x1B6B, 0x1B74, 0x1B83, 0x1B84, 0x1BAA, 0x1BAC, // 296 0x1BC0, 0x1BC1, 0x1BE6, 0x1BE7, 0x1BF2, 0x1BF4, 0x1C00, 0x1C01, // 304 0x1C37, 0x1C38, 0x1C5A, 0x1C5B, 0x1CD0, 0x1CD3, 0x1CD4, 0x1CE1, // 312 0x1CE2, 0x1CE9, 0x1CED, 0x1CEE, 0x1CF4, 0x1CF5, 0x1CF8, 0x1CFA, // 320 0x1DC0, 0x1DF6, 0x1DFC, 0x1E00, 0x201C, 0x201D, 0x20AC, 0x20AD, // 328 0x20D0, 0x20DD, 0x20E1, 0x20E2, 0x20E5, 0x20F1, 0x263A, 0x263B, // 336 0x2C00, 0x2C01, 0x2CEF, 0x2CF2, 0x2D5E, 0x2D5F, 0x2D7F, 0x2D80, // 344 0x2DE0, 0x2E00, 0x302A, 0x3030, 0x304B, 0x304C, 0x3099, 0x309B, // 352 0x30AB, 0x30AC, 0x3105, 0x3106, 0x5B57, 0x5B58, 0xA288, 0xA289, // 360 0xA4E8, 0xA4E9, 0xA549, 0xA54A, 0xA66F, 0xA670, 0xA674, 0xA67E, // 368 0xA69E, 0xA6A1, 0xA6F0, 0xA6F2, 0xA800, 0xA801, 0xA806, 0xA807, // 376 0xA840, 0xA841, 0xA882, 0xA883, 0xA8C4, 0xA8C5, 0xA8E0, 0xA8F2, // 384 0xA90A, 0xA90B, 0xA92B, 0xA92E, 0xA930, 0xA931, 0xA953, 0xA954, // 392 0xA984, 0xA985, 0xA9B3, 0xA9B4, 0xA9C0, 0xA9C1, 0xAA00, 0xAA01, // 400 0xAA80, 0xAAB1, 0xAAB2, 0xAAB5, 0xAAB7, 0xAAB9, 0xAABE, 0xAAC0, // 408 0xAAC1, 0xAAC2, 0xAAF6, 0xAAF7, 0xABC0, 0xABC1, 0xABED, 0xABEE, // 416 0xAC00, 0xAC01, 0xD800, 0xD807, 
0xD808, 0xD809, 0xD80C, 0xD80D, // 424 0xD811, 0xD812, 0xD81A, 0xD81C, 0xD82F, 0xD830, 0xD834, 0xD835, // 432 0xD83A, 0xD83B, 0xDC00, 0xE000, 0xFB1E, 0xFB1F, 0xFDD0, 0xFDD1, // 440 0xFE20, 0xFE30, 0x0001, 0x0000, 0x0001, 0x0001, 0x0001, 0x01FD, // 448 0x0001, 0x01FE, 0x0001, 0x0280, 0x0001, 0x0281, 0x0001, 0x02B7, // 456 0x0001, 0x02B8, 0x0001, 0x02E0, 0x0001, 0x02E1, 0x0001, 0x0308, // 464 0x0001, 0x0309, 0x0001, 0x0330, 0x0001, 0x0331, 0x0001, 0x036B, // 472 0x0001, 0x036C, 0x0001, 0x0376, 0x0001, 0x037B, 0x0001, 0x0380, // 480 0x0001, 0x0381, 0x0001, 0x03A0, 0x0001, 0x03A1, 0x0001, 0x0414, // 488 0x0001, 0x0415, 0x0001, 0x0450, 0x0001, 0x0451, 0x0001, 0x0480, // 496 0x0001, 0x0481, 0x0001, 0x0500, 0x0001, 0x0501, 0x0001, 0x0537, // 504 0x0001, 0x0538, 0x0001, 0x0647, 0x0001, 0x0648, 0x0001, 0x0800, // 512 0x0001, 0x0801, 0x0001, 0x0840, 0x0001, 0x0841, 0x0001, 0x0873, // 520 0x0001, 0x0874, 0x0001, 0x0896, 0x0001, 0x0897, 0x0001, 0x08F4, // 528 0x0001, 0x08F5, 0x0001, 0x0900, 0x0001, 0x0901, 0x0001, 0x0920, // 536 0x0001, 0x0921, 0x0001, 0x0980, 0x0001, 0x0981, 0x0001, 0x09A0, // 544 0x0001, 0x09A1, 0x0001, 0x0A00, 0x0001, 0x0A01, 0x0001, 0x0A0D, // 552 0x0001, 0x0A0E, 0x0001, 0x0A0F, 0x0001, 0x0A10, 0x0001, 0x0A38, // 560 0x0001, 0x0A3B, 0x0001, 0x0A3F, 0x0001, 0x0A40, 0x0001, 0x0A60, // 568 0x0001, 0x0A61, 0x0001, 0x0A95, 0x0001, 0x0A96, 0x0001, 0x0AC1, // 576 0x0001, 0x0AC2, 0x0001, 0x0AE5, 0x0001, 0x0AE7, 0x0001, 0x0B00, // 584 0x0001, 0x0B01, 0x0001, 0x0B40, 0x0001, 0x0B41, 0x0001, 0x0B60, // 592 0x0001, 0x0B61, 0x0001, 0x0B8F, 0x0001, 0x0B90, 0x0001, 0x0C00, // 600 0x0001, 0x0C01, 0x0001, 0x0CA1, 0x0001, 0x0CA2, 0x0001, 0x1005, // 608 0x0001, 0x1006, 0x0001, 0x1046, 0x0001, 0x1047, 0x0001, 0x107F, // 616 0x0001, 0x1080, 0x0001, 0x1083, 0x0001, 0x1084, 0x0001, 0x10B9, // 624 0x0001, 0x10BB, 0x0001, 0x10D0, 0x0001, 0x10D1, 0x0001, 0x1100, // 632 0x0001, 0x1104, 0x0001, 0x1127, 0x0001, 0x1128, 0x0001, 0x1133, // 640 0x0001, 0x1135, 0x0001, 0x1152, 0x0001, 
0x1153, 0x0001, 0x1173, // 648 0x0001, 0x1174, 0x0001, 0x1183, 0x0001, 0x1184, 0x0001, 0x11C0, // 656 0x0001, 0x11C1, 0x0001, 0x11CA, 0x0001, 0x11CB, 0x0001, 0x1208, // 664 0x0001, 0x1209, 0x0001, 0x1235, 0x0001, 0x1237, 0x0001, 0x128F, // 672 0x0001, 0x1290, 0x0001, 0x12BE, 0x0001, 0x12BF, 0x0001, 0x12E9, // 680 0x0001, 0x12EB, 0x0001, 0x1315, 0x0001, 0x1316, 0x0001, 0x133C, // 688 0x0001, 0x133D, 0x0001, 0x133E, 0x0001, 0x133F, 0x0001, 0x134D, // 696 0x0001, 0x134E, 0x0001, 0x1357, 0x0001, 0x1358, 0x0001, 0x1366, // 704 0x0001, 0x136D, 0x0001, 0x1370, 0x0001, 0x1375, 0x0001, 0x1484, // 712 0x0001, 0x1485, 0x0001, 0x14B0, 0x0001, 0x14B1, 0x0001, 0x14BA, // 720 0x0001, 0x14BB, 0x0001, 0x14BD, 0x0001, 0x14BE, 0x0001, 0x14C2, // 728 0x0001, 0x14C4, 0x0001, 0x158E, 0x0001, 0x158F, 0x0001, 0x15AF, // 736 0x0001, 0x15B0, 0x0001, 0x15BF, 0x0001, 0x15C1, 0x0001, 0x160E, // 744 0x0001, 0x160F, 0x0001, 0x163F, 0x0001, 0x1640, 0x0001, 0x1680, // 752 0x0001, 0x1681, 0x0001, 0x16B6, 0x0001, 0x16B8, 0x0001, 0x1717, // 760 0x0001, 0x1718, 0x0001, 0x172B, 0x0001, 0x172C, 0x0001, 0x18B4, // 768 0x0001, 0x18B5, 0x0001, 0x1AC0, 0x0001, 0x1AC1, 0x0001, 0x2000, // 776 0x0001, 0x2001, 0x0001, 0x3153, 0x0001, 0x3154, 0x0001, 0x4400, // 784 0x0001, 0x4401, 0x0001, 0x6A4F, 0x0001, 0x6A50, 0x0001, 0x6AE6, // 792 0x0001, 0x6AE7, 0x0001, 0x6AF0, 0x0001, 0x6AF5, 0x0001, 0x6B1C, // 800 0x0001, 0x6B1D, 0x0001, 0x6B30, 0x0001, 0x6B37, 0x0001, 0x6F00, // 808 0x0001, 0x6F01, 0x0001, 0xBC20, 0x0001, 0xBC21, 0x0001, 0xBC9E, // 816 0x0001, 0xBC9F, 0x0001, 0xD165, 0x0001, 0xD16A, 0x0001, 0xD16D, // 824 0x0001, 0xD173, 0x0001, 0xD17B, 0x0001, 0xD183, 0x0001, 0xD185, // 832 0x0001, 0xD18C, 0x0001, 0xD1AA, 0x0001, 0xD1AE, 0x0001, 0xD242, // 840 0x0001, 0xD245, 0x0001, 0xE802, 0x0001, 0xE803, 0x0001, 0xE8D0, // 848 0x0001, 0xE8D7}; #endif stringi/src/icu74/i18n/double-conversion-bignum-dtoa.cpp0000644000176200001440000006700414700200761022603 0ustar liggesusers// © 2018 and later: Unicode, Inc. 
// and others.
// License & terms of use: http://www.unicode.org/copyright.html
//
// From the double-conversion library. Original license:
//
// Copyright 2010 the V8 project authors. All rights reserved.
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
//       notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
//       copyright notice, this list of conditions and the following
//       disclaimer in the documentation and/or other materials provided
//       with the distribution.
//     * Neither the name of Google Inc. nor the names of its
//       contributors may be used to endorse or promote products derived
//       from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

// ICU PATCH: ifdef around UCONFIG_NO_FORMATTING
#include "unicode/utypes.h"
#if !UCONFIG_NO_FORMATTING

#include <cmath>

// ICU PATCH: Customize header file paths for ICU.

#include "double-conversion-bignum-dtoa.h"

#include "double-conversion-bignum.h"
#include "double-conversion-ieee.h"

// ICU PATCH: Wrap in ICU namespace
U_NAMESPACE_BEGIN

namespace double_conversion {

// Shifts the significand left until Double::kHiddenBit is set and returns the
// exponent decremented once per shift. The significand must be non-zero.
static int NormalizedExponent(uint64_t significand, int exponent) {
  DOUBLE_CONVERSION_ASSERT(significand != 0);
  while ((significand & Double::kHiddenBit) == 0) {
    significand = significand << 1;
    exponent = exponent - 1;
  }
  return exponent;
}


// Forward declarations:
// Returns an estimation of k such that 10^(k-1) <= v < 10^k.
static int EstimatePower(int exponent);
// Computes v / 10^estimated_power exactly, as a ratio of two bignums, numerator
// and denominator.
static void InitialScaledStartValues(uint64_t significand,
                                     int exponent,
                                     bool lower_boundary_is_closer,
                                     int estimated_power,
                                     bool need_boundary_deltas,
                                     Bignum* numerator,
                                     Bignum* denominator,
                                     Bignum* delta_minus,
                                     Bignum* delta_plus);
// Multiplies numerator/denominator so that its values lies in the range 1-10.
// Returns decimal_point s.t.
//  v = numerator'/denominator' * 10^(decimal_point-1)
//     where numerator' and denominator' are the values of numerator and
//     denominator after the call to this function.
static void FixupMultiply10(int estimated_power, bool is_even,
                            int* decimal_point,
                            Bignum* numerator, Bignum* denominator,
                            Bignum* delta_minus, Bignum* delta_plus);
// Generates digits from the left to the right and stops when the generated
// digits yield the shortest decimal representation of v.
static void GenerateShortestDigits(Bignum* numerator, Bignum* denominator,
                                   Bignum* delta_minus, Bignum* delta_plus,
                                   bool is_even,
                                   Vector<char> buffer, int* length);
// Generates 'requested_digits' after the decimal point.
static void BignumToFixed(int requested_digits, int* decimal_point,
                          Bignum* numerator, Bignum* denominator,
                          Vector<char> buffer, int* length);
// Generates 'count' digits of numerator/denominator.
// Once 'count' digits have been produced rounds the result depending on the
// remainder (remainders of exactly .5 round upwards). Might update the
// decimal_point when rounding up (for example for 0.9999).
static void GenerateCountedDigits(int count, int* decimal_point,
                                  Bignum* numerator, Bignum* denominator,
                                  Vector<char> buffer, int* length);


// Converts v to decimal digits using bignum arithmetic.
//   v                positive and not special (both are asserted).
//   mode             selects shortest, fixed or precision digit generation.
//   requested_digits digit count used by the FIXED and PRECISION modes.
//   buffer           receives the digits followed by a terminating '\0'.
//   length           receives the number of digits written.
//   decimal_point    receives the position of the decimal point.
void BignumDtoa(double v, BignumDtoaMode mode, int requested_digits,
                Vector<char> buffer, int* length, int* decimal_point) {
  DOUBLE_CONVERSION_ASSERT(v > 0);
  DOUBLE_CONVERSION_ASSERT(!Double(v).IsSpecial());
  uint64_t significand;
  int exponent;
  bool lower_boundary_is_closer;
  if (mode == BIGNUM_DTOA_SHORTEST_SINGLE) {
    float f = static_cast<float>(v);
    DOUBLE_CONVERSION_ASSERT(f == v);
    significand = Single(f).Significand();
    exponent = Single(f).Exponent();
    lower_boundary_is_closer = Single(f).LowerBoundaryIsCloser();
  } else {
    significand = Double(v).Significand();
    exponent = Double(v).Exponent();
    lower_boundary_is_closer = Double(v).LowerBoundaryIsCloser();
  }
  bool need_boundary_deltas =
      (mode == BIGNUM_DTOA_SHORTEST || mode == BIGNUM_DTOA_SHORTEST_SINGLE);

  bool is_even = (significand & 1) == 0;
  int normalized_exponent = NormalizedExponent(significand, exponent);
  // estimated_power might be too low by 1.
  int estimated_power = EstimatePower(normalized_exponent);

  // Shortcut for Fixed.
  // The requested digits correspond to the digits after the point. If the
  // number is much too small, then there is no need in trying to get any
  // digits.
  if (mode == BIGNUM_DTOA_FIXED && -estimated_power - 1 > requested_digits) {
    buffer[0] = '\0';
    *length = 0;
    // Set decimal-point to -requested_digits. This is what Gay does.
    // Note that it should not have any effect anyways since the string is
    // empty.
    *decimal_point = -requested_digits;
    return;
  }

  Bignum numerator;
  Bignum denominator;
  Bignum delta_minus;
  Bignum delta_plus;
  // Make sure the bignum can grow large enough. The smallest double equals
  // 4e-324. In this case the denominator needs fewer than 324*4 binary digits.
  // The maximum double is 1.7976931348623157e308 which needs fewer than
  // 308*4 binary digits.
  DOUBLE_CONVERSION_ASSERT(Bignum::kMaxSignificantBits >= 324*4);
  InitialScaledStartValues(significand, exponent, lower_boundary_is_closer,
                           estimated_power, need_boundary_deltas,
                           &numerator, &denominator,
                           &delta_minus, &delta_plus);
  // We now have v = (numerator / denominator) * 10^estimated_power.
  FixupMultiply10(estimated_power, is_even, decimal_point,
                  &numerator, &denominator,
                  &delta_minus, &delta_plus);
  // We now have v = (numerator / denominator) * 10^(decimal_point-1), and
  //  1 <= (numerator + delta_plus) / denominator < 10
  switch (mode) {
    case BIGNUM_DTOA_SHORTEST:
    case BIGNUM_DTOA_SHORTEST_SINGLE:
      GenerateShortestDigits(&numerator, &denominator,
                             &delta_minus, &delta_plus,
                             is_even, buffer, length);
      break;
    case BIGNUM_DTOA_FIXED:
      BignumToFixed(requested_digits, decimal_point,
                    &numerator, &denominator,
                    buffer, length);
      break;
    case BIGNUM_DTOA_PRECISION:
      GenerateCountedDigits(requested_digits, decimal_point,
                            &numerator, &denominator,
                            buffer, length);
      break;
    default:
      DOUBLE_CONVERSION_UNREACHABLE();
  }
  buffer[*length] = '\0';
}


// The procedure starts generating digits from the left to the right and stops
// when the generated digits yield the shortest decimal representation of v. A
// decimal representation of v is a number lying closer to v than to any other
// double, so it converts to v when read.
//
// This is true if d, the decimal representation, is between m- and m+, the
// upper and lower boundaries. d must be strictly between them if !is_even.
//           m- := (numerator - delta_minus) / denominator
//           m+ := (numerator + delta_plus) / denominator
//
// Precondition: 0 <= (numerator+delta_plus) / denominator < 10.
//   If 1 <= (numerator+delta_plus) / denominator < 10 then no leading 0 digit
//   will be produced. This should be the standard precondition.
static void GenerateShortestDigits(Bignum* numerator, Bignum* denominator, Bignum* delta_minus, Bignum* delta_plus, bool is_even, Vector buffer, int* length) { // Small optimization: if delta_minus and delta_plus are the same just reuse // one of the two bignums. if (Bignum::Equal(*delta_minus, *delta_plus)) { delta_plus = delta_minus; } *length = 0; for (;;) { uint16_t digit; digit = numerator->DivideModuloIntBignum(*denominator); DOUBLE_CONVERSION_ASSERT(digit <= 9); // digit is a uint16_t and therefore always positive. // digit = numerator / denominator (integer division). // numerator = numerator % denominator. buffer[(*length)++] = static_cast(digit + '0'); // Can we stop already? // If the remainder of the division is less than the distance to the lower // boundary we can stop. In this case we simply round down (discarding the // remainder). // Similarly we test if we can round up (using the upper boundary). bool in_delta_room_minus; bool in_delta_room_plus; if (is_even) { in_delta_room_minus = Bignum::LessEqual(*numerator, *delta_minus); } else { in_delta_room_minus = Bignum::Less(*numerator, *delta_minus); } if (is_even) { in_delta_room_plus = Bignum::PlusCompare(*numerator, *delta_plus, *denominator) >= 0; } else { in_delta_room_plus = Bignum::PlusCompare(*numerator, *delta_plus, *denominator) > 0; } if (!in_delta_room_minus && !in_delta_room_plus) { // Prepare for next iteration. numerator->Times10(); delta_minus->Times10(); // We optimized delta_plus to be equal to delta_minus (if they share the // same value). So don't multiply delta_plus if they point to the same // object. if (delta_minus != delta_plus) { delta_plus->Times10(); } } else if (in_delta_room_minus && in_delta_room_plus) { // Let's see if 2*numerator < denominator. // If yes, then the next digit would be < 5 and we can round down. int compare = Bignum::PlusCompare(*numerator, *numerator, *denominator); if (compare < 0) { // Remaining digits are less than .5. -> Round down (== do nothing). 
} else if (compare > 0) { // Remaining digits are more than .5 of denominator. -> Round up. // Note that the last digit could not be a '9' as otherwise the whole // loop would have stopped earlier. // We still have an assert here in case the preconditions were not // satisfied. DOUBLE_CONVERSION_ASSERT(buffer[(*length) - 1] != '9'); buffer[(*length) - 1]++; } else { // Halfway case. // TODO(floitsch): need a way to solve half-way cases. // For now let's round towards even (since this is what Gay seems to // do). if ((buffer[(*length) - 1] - '0') % 2 == 0) { // Round down => Do nothing. } else { DOUBLE_CONVERSION_ASSERT(buffer[(*length) - 1] != '9'); buffer[(*length) - 1]++; } } return; } else if (in_delta_room_minus) { // Round down (== do nothing). return; } else { // in_delta_room_plus // Round up. // Note again that the last digit could not be '9' since this would have // stopped the loop earlier. // We still have an DOUBLE_CONVERSION_ASSERT here, in case the preconditions were not // satisfied. DOUBLE_CONVERSION_ASSERT(buffer[(*length) -1] != '9'); buffer[(*length) - 1]++; return; } } } // Let v = numerator / denominator < 10. // Then we generate 'count' digits of d = x.xxxxx... (without the decimal point) // from left to right. Once 'count' digits have been produced we decide whether // to round up or down. Remainders of exactly .5 round upwards. Numbers such // as 9.999999 propagate a carry all the way, and change the // exponent (decimal_point), when rounding upwards. static void GenerateCountedDigits(int count, int* decimal_point, Bignum* numerator, Bignum* denominator, Vector buffer, int* length) { DOUBLE_CONVERSION_ASSERT(count >= 0); for (int i = 0; i < count - 1; ++i) { uint16_t digit; digit = numerator->DivideModuloIntBignum(*denominator); DOUBLE_CONVERSION_ASSERT(digit <= 9); // digit is a uint16_t and therefore always positive. // digit = numerator / denominator (integer division). // numerator = numerator % denominator. 
buffer[i] = static_cast(digit + '0'); // Prepare for next iteration. numerator->Times10(); } // Generate the last digit. uint16_t digit; digit = numerator->DivideModuloIntBignum(*denominator); if (Bignum::PlusCompare(*numerator, *numerator, *denominator) >= 0) { digit++; } DOUBLE_CONVERSION_ASSERT(digit <= 10); buffer[count - 1] = static_cast(digit + '0'); // Correct bad digits (in case we had a sequence of '9's). Propagate the // carry until we hat a non-'9' or til we reach the first digit. for (int i = count - 1; i > 0; --i) { if (buffer[i] != '0' + 10) break; buffer[i] = '0'; buffer[i - 1]++; } if (buffer[0] == '0' + 10) { // Propagate a carry past the top place. buffer[0] = '1'; (*decimal_point)++; } *length = count; } // Generates 'requested_digits' after the decimal point. It might omit // trailing '0's. If the input number is too small then no digits at all are // generated (ex.: 2 fixed digits for 0.00001). // // Input verifies: 1 <= (numerator + delta) / denominator < 10. static void BignumToFixed(int requested_digits, int* decimal_point, Bignum* numerator, Bignum* denominator, Vector buffer, int* length) { // Note that we have to look at more than just the requested_digits, since // a number could be rounded up. Example: v=0.5 with requested_digits=0. // Even though the power of v equals 0 we can't just stop here. if (-(*decimal_point) > requested_digits) { // The number is definitively too small. // Ex: 0.001 with requested_digits == 1. // Set decimal-point to -requested_digits. This is what Gay does. // Note that it should not have any effect anyways since the string is // empty. *decimal_point = -requested_digits; *length = 0; return; } else if (-(*decimal_point) == requested_digits) { // We only need to verify if the number rounds down or up. // Ex: 0.04 and 0.06 with requested_digits == 1. DOUBLE_CONVERSION_ASSERT(*decimal_point == -requested_digits); // Initially the fraction lies in range (1, 10]. 
Multiply the denominator // by 10 so that we can compare more easily. denominator->Times10(); if (Bignum::PlusCompare(*numerator, *numerator, *denominator) >= 0) { // If the fraction is >= 0.5 then we have to include the rounded // digit. buffer[0] = '1'; *length = 1; (*decimal_point)++; } else { // Note that we caught most of similar cases earlier. *length = 0; } return; } else { // The requested digits correspond to the digits after the point. // The variable 'needed_digits' includes the digits before the point. int needed_digits = (*decimal_point) + requested_digits; GenerateCountedDigits(needed_digits, decimal_point, numerator, denominator, buffer, length); } } // Returns an estimation of k such that 10^(k-1) <= v < 10^k where // v = f * 2^exponent and 2^52 <= f < 2^53. // v is hence a normalized double with the given exponent. The output is an // approximation for the exponent of the decimal approximation .digits * 10^k. // // The result might undershoot by 1 in which case 10^k <= v < 10^k+1. // Note: this property holds for v's upper boundary m+ too. // 10^k <= m+ < 10^k+1. // (see explanation below). // // Examples: // EstimatePower(0) => 16 // EstimatePower(-52) => 0 // // Note: e >= 0 => EstimatedPower(e) > 0. No similar claim can be made for e<0. static int EstimatePower(int exponent) { // This function estimates log10 of v where v = f*2^e (with e == exponent). // Note that 10^floor(log10(v)) <= v, but v <= 10^ceil(log10(v)). // Note that f is bounded by its container size. Let p = 53 (the double's // significand size). Then 2^(p-1) <= f < 2^p. // // Given that log10(v) == log2(v)/log2(10) and e+(len(f)-1) is quite close // to log2(v) the function is simplified to (e+(len(f)-1)/log2(10)). // The computed number undershoots by less than 0.631 (when we compute log3 // and not log10). // // Optimization: since we only need an approximated result this computation // can be performed on 64 bit integers. 
On x86/x64 architecture the speedup is // not really measurable, though. // // Since we want to avoid overshooting we decrement by 1e10 so that // floating-point imprecisions don't affect us. // // Explanation for v's boundary m+: the computation takes advantage of // the fact that 2^(p-1) <= f < 2^p. Boundaries still satisfy this requirement // (even for denormals where the delta can be much more important). const double k1Log10 = 0.30102999566398114; // 1/lg(10) // For doubles len(f) == 53 (don't forget the hidden bit). const int kSignificandSize = Double::kSignificandSize; double estimate = ceil((exponent + kSignificandSize - 1) * k1Log10 - 1e-10); return static_cast(estimate); } // See comments for InitialScaledStartValues. static void InitialScaledStartValuesPositiveExponent( uint64_t significand, int exponent, int estimated_power, bool need_boundary_deltas, Bignum* numerator, Bignum* denominator, Bignum* delta_minus, Bignum* delta_plus) { // A positive exponent implies a positive power. DOUBLE_CONVERSION_ASSERT(estimated_power >= 0); // Since the estimated_power is positive we simply multiply the denominator // by 10^estimated_power. // numerator = v. numerator->AssignUInt64(significand); numerator->ShiftLeft(exponent); // denominator = 10^estimated_power. denominator->AssignPowerUInt16(10, estimated_power); if (need_boundary_deltas) { // Introduce a common denominator so that the deltas to the boundaries are // integers. denominator->ShiftLeft(1); numerator->ShiftLeft(1); // Let v = f * 2^e, then m+ - v = 1/2 * 2^e; With the common // denominator (of 2) delta_plus equals 2^e. delta_plus->AssignUInt16(1); delta_plus->ShiftLeft(exponent); // Same for delta_minus. The adjustments if f == 2^p-1 are done later. 
delta_minus->AssignUInt16(1); delta_minus->ShiftLeft(exponent); } } // See comments for InitialScaledStartValues static void InitialScaledStartValuesNegativeExponentPositivePower( uint64_t significand, int exponent, int estimated_power, bool need_boundary_deltas, Bignum* numerator, Bignum* denominator, Bignum* delta_minus, Bignum* delta_plus) { // v = f * 2^e with e < 0, and with estimated_power >= 0. // This means that e is close to 0 (have a look at how estimated_power is // computed). // numerator = significand // since v = significand * 2^exponent this is equivalent to // numerator = v * / 2^-exponent numerator->AssignUInt64(significand); // denominator = 10^estimated_power * 2^-exponent (with exponent < 0) denominator->AssignPowerUInt16(10, estimated_power); denominator->ShiftLeft(-exponent); if (need_boundary_deltas) { // Introduce a common denominator so that the deltas to the boundaries are // integers. denominator->ShiftLeft(1); numerator->ShiftLeft(1); // Let v = f * 2^e, then m+ - v = 1/2 * 2^e; With the common // denominator (of 2) delta_plus equals 2^e. // Given that the denominator already includes v's exponent the distance // to the boundaries is simply 1. delta_plus->AssignUInt16(1); // Same for delta_minus. The adjustments if f == 2^p-1 are done later. delta_minus->AssignUInt16(1); } } // See comments for InitialScaledStartValues static void InitialScaledStartValuesNegativeExponentNegativePower( uint64_t significand, int exponent, int estimated_power, bool need_boundary_deltas, Bignum* numerator, Bignum* denominator, Bignum* delta_minus, Bignum* delta_plus) { // Instead of multiplying the denominator with 10^estimated_power we // multiply all values (numerator and deltas) by 10^-estimated_power. // Use numerator as temporary container for power_ten. 
Bignum* power_ten = numerator; power_ten->AssignPowerUInt16(10, -estimated_power); if (need_boundary_deltas) { // Since power_ten == numerator we must make a copy of 10^estimated_power // before we complete the computation of the numerator. // delta_plus = delta_minus = 10^estimated_power delta_plus->AssignBignum(*power_ten); delta_minus->AssignBignum(*power_ten); } // numerator = significand * 2 * 10^-estimated_power // since v = significand * 2^exponent this is equivalent to // numerator = v * 10^-estimated_power * 2 * 2^-exponent. // Remember: numerator has been abused as power_ten. So no need to assign it // to itself. DOUBLE_CONVERSION_ASSERT(numerator == power_ten); numerator->MultiplyByUInt64(significand); // denominator = 2 * 2^-exponent with exponent < 0. denominator->AssignUInt16(1); denominator->ShiftLeft(-exponent); if (need_boundary_deltas) { // Introduce a common denominator so that the deltas to the boundaries are // integers. numerator->ShiftLeft(1); denominator->ShiftLeft(1); // With this shift the boundaries have their correct value, since // delta_plus = 10^-estimated_power, and // delta_minus = 10^-estimated_power. // These assignments have been done earlier. // The adjustments if f == 2^p-1 (lower boundary is closer) are done later. } } // Let v = significand * 2^exponent. // Computes v / 10^estimated_power exactly, as a ratio of two bignums, numerator // and denominator. The functions GenerateShortestDigits and // GenerateCountedDigits will then convert this ratio to its decimal // representation d, with the required accuracy. // Then d * 10^estimated_power is the representation of v. // (Note: the fraction and the estimated_power might get adjusted before // generating the decimal representation.) // // The initial start values consist of: // - a scaled numerator: s.t. numerator/denominator == v / 10^estimated_power. // - a scaled (common) denominator. 
// optionally (used by GenerateShortestDigits to decide if it has the shortest // decimal converting back to v): // - v - m-: the distance to the lower boundary. // - m+ - v: the distance to the upper boundary. // // v, m+, m-, and therefore v - m- and m+ - v all share the same denominator. // // Let ep == estimated_power, then the returned values will satisfy: // v / 10^ep = numerator / denominator. // v's boundaries m- and m+: // m- / 10^ep == v / 10^ep - delta_minus / denominator // m+ / 10^ep == v / 10^ep + delta_plus / denominator // Or in other words: // m- == v - delta_minus * 10^ep / denominator; // m+ == v + delta_plus * 10^ep / denominator; // // Since 10^(k-1) <= v < 10^k (with k == estimated_power) // or 10^k <= v < 10^(k+1) // we then have 0.1 <= numerator/denominator < 1 // or 1 <= numerator/denominator < 10 // // It is then easy to kickstart the digit-generation routine. // // The boundary-deltas are only filled if the mode equals BIGNUM_DTOA_SHORTEST // or BIGNUM_DTOA_SHORTEST_SINGLE. static void InitialScaledStartValues(uint64_t significand, int exponent, bool lower_boundary_is_closer, int estimated_power, bool need_boundary_deltas, Bignum* numerator, Bignum* denominator, Bignum* delta_minus, Bignum* delta_plus) { if (exponent >= 0) { InitialScaledStartValuesPositiveExponent( significand, exponent, estimated_power, need_boundary_deltas, numerator, denominator, delta_minus, delta_plus); } else if (estimated_power >= 0) { InitialScaledStartValuesNegativeExponentPositivePower( significand, exponent, estimated_power, need_boundary_deltas, numerator, denominator, delta_minus, delta_plus); } else { InitialScaledStartValuesNegativeExponentNegativePower( significand, exponent, estimated_power, need_boundary_deltas, numerator, denominator, delta_minus, delta_plus); } if (need_boundary_deltas && lower_boundary_is_closer) { // The lower boundary is closer at half the distance of "normal" numbers. 
// Increase the common denominator and adapt all but the delta_minus. denominator->ShiftLeft(1); // *2 numerator->ShiftLeft(1); // *2 delta_plus->ShiftLeft(1); // *2 } } // This routine multiplies numerator/denominator so that its values lies in the // range 1-10. That is after a call to this function we have: // 1 <= (numerator + delta_plus) /denominator < 10. // Let numerator the input before modification and numerator' the argument // after modification, then the output-parameter decimal_point is such that // numerator / denominator * 10^estimated_power == // numerator' / denominator' * 10^(decimal_point - 1) // In some cases estimated_power was too low, and this is already the case. We // then simply adjust the power so that 10^(k-1) <= v < 10^k (with k == // estimated_power) but do not touch the numerator or denominator. // Otherwise the routine multiplies the numerator and the deltas by 10. static void FixupMultiply10(int estimated_power, bool is_even, int* decimal_point, Bignum* numerator, Bignum* denominator, Bignum* delta_minus, Bignum* delta_plus) { bool in_range; if (is_even) { // For IEEE doubles half-way cases (in decimal system numbers ending with 5) // are rounded to the closest floating-point number with even significand. in_range = Bignum::PlusCompare(*numerator, *delta_plus, *denominator) >= 0; } else { in_range = Bignum::PlusCompare(*numerator, *delta_plus, *denominator) > 0; } if (in_range) { // Since numerator + delta_plus >= denominator we already have // 1 <= numerator/denominator < 10. Simply update the estimated_power. 
*decimal_point = estimated_power + 1; } else { *decimal_point = estimated_power; numerator->Times10(); if (Bignum::Equal(*delta_minus, *delta_plus)) { delta_minus->Times10(); delta_plus->AssignBignum(*delta_minus); } else { delta_minus->Times10(); delta_plus->Times10(); } } } } // namespace double_conversion // ICU PATCH: Close ICU namespace U_NAMESPACE_END #endif // ICU PATCH: close #if !UCONFIG_NO_FORMATTING stringi/src/icu74/i18n/dayperiodrules.cpp0000644000176200001440000004361514700200761017777 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ******************************************************************************* * Copyright (C) 2016, International Business Machines * Corporation and others. All Rights Reserved. ******************************************************************************* * dayperiodrules.cpp * * created on: 2016-01-20 * created by: kazede */ #include "dayperiodrules.h" #include "unicode/ures.h" #include "bytesinkutil.h" #include "charstr.h" #include "cstring.h" #include "ucln_in.h" #include "uhash.h" #include "ulocimp.h" #include "umutex.h" #include "uresimp.h" U_NAMESPACE_BEGIN namespace { struct DayPeriodRulesData : public UMemory { DayPeriodRulesData() : localeToRuleSetNumMap(nullptr), rules(nullptr), maxRuleSetNum(0) {} UHashtable *localeToRuleSetNumMap; DayPeriodRules *rules; int32_t maxRuleSetNum; } *data = nullptr; enum CutoffType { CUTOFF_TYPE_UNKNOWN = -1, CUTOFF_TYPE_BEFORE, CUTOFF_TYPE_AFTER, // TODO: AFTER is deprecated in CLDR 29. Remove. 
CUTOFF_TYPE_FROM, CUTOFF_TYPE_AT }; } // namespace struct DayPeriodRulesDataSink : public ResourceSink { DayPeriodRulesDataSink() { for (int32_t i = 0; i < UPRV_LENGTHOF(cutoffs); ++i) { cutoffs[i] = 0; } } virtual ~DayPeriodRulesDataSink(); virtual void put(const char *key, ResourceValue &value, UBool, UErrorCode &errorCode) override { ResourceTable dayPeriodData = value.getTable(errorCode); if (U_FAILURE(errorCode)) { return; } for (int32_t i = 0; dayPeriodData.getKeyAndValue(i, key, value); ++i) { if (uprv_strcmp(key, "locales") == 0) { ResourceTable locales = value.getTable(errorCode); if (U_FAILURE(errorCode)) { return; } for (int32_t j = 0; locales.getKeyAndValue(j, key, value); ++j) { UnicodeString setNum_str = value.getUnicodeString(errorCode); int32_t setNum = parseSetNum(setNum_str, errorCode); uhash_puti(data->localeToRuleSetNumMap, const_cast(key), setNum, &errorCode); } } else if (uprv_strcmp(key, "rules") == 0) { // Allocate one more than needed to skip [0]. See comment in parseSetNum(). 
data->rules = new DayPeriodRules[data->maxRuleSetNum + 1]; if (data->rules == nullptr) { errorCode = U_MEMORY_ALLOCATION_ERROR; return; } ResourceTable rules = value.getTable(errorCode); processRules(rules, key, value, errorCode); if (U_FAILURE(errorCode)) { return; } } } } void processRules(const ResourceTable &rules, const char *key, ResourceValue &value, UErrorCode &errorCode) { if (U_FAILURE(errorCode)) { return; } for (int32_t i = 0; rules.getKeyAndValue(i, key, value); ++i) { ruleSetNum = parseSetNum(key, errorCode); ResourceTable ruleSet = value.getTable(errorCode); if (U_FAILURE(errorCode)) { return; } for (int32_t j = 0; ruleSet.getKeyAndValue(j, key, value); ++j) { period = DayPeriodRules::getDayPeriodFromString(key); if (period == DayPeriodRules::DAYPERIOD_UNKNOWN) { errorCode = U_INVALID_FORMAT_ERROR; return; } ResourceTable periodDefinition = value.getTable(errorCode); if (U_FAILURE(errorCode)) { return; } for (int32_t k = 0; periodDefinition.getKeyAndValue(k, key, value); ++k) { if (value.getType() == URES_STRING) { // Key-value pairs (e.g. before{6:00}). CutoffType type = getCutoffTypeFromString(key); addCutoff(type, value.getUnicodeString(errorCode), errorCode); if (U_FAILURE(errorCode)) { return; } } else { // Arrays (e.g. before{6:00, 24:00}). cutoffType = getCutoffTypeFromString(key); ResourceArray cutoffArray = value.getArray(errorCode); if (U_FAILURE(errorCode)) { return; } int32_t length = cutoffArray.getSize(); for (int32_t l = 0; l < length; ++l) { cutoffArray.getValue(l, value); addCutoff(cutoffType, value.getUnicodeString(errorCode), errorCode); if (U_FAILURE(errorCode)) { return; } } } } setDayPeriodForHoursFromCutoffs(errorCode); for (int32_t k = 0; k < UPRV_LENGTHOF(cutoffs); ++k) { cutoffs[k] = 0; } } if (!data->rules[ruleSetNum].allHoursAreSet()) { errorCode = U_INVALID_FORMAT_ERROR; return; } } } // Members. int32_t cutoffs[25]; // [0] thru [24]: 24 is allowed in "before 24". // "Path" to data. 
int32_t ruleSetNum; DayPeriodRules::DayPeriod period; CutoffType cutoffType; // Helpers. static int32_t parseSetNum(const UnicodeString &setNumStr, UErrorCode &errorCode) { CharString cs; cs.appendInvariantChars(setNumStr, errorCode); return parseSetNum(cs.data(), errorCode); } static int32_t parseSetNum(const char *setNumStr, UErrorCode &errorCode) { if (U_FAILURE(errorCode)) { return -1; } if (uprv_strncmp(setNumStr, "set", 3) != 0) { errorCode = U_INVALID_FORMAT_ERROR; return -1; } int32_t i = 3; int32_t setNum = 0; while (setNumStr[i] != 0) { int32_t digit = setNumStr[i] - '0'; if (digit < 0 || 9 < digit) { errorCode = U_INVALID_FORMAT_ERROR; return -1; } setNum = 10 * setNum + digit; ++i; } // Rule set number must not be zero. (0 is used to indicate "not found" by hashmap.) // Currently ICU data conveniently starts numbering rule sets from 1. if (setNum == 0) { errorCode = U_INVALID_FORMAT_ERROR; return -1; } else { return setNum; } } void addCutoff(CutoffType type, const UnicodeString &hour_str, UErrorCode &errorCode) { if (U_FAILURE(errorCode)) { return; } if (type == CUTOFF_TYPE_UNKNOWN) { errorCode = U_INVALID_FORMAT_ERROR; return; } int32_t hour = parseHour(hour_str, errorCode); if (U_FAILURE(errorCode)) { return; } cutoffs[hour] |= 1 << type; } // Translate the cutoffs[] array to day period rules. void setDayPeriodForHoursFromCutoffs(UErrorCode &errorCode) { DayPeriodRules &rule = data->rules[ruleSetNum]; for (int32_t startHour = 0; startHour <= 24; ++startHour) { // AT cutoffs must be either midnight or noon. if (cutoffs[startHour] & (1 << CUTOFF_TYPE_AT)) { if (startHour == 0 && period == DayPeriodRules::DAYPERIOD_MIDNIGHT) { rule.fHasMidnight = true; } else if (startHour == 12 && period == DayPeriodRules::DAYPERIOD_NOON) { rule.fHasNoon = true; } else { errorCode = U_INVALID_FORMAT_ERROR; // Bad data. return; } } // FROM/AFTER and BEFORE must come in a pair. 
if (cutoffs[startHour] & (1 << CUTOFF_TYPE_FROM) || cutoffs[startHour] & (1 << CUTOFF_TYPE_AFTER)) { for (int32_t hour = startHour + 1;; ++hour) { if (hour == startHour) { // We've gone around the array once and can't find a BEFORE. errorCode = U_INVALID_FORMAT_ERROR; return; } if (hour == 25) { hour = 0; } if (cutoffs[hour] & (1 << CUTOFF_TYPE_BEFORE)) { rule.add(startHour, hour, period); break; } } } } } // Translate "before" to CUTOFF_TYPE_BEFORE, for example. static CutoffType getCutoffTypeFromString(const char *type_str) { if (uprv_strcmp(type_str, "from") == 0) { return CUTOFF_TYPE_FROM; } else if (uprv_strcmp(type_str, "before") == 0) { return CUTOFF_TYPE_BEFORE; } else if (uprv_strcmp(type_str, "after") == 0) { return CUTOFF_TYPE_AFTER; } else if (uprv_strcmp(type_str, "at") == 0) { return CUTOFF_TYPE_AT; } else { return CUTOFF_TYPE_UNKNOWN; } } // Gets the numerical value of the hour from the Unicode string. static int32_t parseHour(const UnicodeString &time, UErrorCode &errorCode) { if (U_FAILURE(errorCode)) { return 0; } int32_t hourLimit = time.length() - 3; // `time` must look like "x:00" or "xx:00". // If length is wrong or `time` doesn't end with ":00", error out. if ((hourLimit != 1 && hourLimit != 2) || time[hourLimit] != 0x3A || time[hourLimit + 1] != 0x30 || time[hourLimit + 2] != 0x30) { errorCode = U_INVALID_FORMAT_ERROR; return 0; } // If `time` doesn't begin with a number in [0, 24], error out. // Note: "24:00" is possible in "before 24:00". 
int32_t hour = time[0] - 0x30; if (hour < 0 || 9 < hour) { errorCode = U_INVALID_FORMAT_ERROR; return 0; } if (hourLimit == 2) { int32_t hourDigit2 = time[1] - 0x30; if (hourDigit2 < 0 || 9 < hourDigit2) { errorCode = U_INVALID_FORMAT_ERROR; return 0; } hour = hour * 10 + hourDigit2; if (hour > 24) { errorCode = U_INVALID_FORMAT_ERROR; return 0; } } return hour; } }; // struct DayPeriodRulesDataSink struct DayPeriodRulesCountSink : public ResourceSink { virtual ~DayPeriodRulesCountSink(); virtual void put(const char *key, ResourceValue &value, UBool, UErrorCode &errorCode) override { ResourceTable rules = value.getTable(errorCode); if (U_FAILURE(errorCode)) { return; } for (int32_t i = 0; rules.getKeyAndValue(i, key, value); ++i) { int32_t setNum = DayPeriodRulesDataSink::parseSetNum(key, errorCode); if (setNum > data->maxRuleSetNum) { data->maxRuleSetNum = setNum; } } } }; // Out-of-line virtual destructors. DayPeriodRulesDataSink::~DayPeriodRulesDataSink() {} DayPeriodRulesCountSink::~DayPeriodRulesCountSink() {} namespace { UInitOnce initOnce {}; U_CFUNC UBool U_CALLCONV dayPeriodRulesCleanup() { delete[] data->rules; uhash_close(data->localeToRuleSetNumMap); delete data; data = nullptr; return true; } } // namespace void U_CALLCONV DayPeriodRules::load(UErrorCode &errorCode) { if (U_FAILURE(errorCode)) { return; } data = new DayPeriodRulesData(); data->localeToRuleSetNumMap = uhash_open(uhash_hashChars, uhash_compareChars, nullptr, &errorCode); LocalUResourceBundlePointer rb_dayPeriods(ures_openDirect(nullptr, "dayPeriods", &errorCode)); // Get the largest rule set number (so we allocate enough objects). DayPeriodRulesCountSink countSink; ures_getAllItemsWithFallback(rb_dayPeriods.getAlias(), "rules", countSink, errorCode); // Populate rules. 
DayPeriodRulesDataSink sink; ures_getAllItemsWithFallback(rb_dayPeriods.getAlias(), "", sink, errorCode); ucln_i18n_registerCleanup(UCLN_I18N_DAYPERIODRULES, dayPeriodRulesCleanup); } const DayPeriodRules *DayPeriodRules::getInstance(const Locale &locale, UErrorCode &errorCode) { umtx_initOnce(initOnce, DayPeriodRules::load, errorCode); // If the entire day period rules data doesn't conform to spec (even if the part we want // does), return nullptr. if(U_FAILURE(errorCode)) { return nullptr; } const char *localeCode = locale.getBaseName(); char name[ULOC_FULLNAME_CAPACITY]; if (uprv_strlen(localeCode) < ULOC_FULLNAME_CAPACITY) { uprv_strcpy(name, localeCode); // Treat empty string as root. if (*name == '\0') { uprv_strcpy(name, "root"); } } else { errorCode = U_BUFFER_OVERFLOW_ERROR; return nullptr; } int32_t ruleSetNum = 0; // NB there is no rule set 0 and 0 is returned upon lookup failure. while (*name != '\0') { ruleSetNum = uhash_geti(data->localeToRuleSetNumMap, name); if (ruleSetNum == 0) { CharString parent; CharStringByteSink sink(&parent); ulocimp_getParent(name, sink, &errorCode); if (parent.isEmpty()) { // Saves a lookup in the hash table. break; } parent.extract(name, UPRV_LENGTHOF(name), errorCode); } else { break; } } if (ruleSetNum <= 0 || data->rules[ruleSetNum].getDayPeriodForHour(0) == DAYPERIOD_UNKNOWN) { // If day period for hour 0 is UNKNOWN then day period for all hours are UNKNOWN. // Data doesn't exist even with fallback. 
return nullptr; } else { return &data->rules[ruleSetNum]; } } DayPeriodRules::DayPeriodRules() : fHasMidnight(false), fHasNoon(false) { for (int32_t i = 0; i < 24; ++i) { fDayPeriodForHour[i] = DayPeriodRules::DAYPERIOD_UNKNOWN; } } double DayPeriodRules::getMidPointForDayPeriod( DayPeriodRules::DayPeriod dayPeriod, UErrorCode &errorCode) const { if (U_FAILURE(errorCode)) { return -1; } int32_t startHour = getStartHourForDayPeriod(dayPeriod, errorCode); int32_t endHour = getEndHourForDayPeriod(dayPeriod, errorCode); // Can't obtain startHour or endHour; bail out. if (U_FAILURE(errorCode)) { return -1; } double midPoint = (startHour + endHour) / 2.0; if (startHour > endHour) { // dayPeriod wraps around midnight. Shift midPoint by 12 hours, in the direction that // lands it in [0, 24). midPoint += 12; if (midPoint >= 24) { midPoint -= 24; } } return midPoint; } int32_t DayPeriodRules::getStartHourForDayPeriod( DayPeriodRules::DayPeriod dayPeriod, UErrorCode &errorCode) const { if (U_FAILURE(errorCode)) { return -1; } if (dayPeriod == DAYPERIOD_MIDNIGHT) { return 0; } if (dayPeriod == DAYPERIOD_NOON) { return 12; } if (fDayPeriodForHour[0] == dayPeriod && fDayPeriodForHour[23] == dayPeriod) { // dayPeriod wraps around midnight. Start hour is later than end hour. for (int32_t i = 22; i >= 1; --i) { if (fDayPeriodForHour[i] != dayPeriod) { return (i + 1); } } } else { for (int32_t i = 0; i <= 23; ++i) { if (fDayPeriodForHour[i] == dayPeriod) { return i; } } } // dayPeriod doesn't exist in rule set; set error and exit. errorCode = U_ILLEGAL_ARGUMENT_ERROR; return -1; } int32_t DayPeriodRules::getEndHourForDayPeriod( DayPeriodRules::DayPeriod dayPeriod, UErrorCode &errorCode) const { if (U_FAILURE(errorCode)) { return -1; } if (dayPeriod == DAYPERIOD_MIDNIGHT) { return 0; } if (dayPeriod == DAYPERIOD_NOON) { return 12; } if (fDayPeriodForHour[0] == dayPeriod && fDayPeriodForHour[23] == dayPeriod) { // dayPeriod wraps around midnight. End hour is before start hour. 
for (int32_t i = 1; i <= 22; ++i) { if (fDayPeriodForHour[i] != dayPeriod) { // i o'clock is when a new period starts, therefore when the old period ends. return i; } } } else { for (int32_t i = 23; i >= 0; --i) { if (fDayPeriodForHour[i] == dayPeriod) { return (i + 1); } } } // dayPeriod doesn't exist in rule set; set error and exit. errorCode = U_ILLEGAL_ARGUMENT_ERROR; return -1; } DayPeriodRules::DayPeriod DayPeriodRules::getDayPeriodFromString(const char *type_str) { if (uprv_strcmp(type_str, "midnight") == 0) { return DAYPERIOD_MIDNIGHT; } else if (uprv_strcmp(type_str, "noon") == 0) { return DAYPERIOD_NOON; } else if (uprv_strcmp(type_str, "morning1") == 0) { return DAYPERIOD_MORNING1; } else if (uprv_strcmp(type_str, "afternoon1") == 0) { return DAYPERIOD_AFTERNOON1; } else if (uprv_strcmp(type_str, "evening1") == 0) { return DAYPERIOD_EVENING1; } else if (uprv_strcmp(type_str, "night1") == 0) { return DAYPERIOD_NIGHT1; } else if (uprv_strcmp(type_str, "morning2") == 0) { return DAYPERIOD_MORNING2; } else if (uprv_strcmp(type_str, "afternoon2") == 0) { return DAYPERIOD_AFTERNOON2; } else if (uprv_strcmp(type_str, "evening2") == 0) { return DAYPERIOD_EVENING2; } else if (uprv_strcmp(type_str, "night2") == 0) { return DAYPERIOD_NIGHT2; } else if (uprv_strcmp(type_str, "am") == 0) { return DAYPERIOD_AM; } else if (uprv_strcmp(type_str, "pm") == 0) { return DAYPERIOD_PM; } else { return DAYPERIOD_UNKNOWN; } } void DayPeriodRules::add(int32_t startHour, int32_t limitHour, DayPeriod period) { for (int32_t i = startHour; i != limitHour; ++i) { if (i == 24) { i = 0; } fDayPeriodForHour[i] = period; } } UBool DayPeriodRules::allHoursAreSet() { for (int32_t i = 0; i < 24; ++i) { if (fDayPeriodForHour[i] == DAYPERIOD_UNKNOWN) { return false; } } return true; } U_NAMESPACE_END stringi/src/icu74/i18n/ucsdet.cpp0000644000176200001440000001142314700200761016223 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others. 
// License & terms of use: http://www.unicode.org/copyright.html /* ******************************************************************************** * Copyright (C) 2005-2016, International Business Machines * Corporation and others. All Rights Reserved. ******************************************************************************** */ #include "unicode/utypes.h" #if !UCONFIG_NO_CONVERSION #include "unicode/ucsdet.h" #include "csdetect.h" #include "csmatch.h" #include "csrsbcs.h" #include "csrmbcs.h" #include "csrutf8.h" #include "csrucode.h" #include "csr2022.h" #include "cmemory.h" U_NAMESPACE_USE #define NEW_ARRAY(type,count) (type *) uprv_malloc((count) * sizeof(type)) #define DELETE_ARRAY(array) uprv_free((void *) (array)) U_CDECL_BEGIN U_CAPI UCharsetDetector * U_EXPORT2 ucsdet_open(UErrorCode *status) { if(U_FAILURE(*status)) { return 0; } CharsetDetector* csd = new CharsetDetector(*status); if (U_FAILURE(*status)) { delete csd; csd = nullptr; } return (UCharsetDetector *) csd; } U_CAPI void U_EXPORT2 ucsdet_close(UCharsetDetector *ucsd) { CharsetDetector *csd = (CharsetDetector *) ucsd; delete csd; } U_CAPI void U_EXPORT2 ucsdet_setText(UCharsetDetector *ucsd, const char *textIn, int32_t len, UErrorCode *status) { if(U_FAILURE(*status)) { return; } ((CharsetDetector *) ucsd)->setText(textIn, len); } U_CAPI const char * U_EXPORT2 ucsdet_getName(const UCharsetMatch *ucsm, UErrorCode *status) { if(U_FAILURE(*status)) { return nullptr; } return ((CharsetMatch *) ucsm)->getName(); } U_CAPI int32_t U_EXPORT2 ucsdet_getConfidence(const UCharsetMatch *ucsm, UErrorCode *status) { if(U_FAILURE(*status)) { return 0; } return ((CharsetMatch *) ucsm)->getConfidence(); } U_CAPI const char * U_EXPORT2 ucsdet_getLanguage(const UCharsetMatch *ucsm, UErrorCode *status) { if(U_FAILURE(*status)) { return nullptr; } return ((CharsetMatch *) ucsm)->getLanguage(); } U_CAPI const UCharsetMatch * U_EXPORT2 ucsdet_detect(UCharsetDetector *ucsd, UErrorCode *status) { 
if(U_FAILURE(*status)) { return nullptr; } return (const UCharsetMatch *) ((CharsetDetector *) ucsd)->detect(*status); } U_CAPI void U_EXPORT2 ucsdet_setDeclaredEncoding(UCharsetDetector *ucsd, const char *encoding, int32_t length, UErrorCode *status) { if(U_FAILURE(*status)) { return; } ((CharsetDetector *) ucsd)->setDeclaredEncoding(encoding,length); } U_CAPI const UCharsetMatch** ucsdet_detectAll(UCharsetDetector *ucsd, int32_t *maxMatchesFound, UErrorCode *status) { if(U_FAILURE(*status)) { return nullptr; } CharsetDetector *csd = (CharsetDetector *) ucsd; return (const UCharsetMatch**)csd->detectAll(*maxMatchesFound,*status); } // U_CAPI const char * U_EXPORT2 // ucsdet_getDetectableCharsetName(const UCharsetDetector *csd, int32_t index, UErrorCode *status) // { // if(U_FAILURE(*status)) { // return 0; // } // return csd->getCharsetName(index,*status); // } // U_CAPI int32_t U_EXPORT2 // ucsdet_getDetectableCharsetsCount(const UCharsetDetector *csd, UErrorCode *status) // { // if(U_FAILURE(*status)) { // return -1; // } // return UCharsetDetector::getDetectableCount(); // } U_CAPI UBool U_EXPORT2 ucsdet_isInputFilterEnabled(const UCharsetDetector *ucsd) { // todo: could use an error return... if (ucsd == nullptr) { return false; } return ((CharsetDetector *) ucsd)->getStripTagsFlag(); } U_CAPI UBool U_EXPORT2 ucsdet_enableInputFilter(UCharsetDetector *ucsd, UBool filter) { // todo: could use an error return... 
if (ucsd == nullptr) { return false; } CharsetDetector *csd = (CharsetDetector *) ucsd; UBool prev = csd->getStripTagsFlag(); csd->setStripTagsFlag(filter); return prev; } U_CAPI int32_t U_EXPORT2 ucsdet_getUChars(const UCharsetMatch *ucsm, char16_t *buf, int32_t cap, UErrorCode *status) { if(U_FAILURE(*status)) { return 0; } return ((CharsetMatch *) ucsm)->getUChars(buf, cap, status); } U_CAPI void U_EXPORT2 ucsdet_setDetectableCharset(UCharsetDetector *ucsd, const char *encoding, UBool enabled, UErrorCode *status) { ((CharsetDetector *)ucsd)->setDetectableCharset(encoding, enabled, *status); } U_CAPI UEnumeration * U_EXPORT2 ucsdet_getAllDetectableCharsets(const UCharsetDetector * /*ucsd*/, UErrorCode *status) { return CharsetDetector::getAllDetectableCharsets(*status); } U_CAPI UEnumeration * U_EXPORT2 ucsdet_getDetectableCharsets(const UCharsetDetector *ucsd, UErrorCode *status) { return ((CharsetDetector *)ucsd)->getDetectableCharsets(*status); } U_CDECL_END #endif stringi/src/icu74/i18n/number_mapper.h0000644000176200001440000002006114700200761017233 0ustar liggesusers// © 2018 and later: Unicode, Inc. and others. 
// License & terms of use: http://www.unicode.org/copyright.html #include "unicode/utypes.h" #if !UCONFIG_NO_FORMATTING #ifndef __NUMBER_MAPPER_H__ #define __NUMBER_MAPPER_H__ #include #include "number_types.h" #include "unicode/currpinf.h" #include "standardplural.h" #include "number_patternstring.h" #include "number_currencysymbols.h" #include "numparse_impl.h" U_NAMESPACE_BEGIN namespace number { namespace impl { class AutoAffixPatternProvider; class CurrencyPluralInfoAffixProvider; class PropertiesAffixPatternProvider : public AffixPatternProvider, public UMemory { public: bool isBogus() const { return fBogus; } void setToBogus() { fBogus = true; } void setTo(const DecimalFormatProperties& properties, UErrorCode& status); // AffixPatternProvider Methods: char16_t charAt(int32_t flags, int32_t i) const override; int32_t length(int32_t flags) const override; UnicodeString getString(int32_t flags) const override; bool hasCurrencySign() const override; bool positiveHasPlusSign() const override; bool hasNegativeSubpattern() const override; bool negativeHasMinusSign() const override; bool containsSymbolType(AffixPatternType, UErrorCode&) const override; bool hasBody() const override; bool currencyAsDecimal() const override; private: UnicodeString posPrefix; UnicodeString posSuffix; UnicodeString negPrefix; UnicodeString negSuffix; bool isCurrencyPattern; bool fCurrencyAsDecimal; PropertiesAffixPatternProvider() = default; // puts instance in valid but undefined state const UnicodeString& getStringInternal(int32_t flags) const; bool fBogus{true}; friend class AutoAffixPatternProvider; friend class CurrencyPluralInfoAffixProvider; }; class CurrencyPluralInfoAffixProvider : public AffixPatternProvider, public UMemory { public: bool isBogus() const { return fBogus; } void setToBogus() { fBogus = true; } void setTo(const CurrencyPluralInfo& cpi, const DecimalFormatProperties& properties, UErrorCode& status); // AffixPatternProvider Methods: char16_t charAt(int32_t flags, 
int32_t i) const override; int32_t length(int32_t flags) const override; UnicodeString getString(int32_t flags) const override; bool hasCurrencySign() const override; bool positiveHasPlusSign() const override; bool hasNegativeSubpattern() const override; bool negativeHasMinusSign() const override; bool containsSymbolType(AffixPatternType, UErrorCode&) const override; bool hasBody() const override; bool currencyAsDecimal() const override; private: PropertiesAffixPatternProvider affixesByPlural[StandardPlural::COUNT]; CurrencyPluralInfoAffixProvider() = default; bool fBogus{true}; friend class AutoAffixPatternProvider; }; class AutoAffixPatternProvider { public: inline AutoAffixPatternProvider() = default; inline AutoAffixPatternProvider(const DecimalFormatProperties& properties, UErrorCode& status) { setTo(properties, status); } inline void setTo(const DecimalFormatProperties& properties, UErrorCode& status) { if (properties.currencyPluralInfo.fPtr.isNull()) { propertiesAPP.setTo(properties, status); currencyPluralInfoAPP.setToBogus(); } else { propertiesAPP.setToBogus(); currencyPluralInfoAPP.setTo(*properties.currencyPluralInfo.fPtr, properties, status); } } inline void setTo(const AffixPatternProvider* provider, UErrorCode& status) { if (auto ptr = dynamic_cast(provider)) { propertiesAPP = *ptr; } else if (auto ptr = dynamic_cast(provider)) { currencyPluralInfoAPP = *ptr; } else { status = U_INTERNAL_PROGRAM_ERROR; } } inline const AffixPatternProvider& get() const { if (!currencyPluralInfoAPP.isBogus()) { return currencyPluralInfoAPP; } else { return propertiesAPP; } } private: PropertiesAffixPatternProvider propertiesAPP; CurrencyPluralInfoAffixProvider currencyPluralInfoAPP; }; /** * A struct for ownership of a few objects needed for formatting. */ struct DecimalFormatWarehouse : public UMemory { AutoAffixPatternProvider affixProvider; LocalPointer rules; }; /** * Internal fields for DecimalFormat. 
* TODO: Make some of these fields by value instead of by LocalPointer? */ struct DecimalFormatFields : public UMemory { DecimalFormatFields() {} DecimalFormatFields(const DecimalFormatProperties& propsToCopy) : properties(propsToCopy) {} /** The property bag corresponding to user-specified settings and settings from the pattern string. */ DecimalFormatProperties properties; /** The symbols for the current locale. */ LocalPointer symbols; /** * The pre-computed formatter object. Setters cause this to be re-computed atomically. The {@link * #format} method uses the formatter directly without needing to synchronize. */ LocalizedNumberFormatter formatter; /** The lazy-computed parser for .parse() */ std::atomic<::icu::numparse::impl::NumberParserImpl*> atomicParser = {}; /** The lazy-computed parser for .parseCurrency() */ std::atomic<::icu::numparse::impl::NumberParserImpl*> atomicCurrencyParser = {}; /** Small object ownership warehouse for the formatter and parser */ DecimalFormatWarehouse warehouse; /** The effective properties as exported from the formatter object. Used by some getters. */ DecimalFormatProperties exportedProperties; // Data for fastpath bool canUseFastFormat = false; struct FastFormatData { char16_t cpZero; char16_t cpGroupingSeparator; char16_t cpMinusSign; int8_t minInt; int8_t maxInt; } fastData; }; /** * Utilities for converting between a DecimalFormatProperties and a MacroProps. */ class NumberPropertyMapper { public: /** Convenience method to create a NumberFormatter directly from Properties. */ static UnlocalizedNumberFormatter create(const DecimalFormatProperties& properties, const DecimalFormatSymbols& symbols, DecimalFormatWarehouse& warehouse, UErrorCode& status); /** Convenience method to create a NumberFormatter directly from Properties. 
*/ static UnlocalizedNumberFormatter create(const DecimalFormatProperties& properties, const DecimalFormatSymbols& symbols, DecimalFormatWarehouse& warehouse, DecimalFormatProperties& exportedProperties, UErrorCode& status); /** * Creates a new {@link MacroProps} object based on the content of a {@link DecimalFormatProperties} * object. In other words, maps Properties to MacroProps. This function is used by the * JDK-compatibility API to call into the ICU 60 fluent number formatting pipeline. * * @param properties * The property bag to be mapped. * @param symbols * The symbols associated with the property bag. * @param exportedProperties * A property bag in which to store validated properties. Used by some DecimalFormat * getters. * @return A new MacroProps containing all of the information in the Properties. */ static MacroProps oldToNew(const DecimalFormatProperties& properties, const DecimalFormatSymbols& symbols, DecimalFormatWarehouse& warehouse, DecimalFormatProperties* exportedProperties, UErrorCode& status); }; } // namespace impl } // namespace numparse U_NAMESPACE_END #endif //__NUMBER_MAPPER_H__ #endif /* #if !UCONFIG_NO_FORMATTING */ stringi/src/icu74/i18n/funcrepl.h0000644000176200001440000000604314700200761016221 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ********************************************************************** * Copyright (c) 2002-2011, International Business Machines Corporation * and others. All Rights Reserved. ********************************************************************** * Date Name Description * 02/04/2002 aliu Creation. 
********************************************************************** */ #ifndef FUNCREPL_H #define FUNCREPL_H #include "unicode/utypes.h" #if !UCONFIG_NO_TRANSLITERATION #include "unicode/unifunct.h" #include "unicode/unirepl.h" U_NAMESPACE_BEGIN class Transliterator; /** * A replacer that calls a transliterator to generate its output text. * The input text to the transliterator is the output of another * UnicodeReplacer object. That is, this replacer wraps another * replacer with a transliterator. * * @author Alan Liu */ class FunctionReplacer : public UnicodeFunctor, public UnicodeReplacer { private: /** * The transliterator. Must not be null. OWNED. */ Transliterator* translit; /** * The replacer object. This generates text that is then * processed by 'translit'. Must not be null. OWNED. */ UnicodeFunctor* replacer; public: /** * Construct a replacer that takes the output of the given * replacer, passes it through the given transliterator, and emits * the result as output. */ FunctionReplacer(Transliterator* adoptedTranslit, UnicodeFunctor* adoptedReplacer); /** * Copy constructor. */ FunctionReplacer(const FunctionReplacer& other); /** * Destructor */ virtual ~FunctionReplacer(); /** * Implement UnicodeFunctor */ virtual FunctionReplacer* clone() const override; /** * UnicodeFunctor API. Cast 'this' to a UnicodeReplacer* pointer * and return the pointer. */ virtual UnicodeReplacer* toReplacer() const override; /** * UnicodeReplacer API */ virtual int32_t replace(Replaceable& text, int32_t start, int32_t limit, int32_t& cursor) override; /** * UnicodeReplacer API */ virtual UnicodeString& toReplacerPattern(UnicodeString& rule, UBool escapeUnprintable) const override; /** * Implement UnicodeReplacer */ virtual void addReplacementSetTo(UnicodeSet& toUnionTo) const override; /** * UnicodeFunctor API */ virtual void setData(const TransliterationRuleData*) override; /** * ICU "poor man's RTTI", returns a UClassID for the actual class. 
*/ virtual UClassID getDynamicClassID() const override; /** * ICU "poor man's RTTI", returns a UClassID for this class. */ static UClassID U_EXPORT2 getStaticClassID(); }; U_NAMESPACE_END #endif /* #if !UCONFIG_NO_TRANSLITERATION */ #endif //eof stringi/src/icu74/i18n/decContext.h0000644000176200001440000003161414700200761016505 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ------------------------------------------------------------------ */ /* Decimal Context module header */ /* ------------------------------------------------------------------ */ /* Copyright (c) IBM Corporation, 2000-2011. All rights reserved. */ /* */ /* This software is made available under the terms of the */ /* ICU License -- ICU 1.8.1 and later. */ /* */ /* The description and User's Guide ("The decNumber C Library") for */ /* this software is called decNumber.pdf. This document is */ /* available, together with arithmetic and format specifications, */ /* testcases, and Web links, on the General Decimal Arithmetic page. */ /* */ /* Please send comments, suggestions, and corrections to the author: */ /* mfc@uk.ibm.com */ /* Mike Cowlishaw, IBM Fellow */ /* IBM UK, PO Box 31, Birmingham Road, Warwick CV34 5JL, UK */ /* ------------------------------------------------------------------ */ /* Modified version, for use from within ICU. * Renamed public functions, to avoid an unwanted export of the * standard names from the ICU library. * * Use ICU's uprv_malloc() and uprv_free() * * Revert comment syntax to plain C * * Remove a few compiler warnings. 
*/ #include "unicode/utypes.h" #include "putilimp.h" /* */ /* Context variables must always have valid values: */ /* */ /* status -- [any bits may be cleared, but not set, by user] */ /* round -- must be one of the enumerated rounding modes */ /* */ /* The following variables are implied for fixed size formats (i.e., */ /* they are ignored) but should still be set correctly in case used */ /* with decNumber functions: */ /* */ /* clamp -- must be either 0 or 1 */ /* digits -- must be in the range 1 through 999999999 */ /* emax -- must be in the range 0 through 999999999 */ /* emin -- must be in the range 0 through -999999999 */ /* extended -- must be either 0 or 1 [present only if DECSUBSET] */ /* traps -- only defined bits may be set */ /* */ /* ------------------------------------------------------------------ */ #if !defined(DECCONTEXT) #define DECCONTEXT #define DECCNAME "decContext" /* Short name */ #define DECCFULLNAME "Decimal Context Descriptor" /* Verbose name */ #define DECCAUTHOR "Mike Cowlishaw" /* Who to blame */ #if !defined(int32_t) /* #include */ /* C99 standard integers */ #endif #include /* for printf, etc. 
*/ #include /* for traps */ /* Extended flags setting -- set this to 0 to use only IEEE flags */ #if !defined(DECEXTFLAG) #define DECEXTFLAG 1 /* 1=enable extended flags */ #endif /* Conditional code flag -- set this to 0 for best performance */ #if !defined(DECSUBSET) #define DECSUBSET 0 /* 1=enable subset arithmetic */ #endif /* Context for operations, with associated constants */ enum rounding { DEC_ROUND_CEILING, /* round towards +infinity */ DEC_ROUND_UP, /* round away from 0 */ DEC_ROUND_HALF_UP, /* 0.5 rounds up */ DEC_ROUND_HALF_EVEN, /* 0.5 rounds to nearest even */ DEC_ROUND_HALF_DOWN, /* 0.5 rounds down */ DEC_ROUND_DOWN, /* round towards 0 (truncate) */ DEC_ROUND_FLOOR, /* round towards -infinity */ DEC_ROUND_05UP, /* round for reround */ DEC_ROUND_MAX /* enum must be less than this */ }; #define DEC_ROUND_DEFAULT DEC_ROUND_HALF_EVEN; typedef struct { int32_t digits; /* working precision */ int32_t emax; /* maximum positive exponent */ int32_t emin; /* minimum negative exponent */ enum rounding round; /* rounding mode */ uint32_t traps; /* trap-enabler flags */ uint32_t status; /* status flags */ uint8_t clamp; /* flag: apply IEEE exponent clamp */ #if DECSUBSET uint8_t extended; /* flag: special-values allowed */ #endif } decContext; /* Maxima and Minima for context settings */ #define DEC_MAX_DIGITS 999999999 #define DEC_MIN_DIGITS 1 #define DEC_MAX_EMAX 999999999 #define DEC_MIN_EMAX 0 #define DEC_MAX_EMIN 0 #define DEC_MIN_EMIN -999999999 #define DEC_MAX_MATH 999999 /* max emax, etc., for math funcs. */ /* Classifications for decimal numbers, aligned with 754 (note that */ /* 'normal' and 'subnormal' are meaningful only with a decContext */ /* or a fixed size format). 
*/ enum decClass { DEC_CLASS_SNAN, DEC_CLASS_QNAN, DEC_CLASS_NEG_INF, DEC_CLASS_NEG_NORMAL, DEC_CLASS_NEG_SUBNORMAL, DEC_CLASS_NEG_ZERO, DEC_CLASS_POS_ZERO, DEC_CLASS_POS_SUBNORMAL, DEC_CLASS_POS_NORMAL, DEC_CLASS_POS_INF }; /* Strings for the decClasses */ #define DEC_ClassString_SN "sNaN" #define DEC_ClassString_QN "NaN" #define DEC_ClassString_NI "-Infinity" #define DEC_ClassString_NN "-Normal" #define DEC_ClassString_NS "-Subnormal" #define DEC_ClassString_NZ "-Zero" #define DEC_ClassString_PZ "+Zero" #define DEC_ClassString_PS "+Subnormal" #define DEC_ClassString_PN "+Normal" #define DEC_ClassString_PI "+Infinity" #define DEC_ClassString_UN "Invalid" /* Trap-enabler and Status flags (exceptional conditions), and */ /* their names. The top byte is reserved for internal use */ #if DECEXTFLAG /* Extended flags */ #define DEC_Conversion_syntax 0x00000001 #define DEC_Division_by_zero 0x00000002 #define DEC_Division_impossible 0x00000004 #define DEC_Division_undefined 0x00000008 #define DEC_Insufficient_storage 0x00000010 /* [when malloc fails] */ #define DEC_Inexact 0x00000020 #define DEC_Invalid_context 0x00000040 #define DEC_Invalid_operation 0x00000080 #if DECSUBSET #define DEC_Lost_digits 0x00000100 #endif #define DEC_Overflow 0x00000200 #define DEC_Clamped 0x00000400 #define DEC_Rounded 0x00000800 #define DEC_Subnormal 0x00001000 #define DEC_Underflow 0x00002000 #else /* IEEE flags only */ #define DEC_Conversion_syntax 0x00000010 #define DEC_Division_by_zero 0x00000002 #define DEC_Division_impossible 0x00000010 #define DEC_Division_undefined 0x00000010 #define DEC_Insufficient_storage 0x00000010 /* [when malloc fails] */ #define DEC_Inexact 0x00000001 #define DEC_Invalid_context 0x00000010 #define DEC_Invalid_operation 0x00000010 #if DECSUBSET #define DEC_Lost_digits 0x00000000 #endif #define DEC_Overflow 0x00000008 #define DEC_Clamped 0x00000000 #define DEC_Rounded 0x00000000 #define DEC_Subnormal 0x00000000 #define DEC_Underflow 0x00000004 #endif /* IEEE 754 
groupings for the flags */ /* [DEC_Clamped, DEC_Lost_digits, DEC_Rounded, and DEC_Subnormal */ /* are not in IEEE 754] */ #define DEC_IEEE_754_Division_by_zero (DEC_Division_by_zero) #if DECSUBSET #define DEC_IEEE_754_Inexact (DEC_Inexact | DEC_Lost_digits) #else #define DEC_IEEE_754_Inexact (DEC_Inexact) #endif #define DEC_IEEE_754_Invalid_operation (DEC_Conversion_syntax | \ DEC_Division_impossible | \ DEC_Division_undefined | \ DEC_Insufficient_storage | \ DEC_Invalid_context | \ DEC_Invalid_operation) #define DEC_IEEE_754_Overflow (DEC_Overflow) #define DEC_IEEE_754_Underflow (DEC_Underflow) /* flags which are normally errors (result is qNaN, infinite, or 0) */ #define DEC_Errors (DEC_IEEE_754_Division_by_zero | \ DEC_IEEE_754_Invalid_operation | \ DEC_IEEE_754_Overflow | DEC_IEEE_754_Underflow) /* flags which cause a result to become qNaN */ #define DEC_NaNs DEC_IEEE_754_Invalid_operation /* flags which are normally for information only (finite results) */ #if DECSUBSET #define DEC_Information (DEC_Clamped | DEC_Rounded | DEC_Inexact \ | DEC_Lost_digits) #else #define DEC_Information (DEC_Clamped | DEC_Rounded | DEC_Inexact) #endif /* IEEE 854 names (for compatibility with older decNumber versions) */ #define DEC_IEEE_854_Division_by_zero DEC_IEEE_754_Division_by_zero #define DEC_IEEE_854_Inexact DEC_IEEE_754_Inexact #define DEC_IEEE_854_Invalid_operation DEC_IEEE_754_Invalid_operation #define DEC_IEEE_854_Overflow DEC_IEEE_754_Overflow #define DEC_IEEE_854_Underflow DEC_IEEE_754_Underflow /* Name strings for the exceptional conditions */ #define DEC_Condition_CS "Conversion syntax" #define DEC_Condition_DZ "Division by zero" #define DEC_Condition_DI "Division impossible" #define DEC_Condition_DU "Division undefined" #define DEC_Condition_IE "Inexact" #define DEC_Condition_IS "Insufficient storage" #define DEC_Condition_IC "Invalid context" #define DEC_Condition_IO "Invalid operation" #if DECSUBSET #define DEC_Condition_LD "Lost digits" #endif #define 
DEC_Condition_OV "Overflow" #define DEC_Condition_PA "Clamped" #define DEC_Condition_RO "Rounded" #define DEC_Condition_SU "Subnormal" #define DEC_Condition_UN "Underflow" #define DEC_Condition_ZE "No status" #define DEC_Condition_MU "Multiple status" #define DEC_Condition_Length 21 /* length of the longest string, */ /* including terminator */ /* Initialization descriptors, used by decContextDefault */ #define DEC_INIT_BASE 0 #define DEC_INIT_DECIMAL32 32 #define DEC_INIT_DECIMAL64 64 #define DEC_INIT_DECIMAL128 128 /* Synonyms */ #define DEC_INIT_DECSINGLE DEC_INIT_DECIMAL32 #define DEC_INIT_DECDOUBLE DEC_INIT_DECIMAL64 #define DEC_INIT_DECQUAD DEC_INIT_DECIMAL128 /* decContext routines */ U_CAPI decContext * U_EXPORT2 uprv_decContextClearStatus(decContext *, uint32_t); U_CAPI decContext * U_EXPORT2 uprv_decContextDefault(decContext *, int32_t); U_CAPI enum rounding U_EXPORT2 uprv_decContextGetRounding(decContext *); U_CAPI uint32_t U_EXPORT2 uprv_decContextGetStatus(decContext *); U_CAPI decContext * U_EXPORT2 uprv_decContextRestoreStatus(decContext *, uint32_t, uint32_t); U_CAPI uint32_t U_EXPORT2 uprv_decContextSaveStatus(decContext *, uint32_t); U_CAPI decContext * U_EXPORT2 uprv_decContextSetRounding(decContext *, enum rounding); U_CAPI decContext * U_EXPORT2 uprv_decContextSetStatus(decContext *, uint32_t); U_CAPI decContext * U_EXPORT2 uprv_decContextSetStatusFromString(decContext *, const char *); U_CAPI decContext * U_EXPORT2 uprv_decContextSetStatusFromStringQuiet(decContext *, const char *); U_CAPI decContext * U_EXPORT2 uprv_decContextSetStatusQuiet(decContext *, uint32_t); U_CAPI const char * U_EXPORT2 uprv_decContextStatusToString(const decContext *); U_CAPI uint32_t U_EXPORT2 uprv_decContextTestSavedStatus(uint32_t, uint32_t); U_CAPI uint32_t U_EXPORT2 uprv_decContextTestStatus(decContext *, uint32_t); U_CAPI decContext * U_EXPORT2 uprv_decContextZeroStatus(decContext *); #endif 
stringi/src/icu74/i18n/region.cpp0000644000176200001440000007565214700200761016235 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ******************************************************************************* * Copyright (C) 2014-2016, International Business Machines Corporation and * others. All Rights Reserved. ******************************************************************************* * * * File REGION.CPP * * Modification History:* * Date Name Description * 01/15/13 Emmons Original Port from ICU4J ******************************************************************************** */ /** * \file * \brief C++ API: Region classes (territory containment) */ #include "unicode/region.h" #include "unicode/utypes.h" #include "unicode/uobject.h" #include "unicode/unistr.h" #include "unicode/ures.h" #include "ucln_in.h" #include "cstring.h" #include "mutex.h" #include "uhash.h" #include "umutex.h" #include "uresimp.h" #include "region_impl.h" #include "util.h" #if !UCONFIG_NO_FORMATTING U_CDECL_BEGIN /** * Cleanup callback func */ static UBool U_CALLCONV region_cleanup() { icu::Region::cleanupRegionData(); return true; } U_CDECL_END U_NAMESPACE_BEGIN static UInitOnce gRegionDataInitOnce {}; static UVector* availableRegions[URGN_LIMIT]; static UHashtable *regionAliases = nullptr; static UHashtable *regionIDMap = nullptr; static UHashtable *numericCodeMap = nullptr; static UVector *allRegions = nullptr; static const char16_t UNKNOWN_REGION_ID [] = { 0x5A, 0x5A, 0 }; /* "ZZ" */ static const char16_t OUTLYING_OCEANIA_REGION_ID [] = { 0x51, 0x4F, 0 }; /* "QO" */ static const char16_t WORLD_ID [] = { 0x30, 0x30, 0x31, 0 }; /* "001" */ static const char16_t RANGE_MARKER = 0x7E; /* '~' */ UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RegionNameEnumeration) /* * Initializes the region data from the ICU resource bundles. 
The region data * contains the basic relationships such as which regions are known, what the numeric * codes are, any known aliases, and the territory containment data. * * If the region data has already loaded, then this method simply returns without doing * anything meaningful. */ void U_CALLCONV Region::loadRegionData(UErrorCode &status) { // Construct service objs first LocalUHashtablePointer newRegionIDMap(uhash_open(uhash_hashUnicodeString, uhash_compareUnicodeString, nullptr, &status)); LocalUHashtablePointer newNumericCodeMap(uhash_open(uhash_hashLong,uhash_compareLong,nullptr,&status)); LocalUHashtablePointer newRegionAliases(uhash_open(uhash_hashUnicodeString,uhash_compareUnicodeString,nullptr,&status)); LocalPointer continents(new UVector(uprv_deleteUObject, uhash_compareUnicodeString, status), status); LocalPointer groupings(new UVector(uprv_deleteUObject, uhash_compareUnicodeString, status), status); LocalPointer lpAllRegions(new UVector(uprv_deleteUObject, uhash_compareUnicodeString, status), status); allRegions = lpAllRegions.orphan(); LocalUResourceBundlePointer metadata(ures_openDirect(nullptr,"metadata",&status)); LocalUResourceBundlePointer metadataAlias(ures_getByKey(metadata.getAlias(),"alias",nullptr,&status)); LocalUResourceBundlePointer territoryAlias(ures_getByKey(metadataAlias.getAlias(),"territory",nullptr,&status)); LocalUResourceBundlePointer supplementalData(ures_openDirect(nullptr,"supplementalData",&status)); LocalUResourceBundlePointer codeMappings(ures_getByKey(supplementalData.getAlias(),"codeMappings",nullptr,&status)); LocalUResourceBundlePointer idValidity(ures_getByKey(supplementalData.getAlias(),"idValidity",nullptr,&status)); LocalUResourceBundlePointer regionList(ures_getByKey(idValidity.getAlias(),"region",nullptr,&status)); LocalUResourceBundlePointer regionRegular(ures_getByKey(regionList.getAlias(),"regular",nullptr,&status)); LocalUResourceBundlePointer 
regionMacro(ures_getByKey(regionList.getAlias(),"macroregion",nullptr,&status)); LocalUResourceBundlePointer regionUnknown(ures_getByKey(regionList.getAlias(),"unknown",nullptr,&status)); LocalUResourceBundlePointer territoryContainment(ures_getByKey(supplementalData.getAlias(),"territoryContainment",nullptr,&status)); LocalUResourceBundlePointer worldContainment(ures_getByKey(territoryContainment.getAlias(),"001",nullptr,&status)); LocalUResourceBundlePointer groupingContainment(ures_getByKey(territoryContainment.getAlias(),"grouping",nullptr,&status)); ucln_i18n_registerCleanup(UCLN_I18N_REGION, region_cleanup); if (U_FAILURE(status)) { return; } // now, initialize uhash_setValueDeleter(newRegionIDMap.getAlias(), uprv_deleteUObject); // regionIDMap owns objs uhash_setKeyDeleter(newRegionAliases.getAlias(), uprv_deleteUObject); // regionAliases owns the string keys while (U_SUCCESS(status) && ures_hasNext(regionRegular.getAlias())) { UnicodeString regionName = ures_getNextUnicodeString(regionRegular.getAlias(),nullptr,&status); int32_t rangeMarkerLocation = regionName.indexOf(RANGE_MARKER); char16_t buf[6]; regionName.extract(buf,6,status); if ( rangeMarkerLocation > 0 ) { char16_t endRange = regionName.charAt(rangeMarkerLocation+1); buf[rangeMarkerLocation] = 0; while (U_SUCCESS(status) && buf[rangeMarkerLocation-1] <= endRange) { LocalPointer newRegion(new UnicodeString(buf), status); allRegions->adoptElement(newRegion.orphan(), status); buf[rangeMarkerLocation-1]++; } } else { LocalPointer newRegion(new UnicodeString(regionName), status); allRegions->adoptElement(newRegion.orphan(), status); } } while (U_SUCCESS(status) && ures_hasNext(regionMacro.getAlias())) { UnicodeString regionName = ures_getNextUnicodeString(regionMacro.getAlias(),nullptr,&status); int32_t rangeMarkerLocation = regionName.indexOf(RANGE_MARKER); char16_t buf[6]; regionName.extract(buf,6,status); if ( rangeMarkerLocation > 0 ) { char16_t endRange = regionName.charAt(rangeMarkerLocation+1); 
buf[rangeMarkerLocation] = 0; while ( buf[rangeMarkerLocation-1] <= endRange && U_SUCCESS(status)) { LocalPointer newRegion(new UnicodeString(buf), status); allRegions->adoptElement(newRegion.orphan(),status); buf[rangeMarkerLocation-1]++; } } else { LocalPointer newRegion(new UnicodeString(regionName), status); allRegions->adoptElement(newRegion.orphan(),status); } } while (U_SUCCESS(status) && ures_hasNext(regionUnknown.getAlias())) { LocalPointer regionName ( new UnicodeString(ures_getNextUnicodeString(regionUnknown.getAlias(), nullptr, &status), status)); allRegions->adoptElement(regionName.orphan(),status); } while (U_SUCCESS(status) && ures_hasNext(worldContainment.getAlias())) { UnicodeString *continentName = new UnicodeString(ures_getNextUnicodeString(worldContainment.getAlias(),nullptr,&status)); continents->adoptElement(continentName,status); } if (U_FAILURE(status)) { return; } for ( int32_t i = 0 ; i < allRegions->size() ; i++ ) { LocalPointer r(new Region(), status); if ( U_FAILURE(status) ) { return; } UnicodeString *regionName = (UnicodeString *)allRegions->elementAt(i); r->idStr = *regionName; r->idStr.extract(0,r->idStr.length(),r->id,sizeof(r->id),US_INV); r->fType = URGN_TERRITORY; // Only temporary - figure out the real type later once the aliases are known. int32_t pos = 0; int32_t result = ICU_Utility::parseAsciiInteger(r->idStr, pos); if (pos > 0) { r->code = result; // Convert string to number uhash_iput(newNumericCodeMap.getAlias(),r->code,(void *)(r.getAlias()),&status); r->fType = URGN_SUBCONTINENT; } else { r->code = -1; } void* idStrAlias = (void*)&(r->idStr); // about to orphan 'r'. Save this off. 
uhash_put(newRegionIDMap.getAlias(),idStrAlias,(void *)(r.orphan()),&status); // regionIDMap takes ownership } UResourceBundle *groupingBundle = nullptr; while (U_SUCCESS(status) && ures_hasNext(groupingContainment.getAlias())) { groupingBundle = ures_getNextResource(groupingContainment.getAlias(), groupingBundle, &status); if (U_FAILURE(status)) { break; } UnicodeString *groupingName = new UnicodeString(ures_getKey(groupingBundle), -1, US_INV); LocalPointer lpGroupingName(groupingName, status); groupings->adoptElement(lpGroupingName.orphan(), status); if (U_FAILURE(status)) { break; } Region *grouping = (Region *) uhash_get(newRegionIDMap.getAlias(), groupingName); if (grouping != nullptr) { for (int32_t i = 0; i < ures_getSize(groupingBundle) && U_SUCCESS(status); i++) { UnicodeString child = ures_getUnicodeStringByIndex(groupingBundle, i, &status); if (U_SUCCESS(status)) { if (grouping->containedRegions == nullptr) { LocalPointer lpContainedRegions( new UVector(uprv_deleteUObject, uhash_compareUnicodeString, status), status); grouping->containedRegions = lpContainedRegions.orphan(); if (U_FAILURE(status)) { break; } } LocalPointer lpChildCopy(new UnicodeString(child), status); grouping->containedRegions->adoptElement(lpChildCopy.orphan(), status); } } } } ures_close(groupingBundle); // Process the territory aliases while (U_SUCCESS(status) && ures_hasNext(territoryAlias.getAlias())) { LocalUResourceBundlePointer res(ures_getNextResource(territoryAlias.getAlias(),nullptr,&status)); const char *aliasFrom = ures_getKey(res.getAlias()); LocalPointer aliasFromStr(new UnicodeString(aliasFrom, -1, US_INV), status); UnicodeString aliasTo = ures_getUnicodeStringByKey(res.getAlias(),"replacement",&status); res.adoptInstead(nullptr); const Region *aliasToRegion = (Region *) uhash_get(newRegionIDMap.getAlias(),&aliasTo); Region *aliasFromRegion = (Region *)uhash_get(newRegionIDMap.getAlias(),aliasFromStr.getAlias()); if ( aliasToRegion != nullptr && aliasFromRegion == 
nullptr ) { // This is just an alias from some string to a region uhash_put(newRegionAliases.getAlias(),(void *)aliasFromStr.orphan(), (void *)aliasToRegion,&status); } else { if ( aliasFromRegion == nullptr ) { // Deprecated region code not in the primary codes list - so need to create a deprecated region for it. LocalPointer newRgn(new Region, status); if ( U_SUCCESS(status) ) { aliasFromRegion = newRgn.orphan(); } else { return; // error out } aliasFromRegion->idStr.setTo(*aliasFromStr); aliasFromRegion->idStr.extract(0,aliasFromRegion->idStr.length(),aliasFromRegion->id,sizeof(aliasFromRegion->id),US_INV); uhash_put(newRegionIDMap.getAlias(),(void *)&(aliasFromRegion->idStr),(void *)aliasFromRegion,&status); int32_t pos = 0; int32_t result = ICU_Utility::parseAsciiInteger(aliasFromRegion->idStr, pos); if ( pos > 0 ) { aliasFromRegion->code = result; // Convert string to number uhash_iput(newNumericCodeMap.getAlias(),aliasFromRegion->code,(void *)aliasFromRegion,&status); } else { aliasFromRegion->code = -1; } aliasFromRegion->fType = URGN_DEPRECATED; } else { aliasFromRegion->fType = URGN_DEPRECATED; } { LocalPointer newPreferredValues(new UVector(uprv_deleteUObject, uhash_compareUnicodeString, status), status); aliasFromRegion->preferredValues = newPreferredValues.orphan(); } if( U_FAILURE(status)) { return; } UnicodeString currentRegion; //currentRegion.remove(); TODO: was already 0 length? 
for (int32_t i = 0 ; i < aliasTo.length() && U_SUCCESS(status); i++ ) { if ( aliasTo.charAt(i) != 0x0020 ) { currentRegion.append(aliasTo.charAt(i)); } if ( aliasTo.charAt(i) == 0x0020 || i+1 == aliasTo.length() ) { Region *target = (Region *)uhash_get(newRegionIDMap.getAlias(),(void *)¤tRegion); if (target) { LocalPointer preferredValue(new UnicodeString(target->idStr), status); aliasFromRegion->preferredValues->adoptElement(preferredValue.orphan(),status); // may add null if err } currentRegion.remove(); } } } } // Process the code mappings - This will allow us to assign numeric codes to most of the territories. while (U_SUCCESS(status) && ures_hasNext(codeMappings.getAlias())) { UResourceBundle *mapping = ures_getNextResource(codeMappings.getAlias(),nullptr,&status); if (U_SUCCESS(status) && ures_getType(mapping) == URES_ARRAY && ures_getSize(mapping) == 3) { UnicodeString codeMappingID = ures_getUnicodeStringByIndex(mapping,0,&status); UnicodeString codeMappingNumber = ures_getUnicodeStringByIndex(mapping,1,&status); UnicodeString codeMapping3Letter = ures_getUnicodeStringByIndex(mapping,2,&status); Region *r = (Region *)uhash_get(newRegionIDMap.getAlias(),(void *)&codeMappingID); if ( r ) { int32_t pos = 0; int32_t result = ICU_Utility::parseAsciiInteger(codeMappingNumber, pos); if ( pos > 0 ) { r->code = result; // Convert string to number uhash_iput(newNumericCodeMap.getAlias(),r->code,(void *)r,&status); } LocalPointer code3(new UnicodeString(codeMapping3Letter), status); uhash_put(newRegionAliases.getAlias(),(void *)code3.orphan(), (void *)r,&status); } } ures_close(mapping); } // Now fill in the special cases for WORLD, UNKNOWN, CONTINENTS, and GROUPINGS Region *r; UnicodeString WORLD_ID_STRING(WORLD_ID); r = (Region *) uhash_get(newRegionIDMap.getAlias(),(void *)&WORLD_ID_STRING); if ( r ) { r->fType = URGN_WORLD; } UnicodeString UNKNOWN_REGION_ID_STRING(UNKNOWN_REGION_ID); r = (Region *) uhash_get(newRegionIDMap.getAlias(),(void 
*)&UNKNOWN_REGION_ID_STRING); if ( r ) { r->fType = URGN_UNKNOWN; } for ( int32_t i = 0 ; i < continents->size() ; i++ ) { r = (Region *) uhash_get(newRegionIDMap.getAlias(),(void *)continents->elementAt(i)); if ( r ) { r->fType = URGN_CONTINENT; } } for ( int32_t i = 0 ; i < groupings->size() ; i++ ) { r = (Region *) uhash_get(newRegionIDMap.getAlias(),(void *)groupings->elementAt(i)); if ( r ) { r->fType = URGN_GROUPING; } } // Special case: The region code "QO" (Outlying Oceania) is a subcontinent code added by CLDR // even though it looks like a territory code. Need to handle it here. UnicodeString OUTLYING_OCEANIA_REGION_ID_STRING(OUTLYING_OCEANIA_REGION_ID); r = (Region *) uhash_get(newRegionIDMap.getAlias(),(void *)&OUTLYING_OCEANIA_REGION_ID_STRING); if ( r ) { r->fType = URGN_SUBCONTINENT; } // Load territory containment info from the supplemental data. while ( ures_hasNext(territoryContainment.getAlias()) ) { LocalUResourceBundlePointer mapping(ures_getNextResource(territoryContainment.getAlias(),nullptr,&status)); if( U_FAILURE(status) ) { return; // error out } const char *parent = ures_getKey(mapping.getAlias()); if (uprv_strcmp(parent, "containedGroupings") == 0 || uprv_strcmp(parent, "deprecated") == 0) { continue; // handle new pseudo-parent types added in ICU data per cldrbug 7808; for now just skip. // #11232 is to do something useful with these. 
} UnicodeString parentStr = UnicodeString(parent, -1 , US_INV); Region *parentRegion = (Region *) uhash_get(newRegionIDMap.getAlias(),(void *)&parentStr); for ( int j = 0 ; j < ures_getSize(mapping.getAlias()); j++ ) { UnicodeString child = ures_getUnicodeStringByIndex(mapping.getAlias(),j,&status); Region *childRegion = (Region *) uhash_get(newRegionIDMap.getAlias(),(void *)&child); if ( parentRegion != nullptr && childRegion != nullptr ) { // Add the child region to the set of regions contained by the parent if (parentRegion->containedRegions == nullptr) { LocalPointer lpContainedRegions( new UVector(uprv_deleteUObject, uhash_compareUnicodeString, status), status); parentRegion->containedRegions = lpContainedRegions.orphan(); if (U_FAILURE(status)) { return; } } LocalPointer childStr(new UnicodeString(), status); if (U_FAILURE(status)) { return; // error out } childStr->fastCopyFrom(childRegion->idStr); parentRegion->containedRegions->adoptElement(childStr.orphan(),status); if (U_FAILURE(status)) { return; } // Set the parent region to be the containing region of the child. // Regions of type GROUPING can't be set as the parent, since another region // such as a SUBCONTINENT, CONTINENT, or WORLD must always be the parent. 
if ( parentRegion->fType != URGN_GROUPING) { childRegion->containingRegion = parentRegion; } } } } // Create the availableRegions lists int32_t pos = UHASH_FIRST; while ( const UHashElement* element = uhash_nextElement(newRegionIDMap.getAlias(),&pos)) { Region *ar = (Region *)element->value.pointer; if ( availableRegions[ar->fType] == nullptr ) { LocalPointer newAr(new UVector(uprv_deleteUObject, uhash_compareUnicodeString, status), status); availableRegions[ar->fType] = newAr.orphan(); } LocalPointer arString(new UnicodeString(ar->idStr), status); if( U_FAILURE(status) ) { return; // error out } availableRegions[ar->fType]->adoptElement(arString.orphan(), status); } // copy hashtables numericCodeMap = newNumericCodeMap.orphan(); regionIDMap = newRegionIDMap.orphan(); regionAliases = newRegionAliases.orphan(); } void Region::cleanupRegionData() { for (int32_t i = 0 ; i < URGN_LIMIT ; i++ ) { if ( availableRegions[i] ) { delete availableRegions[i]; availableRegions[i] = nullptr; } } if (regionAliases) { uhash_close(regionAliases); } if (numericCodeMap) { uhash_close(numericCodeMap); } if (regionIDMap) { uhash_close(regionIDMap); } if (allRegions) { delete allRegions; allRegions = nullptr; } regionAliases = numericCodeMap = regionIDMap = nullptr; gRegionDataInitOnce.reset(); } Region::Region () : code(-1), fType(URGN_UNKNOWN), containingRegion(nullptr), containedRegions(nullptr), preferredValues(nullptr) { id[0] = 0; } Region::~Region () { if (containedRegions) { delete containedRegions; } if (preferredValues) { delete preferredValues; } } /** * Returns true if the two regions are equal. * Per PMC, just use pointer compare, since we have at most one instance of each Region. */ bool Region::operator==(const Region &that) const { return (idStr == that.idStr); } /** * Returns true if the two regions are NOT equal; that is, if operator ==() returns false. * Per PMC, just use pointer compare, since we have at most one instance of each Region. 
*/ bool Region::operator!=(const Region &that) const { return (idStr != that.idStr); } /** * Returns a pointer to a Region using the given region code. The region code can be either 2-letter ISO code, * 3-letter ISO code, UNM.49 numeric code, or other valid Unicode Region Code as defined by the LDML specification. * The identifier will be canonicalized internally using the supplemental metadata as defined in the CLDR. * If the region code is nullptr or not recognized, the appropriate error code will be set ( U_ILLEGAL_ARGUMENT_ERROR ) */ const Region* U_EXPORT2 Region::getInstance(const char *region_code, UErrorCode &status) { umtx_initOnce(gRegionDataInitOnce, &loadRegionData, status); if (U_FAILURE(status)) { return nullptr; } if ( !region_code ) { status = U_ILLEGAL_ARGUMENT_ERROR; return nullptr; } UnicodeString regionCodeString = UnicodeString(region_code, -1, US_INV); Region *r = (Region *)uhash_get(regionIDMap,(void *)®ionCodeString); if ( !r ) { r = (Region *)uhash_get(regionAliases,(void *)®ionCodeString); } if ( !r ) { // Unknown region code status = U_ILLEGAL_ARGUMENT_ERROR; return nullptr; } if ( r->fType == URGN_DEPRECATED && r->preferredValues->size() == 1) { StringEnumeration *pv = r->getPreferredValues(status); pv->reset(status); const UnicodeString *ustr = pv->snext(status); r = (Region *)uhash_get(regionIDMap,(void *)ustr); delete pv; } return r; } /** * Returns a pointer to a Region using the given numeric region code. If the numeric region code is not recognized, * the appropriate error code will be set ( U_ILLEGAL_ARGUMENT_ERROR ). */ const Region* U_EXPORT2 Region::getInstance (int32_t code, UErrorCode &status) { umtx_initOnce(gRegionDataInitOnce, &loadRegionData, status); if (U_FAILURE(status)) { return nullptr; } Region *r = (Region *)uhash_iget(numericCodeMap,code); if ( !r ) { // Just in case there's an alias that's numeric, try to find it. 
UnicodeString id; ICU_Utility::appendNumber(id, code, 10, 1); r = (Region *)uhash_get(regionAliases,&id); } if( U_FAILURE(status) ) { return nullptr; } if ( !r ) { status = U_ILLEGAL_ARGUMENT_ERROR; return nullptr; } if ( r->fType == URGN_DEPRECATED && r->preferredValues->size() == 1) { StringEnumeration *pv = r->getPreferredValues(status); pv->reset(status); const UnicodeString *ustr = pv->snext(status); r = (Region *)uhash_get(regionIDMap,(void *)ustr); delete pv; } return r; } /** * Returns an enumeration over the IDs of all known regions that match the given type. */ StringEnumeration* U_EXPORT2 Region::getAvailable(URegionType type, UErrorCode &status) { umtx_initOnce(gRegionDataInitOnce, &loadRegionData, status); // returns immediately if U_FAILURE(status) if (U_FAILURE(status)) { return nullptr; } return new RegionNameEnumeration(availableRegions[type],status); } /** * Returns a pointer to the region that contains this region. Returns nullptr if this region is code "001" (World) * or "ZZ" (Unknown region). For example, calling this method with region "IT" (Italy) returns the * region "039" (Southern Europe). */ const Region* Region::getContainingRegion() const { UErrorCode status = U_ZERO_ERROR; umtx_initOnce(gRegionDataInitOnce, &loadRegionData, status); return containingRegion; } /** * Return a pointer to the region that geographically contains this region and matches the given type, * moving multiple steps up the containment chain if necessary. Returns nullptr if no containing region can be found * that matches the given type. Note: The URegionTypes = "URGN_GROUPING", "URGN_DEPRECATED", or "URGN_UNKNOWN" * are not appropriate for use in this API. nullptr will be returned in this case. For example, calling this method * with region "IT" (Italy) for type "URGN_CONTINENT" returns the region "150" ( Europe ). 
*/ const Region* Region::getContainingRegion(URegionType type) const { UErrorCode status = U_ZERO_ERROR; umtx_initOnce(gRegionDataInitOnce, &loadRegionData, status); if ( containingRegion == nullptr ) { return nullptr; } return ( containingRegion->fType == type)? containingRegion: containingRegion->getContainingRegion(type); } /** * Return an enumeration over the IDs of all the regions that are immediate children of this region in the * region hierarchy. These returned regions could be either macro regions, territories, or a mixture of the two, * depending on the containment data as defined in CLDR. This API may return nullptr if this region doesn't have * any sub-regions. For example, calling this method with region "150" (Europe) returns an enumeration containing * the various sub regions of Europe - "039" (Southern Europe) - "151" (Eastern Europe) - "154" (Northern Europe) * and "155" (Western Europe). */ StringEnumeration* Region::getContainedRegions(UErrorCode &status) const { umtx_initOnce(gRegionDataInitOnce, &loadRegionData, status); // returns immediately if U_FAILURE(status) if (U_FAILURE(status)) { return nullptr; } return new RegionNameEnumeration(containedRegions,status); } /** * Returns an enumeration over the IDs of all the regions that are children of this region anywhere in the region * hierarchy and match the given type. This API may return an empty enumeration if this region doesn't have any * sub-regions that match the given type. For example, calling this method with region "150" (Europe) and type * "URGN_TERRITORY" returns a set containing all the territories in Europe ( "FR" (France) - "IT" (Italy) - "DE" (Germany) etc. 
) */ StringEnumeration* Region::getContainedRegions( URegionType type, UErrorCode &status ) const { umtx_initOnce(gRegionDataInitOnce, &loadRegionData, status); // returns immediately if U_FAILURE(status) UVector result(nullptr, uhash_compareChars, status); LocalPointer cr(getContainedRegions(status), status); if (U_FAILURE(status)) { return nullptr; } const char *regionId; while((regionId = cr->next(nullptr, status)) != nullptr && U_SUCCESS(status)) { const Region *r = Region::getInstance(regionId, status); if ( r->getType() == type) { result.addElement(const_cast(&r->idStr), status); } else { LocalPointer children(r->getContainedRegions(type, status)); const char *id2; while(U_SUCCESS(status) && ((id2 = children->next(nullptr, status)) != nullptr)) { const Region *r2 = Region::getInstance(id2,status); result.addElement(const_cast(&r2->idStr), status); } } } LocalPointer resultEnumeration( new RegionNameEnumeration(&result, status), status); return U_SUCCESS(status) ? resultEnumeration.orphan() : nullptr; } /** * Returns true if this region contains the supplied other region anywhere in the region hierarchy. */ UBool Region::contains(const Region &other) const { UErrorCode status = U_ZERO_ERROR; umtx_initOnce(gRegionDataInitOnce, &loadRegionData, status); if (!containedRegions) { return false; } if (containedRegions->contains((void *)&other.idStr)) { return true; } else { for ( int32_t i = 0 ; i < containedRegions->size() ; i++ ) { UnicodeString *crStr = (UnicodeString *)containedRegions->elementAt(i); Region *cr = (Region *) uhash_get(regionIDMap,(void *)crStr); if ( cr && cr->contains(other) ) { return true; } } } return false; } /** * For deprecated regions, return an enumeration over the IDs of the regions that are the preferred replacement * regions for this region. Returns nullptr for a non-deprecated region. 
For example, calling this method with region * "SU" (Soviet Union) would return a list of the regions containing "RU" (Russia), "AM" (Armenia), "AZ" (Azerbaijan), etc... */ StringEnumeration* Region::getPreferredValues(UErrorCode &status) const { umtx_initOnce(gRegionDataInitOnce, &loadRegionData, status); // returns immediately if U_FAILURE(status) if (U_FAILURE(status) || fType != URGN_DEPRECATED) { return nullptr; } return new RegionNameEnumeration(preferredValues,status); } /** * Return this region's canonical region code. */ const char* Region::getRegionCode() const { return id; } int32_t Region::getNumericCode() const { return code; } /** * Returns the region type of this region. */ URegionType Region::getType() const { return fType; } RegionNameEnumeration::RegionNameEnumeration(UVector *nameList, UErrorCode& status) : pos(0), fRegionNames(nullptr) { // TODO: https://unicode-org.atlassian.net/browse/ICU-21829 // Is all of the copying going on here really necessary? if (nameList && U_SUCCESS(status)) { LocalPointer regionNames( new UVector(uprv_deleteUObject, uhash_compareUnicodeString, nameList->size(), status), status); for ( int32_t i = 0 ; U_SUCCESS(status) && i < nameList->size() ; i++ ) { UnicodeString* this_region_name = (UnicodeString *)nameList->elementAt(i); LocalPointer new_region_name(new UnicodeString(*this_region_name), status); regionNames->adoptElement(new_region_name.orphan(), status); } if (U_SUCCESS(status)) { fRegionNames = regionNames.orphan(); } } } const UnicodeString* RegionNameEnumeration::snext(UErrorCode& status) { if (U_FAILURE(status) || (fRegionNames==nullptr)) { return nullptr; } const UnicodeString* nextStr = (const UnicodeString *)fRegionNames->elementAt(pos); if (nextStr!=nullptr) { pos++; } return nextStr; } void RegionNameEnumeration::reset(UErrorCode& /*status*/) { pos=0; } int32_t RegionNameEnumeration::count(UErrorCode& /*status*/) const { return (fRegionNames==nullptr) ? 
0 : fRegionNames->size(); } RegionNameEnumeration::~RegionNameEnumeration() { delete fRegionNames; } U_NAMESPACE_END #endif /* #if !UCONFIG_NO_FORMATTING */ //eof stringi/src/icu74/i18n/nortrans.cpp0000644000176200001440000001450514700200761016606 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ********************************************************************** * Copyright (C) 2001-2011, International Business Machines * Corporation and others. All Rights Reserved. ********************************************************************** * Date Name Description * 07/03/01 aliu Creation. ********************************************************************** */ #include "unicode/utypes.h" #if !UCONFIG_NO_TRANSLITERATION #include "unicode/normalizer2.h" #include "unicode/utf16.h" #include "cstring.h" #include "nortrans.h" U_NAMESPACE_BEGIN UOBJECT_DEFINE_RTTI_IMPLEMENTATION(NormalizationTransliterator) static inline Transliterator::Token cstrToken(const char *s) { return Transliterator::pointerToken((void *)s); } /** * System registration hook. */ void NormalizationTransliterator::registerIDs() { // In the Token, the byte after the NUL is the UNormalization2Mode. 
Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFC"), _create, cstrToken("nfc\0\0")); Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFKC"), _create, cstrToken("nfkc\0\0")); Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFD"), _create, cstrToken("nfc\0\1")); Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFKD"), _create, cstrToken("nfkc\0\1")); Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-FCD"), _create, cstrToken("nfc\0\2")); Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-FCC"), _create, cstrToken("nfc\0\3")); Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("NFC"), UNICODE_STRING_SIMPLE("NFD"), true); Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("NFKC"), UNICODE_STRING_SIMPLE("NFKD"), true); Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("FCC"), UNICODE_STRING_SIMPLE("NFD"), false); Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("FCD"), UNICODE_STRING_SIMPLE("FCD"), false); } /** * Factory methods */ Transliterator* NormalizationTransliterator::_create(const UnicodeString& ID, Token context) { const char *name = (const char *)context.pointer; UNormalization2Mode mode = (UNormalization2Mode)uprv_strchr(name, 0)[1]; UErrorCode errorCode = U_ZERO_ERROR; const Normalizer2 *norm2 = Normalizer2::getInstance(nullptr, name, mode, errorCode); if(U_SUCCESS(errorCode)) { return new NormalizationTransliterator(ID, *norm2); } else { return nullptr; } } /** * Constructs a transliterator. */ NormalizationTransliterator::NormalizationTransliterator(const UnicodeString& id, const Normalizer2 &norm2) : Transliterator(id, 0), fNorm2(norm2) {} /** * Destructor. */ NormalizationTransliterator::~NormalizationTransliterator() { } /** * Copy constructor. */ NormalizationTransliterator::NormalizationTransliterator(const NormalizationTransliterator& o) : Transliterator(o), fNorm2(o.fNorm2) {} /** * Transliterator API. 
*/ NormalizationTransliterator* NormalizationTransliterator::clone() const { return new NormalizationTransliterator(*this); } /** * Implements {@link Transliterator#handleTransliterate}. */ void NormalizationTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets, UBool isIncremental) const { // start and limit of the input range int32_t start = offsets.start; int32_t limit = offsets.limit; if(start >= limit) { return; } /* * Normalize as short chunks at a time as possible even in * bulk mode, so that styled text is minimally disrupted. * In incremental mode, a chunk that ends with offsets.limit * must not be normalized. * * If it was known that the input text is not styled, then * a bulk mode normalization could look like this: UnicodeString input, normalized; int32_t length = limit - start; _Replaceable_extractBetween(text, start, limit, input.getBuffer(length)); input.releaseBuffer(length); UErrorCode status = U_ZERO_ERROR; fNorm2.normalize(input, normalized, status); text.handleReplaceBetween(start, limit, normalized); int32_t delta = normalized.length() - length; offsets.contextLimit += delta; offsets.limit += delta; offsets.start = limit + delta; */ UErrorCode errorCode = U_ZERO_ERROR; UnicodeString segment; UnicodeString normalized; UChar32 c = text.char32At(start); do { int32_t prev = start; // Skip at least one character so we make progress. // c holds the character at start. 
segment.remove(); do { segment.append(c); start += U16_LENGTH(c); } while(start < limit && !fNorm2.hasBoundaryBefore(c = text.char32At(start))); if(start == limit && isIncremental && !fNorm2.hasBoundaryAfter(c)) { // stop in incremental mode when we reach the input limit // in case there are additional characters that could change the // normalization result start=prev; break; } fNorm2.normalize(segment, normalized, errorCode); if(U_FAILURE(errorCode)) { break; } if(segment != normalized) { // replace the input chunk with its normalized form text.handleReplaceBetween(prev, start, normalized); // update all necessary indexes accordingly int32_t delta = normalized.length() - (start - prev); start += delta; limit += delta; } } while(start < limit); offsets.start = start; offsets.contextLimit += limit - offsets.limit; offsets.limit = limit; } U_NAMESPACE_END #endif /* #if !UCONFIG_NO_TRANSLITERATION */ stringi/src/icu74/i18n/csrsbcs.h0000644000176200001440000001550214700200761016045 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ********************************************************************** * Copyright (C) 2005-2015, International Business Machines * Corporation and others. All Rights Reserved. ********************************************************************** */ #ifndef __CSRSBCS_H #define __CSRSBCS_H #include "unicode/uobject.h" #if !UCONFIG_NO_CONVERSION #include "csrecog.h" U_NAMESPACE_BEGIN class NGramParser : public UMemory { private: int32_t ngram; const int32_t *ngramList; int32_t ngramCount; int32_t hitCount; protected: int32_t byteIndex; const uint8_t *charMap; void addByte(int32_t b); public: NGramParser(const int32_t *theNgramList, const uint8_t *theCharMap); virtual ~NGramParser(); private: /* * Binary search for value in table, which must have exactly 64 entries. 
*/ int32_t search(const int32_t *table, int32_t value); void lookup(int32_t thisNgram); virtual int32_t nextByte(InputText *det); virtual void parseCharacters(InputText *det); public: int32_t parse(InputText *det); }; #if !UCONFIG_ONLY_HTML_CONVERSION class NGramParser_IBM420 : public NGramParser { public: NGramParser_IBM420(const int32_t *theNgramList, const uint8_t *theCharMap); ~NGramParser_IBM420(); private: int32_t alef; int32_t isLamAlef(int32_t b); int32_t nextByte(InputText *det) override; void parseCharacters(InputText *det) override; }; #endif class CharsetRecog_sbcs : public CharsetRecognizer { public: CharsetRecog_sbcs(); virtual ~CharsetRecog_sbcs(); virtual const char *getName() const override = 0; virtual UBool match(InputText *det, CharsetMatch *results) const override = 0; virtual int32_t match_sbcs(InputText *det, const int32_t ngrams[], const uint8_t charMap[]) const; }; class CharsetRecog_8859_1 : public CharsetRecog_sbcs { public: virtual ~CharsetRecog_8859_1(); const char *getName() const override; virtual UBool match(InputText *det, CharsetMatch *results) const override; }; class CharsetRecog_8859_2 : public CharsetRecog_sbcs { public: virtual ~CharsetRecog_8859_2(); const char *getName() const override; virtual UBool match(InputText *det, CharsetMatch *results) const override; }; class CharsetRecog_8859_5 : public CharsetRecog_sbcs { public: virtual ~CharsetRecog_8859_5(); const char *getName() const override; }; class CharsetRecog_8859_6 : public CharsetRecog_sbcs { public: virtual ~CharsetRecog_8859_6(); const char *getName() const override; }; class CharsetRecog_8859_7 : public CharsetRecog_sbcs { public: virtual ~CharsetRecog_8859_7(); const char *getName() const override; }; class CharsetRecog_8859_8 : public CharsetRecog_sbcs { public: virtual ~CharsetRecog_8859_8(); virtual const char *getName() const override; }; class CharsetRecog_8859_9 : public CharsetRecog_sbcs { public: virtual ~CharsetRecog_8859_9(); const char *getName() const 
override; }; class CharsetRecog_8859_5_ru : public CharsetRecog_8859_5 { public: virtual ~CharsetRecog_8859_5_ru(); const char *getLanguage() const override; virtual UBool match(InputText *det, CharsetMatch *results) const override; }; class CharsetRecog_8859_6_ar : public CharsetRecog_8859_6 { public: virtual ~CharsetRecog_8859_6_ar(); const char *getLanguage() const override; virtual UBool match(InputText *det, CharsetMatch *results) const override; }; class CharsetRecog_8859_7_el : public CharsetRecog_8859_7 { public: virtual ~CharsetRecog_8859_7_el(); const char *getLanguage() const override; virtual UBool match(InputText *det, CharsetMatch *results) const override; }; class CharsetRecog_8859_8_I_he : public CharsetRecog_8859_8 { public: virtual ~CharsetRecog_8859_8_I_he(); const char *getName() const override; const char *getLanguage() const override; virtual UBool match(InputText *det, CharsetMatch *results) const override; }; class CharsetRecog_8859_8_he : public CharsetRecog_8859_8 { public: virtual ~CharsetRecog_8859_8_he (); const char *getLanguage() const override; virtual UBool match(InputText *det, CharsetMatch *results) const override; }; class CharsetRecog_8859_9_tr : public CharsetRecog_8859_9 { public: virtual ~CharsetRecog_8859_9_tr (); const char *getLanguage() const override; virtual UBool match(InputText *det, CharsetMatch *results) const override; }; class CharsetRecog_windows_1256 : public CharsetRecog_sbcs { public: virtual ~CharsetRecog_windows_1256(); const char *getName() const override; const char *getLanguage() const override; virtual UBool match(InputText *det, CharsetMatch *results) const override; }; class CharsetRecog_windows_1251 : public CharsetRecog_sbcs { public: virtual ~CharsetRecog_windows_1251(); const char *getName() const override; const char *getLanguage() const override; virtual UBool match(InputText *det, CharsetMatch *results) const override; }; class CharsetRecog_KOI8_R : public CharsetRecog_sbcs { public: virtual 
~CharsetRecog_KOI8_R(); const char *getName() const override; const char *getLanguage() const override; virtual UBool match(InputText *det, CharsetMatch *results) const override; }; #if !UCONFIG_ONLY_HTML_CONVERSION class CharsetRecog_IBM424_he : public CharsetRecog_sbcs { public: virtual ~CharsetRecog_IBM424_he(); const char *getLanguage() const override; }; class CharsetRecog_IBM424_he_rtl : public CharsetRecog_IBM424_he { public: virtual ~CharsetRecog_IBM424_he_rtl(); const char *getName() const override; virtual UBool match(InputText *det, CharsetMatch *results) const override; }; class CharsetRecog_IBM424_he_ltr : public CharsetRecog_IBM424_he { virtual ~CharsetRecog_IBM424_he_ltr(); const char *getName() const override; virtual UBool match(InputText *det, CharsetMatch *results) const override; }; class CharsetRecog_IBM420_ar : public CharsetRecog_sbcs { public: virtual ~CharsetRecog_IBM420_ar(); const char *getLanguage() const override; int32_t match_sbcs(InputText *det, const int32_t ngrams[], const uint8_t charMap[]) const override; }; class CharsetRecog_IBM420_ar_rtl : public CharsetRecog_IBM420_ar { public: virtual ~CharsetRecog_IBM420_ar_rtl(); const char *getName() const override; virtual UBool match(InputText *det, CharsetMatch *results) const override; }; class CharsetRecog_IBM420_ar_ltr : public CharsetRecog_IBM420_ar { virtual ~CharsetRecog_IBM420_ar_ltr(); const char *getName() const override; virtual UBool match(InputText *det, CharsetMatch *results) const override; }; #endif U_NAMESPACE_END #endif /* !UCONFIG_NO_CONVERSION */ #endif /* __CSRSBCS_H */ stringi/src/icu74/i18n/nfrule.h0000644000176200001440000001063414700200761015677 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ******************************************************************************* * Copyright (C) 1997-2015, International Business Machines * Corporation and others. All Rights Reserved. 
******************************************************************************* */ #ifndef NFRULE_H #define NFRULE_H #include "unicode/rbnf.h" #if U_HAVE_RBNF #include "unicode/utypes.h" #include "unicode/uobject.h" #include "unicode/unistr.h" U_NAMESPACE_BEGIN class FieldPosition; class Formattable; class NFRuleList; class NFRuleSet; class NFSubstitution; class ParsePosition; class PluralFormat; class RuleBasedNumberFormat; class UnicodeString; class NFRule : public UMemory { public: enum ERuleType { kNoBase = 0, kNegativeNumberRule = -1, kImproperFractionRule = -2, kProperFractionRule = -3, kDefaultRule = -4, kInfinityRule = -5, kNaNRule = -6, kOtherRule = -7 }; static void makeRules(UnicodeString& definition, NFRuleSet* ruleSet, const NFRule* predecessor, const RuleBasedNumberFormat* rbnf, NFRuleList& ruleList, UErrorCode& status); NFRule(const RuleBasedNumberFormat* rbnf, const UnicodeString &ruleText, UErrorCode &status); ~NFRule(); bool operator==(const NFRule& rhs) const; bool operator!=(const NFRule& rhs) const { return !operator==(rhs); } ERuleType getType() const { return (ERuleType)(baseValue <= kNoBase ? 
(ERuleType)baseValue : kOtherRule); } void setType(ERuleType ruleType) { baseValue = (int32_t)ruleType; } int64_t getBaseValue() const { return baseValue; } void setBaseValue(int64_t value, UErrorCode& status); char16_t getDecimalPoint() const { return decimalPoint; } int64_t getDivisor() const; void doFormat(int64_t number, UnicodeString& toAppendTo, int32_t pos, int32_t recursionCount, UErrorCode& status) const; void doFormat(double number, UnicodeString& toAppendTo, int32_t pos, int32_t recursionCount, UErrorCode& status) const; UBool doParse(const UnicodeString& text, ParsePosition& pos, UBool isFractional, double upperBound, uint32_t nonNumericalExecutedRuleMask, Formattable& result) const; UBool shouldRollBack(int64_t number) const; void _appendRuleText(UnicodeString& result) const; int32_t findTextLenient(const UnicodeString& str, const UnicodeString& key, int32_t startingAt, int32_t* resultCount) const; void setDecimalFormatSymbols(const DecimalFormatSymbols &newSymbols, UErrorCode& status); private: void parseRuleDescriptor(UnicodeString& descriptor, UErrorCode& status); void extractSubstitutions(const NFRuleSet* ruleSet, const UnicodeString &ruleText, const NFRule* predecessor, UErrorCode& status); NFSubstitution* extractSubstitution(const NFRuleSet* ruleSet, const NFRule* predecessor, UErrorCode& status); int16_t expectedExponent() const; int32_t indexOfAnyRulePrefix() const; double matchToDelimiter(const UnicodeString& text, int32_t startPos, double baseValue, const UnicodeString& delimiter, ParsePosition& pp, const NFSubstitution* sub, uint32_t nonNumericalExecutedRuleMask, double upperBound) const; void stripPrefix(UnicodeString& text, const UnicodeString& prefix, ParsePosition& pp) const; int32_t prefixLength(const UnicodeString& str, const UnicodeString& prefix, UErrorCode& status) const; UBool allIgnorable(const UnicodeString& str, UErrorCode& status) const; int32_t findText(const UnicodeString& str, const UnicodeString& key, int32_t startingAt, 
int32_t* resultCount) const; private: int64_t baseValue; int32_t radix; int16_t exponent; char16_t decimalPoint; UnicodeString fRuleText; NFSubstitution* sub1; NFSubstitution* sub2; const RuleBasedNumberFormat* formatter; const PluralFormat* rulePatternFormat; NFRule(const NFRule &other); // forbid copying of this class NFRule &operator=(const NFRule &other); // forbid copying of this class }; U_NAMESPACE_END /* U_HAVE_RBNF */ #endif // NFRULE_H #endif stringi/src/icu74/i18n/rbt.h0000644000176200001440000002027514700200761015175 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ********************************************************************** * Copyright (C) 1999-2007, International Business Machines * Corporation and others. All Rights Reserved. ********************************************************************** * Date Name Description * 11/17/99 aliu Creation. ********************************************************************** */ #ifndef RBT_H #define RBT_H #include "unicode/utypes.h" #if !UCONFIG_NO_TRANSLITERATION #include "unicode/translit.h" #include "unicode/utypes.h" #include "unicode/parseerr.h" #include "unicode/udata.h" #define U_ICUDATA_TRANSLIT U_ICUDATA_NAME U_TREE_SEPARATOR_STRING "translit" U_NAMESPACE_BEGIN class TransliterationRuleData; /** * RuleBasedTransliterator is a transliterator * built from a set of rules as defined for * Transliterator::createFromRules(). * See the C++ class Transliterator documentation for the rule syntax. * * @author Alan Liu * @internal Use transliterator factory methods instead since this class will be removed in that release. */ class RuleBasedTransliterator : public Transliterator { private: /** * The data object is immutable, so we can freely share it with * other instances of RBT, as long as we do NOT own this object. * TODO: data is no longer immutable. 
See bugs #1866, 2155 */ TransliterationRuleData* fData; /** * If true, we own the data object and must delete it. */ UBool isDataOwned; public: /** * Constructs a new transliterator from the given rules. * @param rules rules, separated by ';' * @param direction either FORWARD or REVERSE. * @exception IllegalArgumentException if rules are malformed. * @internal Use transliterator factory methods instead since this class will be removed in that release. */ RuleBasedTransliterator(const UnicodeString& id, const UnicodeString& rules, UTransDirection direction, UnicodeFilter* adoptedFilter, UParseError& parseError, UErrorCode& status); /** * Constructs a new transliterator from the given rules. * @param rules rules, separated by ';' * @param direction either FORWARD or REVERSE. * @exception IllegalArgumentException if rules are malformed. * @internal Use transliterator factory methods instead since this class will be removed in that release. */ /*RuleBasedTransliterator(const UnicodeString& id, const UnicodeString& rules, UTransDirection direction, UnicodeFilter* adoptedFilter, UErrorCode& status);*/ /** * Convenience constructor with no filter. * @internal Use transliterator factory methods instead since this class will be removed in that release. */ /*RuleBasedTransliterator(const UnicodeString& id, const UnicodeString& rules, UTransDirection direction, UErrorCode& status);*/ /** * Convenience constructor with no filter and FORWARD direction. * @internal Use transliterator factory methods instead since this class will be removed in that release. */ /*RuleBasedTransliterator(const UnicodeString& id, const UnicodeString& rules, UErrorCode& status);*/ /** * Convenience constructor with FORWARD direction. * @internal Use transliterator factory methods instead since this class will be removed in that release. 
*/ /*RuleBasedTransliterator(const UnicodeString& id, const UnicodeString& rules, UnicodeFilter* adoptedFilter, UErrorCode& status);*/ private: friend class TransliteratorRegistry; // to access TransliterationRuleData convenience ctor /** * Convenience constructor. * @param id the id for the transliterator. * @param theData the rule data for the transliterator. * @param adoptedFilter the filter for the transliterator */ RuleBasedTransliterator(const UnicodeString& id, const TransliterationRuleData* theData, UnicodeFilter* adoptedFilter = 0); friend class Transliterator; // to access the following ctor /** * Internal constructor. * @param id the id for the transliterator. * @param data the rule data for the transliterator. * @param isDataAdopted determines who will own the 'data' object. If true, the caller should not delete 'data'. */ RuleBasedTransliterator(const UnicodeString& id, TransliterationRuleData* data, UBool isDataAdopted); public: /** * Copy constructor. * @internal Use transliterator factory methods instead since this class will be removed in that release. */ RuleBasedTransliterator(const RuleBasedTransliterator&); virtual ~RuleBasedTransliterator(); /** * Implement Transliterator API. * @internal Use transliterator factory methods instead since this class will be removed in that release. */ virtual RuleBasedTransliterator* clone() const override; protected: /** * Implements {@link Transliterator#handleTransliterate}. * @internal Use transliterator factory methods instead since this class will be removed in that release. */ virtual void handleTransliterate(Replaceable& text, UTransPosition& offsets, UBool isIncremental) const override; public: /** * Return a representation of this transliterator as source rules. * These rules will produce an equivalent transliterator if used * to construct a new transliterator. * @param result the string to receive the rules. Previous * contents will be deleted. 
* @param escapeUnprintable if true then convert unprintable * character to their hex escape representations, \uxxxx or * \Uxxxxxxxx. Unprintable characters are those other than * U+000A, U+0020..U+007E. * @internal Use transliterator factory methods instead since this class will be removed in that release. */ virtual UnicodeString& toRules(UnicodeString& result, UBool escapeUnprintable) const override; protected: /** * Implement Transliterator framework */ virtual void handleGetSourceSet(UnicodeSet& result) const override; public: /** * Override Transliterator framework */ virtual UnicodeSet& getTargetSet(UnicodeSet& result) const override; /** * Return the class ID for this class. This is useful only for * comparing to a return value from getDynamicClassID(). For example: *
     * .      Base* polymorphic_pointer = createPolymorphicObject();
     * .      if (polymorphic_pointer->getDynamicClassID() ==
     * .          Derived::getStaticClassID()) ...
     * 
* @return The class ID for all objects of this class. * @internal Use transliterator factory methods instead since this class will be removed in that release. */ U_I18N_API static UClassID U_EXPORT2 getStaticClassID(); /** * Returns a unique class ID polymorphically. This method * is to implement a simple version of RTTI, since not all C++ * compilers support genuine RTTI. Polymorphic operator==() and * clone() methods call this method. * * @return The class ID for this object. All objects of a given * class have the same class ID. Objects of other classes have * different class IDs. */ virtual UClassID getDynamicClassID() const override; private: void _construct(const UnicodeString& rules, UTransDirection direction, UParseError& parseError, UErrorCode& status); }; U_NAMESPACE_END #endif /* #if !UCONFIG_NO_TRANSLITERATION */ #endif stringi/src/icu74/i18n/decNumber.h0000644000176200001440000003176214700200761016315 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ------------------------------------------------------------------ */ /* Decimal Number arithmetic module header */ /* ------------------------------------------------------------------ */ /* Copyright (c) IBM Corporation, 2000-2010. All rights reserved. */ /* */ /* This software is made available under the terms of the */ /* ICU License -- ICU 1.8.1 and later. */ /* */ /* The description and User's Guide ("The decNumber C Library") for */ /* this software is called decNumber.pdf. This document is */ /* available, together with arithmetic and format specifications, */ /* testcases, and Web links, on the General Decimal Arithmetic page. 
*/ /* */ /* Please send comments, suggestions, and corrections to the author: */ /* mfc@uk.ibm.com */ /* Mike Cowlishaw, IBM Fellow */ /* IBM UK, PO Box 31, Birmingham Road, Warwick CV34 5JL, UK */ /* ------------------------------------------------------------------ */ /* Modified version, for use from within ICU. * Renamed public functions, to avoid an unwanted export of the * standard names from the ICU library. * * Use ICU's uprv_malloc() and uprv_free() * * Revert comment syntax to plain C * * Remove a few compiler warnings. */ #if !defined(DECNUMBER) #define DECNUMBER #define DECNAME "decNumber" /* Short name */ #define DECFULLNAME "Decimal Number Module" /* Verbose name */ #define DECAUTHOR "Mike Cowlishaw" /* Who to blame */ #if !defined(DECCONTEXT) #include "decContext.h" #endif /* Bit settings for decNumber.bits */ #define DECNEG 0x80 /* Sign; 1=negative, 0=positive or zero */ #define DECINF 0x40 /* 1=Infinity */ #define DECNAN 0x20 /* 1=NaN */ #define DECSNAN 0x10 /* 1=sNaN */ /* The remaining bits are reserved; they must be 0 */ #define DECSPECIAL (DECINF|DECNAN|DECSNAN) /* any special value */ /* Define the decNumber data structure. The size and shape of the */ /* units array in the structure is determined by the following */ /* constant. This must not be changed without recompiling the */ /* decNumber library modules. */ /* For ICU, use one digit per byte, to make it easier to emulate the * old DigitList interface on top of a decNumber */ #define DECDPUN 1 /* DECimal Digits Per UNit [must be >0 */ /* and <10; 3 or powers of 2 are best]. */ /* DECNUMDIGITS is the default number of digits that can be held in */ /* the structure. If undefined, 1 is assumed and it is assumed */ /* that the structure will be immediately followed by extra space, */ /* as required. DECNUMDIGITS is always >0. */ #if !defined(DECNUMDIGITS) #define DECNUMDIGITS 4 #endif /* The size (integer data type) of each unit is determined by the */ /* number of digits it will hold. 
*/ #if DECDPUN<=2 #define decNumberUnit uint8_t #elif DECDPUN<=4 #define decNumberUnit uint16_t #else #define decNumberUnit uint32_t #endif /* The number of units needed is ceil(DECNUMDIGITS/DECDPUN) */ #define DECNUMUNITS ((DECNUMDIGITS+DECDPUN-1)/DECDPUN) /* The data structure... */ typedef struct { int32_t digits; /* Count of digits in the coefficient; >0 */ int32_t exponent; /* Unadjusted exponent, unbiased, in */ /* range: -1999999997 through 999999999 */ uint8_t bits; /* Indicator bits (see above) */ /* Coefficient, from least significant unit */ decNumberUnit lsu[DECNUMUNITS]; } decNumber; /* Notes: */ /* 1. If digits is > DECDPUN then there will be one or more */ /* decNumberUnits immediately following the first element of lsu.*/ /* These contain the remaining (more significant) digits of the */ /* number, and may be in the lsu array, or may be guaranteed by */ /* some other mechanism (such as being contained in another */ /* structure, or being overlaid on dynamically allocated */ /* storage). */ /* */ /* Each integer of the coefficient (except potentially the last) */ /* contains DECDPUN digits (e.g., a value in the range 0 through */ /* 99999999 if DECDPUN is 8, or 0 through 999 if DECDPUN is 3). */ /* */ /* 2. A decNumber converted to a string may need up to digits+14 */ /* characters. 
The worst cases (non-exponential and exponential */ /* formats) are -0.00000{9...}# and -9.{9...}E+999999999# */ /* (where # is '\0') */ /* ---------------------------------------------------------------- */ /* decNumber public functions and macros */ /* ---------------------------------------------------------------- */ /* Conversions */ U_CAPI decNumber * U_EXPORT2 uprv_decNumberFromInt32(decNumber *, int32_t); U_CAPI decNumber * U_EXPORT2 uprv_decNumberFromUInt32(decNumber *, uint32_t); U_CAPI decNumber * U_EXPORT2 uprv_decNumberFromString(decNumber *, const char *, decContext *); U_CAPI char * U_EXPORT2 uprv_decNumberToString(const decNumber *, char *); U_CAPI char * U_EXPORT2 uprv_decNumberToEngString(const decNumber *, char *); U_CAPI uint32_t U_EXPORT2 uprv_decNumberToUInt32(const decNumber *, decContext *); U_CAPI int32_t U_EXPORT2 uprv_decNumberToInt32(const decNumber *, decContext *); U_CAPI uint8_t * U_EXPORT2 uprv_decNumberGetBCD(const decNumber *, uint8_t *); U_CAPI decNumber * U_EXPORT2 uprv_decNumberSetBCD(decNumber *, const uint8_t *, uint32_t); /* Operators and elementary functions */ U_CAPI decNumber * U_EXPORT2 uprv_decNumberAbs(decNumber *, const decNumber *, decContext *); U_CAPI decNumber * U_EXPORT2 uprv_decNumberAdd(decNumber *, const decNumber *, const decNumber *, decContext *); U_CAPI decNumber * U_EXPORT2 uprv_decNumberAnd(decNumber *, const decNumber *, const decNumber *, decContext *); U_CAPI decNumber * U_EXPORT2 uprv_decNumberCompare(decNumber *, const decNumber *, const decNumber *, decContext *); U_CAPI decNumber * U_EXPORT2 uprv_decNumberCompareSignal(decNumber *, const decNumber *, const decNumber *, decContext *); U_CAPI decNumber * U_EXPORT2 uprv_decNumberCompareTotal(decNumber *, const decNumber *, const decNumber *, decContext *); U_CAPI decNumber * U_EXPORT2 uprv_decNumberCompareTotalMag(decNumber *, const decNumber *, const decNumber *, decContext *); U_CAPI decNumber * U_EXPORT2 uprv_decNumberDivide(decNumber *, const 
decNumber *, const decNumber *, decContext *); U_CAPI decNumber * U_EXPORT2 uprv_decNumberDivideInteger(decNumber *, const decNumber *, const decNumber *, decContext *); U_CAPI decNumber * U_EXPORT2 uprv_decNumberExp(decNumber *, const decNumber *, decContext *); U_CAPI decNumber * U_EXPORT2 uprv_decNumberFMA(decNumber *, const decNumber *, const decNumber *, const decNumber *, decContext *); U_CAPI decNumber * U_EXPORT2 uprv_decNumberInvert(decNumber *, const decNumber *, decContext *); U_CAPI decNumber * U_EXPORT2 uprv_decNumberLn(decNumber *, const decNumber *, decContext *); U_CAPI decNumber * U_EXPORT2 uprv_decNumberLogB(decNumber *, const decNumber *, decContext *); U_CAPI decNumber * U_EXPORT2 uprv_decNumberLog10(decNumber *, const decNumber *, decContext *); U_CAPI decNumber * U_EXPORT2 uprv_decNumberMax(decNumber *, const decNumber *, const decNumber *, decContext *); U_CAPI decNumber * U_EXPORT2 uprv_decNumberMaxMag(decNumber *, const decNumber *, const decNumber *, decContext *); U_CAPI decNumber * U_EXPORT2 uprv_decNumberMin(decNumber *, const decNumber *, const decNumber *, decContext *); U_CAPI decNumber * U_EXPORT2 uprv_decNumberMinMag(decNumber *, const decNumber *, const decNumber *, decContext *); U_CAPI decNumber * U_EXPORT2 uprv_decNumberMinus(decNumber *, const decNumber *, decContext *); U_CAPI decNumber * U_EXPORT2 uprv_decNumberMultiply(decNumber *, const decNumber *, const decNumber *, decContext *); U_CAPI decNumber * U_EXPORT2 uprv_decNumberNormalize(decNumber *, const decNumber *, decContext *); U_CAPI decNumber * U_EXPORT2 uprv_decNumberOr(decNumber *, const decNumber *, const decNumber *, decContext *); U_CAPI decNumber * U_EXPORT2 uprv_decNumberPlus(decNumber *, const decNumber *, decContext *); U_CAPI decNumber * U_EXPORT2 uprv_decNumberPower(decNumber *, const decNumber *, const decNumber *, decContext *); U_CAPI decNumber * U_EXPORT2 uprv_decNumberQuantize(decNumber *, const decNumber *, const decNumber *, decContext *); U_CAPI 
decNumber * U_EXPORT2 uprv_decNumberReduce(decNumber *, const decNumber *, decContext *); U_CAPI decNumber * U_EXPORT2 uprv_decNumberRemainder(decNumber *, const decNumber *, const decNumber *, decContext *); U_CAPI decNumber * U_EXPORT2 uprv_decNumberRemainderNear(decNumber *, const decNumber *, const decNumber *, decContext *); U_CAPI decNumber * U_EXPORT2 uprv_decNumberRescale(decNumber *, const decNumber *, const decNumber *, decContext *); U_CAPI decNumber * U_EXPORT2 uprv_decNumberRotate(decNumber *, const decNumber *, const decNumber *, decContext *); U_CAPI decNumber * U_EXPORT2 uprv_decNumberSameQuantum(decNumber *, const decNumber *, const decNumber *); U_CAPI decNumber * U_EXPORT2 uprv_decNumberScaleB(decNumber *, const decNumber *, const decNumber *, decContext *); U_CAPI decNumber * U_EXPORT2 uprv_decNumberShift(decNumber *, const decNumber *, const decNumber *, decContext *); U_CAPI decNumber * U_EXPORT2 uprv_decNumberSquareRoot(decNumber *, const decNumber *, decContext *); U_CAPI decNumber * U_EXPORT2 uprv_decNumberSubtract(decNumber *, const decNumber *, const decNumber *, decContext *); U_CAPI decNumber * U_EXPORT2 uprv_decNumberToIntegralExact(decNumber *, const decNumber *, decContext *); U_CAPI decNumber * U_EXPORT2 uprv_decNumberToIntegralValue(decNumber *, const decNumber *, decContext *); U_CAPI decNumber * U_EXPORT2 uprv_decNumberXor(decNumber *, const decNumber *, const decNumber *, decContext *); /* Utilities */ enum decClass uprv_decNumberClass(const decNumber *, decContext *); U_CAPI const char * U_EXPORT2 uprv_decNumberClassToString(enum decClass); U_CAPI decNumber * U_EXPORT2 uprv_decNumberCopy(decNumber *, const decNumber *); U_CAPI decNumber * U_EXPORT2 uprv_decNumberCopyAbs(decNumber *, const decNumber *); U_CAPI decNumber * U_EXPORT2 uprv_decNumberCopyNegate(decNumber *, const decNumber *); U_CAPI decNumber * U_EXPORT2 uprv_decNumberCopySign(decNumber *, const decNumber *, const decNumber *); U_CAPI decNumber * U_EXPORT2 
uprv_decNumberNextMinus(decNumber *, const decNumber *, decContext *); U_CAPI decNumber * U_EXPORT2 uprv_decNumberNextPlus(decNumber *, const decNumber *, decContext *); U_CAPI decNumber * U_EXPORT2 uprv_decNumberNextToward(decNumber *, const decNumber *, const decNumber *, decContext *); U_CAPI decNumber * U_EXPORT2 uprv_decNumberTrim(decNumber *); U_CAPI const char * U_EXPORT2 uprv_decNumberVersion(); U_CAPI decNumber * U_EXPORT2 uprv_decNumberZero(decNumber *); /* Functions for testing decNumbers (normality depends on context) */ U_CAPI int32_t U_EXPORT2 uprv_decNumberIsNormal(const decNumber *, decContext *); U_CAPI int32_t U_EXPORT2 uprv_decNumberIsSubnormal(const decNumber *, decContext *); /* Macros for testing decNumber *dn */ #define decNumberIsCanonical(dn) (1) /* All decNumbers are saintly */ #define decNumberIsFinite(dn) (((dn)->bits&DECSPECIAL)==0) #define decNumberIsInfinite(dn) (((dn)->bits&DECINF)!=0) #define decNumberIsNaN(dn) (((dn)->bits&(DECNAN|DECSNAN))!=0) #define decNumberIsNegative(dn) (((dn)->bits&DECNEG)!=0) #define decNumberIsQNaN(dn) (((dn)->bits&(DECNAN))!=0) #define decNumberIsSNaN(dn) (((dn)->bits&(DECSNAN))!=0) #define decNumberIsSpecial(dn) (((dn)->bits&DECSPECIAL)!=0) #define decNumberIsZero(dn) (*(dn)->lsu==0 \ && (dn)->digits==1 \ && (((dn)->bits&DECSPECIAL)==0)) #define decNumberRadix(dn) (10) #endif stringi/src/icu74/i18n/regexcst.txt0000644000176200001440000006105614700200761016624 0ustar liggesusers# Copyright (C) 2016 and later: Unicode, Inc. and others. # License & terms of use: http://www.unicode.org/copyright.html #***************************************************************************** # # Copyright (C) 2002-2015, International Business Machines Corporation and others. # All Rights Reserved. 
# #***************************************************************************** # # file: regexcst.txt # ICU Regular Expression Parser State Table # # This state table is used when reading and parsing a regular expression pattern. # The pattern parser uses a state machine; the data in this file define the # state transitions that occur for each input character. # # *** This file defines the regex pattern grammar. This is it. # *** The determination of what is accepted is here. # # This file is processed by a perl script "regexcst.pl" to produce initialized C arrays # that are then built with the rule parser. # # # Here is the syntax of the state definitions in this file: # # #StateName: # input-char n next-state ^push-state action # input-char n next-state ^push-state action # | | | | | # | | | | |--- action to be performed by state machine # | | | | See function RegexCompile::doParseActions() # | | | | # | | | |--- Push this named state onto the state stack. # | | | Later, when next state is specified as "pop", # | | | the pushed state will become the current state. # | | | # | | |--- Transition to this state if the current input character matches the input # | | character or char class in the left hand column. "pop" causes the next # | | state to be popped from the state stack. # | | # | |--- When making the state transition specified on this line, advance to the next # | character from the input only if 'n' appears here. # | # |--- Character or named character classes to test for. If the current character being scanned # matches, perform the actions and go to the state specified on this line. # The input character is tested sequentially, in the order written. The characters and # character classes tested for do not need to be mutually exclusive. The first match wins. # # # start state, scan position is at the beginning of the pattern. # start: default term doPatStart # # term. At a position where we can accept the start of most items in a pattern. 
# term: quoted n expr-quant doLiteralChar rule_char n expr-quant doLiteralChar '[' n set-open ^set-finish doSetBegin '(' n open-paren '.' n expr-quant doDotAny '^' n expr-quant doCaret '$' n expr-quant doDollar '\' n backslash '|' n term doOrOperator ')' n pop doCloseParen eof term doPatFinish default errorDeath doRuleError # # expr-quant We've just finished scanning a term, now look for the optional # trailing quantifier - *, +, ?, *?, etc. # expr-quant: '*' n quant-star '+' n quant-plus '?' n quant-opt '{' n interval-open doIntervalInit '(' n open-paren-quant default expr-cont # # expr-cont Expression, continuation. At a point where additional terms are # allowed, but not required. No Quantifiers # expr-cont: '|' n term doOrOperator ')' n pop doCloseParen default term # # open-paren-quant Special case handling for comments appearing before a quantifier, # e.g. x(?#comment )* # Open parens from expr-quant come here; anything but a (?# comment # branches into the normal parenthesis sequence as quickly as possible. # open-paren-quant: '?' n open-paren-quant2 doSuppressComments default open-paren open-paren-quant2: '#' n paren-comment ^expr-quant default open-paren-extended # # open-paren We've got an open paren. We need to scan further to # determine what kind of quantifier it is - plain (, (?:, (?>, or whatever. # open-paren: '?' n open-paren-extended doSuppressComments default term ^expr-quant doOpenCaptureParen open-paren-extended: ':' n term ^expr-quant doOpenNonCaptureParen # (?: '>' n term ^expr-quant doOpenAtomicParen # (?> '=' n term ^expr-cont doOpenLookAhead # (?= '!' n term ^expr-cont doOpenLookAheadNeg # (?! 
'<' n open-paren-lookbehind '#' n paren-comment ^term 'i' paren-flag doBeginMatchMode 'd' paren-flag doBeginMatchMode 'm' paren-flag doBeginMatchMode 's' paren-flag doBeginMatchMode 'u' paren-flag doBeginMatchMode 'w' paren-flag doBeginMatchMode 'x' paren-flag doBeginMatchMode '-' paren-flag doBeginMatchMode '(' n errorDeath doConditionalExpr '{' n errorDeath doPerlInline default errorDeath doBadOpenParenType open-paren-lookbehind: '=' n term ^expr-cont doOpenLookBehind # (?<= '!' n term ^expr-cont doOpenLookBehindNeg # (? ... ), position currently on the name. # named-capture: ascii_letter n named-capture doContinueNamedCapture digit_char n named-capture doContinueNamedCapture '>' n term ^expr-quant doOpenCaptureParen # common with non-named capture. default errorDeath doBadNamedCapture # # quant-star Scanning a '*' quantifier. Need to look ahead to decide # between plain '*', '*?', '*+' # quant-star: '?' n expr-cont doNGStar # *? '+' n expr-cont doPossessiveStar # *+ default expr-cont doStar # # quant-plus Scanning a '+' quantifier. Need to look ahead to decide # between plain '+', '+?', '++' # quant-plus: '?' n expr-cont doNGPlus # +? '+' n expr-cont doPossessivePlus # ++ default expr-cont doPlus # # quant-opt Scanning a '?' quantifier. Need to look ahead to decide # between plain '?', '??', '?+' # quant-opt: '?' n expr-cont doNGOpt # ?? '+' n expr-cont doPossessiveOpt # ?+ default expr-cont doOpt # ? # # Interval scanning a '{', the opening delimiter for an interval specification # {number} or {min, max} or {min,} # interval-open: digit_char interval-lower default errorDeath doIntervalError interval-lower: digit_char n interval-lower doIntevalLowerDigit ',' n interval-upper '}' n interval-type doIntervalSame # {n} default errorDeath doIntervalError interval-upper: digit_char n interval-upper doIntervalUpperDigit '}' n interval-type default errorDeath doIntervalError interval-type: '?' n expr-cont doNGInterval # {n,m}? 
'+' n expr-cont doPossessiveInterval # {n,m}+ default expr-cont doInterval # {m,n} # # backslash # Backslash. Figure out which of the \thingies we have encountered. # The low level next-char function will have preprocessed # some of them already; those won't come here. backslash: 'A' n term doBackslashA 'B' n term doBackslashB 'b' n term doBackslashb 'd' n expr-quant doBackslashd 'D' n expr-quant doBackslashD 'G' n term doBackslashG 'h' n expr-quant doBackslashh 'H' n expr-quant doBackslashH 'k' n named-backref 'N' expr-quant doNamedChar # \N{NAME} named char 'p' expr-quant doProperty # \p{Lu} style property 'P' expr-quant doProperty 'R' n expr-quant doBackslashR 'Q' n term doEnterQuoteMode 'S' n expr-quant doBackslashS 's' n expr-quant doBackslashs 'v' n expr-quant doBackslashv 'V' n expr-quant doBackslashV 'W' n expr-quant doBackslashW 'w' n expr-quant doBackslashw 'X' n expr-quant doBackslashX 'Z' n term doBackslashZ 'z' n term doBackslashz digit_char n expr-quant doBackRef # Will scan multiple digits eof errorDeath doEscapeError default n expr-quant doEscapedLiteralChar # named-backref Scanned \k # Leading to \k # Failure to get the full sequence is an error. # named-backref: '<' n named-backref-2 doBeginNamedBackRef default errorDeath doBadNamedCapture named-backref-2: ascii_letter n named-backref-3 doContinueNamedBackRef default errorDeath doBadNamedCapture named-backref-3: ascii_letter n named-backref-3 doContinueNamedBackRef digit_char n named-backref-3 doContinueNamedBackRef '>' n expr-quant doCompleteNamedBackRef default errorDeath doBadNamedCapture # # [set expression] parsing, # All states involved in parsing set expressions have names beginning with "set-" # set-open: '^' n set-open2 doSetNegate ':' set-posix doSetPosixProp default set-open2 set-open2: ']' n set-after-lit doSetLiteral default set-start # set-posix: # scanned a '[:' If it really is a [:property:], doSetPosixProp will have # moved the scan to the closing ']'. 
If it wasn't a property # expression, the scan will still be at the opening ':', which should # be interpreted as a normal set expression. set-posix: ']' n pop doSetEnd ':' set-start default errorDeath doRuleError # should not be possible. # # set-start after the [ and special case leading characters (^ and/or ]) but before # everything else. A '-' is literal at this point. # set-start: ']' n pop doSetEnd '[' n set-open ^set-after-set doSetBeginUnion '\' n set-escape '-' n set-start-dash '&' n set-start-amp default n set-after-lit doSetLiteral # set-start-dash Turn "[--" into a syntax error. # "[-x" is good, - and x are literals. # set-start-dash: '-' errorDeath doRuleError default set-after-lit doSetAddDash # set-start-amp Turn "[&&" into a syntax error. # "[&x" is good, & and x are literals. # set-start-amp: '&' errorDeath doRuleError default set-after-lit doSetAddAmp # # set-after-lit The last thing scanned was a literal character within a set. # Can be followed by anything. Single '-' or '&' are # literals in this context, not operators. set-after-lit: ']' n pop doSetEnd '[' n set-open ^set-after-set doSetBeginUnion '-' n set-lit-dash '&' n set-lit-amp '\' n set-escape eof errorDeath doSetNoCloseError default n set-after-lit doSetLiteral set-after-set: ']' n pop doSetEnd '[' n set-open ^set-after-set doSetBeginUnion '-' n set-set-dash '&' n set-set-amp '\' n set-escape eof errorDeath doSetNoCloseError default n set-after-lit doSetLiteral set-after-range: ']' n pop doSetEnd '[' n set-open ^set-after-set doSetBeginUnion '-' n set-range-dash '&' n set-range-amp '\' n set-escape eof errorDeath doSetNoCloseError default n set-after-lit doSetLiteral # set-after-op # After a -- or && # It is an error to close a set at this point. # set-after-op: '[' n set-open ^set-after-set doSetBeginUnion ']' errorDeath doSetOpError '\' n set-escape default n set-after-lit doSetLiteral # # set-set-amp # Have scanned [[set]& # Could be a '&' intersection operator, if a set follows. 
# Could be the start of a '&&' operator. # Otherwise is a literal. set-set-amp: '[' n set-open ^set-after-set doSetBeginIntersection1 '&' n set-after-op doSetIntersection2 default set-after-lit doSetAddAmp # set-lit-amp Have scanned "[literals&" # Could be a start of "&&" operator or a literal # In [abc&[def]], the '&' is a literal # set-lit-amp: '&' n set-after-op doSetIntersection2 default set-after-lit doSetAddAmp # # set-set-dash # Have scanned [set]- # Could be a '-' difference operator, if a [set] follows. # Could be the start of a '--' operator. # Otherwise is a literal. set-set-dash: '[' n set-open ^set-after-set doSetBeginDifference1 '-' n set-after-op doSetDifference2 default set-after-lit doSetAddDash # # set-range-dash # scanned a-b- or \w- # any set or range like item where the trailing single '-' should # be literal, not a set difference operation. # A trailing "--" is still a difference operator. set-range-dash: '-' n set-after-op doSetDifference2 default set-after-lit doSetAddDash set-range-amp: '&' n set-after-op doSetIntersection2 default set-after-lit doSetAddAmp # set-lit-dash # Have scanned "[literals-" Could be a range or a -- operator or a literal # In [abc-[def]], the '-' is a literal (confirmed with a Java test) # [abc-\p{xx} the '-' is an error # [abc-] the '-' is a literal # [ab-xy] the '-' is a range # set-lit-dash: '-' n set-after-op doSetDifference2 '[' set-after-lit doSetAddDash ']' set-after-lit doSetAddDash '\' n set-lit-dash-escape default n set-after-range doSetRange # set-lit-dash-escape # # scanned "[literal-\" # Could be a range, if the \ introduces an escaped literal char or a named char. # Otherwise it is an error. 
# set-lit-dash-escape: 's' errorDeath doSetOpError 'S' errorDeath doSetOpError 'w' errorDeath doSetOpError 'W' errorDeath doSetOpError 'd' errorDeath doSetOpError 'D' errorDeath doSetOpError 'N' set-after-range doSetNamedRange default n set-after-range doSetRange # # set-escape # Common back-slash escape processing within set expressions # set-escape: 'p' set-after-set doSetProp 'P' set-after-set doSetProp 'N' set-after-lit doSetNamedChar 's' n set-after-range doSetBackslash_s 'S' n set-after-range doSetBackslash_S 'w' n set-after-range doSetBackslash_w 'W' n set-after-range doSetBackslash_W 'd' n set-after-range doSetBackslash_d 'D' n set-after-range doSetBackslash_D 'h' n set-after-range doSetBackslash_h 'H' n set-after-range doSetBackslash_H 'v' n set-after-range doSetBackslash_v 'V' n set-after-range doSetBackslash_V default n set-after-lit doSetLiteralEscaped # # set-finish # Have just encountered the final ']' that completes a [set], and # arrived here via a pop. From here, we exit the set parsing world, and go # back to generic regular expression parsing. # set-finish: default expr-quant doSetFinish # # errorDeath. This state is specified as the next state whenever a syntax error # in the source rules is detected. Barring bugs, the state machine will never # actually get here, but will stop because of the action associated with the error. # But, just in case, this state asks the state machine to exit. errorDeath: default n errorDeath doExit stringi/src/icu74/i18n/pluralranges.cpp0000644000176200001440000001147614700200761017443 0ustar liggesusers// © 2018 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html #include "unicode/utypes.h" #if !UCONFIG_NO_FORMATTING // Allow implicit conversion from char16_t* to UnicodeString for this file: // Helpful in toString methods and elsewhere. 
#define UNISTR_FROM_STRING_EXPLICIT #include "unicode/numberrangeformatter.h" #include "pluralranges.h" #include "uresimp.h" #include "charstr.h" #include "uassert.h" #include "util.h" #include "numrange_impl.h" U_NAMESPACE_BEGIN namespace { class PluralRangesDataSink : public ResourceSink { public: PluralRangesDataSink(StandardPluralRanges& output) : fOutput(output) {} void put(const char* /*key*/, ResourceValue& value, UBool /*noFallback*/, UErrorCode& status) override { ResourceArray entriesArray = value.getArray(status); if (U_FAILURE(status)) { return; } fOutput.setCapacity(entriesArray.getSize(), status); if (U_FAILURE(status)) { return; } for (int i = 0; entriesArray.getValue(i, value); i++) { ResourceArray pluralFormsArray = value.getArray(status); if (U_FAILURE(status)) { return; } if (pluralFormsArray.getSize() != 3) { status = U_RESOURCE_TYPE_MISMATCH; return; } pluralFormsArray.getValue(0, value); StandardPlural::Form first = StandardPlural::fromString(value.getUnicodeString(status), status); if (U_FAILURE(status)) { return; } pluralFormsArray.getValue(1, value); StandardPlural::Form second = StandardPlural::fromString(value.getUnicodeString(status), status); if (U_FAILURE(status)) { return; } pluralFormsArray.getValue(2, value); StandardPlural::Form result = StandardPlural::fromString(value.getUnicodeString(status), status); if (U_FAILURE(status)) { return; } fOutput.addPluralRange(first, second, result); } } private: StandardPluralRanges& fOutput; }; void getPluralRangesData(const Locale& locale, StandardPluralRanges& output, UErrorCode& status) { LocalUResourceBundlePointer rb(ures_openDirect(nullptr, "pluralRanges", &status)); if (U_FAILURE(status)) { return; } CharString dataPath; dataPath.append("locales/", -1, status); dataPath.append(locale.getLanguage(), -1, status); if (U_FAILURE(status)) { return; } int32_t setLen; // Not all languages are covered: fail gracefully UErrorCode internalStatus = U_ZERO_ERROR; const char16_t* set = 
ures_getStringByKeyWithFallback(rb.getAlias(), dataPath.data(), &setLen, &internalStatus); if (U_FAILURE(internalStatus)) { return; } dataPath.clear(); dataPath.append("rules/", -1, status); dataPath.appendInvariantChars(set, setLen, status); if (U_FAILURE(status)) { return; } PluralRangesDataSink sink(output); ures_getAllItemsWithFallback(rb.getAlias(), dataPath.data(), sink, status); } } // namespace StandardPluralRanges StandardPluralRanges::forLocale(const Locale& locale, UErrorCode& status) { StandardPluralRanges result; getPluralRangesData(locale, result, status); return result; } StandardPluralRanges StandardPluralRanges::copy(UErrorCode& status) const { StandardPluralRanges result; if (fTriplesLen > result.fTriples.getCapacity()) { if (result.fTriples.resize(fTriplesLen) == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; return result; } } uprv_memcpy(result.fTriples.getAlias(), fTriples.getAlias(), fTriplesLen * sizeof(fTriples[0])); result.fTriplesLen = fTriplesLen; return result; } LocalPointer StandardPluralRanges::toPointer(UErrorCode& status) && noexcept { return LocalPointer(new StandardPluralRanges(std::move(*this)), status); } void StandardPluralRanges::addPluralRange( StandardPlural::Form first, StandardPlural::Form second, StandardPlural::Form result) { U_ASSERT(fTriplesLen < fTriples.getCapacity()); fTriples[fTriplesLen] = {first, second, result}; fTriplesLen++; } void StandardPluralRanges::setCapacity(int32_t length, UErrorCode& status) { if (U_FAILURE(status)) { return; } if (length > fTriples.getCapacity()) { if (fTriples.resize(length, 0) == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; } } } StandardPlural::Form StandardPluralRanges::resolve(StandardPlural::Form first, StandardPlural::Form second) const { for (int32_t i=0; i #include U_NAMESPACE_BEGIN union FormatInfo { NUMBERFMTW number; CURRENCYFMTW currency; }; UOBJECT_DEFINE_RTTI_IMPLEMENTATION(Win32NumberFormat) #define NEW_ARRAY(type,count) (type *) uprv_malloc((count) * sizeof(type)) 
#define DELETE_ARRAY(array) uprv_free((void *) (array)) #define STACK_BUFFER_SIZE 32 /* * Turns a string of the form "3;2;0" into the grouping UINT * needed for NUMBERFMT and CURRENCYFMT. If the string does not * end in ";0" then the return value should be multiplied by 10. * (e.g. "3" => 30, "3;2" => 320) */ static UINT getGrouping(const wchar_t *grouping) { UINT g = 0; const wchar_t *s; for (s = grouping; *s != L'\0'; s += 1) { if (*s > L'0' && *s < L'9') { g = g * 10 + (*s - L'0'); } else if (*s != L';') { break; } } if (*s != L'0') { g *= 10; } return g; } static void getNumberFormat(NUMBERFMTW *fmt, const wchar_t *windowsLocaleName) { wchar_t buf[10]; GetLocaleInfoEx(windowsLocaleName, LOCALE_RETURN_NUMBER|LOCALE_IDIGITS, (LPWSTR) &fmt->NumDigits, sizeof(UINT)); GetLocaleInfoEx(windowsLocaleName, LOCALE_RETURN_NUMBER|LOCALE_ILZERO, (LPWSTR) &fmt->LeadingZero, sizeof(UINT)); GetLocaleInfoEx(windowsLocaleName, LOCALE_SGROUPING, (LPWSTR)buf, 10); fmt->Grouping = getGrouping(buf); fmt->lpDecimalSep = NEW_ARRAY(wchar_t, 6); GetLocaleInfoEx(windowsLocaleName, LOCALE_SDECIMAL, fmt->lpDecimalSep, 6); fmt->lpThousandSep = NEW_ARRAY(wchar_t, 6); GetLocaleInfoEx(windowsLocaleName, LOCALE_STHOUSAND, fmt->lpThousandSep, 6); GetLocaleInfoEx(windowsLocaleName, LOCALE_RETURN_NUMBER|LOCALE_INEGNUMBER, (LPWSTR) &fmt->NegativeOrder, sizeof(UINT)); } static void freeNumberFormat(NUMBERFMTW *fmt) { if (fmt != nullptr) { DELETE_ARRAY(fmt->lpThousandSep); DELETE_ARRAY(fmt->lpDecimalSep); } } static void getCurrencyFormat(CURRENCYFMTW *fmt, const wchar_t *windowsLocaleName) { wchar_t buf[10]; GetLocaleInfoEx(windowsLocaleName, LOCALE_RETURN_NUMBER|LOCALE_ICURRDIGITS, (LPWSTR) &fmt->NumDigits, sizeof(UINT)); GetLocaleInfoEx(windowsLocaleName, LOCALE_RETURN_NUMBER|LOCALE_ILZERO, (LPWSTR) &fmt->LeadingZero, sizeof(UINT)); GetLocaleInfoEx(windowsLocaleName, LOCALE_SMONGROUPING, (LPWSTR)buf, sizeof(buf)); fmt->Grouping = getGrouping(buf); fmt->lpDecimalSep = NEW_ARRAY(wchar_t, 6); 
GetLocaleInfoEx(windowsLocaleName, LOCALE_SMONDECIMALSEP, fmt->lpDecimalSep, 6); fmt->lpThousandSep = NEW_ARRAY(wchar_t, 6); GetLocaleInfoEx(windowsLocaleName, LOCALE_SMONTHOUSANDSEP, fmt->lpThousandSep, 6); GetLocaleInfoEx(windowsLocaleName, LOCALE_RETURN_NUMBER|LOCALE_INEGCURR, (LPWSTR) &fmt->NegativeOrder, sizeof(UINT)); GetLocaleInfoEx(windowsLocaleName, LOCALE_RETURN_NUMBER|LOCALE_ICURRENCY, (LPWSTR) &fmt->PositiveOrder, sizeof(UINT)); fmt->lpCurrencySymbol = NEW_ARRAY(wchar_t, 8); GetLocaleInfoEx(windowsLocaleName, LOCALE_SCURRENCY, (LPWSTR) fmt->lpCurrencySymbol, 8); } static void freeCurrencyFormat(CURRENCYFMTW *fmt) { if (fmt != nullptr) { DELETE_ARRAY(fmt->lpCurrencySymbol); DELETE_ARRAY(fmt->lpThousandSep); DELETE_ARRAY(fmt->lpDecimalSep); } } #if UCONFIG_USE_WINDOWS_LCID_MAPPING_API // TODO: This is copied in both winnmfmt.cpp and windtfmt.cpp, but really should // be factored out into a common helper for both. static UErrorCode GetEquivalentWindowsLocaleName(const Locale& locale, UnicodeString** buffer) { UErrorCode status = U_ZERO_ERROR; // Convert from names like "en_CA" and "de_DE@collation=phonebook" to "en-CA" and "de-DE-u-co-phonebk". CharString asciiBCP47Tag; { CharStringByteSink sink(&asciiBCP47Tag); ulocimp_toLanguageTag(locale.getName(), sink, false, &status); } if (U_SUCCESS(status)) { // Need it to be UTF-16, not 8-bit // TODO: This seems like a good thing for a helper wchar_t bcp47Tag[LOCALE_NAME_MAX_LENGTH] = {}; int32_t i; for (i = 0; i < UPRV_LENGTHOF(bcp47Tag); i++) { if (asciiBCP47Tag[i] == '\0') { break; } else { // normally just copy the character bcp47Tag[i] = static_cast(asciiBCP47Tag[i]); } } // Ensure it's null terminated if (i < (UPRV_LENGTHOF(bcp47Tag) - 1)) { bcp47Tag[i] = L'\0'; } else { // Ran out of room. bcp47Tag[UPRV_LENGTHOF(bcp47Tag) - 1] = L'\0'; } wchar_t windowsLocaleName[LOCALE_NAME_MAX_LENGTH] = {}; // Note: On Windows versions below 10, there is no support for locale name aliases. 
// This means that it will fail for locales where ICU has a completely different // name (like ku vs ckb), and it will also not work for alternate sort locale // names like "de-DE-u-co-phonebk". // TODO: We could add some sort of exception table for cases like ku vs ckb. int length = ResolveLocaleName(bcp47Tag, windowsLocaleName, UPRV_LENGTHOF(windowsLocaleName)); if (length > 0) { *buffer = new UnicodeString(windowsLocaleName); } else { status = U_UNSUPPORTED_ERROR; } } return status; } #endif Win32NumberFormat::Win32NumberFormat(const Locale &locale, UBool currency, UErrorCode &status) : NumberFormat(), fCurrency(currency), fFormatInfo(nullptr), fFractionDigitsSet(false), fWindowsLocaleName(nullptr) { if (!U_FAILURE(status)) { fLCID = locale.getLCID(); #if UCONFIG_USE_WINDOWS_LCID_MAPPING_API GetEquivalentWindowsLocaleName(locale, &fWindowsLocaleName); #else #warning GetEquivalentWindowsLocaleName has been disabled #endif // Note: In the previous code, it would look up the LCID for the locale, and if // the locale was not recognized then it would get an LCID of 0, which is a // synonym for LOCALE_USER_DEFAULT on Windows. // If the above method fails, then fWindowsLocaleName will remain as nullptr, and // then we will pass nullptr to API GetLocaleInfoEx, which is the same as passing // LOCALE_USER_DEFAULT. 
// Resolve actual locale to be used later UErrorCode tmpsts = U_ZERO_ERROR; char tmpLocID[ULOC_FULLNAME_CAPACITY]; int32_t len = uloc_getLocaleForLCID(fLCID, tmpLocID, UPRV_LENGTHOF(tmpLocID) - 1, &tmpsts); if (U_SUCCESS(tmpsts)) { tmpLocID[len] = 0; fLocale = Locale((const char*)tmpLocID); } const wchar_t *localeName = nullptr; if (fWindowsLocaleName != nullptr) { localeName = reinterpret_cast(toOldUCharPtr(fWindowsLocaleName->getTerminatedBuffer())); } fFormatInfo = (FormatInfo*)uprv_malloc(sizeof(FormatInfo)); if (fCurrency) { getCurrencyFormat(&fFormatInfo->currency, localeName); } else { getNumberFormat(&fFormatInfo->number, localeName); } } } Win32NumberFormat::Win32NumberFormat(const Win32NumberFormat &other) : NumberFormat(other), fFormatInfo((FormatInfo*)uprv_malloc(sizeof(FormatInfo))) { if (fFormatInfo != nullptr) { uprv_memset(fFormatInfo, 0, sizeof(*fFormatInfo)); } *this = other; } Win32NumberFormat::~Win32NumberFormat() { if (fFormatInfo != nullptr) { if (fCurrency) { freeCurrencyFormat(&fFormatInfo->currency); } else { freeNumberFormat(&fFormatInfo->number); } uprv_free(fFormatInfo); } delete fWindowsLocaleName; } Win32NumberFormat &Win32NumberFormat::operator=(const Win32NumberFormat &other) { if (this == &other) { return *this; } // self-assignment: no-op NumberFormat::operator=(other); this->fCurrency = other.fCurrency; this->fLocale = other.fLocale; this->fLCID = other.fLCID; this->fFractionDigitsSet = other.fFractionDigitsSet; this->fWindowsLocaleName = other.fWindowsLocaleName == nullptr ? 
nullptr : new UnicodeString(*other.fWindowsLocaleName); const wchar_t *localeName = nullptr; if (fWindowsLocaleName != nullptr) { localeName = reinterpret_cast(toOldUCharPtr(fWindowsLocaleName->getTerminatedBuffer())); } if (fCurrency) { freeCurrencyFormat(&fFormatInfo->currency); getCurrencyFormat(&fFormatInfo->currency, localeName); } else { freeNumberFormat(&fFormatInfo->number); getNumberFormat(&fFormatInfo->number, localeName); } return *this; } Win32NumberFormat *Win32NumberFormat::clone() const { return new Win32NumberFormat(*this); } UnicodeString& Win32NumberFormat::format(double number, UnicodeString& appendTo, FieldPosition& /* pos */) const { return format(getMaximumFractionDigits(), appendTo, L"%.16f", number); } UnicodeString& Win32NumberFormat::format(int32_t number, UnicodeString& appendTo, FieldPosition& /* pos */) const { return format(getMinimumFractionDigits(), appendTo, L"%I32d", number); } UnicodeString& Win32NumberFormat::format(int64_t number, UnicodeString& appendTo, FieldPosition& /* pos */) const { return format(getMinimumFractionDigits(), appendTo, L"%I64d", number); } void Win32NumberFormat::parse(const UnicodeString& text, Formattable& result, ParsePosition& parsePosition) const { UErrorCode status = U_ZERO_ERROR; NumberFormat *nf = fCurrency? NumberFormat::createCurrencyInstance(fLocale, status) : NumberFormat::createInstance(fLocale, status); nf->parse(text, result, parsePosition); delete nf; } void Win32NumberFormat::setMaximumFractionDigits(int32_t newValue) { fFractionDigitsSet = true; NumberFormat::setMaximumFractionDigits(newValue); } void Win32NumberFormat::setMinimumFractionDigits(int32_t newValue) { fFractionDigitsSet = true; NumberFormat::setMinimumFractionDigits(newValue); } UnicodeString &Win32NumberFormat::format(int32_t numDigits, UnicodeString &appendTo, const wchar_t *fmt, ...) 
const { wchar_t nStackBuffer[STACK_BUFFER_SIZE]; wchar_t *nBuffer = nStackBuffer; va_list args; int result; nBuffer[0] = 0x0000; /* Due to the arguments causing a result to be <= 23 characters (+2 for nullptr and minus), we don't need to reallocate the buffer. */ va_start(args, fmt); result = _vsnwprintf(nBuffer, STACK_BUFFER_SIZE, fmt, args); va_end(args); /* Just to make sure of the above statement, we add this assert */ U_ASSERT(result >=0); // The following code is not used because _vscwprintf isn't available on MinGW at the moment. /*if (result < 0) { int newLength; va_start(args, fmt); newLength = _vscwprintf(fmt, args); va_end(args); nBuffer = NEW_ARRAY(char16_t, newLength + 1); va_start(args, fmt); result = _vsnwprintf(nBuffer, newLength + 1, fmt, args); va_end(args); }*/ // vswprintf is sensitive to the locale set by setlocale. For some locales // it doesn't use "." as the decimal separator, which is what GetNumberFormatW // and GetCurrencyFormatW both expect to see. // // To fix this, we scan over the string and replace the first non-digits, except // for a leading "-", with a "." // // Note: (nBuffer[0] == L'-') will evaluate to 1 if there is a leading '-' in the // number, and 0 otherwise. 
for (wchar_t *p = &nBuffer[nBuffer[0] == L'-']; *p != L'\0'; p += 1) { if (*p < L'0' || *p > L'9') { *p = L'.'; break; } } wchar_t stackBuffer[STACK_BUFFER_SIZE]; wchar_t *buffer = stackBuffer; FormatInfo formatInfo; formatInfo = *fFormatInfo; buffer[0] = 0x0000; const wchar_t *localeName = nullptr; if (fWindowsLocaleName != nullptr) { localeName = reinterpret_cast(toOldUCharPtr(fWindowsLocaleName->getTerminatedBuffer())); } if (fCurrency) { if (fFractionDigitsSet) { formatInfo.currency.NumDigits = (UINT) numDigits; } if (!isGroupingUsed()) { formatInfo.currency.Grouping = 0; } result = GetCurrencyFormatEx(localeName, 0, nBuffer, &formatInfo.currency, buffer, STACK_BUFFER_SIZE); if (result == 0) { DWORD lastError = GetLastError(); if (lastError == ERROR_INSUFFICIENT_BUFFER) { int newLength = GetCurrencyFormatEx(localeName, 0, nBuffer, &formatInfo.currency, nullptr, 0); buffer = NEW_ARRAY(wchar_t, newLength); buffer[0] = 0x0000; GetCurrencyFormatEx(localeName, 0, nBuffer, &formatInfo.currency, buffer, newLength); } } } else { if (fFractionDigitsSet) { formatInfo.number.NumDigits = (UINT) numDigits; } if (!isGroupingUsed()) { formatInfo.number.Grouping = 0; } result = GetNumberFormatEx(localeName, 0, nBuffer, &formatInfo.number, buffer, STACK_BUFFER_SIZE); if (result == 0) { if (GetLastError() == ERROR_INSUFFICIENT_BUFFER) { int newLength = GetNumberFormatEx(localeName, 0, nBuffer, &formatInfo.number, nullptr, 0); buffer = NEW_ARRAY(wchar_t, newLength); buffer[0] = 0x0000; GetNumberFormatEx(localeName, 0, nBuffer, &formatInfo.number, buffer, newLength); } } } appendTo.append((char16_t *)buffer, (int32_t) wcslen(buffer)); if (buffer != stackBuffer) { DELETE_ARRAY(buffer); } /*if (nBuffer != nStackBuffer) { DELETE_ARRAY(nBuffer); }*/ return appendTo; } U_NAMESPACE_END #endif /* #if !UCONFIG_NO_FORMATTING */ #endif // U_PLATFORM_USES_ONLY_WIN32_API stringi/src/icu74/i18n/tmunit.cpp0000644000176200001440000000765014700200761016263 0ustar liggesusers// © 2016 and later: 
Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ******************************************************************************* * Copyright (C) 2008-2014, Google, International Business Machines Corporation and * others. All Rights Reserved. ******************************************************************************* */ #include "unicode/tmunit.h" #include "uassert.h" #if !UCONFIG_NO_FORMATTING U_NAMESPACE_BEGIN UOBJECT_DEFINE_RTTI_IMPLEMENTATION(TimeUnit) /* * There are only 7 time units. * So, TimeUnit could be made as singleton * (similar to uniset_props.cpp, or unorm.cpp, * in which a static TimeUnit* array is created, and * the creatInstance() returns a const TimeUnit*). * But the constraint is TimeUnit is a data member of Measure. * But Measure (which is an existing API) does not expect it's "unit" member * as singleton. Meaure takes ownership of the "unit" member. * In its constructor, it does not take a const "unit" pointer. * Also, Measure can clone and destruct the "unit" pointer. * In order to preserve the old behavior and let Measure handle singleton "unit", * 1. a flag need to be added in Measure; * 2. a new constructor which takes const "unit" as parameter need to be added, * and this new constructor will set the flag on. * 3. clone and destructor need to check upon this flag to distinguish on how * to handle the "unit". * * Since TimeUnit is such a light weight object, comparing with the heavy weight * format operation, we decided to avoid the above complication. * * So, both TimeUnit and CurrencyUnit (the 2 subclasses of MeasureUnit) are * immutable and non-singleton. * * Currently, TimeUnitAmount and CurrencyAmount are immutable. * If an application needs to create a long list of TimeUnitAmount on the same * time unit but different number, for example, * 1 hour, 2 hour, 3 hour, ................. 
10,000 hour, * there might be performance hit because 10,000 TimeUnit object, * although all are the same time unit, will be created in heap and deleted. * * To address this performance issue, if there is any in the future, * we should and need to change TimeUnitAmount and CurrencyAmount to be * immutable by allowing a setter on the number. * Or we need to add 2 parallel mutable classes in order to * preserve the existing API. * Or we can use freezable. */ TimeUnit* U_EXPORT2 TimeUnit::createInstance(TimeUnit::UTimeUnitFields timeUnitField, UErrorCode& status) { if (U_FAILURE(status)) { return nullptr; } if (timeUnitField < 0 || timeUnitField >= UTIMEUNIT_FIELD_COUNT) { status = U_ILLEGAL_ARGUMENT_ERROR; return nullptr; } return new TimeUnit(timeUnitField); } TimeUnit::TimeUnit(TimeUnit::UTimeUnitFields timeUnitField) { fTimeUnitField = timeUnitField; switch (fTimeUnitField) { case UTIMEUNIT_YEAR: initTime("year"); break; case UTIMEUNIT_MONTH: initTime("month"); break; case UTIMEUNIT_DAY: initTime("day"); break; case UTIMEUNIT_WEEK: initTime("week"); break; case UTIMEUNIT_HOUR: initTime("hour"); break; case UTIMEUNIT_MINUTE: initTime("minute"); break; case UTIMEUNIT_SECOND: initTime("second"); break; default: UPRV_UNREACHABLE_EXIT; } } TimeUnit::TimeUnit(const TimeUnit& other) : MeasureUnit(other), fTimeUnitField(other.fTimeUnitField) { } TimeUnit* TimeUnit::clone() const { return new TimeUnit(*this); } TimeUnit& TimeUnit::operator=(const TimeUnit& other) { if (this == &other) { return *this; } MeasureUnit::operator=(other); fTimeUnitField = other.fTimeUnitField; return *this; } TimeUnit::UTimeUnitFields TimeUnit::getTimeUnitField() const { return fTimeUnitField; } TimeUnit::~TimeUnit() { } U_NAMESPACE_END #endif stringi/src/icu74/i18n/collationdatawriter.h0000644000176200001440000000372614770511777020506 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others. 
// License & terms of use: http://www.unicode.org/copyright.html /* ******************************************************************************* * Copyright (C) 2013-2014, International Business Machines * Corporation and others. All Rights Reserved. ******************************************************************************* * collationdatawriter.h * * created on: 2013aug06 * created by: Markus W. Scherer */ #ifndef __COLLATIONDATAWRITER_H__ #define __COLLATIONDATAWRITER_H__ #include "unicode/utypes.h" #if !UCONFIG_NO_COLLATION U_NAMESPACE_BEGIN struct CollationData; struct CollationSettings; struct CollationTailoring; /** * Collation-related code for tools & demos. */ class U_I18N_API CollationDataWriter /* all static */ { public: static int32_t writeBase(const CollationData &data, const CollationSettings &settings, const void *rootElements, int32_t rootElementsLength, int32_t indexes[], uint8_t *dest, int32_t capacity, UErrorCode &errorCode); static int32_t writeTailoring(const CollationTailoring &t, const CollationSettings &settings, int32_t indexes[], uint8_t *dest, int32_t capacity, UErrorCode &errorCode); private: CollationDataWriter() = delete; // no constructor static int32_t write(UBool isBase, const UVersionInfo dataVersion, const CollationData &data, const CollationSettings &settings, const void *rootElements, int32_t rootElementsLength, int32_t indexes[], uint8_t *dest, int32_t capacity, UErrorCode &errorCode); static void copyData(const int32_t indexes[], int32_t startIndex, const void *src, uint8_t *dest); }; U_NAMESPACE_END #endif // !UCONFIG_NO_COLLATION #endif // __COLLATIONDATAWRITER_H__ stringi/src/icu74/i18n/sharedbreakiterator.h0000644000176200001440000000325014700200761020425 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others. 
// License & terms of use: http://www.unicode.org/copyright.html /* ****************************************************************************** * Copyright (C) 2014, International Business Machines * Corporation and others. All Rights Reserved. ****************************************************************************** * sharedbreakiterator.h */ #ifndef __SHARED_BREAKITERATOR_H__ #define __SHARED_BREAKITERATOR_H__ #include "unicode/utypes.h" #include "sharedobject.h" #if !UCONFIG_NO_BREAK_ITERATION U_NAMESPACE_BEGIN class BreakIterator; // SharedBreakIterator encapsulates a shared BreakIterator. Because // BreakIterator has mutable semantics, clients must ensure that all uses // of a particular shared BreakIterator is protected by the same mutex // ensuring that only one thread at a time gets access to that shared // BreakIterator. Clients can accomplish this by creating a mutex for all // uses of break iterator within a particular class. Then objects of that // class may then freely share break iterators among themselves. However, // these shared break iterators must never be exposed outside of that class. class U_I18N_API SharedBreakIterator : public SharedObject { public: SharedBreakIterator(BreakIterator *biToAdopt); virtual ~SharedBreakIterator(); BreakIterator *get() const { return ptr; } BreakIterator *operator->() const { return ptr; } BreakIterator &operator*() const { return *ptr; } private: BreakIterator *ptr; SharedBreakIterator(const SharedBreakIterator &) = delete; SharedBreakIterator &operator=(const SharedBreakIterator &) = delete; }; U_NAMESPACE_END #endif #endif stringi/src/icu74/i18n/collationiterator.cpp0000644000176200001440000011136214700200761020475 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others. 
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
* Copyright (C) 2010-2014, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* collationiterator.cpp
*
* created on: 2010oct27
* created by: Markus W. Scherer
*/

#include "utypeinfo.h"  // for 'typeid' to work

#include "unicode/utypes.h"

#if !UCONFIG_NO_COLLATION

#include "unicode/ucharstrie.h"
#include "unicode/ustringtrie.h"
#include "charstr.h"
#include "cmemory.h"
#include "collation.h"
#include "collationdata.h"
#include "collationfcd.h"
#include "collationiterator.h"
#include "normalizer2impl.h"
#include "uassert.h"
#include "uvectr32.h"

U_NAMESPACE_BEGIN

CollationIterator::CEBuffer::~CEBuffer() {}

// Makes sure that appCap more CEs fit after the current length.
// Returns true if the capacity already suffices (this is checked before errorCode
// is looked at) or if the buffer was grown successfully.
// Returns false if errorCode already indicates a failure, or if the resize fails,
// in which case errorCode is set to U_MEMORY_ALLOCATION_ERROR.
// Growth policy: multiply by 4 while the capacity is below 1000, otherwise by 2,
// repeated until the requested total fits.
UBool
CollationIterator::CEBuffer::ensureAppendCapacity(int32_t appCap, UErrorCode &errorCode) {
    int32_t capacity = buffer.getCapacity();
    if((length + appCap) <= capacity) { return true; }
    if(U_FAILURE(errorCode)) { return false; }
    do {
        if(capacity < 1000) {
            capacity *= 4;
        } else {
            capacity *= 2;
        }
    } while(capacity < (length + appCap));
    // resize(newCapacity, length) is passed the current length so that the existing
    // CEs can be kept -- NOTE(review): exact resize() contract is declared elsewhere.
    int64_t *p = buffer.resize(capacity, length);
    if(p == nullptr) {
        errorCode = U_MEMORY_ALLOCATION_ERROR;
        return false;
    }
    return true;
}

// State of combining marks skipped in discontiguous contraction.
// We create a state object on first use and keep it around deactivated between uses.
class SkippedState : public UMemory {
public:
    // Born active but empty.
    SkippedState() : pos(0), skipLengthAtMatch(0) {}
    // Empties oldBuffer and rewinds the reading position.
    void clear() {
        oldBuffer.remove();
        pos = 0;
        // The newBuffer is reset by setFirstSkipped().
    }

    // True if there are no previously skipped marks to re-read.
    UBool isEmpty() const { return oldBuffer.isEmpty(); }

    // True while the reading position is still inside oldBuffer.
    UBool hasNext() const { return pos < oldBuffer.length(); }

    // Requires hasNext().
    // Returns the code point at pos and advances pos by its UTF-16 length.
    UChar32 next() {
        UChar32 c = oldBuffer.char32At(pos);
        pos += U16_LENGTH(c);
        return c;
    }

    // Accounts for one more input code point read beyond the end of the marks buffer.
    void incBeyond() {
        U_ASSERT(!hasNext());
        ++pos;
    }

    // Goes backward through the skipped-marks buffer.
    // Returns the number of code points read beyond the skipped marks
    // that need to be backtracked through normal input.
    int32_t backwardNumCodePoints(int32_t n) {
        int32_t length = oldBuffer.length();
        // Positive only when pos has been advanced past the end via incBeyond().
        int32_t beyond = pos - length;
        if(beyond > 0) {
            if(beyond >= n) {
                // Not back far enough to re-enter the oldBuffer.
                pos -= n;
                return n;
            } else {
                // Back out all beyond-oldBuffer code points and re-enter the buffer.
                pos = oldBuffer.moveIndex32(length, beyond - n);
                return beyond;
            }
        } else {
            // Go backwards from inside the oldBuffer.
            pos = oldBuffer.moveIndex32(pos, -n);
            return 0;
        }
    }

    // Starts a new skipped sequence: newBuffer becomes just c and no match is recorded yet.
    void setFirstSkipped(UChar32 c) {
        skipLengthAtMatch = 0;
        newBuffer.setTo(c);
    }

    // Appends one more skipped code point to newBuffer.
    void skip(UChar32 c) {
        newBuffer.append(c);
    }

    // Remembers how much of newBuffer had been skipped when the latest match occurred.
    void recordMatch() { skipLengthAtMatch = newBuffer.length(); }

    // Replaces the characters we consumed with the newly skipped ones.
    void replaceMatch() {
        // Note: UnicodeString.replace() pins pos to at most length().
        oldBuffer.replace(0, pos, newBuffer, 0, skipLengthAtMatch);
        pos = 0;
    }

    // Save/restore the trie position so that a failed match attempt can be undone.
    void saveTrieState(const UCharsTrie &trie) { trie.saveState(state); }
    void resetToTrieState(UCharsTrie &trie) const { trie.resetToState(state); }

private:
    // Combining marks skipped in previous discontiguous-contraction matching.
    // After that discontiguous contraction was completed, we start reading them from here.
    UnicodeString oldBuffer;
    // Combining marks newly skipped in current discontiguous-contraction matching.
    // These might have been read from the normal text or from the oldBuffer.
    UnicodeString newBuffer;
    // Reading index in oldBuffer,
    // or counter for how many code points have been read beyond oldBuffer (pos-oldBuffer.length()).
    int32_t pos;
    // newBuffer.length() at the time of the last matching character.
    // When a partial match fails, we back out skipped and partial-matching input characters.
    int32_t skipLengthAtMatch;
    // We save the trie state before we attempt to match a character,
    // so that we can skip it and try the next one.
    UCharsTrie::State state;
};

// Copy constructor.
// Copies trie, data, cesIndex, numCpFwd and isNumeric; the skipped state is not copied
// (it starts out as nullptr).
// The buffered CEs are copied only if other has some and the capacity can be ensured;
// in every other case (including an empty source buffer) cesIndex is reset to 0.
// A failure of ensureAppendCapacity() is not reported to the caller: errorCode is local.
CollationIterator::CollationIterator(const CollationIterator &other)
        : UObject(other),
          trie(other.trie),
          data(other.data),
          cesIndex(other.cesIndex),
          skipped(nullptr),
          numCpFwd(other.numCpFwd),
          isNumeric(other.isNumeric) {
    UErrorCode errorCode = U_ZERO_ERROR;
    int32_t length = other.ceBuffer.length;
    if(length > 0 && ceBuffer.ensureAppendCapacity(length, errorCode)) {
        for(int32_t i = 0; i < length; ++i) {
            ceBuffer.set(i, other.ceBuffer.get(i));
        }
        ceBuffer.length = length;
    } else {
        cesIndex = 0;
    }
}

// Releases the lazily created skipped-marks state (delete of nullptr is a no-op).
CollationIterator::~CollationIterator() {
    delete skipped;
}

// Compares dynamic type, buffered CEs, cesIndex, numCpFwd and isNumeric.
bool
CollationIterator::operator==(const CollationIterator &other) const {
    // Subclasses: Call this method and then add more specific checks.
    // Compare the iterator state but not the collation data (trie & data fields):
    // Assume that the caller compares the data.
    // Ignore skipped since that should be unused between calls to nextCE().
    // (It only stays around to avoid another memory allocation.)
    if(!(typeid(*this) == typeid(other) &&
            ceBuffer.length == other.ceBuffer.length &&
            cesIndex == other.cesIndex &&
            numCpFwd == other.numCpFwd &&
            isNumeric == other.isNumeric)) {
        return false;
    }
    for(int32_t i = 0; i < ceBuffer.length; ++i) {
        if(ceBuffer.get(i) != other.ceBuffer.get(i)) { return false; }
    }
    return true;
}

// Discards all buffered CEs and clears the skipped-marks state if one exists.
void
CollationIterator::reset() {
    cesIndex = ceBuffer.length = 0;
    if(skipped != nullptr) { skipped->clear(); }
}

// Calls nextCE() repeatedly until it yields Collation::NO_CE or errorCode fails.
int32_t
CollationIterator::fetchCEs(UErrorCode &errorCode) {
    while(U_SUCCESS(errorCode) && nextCE(errorCode) != Collation::NO_CE) {
        // No need to loop for each expansion CE.
cesIndex = ceBuffer.length;
    }
    return ceBuffer.length;
}

// Default implementation: reads the next code point into c and looks up its CE32 in data.
// Returns Collation::FALLBACK_CE32 when nextCodePoint() yields a negative value.
uint32_t
CollationIterator::handleNextCE32(UChar32 &c, UErrorCode &errorCode) {
    c = nextCodePoint(errorCode);
    return (c < 0) ? Collation::FALLBACK_CE32 : data->getCE32(c);
}

// Default implementation: reports "no trail surrogate" by returning 0.
char16_t
CollationIterator::handleGetTrailSurrogate() {
    return 0;
}

// Default implementation: never treats U+0000 as a terminator.
UBool
CollationIterator::foundNULTerminator() {
    return false;
}

// Default implementation: surrogate code points are not forbidden.
UBool
CollationIterator::forbidSurrogateCodePoints() const {
    return false;
}

// Looks up c in this iterator's own data (no fallback to the base data here).
uint32_t
CollationIterator::getDataCE32(UChar32 c) const {
    return data->getCE32(c);
}

// Default implementation: builder data is not supported here.
// Sets U_INTERNAL_PROGRAM_ERROR unless an error is already pending, and returns 0.
uint32_t
CollationIterator::getCE32FromBuilderData(uint32_t /*ce32*/, UErrorCode &errorCode) {
    if(U_SUCCESS(errorCode)) { errorCode = U_INTERNAL_PROGRAM_ERROR; }
    return 0;
}

// Expands ce32 into the ceBuffer (forward direction) and returns the CE at cesIndex,
// advancing cesIndex. Returns Collation::NO_CE_PRIMARY if errorCode indicates failure.
int64_t
CollationIterator::nextCEFromCE32(const CollationData *d, UChar32 c, uint32_t ce32,
                                  UErrorCode &errorCode) {
    --ceBuffer.length;  // Undo ceBuffer.incLength().
    appendCEsFromCE32(d, c, ce32, true, errorCode);
    if(U_SUCCESS(errorCode)) {
        return ceBuffer.get(cesIndex++);
    } else {
        return Collation::NO_CE_PRIMARY;
    }
}

// Appends the CE(s) for ce32 to the ceBuffer.
// While ce32 is "special", dispatches on its tag: each case either appends its CEs
// and returns, or computes a replacement ce32 (possibly switching d to the base data)
// and breaks out of the switch to go around the loop again.
// Once ce32 is no longer special, the simple CE is appended after the loop.
// forward tells whether we iterate forward (contractions, NUL termination and
// lead surrogates are only handled in that direction).
void
CollationIterator::appendCEsFromCE32(const CollationData *d, UChar32 c, uint32_t ce32,
                                     UBool forward, UErrorCode &errorCode) {
    while(Collation::isSpecialCE32(ce32)) {
        switch(Collation::tagFromCE32(ce32)) {
        case Collation::FALLBACK_TAG:
        case Collation::RESERVED_TAG_3:
            if(U_SUCCESS(errorCode)) { errorCode = U_INTERNAL_PROGRAM_ERROR; }
            return;
        case Collation::LONG_PRIMARY_TAG:
            ceBuffer.append(Collation::ceFromLongPrimaryCE32(ce32), errorCode);
            return;
        case Collation::LONG_SECONDARY_TAG:
            ceBuffer.append(Collation::ceFromLongSecondaryCE32(ce32), errorCode);
            return;
        case Collation::LATIN_EXPANSION_TAG:
            if(ceBuffer.ensureAppendCapacity(2, errorCode)) {
                ceBuffer.set(ceBuffer.length, Collation::latinCE0FromCE32(ce32));
                ceBuffer.set(ceBuffer.length + 1, Collation::latinCE1FromCE32(ce32));
                ceBuffer.length += 2;
            }
            return;
        case Collation::EXPANSION32_TAG: {
            const uint32_t *ce32s = d->ce32s + Collation::indexFromCE32(ce32);
            int32_t length = Collation::lengthFromCE32(ce32);
            if(ceBuffer.ensureAppendCapacity(length, errorCode)) {
                // do-while: at least one CE is appended (assumes length >= 1 -- TODO confirm).
                do {
                    ceBuffer.appendUnsafe(Collation::ceFromCE32(*ce32s++));
                } while(--length > 0);
            }
            return;
        }
        case Collation::EXPANSION_TAG: {
            const int64_t *ces = d->ces + Collation::indexFromCE32(ce32);
            int32_t length = Collation::lengthFromCE32(ce32);
            if(ceBuffer.ensureAppendCapacity(length, errorCode)) {
                do {
                    ceBuffer.appendUnsafe(*ces++);
                } while(--length > 0);
            }
            return;
        }
        case Collation::BUILDER_DATA_TAG:
            ce32 = getCE32FromBuilderData(ce32, errorCode);
            if(U_FAILURE(errorCode)) { return; }
            if(ce32 == Collation::FALLBACK_CE32) {
                d = data->base;
                ce32 = d->getCE32(c);
            }
            break;
        case Collation::PREFIX_TAG:
            // Going forward, c has already been read: step back over it for the
            // prefix lookup and forward again afterwards.
            if(forward) { backwardNumCodePoints(1, errorCode); }
            ce32 = getCE32FromPrefix(d, ce32, errorCode);
            if(forward) { forwardNumCodePoints(1, errorCode); }
            break;
        case Collation::CONTRACTION_TAG: {
            const char16_t *p = d->contexts + Collation::indexFromCE32(ce32);
            uint32_t defaultCE32 = CollationData::readCE32(p);  // Default if no suffix match.
            if(!forward) {
                // Backward contractions are handled by previousCEUnsafe().
                // c has contractions but they were not found.
                ce32 = defaultCE32;
                break;
            }
            UChar32 nextCp;
            if(skipped == nullptr && numCpFwd < 0) {
                // Some portion of nextCE32FromContraction() pulled out here as an ASCII fast path,
                // avoiding the function call and the nextSkippedCodePoint() overhead.
                nextCp = nextCodePoint(errorCode);
                if(nextCp < 0) {
                    // No more text.
                    ce32 = defaultCE32;
                    break;
                } else if((ce32 & Collation::CONTRACT_NEXT_CCC) != 0 &&
                        !CollationFCD::mayHaveLccc(nextCp)) {
                    // All contraction suffixes start with characters with lccc!=0
                    // but the next code point has lccc==0.
                    backwardNumCodePoints(1, errorCode);
                    ce32 = defaultCE32;
                    break;
                }
            } else {
                nextCp = nextSkippedCodePoint(errorCode);
                if(nextCp < 0) {
                    // No more text.
                    ce32 = defaultCE32;
                    break;
                } else if((ce32 & Collation::CONTRACT_NEXT_CCC) != 0 &&
                        !CollationFCD::mayHaveLccc(nextCp)) {
                    // All contraction suffixes start with characters with lccc!=0
                    // but the next code point has lccc==0.
                    backwardNumSkipped(1, errorCode);
                    ce32 = defaultCE32;
                    break;
                }
            }
            // p + 2: the suffix trie starts after the two units read by readCE32(p).
            ce32 = nextCE32FromContraction(d, ce32, p + 2, defaultCE32, nextCp, errorCode);
            if(ce32 == Collation::NO_CE32) {
                // CEs from a discontiguous contraction plus the skipped combining marks
                // have been appended already.
                return;
            }
            break;
        }
        case Collation::DIGIT_TAG:
            if(isNumeric) {
                appendNumericCEs(ce32, forward, errorCode);
                return;
            } else {
                // Fetch the non-numeric-collation CE32 and continue.
                ce32 = d->ce32s[Collation::indexFromCE32(ce32)];
                break;
            }
        case Collation::U0000_TAG:
            U_ASSERT(c == 0);
            if(forward && foundNULTerminator()) {
                // Handle NUL-termination. (Not needed in Java.)
                ceBuffer.append(Collation::NO_CE, errorCode);
                return;
            } else {
                // Fetch the normal ce32 for U+0000 and continue.
                ce32 = d->ce32s[0];
                break;
            }
        case Collation::HANGUL_TAG: {
            const uint32_t *jamoCE32s = d->jamoCE32s;
            // Decompose the syllable arithmetically: after these steps
            // c is the leading-consonant index, v the vowel index, t the trailing index.
            c -= Hangul::HANGUL_BASE;
            UChar32 t = c % Hangul::JAMO_T_COUNT;
            c /= Hangul::JAMO_T_COUNT;
            UChar32 v = c % Hangul::JAMO_V_COUNT;
            c /= Hangul::JAMO_V_COUNT;
            if((ce32 & Collation::HANGUL_NO_SPECIAL_JAMO) != 0) {
                // None of the Jamo CE32s are isSpecialCE32().
                // Avoid recursive function calls and per-Jamo tests.
                if(ceBuffer.ensureAppendCapacity(t == 0 ? 2 : 3, errorCode)) {
                    ceBuffer.set(ceBuffer.length, Collation::ceFromCE32(jamoCE32s[c]));
                    ceBuffer.set(ceBuffer.length + 1, Collation::ceFromCE32(jamoCE32s[19 + v]));
                    ceBuffer.length += 2;
                    if(t != 0) {
                        ceBuffer.appendUnsafe(Collation::ceFromCE32(jamoCE32s[39 + t]));
                    }
                }
                return;
            } else {
                // We should not need to compute each Jamo code point.
                // In particular, there should be no offset or implicit ce32.
                appendCEsFromCE32(d, U_SENTINEL, jamoCE32s[c], forward, errorCode);
                appendCEsFromCE32(d, U_SENTINEL, jamoCE32s[19 + v], forward, errorCode);
                if(t == 0) { return; }
                // offset 39 = 19 + 21 - 1:
                // 19 = JAMO_L_COUNT
                // 21 = JAMO_V_COUNT
                // -1 = omit t==0
                ce32 = jamoCE32s[39 + t];
                c = U_SENTINEL;
                break;
            }
        }
        case Collation::LEAD_SURROGATE_TAG: {
            U_ASSERT(forward);  // Backward iteration should never see lead surrogate code _unit_ data.
            U_ASSERT(U16_IS_LEAD(c));
            char16_t trail;
            if(U16_IS_TRAIL(trail = handleGetTrailSurrogate())) {
                c = U16_GET_SUPPLEMENTARY(c, trail);
                ce32 &= Collation::LEAD_TYPE_MASK;
                if(ce32 == Collation::LEAD_ALL_UNASSIGNED) {
                    ce32 = Collation::UNASSIGNED_CE32;  // unassigned-implicit
                } else if(ce32 == Collation::LEAD_ALL_FALLBACK ||
                        (ce32 = d->getCE32FromSupplementary(c)) == Collation::FALLBACK_CE32) {
                    // fall back to the base data
                    d = d->base;
                    ce32 = d->getCE32FromSupplementary(c);
                }
            } else {
                // c is an unpaired surrogate.
                ce32 = Collation::UNASSIGNED_CE32;
            }
            break;
        }
        case Collation::OFFSET_TAG:
            U_ASSERT(c >= 0);
            ceBuffer.append(d->getCEFromOffsetCE32(c, ce32), errorCode);
            return;
        case Collation::IMPLICIT_TAG:
            U_ASSERT(c >= 0);
            if(U_IS_SURROGATE(c) && forbidSurrogateCodePoints()) {
                ce32 = Collation::FFFD_CE32;
                break;
            } else {
                ceBuffer.append(Collation::unassignedCEFromCodePoint(c), errorCode);
                return;
            }
        }
    }
    ceBuffer.append(Collation::ceFromSimpleCE32(ce32), errorCode);
}

// Finds the CE32 for the longest matching prefix before the current code point.
// Reads backward through the text while the prefix trie can continue, remembers the
// value of the latest match, then moves forward again by the number of code points read.
uint32_t
CollationIterator::getCE32FromPrefix(const CollationData *d, uint32_t ce32,
                                     UErrorCode &errorCode) {
    const char16_t *p = d->contexts + Collation::indexFromCE32(ce32);
    ce32 = CollationData::readCE32(p);  // Default if no prefix match.
    p += 2;
    // Number of code points read before the original code point.
int32_t lookBehind = 0;
    UCharsTrie prefixes(p);
    for(;;) {
        UChar32 c = previousCodePoint(errorCode);
        if(c < 0) { break; }
        ++lookBehind;
        UStringTrieResult match = prefixes.nextForCodePoint(c);
        if(USTRINGTRIE_HAS_VALUE(match)) {
            // Remember the value of the longest prefix matched so far.
            ce32 = (uint32_t)prefixes.getValue();
        }
        if(!USTRINGTRIE_HAS_NEXT(match)) { break; }
    }
    // Restore the text position: undo every backward read made above.
    forwardNumCodePoints(lookBehind, errorCode);
    return ce32;
}

// Returns the next code point for contraction matching:
// first from the skipped-marks buffer while it has more,
// then U_SENTINEL if the forward limit is exhausted (numCpFwd == 0),
// otherwise from the normal input. When reading normal input while skipped marks exist,
// the skipped state counts the code point as read "beyond" its buffer;
// a positive numCpFwd is decremented for each code point actually read.
UChar32
CollationIterator::nextSkippedCodePoint(UErrorCode &errorCode) {
    if(skipped != nullptr && skipped->hasNext()) { return skipped->next(); }
    if(numCpFwd == 0) { return U_SENTINEL; }
    UChar32 c = nextCodePoint(errorCode);
    if(skipped != nullptr && !skipped->isEmpty() && c >= 0) { skipped->incBeyond(); }
    if(numCpFwd > 0 && c >= 0) { --numCpFwd; }
    return c;
}

// Moves backward by n code points, first through the skipped-marks state (if any),
// then through the normal input for whatever remains,
// and gives those remaining code points back to a non-negative numCpFwd limit.
void
CollationIterator::backwardNumSkipped(int32_t n, UErrorCode &errorCode) {
    if(skipped != nullptr && !skipped->isEmpty()) {
        n = skipped->backwardNumCodePoints(n);
    }
    backwardNumCodePoints(n, errorCode);
    if(numCpFwd >= 0) { numCpFwd += n; }
}

// Matches contraction suffixes starting with c against the trie at p.
// ce32 is the default result; it is replaced by the value of each longer match.
// Returns the CE32 of the longest match, after backing up over code points read past it,
// or delegates to nextCE32FromDiscontiguousContraction() when the contraction flags
// and the FCD data of c allow a discontiguous match.
uint32_t
CollationIterator::nextCE32FromContraction(const CollationData *d, uint32_t contractionCE32,
                                           const char16_t *p, uint32_t ce32, UChar32 c,
                                           UErrorCode &errorCode) {
    // c: next code point after the original one

    // Number of code points read beyond the original code point.
    // Needed for discontiguous contraction matching.
    int32_t lookAhead = 1;
    // Number of code points read since the last match (initially only c).
    int32_t sinceMatch = 1;
    // Normally we only need a contiguous match,
    // and therefore need not remember the suffixes state from before a mismatch for retrying.
    // If we are already processing skipped combining marks, then we do track the state.
    UCharsTrie suffixes(p);
    if(skipped != nullptr && !skipped->isEmpty()) { skipped->saveTrieState(suffixes); }
    UStringTrieResult match = suffixes.firstForCodePoint(c);
    for(;;) {
        UChar32 nextCp;
        if(USTRINGTRIE_HAS_VALUE(match)) {
            ce32 = (uint32_t)suffixes.getValue();
            if(!USTRINGTRIE_HAS_NEXT(match) || (c = nextSkippedCodePoint(errorCode)) < 0) {
                return ce32;
            }
            if(skipped != nullptr && !skipped->isEmpty()) { skipped->saveTrieState(suffixes); }
            sinceMatch = 1;
        } else if(match == USTRINGTRIE_NO_MATCH || (nextCp = nextSkippedCodePoint(errorCode)) < 0) {
            // No match for c, or partial match (USTRINGTRIE_NO_VALUE) and no further text.
            // Back up if necessary, and try a discontiguous contraction.
            if((contractionCE32 & Collation::CONTRACT_TRAILING_CCC) != 0 &&
                    // Discontiguous contraction matching extends an existing match.
                    // If there is no match yet, then there is nothing to do.
                    ((contractionCE32 & Collation::CONTRACT_SINGLE_CP_NO_MATCH) == 0 ||
                        sinceMatch < lookAhead)) {
                // The last character of at least one suffix has lccc!=0,
                // allowing for discontiguous contractions.
                // UCA S2.1.1 only processes non-starters immediately following
                // "a match in the table" (sinceMatch=1).
                if(sinceMatch > 1) {
                    // Return to the state after the last match.
                    // (Return to sinceMatch=0 and re-fetch the first partially-matched character.)
                    backwardNumSkipped(sinceMatch, errorCode);
                    c = nextSkippedCodePoint(errorCode);
                    lookAhead -= sinceMatch - 1;
                    sinceMatch = 1;
                }
                if(d->getFCD16(c) > 0xff) {
                    return nextCE32FromDiscontiguousContraction(
                        d, suffixes, ce32, lookAhead, c, errorCode);
                }
            }
            break;
        } else {
            // Continue after partial match (USTRINGTRIE_NO_VALUE) for c.
            // It does not have a result value, therefore it is not itself "a match in the table".
            // If a partially-matched c has ccc!=0 then
            // it might be skipped in discontiguous contraction.
            c = nextCp;
            ++sinceMatch;
        }
        ++lookAhead;
        match = suffixes.nextForCodePoint(c);
    }
    // Un-read everything that was consumed after the last match.
    backwardNumSkipped(sinceMatch, errorCode);
    return ce32;
}

// Tries to extend the current contraction match across skipped non-starters.
// Returns the resulting CE32, or Collation::NO_CE32 when the CEs for the contraction
// and the skipped combining marks have already been appended to the ceBuffer.
// Returns 0 if errorCode indicates a failure on entry.
uint32_t
CollationIterator::nextCE32FromDiscontiguousContraction(
        const CollationData *d, UCharsTrie &suffixes, uint32_t ce32,
        int32_t lookAhead, UChar32 c,
        UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) { return 0; }

    // UCA section 3.3.2 Contractions:
    // Contractions that end with non-starter characters
    // are known as discontiguous contractions.
    // ... discontiguous contractions must be detected in input text
    // whenever the final sequence of non-starter characters could be rearranged
    // so as to make a contiguous matching sequence that is canonically equivalent.

    // UCA: http://www.unicode.org/reports/tr10/#S2.1
    // S2.1 Find the longest initial substring S at each point that has a match in the table.
    // S2.1.1 If there are any non-starters following S, process each non-starter C.
    // S2.1.2 If C is not blocked from S, find if S + C has a match in the table.
    // Note: A non-starter in a string is called blocked
    // if there is another non-starter of the same canonical combining class or zero
    // between it and the last character of canonical combining class 0.
    // S2.1.3 If there is a match, replace S by S + C, and remove C.

    // First: Is a discontiguous contraction even possible?
    uint16_t fcd16 = d->getFCD16(c);
    U_ASSERT(fcd16 > 0xff);  // The caller checked this already, as a shortcut.
    UChar32 nextCp = nextSkippedCodePoint(errorCode);
    if(nextCp < 0) {
        // No further text.
        backwardNumSkipped(1, errorCode);
        return ce32;
    }
    ++lookAhead;
    // Low byte of the FCD value of c; compared below against the high byte of the next one.
    uint8_t prevCC = (uint8_t)fcd16;
    fcd16 = d->getFCD16(nextCp);
    if(fcd16 <= 0xff) {
        // The next code point after c is a starter (S2.1.1 "process each non-starter").
        backwardNumSkipped(2, errorCode);
        return ce32;
    }

    // We have read and matched (lookAhead-2) code points,
    // read non-matching c and peeked ahead at nextCp.
    // Return to the state before the mismatch and continue matching with nextCp.
if(skipped == nullptr || skipped->isEmpty()) {
        if(skipped == nullptr) {
            // Lazily create the skipped-marks state on first use.
            skipped = new SkippedState();
            if(skipped == nullptr) {
                errorCode = U_MEMORY_ALLOCATION_ERROR;
                return 0;
            }
        }
        suffixes.reset();
        if(lookAhead > 2) {
            // Replay the partial match so far.
            backwardNumCodePoints(lookAhead, errorCode);
            suffixes.firstForCodePoint(nextCodePoint(errorCode));
            for(int32_t i = 3; i < lookAhead; ++i) {
                suffixes.nextForCodePoint(nextCodePoint(errorCode));
            }
            // Skip c (which did not match) and nextCp (which we will try now).
            forwardNumCodePoints(2, errorCode);
        }
        skipped->saveTrieState(suffixes);
    } else {
        // Reset to the trie state before the failed match of c.
        skipped->resetToTrieState(suffixes);
    }

    skipped->setFirstSkipped(c);
    // Number of code points read since the last match (at this point: c and nextCp).
    int32_t sinceMatch = 2;
    c = nextCp;
    for(;;) {
        UStringTrieResult match;
        // "If C is not blocked from S, find if S + C has a match in the table." (S2.1.2)
        if(prevCC < (fcd16 >> 8) && USTRINGTRIE_HAS_VALUE(match = suffixes.nextForCodePoint(c))) {
            // "If there is a match, replace S by S + C, and remove C." (S2.1.3)
            // Keep prevCC unchanged.
            ce32 = (uint32_t)suffixes.getValue();
            sinceMatch = 0;
            skipped->recordMatch();
            if(!USTRINGTRIE_HAS_NEXT(match)) { break; }
            skipped->saveTrieState(suffixes);
        } else {
            // No match for "S + C", skip C.
            skipped->skip(c);
            skipped->resetToTrieState(suffixes);
            prevCC = (uint8_t)fcd16;
        }
        if((c = nextSkippedCodePoint(errorCode)) < 0) { break; }
        ++sinceMatch;
        fcd16 = d->getFCD16(c);
        if(fcd16 <= 0xff) {
            // The next code point after c is a starter (S2.1.1 "process each non-starter").
            break;
        }
    }
    // Un-read the code points consumed after the last match.
    backwardNumSkipped(sinceMatch, errorCode);
    // Must be sampled before replaceMatch(), which refills oldBuffer.
    UBool isTopDiscontiguous = skipped->isEmpty();
    skipped->replaceMatch();
    if(isTopDiscontiguous && !skipped->isEmpty()) {
        // We did get a match after skipping one or more combining marks,
        // and we are not in a recursive discontiguous contraction.
        // Append CEs from the contraction ce32
        // and then from the combining marks that we skipped before the match.
        c = U_SENTINEL;
        for(;;) {
            appendCEsFromCE32(d, c, ce32, true, errorCode);
            // Fetch CE32s for skipped combining marks from the normal data, with fallback,
            // rather than from the CollationData where we found the contraction.
            if(!skipped->hasNext()) { break; }
            c = skipped->next();
            ce32 = getDataCE32(c);
            if(ce32 == Collation::FALLBACK_CE32) {
                d = data->base;
                ce32 = d->getCE32(c);
            } else {
                d = data;
            }
            // Note: A nested discontiguous-contraction match
            // replaces consumed combining marks with newly skipped ones
            // and resets the reading position to the beginning.
        }
        skipped->clear();
        ce32 = Collation::NO_CE32;  // Signal to the caller that the result is in the ceBuffer.
    }
    return ce32;
}

// Numeric collation: collects the run of digits that starts with ce32
// (reading forward or backward depending on the forward flag),
// then appends CEs for the number in segments of at most 254 digits.
// Each digit value is stored as a raw byte 0..9 in the digits string (not as ASCII).
void
CollationIterator::appendNumericCEs(uint32_t ce32, UBool forward, UErrorCode &errorCode) {
    // Collect digits.
    CharString digits;
    if(forward) {
        for(;;) {
            char digit = Collation::digitFromCE32(ce32);
            digits.append(digit, errorCode);
            // Respect the forward iteration limit, if one is set.
            if(numCpFwd == 0) { break; }
            UChar32 c = nextCodePoint(errorCode);
            if(c < 0) { break; }
            ce32 = data->getCE32(c);
            if(ce32 == Collation::FALLBACK_CE32) {
                ce32 = data->base->getCE32(c);
            }
            if(!Collation::hasCE32Tag(ce32, Collation::DIGIT_TAG)) {
                // Not a digit: un-read it and stop.
                backwardNumCodePoints(1, errorCode);
                break;
            }
            if(numCpFwd > 0) { --numCpFwd; }
        }
    } else {
        for(;;) {
            char digit = Collation::digitFromCE32(ce32);
            digits.append(digit, errorCode);
            UChar32 c = previousCodePoint(errorCode);
            if(c < 0) { break; }
            ce32 = data->getCE32(c);
            if(ce32 == Collation::FALLBACK_CE32) {
                ce32 = data->base->getCE32(c);
            }
            if(!Collation::hasCE32Tag(ce32, Collation::DIGIT_TAG)) {
                // Not a digit: un-read it and stop.
                forwardNumCodePoints(1, errorCode);
                break;
            }
        }
        // Reverse the digit string.
        char *p = digits.data();
        char *q = p + digits.length() - 1;
        while(p < q) {
            char digit = *p;
            *p++ = *q;
            *q-- = digit;
        }
    }
    if(U_FAILURE(errorCode)) { return; }
    int32_t pos = 0;
    do {
        // Skip leading zeros.
while(pos < (digits.length() - 1) && digits[pos] == 0) { ++pos; }
        // Write a sequence of CEs for at most 254 digits at a time.
        int32_t segmentLength = digits.length() - pos;
        if(segmentLength > 254) { segmentLength = 254; }
        appendNumericSegmentCEs(digits.data() + pos, segmentLength, errorCode);
        pos += segmentLength;
    } while(U_SUCCESS(errorCode) && pos < digits.length());
}

// Appends the CE(s) encoding one segment of 1..254 digit values (raw 0..9 bytes).
// Numbers with at most 7 digits and a value up to 1042489 get a single dense
// two-, three- or four-byte primary; everything else is encoded as an exponent byte
// (number of digit pairs) followed by one byte per digit pair, three pairs per CE.
// The segment must not start with a zero unless it is a single digit.
void
CollationIterator::appendNumericSegmentCEs(const char *digits, int32_t length, UErrorCode &errorCode) {
    U_ASSERT(1 <= length && length <= 254);
    U_ASSERT(length == 1 || digits[0] != 0);
    uint32_t numericPrimary = data->numericPrimary;
    // Note: We use primary byte values 2..255: digits are not compressible.
    if(length <= 7) {
        // Very dense encoding for small numbers.
        int32_t value = digits[0];
        for(int32_t i = 1; i < length; ++i) {
            value = value * 10 + digits[i];
        }
        // Primary weight second byte values:
        //     74 byte values   2.. 75 for small numbers in two-byte primary weights.
        //     40 byte values  76..115 for medium numbers in three-byte primary weights.
        //     16 byte values 116..131 for large numbers in four-byte primary weights.
        //    124 byte values 132..255 for very large numbers with 4..127 digit pairs.
        int32_t firstByte = 2;
        int32_t numBytes = 74;
        if(value < numBytes) {
            // Two-byte primary for 0..73, good for day & month numbers etc.
            uint32_t primary = numericPrimary | ((firstByte + value) << 16);
            ceBuffer.append(Collation::makeCE(primary), errorCode);
            return;
        }
        value -= numBytes;
        firstByte += numBytes;
        numBytes = 40;
        if(value < numBytes * 254) {
            // Three-byte primary for 74..10233=74+40*254-1, good for year numbers and more.
            uint32_t primary = numericPrimary |
                ((firstByte + value / 254) << 16) | ((2 + value % 254) << 8);
            ceBuffer.append(Collation::makeCE(primary), errorCode);
            return;
        }
        value -= numBytes * 254;
        firstByte += numBytes;
        numBytes = 16;
        if(value < numBytes * 254 * 254) {
            // Four-byte primary for 10234..1042489=10234+16*254*254-1.
            uint32_t primary = numericPrimary | (2 + value % 254);
            value /= 254;
            primary |= (2 + value % 254) << 8;
            value /= 254;
            primary |= (firstByte + value % 254) << 16;
            ceBuffer.append(Collation::makeCE(primary), errorCode);
            return;
        }
        // original value > 1042489
    }
    U_ASSERT(length >= 7);

    // The second primary byte value 132..255 indicates the number of digit pairs (4..127),
    // then we generate primary bytes with those pairs.
    // Omit trailing 00 pairs.
    // Decrement the value for the last pair.

    // Set the exponent. 4 pairs->132, 5 pairs->133, ..., 127 pairs->255.
    int32_t numPairs = (length + 1) / 2;
    uint32_t primary = numericPrimary | ((132 - 4 + numPairs) << 16);
    // Find the length without trailing 00 pairs.
    // (Terminates because digits[0] != 0 for length > 1, as asserted above.)
    while(digits[length - 1] == 0 && digits[length - 2] == 0) {
        length -= 2;
    }
    // Read the first pair.
    uint32_t pair;
    int32_t pos;
    if(length & 1) {
        // Only "half a pair" if we have an odd number of digits.
        pair = digits[0];
        pos = 1;
    } else {
        pair = digits[0] * 10 + digits[1];
        pos = 2;
    }
    pair = 11 + 2 * pair;
    // Add the pairs of digits between pos and length.
    int32_t shift = 8;
    while(pos < length) {
        if(shift == 0) {
            // Every three pairs/bytes we need to store a 4-byte-primary CE
            // and start with a new CE with the '0' primary lead byte.
            primary |= pair;
            ceBuffer.append(Collation::makeCE(primary), errorCode);
            primary = numericPrimary;
            shift = 16;
        } else {
            primary |= pair << shift;
            shift -= 8;
        }
        pair = 11 + 2 * (digits[pos] * 10 + digits[pos + 1]);
        pos += 2;
    }
    // The last pair is stored decremented by one (see "Decrement the value for the last pair").
    primary |= (pair - 1) << shift;
    ceBuffer.append(Collation::makeCE(primary), errorCode);
}

// Returns the previous CE, iterating backward.
// Buffered CEs are returned last-to-first. Otherwise one code point is read backward:
// - Collation::NO_CE is returned at the start of the text,
// - "unsafe" code points are delegated to previousCEUnsafe(),
// - simple/long CE32s are converted directly,
// - anything else is expanded via appendCEsFromCE32() (backward mode), with offsets
//   filled in for expansions. Returns Collation::NO_CE_PRIMARY on failure.
int64_t
CollationIterator::previousCE(UVector32 &offsets, UErrorCode &errorCode) {
    if(ceBuffer.length > 0) {
        // Return the previous buffered CE.
        return ceBuffer.get(--ceBuffer.length);
    }
    offsets.removeAllElements();
    int32_t limitOffset = getOffset();
    UChar32 c = previousCodePoint(errorCode);
    if(c < 0) { return Collation::NO_CE; }
    if(data->isUnsafeBackward(c, isNumeric)) {
        return previousCEUnsafe(c, offsets, errorCode);
    }
    // Simple, safe-backwards iteration:
    // Get a CE going backwards, handle prefixes but no contractions.
    uint32_t ce32 = data->getCE32(c);
    const CollationData *d;
    if(ce32 == Collation::FALLBACK_CE32) {
        d = data->base;
        ce32 = d->getCE32(c);
    } else {
        d = data;
    }
    if(Collation::isSimpleOrLongCE32(ce32)) {
        return Collation::ceFromCE32(ce32);
    }
    appendCEsFromCE32(d, c, ce32, false, errorCode);
    if(U_SUCCESS(errorCode)) {
        if(ceBuffer.length > 1) {
            offsets.addElement(getOffset(), errorCode);
            // For an expansion, the offset of each non-initial CE is the limit offset,
            // consistent with forward iteration.
            while(offsets.size() <= ceBuffer.length) {
                offsets.addElement(limitOffset, errorCode);
            }
        }
        return ceBuffer.get(--ceBuffer.length);
    } else {
        return Collation::NO_CE_PRIMARY;
    }
}

// Backward iteration over an "unsafe" segment: moves back to the nearest safe code point,
// collects the segment's CEs by iterating forward under a code point limit (numCpFwd),
// records an offset per CE, moves back before the segment, and returns the last CE.
int64_t
CollationIterator::previousCEUnsafe(UChar32 c, UVector32 &offsets, UErrorCode &errorCode) {
    // We just move through the input counting safe and unsafe code points
    // without collecting the unsafe-backward substring into a buffer and
    // switching to it.
    // This is to keep the logic simple. Otherwise we would have to handle
    // prefix matching going before the backward buffer, switching
    // to iteration and back, etc.
    // In the most important case of iterating over a normal string,
    // reading from the string itself is already maximally fast.
    // The only drawback there is that after getting the CEs we always
    // skip backward to the safe character rather than switching out
    // of a backwardBuffer.
    // But this should not be the common case for previousCE(),
    // and correctness and maintainability are more important than
    // complex optimizations.
    // Find the first safe character before c.
    int32_t numBackward = 1;
    while((c = previousCodePoint(errorCode)) >= 0) {
        ++numBackward;
        if(!data->isUnsafeBackward(c, isNumeric)) {
            break;
        }
    }
    // Set the forward iteration limit.
    // Note: This counts code points.
    // We cannot enforce a limit in the middle of a surrogate pair or similar.
    numCpFwd = numBackward;
    // Reset the forward iterator.
    cesIndex = 0;
    U_ASSERT(ceBuffer.length == 0);
    // Go forward and collect the CEs.
    int32_t offset = getOffset();
    while(numCpFwd > 0) {
        // nextCE() normally reads one code point.
        // Contraction matching and digit specials read more and check numCpFwd.
        --numCpFwd;
        // Append one or more CEs to the ceBuffer.
        (void)nextCE(errorCode);
        U_ASSERT(U_FAILURE(errorCode) || ceBuffer.get(ceBuffer.length - 1) != Collation::NO_CE);
        // No need to loop for getting each expansion CE from nextCE().
        cesIndex = ceBuffer.length;
        // However, we need to write an offset for each CE.
        // This is for CollationElementIterator::getOffset() to return
        // intermediate offsets from the unsafe-backwards segment.
        U_ASSERT(offsets.size() < ceBuffer.length);
        offsets.addElement(offset, errorCode);
        // For an expansion, the offset of each non-initial CE is the limit offset,
        // consistent with forward iteration.
        offset = getOffset();
        while(offsets.size() < ceBuffer.length) {
            offsets.addElement(offset, errorCode);
        }
    }
    U_ASSERT(offsets.size() == ceBuffer.length);
    // End offset corresponding to just after the unsafe-backwards segment.
    offsets.addElement(offset, errorCode);
    // Reset the forward iteration limit
    // and move backward to before the segment for which we fetched CEs.
    numCpFwd = -1;
    backwardNumCodePoints(numBackward, errorCode);
    // Use the collected CEs and return the last one.
    cesIndex = 0;  // Avoid cesIndex > ceBuffer.length when that gets decremented.
if(U_SUCCESS(errorCode)) { return ceBuffer.get(--ceBuffer.length); } else { return Collation::NO_CE_PRIMARY; } } U_NAMESPACE_END #endif // !UCONFIG_NO_COLLATION stringi/src/icu74/i18n/fpositer.cpp0000644000176200001440000000534114700200761016571 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ****************************************************************************** * Copyright (C) 2009-2012, International Business Machines Corporation and * others. All Rights Reserved. ****************************************************************************** * Date Name Description * 12/14/09 doug Creation. ****************************************************************************** */ #include "unicode/utypes.h" #if !UCONFIG_NO_FORMATTING #include "unicode/fpositer.h" #include "cmemory.h" #include "uvectr32.h" U_NAMESPACE_BEGIN FieldPositionIterator::~FieldPositionIterator() { delete data; data = nullptr; pos = -1; } FieldPositionIterator::FieldPositionIterator() : data(nullptr), pos(-1) { } FieldPositionIterator::FieldPositionIterator(const FieldPositionIterator &rhs) : UObject(rhs), data(nullptr), pos(rhs.pos) { if (rhs.data) { UErrorCode status = U_ZERO_ERROR; data = new UVector32(status); data->assign(*rhs.data, status); if (status != U_ZERO_ERROR) { delete data; data = nullptr; pos = -1; } } } bool FieldPositionIterator::operator==(const FieldPositionIterator &rhs) const { if (&rhs == this) { return true; } if (pos != rhs.pos) { return false; } if (!data) { return rhs.data == nullptr; } return rhs.data ? data->operator==(*rhs.data) : false; } void FieldPositionIterator::setData(UVector32 *adopt, UErrorCode& status) { // Verify that adopt has valid data, and update status if it doesn't. 
if (U_SUCCESS(status)) { if (adopt) { if (adopt->size() == 0) { delete adopt; adopt = nullptr; } else if ((adopt->size() % 4) != 0) { status = U_ILLEGAL_ARGUMENT_ERROR; } else { for (int i = 2; i < adopt->size(); i += 4) { if (adopt->elementAti(i) >= adopt->elementAti(i+1)) { status = U_ILLEGAL_ARGUMENT_ERROR; break; } } } } } // We own the data, even if status is in error, so we need to delete it now // if we're not keeping track of it. if (!U_SUCCESS(status)) { delete adopt; return; } delete data; data = adopt; pos = adopt == nullptr ? -1 : 0; } UBool FieldPositionIterator::next(FieldPosition& fp) { if (pos == -1) { return false; } // Ignore the first element of the tetrad: used for field category pos++; fp.setField(data->elementAti(pos++)); fp.setBeginIndex(data->elementAti(pos++)); fp.setEndIndex(data->elementAti(pos++)); if (pos == data->size()) { pos = -1; } return true; } U_NAMESPACE_END #endif /* #if !UCONFIG_NO_FORMATTING */ stringi/src/icu74/i18n/japancal.cpp0000644000176200001440000002303514700200761016507 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ******************************************************************************* * Copyright (C) 2003-2009,2012,2016 International Business Machines Corporation and * others. All Rights Reserved. 
******************************************************************************* * * File JAPANCAL.CPP * * Modification History: * 05/16/2003 srl copied from buddhcal.cpp * */ #include "unicode/utypes.h" #if !UCONFIG_NO_FORMATTING #if U_PLATFORM_HAS_WINUWP_API == 0 #include // getenv() is not available in UWP env #else #ifndef WIN32_LEAN_AND_MEAN # define WIN32_LEAN_AND_MEAN #endif # define VC_EXTRALEAN # define NOUSER # define NOSERVICE # define NOIME # define NOMCX #include #endif #include "cmemory.h" #include "erarules.h" #include "japancal.h" #include "unicode/gregocal.h" #include "umutex.h" #include "uassert.h" #include "ucln_in.h" #include "cstring.h" static icu::EraRules * gJapaneseEraRules = nullptr; static icu::UInitOnce gJapaneseEraRulesInitOnce {}; static int32_t gCurrentEra = 0; U_CDECL_BEGIN static UBool japanese_calendar_cleanup() { if (gJapaneseEraRules) { delete gJapaneseEraRules; gJapaneseEraRules = nullptr; } gCurrentEra = 0; gJapaneseEraRulesInitOnce.reset(); return true; } U_CDECL_END U_NAMESPACE_BEGIN UOBJECT_DEFINE_RTTI_IMPLEMENTATION(JapaneseCalendar) static const int32_t kGregorianEpoch = 1970; // used as the default value of EXTENDED_YEAR static const char* TENTATIVE_ERA_VAR_NAME = "ICU_ENABLE_TENTATIVE_ERA"; // Export the following for use by test code. UBool JapaneseCalendar::enableTentativeEra() { // Although start date of next Japanese era is planned ahead, a name of // new era might not be available. This implementation allows tester to // check a new era without era names by settings below (in priority order). // By default, such tentative era is disabled. // 1. Environment variable ICU_ENABLE_TENTATIVE_ERA=true or false UBool includeTentativeEra = false; #if U_PLATFORM_HAS_WINUWP_API == 1 // UWP doesn't allow access to getenv(), but we can call GetEnvironmentVariableW to do the same thing. 
char16_t varName[26] = {}; u_charsToUChars(TENTATIVE_ERA_VAR_NAME, varName, static_cast(uprv_strlen(TENTATIVE_ERA_VAR_NAME))); WCHAR varValue[5] = {}; DWORD ret = GetEnvironmentVariableW(reinterpret_cast(varName), varValue, UPRV_LENGTHOF(varValue)); if ((ret == 4) && (_wcsicmp(varValue, L"true") == 0)) { includeTentativeEra = true; } #else char *envVarVal = getenv(TENTATIVE_ERA_VAR_NAME); if (envVarVal != nullptr && uprv_stricmp(envVarVal, "true") == 0) { includeTentativeEra = true; } #endif return includeTentativeEra; } // Initialize global Japanese era data static void U_CALLCONV initializeEras(UErrorCode &status) { gJapaneseEraRules = EraRules::createInstance("japanese", JapaneseCalendar::enableTentativeEra(), status); if (U_FAILURE(status)) { return; } gCurrentEra = gJapaneseEraRules->getCurrentEraIndex(); } static void init(UErrorCode &status) { umtx_initOnce(gJapaneseEraRulesInitOnce, &initializeEras, status); ucln_i18n_registerCleanup(UCLN_I18N_JAPANESE_CALENDAR, japanese_calendar_cleanup); } /* Some platforms don't like to export constants, like old Palm OS and some z/OS configurations. */ uint32_t JapaneseCalendar::getCurrentEra() { return gCurrentEra; } JapaneseCalendar::JapaneseCalendar(const Locale& aLocale, UErrorCode& success) : GregorianCalendar(aLocale, success) { init(success); setTimeInMillis(getNow(), success); // Call this again now that the vtable is set up properly. 
} JapaneseCalendar::~JapaneseCalendar() { } JapaneseCalendar::JapaneseCalendar(const JapaneseCalendar& source) : GregorianCalendar(source) { UErrorCode status = U_ZERO_ERROR; init(status); U_ASSERT(U_SUCCESS(status)); } JapaneseCalendar& JapaneseCalendar::operator= ( const JapaneseCalendar& right) { GregorianCalendar::operator=(right); return *this; } JapaneseCalendar* JapaneseCalendar::clone() const { return new JapaneseCalendar(*this); } const char *JapaneseCalendar::getType() const { return "japanese"; } int32_t JapaneseCalendar::getDefaultMonthInYear(int32_t eyear) { int32_t era = internalGetEra(); // TODO do we assume we can trust 'era'? What if it is denormalized? int32_t month = 0; // Find out if we are at the edge of an era int32_t eraStart[3] = { 0,0,0 }; UErrorCode status = U_ZERO_ERROR; gJapaneseEraRules->getStartDate(era, eraStart, status); U_ASSERT(U_SUCCESS(status)); if(eyear == eraStart[0]) { // Yes, we're in the first year of this era. return eraStart[1] // month -1; // return 0-based month } return month; } int32_t JapaneseCalendar::getDefaultDayInMonth(int32_t eyear, int32_t month) { int32_t era = internalGetEra(); int32_t day = 1; int32_t eraStart[3] = { 0,0,0 }; UErrorCode status = U_ZERO_ERROR; gJapaneseEraRules->getStartDate(era, eraStart, status); U_ASSERT(U_SUCCESS(status)); if(eyear == eraStart[0]) { if(month == eraStart[1] - 1) { return eraStart[2]; } } return day; } int32_t JapaneseCalendar::internalGetEra() const { return internalGet(UCAL_ERA, gCurrentEra); } int32_t JapaneseCalendar::handleGetExtendedYear() { // EXTENDED_YEAR in JapaneseCalendar is a Gregorian year // The default value of EXTENDED_YEAR is 1970 (Showa 45) int32_t year; if (newerField(UCAL_EXTENDED_YEAR, UCAL_YEAR) == UCAL_EXTENDED_YEAR && newerField(UCAL_EXTENDED_YEAR, UCAL_ERA) == UCAL_EXTENDED_YEAR) { year = internalGet(UCAL_EXTENDED_YEAR, kGregorianEpoch); } else { UErrorCode status = U_ZERO_ERROR; int32_t eraStartYear = 
gJapaneseEraRules->getStartYear(internalGet(UCAL_ERA, gCurrentEra), status); U_ASSERT(U_SUCCESS(status)); // extended year is a gregorian year, where 1 = 1AD, 0 = 1BC, -1 = 2BC, etc year = internalGet(UCAL_YEAR, 1) // pin to minimum of year 1 (first year) + eraStartYear // add gregorian starting year - 1; // Subtract one because year starts at 1 } return year; } void JapaneseCalendar::handleComputeFields(int32_t julianDay, UErrorCode& status) { //Calendar::timeToFields(theTime, quick, status); GregorianCalendar::handleComputeFields(julianDay, status); int32_t year = internalGet(UCAL_EXTENDED_YEAR); // Gregorian year int32_t eraIdx = gJapaneseEraRules->getEraIndex(year, internalGetMonth() + 1, internalGet(UCAL_DAY_OF_MONTH), status); internalSet(UCAL_ERA, eraIdx); internalSet(UCAL_YEAR, year - gJapaneseEraRules->getStartYear(eraIdx, status) + 1); } /* Disable pivoting */ UBool JapaneseCalendar::haveDefaultCentury() const { return false; } UDate JapaneseCalendar::defaultCenturyStart() const { return 0;// WRONG } int32_t JapaneseCalendar::defaultCenturyStartYear() const { return 0; } int32_t JapaneseCalendar::handleGetLimit(UCalendarDateFields field, ELimitType limitType) const { switch(field) { case UCAL_ERA: if (limitType == UCAL_LIMIT_MINIMUM || limitType == UCAL_LIMIT_GREATEST_MINIMUM) { return 0; } return gJapaneseEraRules->getNumberOfEras() - 1; // max known era, not gCurrentEra case UCAL_YEAR: { switch (limitType) { case UCAL_LIMIT_MINIMUM: case UCAL_LIMIT_GREATEST_MINIMUM: return 1; case UCAL_LIMIT_LEAST_MAXIMUM: return 1; case UCAL_LIMIT_COUNT: //added to avoid warning case UCAL_LIMIT_MAXIMUM: { UErrorCode status = U_ZERO_ERROR; int32_t eraStartYear = gJapaneseEraRules->getStartYear(gCurrentEra, status); U_ASSERT(U_SUCCESS(status)); return GregorianCalendar::handleGetLimit(UCAL_YEAR, UCAL_LIMIT_MAXIMUM) - eraStartYear; } default: return 1; // Error condition, invalid limitType } } default: return GregorianCalendar::handleGetLimit(field,limitType); } } int32_t 
JapaneseCalendar::getActualMaximum(UCalendarDateFields field, UErrorCode& status) const { if (field == UCAL_YEAR) { int32_t era = get(UCAL_ERA, status); if (U_FAILURE(status)) { return 0; // error case... any value } if (era == gJapaneseEraRules->getNumberOfEras() - 1) { // max known era, not gCurrentEra // TODO: Investigate what value should be used here - revisit after 4.0. return handleGetLimit(UCAL_YEAR, UCAL_LIMIT_MAXIMUM); } else { int32_t nextEraStart[3] = { 0,0,0 }; gJapaneseEraRules->getStartDate(era + 1, nextEraStart, status); int32_t nextEraYear = nextEraStart[0]; int32_t nextEraMonth = nextEraStart[1]; // 1-base int32_t nextEraDate = nextEraStart[2]; int32_t eraStartYear = gJapaneseEraRules->getStartYear(era, status); int32_t maxYear = nextEraYear - eraStartYear + 1; // 1-base if (nextEraMonth == 1 && nextEraDate == 1) { // Subtract 1, because the next era starts at Jan 1 maxYear--; } return maxYear; } } return GregorianCalendar::getActualMaximum(field, status); } U_NAMESPACE_END #endif stringi/src/icu74/i18n/sharedbreakiterator.cpp0000644000176200001440000000156714700200761020771 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ******************************************************************************* * Copyright (C) 2013-2014, International Business Machines Corporation and * * others. All Rights Reserved. 
* ******************************************************************************* * * File RELDATEFMTTEST.CPP * ******************************************************************************* */ #include "sharedbreakiterator.h" #include "unicode/brkiter.h" #if !UCONFIG_NO_BREAK_ITERATION U_NAMESPACE_BEGIN SharedBreakIterator::SharedBreakIterator( BreakIterator *biToAdopt) : ptr(biToAdopt) { } SharedBreakIterator::~SharedBreakIterator() { delete ptr; } U_NAMESPACE_END #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ stringi/src/icu74/i18n/number_multiplier.cpp0000644000176200001440000001106614700200761020475 0ustar liggesusers// © 2018 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html #include "unicode/utypes.h" #if !UCONFIG_NO_FORMATTING // Allow implicit conversion from char16_t* to UnicodeString for this file: // Helpful in toString methods and elsewhere. #define UNISTR_FROM_STRING_EXPLICIT #include "number_decnum.h" #include "number_types.h" #include "number_multiplier.h" #include "numparse_validators.h" #include "number_utils.h" #include "decNumber.h" using namespace icu; using namespace icu::number; using namespace icu::number::impl; using namespace icu::numparse::impl; Scale::Scale(int32_t magnitude, DecNum* arbitraryToAdopt) : fMagnitude(magnitude), fArbitrary(arbitraryToAdopt), fError(U_ZERO_ERROR) { if (fArbitrary != nullptr) { // Attempt to convert the DecNum to a magnitude multiplier. fArbitrary->normalize(); if (fArbitrary->getRawDecNumber()->digits == 1 && fArbitrary->getRawDecNumber()->lsu[0] == 1 && !fArbitrary->isNegative()) { // Success! 
fMagnitude += fArbitrary->getRawDecNumber()->exponent; delete fArbitrary; fArbitrary = nullptr; } } } Scale::Scale(const Scale& other) : fMagnitude(other.fMagnitude), fArbitrary(nullptr), fError(other.fError) { if (other.fArbitrary != nullptr) { UErrorCode localStatus = U_ZERO_ERROR; fArbitrary = new DecNum(*other.fArbitrary, localStatus); } } Scale& Scale::operator=(const Scale& other) { if (this == &other) { return *this; } // self-assignment: no-op fMagnitude = other.fMagnitude; if (other.fArbitrary != nullptr) { UErrorCode localStatus = U_ZERO_ERROR; fArbitrary = new DecNum(*other.fArbitrary, localStatus); } else { fArbitrary = nullptr; } fError = other.fError; return *this; } Scale::Scale(Scale&& src) noexcept : fMagnitude(src.fMagnitude), fArbitrary(src.fArbitrary), fError(src.fError) { // Take ownership away from src if necessary src.fArbitrary = nullptr; } Scale& Scale::operator=(Scale&& src) noexcept { fMagnitude = src.fMagnitude; if (fArbitrary != nullptr) { delete fArbitrary; } fArbitrary = src.fArbitrary; fError = src.fError; // Take ownership away from src if necessary src.fArbitrary = nullptr; return *this; } Scale::~Scale() { delete fArbitrary; } Scale Scale::none() { return {0, nullptr}; } Scale Scale::powerOfTen(int32_t power) { return {power, nullptr}; } Scale Scale::byDecimal(StringPiece multiplicand) { UErrorCode localError = U_ZERO_ERROR; LocalPointer decnum(new DecNum(), localError); if (U_FAILURE(localError)) { return {localError}; } decnum->setTo(multiplicand, localError); if (U_FAILURE(localError)) { return {localError}; } return {0, decnum.orphan()}; } Scale Scale::byDouble(double multiplicand) { UErrorCode localError = U_ZERO_ERROR; LocalPointer decnum(new DecNum(), localError); if (U_FAILURE(localError)) { return {localError}; } decnum->setTo(multiplicand, localError); if (U_FAILURE(localError)) { return {localError}; } return {0, decnum.orphan()}; } Scale Scale::byDoubleAndPowerOfTen(double multiplicand, int32_t power) { UErrorCode 
localError = U_ZERO_ERROR; LocalPointer decnum(new DecNum(), localError); if (U_FAILURE(localError)) { return {localError}; } decnum->setTo(multiplicand, localError); if (U_FAILURE(localError)) { return {localError}; } return {power, decnum.orphan()}; } void Scale::applyTo(impl::DecimalQuantity& quantity) const { quantity.adjustMagnitude(fMagnitude); if (fArbitrary != nullptr) { UErrorCode localStatus = U_ZERO_ERROR; quantity.multiplyBy(*fArbitrary, localStatus); } } void Scale::applyReciprocalTo(impl::DecimalQuantity& quantity) const { quantity.adjustMagnitude(-fMagnitude); if (fArbitrary != nullptr) { UErrorCode localStatus = U_ZERO_ERROR; quantity.divideBy(*fArbitrary, localStatus); } } void MultiplierFormatHandler::setAndChain(const Scale& multiplier, const MicroPropsGenerator* parent) { fMultiplier = multiplier; fParent = parent; } void MultiplierFormatHandler::processQuantity(DecimalQuantity& quantity, MicroProps& micros, UErrorCode& status) const { fParent->processQuantity(quantity, micros, status); fMultiplier.applyTo(quantity); } #endif /* #if !UCONFIG_NO_FORMATTING */ stringi/src/icu74/i18n/toupptrn.cpp0000644000176200001440000000312714700200761016631 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ********************************************************************** * Copyright (C) 2001-2007, International Business Machines * Corporation and others. All Rights Reserved. ********************************************************************** * Date Name Description * 05/24/01 aliu Creation. ********************************************************************** */ #include "unicode/utypes.h" #if !UCONFIG_NO_TRANSLITERATION #include "unicode/ustring.h" #include "unicode/uchar.h" #include "toupptrn.h" #include "ustr_imp.h" #include "cpputils.h" U_NAMESPACE_BEGIN UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UppercaseTransliterator) /** * Constructs a transliterator. 
*/
// Registers under the ID "Any-Upper" (9 characters) and hands
// ucase_toFullUpper to the CaseMapTransliterator base as the mapping function.
UppercaseTransliterator::UppercaseTransliterator() :
    CaseMapTransliterator(UNICODE_STRING("Any-Upper", 9), ucase_toFullUpper)
{
}

/**
 * Destructor.
 */
UppercaseTransliterator::~UppercaseTransliterator() {
}

/**
 * Copy constructor. Copying is delegated entirely to the base class.
 */
UppercaseTransliterator::UppercaseTransliterator(const UppercaseTransliterator& o) :
    CaseMapTransliterator(o)
{
}

/**
 * Assignment operator (disabled: the definition below is commented out).
 */
/*UppercaseTransliterator& UppercaseTransliterator::operator=(
                             const UppercaseTransliterator& o) {
    CaseMapTransliterator::operator=(o);
    return *this;
}*/

/**
 * Transliterator API.
 * @return a copy of this object allocated with new.
 */
UppercaseTransliterator* UppercaseTransliterator::clone() const {
    return new UppercaseTransliterator(*this);
}

U_NAMESPACE_END

#endif /* #if !UCONFIG_NO_TRANSLITERATION */
stringi/src/icu74/i18n/winnmfmt.h0000644000176200001440000001365214700200761016246 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
********************************************************************************
*   Copyright (C) 2005-2015, International Business Machines
*   Corporation and others.  All Rights Reserved.
********************************************************************************
*
* File WINNMFMT.H
*
********************************************************************************
*/

#ifndef __WINNMFMT
#define __WINNMFMT

#include "unicode/utypes.h"

#if U_PLATFORM_USES_ONLY_WIN32_API

#include "unicode/format.h"
#include "unicode/datefmt.h"
#include "unicode/calendar.h"
#include "unicode/ustring.h"
#include "unicode/locid.h"

#if !UCONFIG_NO_FORMATTING

/**
 * \file
 * \brief C++ API: Format numbers using Windows API.
*/

U_NAMESPACE_BEGIN

union FormatInfo;

/**
 * NumberFormat subclass that, per the file brief above, formats numbers
 * using the Windows API. Only declared when U_PLATFORM_USES_ONLY_WIN32_API
 * is set and formatting is not configured out.
 */
class Win32NumberFormat : public NumberFormat
{
public:
    // NOTE(review): `currency` presumably selects currency formatting rather
    // than plain number formatting -- confirm against the implementation file.
    Win32NumberFormat(const Locale &locale, UBool currency, UErrorCode &status);

    Win32NumberFormat(const Win32NumberFormat &other);

    virtual ~Win32NumberFormat();

    virtual Win32NumberFormat *clone() const;

    Win32NumberFormat &operator=(const Win32NumberFormat &other);

    /**
     * Format a double number. Concrete subclasses must implement
     * these pure virtual methods.
     *
     * @param number    The value to be formatted.
     * @param appendTo  Output parameter to receive result.
     *                  Result is appended to existing contents.
     * @param pos       On input: an alignment field, if desired.
     *                  On output: the offsets of the alignment field.
     * @return          Reference to 'appendTo' parameter.
     */
    virtual UnicodeString& format(double number,
                                  UnicodeString& appendTo,
                                  FieldPosition& pos) const;
    /**
     * Format a long number. Concrete subclasses must implement
     * these pure virtual methods.
     *
     * @param number    The value to be formatted.
     * @param appendTo  Output parameter to receive result.
     *                  Result is appended to existing contents.
     * @param pos       On input: an alignment field, if desired.
     *                  On output: the offsets of the alignment field.
     * @return          Reference to 'appendTo' parameter.
     */
    virtual UnicodeString& format(int32_t number,
                                  UnicodeString& appendTo,
                                  FieldPosition& pos) const;

    /**
     * Format an int64 number.
     *
     * @param number    The value to be formatted.
     * @param appendTo  Output parameter to receive result.
     *                  Result is appended to existing contents.
     * @param pos       On input: an alignment field, if desired.
     *                  On output: the offsets of the alignment field.
     * @return          Reference to 'appendTo' parameter.
     */
    virtual UnicodeString& format(int64_t number,
                                  UnicodeString& appendTo,
                                  FieldPosition& pos) const;

    // Re-expose the remaining NumberFormat::format overloads, which the
    // overrides above would otherwise hide.
    using NumberFormat::format;

// Use the default behavior for the following.
//    virtual UnicodeString &format(double number, UnicodeString &appendTo) const;
//    virtual UnicodeString &format(int32_t number, UnicodeString &appendTo) const;
//    virtual UnicodeString &format(int64_t number, UnicodeString &appendTo) const;

    virtual void parse(const UnicodeString& text, Formattable& result, ParsePosition& parsePosition) const;

    /**
     * Sets the maximum number of digits allowed in the fraction portion of a
     * number. maximumFractionDigits must be >= minimumFractionDigits.  If the
     * new value for maximumFractionDigits is less than the current value
     * of minimumFractionDigits, then minimumFractionDigits will also be set to
     * the new value.
     * @param newValue    the new value to be set.
     * @see getMaximumFractionDigits
     */
    virtual void setMaximumFractionDigits(int32_t newValue);

    /**
     * Sets the minimum number of digits allowed in the fraction portion of a
     * number. minimumFractionDigits must be <= maximumFractionDigits.   If the
     * new value for minimumFractionDigits exceeds the current value
     * of maximumFractionDigits, then maximumFractionDigits will also be set to
     * the new value.
     * @param newValue    the new value to be set.
     * @see getMinimumFractionDigits
     */
    virtual void setMinimumFractionDigits(int32_t newValue);

    /**
     * Return the class ID for this class. This is useful only for comparing to
     * a return value from getDynamicClassID(). For example:
     * <pre>
     * .   Base* polymorphic_pointer = createPolymorphicObject();
     * .   if (polymorphic_pointer->getDynamicClassID() ==
     * .       derived::getStaticClassID()) ...
     * </pre>
     * @return          The class ID for all objects of this class.
     */
    U_I18N_API static UClassID U_EXPORT2 getStaticClassID();

    /**
     * Returns a unique class ID POLYMORPHICALLY. Pure virtual override. This
     * method is to implement a simple version of RTTI, since not all C++
     * compilers support genuine RTTI. Polymorphic operator==() and clone()
     * methods call this method.
     *
     * @return          The class ID for this object. All objects of a
     *                  given class have the same class ID.  Objects of
     *                  other classes have different class IDs.
     */
    virtual UClassID getDynamicClassID() const;

private:
    // Private printf-style helper overload (variadic, wide-character format
    // string); distinct from the public format() overrides above.
    UnicodeString &format(int32_t numDigits, UnicodeString &appendTo, const wchar_t *format, ...) const;

    UBool fCurrency;
    Locale fLocale;
    int32_t fLCID;
    FormatInfo *fFormatInfo;
    UBool fFractionDigitsSet;

    UnicodeString* fWindowsLocaleName; // Stores the equivalent Windows locale name.
};

U_NAMESPACE_END

#endif /* #if !UCONFIG_NO_FORMATTING */

#endif // U_PLATFORM_USES_ONLY_WIN32_API

#endif // __WINNMFMT
stringi/src/icu74/i18n/double-conversion-string-to-double.cpp0000644000176200001440000006770614700200761023604 0ustar liggesusers// © 2018 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
//
// From the double-conversion library. Original license:
//
// Copyright 2010 the V8 project authors. All rights reserved.
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
//       notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
//       copyright notice, this list of conditions and the following
//       disclaimer in the documentation and/or other materials provided
//       with the distribution.
//     * Neither the name of Google Inc.
nor the names of its // contributors may be used to endorse or promote products derived // from this software without specific prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // ICU PATCH: ifdef around UCONFIG_NO_FORMATTING #include "unicode/utypes.h" #if !UCONFIG_NO_FORMATTING // ICU PATCH: Do not include std::locale. #include // #include #include // ICU PATCH: Customize header file paths for ICU. #include "double-conversion-string-to-double.h" #include "double-conversion-ieee.h" #include "double-conversion-strtod.h" #include "double-conversion-utils.h" // ICU PATCH: Wrap in ICU namespace U_NAMESPACE_BEGIN #ifdef _MSC_VER # if _MSC_VER >= 1900 // Fix MSVC >= 2015 (_MSC_VER == 1900) warning // C4244: 'argument': conversion from 'const uc16' to 'char', possible loss of data // against Advance and friends, when instantiated with **it as char, not uc16. 
__pragma(warning(disable: 4244)) # endif # if _MSC_VER <= 1700 // VS2012, see IsDecimalDigitForRadix warning fix, below # define VS2012_RADIXWARN # endif #endif namespace double_conversion { namespace { inline char ToLower(char ch) { #if 0 // do not include std::locale in ICU static const std::ctype& cType = std::use_facet >(std::locale::classic()); return cType.tolower(ch); #else (void)ch; DOUBLE_CONVERSION_UNREACHABLE(); #endif } inline char Pass(char ch) { return ch; } template static inline bool ConsumeSubStringImpl(Iterator* current, Iterator end, const char* substring, Converter converter) { DOUBLE_CONVERSION_ASSERT(converter(**current) == *substring); for (substring++; *substring != '\0'; substring++) { ++*current; if (*current == end || converter(**current) != *substring) { return false; } } ++*current; return true; } // Consumes the given substring from the iterator. // Returns false, if the substring does not match. template static bool ConsumeSubString(Iterator* current, Iterator end, const char* substring, bool allow_case_insensitivity) { if (allow_case_insensitivity) { return ConsumeSubStringImpl(current, end, substring, ToLower); } else { return ConsumeSubStringImpl(current, end, substring, Pass); } } // Consumes first character of the str is equal to ch inline bool ConsumeFirstCharacter(char ch, const char* str, bool case_insensitivity) { return case_insensitivity ? ToLower(ch) == str[0] : ch == str[0]; } } // namespace // Maximum number of significant digits in decimal representation. // The longest possible double in decimal representation is // (2^53 - 1) * 2 ^ -1074 that is (2 ^ 53 - 1) * 5 ^ 1074 / 10 ^ 1074 // (768 digits). If we parse a number whose first digits are equal to a // mean of 2 adjacent doubles (that could have up to 769 digits) the result // must be rounded to the bigger one unless the tail consists of zeros, so // we don't need to preserve all the digits. 
const int kMaxSignificantDigits = 772; static const char kWhitespaceTable7[] = { 32, 13, 10, 9, 11, 12 }; static const int kWhitespaceTable7Length = DOUBLE_CONVERSION_ARRAY_SIZE(kWhitespaceTable7); static const uc16 kWhitespaceTable16[] = { 160, 8232, 8233, 5760, 6158, 8192, 8193, 8194, 8195, 8196, 8197, 8198, 8199, 8200, 8201, 8202, 8239, 8287, 12288, 65279 }; static const int kWhitespaceTable16Length = DOUBLE_CONVERSION_ARRAY_SIZE(kWhitespaceTable16); static bool isWhitespace(int x) { if (x < 128) { for (int i = 0; i < kWhitespaceTable7Length; i++) { if (kWhitespaceTable7[i] == x) return true; } } else { for (int i = 0; i < kWhitespaceTable16Length; i++) { if (kWhitespaceTable16[i] == x) return true; } } return false; } // Returns true if a nonspace found and false if the end has reached. template static inline bool AdvanceToNonspace(Iterator* current, Iterator end) { while (*current != end) { if (!isWhitespace(**current)) return true; ++*current; } return false; } static bool isDigit(int x, int radix) { return (x >= '0' && x <= '9' && x < '0' + radix) || (radix > 10 && x >= 'a' && x < 'a' + radix - 10) || (radix > 10 && x >= 'A' && x < 'A' + radix - 10); } static double SignedZero(bool sign) { return sign ? -0.0 : 0.0; } // Returns true if 'c' is a decimal digit that is valid for the given radix. // // The function is small and could be inlined, but VS2012 emitted a warning // because it constant-propagated the radix and concluded that the last // condition was always true. Moving it into a separate function and // suppressing optimisation keeps the compiler from warning. #ifdef VS2012_RADIXWARN #pragma optimize("",off) static bool IsDecimalDigitForRadix(int c, int radix) { return '0' <= c && c <= '9' && (c - '0') < radix; } #pragma optimize("",on) #else static bool inline IsDecimalDigitForRadix(int c, int radix) { return '0' <= c && c <= '9' && (c - '0') < radix; } #endif // Returns true if 'c' is a character digit that is valid for the given radix. 
// The 'a_character' should be 'a' or 'A'. // // The function is small and could be inlined, but VS2012 emitted a warning // because it constant-propagated the radix and concluded that the first // condition was always false. By moving it into a separate function the // compiler wouldn't warn anymore. static bool IsCharacterDigitForRadix(int c, int radix, char a_character) { return radix > 10 && c >= a_character && c < a_character + radix - 10; } // Returns true, when the iterator is equal to end. template static bool Advance (Iterator* it, uc16 separator, int base, Iterator& end) { if (separator == StringToDoubleConverter::kNoSeparator) { ++(*it); return *it == end; } if (!isDigit(**it, base)) { ++(*it); return *it == end; } ++(*it); if (*it == end) return true; if (*it + 1 == end) return false; if (**it == separator && isDigit(*(*it + 1), base)) { ++(*it); } return *it == end; } // Checks whether the string in the range start-end is a hex-float string. // This function assumes that the leading '0x'/'0X' is already consumed. // // Hex float strings are of one of the following forms: // - hex_digits+ 'p' ('+'|'-')? exponent_digits+ // - hex_digits* '.' hex_digits+ 'p' ('+'|'-')? exponent_digits+ // - hex_digits+ '.' 'p' ('+'|'-')? 
exponent_digits+ template static bool IsHexFloatString(Iterator start, Iterator end, uc16 separator, bool allow_trailing_junk) { DOUBLE_CONVERSION_ASSERT(start != end); Iterator current = start; bool saw_digit = false; while (isDigit(*current, 16)) { saw_digit = true; if (Advance(¤t, separator, 16, end)) return false; } if (*current == '.') { if (Advance(¤t, separator, 16, end)) return false; while (isDigit(*current, 16)) { saw_digit = true; if (Advance(¤t, separator, 16, end)) return false; } } if (!saw_digit) return false; if (*current != 'p' && *current != 'P') return false; if (Advance(¤t, separator, 16, end)) return false; if (*current == '+' || *current == '-') { if (Advance(¤t, separator, 16, end)) return false; } if (!isDigit(*current, 10)) return false; if (Advance(¤t, separator, 16, end)) return true; while (isDigit(*current, 10)) { if (Advance(¤t, separator, 16, end)) return true; } return allow_trailing_junk || !AdvanceToNonspace(¤t, end); } // Parsing integers with radix 2, 4, 8, 16, 32. Assumes current != end. // // If parse_as_hex_float is true, then the string must be a valid // hex-float. template static double RadixStringToIeee(Iterator* current, Iterator end, bool sign, uc16 separator, bool parse_as_hex_float, bool allow_trailing_junk, double junk_string_value, bool read_as_double, bool* result_is_junk) { DOUBLE_CONVERSION_ASSERT(*current != end); DOUBLE_CONVERSION_ASSERT(!parse_as_hex_float || IsHexFloatString(*current, end, separator, allow_trailing_junk)); const int kDoubleSize = Double::kSignificandSize; const int kSingleSize = Single::kSignificandSize; const int kSignificandSize = read_as_double? kDoubleSize: kSingleSize; *result_is_junk = true; int64_t number = 0; int exponent = 0; const int radix = (1 << radix_log_2); // Whether we have encountered a '.' and are parsing the decimal digits. // Only relevant if parse_as_hex_float is true. bool post_decimal = false; // Skip leading 0s. 
while (**current == '0') { if (Advance(current, separator, radix, end)) { *result_is_junk = false; return SignedZero(sign); } } while (true) { int digit; if (IsDecimalDigitForRadix(**current, radix)) { digit = static_cast(**current) - '0'; if (post_decimal) exponent -= radix_log_2; } else if (IsCharacterDigitForRadix(**current, radix, 'a')) { digit = static_cast(**current) - 'a' + 10; if (post_decimal) exponent -= radix_log_2; } else if (IsCharacterDigitForRadix(**current, radix, 'A')) { digit = static_cast(**current) - 'A' + 10; if (post_decimal) exponent -= radix_log_2; } else if (parse_as_hex_float && **current == '.') { post_decimal = true; Advance(current, separator, radix, end); DOUBLE_CONVERSION_ASSERT(*current != end); continue; } else if (parse_as_hex_float && (**current == 'p' || **current == 'P')) { break; } else { if (allow_trailing_junk || !AdvanceToNonspace(current, end)) { break; } else { return junk_string_value; } } number = number * radix + digit; int overflow = static_cast(number >> kSignificandSize); if (overflow != 0) { // Overflow occurred. Need to determine which direction to round the // result. int overflow_bits_count = 1; while (overflow > 1) { overflow_bits_count++; overflow >>= 1; } int dropped_bits_mask = ((1 << overflow_bits_count) - 1); int dropped_bits = static_cast(number) & dropped_bits_mask; number >>= overflow_bits_count; exponent += overflow_bits_count; bool zero_tail = true; for (;;) { if (Advance(current, separator, radix, end)) break; if (parse_as_hex_float && **current == '.') { // Just run over the '.'. We are just trying to see whether there is // a non-zero digit somewhere. 
Advance(current, separator, radix, end); DOUBLE_CONVERSION_ASSERT(*current != end); post_decimal = true; } if (!isDigit(**current, radix)) break; zero_tail = zero_tail && **current == '0'; if (!post_decimal) exponent += radix_log_2; } if (!parse_as_hex_float && !allow_trailing_junk && AdvanceToNonspace(current, end)) { return junk_string_value; } int middle_value = (1 << (overflow_bits_count - 1)); if (dropped_bits > middle_value) { number++; // Rounding up. } else if (dropped_bits == middle_value) { // Rounding to even to consistency with decimals: half-way case rounds // up if significant part is odd and down otherwise. if ((number & 1) != 0 || !zero_tail) { number++; // Rounding up. } } // Rounding up may cause overflow. if ((number & ((int64_t)1 << kSignificandSize)) != 0) { exponent++; number >>= 1; } break; } if (Advance(current, separator, radix, end)) break; } DOUBLE_CONVERSION_ASSERT(number < ((int64_t)1 << kSignificandSize)); DOUBLE_CONVERSION_ASSERT(static_cast(static_cast(number)) == number); *result_is_junk = false; if (parse_as_hex_float) { DOUBLE_CONVERSION_ASSERT(**current == 'p' || **current == 'P'); Advance(current, separator, radix, end); DOUBLE_CONVERSION_ASSERT(*current != end); bool is_negative = false; if (**current == '+') { Advance(current, separator, radix, end); DOUBLE_CONVERSION_ASSERT(*current != end); } else if (**current == '-') { is_negative = true; Advance(current, separator, radix, end); DOUBLE_CONVERSION_ASSERT(*current != end); } int written_exponent = 0; while (IsDecimalDigitForRadix(**current, 10)) { // No need to read exponents if they are too big. That could potentially overflow // the `written_exponent` variable. 
if (abs(written_exponent) <= 100 * Double::kMaxExponent) { written_exponent = 10 * written_exponent + **current - '0'; } if (Advance(current, separator, radix, end)) break; } if (is_negative) written_exponent = -written_exponent; exponent += written_exponent; } if (exponent == 0 || number == 0) { if (sign) { if (number == 0) return -0.0; number = -number; } return static_cast(number); } DOUBLE_CONVERSION_ASSERT(number != 0); double result = Double(DiyFp(number, exponent)).value(); return sign ? -result : result; } template double StringToDoubleConverter::StringToIeee( Iterator input, int length, bool read_as_double, int* processed_characters_count) const { Iterator current = input; Iterator end = input + length; *processed_characters_count = 0; const bool allow_trailing_junk = (flags_ & ALLOW_TRAILING_JUNK) != 0; const bool allow_leading_spaces = (flags_ & ALLOW_LEADING_SPACES) != 0; const bool allow_trailing_spaces = (flags_ & ALLOW_TRAILING_SPACES) != 0; const bool allow_spaces_after_sign = (flags_ & ALLOW_SPACES_AFTER_SIGN) != 0; const bool allow_case_insensitivity = (flags_ & ALLOW_CASE_INSENSITIVITY) != 0; // To make sure that iterator dereferencing is valid the following // convention is used: // 1. Each '++current' statement is followed by check for equality to 'end'. // 2. If AdvanceToNonspace returned false then current == end. // 3. If 'current' becomes equal to 'end' the function returns or goes to // 'parsing_done'. // 4. 'current' is not dereferenced after the 'parsing_done' label. // 5. Code before 'parsing_done' may rely on 'current != end'. if (current == end) return empty_string_value_; if (allow_leading_spaces || allow_trailing_spaces) { if (!AdvanceToNonspace(¤t, end)) { *processed_characters_count = static_cast(current - input); return empty_string_value_; } if (!allow_leading_spaces && (input != current)) { // No leading spaces allowed, but AdvanceToNonspace moved forward. 
return junk_string_value_; } } // Exponent will be adjusted if insignificant digits of the integer part // or insignificant leading zeros of the fractional part are dropped. int exponent = 0; int significant_digits = 0; int insignificant_digits = 0; bool nonzero_digit_dropped = false; bool sign = false; if (*current == '+' || *current == '-') { sign = (*current == '-'); ++current; Iterator next_non_space = current; // Skip following spaces (if allowed). if (!AdvanceToNonspace(&next_non_space, end)) return junk_string_value_; if (!allow_spaces_after_sign && (current != next_non_space)) { return junk_string_value_; } current = next_non_space; } if (infinity_symbol_ != DOUBLE_CONVERSION_NULLPTR) { if (ConsumeFirstCharacter(*current, infinity_symbol_, allow_case_insensitivity)) { if (!ConsumeSubString(¤t, end, infinity_symbol_, allow_case_insensitivity)) { return junk_string_value_; } if (!(allow_trailing_spaces || allow_trailing_junk) && (current != end)) { return junk_string_value_; } if (!allow_trailing_junk && AdvanceToNonspace(¤t, end)) { return junk_string_value_; } *processed_characters_count = static_cast(current - input); return sign ? -Double::Infinity() : Double::Infinity(); } } if (nan_symbol_ != DOUBLE_CONVERSION_NULLPTR) { if (ConsumeFirstCharacter(*current, nan_symbol_, allow_case_insensitivity)) { if (!ConsumeSubString(¤t, end, nan_symbol_, allow_case_insensitivity)) { return junk_string_value_; } if (!(allow_trailing_spaces || allow_trailing_junk) && (current != end)) { return junk_string_value_; } if (!allow_trailing_junk && AdvanceToNonspace(¤t, end)) { return junk_string_value_; } *processed_characters_count = static_cast(current - input); return sign ? -Double::NaN() : Double::NaN(); } } bool leading_zero = false; if (*current == '0') { if (Advance(¤t, separator_, 10, end)) { *processed_characters_count = static_cast(current - input); return SignedZero(sign); } leading_zero = true; // It could be hexadecimal value. 
if (((flags_ & ALLOW_HEX) || (flags_ & ALLOW_HEX_FLOATS)) && (*current == 'x' || *current == 'X')) { ++current; if (current == end) return junk_string_value_; // "0x" bool parse_as_hex_float = (flags_ & ALLOW_HEX_FLOATS) && IsHexFloatString(current, end, separator_, allow_trailing_junk); if (!parse_as_hex_float && !isDigit(*current, 16)) { return junk_string_value_; } bool result_is_junk; double result = RadixStringToIeee<4>(¤t, end, sign, separator_, parse_as_hex_float, allow_trailing_junk, junk_string_value_, read_as_double, &result_is_junk); if (!result_is_junk) { if (allow_trailing_spaces) AdvanceToNonspace(¤t, end); *processed_characters_count = static_cast(current - input); } return result; } // Ignore leading zeros in the integer part. while (*current == '0') { if (Advance(¤t, separator_, 10, end)) { *processed_characters_count = static_cast(current - input); return SignedZero(sign); } } } bool octal = leading_zero && (flags_ & ALLOW_OCTALS) != 0; // The longest form of simplified number is: "-.1eXXX\0". const int kBufferSize = kMaxSignificantDigits + 10; DOUBLE_CONVERSION_STACK_UNINITIALIZED char buffer[kBufferSize]; // NOLINT: size is known at compile time. int buffer_pos = 0; // Copy significant digits of the integer part (if any) to the buffer. while (*current >= '0' && *current <= '9') { if (significant_digits < kMaxSignificantDigits) { DOUBLE_CONVERSION_ASSERT(buffer_pos < kBufferSize); buffer[buffer_pos++] = static_cast(*current); significant_digits++; // Will later check if it's an octal in the buffer. } else { insignificant_digits++; // Move the digit into the exponential part. 
nonzero_digit_dropped = nonzero_digit_dropped || *current != '0'; } octal = octal && *current < '8'; if (Advance(¤t, separator_, 10, end)) goto parsing_done; } if (significant_digits == 0) { octal = false; } if (*current == '.') { if (octal && !allow_trailing_junk) return junk_string_value_; if (octal) goto parsing_done; if (Advance(¤t, separator_, 10, end)) { if (significant_digits == 0 && !leading_zero) { return junk_string_value_; } else { goto parsing_done; } } if (significant_digits == 0) { // octal = false; // Integer part consists of 0 or is absent. Significant digits start after // leading zeros (if any). while (*current == '0') { if (Advance(¤t, separator_, 10, end)) { *processed_characters_count = static_cast(current - input); return SignedZero(sign); } exponent--; // Move this 0 into the exponent. } } // There is a fractional part. // We don't emit a '.', but adjust the exponent instead. while (*current >= '0' && *current <= '9') { if (significant_digits < kMaxSignificantDigits) { DOUBLE_CONVERSION_ASSERT(buffer_pos < kBufferSize); buffer[buffer_pos++] = static_cast(*current); significant_digits++; exponent--; } else { // Ignore insignificant digits in the fractional part. nonzero_digit_dropped = nonzero_digit_dropped || *current != '0'; } if (Advance(¤t, separator_, 10, end)) goto parsing_done; } } if (!leading_zero && exponent == 0 && significant_digits == 0) { // If leading_zeros is true then the string contains zeros. // If exponent < 0 then string was [+-]\.0*... // If significant_digits != 0 the string is not equal to 0. // Otherwise there are no digits in the string. return junk_string_value_; } // Parse exponential part. 
if (*current == 'e' || *current == 'E') { if (octal && !allow_trailing_junk) return junk_string_value_; if (octal) goto parsing_done; Iterator junk_begin = current; ++current; if (current == end) { if (allow_trailing_junk) { current = junk_begin; goto parsing_done; } else { return junk_string_value_; } } char exponen_sign = '+'; if (*current == '+' || *current == '-') { exponen_sign = static_cast(*current); ++current; if (current == end) { if (allow_trailing_junk) { current = junk_begin; goto parsing_done; } else { return junk_string_value_; } } } if (current == end || *current < '0' || *current > '9') { if (allow_trailing_junk) { current = junk_begin; goto parsing_done; } else { return junk_string_value_; } } const int max_exponent = INT_MAX / 2; DOUBLE_CONVERSION_ASSERT(-max_exponent / 2 <= exponent && exponent <= max_exponent / 2); int num = 0; do { // Check overflow. int digit = *current - '0'; if (num >= max_exponent / 10 && !(num == max_exponent / 10 && digit <= max_exponent % 10)) { num = max_exponent; } else { num = num * 10 + digit; } ++current; } while (current != end && *current >= '0' && *current <= '9'); exponent += (exponen_sign == '-' ? -num : num); } if (!(allow_trailing_spaces || allow_trailing_junk) && (current != end)) { return junk_string_value_; } if (!allow_trailing_junk && AdvanceToNonspace(¤t, end)) { return junk_string_value_; } if (allow_trailing_spaces) { AdvanceToNonspace(¤t, end); } parsing_done: exponent += insignificant_digits; if (octal) { double result; bool result_is_junk; char* start = buffer; result = RadixStringToIeee<3>(&start, buffer + buffer_pos, sign, separator_, false, // Don't parse as hex_float. 
allow_trailing_junk, junk_string_value_, read_as_double, &result_is_junk); DOUBLE_CONVERSION_ASSERT(!result_is_junk); *processed_characters_count = static_cast(current - input); return result; } if (nonzero_digit_dropped) { buffer[buffer_pos++] = '1'; exponent--; } DOUBLE_CONVERSION_ASSERT(buffer_pos < kBufferSize); buffer[buffer_pos] = '\0'; // Code above ensures there are no leading zeros and the buffer has fewer than // kMaxSignificantDecimalDigits characters. Trim trailing zeros. Vector chars(buffer, buffer_pos); chars = TrimTrailingZeros(chars); exponent += buffer_pos - chars.length(); double converted; if (read_as_double) { converted = StrtodTrimmed(chars, exponent); } else { converted = StrtofTrimmed(chars, exponent); } *processed_characters_count = static_cast(current - input); return sign? -converted: converted; } double StringToDoubleConverter::StringToDouble( const char* buffer, int length, int* processed_characters_count) const { return StringToIeee(buffer, length, true, processed_characters_count); } double StringToDoubleConverter::StringToDouble( const uc16* buffer, int length, int* processed_characters_count) const { return StringToIeee(buffer, length, true, processed_characters_count); } float StringToDoubleConverter::StringToFloat( const char* buffer, int length, int* processed_characters_count) const { return static_cast(StringToIeee(buffer, length, false, processed_characters_count)); } float StringToDoubleConverter::StringToFloat( const uc16* buffer, int length, int* processed_characters_count) const { return static_cast(StringToIeee(buffer, length, false, processed_characters_count)); } template<> double StringToDoubleConverter::StringTo( const char* buffer, int length, int* processed_characters_count) const { return StringToDouble(buffer, length, processed_characters_count); } template<> float StringToDoubleConverter::StringTo( const char* buffer, int length, int* processed_characters_count) const { return StringToFloat(buffer, length, 
processed_characters_count); } template<> double StringToDoubleConverter::StringTo( const uc16* buffer, int length, int* processed_characters_count) const { return StringToDouble(buffer, length, processed_characters_count); } template<> float StringToDoubleConverter::StringTo( const uc16* buffer, int length, int* processed_characters_count) const { return StringToFloat(buffer, length, processed_characters_count); } } // namespace double_conversion // ICU PATCH: Close ICU namespace U_NAMESPACE_END #endif // ICU PATCH: close #if !UCONFIG_NO_FORMATTING stringi/src/icu74/i18n/choicfmt.cpp0000644000176200001440000004442614700200761016541 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ******************************************************************************* * Copyright (C) 1997-2013, International Business Machines Corporation and * * others. All Rights Reserved. * ******************************************************************************* * * File CHOICFMT.CPP * * Modification History: * * Date Name Description * 02/19/97 aliu Converted from java. * 03/20/97 helena Finished first cut of implementation and got rid * of nextDouble/previousDouble and replaced with * boolean array. * 4/10/97 aliu Clean up. Modified to work on AIX. * 06/04/97 helena Fixed applyPattern(), toPattern() and not to include * wchar.h. * 07/09/97 helena Made ParsePosition into a class. 
* 08/06/97 nos removed overloaded constructor, fixed 'format(array)' * 07/22/98 stephen JDK 1.2 Sync - removed UBool array (doubleFlags) * 02/22/99 stephen Removed character literals for EBCDIC safety ******************************************************************************** */ #include "unicode/utypes.h" #if !UCONFIG_NO_FORMATTING #include "unicode/choicfmt.h" #include "unicode/numfmt.h" #include "unicode/locid.h" #include "cpputils.h" #include "cstring.h" #include "messageimpl.h" #include "putilimp.h" #include "uassert.h" #include #include // ***************************************************************************** // class ChoiceFormat // ***************************************************************************** U_NAMESPACE_BEGIN UOBJECT_DEFINE_RTTI_IMPLEMENTATION(ChoiceFormat) // Special characters used by ChoiceFormat. There are two characters // used interchangeably to indicate <=. Either is parsed, but only // LESS_EQUAL is generated by toPattern(). #define SINGLE_QUOTE ((char16_t)0x0027) /*'*/ #define LESS_THAN ((char16_t)0x003C) /*<*/ #define LESS_EQUAL ((char16_t)0x0023) /*#*/ #define LESS_EQUAL2 ((char16_t)0x2264) #define VERTICAL_BAR ((char16_t)0x007C) /*|*/ #define MINUS ((char16_t)0x002D) /*-*/ static const char16_t LEFT_CURLY_BRACE = 0x7B; /*{*/ static const char16_t RIGHT_CURLY_BRACE = 0x7D; /*}*/ #ifdef INFINITY #undef INFINITY #endif #define INFINITY ((char16_t)0x221E) //static const char16_t gPositiveInfinity[] = {INFINITY, 0}; //static const char16_t gNegativeInfinity[] = {MINUS, INFINITY, 0}; #define POSITIVE_INF_STRLEN 1 #define NEGATIVE_INF_STRLEN 2 // ------------------------------------- // Creates a ChoiceFormat instance based on the pattern. 
ChoiceFormat::ChoiceFormat(const UnicodeString& newPattern, UErrorCode& status) : constructorErrorCode(status), msgPattern(status) { applyPattern(newPattern, status); } // ------------------------------------- // Creates a ChoiceFormat instance with the limit array and // format strings for each limit. ChoiceFormat::ChoiceFormat(const double* limits, const UnicodeString* formats, int32_t cnt ) : constructorErrorCode(U_ZERO_ERROR), msgPattern(constructorErrorCode) { setChoices(limits, nullptr, formats, cnt, constructorErrorCode); } // ------------------------------------- ChoiceFormat::ChoiceFormat(const double* limits, const UBool* closures, const UnicodeString* formats, int32_t cnt ) : constructorErrorCode(U_ZERO_ERROR), msgPattern(constructorErrorCode) { setChoices(limits, closures, formats, cnt, constructorErrorCode); } // ------------------------------------- // copy constructor ChoiceFormat::ChoiceFormat(const ChoiceFormat& that) : NumberFormat(that), constructorErrorCode(that.constructorErrorCode), msgPattern(that.msgPattern) { } // ------------------------------------- // Private constructor that creates a // ChoiceFormat instance based on the // pattern and populates UParseError ChoiceFormat::ChoiceFormat(const UnicodeString& newPattern, UParseError& parseError, UErrorCode& status) : constructorErrorCode(status), msgPattern(status) { applyPattern(newPattern,parseError, status); } // ------------------------------------- bool ChoiceFormat::operator==(const Format& that) const { if (this == &that) return true; if (!NumberFormat::operator==(that)) return false; const ChoiceFormat& thatAlias = static_cast(that); return msgPattern == thatAlias.msgPattern; } // ------------------------------------- // copy constructor const ChoiceFormat& ChoiceFormat::operator=(const ChoiceFormat& that) { if (this != &that) { NumberFormat::operator=(that); constructorErrorCode = that.constructorErrorCode; msgPattern = that.msgPattern; } return *this; } // 
------------------------------------- ChoiceFormat::~ChoiceFormat() { } // ------------------------------------- /** * Convert a double value to a string without the overhead of NumberFormat. */ UnicodeString& ChoiceFormat::dtos(double value, UnicodeString& string) { /* Buffer to contain the digits and any extra formatting stuff. */ char temp[DBL_DIG + 16]; char *itrPtr = temp; char *expPtr; snprintf(temp, sizeof(temp), "%.*g", DBL_DIG, value); /* Find and convert the decimal point. Using setlocale on some machines will cause snprintf to use a comma for certain locales. */ while (*itrPtr && (*itrPtr == '-' || isdigit(*itrPtr))) { itrPtr++; } if (*itrPtr != 0 && *itrPtr != 'e') { /* We reached something that looks like a decimal point. In case someone used setlocale(), which changes the decimal point. */ *itrPtr = '.'; itrPtr++; } /* Search for the exponent */ while (*itrPtr && *itrPtr != 'e') { itrPtr++; } if (*itrPtr == 'e') { itrPtr++; /* Verify the exponent sign */ if (*itrPtr == '+' || *itrPtr == '-') { itrPtr++; } /* Remove leading zeros. You will see this on Windows machines. */ expPtr = itrPtr; while (*itrPtr == '0') { itrPtr++; } if (*itrPtr && expPtr != itrPtr) { /* Shift the exponent without zeros. */ while (*itrPtr) { *(expPtr++) = *(itrPtr++); } // NUL terminate *expPtr = 0; } } string = UnicodeString(temp, -1, US_INV); /* invariant codepage */ return string; } // ------------------------------------- // calls the overloaded applyPattern method. void ChoiceFormat::applyPattern(const UnicodeString& pattern, UErrorCode& status) { msgPattern.parseChoiceStyle(pattern, nullptr, status); constructorErrorCode = status; } // ------------------------------------- // Applies the pattern to this ChoiceFormat instance. 
void ChoiceFormat::applyPattern(const UnicodeString& pattern, UParseError& parseError, UErrorCode& status) { msgPattern.parseChoiceStyle(pattern, &parseError, status); constructorErrorCode = status; } // ------------------------------------- // Returns the input pattern string. UnicodeString& ChoiceFormat::toPattern(UnicodeString& result) const { return result = msgPattern.getPatternString(); } // ------------------------------------- // Sets the limit and format arrays. void ChoiceFormat::setChoices( const double* limits, const UnicodeString* formats, int32_t cnt ) { UErrorCode errorCode = U_ZERO_ERROR; setChoices(limits, nullptr, formats, cnt, errorCode); } // ------------------------------------- // Sets the limit and format arrays. void ChoiceFormat::setChoices( const double* limits, const UBool* closures, const UnicodeString* formats, int32_t cnt ) { UErrorCode errorCode = U_ZERO_ERROR; setChoices(limits, closures, formats, cnt, errorCode); } void ChoiceFormat::setChoices(const double* limits, const UBool* closures, const UnicodeString* formats, int32_t count, UErrorCode &errorCode) { if (U_FAILURE(errorCode)) { return; } if (limits == nullptr || formats == nullptr) { errorCode = U_ILLEGAL_ARGUMENT_ERROR; return; } // Reconstruct the original input pattern. // Modified version of the pre-ICU 4.8 toPattern() implementation. UnicodeString result; for (int32_t i = 0; i < count; ++i) { if (i != 0) { result += VERTICAL_BAR; } UnicodeString buf; if (uprv_isPositiveInfinity(limits[i])) { result += INFINITY; } else if (uprv_isNegativeInfinity(limits[i])) { result += MINUS; result += INFINITY; } else { result += dtos(limits[i], buf); } if (closures != nullptr && closures[i]) { result += LESS_THAN; } else { result += LESS_EQUAL; } // Append formats[i], using quotes if there are special // characters. Single quotes themselves must be escaped in // either case. 
const UnicodeString& text = formats[i]; int32_t textLength = text.length(); int32_t nestingLevel = 0; for (int32_t j = 0; j < textLength; ++j) { char16_t c = text[j]; if (c == SINGLE_QUOTE && nestingLevel == 0) { // Double each top-level apostrophe. result.append(c); } else if (c == VERTICAL_BAR && nestingLevel == 0) { // Surround each pipe symbol with apostrophes for quoting. // If the next character is an apostrophe, then that will be doubled, // and although the parser will see the apostrophe pairs beginning // and ending one character earlier than our doubling, the result // is as desired. // | -> '|' // |' -> '|''' // |'' -> '|''''' etc. result.append(SINGLE_QUOTE).append(c).append(SINGLE_QUOTE); continue; // Skip the append(c) at the end of the loop body. } else if (c == LEFT_CURLY_BRACE) { ++nestingLevel; } else if (c == RIGHT_CURLY_BRACE && nestingLevel > 0) { --nestingLevel; } result.append(c); } } // Apply the reconstructed pattern. applyPattern(result, errorCode); } // ------------------------------------- // Gets the limit array. const double* ChoiceFormat::getLimits(int32_t& cnt) const { cnt = 0; return nullptr; } // ------------------------------------- // Gets the closures array. const UBool* ChoiceFormat::getClosures(int32_t& cnt) const { cnt = 0; return nullptr; } // ------------------------------------- // Gets the format array. const UnicodeString* ChoiceFormat::getFormats(int32_t& cnt) const { cnt = 0; return nullptr; } // ------------------------------------- // Formats an int64 number, it's actually formatted as // a double. The returned format string may differ // from the input number because of this. UnicodeString& ChoiceFormat::format(int64_t number, UnicodeString& appendTo, FieldPosition& status) const { return format((double) number, appendTo, status); } // ------------------------------------- // Formats an int32_t number, it's actually formatted as // a double. 
UnicodeString& ChoiceFormat::format(int32_t number, UnicodeString& appendTo, FieldPosition& status) const { return format((double) number, appendTo, status); } // ------------------------------------- // Formats a double number. UnicodeString& ChoiceFormat::format(double number, UnicodeString& appendTo, FieldPosition& /*pos*/) const { if (msgPattern.countParts() == 0) { // No pattern was applied, or it failed. return appendTo; } // Get the appropriate sub-message. int32_t msgStart = findSubMessage(msgPattern, 0, number); if (!MessageImpl::jdkAposMode(msgPattern)) { int32_t patternStart = msgPattern.getPart(msgStart).getLimit(); int32_t msgLimit = msgPattern.getLimitPartIndex(msgStart); appendTo.append(msgPattern.getPatternString(), patternStart, msgPattern.getPatternIndex(msgLimit) - patternStart); return appendTo; } // JDK compatibility mode: Remove SKIP_SYNTAX. return MessageImpl::appendSubMessageWithoutSkipSyntax(msgPattern, msgStart, appendTo); } int32_t ChoiceFormat::findSubMessage(const MessagePattern &pattern, int32_t partIndex, double number) { int32_t count = pattern.countParts(); int32_t msgStart; // Iterate over (ARG_INT|DOUBLE, ARG_SELECTOR, message) tuples // until ARG_LIMIT or end of choice-only pattern. // Ignore the first number and selector and start the loop on the first message. partIndex += 2; for (;;) { // Skip but remember the current sub-message. msgStart = partIndex; partIndex = pattern.getLimitPartIndex(partIndex); if (++partIndex >= count) { // Reached the end of the choice-only pattern. // Return with the last sub-message. break; } const MessagePattern::Part &part = pattern.getPart(partIndex++); UMessagePatternPartType type = part.getType(); if (type == UMSGPAT_PART_TYPE_ARG_LIMIT) { // Reached the end of the ChoiceFormat style. // Return with the last sub-message. 
break; } // part is an ARG_INT or ARG_DOUBLE U_ASSERT(MessagePattern::Part::hasNumericValue(type)); double boundary = pattern.getNumericValue(part); // Fetch the ARG_SELECTOR character. int32_t selectorIndex = pattern.getPatternIndex(partIndex++); char16_t boundaryChar = pattern.getPatternString().charAt(selectorIndex); if (boundaryChar == LESS_THAN ? !(number > boundary) : !(number >= boundary)) { // The number is in the interval between the previous boundary and the current one. // Return with the sub-message between them. // The !(a>b) and !(a>=b) comparisons are equivalent to // (a<=b) and (a= 0) { int32_t newIndex = start + len; if (newIndex > furthest) { furthest = newIndex; bestNumber = tempNumber; if (furthest == source.length()) { break; } } } partIndex = msgLimit + 1; } if (furthest == start) { pos.setErrorIndex(start); } else { pos.setIndex(furthest); } return bestNumber; } int32_t ChoiceFormat::matchStringUntilLimitPart( const MessagePattern &pattern, int32_t partIndex, int32_t limitPartIndex, const UnicodeString &source, int32_t sourceOffset) { int32_t matchingSourceLength = 0; const UnicodeString &msgString = pattern.getPatternString(); int32_t prevIndex = pattern.getPart(partIndex).getLimit(); for (;;) { const MessagePattern::Part &part = pattern.getPart(++partIndex); if (partIndex == limitPartIndex || part.getType() == UMSGPAT_PART_TYPE_SKIP_SYNTAX) { int32_t index = part.getIndex(); int32_t length = index - prevIndex; if (length != 0 && 0 != source.compare(sourceOffset, length, msgString, prevIndex, length)) { return -1; // mismatch } matchingSourceLength += length; if (partIndex == limitPartIndex) { return matchingSourceLength; } prevIndex = part.getLimit(); // SKIP_SYNTAX } } } // ------------------------------------- ChoiceFormat* ChoiceFormat::clone() const { ChoiceFormat *aCopy = new ChoiceFormat(*this); return aCopy; } U_NAMESPACE_END #endif /* #if !UCONFIG_NO_FORMATTING */ //eof 
stringi/src/icu74/i18n/gregocal.cpp0000644000176200001440000014103014700200761016515 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ******************************************************************************* * Copyright (C) 1997-2016, International Business Machines Corporation and * others. All Rights Reserved. ******************************************************************************* * * File GREGOCAL.CPP * * Modification History: * * Date Name Description * 02/05/97 clhuang Creation. * 03/28/97 aliu Made highly questionable fix to computeFields to * handle DST correctly. * 04/22/97 aliu Cleaned up code drastically. Added monthLength(). * Finished unimplemented parts of computeTime() for * week-based date determination. Removed quetionable * fix and wrote correct fix for computeFields() and * daylight time handling. Rewrote inDaylightTime() * and computeFields() to handle sensitive Daylight to * Standard time transitions correctly. * 05/08/97 aliu Added code review changes. Fixed isLeapYear() to * not cutover. * 08/12/97 aliu Added equivalentTo. Misc other fixes. Updated * add() from Java source. * 07/28/98 stephen Sync up with JDK 1.2 * 09/14/98 stephen Changed type of kOneDay, kOneWeek to double. * Fixed bug in roll() * 10/15/99 aliu Fixed j31, incorrect WEEK_OF_YEAR computation. * 10/15/99 aliu Fixed j32, cannot set date to Feb 29 2000 AD. * {JDK bug 4210209 4209272} * 11/15/99 weiv Added YEAR_WOY and DOW_LOCAL computation * to timeToFields method, updated kMinValues, kMaxValues & kLeastMaxValues * 12/09/99 aliu Fixed j81, calculation errors and roll bugs * in year of cutover. * 01/24/2000 aliu Revised computeJulianDay for YEAR YEAR_WOY WOY. 
******************************************************************************** */ #include "unicode/utypes.h" #include #if !UCONFIG_NO_FORMATTING #include "unicode/gregocal.h" #include "gregoimp.h" #include "umutex.h" #include "uassert.h" // ***************************************************************************** // class GregorianCalendar // ***************************************************************************** /** * Note that the Julian date used here is not a true Julian date, since * it is measured from midnight, not noon. This value is the Julian * day number of January 1, 1970 (Gregorian calendar) at noon UTC. [LIU] */ static const int16_t kNumDays[] = {0,31,59,90,120,151,181,212,243,273,304,334}; // 0-based, for day-in-year static const int16_t kLeapNumDays[] = {0,31,60,91,121,152,182,213,244,274,305,335}; // 0-based, for day-in-year static const int8_t kMonthLength[] = {31,28,31,30,31,30,31,31,30,31,30,31}; // 0-based static const int8_t kLeapMonthLength[] = {31,29,31,30,31,30,31,31,30,31,30,31}; // 0-based // setTimeInMillis() limits the Julian day range to +/-7F000000. // This would seem to limit the year range to: // ms=+183882168921600000 jd=7f000000 December 20, 5828963 AD // ms=-184303902528000000 jd=81000000 September 20, 5838270 BC // HOWEVER, CalendarRegressionTest/Test4167060 shows that the actual // range limit on the year field is smaller (~ +/-140000). 
[alan 3.0] static const int32_t kGregorianCalendarLimits[UCAL_FIELD_COUNT][4] = { // Minimum Greatest Least Maximum // Minimum Maximum { 0, 0, 1, 1}, // ERA { 1, 1, 140742, 144683}, // YEAR { 0, 0, 11, 11}, // MONTH { 1, 1, 52, 53}, // WEEK_OF_YEAR {/*N/A*/-1,/*N/A*/-1,/*N/A*/-1,/*N/A*/-1}, // WEEK_OF_MONTH { 1, 1, 28, 31}, // DAY_OF_MONTH { 1, 1, 365, 366}, // DAY_OF_YEAR {/*N/A*/-1,/*N/A*/-1,/*N/A*/-1,/*N/A*/-1}, // DAY_OF_WEEK { -1, -1, 4, 5}, // DAY_OF_WEEK_IN_MONTH {/*N/A*/-1,/*N/A*/-1,/*N/A*/-1,/*N/A*/-1}, // AM_PM {/*N/A*/-1,/*N/A*/-1,/*N/A*/-1,/*N/A*/-1}, // HOUR {/*N/A*/-1,/*N/A*/-1,/*N/A*/-1,/*N/A*/-1}, // HOUR_OF_DAY {/*N/A*/-1,/*N/A*/-1,/*N/A*/-1,/*N/A*/-1}, // MINUTE {/*N/A*/-1,/*N/A*/-1,/*N/A*/-1,/*N/A*/-1}, // SECOND {/*N/A*/-1,/*N/A*/-1,/*N/A*/-1,/*N/A*/-1}, // MILLISECOND {/*N/A*/-1,/*N/A*/-1,/*N/A*/-1,/*N/A*/-1}, // ZONE_OFFSET {/*N/A*/-1,/*N/A*/-1,/*N/A*/-1,/*N/A*/-1}, // DST_OFFSET { -140742, -140742, 140742, 144683}, // YEAR_WOY {/*N/A*/-1,/*N/A*/-1,/*N/A*/-1,/*N/A*/-1}, // DOW_LOCAL { -140742, -140742, 140742, 144683}, // EXTENDED_YEAR {/*N/A*/-1,/*N/A*/-1,/*N/A*/-1,/*N/A*/-1}, // JULIAN_DAY {/*N/A*/-1,/*N/A*/-1,/*N/A*/-1,/*N/A*/-1}, // MILLISECONDS_IN_DAY {/*N/A*/-1,/*N/A*/-1,/*N/A*/-1,/*N/A*/-1}, // IS_LEAP_MONTH { 0, 0, 11, 11}, // ORDINAL_MONTH }; /* *
*                            Greatest       Least 
* Field name        Minimum   Minimum     Maximum     Maximum
* ----------        -------   -------     -------     -------
* ERA                     0         0           1           1
* YEAR                    1         1      140742      144683
* MONTH                   0         0          11          11
* WEEK_OF_YEAR            1         1          52          53
* WEEK_OF_MONTH           0         0           4           6
* DAY_OF_MONTH            1         1          28          31
* DAY_OF_YEAR             1         1         365         366
* DAY_OF_WEEK             1         1           7           7
* DAY_OF_WEEK_IN_MONTH   -1        -1           4           5
* AM_PM                   0         0           1           1
* HOUR                    0         0          11          11
* HOUR_OF_DAY             0         0          23          23
* MINUTE                  0         0          59          59
* SECOND                  0         0          59          59
* MILLISECOND             0         0         999         999
* ZONE_OFFSET           -12*      -12*         12*         12*
* DST_OFFSET              0         0           1*          1*
* YEAR_WOY          -140742   -140742      140742      144683
* DOW_LOCAL               1         1           7           7
* 
* (*) In units of one-hour */ #if defined( U_DEBUG_CALSVC ) || defined (U_DEBUG_CAL) #include #endif U_NAMESPACE_BEGIN UOBJECT_DEFINE_RTTI_IMPLEMENTATION(GregorianCalendar) // 00:00:00 UTC, October 15, 1582, expressed in ms from the epoch. // Note that only Italy and other Catholic countries actually // observed this cutover. Most other countries followed in // the next few centuries, some as late as 1928. [LIU] // in Java, -12219292800000L //const UDate GregorianCalendar::kPapalCutover = -12219292800000L; static const uint32_t kCutoverJulianDay = 2299161; static const UDate kPapalCutover = (2299161.0 - kEpochStartAsJulianDay) * U_MILLIS_PER_DAY; //static const UDate kPapalCutoverJulian = (2299161.0 - kEpochStartAsJulianDay); // ------------------------------------- GregorianCalendar::GregorianCalendar(UErrorCode& status) : Calendar(status), fGregorianCutover(kPapalCutover), fCutoverJulianDay(kCutoverJulianDay), fNormalizedGregorianCutover(fGregorianCutover), fGregorianCutoverYear(1582), fIsGregorian(true), fInvertGregorian(false) { setTimeInMillis(getNow(), status); } // ------------------------------------- GregorianCalendar::GregorianCalendar(TimeZone* zone, UErrorCode& status) : Calendar(zone, Locale::getDefault(), status), fGregorianCutover(kPapalCutover), fCutoverJulianDay(kCutoverJulianDay), fNormalizedGregorianCutover(fGregorianCutover), fGregorianCutoverYear(1582), fIsGregorian(true), fInvertGregorian(false) { setTimeInMillis(getNow(), status); } // ------------------------------------- GregorianCalendar::GregorianCalendar(const TimeZone& zone, UErrorCode& status) : Calendar(zone, Locale::getDefault(), status), fGregorianCutover(kPapalCutover), fCutoverJulianDay(kCutoverJulianDay), fNormalizedGregorianCutover(fGregorianCutover), fGregorianCutoverYear(1582), fIsGregorian(true), fInvertGregorian(false) { setTimeInMillis(getNow(), status); } // ------------------------------------- GregorianCalendar::GregorianCalendar(const Locale& aLocale, UErrorCode& 
status) : Calendar(TimeZone::forLocaleOrDefault(aLocale), aLocale, status), fGregorianCutover(kPapalCutover), fCutoverJulianDay(kCutoverJulianDay), fNormalizedGregorianCutover(fGregorianCutover), fGregorianCutoverYear(1582), fIsGregorian(true), fInvertGregorian(false) { setTimeInMillis(getNow(), status); } // ------------------------------------- GregorianCalendar::GregorianCalendar(TimeZone* zone, const Locale& aLocale, UErrorCode& status) : Calendar(zone, aLocale, status), fGregorianCutover(kPapalCutover), fCutoverJulianDay(kCutoverJulianDay), fNormalizedGregorianCutover(fGregorianCutover), fGregorianCutoverYear(1582), fIsGregorian(true), fInvertGregorian(false) { setTimeInMillis(getNow(), status); } // ------------------------------------- GregorianCalendar::GregorianCalendar(const TimeZone& zone, const Locale& aLocale, UErrorCode& status) : Calendar(zone, aLocale, status), fGregorianCutover(kPapalCutover), fCutoverJulianDay(kCutoverJulianDay), fNormalizedGregorianCutover(fGregorianCutover), fGregorianCutoverYear(1582), fIsGregorian(true), fInvertGregorian(false) { setTimeInMillis(getNow(), status); } // ------------------------------------- GregorianCalendar::GregorianCalendar(int32_t year, int32_t month, int32_t date, UErrorCode& status) : Calendar(TimeZone::createDefault(), Locale::getDefault(), status), fGregorianCutover(kPapalCutover), fCutoverJulianDay(kCutoverJulianDay), fNormalizedGregorianCutover(fGregorianCutover), fGregorianCutoverYear(1582), fIsGregorian(true), fInvertGregorian(false) { set(UCAL_ERA, AD); set(UCAL_YEAR, year); set(UCAL_MONTH, month); set(UCAL_DATE, date); } // ------------------------------------- GregorianCalendar::GregorianCalendar(int32_t year, int32_t month, int32_t date, int32_t hour, int32_t minute, UErrorCode& status) : Calendar(TimeZone::createDefault(), Locale::getDefault(), status), fGregorianCutover(kPapalCutover), fCutoverJulianDay(kCutoverJulianDay), fNormalizedGregorianCutover(fGregorianCutover), 
fGregorianCutoverYear(1582), fIsGregorian(true), fInvertGregorian(false) { set(UCAL_ERA, AD); set(UCAL_YEAR, year); set(UCAL_MONTH, month); set(UCAL_DATE, date); set(UCAL_HOUR_OF_DAY, hour); set(UCAL_MINUTE, minute); } // ------------------------------------- GregorianCalendar::GregorianCalendar(int32_t year, int32_t month, int32_t date, int32_t hour, int32_t minute, int32_t second, UErrorCode& status) : Calendar(TimeZone::createDefault(), Locale::getDefault(), status), fGregorianCutover(kPapalCutover), fCutoverJulianDay(kCutoverJulianDay), fNormalizedGregorianCutover(fGregorianCutover), fGregorianCutoverYear(1582), fIsGregorian(true), fInvertGregorian(false) { set(UCAL_ERA, AD); set(UCAL_YEAR, year); set(UCAL_MONTH, month); set(UCAL_DATE, date); set(UCAL_HOUR_OF_DAY, hour); set(UCAL_MINUTE, minute); set(UCAL_SECOND, second); } // ------------------------------------- GregorianCalendar::~GregorianCalendar() { } // ------------------------------------- GregorianCalendar::GregorianCalendar(const GregorianCalendar &source) : Calendar(source), fGregorianCutover(source.fGregorianCutover), fCutoverJulianDay(source.fCutoverJulianDay), fNormalizedGregorianCutover(source.fNormalizedGregorianCutover), fGregorianCutoverYear(source.fGregorianCutoverYear), fIsGregorian(source.fIsGregorian), fInvertGregorian(source.fInvertGregorian) { } // ------------------------------------- GregorianCalendar* GregorianCalendar::clone() const { return new GregorianCalendar(*this); } // ------------------------------------- GregorianCalendar & GregorianCalendar::operator=(const GregorianCalendar &right) { if (this != &right) { Calendar::operator=(right); fGregorianCutover = right.fGregorianCutover; fNormalizedGregorianCutover = right.fNormalizedGregorianCutover; fGregorianCutoverYear = right.fGregorianCutoverYear; fCutoverJulianDay = right.fCutoverJulianDay; } return *this; } // ------------------------------------- UBool GregorianCalendar::isEquivalentTo(const Calendar& other) const { // 
Calendar override. return Calendar::isEquivalentTo(other) && fGregorianCutover == ((GregorianCalendar*)&other)->fGregorianCutover; } // ------------------------------------- void GregorianCalendar::setGregorianChange(UDate date, UErrorCode& status) { if (U_FAILURE(status)) return; // Precompute two internal variables which we use to do the actual // cutover computations. These are the normalized cutover, which is the // midnight at or before the cutover, and the cutover year. The // normalized cutover is in pure date milliseconds; it contains no time // of day or timezone component, and it used to compare against other // pure date values. double cutoverDay = ClockMath::floorDivide(date, (double)kOneDay); // Handle the rare case of numeric overflow where the user specifies a time // outside of INT32_MIN .. INT32_MAX number of days. if (cutoverDay <= INT32_MIN) { cutoverDay = INT32_MIN; fGregorianCutover = fNormalizedGregorianCutover = cutoverDay * kOneDay; } else if (cutoverDay >= INT32_MAX) { cutoverDay = INT32_MAX; fGregorianCutover = fNormalizedGregorianCutover = cutoverDay * kOneDay; } else { fNormalizedGregorianCutover = cutoverDay * kOneDay; fGregorianCutover = date; } // Normalize the year so BC values are represented as 0 and negative // values. 
GregorianCalendar *cal = new GregorianCalendar(getTimeZone(), status); /* test for nullptr */ if (cal == 0) { status = U_MEMORY_ALLOCATION_ERROR; return; } if(U_FAILURE(status)) return; cal->setTime(date, status); fGregorianCutoverYear = cal->get(UCAL_YEAR, status); if (cal->get(UCAL_ERA, status) == BC) fGregorianCutoverYear = 1 - fGregorianCutoverYear; fCutoverJulianDay = (int32_t)cutoverDay; delete cal; } void GregorianCalendar::handleComputeFields(int32_t julianDay, UErrorCode& status) { int32_t eyear, month, dayOfMonth, dayOfYear, unusedRemainder; if(U_FAILURE(status)) { return; } #if defined (U_DEBUG_CAL) fprintf(stderr, "%s:%d: jd%d- (greg's %d)- [cut=%d]\n", __FILE__, __LINE__, julianDay, getGregorianDayOfYear(), fCutoverJulianDay); #endif if (julianDay >= fCutoverJulianDay) { month = getGregorianMonth(); dayOfMonth = getGregorianDayOfMonth(); dayOfYear = getGregorianDayOfYear(); eyear = getGregorianYear(); } else { // The Julian epoch day (not the same as Julian Day) // is zero on Saturday December 30, 0 (Gregorian). int32_t julianEpochDay = julianDay - (kJan1_1JulianDay - 2); eyear = (int32_t) ClockMath::floorDivide((4.0*julianEpochDay) + 1464.0, (int32_t) 1461, &unusedRemainder); // Compute the Julian calendar day number for January 1, eyear int32_t january1 = 365*(eyear-1) + ClockMath::floorDivide(eyear-1, (int32_t)4); dayOfYear = (julianEpochDay - january1); // 0-based // Julian leap years occurred historically every 4 years starting // with 8 AD. Before 8 AD the spacing is irregular; every 3 years // from 45 BC to 9 BC, and then none until 8 AD. However, we don't // implement this historical detail; instead, we implement the // computationally cleaner proleptic calendar, which assumes // consistent 4-year cycles throughout time. UBool isLeap = ((eyear&0x3) == 0); // equiv. to (eyear%4 == 0) // Common Julian/Gregorian calculation int32_t correction = 0; int32_t march1 = isLeap ? 
60 : 59; // zero-based DOY for March 1 if (dayOfYear >= march1) { correction = isLeap ? 1 : 2; } month = (12 * (dayOfYear + correction) + 6) / 367; // zero-based month dayOfMonth = dayOfYear - (isLeap?kLeapNumDays[month]:kNumDays[month]) + 1; // one-based DOM ++dayOfYear; #if defined (U_DEBUG_CAL) // fprintf(stderr, "%d - %d[%d] + 1\n", dayOfYear, isLeap?kLeapNumDays[month]:kNumDays[month], month ); // fprintf(stderr, "%s:%d: greg's HCF %d -> %d/%d/%d not %d/%d/%d\n", // __FILE__, __LINE__,julianDay, // eyear,month,dayOfMonth, // getGregorianYear(), getGregorianMonth(), getGregorianDayOfMonth() ); fprintf(stderr, "%s:%d: doy %d (greg's %d)- [cut=%d]\n", __FILE__, __LINE__, dayOfYear, getGregorianDayOfYear(), fCutoverJulianDay); #endif } // [j81] if we are after the cutover in its year, shift the day of the year if((eyear == fGregorianCutoverYear) && (julianDay >= fCutoverJulianDay)) { //from handleComputeMonthStart int32_t gregShift = Grego::gregorianShift(eyear); #if defined (U_DEBUG_CAL) fprintf(stderr, "%s:%d: gregorian shift %d ::: doy%d => %d [cut=%d]\n", __FILE__, __LINE__,gregShift, dayOfYear, dayOfYear+gregShift, fCutoverJulianDay); #endif dayOfYear += gregShift; } internalSet(UCAL_MONTH, month); internalSet(UCAL_ORDINAL_MONTH, month); internalSet(UCAL_DAY_OF_MONTH, dayOfMonth); internalSet(UCAL_DAY_OF_YEAR, dayOfYear); internalSet(UCAL_EXTENDED_YEAR, eyear); int32_t era = AD; if (eyear < 1) { era = BC; eyear = 1 - eyear; } internalSet(UCAL_ERA, era); internalSet(UCAL_YEAR, eyear); } // ------------------------------------- UDate GregorianCalendar::getGregorianChange() const { return fGregorianCutover; } // ------------------------------------- UBool GregorianCalendar::isLeapYear(int32_t year) const { // MSVC complains bitterly if we try to use Grego::isLeapYear here // NOTE: year&0x3 == year%4 return (year >= fGregorianCutoverYear ? 
(((year&0x3) == 0) && ((year%100 != 0) || (year%400 == 0))) : // Gregorian ((year&0x3) == 0)); // Julian } // ------------------------------------- int32_t GregorianCalendar::handleComputeJulianDay(UCalendarDateFields bestField) { fInvertGregorian = false; int32_t jd = Calendar::handleComputeJulianDay(bestField); if((bestField == UCAL_WEEK_OF_YEAR) && // if we are doing WOY calculations, we are counting relative to Jan 1 *julian* (internalGet(UCAL_EXTENDED_YEAR)==fGregorianCutoverYear) && jd >= fCutoverJulianDay) { fInvertGregorian = true; // So that the Julian Jan 1 will be used in handleComputeMonthStart return Calendar::handleComputeJulianDay(bestField); } // The following check handles portions of the cutover year BEFORE the // cutover itself happens. //if ((fIsGregorian==true) != (jd >= fCutoverJulianDay)) { /* cutoverJulianDay)) { */ if ((fIsGregorian) != (jd >= fCutoverJulianDay)) { /* cutoverJulianDay)) { */ #if defined (U_DEBUG_CAL) fprintf(stderr, "%s:%d: jd [invert] %d\n", __FILE__, __LINE__, jd); #endif fInvertGregorian = true; jd = Calendar::handleComputeJulianDay(bestField); #if defined (U_DEBUG_CAL) fprintf(stderr, "%s:%d: fIsGregorian %s, fInvertGregorian %s - ", __FILE__, __LINE__,fIsGregorian?"T":"F", fInvertGregorian?"T":"F"); fprintf(stderr, " jd NOW %d\n", jd); #endif } else { #if defined (U_DEBUG_CAL) fprintf(stderr, "%s:%d: jd [==] %d - %sfIsGregorian %sfInvertGregorian, %d\n", __FILE__, __LINE__, jd, fIsGregorian?"T":"F", fInvertGregorian?"T":"F", bestField); #endif } if(fIsGregorian && (internalGet(UCAL_EXTENDED_YEAR) == fGregorianCutoverYear)) { int32_t gregShift = Grego::gregorianShift(internalGet(UCAL_EXTENDED_YEAR)); if (bestField == UCAL_DAY_OF_YEAR) { #if defined (U_DEBUG_CAL) fprintf(stderr, "%s:%d: [DOY%d] gregorian shift of JD %d += %d\n", __FILE__, __LINE__, fFields[bestField],jd, gregShift); #endif jd -= gregShift; } else if ( bestField == UCAL_WEEK_OF_MONTH ) { int32_t weekShift = 14; #if defined (U_DEBUG_CAL) fprintf(stderr, 
"%s:%d: [WOY/WOM] gregorian week shift of %d += %d\n", __FILE__, __LINE__, jd, weekShift); #endif jd += weekShift; // shift by weeks for week based fields. } } return jd; } int32_t GregorianCalendar::handleComputeMonthStart(int32_t eyear, int32_t month, UBool /* useMonth */) const { GregorianCalendar *nonConstThis = (GregorianCalendar*)this; // cast away const // If the month is out of range, adjust it into range, and // modify the extended year value accordingly. if (month < 0 || month > 11) { eyear += ClockMath::floorDivide(month, 12, &month); } UBool isLeap = eyear%4 == 0; int64_t y = (int64_t)eyear-1; int64_t julianDay = 365*y + ClockMath::floorDivide(y, (int64_t)4) + (kJan1_1JulianDay - 3); nonConstThis->fIsGregorian = (eyear >= fGregorianCutoverYear); #if defined (U_DEBUG_CAL) fprintf(stderr, "%s:%d: (hcms%d/%d) fIsGregorian %s, fInvertGregorian %s\n", __FILE__, __LINE__, eyear,month, fIsGregorian?"T":"F", fInvertGregorian?"T":"F"); #endif if (fInvertGregorian) { nonConstThis->fIsGregorian = !fIsGregorian; } if (fIsGregorian) { isLeap = isLeap && ((eyear%100 != 0) || (eyear%400 == 0)); // Add 2 because Gregorian calendar starts 2 days after // Julian calendar int32_t gregShift = Grego::gregorianShift(eyear); #if defined (U_DEBUG_CAL) fprintf(stderr, "%s:%d: (hcms%d/%d) gregorian shift of %d += %d\n", __FILE__, __LINE__, eyear, month, julianDay, gregShift); #endif julianDay += gregShift; } // At this point julianDay indicates the day BEFORE the first // day of January 1, of either the Julian or Gregorian // calendar. if (month != 0) { julianDay += isLeap?kLeapNumDays[month]:kNumDays[month]; } return static_cast(julianDay); } int32_t GregorianCalendar::handleGetMonthLength(int32_t extendedYear, int32_t month) const { // If the month is out of range, adjust it into range, and // modify the extended year value accordingly. if (month < 0 || month > 11) { extendedYear += ClockMath::floorDivide(month, 12, &month); } return isLeapYear(extendedYear) ? 
kLeapMonthLength[month] : kMonthLength[month]; } int32_t GregorianCalendar::handleGetYearLength(int32_t eyear) const { return isLeapYear(eyear) ? 366 : 365; } int32_t GregorianCalendar::monthLength(int32_t month) const { int32_t year = internalGet(UCAL_EXTENDED_YEAR); return handleGetMonthLength(year, month); } // ------------------------------------- int32_t GregorianCalendar::monthLength(int32_t month, int32_t year) const { return isLeapYear(year) ? kLeapMonthLength[month] : kMonthLength[month]; } // ------------------------------------- int32_t GregorianCalendar::yearLength() const { return isLeapYear(internalGet(UCAL_YEAR)) ? 366 : 365; } // ------------------------------------- UBool GregorianCalendar::validateFields() const { for (int32_t field = 0; field < UCAL_FIELD_COUNT; field++) { // Ignore DATE and DAY_OF_YEAR which are handled below if (field != UCAL_DATE && field != UCAL_DAY_OF_YEAR && isSet((UCalendarDateFields)field) && ! boundsCheck(internalGet((UCalendarDateFields)field), (UCalendarDateFields)field)) return false; } // Values differ in Least-Maximum and Maximum should be handled // specially. if (isSet(UCAL_DATE)) { int32_t date = internalGet(UCAL_DATE); if (date < getMinimum(UCAL_DATE) || date > monthLength(internalGetMonth())) { return false; } } if (isSet(UCAL_DAY_OF_YEAR)) { int32_t days = internalGet(UCAL_DAY_OF_YEAR); if (days < 1 || days > yearLength()) { return false; } } // Handle DAY_OF_WEEK_IN_MONTH, which must not have the value zero. // We've checked against minimum and maximum above already. 
if (isSet(UCAL_DAY_OF_WEEK_IN_MONTH) && 0 == internalGet(UCAL_DAY_OF_WEEK_IN_MONTH)) { return false; } return true; } // ------------------------------------- UBool GregorianCalendar::boundsCheck(int32_t value, UCalendarDateFields field) const { return value >= getMinimum(field) && value <= getMaximum(field); } // ------------------------------------- UDate GregorianCalendar::getEpochDay(UErrorCode& status) { complete(status); // Divide by 1000 (convert to seconds) in order to prevent overflow when // dealing with UDate(Long.MIN_VALUE) and UDate(Long.MAX_VALUE). double wallSec = internalGetTime()/1000 + (internalGet(UCAL_ZONE_OFFSET) + internalGet(UCAL_DST_OFFSET))/1000; return ClockMath::floorDivide(wallSec, kOneDay/1000.0); } // ------------------------------------- // ------------------------------------- /** * Compute the julian day number of the day BEFORE the first day of * January 1, year 1 of the given calendar. If julianDay == 0, it * specifies (Jan. 1, 1) - 1, in whatever calendar we are using (Julian * or Gregorian). */ double GregorianCalendar::computeJulianDayOfYear(UBool isGregorian, int32_t year, UBool& isLeap) { isLeap = year%4 == 0; int32_t y = year - 1; double julianDay = 365.0*y + ClockMath::floorDivide(y, 4) + (kJan1_1JulianDay - 3); if (isGregorian) { isLeap = isLeap && ((year%100 != 0) || (year%400 == 0)); // Add 2 because Gregorian calendar starts 2 days after Julian calendar julianDay += Grego::gregorianShift(year); } return julianDay; } // /** // * Compute the day of week, relative to the first day of week, from // * 0..6, of the current DOW_LOCAL or DAY_OF_WEEK fields. This is // * equivalent to get(DOW_LOCAL) - 1. 
// */ // int32_t GregorianCalendar::computeRelativeDOW() const { // int32_t relDow = 0; // if (fStamp[UCAL_DOW_LOCAL] > fStamp[UCAL_DAY_OF_WEEK]) { // relDow = internalGet(UCAL_DOW_LOCAL) - 1; // 1-based // } else if (fStamp[UCAL_DAY_OF_WEEK] != kUnset) { // relDow = internalGet(UCAL_DAY_OF_WEEK) - getFirstDayOfWeek(); // if (relDow < 0) relDow += 7; // } // return relDow; // } // /** // * Compute the day of week, relative to the first day of week, // * from 0..6 of the given julian day. // */ // int32_t GregorianCalendar::computeRelativeDOW(double julianDay) const { // int32_t relDow = julianDayToDayOfWeek(julianDay) - getFirstDayOfWeek(); // if (relDow < 0) { // relDow += 7; // } // return relDow; // } // /** // * Compute the DOY using the WEEK_OF_YEAR field and the julian day // * of the day BEFORE January 1 of a year (a return value from // * computeJulianDayOfYear). // */ // int32_t GregorianCalendar::computeDOYfromWOY(double julianDayOfYear) const { // // Compute DOY from day of week plus week of year // // Find the day of the week for the first of this year. This // // is zero-based, with 0 being the locale-specific first day of // // the week. Add 1 to get first day of year. // int32_t fdy = computeRelativeDOW(julianDayOfYear + 1); // return // // Compute doy of first (relative) DOW of WOY 1 // (((7 - fdy) < getMinimalDaysInFirstWeek()) // ? (8 - fdy) : (1 - fdy)) // // Adjust for the week number. 
// + (7 * (internalGet(UCAL_WEEK_OF_YEAR) - 1)) // // Adjust for the DOW // + computeRelativeDOW(); // } // ------------------------------------- double GregorianCalendar::millisToJulianDay(UDate millis) { return (double)kEpochStartAsJulianDay + ClockMath::floorDivide(millis, (double)kOneDay); } // ------------------------------------- UDate GregorianCalendar::julianDayToMillis(double julian) { return (UDate) ((julian - kEpochStartAsJulianDay) * (double) kOneDay); } // ------------------------------------- int32_t GregorianCalendar::aggregateStamp(int32_t stamp_a, int32_t stamp_b) { return (((stamp_a != kUnset && stamp_b != kUnset) ? uprv_max(stamp_a, stamp_b) : (int32_t)kUnset)); } // ------------------------------------- /** * Roll a field by a signed amount. * Note: This will be made public later. [LIU] */ void GregorianCalendar::roll(EDateFields field, int32_t amount, UErrorCode& status) { roll((UCalendarDateFields) field, amount, status); } void GregorianCalendar::roll(UCalendarDateFields field, int32_t amount, UErrorCode& status) UPRV_NO_SANITIZE_UNDEFINED { if((amount == 0) || U_FAILURE(status)) { return; } // J81 processing. (gregorian cutover) UBool inCutoverMonth = false; int32_t cMonthLen=0; // 'c' for cutover; in days int32_t cDayOfMonth=0; // no discontinuity: [0, cMonthLen) double cMonthStart=0.0; // in ms // Common code - see if we're in the cutover month of the cutover year if(get(UCAL_EXTENDED_YEAR, status) == fGregorianCutoverYear) { switch (field) { case UCAL_DAY_OF_MONTH: case UCAL_WEEK_OF_MONTH: { int32_t max = monthLength(internalGetMonth()); UDate t = internalGetTime(); // We subtract 1 from the DAY_OF_MONTH to make it zero-based, and an // additional 10 if we are after the cutover. Thus the monthStart // value will be correct iff we actually are in the cutover month. cDayOfMonth = internalGet(UCAL_DAY_OF_MONTH) - ((t >= fGregorianCutover) ? 
10 : 0); cMonthStart = t - ((cDayOfMonth - 1) * kOneDay); // A month containing the cutover is 10 days shorter. if ((cMonthStart < fGregorianCutover) && (cMonthStart + (cMonthLen=(max-10))*kOneDay >= fGregorianCutover)) { inCutoverMonth = true; } } break; default: ; } } switch (field) { case UCAL_WEEK_OF_YEAR: { // Unlike WEEK_OF_MONTH, WEEK_OF_YEAR never shifts the day of the // week. Also, rolling the week of the year can have seemingly // strange effects simply because the year of the week of year // may be different from the calendar year. For example, the // date Dec 28, 1997 is the first day of week 1 of 1998 (if // weeks start on Sunday and the minimal days in first week is // <= 3). int32_t woy = get(UCAL_WEEK_OF_YEAR, status); // Get the ISO year, which matches the week of year. This // may be one year before or after the calendar year. int32_t isoYear = get(UCAL_YEAR_WOY, status); int32_t isoDoy = internalGet(UCAL_DAY_OF_YEAR); if (internalGetMonth() == UCAL_JANUARY) { if (woy >= 52) { isoDoy += handleGetYearLength(isoYear); } } else { if (woy == 1) { isoDoy -= handleGetYearLength(isoYear - 1); } } woy += amount; // Do fast checks to avoid unnecessary computation: if (woy < 1 || woy > 52) { // Determine the last week of the ISO year. // We do this using the standard formula we use // everywhere in this file. If we can see that the // days at the end of the year are going to fall into // week 1 of the next year, we drop the last week by // subtracting 7 from the last day of the year. 
int32_t lastDoy = handleGetYearLength(isoYear); int32_t lastRelDow = (lastDoy - isoDoy + internalGet(UCAL_DAY_OF_WEEK) - getFirstDayOfWeek()) % 7; if (lastRelDow < 0) lastRelDow += 7; if ((6 - lastRelDow) >= getMinimalDaysInFirstWeek()) lastDoy -= 7; int32_t lastWoy = weekNumber(lastDoy, lastRelDow + 1); woy = ((woy + lastWoy - 1) % lastWoy) + 1; } set(UCAL_WEEK_OF_YEAR, woy); set(UCAL_YEAR_WOY,isoYear); return; } case UCAL_DAY_OF_MONTH: if( !inCutoverMonth ) { Calendar::roll(field, amount, status); return; } else { // [j81] 1582 special case for DOM // The default computation works except when the current month // contains the Gregorian cutover. We handle this special case // here. [j81 - aliu] double monthLen = cMonthLen * kOneDay; double msIntoMonth = uprv_fmod(internalGetTime() - cMonthStart + amount * kOneDay, monthLen); if (msIntoMonth < 0) { msIntoMonth += monthLen; } #if defined (U_DEBUG_CAL) fprintf(stderr, "%s:%d: roll DOM %d -> %.0lf ms \n", __FILE__, __LINE__,amount, cMonthLen, cMonthStart+msIntoMonth); #endif setTimeInMillis(cMonthStart + msIntoMonth, status); return; } case UCAL_WEEK_OF_MONTH: if( !inCutoverMonth ) { Calendar::roll(field, amount, status); return; } else { #if defined (U_DEBUG_CAL) fprintf(stderr, "%s:%d: roll WOM %d ??????????????????? \n", __FILE__, __LINE__,amount); #endif // NOTE: following copied from the old // GregorianCalendar::roll( WEEK_OF_MONTH ) code // This is tricky, because during the roll we may have to shift // to a different day of the week. For example: // s m t w r f s // 1 2 3 4 5 // 6 7 8 9 10 11 12 // When rolling from the 6th or 7th back one week, we go to the // 1st (assuming that the first partial week counts). The same // thing happens at the end of the month. // The other tricky thing is that we have to figure out whether // the first partial week actually counts or not, based on the // minimal first days in the week. 
And we have to use the // correct first day of the week to delineate the week // boundaries. // Here's our algorithm. First, we find the real boundaries of // the month. Then we discard the first partial week if it // doesn't count in this locale. Then we fill in the ends with // phantom days, so that the first partial week and the last // partial week are full weeks. We then have a nice square // block of weeks. We do the usual rolling within this block, // as is done elsewhere in this method. If we wind up on one of // the phantom days that we added, we recognize this and pin to // the first or the last day of the month. Easy, eh? // Another wrinkle: To fix jitterbug 81, we have to make all this // work in the oddball month containing the Gregorian cutover. // This month is 10 days shorter than usual, and also contains // a discontinuity in the days; e.g., the default cutover month // is Oct 1582, and goes from day of month 4 to day of month 15. // Normalize the DAY_OF_WEEK so that 0 is the first day of the week // in this locale. We have dow in 0..6. int32_t dow = internalGet(UCAL_DAY_OF_WEEK) - getFirstDayOfWeek(); if (dow < 0) dow += 7; // Find the day of month, compensating for cutover discontinuity. int32_t dom = cDayOfMonth; // Find the day of the week (normalized for locale) for the first // of the month. int32_t fdm = (dow - dom + 1) % 7; if (fdm < 0) fdm += 7; // Get the first day of the first full week of the month, // including phantom days, if any. Figure out if the first week // counts or not; if it counts, then fill in phantom days. If // not, advance to the first real full week (skip the partial week). int32_t start; if ((7 - fdm) < getMinimalDaysInFirstWeek()) start = 8 - fdm; // Skip the first partial week else start = 1 - fdm; // This may be zero or negative // Get the day of the week (normalized for locale) for the last // day of the month. 
int32_t monthLen = cMonthLen; int32_t ldm = (monthLen - dom + dow) % 7; // We know monthLen >= DAY_OF_MONTH so we skip the += 7 step here. // Get the limit day for the blocked-off rectangular month; that // is, the day which is one past the last day of the month, // after the month has already been filled in with phantom days // to fill out the last week. This day has a normalized DOW of 0. int32_t limit = monthLen + 7 - ldm; // Now roll between start and (limit - 1). int32_t gap = limit - start; int32_t newDom = (dom + amount*7 - start) % gap; if (newDom < 0) newDom += gap; newDom += start; // Finally, pin to the real start and end of the month. if (newDom < 1) newDom = 1; if (newDom > monthLen) newDom = monthLen; // Set the DAY_OF_MONTH. We rely on the fact that this field // takes precedence over everything else (since all other fields // are also set at this point). If this fact changes (if the // disambiguation algorithm changes) then we will have to unset // the appropriate fields here so that DAY_OF_MONTH is attended // to. // If we are in the cutover month, manipulate ms directly. Don't do // this in general because it doesn't work across DST boundaries // (details, details). This takes care of the discontinuity. setTimeInMillis(cMonthStart + (newDom-1)*kOneDay, status); return; } default: Calendar::roll(field, amount, status); return; } } // ------------------------------------- /** * Return the minimum value that this field could have, given the current date. * For the Gregorian calendar, this is the same as getMinimum() and getGreatestMinimum(). * @param field the time field. * @return the minimum value that this field could have, given the current date. * @deprecated ICU 2.6. Use getActualMinimum(UCalendarDateFields field) instead. 
*/ int32_t GregorianCalendar::getActualMinimum(EDateFields field) const { return getMinimum((UCalendarDateFields)field); } int32_t GregorianCalendar::getActualMinimum(EDateFields field, UErrorCode& /* status */) const { return getMinimum((UCalendarDateFields)field); } /** * Return the minimum value that this field could have, given the current date. * For the Gregorian calendar, this is the same as getMinimum() and getGreatestMinimum(). * @param field the time field. * @return the minimum value that this field could have, given the current date. * @draft ICU 2.6. */ int32_t GregorianCalendar::getActualMinimum(UCalendarDateFields field, UErrorCode& /* status */) const { return getMinimum(field); } // ------------------------------------ /** * Old year limits were least max 292269054, max 292278994. */ /** * @stable ICU 2.0 */ int32_t GregorianCalendar::handleGetLimit(UCalendarDateFields field, ELimitType limitType) const { return kGregorianCalendarLimits[field][limitType]; } /** * Return the maximum value that this field could have, given the current date. * For example, with the date "Feb 3, 1997" and the DAY_OF_MONTH field, the actual * maximum would be 28; for "Feb 3, 1996" it s 29. Similarly for a Hebrew calendar, * for some years the actual maximum for MONTH is 12, and for others 13. * @stable ICU 2.0 */ int32_t GregorianCalendar::getActualMaximum(UCalendarDateFields field, UErrorCode& status) const { /* It is a known limitation that the code here (and in getActualMinimum) * won't behave properly at the extreme limits of GregorianCalendar's * representable range (except for the code that handles the YEAR * field). That's because the ends of the representable range are at * odd spots in the year. For calendars with the default Gregorian * cutover, these limits are Sun Dec 02 16:47:04 GMT 292269055 BC to Sun * Aug 17 07:12:55 GMT 292278994 AD, somewhat different for non-GMT * zones. 
As a result, if the calendar is set to Aug 1 292278994 AD, * the actual maximum of DAY_OF_MONTH is 17, not 30. If the date is Mar * 31 in that year, the actual maximum month might be Jul, whereas is * the date is Mar 15, the actual maximum might be Aug -- depending on * the precise semantics that are desired. Similar considerations * affect all fields. Nonetheless, this effect is sufficiently arcane * that we permit it, rather than complicating the code to handle such * intricacies. - liu 8/20/98 * UPDATE: No longer true, since we have pulled in the limit values on * the year. - Liu 11/6/00 */ switch (field) { case UCAL_YEAR: /* The year computation is no different, in principle, from the * others, however, the range of possible maxima is large. In * addition, the way we know we've exceeded the range is different. * For these reasons, we use the special case code below to handle * this field. * * The actual maxima for YEAR depend on the type of calendar: * * Gregorian = May 17, 292275056 BC - Aug 17, 292278994 AD * Julian = Dec 2, 292269055 BC - Jan 3, 292272993 AD * Hybrid = Dec 2, 292269055 BC - Aug 17, 292278994 AD * * We know we've exceeded the maximum when either the month, date, * time, or era changes in response to setting the year. We don't * check for month, date, and time here because the year and era are * sufficient to detect an invalid year setting. NOTE: If code is * added to check the month and date in the future for some reason, * Feb 29 must be allowed to shift to Mar 1 when setting the year. */ { if(U_FAILURE(status)) return 0; Calendar *cal = clone(); if(!cal) { status = U_MEMORY_ALLOCATION_ERROR; return 0; } cal->setLenient(true); int32_t era = cal->get(UCAL_ERA, status); UDate d = cal->getTime(status); /* Perform a binary search, with the invariant that lowGood is a * valid year, and highBad is an out of range year. 
*/ int32_t lowGood = kGregorianCalendarLimits[UCAL_YEAR][1]; int32_t highBad = kGregorianCalendarLimits[UCAL_YEAR][2]+1; while ((lowGood + 1) < highBad) { int32_t y = (lowGood + highBad) / 2; cal->set(UCAL_YEAR, y); if (cal->get(UCAL_YEAR, status) == y && cal->get(UCAL_ERA, status) == era) { lowGood = y; } else { highBad = y; cal->setTime(d, status); // Restore original fields } } delete cal; return lowGood; } default: return Calendar::getActualMaximum(field,status); } } int32_t GregorianCalendar::handleGetExtendedYear() { // the year to return int32_t year = kEpochYear; // year field to use int32_t yearField = UCAL_EXTENDED_YEAR; // There are three separate fields which could be used to // derive the proper year. Use the one most recently set. if (fStamp[yearField] < fStamp[UCAL_YEAR]) yearField = UCAL_YEAR; if (fStamp[yearField] < fStamp[UCAL_YEAR_WOY]) yearField = UCAL_YEAR_WOY; // based on the "best" year field, get the year switch(yearField) { case UCAL_EXTENDED_YEAR: year = internalGet(UCAL_EXTENDED_YEAR, kEpochYear); break; case UCAL_YEAR: { // The year defaults to the epoch start, the era to AD int32_t era = internalGet(UCAL_ERA, AD); if (era == BC) { year = 1 - internalGet(UCAL_YEAR, 1); // Convert to extended year } else { year = internalGet(UCAL_YEAR, kEpochYear); } } break; case UCAL_YEAR_WOY: year = handleGetExtendedYearFromWeekFields(internalGet(UCAL_YEAR_WOY), internalGet(UCAL_WEEK_OF_YEAR)); #if defined (U_DEBUG_CAL) // if(internalGet(UCAL_YEAR_WOY) != year) { fprintf(stderr, "%s:%d: hGEYFWF[%d,%d] -> %d\n", __FILE__, __LINE__,internalGet(UCAL_YEAR_WOY),internalGet(UCAL_WEEK_OF_YEAR),year); //} #endif break; default: year = kEpochYear; } return year; } int32_t GregorianCalendar::handleGetExtendedYearFromWeekFields(int32_t yearWoy, int32_t woy) { // convert year to extended form int32_t era = internalGet(UCAL_ERA, AD); if(era == BC) { yearWoy = 1 - yearWoy; } return Calendar::handleGetExtendedYearFromWeekFields(yearWoy, woy); } // 
------------------------------------- /** * Return the ERA. We need a special method for this because the * default ERA is AD, but a zero (unset) ERA is BC. */ int32_t GregorianCalendar::internalGetEra() const { return isSet(UCAL_ERA) ? internalGet(UCAL_ERA) : (int32_t)AD; } const char * GregorianCalendar::getType() const { //static const char kGregorianType = "gregorian"; return "gregorian"; } /** * The system maintains a static default century start date and Year. They are * initialized the first time they are used. Once the system default century date * and year are set, they do not change. */ static UDate gSystemDefaultCenturyStart = DBL_MIN; static int32_t gSystemDefaultCenturyStartYear = -1; static icu::UInitOnce gSystemDefaultCenturyInit {}; UBool GregorianCalendar::haveDefaultCentury() const { return true; } static void U_CALLCONV initializeSystemDefaultCentury() { // initialize systemDefaultCentury and systemDefaultCenturyYear based // on the current time. They'll be set to 80 years before // the current time. UErrorCode status = U_ZERO_ERROR; GregorianCalendar calendar(status); if (U_SUCCESS(status)) { calendar.setTime(Calendar::getNow(), status); calendar.add(UCAL_YEAR, -80, status); gSystemDefaultCenturyStart = calendar.getTime(status); gSystemDefaultCenturyStartYear = calendar.get(UCAL_YEAR, status); } // We have no recourse upon failure unless we want to propagate the failure // out. 
} UDate GregorianCalendar::defaultCenturyStart() const { // lazy-evaluate systemDefaultCenturyStart umtx_initOnce(gSystemDefaultCenturyInit, &initializeSystemDefaultCentury); return gSystemDefaultCenturyStart; } int32_t GregorianCalendar::defaultCenturyStartYear() const { // lazy-evaluate systemDefaultCenturyStartYear umtx_initOnce(gSystemDefaultCenturyInit, &initializeSystemDefaultCentury); return gSystemDefaultCenturyStartYear; } U_NAMESPACE_END #endif /* #if !UCONFIG_NO_FORMATTING */ //eof stringi/src/icu74/i18n/formatted_string_builder.cpp0000644000176200001440000003573714700200761022033 0ustar liggesusers// © 2017 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html #include "unicode/utypes.h" #if !UCONFIG_NO_FORMATTING #include "formatted_string_builder.h" #include "putilimp.h" #include "unicode/ustring.h" #include "unicode/utf16.h" #include "unicode/unum.h" // for UNumberFormatFields literals namespace { // A version of uprv_memcpy that checks for length 0. // By default, uprv_memcpy requires a length of at least 1. inline void uprv_memcpy2(void* dest, const void* src, size_t len) { if (len > 0) { uprv_memcpy(dest, src, len); } } // A version of uprv_memmove that checks for length 0. // By default, uprv_memmove requires a length of at least 1. inline void uprv_memmove2(void* dest, const void* src, size_t len) { if (len > 0) { uprv_memmove(dest, src, len); } } } // namespace U_NAMESPACE_BEGIN FormattedStringBuilder::FormattedStringBuilder() { #if U_DEBUG // Initializing the memory to non-zero helps catch some bugs that involve // reading from an improperly terminated string. for (int32_t i=0; i DEFAULT_CAPACITY) { // FIXME: uprv_malloc // C++ note: malloc appears in two places: here and in prepareForInsertHelper. 
auto newChars = static_cast (uprv_malloc(sizeof(char16_t) * capacity)); auto newFields = static_cast(uprv_malloc(sizeof(Field) * capacity)); if (newChars == nullptr || newFields == nullptr) { // UErrorCode is not available; fail silently. uprv_free(newChars); uprv_free(newFields); *this = FormattedStringBuilder(); // can't fail return *this; } fUsingHeap = true; fChars.heap.capacity = capacity; fChars.heap.ptr = newChars; fFields.heap.capacity = capacity; fFields.heap.ptr = newFields; } uprv_memcpy2(getCharPtr(), other.getCharPtr(), sizeof(char16_t) * capacity); uprv_memcpy2(getFieldPtr(), other.getFieldPtr(), sizeof(Field) * capacity); fZero = other.fZero; fLength = other.fLength; return *this; } int32_t FormattedStringBuilder::length() const { return fLength; } int32_t FormattedStringBuilder::codePointCount() const { return u_countChar32(getCharPtr() + fZero, fLength); } UChar32 FormattedStringBuilder::getFirstCodePoint() const { if (fLength == 0) { return -1; } UChar32 cp; U16_GET(getCharPtr() + fZero, 0, 0, fLength, cp); return cp; } UChar32 FormattedStringBuilder::getLastCodePoint() const { if (fLength == 0) { return -1; } int32_t offset = fLength; U16_BACK_1(getCharPtr() + fZero, 0, offset); UChar32 cp; U16_GET(getCharPtr() + fZero, 0, offset, fLength, cp); return cp; } UChar32 FormattedStringBuilder::codePointAt(int32_t index) const { UChar32 cp; U16_GET(getCharPtr() + fZero, 0, index, fLength, cp); return cp; } UChar32 FormattedStringBuilder::codePointBefore(int32_t index) const { int32_t offset = index; U16_BACK_1(getCharPtr() + fZero, 0, offset); UChar32 cp; U16_GET(getCharPtr() + fZero, 0, offset, fLength, cp); return cp; } FormattedStringBuilder &FormattedStringBuilder::clear() { // TODO: Reset the heap here? 
fZero = getCapacity() / 2; fLength = 0; return *this; } int32_t FormattedStringBuilder::insertCodePoint(int32_t index, UChar32 codePoint, Field field, UErrorCode &status) { int32_t count = U16_LENGTH(codePoint); int32_t position = prepareForInsert(index, count, status); if (U_FAILURE(status)) { return count; } if (count == 1) { getCharPtr()[position] = (char16_t) codePoint; getFieldPtr()[position] = field; } else { getCharPtr()[position] = U16_LEAD(codePoint); getCharPtr()[position + 1] = U16_TRAIL(codePoint); getFieldPtr()[position] = getFieldPtr()[position + 1] = field; } return count; } int32_t FormattedStringBuilder::insert(int32_t index, const UnicodeString &unistr, Field field, UErrorCode &status) { if (unistr.length() == 0) { // Nothing to insert. return 0; } else if (unistr.length() == 1) { // Fast path: insert using insertCodePoint. return insertCodePoint(index, unistr.charAt(0), field, status); } else { return insert(index, unistr, 0, unistr.length(), field, status); } } int32_t FormattedStringBuilder::insert(int32_t index, const UnicodeString &unistr, int32_t start, int32_t end, Field field, UErrorCode &status) { int32_t count = end - start; int32_t position = prepareForInsert(index, count, status); if (U_FAILURE(status)) { return count; } for (int32_t i = 0; i < count; i++) { getCharPtr()[position + i] = unistr.charAt(start + i); getFieldPtr()[position + i] = field; } return count; } int32_t FormattedStringBuilder::splice(int32_t startThis, int32_t endThis, const UnicodeString &unistr, int32_t startOther, int32_t endOther, Field field, UErrorCode& status) { int32_t thisLength = endThis - startThis; int32_t otherLength = endOther - startOther; int32_t count = otherLength - thisLength; if (U_FAILURE(status)) { return count; } int32_t position; if (count > 0) { // Overall, chars need to be added. position = prepareForInsert(startThis, count, status); } else { // Overall, chars need to be removed or kept the same. 
position = remove(startThis, -count); } if (U_FAILURE(status)) { return count; } for (int32_t i = 0; i < otherLength; i++) { getCharPtr()[position + i] = unistr.charAt(startOther + i); getFieldPtr()[position + i] = field; } return count; } int32_t FormattedStringBuilder::append(const FormattedStringBuilder &other, UErrorCode &status) { return insert(fLength, other, status); } int32_t FormattedStringBuilder::insert(int32_t index, const FormattedStringBuilder &other, UErrorCode &status) { if (U_FAILURE(status)) { return 0; } if (this == &other) { status = U_ILLEGAL_ARGUMENT_ERROR; return 0; } int32_t count = other.fLength; if (count == 0) { // Nothing to insert. return 0; } int32_t position = prepareForInsert(index, count, status); if (U_FAILURE(status)) { return count; } for (int32_t i = 0; i < count; i++) { getCharPtr()[position + i] = other.charAt(i); getFieldPtr()[position + i] = other.fieldAt(i); } return count; } void FormattedStringBuilder::writeTerminator(UErrorCode& status) { int32_t position = prepareForInsert(fLength, 1, status); if (U_FAILURE(status)) { return; } getCharPtr()[position] = 0; getFieldPtr()[position] = kUndefinedField; fLength--; } int32_t FormattedStringBuilder::prepareForInsert(int32_t index, int32_t count, UErrorCode &status) { U_ASSERT(index >= 0); U_ASSERT(index <= fLength); U_ASSERT(count >= 0); U_ASSERT(fZero >= 0); U_ASSERT(fLength >= 0); U_ASSERT(getCapacity() - fZero >= fLength); if (U_FAILURE(status)) { return count; } if (index == 0 && fZero - count >= 0) { // Append to start fZero -= count; fLength += count; return fZero; } else if (index == fLength && count <= getCapacity() - fZero - fLength) { // Append to end fLength += count; return fZero + fLength - count; } else { // Move chars around and/or allocate more space return prepareForInsertHelper(index, count, status); } } int32_t FormattedStringBuilder::prepareForInsertHelper(int32_t index, int32_t count, UErrorCode &status) { int32_t oldCapacity = getCapacity(); int32_t 
oldZero = fZero; char16_t *oldChars = getCharPtr(); Field *oldFields = getFieldPtr(); int32_t newLength; if (uprv_add32_overflow(fLength, count, &newLength)) { status = U_INPUT_TOO_LONG_ERROR; return -1; } int32_t newZero; if (newLength > oldCapacity) { if (newLength > INT32_MAX / 2) { // We do not support more than 1G char16_t in this code because // dealing with >2G *bytes* can cause subtle bugs. status = U_INPUT_TOO_LONG_ERROR; return -1; } // Keep newCapacity also to at most 1G char16_t. int32_t newCapacity = newLength * 2; newZero = (newCapacity - newLength) / 2; // C++ note: malloc appears in two places: here and in the assignment operator. auto newChars = static_cast (uprv_malloc(sizeof(char16_t) * static_cast(newCapacity))); auto newFields = static_cast(uprv_malloc(sizeof(Field) * static_cast(newCapacity))); if (newChars == nullptr || newFields == nullptr) { uprv_free(newChars); uprv_free(newFields); status = U_MEMORY_ALLOCATION_ERROR; return -1; } // First copy the prefix and then the suffix, leaving room for the new chars that the // caller wants to insert. // C++ note: memcpy is OK because the src and dest do not overlap. uprv_memcpy2(newChars + newZero, oldChars + oldZero, sizeof(char16_t) * index); uprv_memcpy2(newChars + newZero + index + count, oldChars + oldZero + index, sizeof(char16_t) * (fLength - index)); uprv_memcpy2(newFields + newZero, oldFields + oldZero, sizeof(Field) * index); uprv_memcpy2(newFields + newZero + index + count, oldFields + oldZero + index, sizeof(Field) * (fLength - index)); if (fUsingHeap) { uprv_free(oldChars); uprv_free(oldFields); } fUsingHeap = true; fChars.heap.ptr = newChars; fChars.heap.capacity = newCapacity; fFields.heap.ptr = newFields; fFields.heap.capacity = newCapacity; } else { newZero = (oldCapacity - newLength) / 2; // C++ note: memmove is required because src and dest may overlap. 
// First copy the entire string to the location of the prefix, and then move the suffix // to make room for the new chars that the caller wants to insert. uprv_memmove2(oldChars + newZero, oldChars + oldZero, sizeof(char16_t) * fLength); uprv_memmove2(oldChars + newZero + index + count, oldChars + newZero + index, sizeof(char16_t) * (fLength - index)); uprv_memmove2(oldFields + newZero, oldFields + oldZero, sizeof(Field) * fLength); uprv_memmove2(oldFields + newZero + index + count, oldFields + newZero + index, sizeof(Field) * (fLength - index)); } fZero = newZero; fLength = newLength; return fZero + index; } int32_t FormattedStringBuilder::remove(int32_t index, int32_t count) { U_ASSERT(0 <= index); U_ASSERT(index <= fLength); U_ASSERT(count <= (fLength - index)); U_ASSERT(index <= getCapacity() - fZero); int32_t position = index + fZero; // TODO: Reset the heap here? (If the string after removal can fit on stack?) uprv_memmove2(getCharPtr() + position, getCharPtr() + position + count, sizeof(char16_t) * (fLength - index - count)); uprv_memmove2(getFieldPtr() + position, getFieldPtr() + position + count, sizeof(Field) * (fLength - index - count)); fLength -= count; return position; } UnicodeString FormattedStringBuilder::toUnicodeString() const { return UnicodeString(getCharPtr() + fZero, fLength); } const UnicodeString FormattedStringBuilder::toTempUnicodeString() const { // Readonly-alias constructor: return UnicodeString(false, getCharPtr() + fZero, fLength); } UnicodeString FormattedStringBuilder::toDebugString() const { UnicodeString sb; sb.append(u"", -1); return sb; } const char16_t *FormattedStringBuilder::chars() const { return getCharPtr() + fZero; } bool FormattedStringBuilder::contentEquals(const FormattedStringBuilder &other) const { if (fLength != other.fLength) { return false; } for (int32_t i = 0; i < fLength; i++) { if (charAt(i) != other.charAt(i) || fieldAt(i) != other.fieldAt(i)) { return false; } } return true; } bool 
FormattedStringBuilder::containsField(Field field) const { for (int32_t i = 0; i < fLength; i++) { if (field == fieldAt(i)) { return true; } } return false; } U_NAMESPACE_END #endif /* #if !UCONFIG_NO_FORMATTING */ stringi/src/icu74/i18n/csmatch.cpp0000644000176200001440000000323014700200761016353 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ********************************************************************** * Copyright (C) 2005-2012, International Business Machines * Corporation and others. All Rights Reserved. ********************************************************************** */ #include "unicode/utypes.h" #if !UCONFIG_NO_CONVERSION #include "unicode/unistr.h" #include "unicode/ucnv.h" #include "csmatch.h" #include "csrecog.h" #include "inputext.h" U_NAMESPACE_BEGIN CharsetMatch::CharsetMatch() : textIn(nullptr), confidence(0), fCharsetName(nullptr), fLang(nullptr) { // nothing else to do. } void CharsetMatch::set(InputText *input, const CharsetRecognizer *cr, int32_t conf, const char *csName, const char *lang) { textIn = input; confidence = conf; fCharsetName = csName; fLang = lang; if (cr != nullptr) { if (fCharsetName == nullptr) { fCharsetName = cr->getName(); } if (fLang == nullptr) { fLang = cr->getLanguage(); } } } const char* CharsetMatch::getName()const { return fCharsetName; } const char* CharsetMatch::getLanguage()const { return fLang; } int32_t CharsetMatch::getConfidence()const { return confidence; } int32_t CharsetMatch::getUChars(char16_t *buf, int32_t cap, UErrorCode *status) const { UConverter *conv = ucnv_open(getName(), status); int32_t result = ucnv_toUChars(conv, buf, cap, (const char *) textIn->fRawInput, textIn->fRawLength, status); ucnv_close(conv); return result; } U_NAMESPACE_END #endif stringi/src/icu74/i18n/cpdtrans.cpp0000644000176200001440000005201414700200761016553 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others. 
// License & terms of use: http://www.unicode.org/copyright.html /* ********************************************************************** * Copyright (C) 1999-2011, International Business Machines * Corporation and others. All Rights Reserved. ********************************************************************** * Date Name Description * 11/17/99 aliu Creation. ********************************************************************** */ #include "unicode/utypes.h" #if !UCONFIG_NO_TRANSLITERATION #include "unicode/unifilt.h" #include "unicode/uniset.h" #include "cpdtrans.h" #include "uvector.h" #include "tridpars.h" #include "cmemory.h" // keep in sync with Transliterator //static const char16_t ID_SEP = 0x002D; /*-*/ static const char16_t ID_DELIM = 0x003B; /*;*/ static const char16_t NEWLINE = 10; static const char16_t COLON_COLON[] = {0x3A, 0x3A, 0}; //"::" U_NAMESPACE_BEGIN const char16_t CompoundTransliterator::PASS_STRING[] = { 0x0025, 0x0050, 0x0061, 0x0073, 0x0073, 0 }; // "%Pass" UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CompoundTransliterator) /** * Constructs a new compound transliterator given an array of * transliterators. The array of transliterators may be of any * length, including zero or one, however, useful compound * transliterators have at least two components. * @param transliterators array of Transliterator * objects * @param transliteratorCount The number of * Transliterator objects in transliterators. * @param filter the filter. Any character for which * filter.contains() returns false will not be * altered by this transliterator. If filter is * null then no filtering is applied. */ CompoundTransliterator::CompoundTransliterator( Transliterator* const transliterators[], int32_t transliteratorCount, UnicodeFilter* adoptedFilter) : Transliterator(joinIDs(transliterators, transliteratorCount), adoptedFilter), trans(0), count(0), numAnonymousRBTs(0) { setTransliterators(transliterators, transliteratorCount); } /** * Splits an ID of the form "ID;ID;..." 
into a compound using each * of the IDs. * @param id of above form * @param forward if false, does the list in reverse order, and * takes the inverse of each ID. */ CompoundTransliterator::CompoundTransliterator(const UnicodeString& id, UTransDirection direction, UnicodeFilter* adoptedFilter, UParseError& /*parseError*/, UErrorCode& status) : Transliterator(id, adoptedFilter), trans(0), numAnonymousRBTs(0) { // TODO add code for parseError...currently unused, but // later may be used by parsing code... init(id, direction, true, status); } CompoundTransliterator::CompoundTransliterator(const UnicodeString& id, UParseError& /*parseError*/, UErrorCode& status) : Transliterator(id, 0), // set filter to 0 here! trans(0), numAnonymousRBTs(0) { // TODO add code for parseError...currently unused, but // later may be used by parsing code... init(id, UTRANS_FORWARD, true, status); } /** * Private constructor for use of TransliteratorAlias */ CompoundTransliterator::CompoundTransliterator(const UnicodeString& newID, UVector& list, UnicodeFilter* adoptedFilter, int32_t anonymousRBTs, UParseError& /*parseError*/, UErrorCode& status) : Transliterator(newID, adoptedFilter), trans(0), numAnonymousRBTs(anonymousRBTs) { init(list, UTRANS_FORWARD, false, status); } /** * Private constructor for Transliterator from a vector of * transliterators. The caller is responsible for fixing up the * ID. */ CompoundTransliterator::CompoundTransliterator(UVector& list, UParseError& /*parseError*/, UErrorCode& status) : Transliterator(UnicodeString(), nullptr), trans(0), numAnonymousRBTs(0) { // TODO add code for parseError...currently unused, but // later may be used by parsing code... 
init(list, UTRANS_FORWARD, false, status); // assume caller will fixup ID } CompoundTransliterator::CompoundTransliterator(UVector& list, int32_t anonymousRBTs, UParseError& /*parseError*/, UErrorCode& status) : Transliterator(UnicodeString(), nullptr), trans(0), numAnonymousRBTs(anonymousRBTs) { init(list, UTRANS_FORWARD, false, status); } /** * Finish constructing a transliterator: only to be called by * constructors. Before calling init(), set trans and filter to nullptr. * @param id the id containing ';'-separated entries * @param direction either FORWARD or REVERSE * @param idSplitPoint the index into id at which the * adoptedSplitTransliterator should be inserted, if there is one, or * -1 if there is none. * @param adoptedSplitTransliterator a transliterator to be inserted * before the entry at offset idSplitPoint in the id string. May be * nullptr to insert no entry. * @param fixReverseID if true, then reconstruct the ID of reverse * entries by calling getID() of component entries. Some constructors * do not require this because they apply a facade ID anyway. * @param status the error code indicating success or failure */ void CompoundTransliterator::init(const UnicodeString& id, UTransDirection direction, UBool fixReverseID, UErrorCode& status) { // assert(trans == 0); if (U_FAILURE(status)) { return; } UVector list(status); UnicodeSet* compoundFilter = nullptr; UnicodeString regenID; if (!TransliteratorIDParser::parseCompoundID(id, direction, regenID, list, compoundFilter)) { status = U_INVALID_ID; delete compoundFilter; return; } TransliteratorIDParser::instantiateList(list, status); init(list, direction, fixReverseID, status); if (compoundFilter != nullptr) { adoptFilter(compoundFilter); } } /** * Finish constructing a transliterator: only to be called by * constructors. Before calling init(), set trans and filter to nullptr. * @param list a vector of transliterator objects to be adopted. It * should NOT be empty. The list should be in declared order. 
That * is, it should be in the FORWARD order; if direction is REVERSE then * the list order will be reversed. * @param direction either FORWARD or REVERSE * @param fixReverseID if true, then reconstruct the ID of reverse * entries by calling getID() of component entries. Some constructors * do not require this because they apply a facade ID anyway. * @param status the error code indicating success or failure */ void CompoundTransliterator::init(UVector& list, UTransDirection direction, UBool fixReverseID, UErrorCode& status) { // assert(trans == 0); // Allocate array if (U_SUCCESS(status)) { count = list.size(); trans = (Transliterator **)uprv_malloc(count * sizeof(Transliterator *)); /* test for nullptr */ if (trans == 0) { status = U_MEMORY_ALLOCATION_ERROR; return; } } if (U_FAILURE(status) || trans == 0) { // assert(trans == 0); return; } // Move the transliterators from the vector into an array. // Reverse the order if necessary. int32_t i; for (i=0; i 0) { newID.append(ID_DELIM); } newID.append(trans[i]->getID()); } setID(newID); } computeMaximumContextLength(); } /** * Return the IDs of the given list of transliterators, concatenated * with ID_DELIM delimiting them. Equivalent to the perlish expression * join(ID_DELIM, map($_.getID(), transliterators). */ UnicodeString CompoundTransliterator::joinIDs(Transliterator* const transliterators[], int32_t transCount) { UnicodeString id; for (int32_t i=0; i 0) { id.append(ID_DELIM); } id.append(transliterators[i]->getID()); } return id; // Return temporary } /** * Copy constructor. 
*/ CompoundTransliterator::CompoundTransliterator(const CompoundTransliterator& t) : Transliterator(t), trans(0), count(0), numAnonymousRBTs(-1) { *this = t; } /** * Destructor */ CompoundTransliterator::~CompoundTransliterator() { freeTransliterators(); } void CompoundTransliterator::freeTransliterators() { if (trans != 0) { for (int32_t i=0; i count) { if (trans != nullptr) { uprv_free(trans); } trans = (Transliterator **)uprv_malloc(t.count * sizeof(Transliterator *)); } count = t.count; if (trans != nullptr) { for (i=0; iclone(); if (trans[i] == nullptr) { failed = true; break; } } } // if memory allocation failed delete backwards trans array if (failed && i > 0) { int32_t n; for (n = i-1; n >= 0; n--) { uprv_free(trans[n]); trans[n] = nullptr; } } numAnonymousRBTs = t.numAnonymousRBTs; return *this; } /** * Transliterator API. */ CompoundTransliterator* CompoundTransliterator::clone() const { return new CompoundTransliterator(*this); } /** * Returns the number of transliterators in this chain. * @return number of transliterators in this chain. */ int32_t CompoundTransliterator::getCount() const { return count; } /** * Returns the transliterator at the given index in this chain. 
* @param index index into chain, from 0 to getCount() - 1 * @return transliterator at the given index */ const Transliterator& CompoundTransliterator::getTransliterator(int32_t index) const { return *trans[index]; } void CompoundTransliterator::setTransliterators(Transliterator* const transliterators[], int32_t transCount) { Transliterator** a = (Transliterator **)uprv_malloc(transCount * sizeof(Transliterator *)); if (a == nullptr) { return; } int32_t i = 0; UBool failed = false; for (i=0; iclone(); if (a[i] == nullptr) { failed = true; break; } } if (failed && i > 0) { int32_t n; for (n = i-1; n >= 0; n--) { uprv_free(a[n]); a[n] = nullptr; } return; } adoptTransliterators(a, transCount); } void CompoundTransliterator::adoptTransliterators(Transliterator* adoptedTransliterators[], int32_t transCount) { // First free trans[] and set count to zero. Once this is done, // orphan the filter. Set up the new trans[]. freeTransliterators(); trans = adoptedTransliterators; count = transCount; computeMaximumContextLength(); setID(joinIDs(trans, count)); } /** * Append c to buf, unless buf is empty or buf already ends in c. */ static void _smartAppend(UnicodeString& buf, char16_t c) { if (buf.length() != 0 && buf.charAt(buf.length() - 1) != c) { buf.append(c); } } UnicodeString& CompoundTransliterator::toRules(UnicodeString& rulesSource, UBool escapeUnprintable) const { // We do NOT call toRules() on our component transliterators, in // general. If we have several rule-based transliterators, this // yields a concatenation of the rules -- not what we want. We do // handle compound RBT transliterators specially -- those for which // compoundRBTIndex >= 0. For the transliterator at compoundRBTIndex, // we do call toRules() recursively. rulesSource.truncate(0); if (numAnonymousRBTs >= 1 && getFilter() != nullptr) { // If we are a compound RBT and if we have a global // filter, then emit it at the top. 
UnicodeString pat; rulesSource.append(COLON_COLON, 2).append(getFilter()->toPattern(pat, escapeUnprintable)).append(ID_DELIM); } for (int32_t i=0; igetID().startsWith(PASS_STRING, 5)) { trans[i]->toRules(rule, escapeUnprintable); if (numAnonymousRBTs > 1 && i > 0 && trans[i - 1]->getID().startsWith(PASS_STRING, 5)) rule = UNICODE_STRING_SIMPLE("::Null;") + rule; // we also use toRules() on CompoundTransliterators (which we // check for by looking for a semicolon in the ID)-- this gets // the list of their child transliterators output in the right // format } else if (trans[i]->getID().indexOf(ID_DELIM) >= 0) { trans[i]->toRules(rule, escapeUnprintable); // for everything else, use Transliterator::toRules() } else { trans[i]->Transliterator::toRules(rule, escapeUnprintable); } _smartAppend(rulesSource, NEWLINE); rulesSource.append(rule); _smartAppend(rulesSource, ID_DELIM); } return rulesSource; } /** * Implement Transliterator framework */ void CompoundTransliterator::handleGetSourceSet(UnicodeSet& result) const { UnicodeSet set; result.clear(); for (int32_t i=0; igetSourceSet(set)); // Take the example of Hiragana-Latin. This is really // Hiragana-Katakana; Katakana-Latin. The source set of // these two is roughly [:Hiragana:] and [:Katakana:]. // But the source set for the entire transliterator is // actually [:Hiragana:] ONLY -- that is, the first // non-empty source set. // This is a heuristic, and not 100% reliable. if (!result.isEmpty()) { break; } } } /** * Override Transliterator framework */ UnicodeSet& CompoundTransliterator::getTargetSet(UnicodeSet& result) const { UnicodeSet set; result.clear(); for (int32_t i=0; igetTargetSet(set)); } return result; } /** * Implements {@link Transliterator#handleTransliterate}. 
*/ void CompoundTransliterator::handleTransliterate(Replaceable& text, UTransPosition& index, UBool incremental) const { /* Call each transliterator with the same contextStart and * start, but with the limit as modified * by preceding transliterators. The start index must be * reset for each transliterator to give each a chance to * transliterate the text. The initial contextStart index is known * to still point to the same place after each transliterator * is called because each transliterator will not change the * text between contextStart and the initial start index. * * IMPORTANT: After the first transliterator, each subsequent * transliterator only gets to transliterate text committed by * preceding transliterators; that is, the start (output * value) of transliterator i becomes the limit (input value) * of transliterator i+1. Finally, the overall limit is fixed * up before we return. * * Assumptions we make here: * (1) contextStart <= start <= limit <= contextLimit <= text.length() * (2) start <= start' <= limit' ;cursor doesn't move back * (3) start <= limit' ;text before cursor unchanged * - start' is the value of start after calling handleKT * - limit' is the value of limit after calling handleKT */ /** * Example: 3 transliterators. This example illustrates the * mechanics we need to implement. C, S, and L are the contextStart, * start, and limit. gl is the globalLimit. contextLimit is * equal to limit throughout. * * 1. h-u, changes hex to Unicode * * 4 7 a d 0 4 7 a * abc/u0061/u => abca/u * C S L C S L gl=f->a * * 2. upup, changes "x" to "XX" * * 4 7 a 4 7 a * abca/u => abcAA/u * C SL C S * L gl=a->b * 3. u-h, changes Unicode to hex * * 4 7 a 4 7 a d 0 3 * abcAA/u => abc/u0041/u0041/u * C S L C S * L gl=b->15 * 4. return * * 4 7 a d 0 3 * abc/u0041/u0041/u * C S L */ if (count < 1) { index.start = index.limit; return; // Short circuit for empty compound transliterators } // compoundLimit is the limit value for the entire compound // operation. 
We overwrite index.limit with the previous // index.start. After each transliteration, we update // compoundLimit for insertions or deletions that have happened. int32_t compoundLimit = index.limit; // compoundStart is the start for the entire compound // operation. int32_t compoundStart = index.start; int32_t delta = 0; // delta in length // Give each transliterator a crack at the run of characters. // See comments at the top of the method for more detail. for (int32_t i=0; ifilteredTransliterate(text, index, incremental); // In a properly written transliterator, start == limit after // handleTransliterate() returns when incremental is false. // Catch cases where the subclass doesn't do this, and throw // an exception. (Just pinning start to limit is a bad idea, // because what's probably happening is that the subclass // isn't transliterating all the way to the end, and it should // in non-incremental mode.) if (!incremental && index.start != index.limit) { // We can't throw an exception, so just fudge things index.start = index.limit; } // Cumulative delta for insertions/deletions delta += index.limit - limit; if (incremental) { // In the incremental case, only allow subsequent // transliterators to modify what has already been // completely processed by prior transliterators. In the // non-incrmental case, allow each transliterator to // process the entire text. index.limit = index.start; } } compoundLimit += delta; // Start is good where it is -- where the last transliterator left // it. Limit needs to be put back where it was, modulo // adjustments for deletions/insertions. index.limit = compoundLimit; } /** * Sets the length of the longest context required by this transliterator. * This is preceding context. 
*/ void CompoundTransliterator::computeMaximumContextLength() { int32_t max = 0; for (int32_t i=0; igetMaximumContextLength(); if (len > max) { max = len; } } setMaximumContextLength(max); } U_NAMESPACE_END #endif /* #if !UCONFIG_NO_TRANSLITERATION */ /* eof */ stringi/src/icu74/i18n/number_skeletons.cpp0000644000176200001440000017734514700200761020333 0ustar liggesusers// © 2018 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html #include "unicode/utypes.h" #if !UCONFIG_NO_FORMATTING // Allow implicit conversion from char16_t* to UnicodeString for this file: // Helpful in toString methods and elsewhere. #define UNISTR_FROM_STRING_EXPLICIT #include "number_decnum.h" #include "number_roundingutils.h" #include "number_skeletons.h" #include "umutex.h" #include "ucln_in.h" #include "patternprops.h" #include "unicode/ucharstriebuilder.h" #include "number_utils.h" #include "number_decimalquantity.h" #include "unicode/numberformatter.h" #include "uinvchar.h" #include "charstr.h" #include "string_segment.h" #include "unicode/errorcode.h" #include "util.h" #include "measunit_impl.h" using namespace icu; using namespace icu::number; using namespace icu::number::impl; using namespace icu::number::impl::skeleton; namespace { icu::UInitOnce gNumberSkeletonsInitOnce {}; char16_t* kSerializedStemTrie = nullptr; UBool U_CALLCONV cleanupNumberSkeletons() { uprv_free(kSerializedStemTrie); kSerializedStemTrie = nullptr; gNumberSkeletonsInitOnce.reset(); return true; } void U_CALLCONV initNumberSkeletons(UErrorCode& status) { ucln_i18n_registerCleanup(UCLN_I18N_NUMBER_SKELETONS, cleanupNumberSkeletons); UCharsTrieBuilder b(status); if (U_FAILURE(status)) { return; } // Section 1: b.add(u"compact-short", STEM_COMPACT_SHORT, status); b.add(u"compact-long", STEM_COMPACT_LONG, status); b.add(u"scientific", STEM_SCIENTIFIC, status); b.add(u"engineering", STEM_ENGINEERING, status); b.add(u"notation-simple", STEM_NOTATION_SIMPLE, status); 
b.add(u"base-unit", STEM_BASE_UNIT, status); b.add(u"percent", STEM_PERCENT, status); b.add(u"permille", STEM_PERMILLE, status); b.add(u"precision-integer", STEM_PRECISION_INTEGER, status); b.add(u"precision-unlimited", STEM_PRECISION_UNLIMITED, status); b.add(u"precision-currency-standard", STEM_PRECISION_CURRENCY_STANDARD, status); b.add(u"precision-currency-cash", STEM_PRECISION_CURRENCY_CASH, status); b.add(u"rounding-mode-ceiling", STEM_ROUNDING_MODE_CEILING, status); b.add(u"rounding-mode-floor", STEM_ROUNDING_MODE_FLOOR, status); b.add(u"rounding-mode-down", STEM_ROUNDING_MODE_DOWN, status); b.add(u"rounding-mode-up", STEM_ROUNDING_MODE_UP, status); b.add(u"rounding-mode-half-even", STEM_ROUNDING_MODE_HALF_EVEN, status); b.add(u"rounding-mode-half-odd", STEM_ROUNDING_MODE_HALF_ODD, status); b.add(u"rounding-mode-half-ceiling", STEM_ROUNDING_MODE_HALF_CEILING, status); b.add(u"rounding-mode-half-floor", STEM_ROUNDING_MODE_HALF_FLOOR, status); b.add(u"rounding-mode-half-down", STEM_ROUNDING_MODE_HALF_DOWN, status); b.add(u"rounding-mode-half-up", STEM_ROUNDING_MODE_HALF_UP, status); b.add(u"rounding-mode-unnecessary", STEM_ROUNDING_MODE_UNNECESSARY, status); b.add(u"integer-width-trunc", STEM_INTEGER_WIDTH_TRUNC, status); b.add(u"group-off", STEM_GROUP_OFF, status); b.add(u"group-min2", STEM_GROUP_MIN2, status); b.add(u"group-auto", STEM_GROUP_AUTO, status); b.add(u"group-on-aligned", STEM_GROUP_ON_ALIGNED, status); b.add(u"group-thousands", STEM_GROUP_THOUSANDS, status); b.add(u"latin", STEM_LATIN, status); b.add(u"unit-width-narrow", STEM_UNIT_WIDTH_NARROW, status); b.add(u"unit-width-short", STEM_UNIT_WIDTH_SHORT, status); b.add(u"unit-width-full-name", STEM_UNIT_WIDTH_FULL_NAME, status); b.add(u"unit-width-iso-code", STEM_UNIT_WIDTH_ISO_CODE, status); b.add(u"unit-width-formal", STEM_UNIT_WIDTH_FORMAL, status); b.add(u"unit-width-variant", STEM_UNIT_WIDTH_VARIANT, status); b.add(u"unit-width-hidden", STEM_UNIT_WIDTH_HIDDEN, status); b.add(u"sign-auto", 
STEM_SIGN_AUTO, status); b.add(u"sign-always", STEM_SIGN_ALWAYS, status); b.add(u"sign-never", STEM_SIGN_NEVER, status); b.add(u"sign-accounting", STEM_SIGN_ACCOUNTING, status); b.add(u"sign-accounting-always", STEM_SIGN_ACCOUNTING_ALWAYS, status); b.add(u"sign-except-zero", STEM_SIGN_EXCEPT_ZERO, status); b.add(u"sign-accounting-except-zero", STEM_SIGN_ACCOUNTING_EXCEPT_ZERO, status); b.add(u"sign-negative", STEM_SIGN_NEGATIVE, status); b.add(u"sign-accounting-negative", STEM_SIGN_ACCOUNTING_NEGATIVE, status); b.add(u"decimal-auto", STEM_DECIMAL_AUTO, status); b.add(u"decimal-always", STEM_DECIMAL_ALWAYS, status); if (U_FAILURE(status)) { return; } // Section 2: b.add(u"precision-increment", STEM_PRECISION_INCREMENT, status); b.add(u"measure-unit", STEM_MEASURE_UNIT, status); b.add(u"per-measure-unit", STEM_PER_MEASURE_UNIT, status); b.add(u"unit", STEM_UNIT, status); b.add(u"usage", STEM_UNIT_USAGE, status); b.add(u"currency", STEM_CURRENCY, status); b.add(u"integer-width", STEM_INTEGER_WIDTH, status); b.add(u"numbering-system", STEM_NUMBERING_SYSTEM, status); b.add(u"scale", STEM_SCALE, status); if (U_FAILURE(status)) { return; } // Section 3 (concise tokens): b.add(u"K", STEM_COMPACT_SHORT, status); b.add(u"KK", STEM_COMPACT_LONG, status); b.add(u"%", STEM_PERCENT, status); b.add(u"%x100", STEM_PERCENT_100, status); b.add(u",_", STEM_GROUP_OFF, status); b.add(u",?", STEM_GROUP_MIN2, status); b.add(u",!", STEM_GROUP_ON_ALIGNED, status); b.add(u"+!", STEM_SIGN_ALWAYS, status); b.add(u"+_", STEM_SIGN_NEVER, status); b.add(u"()", STEM_SIGN_ACCOUNTING, status); b.add(u"()!", STEM_SIGN_ACCOUNTING_ALWAYS, status); b.add(u"+?", STEM_SIGN_EXCEPT_ZERO, status); b.add(u"()?", STEM_SIGN_ACCOUNTING_EXCEPT_ZERO, status); b.add(u"+-", STEM_SIGN_NEGATIVE, status); b.add(u"()-", STEM_SIGN_ACCOUNTING_NEGATIVE, status); if (U_FAILURE(status)) { return; } // Build the CharsTrie // TODO: Use SLOW or FAST here? 
UnicodeString result; b.buildUnicodeString(USTRINGTRIE_BUILD_FAST, result, status); if (U_FAILURE(status)) { return; } // Copy the result into the global constant pointer size_t numBytes = result.length() * sizeof(char16_t); kSerializedStemTrie = static_cast(uprv_malloc(numBytes)); uprv_memcpy(kSerializedStemTrie, result.getBuffer(), numBytes); } inline void appendMultiple(UnicodeString& sb, UChar32 cp, int32_t count) { for (int i = 0; i < count; i++) { sb.append(cp); } } #define CHECK_NULL(seen, field, status) (void)(seen); /* for auto-format line wrapping */ \ UPRV_BLOCK_MACRO_BEGIN { \ if ((seen).field) { \ (status) = U_NUMBER_SKELETON_SYNTAX_ERROR; \ return STATE_NULL; \ } \ (seen).field = true; \ } UPRV_BLOCK_MACRO_END } // anonymous namespace Notation stem_to_object::notation(skeleton::StemEnum stem) { switch (stem) { case STEM_COMPACT_SHORT: return Notation::compactShort(); case STEM_COMPACT_LONG: return Notation::compactLong(); case STEM_SCIENTIFIC: return Notation::scientific(); case STEM_ENGINEERING: return Notation::engineering(); case STEM_NOTATION_SIMPLE: return Notation::simple(); default: UPRV_UNREACHABLE_EXIT; } } MeasureUnit stem_to_object::unit(skeleton::StemEnum stem) { switch (stem) { case STEM_BASE_UNIT: return MeasureUnit(); case STEM_PERCENT: return MeasureUnit::getPercent(); case STEM_PERMILLE: return MeasureUnit::getPermille(); default: UPRV_UNREACHABLE_EXIT; } } Precision stem_to_object::precision(skeleton::StemEnum stem) { switch (stem) { case STEM_PRECISION_INTEGER: return Precision::integer(); case STEM_PRECISION_UNLIMITED: return Precision::unlimited(); case STEM_PRECISION_CURRENCY_STANDARD: return Precision::currency(UCURR_USAGE_STANDARD); case STEM_PRECISION_CURRENCY_CASH: return Precision::currency(UCURR_USAGE_CASH); default: UPRV_UNREACHABLE_EXIT; } } UNumberFormatRoundingMode stem_to_object::roundingMode(skeleton::StemEnum stem) { switch (stem) { case STEM_ROUNDING_MODE_CEILING: return UNUM_ROUND_CEILING; case 
STEM_ROUNDING_MODE_FLOOR: return UNUM_ROUND_FLOOR; case STEM_ROUNDING_MODE_DOWN: return UNUM_ROUND_DOWN; case STEM_ROUNDING_MODE_UP: return UNUM_ROUND_UP; case STEM_ROUNDING_MODE_HALF_EVEN: return UNUM_ROUND_HALFEVEN; case STEM_ROUNDING_MODE_HALF_ODD: return UNUM_ROUND_HALF_ODD; case STEM_ROUNDING_MODE_HALF_CEILING: return UNUM_ROUND_HALF_CEILING; case STEM_ROUNDING_MODE_HALF_FLOOR: return UNUM_ROUND_HALF_FLOOR; case STEM_ROUNDING_MODE_HALF_DOWN: return UNUM_ROUND_HALFDOWN; case STEM_ROUNDING_MODE_HALF_UP: return UNUM_ROUND_HALFUP; case STEM_ROUNDING_MODE_UNNECESSARY: return UNUM_ROUND_UNNECESSARY; default: UPRV_UNREACHABLE_EXIT; } } UNumberGroupingStrategy stem_to_object::groupingStrategy(skeleton::StemEnum stem) { switch (stem) { case STEM_GROUP_OFF: return UNUM_GROUPING_OFF; case STEM_GROUP_MIN2: return UNUM_GROUPING_MIN2; case STEM_GROUP_AUTO: return UNUM_GROUPING_AUTO; case STEM_GROUP_ON_ALIGNED: return UNUM_GROUPING_ON_ALIGNED; case STEM_GROUP_THOUSANDS: return UNUM_GROUPING_THOUSANDS; default: return UNUM_GROUPING_COUNT; // for objects, throw; for enums, return COUNT } } UNumberUnitWidth stem_to_object::unitWidth(skeleton::StemEnum stem) { switch (stem) { case STEM_UNIT_WIDTH_NARROW: return UNUM_UNIT_WIDTH_NARROW; case STEM_UNIT_WIDTH_SHORT: return UNUM_UNIT_WIDTH_SHORT; case STEM_UNIT_WIDTH_FULL_NAME: return UNUM_UNIT_WIDTH_FULL_NAME; case STEM_UNIT_WIDTH_ISO_CODE: return UNUM_UNIT_WIDTH_ISO_CODE; case STEM_UNIT_WIDTH_FORMAL: return UNUM_UNIT_WIDTH_FORMAL; case STEM_UNIT_WIDTH_VARIANT: return UNUM_UNIT_WIDTH_VARIANT; case STEM_UNIT_WIDTH_HIDDEN: return UNUM_UNIT_WIDTH_HIDDEN; default: return UNUM_UNIT_WIDTH_COUNT; // for objects, throw; for enums, return COUNT } } UNumberSignDisplay stem_to_object::signDisplay(skeleton::StemEnum stem) { switch (stem) { case STEM_SIGN_AUTO: return UNUM_SIGN_AUTO; case STEM_SIGN_ALWAYS: return UNUM_SIGN_ALWAYS; case STEM_SIGN_NEVER: return UNUM_SIGN_NEVER; case STEM_SIGN_ACCOUNTING: return UNUM_SIGN_ACCOUNTING; case 
STEM_SIGN_ACCOUNTING_ALWAYS: return UNUM_SIGN_ACCOUNTING_ALWAYS; case STEM_SIGN_EXCEPT_ZERO: return UNUM_SIGN_EXCEPT_ZERO; case STEM_SIGN_ACCOUNTING_EXCEPT_ZERO: return UNUM_SIGN_ACCOUNTING_EXCEPT_ZERO; case STEM_SIGN_NEGATIVE: return UNUM_SIGN_NEGATIVE; case STEM_SIGN_ACCOUNTING_NEGATIVE: return UNUM_SIGN_ACCOUNTING_NEGATIVE; default: return UNUM_SIGN_COUNT; // for objects, throw; for enums, return COUNT } } UNumberDecimalSeparatorDisplay stem_to_object::decimalSeparatorDisplay(skeleton::StemEnum stem) { switch (stem) { case STEM_DECIMAL_AUTO: return UNUM_DECIMAL_SEPARATOR_AUTO; case STEM_DECIMAL_ALWAYS: return UNUM_DECIMAL_SEPARATOR_ALWAYS; default: return UNUM_DECIMAL_SEPARATOR_COUNT; // for objects, throw; for enums, return COUNT } } void enum_to_stem_string::roundingMode(UNumberFormatRoundingMode value, UnicodeString& sb) { switch (value) { case UNUM_ROUND_CEILING: sb.append(u"rounding-mode-ceiling", -1); break; case UNUM_ROUND_FLOOR: sb.append(u"rounding-mode-floor", -1); break; case UNUM_ROUND_DOWN: sb.append(u"rounding-mode-down", -1); break; case UNUM_ROUND_UP: sb.append(u"rounding-mode-up", -1); break; case UNUM_ROUND_HALFEVEN: sb.append(u"rounding-mode-half-even", -1); break; case UNUM_ROUND_HALF_ODD: sb.append(u"rounding-mode-half-odd", -1); break; case UNUM_ROUND_HALF_CEILING: sb.append(u"rounding-mode-half-ceiling", -1); break; case UNUM_ROUND_HALF_FLOOR: sb.append(u"rounding-mode-half-floor", -1); break; case UNUM_ROUND_HALFDOWN: sb.append(u"rounding-mode-half-down", -1); break; case UNUM_ROUND_HALFUP: sb.append(u"rounding-mode-half-up", -1); break; case UNUM_ROUND_UNNECESSARY: sb.append(u"rounding-mode-unnecessary", -1); break; default: UPRV_UNREACHABLE_EXIT; } } void enum_to_stem_string::groupingStrategy(UNumberGroupingStrategy value, UnicodeString& sb) { switch (value) { case UNUM_GROUPING_OFF: sb.append(u"group-off", -1); break; case UNUM_GROUPING_MIN2: sb.append(u"group-min2", -1); break; case UNUM_GROUPING_AUTO: sb.append(u"group-auto", -1); 
break; case UNUM_GROUPING_ON_ALIGNED: sb.append(u"group-on-aligned", -1); break; case UNUM_GROUPING_THOUSANDS: sb.append(u"group-thousands", -1); break; default: UPRV_UNREACHABLE_EXIT; } } void enum_to_stem_string::unitWidth(UNumberUnitWidth value, UnicodeString& sb) { switch (value) { case UNUM_UNIT_WIDTH_NARROW: sb.append(u"unit-width-narrow", -1); break; case UNUM_UNIT_WIDTH_SHORT: sb.append(u"unit-width-short", -1); break; case UNUM_UNIT_WIDTH_FULL_NAME: sb.append(u"unit-width-full-name", -1); break; case UNUM_UNIT_WIDTH_ISO_CODE: sb.append(u"unit-width-iso-code", -1); break; case UNUM_UNIT_WIDTH_FORMAL: sb.append(u"unit-width-formal", -1); break; case UNUM_UNIT_WIDTH_VARIANT: sb.append(u"unit-width-variant", -1); break; case UNUM_UNIT_WIDTH_HIDDEN: sb.append(u"unit-width-hidden", -1); break; default: UPRV_UNREACHABLE_EXIT; } } void enum_to_stem_string::signDisplay(UNumberSignDisplay value, UnicodeString& sb) { switch (value) { case UNUM_SIGN_AUTO: sb.append(u"sign-auto", -1); break; case UNUM_SIGN_ALWAYS: sb.append(u"sign-always", -1); break; case UNUM_SIGN_NEVER: sb.append(u"sign-never", -1); break; case UNUM_SIGN_ACCOUNTING: sb.append(u"sign-accounting", -1); break; case UNUM_SIGN_ACCOUNTING_ALWAYS: sb.append(u"sign-accounting-always", -1); break; case UNUM_SIGN_EXCEPT_ZERO: sb.append(u"sign-except-zero", -1); break; case UNUM_SIGN_ACCOUNTING_EXCEPT_ZERO: sb.append(u"sign-accounting-except-zero", -1); break; case UNUM_SIGN_NEGATIVE: sb.append(u"sign-negative", -1); break; case UNUM_SIGN_ACCOUNTING_NEGATIVE: sb.append(u"sign-accounting-negative", -1); break; default: UPRV_UNREACHABLE_EXIT; } } void enum_to_stem_string::decimalSeparatorDisplay(UNumberDecimalSeparatorDisplay value, UnicodeString& sb) { switch (value) { case UNUM_DECIMAL_SEPARATOR_AUTO: sb.append(u"decimal-auto", -1); break; case UNUM_DECIMAL_SEPARATOR_ALWAYS: sb.append(u"decimal-always", -1); break; default: UPRV_UNREACHABLE_EXIT; } } UnlocalizedNumberFormatter skeleton::create( const 
UnicodeString& skeletonString, UParseError* perror, UErrorCode& status) { // Initialize perror if (perror != nullptr) { perror->line = 0; perror->offset = -1; perror->preContext[0] = 0; perror->postContext[0] = 0; } umtx_initOnce(gNumberSkeletonsInitOnce, &initNumberSkeletons, status); if (U_FAILURE(status)) { return {}; } int32_t errOffset; MacroProps macros = parseSkeleton(skeletonString, errOffset, status); if (U_SUCCESS(status)) { return NumberFormatter::with().macros(macros); } if (perror == nullptr) { return {}; } // Populate the UParseError with the error location perror->offset = errOffset; int32_t contextStart = uprv_max(0, errOffset - U_PARSE_CONTEXT_LEN + 1); int32_t contextEnd = uprv_min(skeletonString.length(), errOffset + U_PARSE_CONTEXT_LEN - 1); skeletonString.extract(contextStart, errOffset - contextStart, perror->preContext, 0); perror->preContext[errOffset - contextStart] = 0; skeletonString.extract(errOffset, contextEnd - errOffset, perror->postContext, 0); perror->postContext[contextEnd - errOffset] = 0; return {}; } UnicodeString skeleton::generate(const MacroProps& macros, UErrorCode& status) { umtx_initOnce(gNumberSkeletonsInitOnce, &initNumberSkeletons, status); UnicodeString sb; GeneratorHelpers::generateSkeleton(macros, sb, status); return sb; } MacroProps skeleton::parseSkeleton( const UnicodeString& skeletonString, int32_t& errOffset, UErrorCode& status) { U_ASSERT(U_SUCCESS(status)); U_ASSERT(kSerializedStemTrie != nullptr); // Add a trailing whitespace to the end of the skeleton string to make code cleaner. 
UnicodeString tempSkeletonString(skeletonString); tempSkeletonString.append(u' '); SeenMacroProps seen; MacroProps macros; StringSegment segment(tempSkeletonString, false); UCharsTrie stemTrie(kSerializedStemTrie); ParseState stem = STATE_NULL; int32_t offset = 0; // Primary skeleton parse loop: while (offset < segment.length()) { UChar32 cp = segment.codePointAt(offset); bool isTokenSeparator = PatternProps::isWhiteSpace(cp); bool isOptionSeparator = (cp == u'/'); if (!isTokenSeparator && !isOptionSeparator) { // Non-separator token; consume it. offset += U16_LENGTH(cp); if (stem == STATE_NULL) { // We are currently consuming a stem. // Go to the next state in the stem trie. stemTrie.nextForCodePoint(cp); } continue; } // We are looking at a token or option separator. // If the segment is nonempty, parse it and reset the segment. // Otherwise, make sure it is a valid repeating separator. if (offset != 0) { segment.setLength(offset); if (stem == STATE_NULL) { // The first separator after the start of a token. Parse it as a stem. stem = parseStem(segment, stemTrie, seen, macros, status); stemTrie.reset(); } else { // A separator after the first separator of a token. Parse it as an option. stem = parseOption(stem, segment, macros, status); } segment.resetLength(); if (U_FAILURE(status)) { errOffset = segment.getOffset(); return macros; } // Consume the segment: segment.adjustOffset(offset); offset = 0; } else if (stem != STATE_NULL) { // A separator ('/' or whitespace) following an option separator ('/') // segment.setLength(U16_LENGTH(cp)); // for error message // throw new SkeletonSyntaxException("Unexpected separator character", segment); status = U_NUMBER_SKELETON_SYNTAX_ERROR; errOffset = segment.getOffset(); return macros; } else { // Two spaces in a row; this is OK. } // Does the current stem forbid options? 
if (isOptionSeparator && stem == STATE_NULL) { // segment.setLength(U16_LENGTH(cp)); // for error message // throw new SkeletonSyntaxException("Unexpected option separator", segment); status = U_NUMBER_SKELETON_SYNTAX_ERROR; errOffset = segment.getOffset(); return macros; } // Does the current stem require an option? if (isTokenSeparator && stem != STATE_NULL) { switch (stem) { case STATE_INCREMENT_PRECISION: case STATE_MEASURE_UNIT: case STATE_PER_MEASURE_UNIT: case STATE_IDENTIFIER_UNIT: case STATE_UNIT_USAGE: case STATE_CURRENCY_UNIT: case STATE_INTEGER_WIDTH: case STATE_NUMBERING_SYSTEM: case STATE_SCALE: // segment.setLength(U16_LENGTH(cp)); // for error message // throw new SkeletonSyntaxException("Stem requires an option", segment); status = U_NUMBER_SKELETON_SYNTAX_ERROR; errOffset = segment.getOffset(); return macros; default: break; } stem = STATE_NULL; } // Consume the separator: segment.adjustOffset(U16_LENGTH(cp)); } U_ASSERT(stem == STATE_NULL); return macros; } ParseState skeleton::parseStem(const StringSegment& segment, const UCharsTrie& stemTrie, SeenMacroProps& seen, MacroProps& macros, UErrorCode& status) { U_ASSERT(U_SUCCESS(status)); // First check for "blueprint" stems, which start with a "signal char" switch (segment.charAt(0)) { case u'.': CHECK_NULL(seen, precision, status); blueprint_helpers::parseFractionStem(segment, macros, status); return STATE_FRACTION_PRECISION; case u'@': CHECK_NULL(seen, precision, status); blueprint_helpers::parseDigitsStem(segment, macros, status); return STATE_PRECISION; case u'E': CHECK_NULL(seen, notation, status); blueprint_helpers::parseScientificStem(segment, macros, status); return STATE_NULL; case u'0': CHECK_NULL(seen, integerWidth, status); blueprint_helpers::parseIntegerStem(segment, macros, status); return STATE_NULL; default: break; } // Now look at the stemsTrie, which is already be pointing at our stem. 
UStringTrieResult stemResult = stemTrie.current(); if (stemResult != USTRINGTRIE_INTERMEDIATE_VALUE && stemResult != USTRINGTRIE_FINAL_VALUE) { // throw new SkeletonSyntaxException("Unknown stem", segment); status = U_NUMBER_SKELETON_SYNTAX_ERROR; return STATE_NULL; } auto stem = static_cast(stemTrie.getValue()); switch (stem) { // Stems with meaning on their own, not requiring an option: case STEM_COMPACT_SHORT: case STEM_COMPACT_LONG: case STEM_SCIENTIFIC: case STEM_ENGINEERING: case STEM_NOTATION_SIMPLE: CHECK_NULL(seen, notation, status); macros.notation = stem_to_object::notation(stem); switch (stem) { case STEM_SCIENTIFIC: case STEM_ENGINEERING: return STATE_SCIENTIFIC; // allows for scientific options default: return STATE_NULL; } case STEM_BASE_UNIT: case STEM_PERCENT: case STEM_PERMILLE: CHECK_NULL(seen, unit, status); macros.unit = stem_to_object::unit(stem); return STATE_NULL; case STEM_PERCENT_100: CHECK_NULL(seen, scale, status); CHECK_NULL(seen, unit, status); macros.scale = Scale::powerOfTen(2); macros.unit = NoUnit::percent(); return STATE_NULL; case STEM_PRECISION_INTEGER: case STEM_PRECISION_UNLIMITED: case STEM_PRECISION_CURRENCY_STANDARD: case STEM_PRECISION_CURRENCY_CASH: CHECK_NULL(seen, precision, status); macros.precision = stem_to_object::precision(stem); switch (stem) { case STEM_PRECISION_INTEGER: return STATE_FRACTION_PRECISION; // allows for "precision-integer/@##" default: return STATE_PRECISION; } case STEM_ROUNDING_MODE_CEILING: case STEM_ROUNDING_MODE_FLOOR: case STEM_ROUNDING_MODE_DOWN: case STEM_ROUNDING_MODE_UP: case STEM_ROUNDING_MODE_HALF_EVEN: case STEM_ROUNDING_MODE_HALF_ODD: case STEM_ROUNDING_MODE_HALF_CEILING: case STEM_ROUNDING_MODE_HALF_FLOOR: case STEM_ROUNDING_MODE_HALF_DOWN: case STEM_ROUNDING_MODE_HALF_UP: case STEM_ROUNDING_MODE_UNNECESSARY: CHECK_NULL(seen, roundingMode, status); macros.roundingMode = stem_to_object::roundingMode(stem); return STATE_NULL; case STEM_INTEGER_WIDTH_TRUNC: CHECK_NULL(seen, 
integerWidth, status); macros.integerWidth = IntegerWidth::zeroFillTo(0).truncateAt(0); return STATE_NULL; case STEM_GROUP_OFF: case STEM_GROUP_MIN2: case STEM_GROUP_AUTO: case STEM_GROUP_ON_ALIGNED: case STEM_GROUP_THOUSANDS: CHECK_NULL(seen, grouper, status); macros.grouper = Grouper::forStrategy(stem_to_object::groupingStrategy(stem)); return STATE_NULL; case STEM_LATIN: CHECK_NULL(seen, symbols, status); macros.symbols.setTo(NumberingSystem::createInstanceByName("latn", status)); return STATE_NULL; case STEM_UNIT_WIDTH_NARROW: case STEM_UNIT_WIDTH_SHORT: case STEM_UNIT_WIDTH_FULL_NAME: case STEM_UNIT_WIDTH_ISO_CODE: case STEM_UNIT_WIDTH_FORMAL: case STEM_UNIT_WIDTH_VARIANT: case STEM_UNIT_WIDTH_HIDDEN: CHECK_NULL(seen, unitWidth, status); macros.unitWidth = stem_to_object::unitWidth(stem); return STATE_NULL; case STEM_SIGN_AUTO: case STEM_SIGN_ALWAYS: case STEM_SIGN_NEVER: case STEM_SIGN_ACCOUNTING: case STEM_SIGN_ACCOUNTING_ALWAYS: case STEM_SIGN_EXCEPT_ZERO: case STEM_SIGN_ACCOUNTING_EXCEPT_ZERO: case STEM_SIGN_NEGATIVE: case STEM_SIGN_ACCOUNTING_NEGATIVE: CHECK_NULL(seen, sign, status); macros.sign = stem_to_object::signDisplay(stem); return STATE_NULL; case STEM_DECIMAL_AUTO: case STEM_DECIMAL_ALWAYS: CHECK_NULL(seen, decimal, status); macros.decimal = stem_to_object::decimalSeparatorDisplay(stem); return STATE_NULL; // Stems requiring an option: case STEM_PRECISION_INCREMENT: CHECK_NULL(seen, precision, status); return STATE_INCREMENT_PRECISION; case STEM_MEASURE_UNIT: CHECK_NULL(seen, unit, status); return STATE_MEASURE_UNIT; case STEM_PER_MEASURE_UNIT: CHECK_NULL(seen, perUnit, status); return STATE_PER_MEASURE_UNIT; case STEM_UNIT: CHECK_NULL(seen, unit, status); CHECK_NULL(seen, perUnit, status); return STATE_IDENTIFIER_UNIT; case STEM_UNIT_USAGE: CHECK_NULL(seen, usage, status); return STATE_UNIT_USAGE; case STEM_CURRENCY: CHECK_NULL(seen, unit, status); CHECK_NULL(seen, perUnit, status); return STATE_CURRENCY_UNIT; case STEM_INTEGER_WIDTH: 
CHECK_NULL(seen, integerWidth, status); return STATE_INTEGER_WIDTH; case STEM_NUMBERING_SYSTEM: CHECK_NULL(seen, symbols, status); return STATE_NUMBERING_SYSTEM; case STEM_SCALE: CHECK_NULL(seen, scale, status); return STATE_SCALE; default: UPRV_UNREACHABLE_EXIT; } } ParseState skeleton::parseOption(ParseState stem, const StringSegment& segment, MacroProps& macros, UErrorCode& status) { U_ASSERT(U_SUCCESS(status)); ///// Required options: ///// switch (stem) { case STATE_CURRENCY_UNIT: blueprint_helpers::parseCurrencyOption(segment, macros, status); return STATE_NULL; case STATE_MEASURE_UNIT: blueprint_helpers::parseMeasureUnitOption(segment, macros, status); return STATE_NULL; case STATE_PER_MEASURE_UNIT: blueprint_helpers::parseMeasurePerUnitOption(segment, macros, status); return STATE_NULL; case STATE_IDENTIFIER_UNIT: blueprint_helpers::parseIdentifierUnitOption(segment, macros, status); return STATE_NULL; case STATE_UNIT_USAGE: blueprint_helpers::parseUnitUsageOption(segment, macros, status); return STATE_NULL; case STATE_INCREMENT_PRECISION: blueprint_helpers::parseIncrementOption(segment, macros, status); return STATE_PRECISION; case STATE_INTEGER_WIDTH: blueprint_helpers::parseIntegerWidthOption(segment, macros, status); return STATE_NULL; case STATE_NUMBERING_SYSTEM: blueprint_helpers::parseNumberingSystemOption(segment, macros, status); return STATE_NULL; case STATE_SCALE: blueprint_helpers::parseScaleOption(segment, macros, status); return STATE_NULL; default: break; } ///// Non-required options: ///// // Scientific options switch (stem) { case STATE_SCIENTIFIC: if (blueprint_helpers::parseExponentWidthOption(segment, macros, status)) { return STATE_SCIENTIFIC; } if (U_FAILURE(status)) { return {}; } if (blueprint_helpers::parseExponentSignOption(segment, macros, status)) { return STATE_SCIENTIFIC; } if (U_FAILURE(status)) { return {}; } break; default: break; } // Frac-sig option switch (stem) { case STATE_FRACTION_PRECISION: if 
(blueprint_helpers::parseFracSigOption(segment, macros, status)) { return STATE_PRECISION; } if (U_FAILURE(status)) { return {}; } // If the fracSig option was not found, try normal precision options. stem = STATE_PRECISION; break; default: break; } // Trailing zeros option switch (stem) { case STATE_PRECISION: if (blueprint_helpers::parseTrailingZeroOption(segment, macros, status)) { return STATE_NULL; } if (U_FAILURE(status)) { return {}; } break; default: break; } // Unknown option // throw new SkeletonSyntaxException("Invalid option", segment); status = U_NUMBER_SKELETON_SYNTAX_ERROR; return STATE_NULL; } void GeneratorHelpers::generateSkeleton(const MacroProps& macros, UnicodeString& sb, UErrorCode& status) { if (U_FAILURE(status)) { return; } // Supported options if (GeneratorHelpers::notation(macros, sb, status)) { sb.append(u' '); } if (U_FAILURE(status)) { return; } if (GeneratorHelpers::unit(macros, sb, status)) { sb.append(u' '); } if (U_FAILURE(status)) { return; } if (GeneratorHelpers::usage(macros, sb, status)) { sb.append(u' '); } if (U_FAILURE(status)) { return; } if (GeneratorHelpers::precision(macros, sb, status)) { sb.append(u' '); } if (U_FAILURE(status)) { return; } if (GeneratorHelpers::roundingMode(macros, sb, status)) { sb.append(u' '); } if (U_FAILURE(status)) { return; } if (GeneratorHelpers::grouping(macros, sb, status)) { sb.append(u' '); } if (U_FAILURE(status)) { return; } if (GeneratorHelpers::integerWidth(macros, sb, status)) { sb.append(u' '); } if (U_FAILURE(status)) { return; } if (GeneratorHelpers::symbols(macros, sb, status)) { sb.append(u' '); } if (U_FAILURE(status)) { return; } if (GeneratorHelpers::unitWidth(macros, sb, status)) { sb.append(u' '); } if (U_FAILURE(status)) { return; } if (GeneratorHelpers::sign(macros, sb, status)) { sb.append(u' '); } if (U_FAILURE(status)) { return; } if (GeneratorHelpers::decimal(macros, sb, status)) { sb.append(u' '); } if (U_FAILURE(status)) { return; } if 
(GeneratorHelpers::scale(macros, sb, status)) { sb.append(u' '); } if (U_FAILURE(status)) { return; } // Unsupported options if (!macros.padder.isBogus()) { status = U_UNSUPPORTED_ERROR; return; } if (macros.unitDisplayCase.isSet()) { status = U_UNSUPPORTED_ERROR; return; } if (macros.affixProvider != nullptr) { status = U_UNSUPPORTED_ERROR; return; } if (macros.rules != nullptr) { status = U_UNSUPPORTED_ERROR; return; } // Remove the trailing space if (sb.length() > 0) { sb.truncate(sb.length() - 1); } } bool blueprint_helpers::parseExponentWidthOption(const StringSegment& segment, MacroProps& macros, UErrorCode&) { if (!isWildcardChar(segment.charAt(0))) { return false; } int32_t offset = 1; int32_t minExp = 0; for (; offset < segment.length(); offset++) { if (segment.charAt(offset) == u'e') { minExp++; } else { break; } } if (offset < segment.length()) { return false; } // Use the public APIs to enforce bounds checking macros.notation = static_cast(macros.notation).withMinExponentDigits(minExp); return true; } void blueprint_helpers::generateExponentWidthOption(int32_t minExponentDigits, UnicodeString& sb, UErrorCode&) { sb.append(kWildcardChar); appendMultiple(sb, u'e', minExponentDigits); } bool blueprint_helpers::parseExponentSignOption(const StringSegment& segment, MacroProps& macros, UErrorCode&) { // Get the sign display type out of the CharsTrie data structure. 
UCharsTrie tempStemTrie(kSerializedStemTrie); UStringTrieResult result = tempStemTrie.next( segment.toTempUnicodeString().getBuffer(), segment.length()); if (result != USTRINGTRIE_INTERMEDIATE_VALUE && result != USTRINGTRIE_FINAL_VALUE) { return false; } auto sign = stem_to_object::signDisplay(static_cast(tempStemTrie.getValue())); if (sign == UNUM_SIGN_COUNT) { return false; } macros.notation = static_cast(macros.notation).withExponentSignDisplay(sign); return true; } void blueprint_helpers::parseCurrencyOption(const StringSegment& segment, MacroProps& macros, UErrorCode& status) { // Unlike ICU4J, have to check length manually because ICU4C CurrencyUnit does not check it for us if (segment.length() != 3) { status = U_NUMBER_SKELETON_SYNTAX_ERROR; return; } //const char16_t* currencyCode = segment.toTempUnicodeString().getBuffer(); UnicodeString tmp = segment.toTempUnicodeString(); const UChar* currencyCode = tmp.getBuffer(); // Marek's patch UErrorCode localStatus = U_ZERO_ERROR; CurrencyUnit currency(currencyCode, localStatus); if (U_FAILURE(localStatus)) { // Not 3 ascii chars // throw new SkeletonSyntaxException("Invalid currency", segment); status = U_NUMBER_SKELETON_SYNTAX_ERROR; return; } // Slicing is OK macros.unit = currency; // NOLINT } void blueprint_helpers::generateCurrencyOption(const CurrencyUnit& currency, UnicodeString& sb, UErrorCode&) { sb.append(currency.getISOCurrency(), -1); } void blueprint_helpers::parseMeasureUnitOption(const StringSegment& segment, MacroProps& macros, UErrorCode& status) { U_ASSERT(U_SUCCESS(status)); const UnicodeString stemString = segment.toTempUnicodeString(); // NOTE: The category (type) of the unit is guaranteed to be a valid subtag (alphanumeric) // http://unicode.org/reports/tr35/#Validity_Data int firstHyphen = 0; while (firstHyphen < stemString.length() && stemString.charAt(firstHyphen) != '-') { firstHyphen++; } if (firstHyphen == stemString.length()) { // throw new SkeletonSyntaxException("Invalid measure 
unit option", segment); status = U_NUMBER_SKELETON_SYNTAX_ERROR; return; } // Need to do char <-> char16_t conversion... CharString type; SKELETON_UCHAR_TO_CHAR(type, stemString, 0, firstHyphen, status); CharString subType; SKELETON_UCHAR_TO_CHAR(subType, stemString, firstHyphen + 1, stemString.length(), status); // Note: the largest type as of this writing (Aug 2020) is "volume", which has 33 units. static constexpr int32_t CAPACITY = 40; MeasureUnit units[CAPACITY]; UErrorCode localStatus = U_ZERO_ERROR; int32_t numUnits = MeasureUnit::getAvailable(type.data(), units, CAPACITY, localStatus); if (U_FAILURE(localStatus)) { // More than 30 units in this type? status = U_INTERNAL_PROGRAM_ERROR; return; } for (int32_t i = 0; i < numUnits; i++) { auto& unit = units[i]; if (uprv_strcmp(subType.data(), unit.getSubtype()) == 0) { macros.unit = unit; return; } } // throw new SkeletonSyntaxException("Unknown measure unit", segment); status = U_NUMBER_SKELETON_SYNTAX_ERROR; } void blueprint_helpers::parseMeasurePerUnitOption(const StringSegment& segment, MacroProps& macros, UErrorCode& status) { // A little bit of a hack: save the current unit (numerator), call the main measure unit // parsing code, put back the numerator unit, and put the new unit into per-unit. MeasureUnit numerator = macros.unit; parseMeasureUnitOption(segment, macros, status); if (U_FAILURE(status)) { return; } macros.perUnit = macros.unit; macros.unit = numerator; } void blueprint_helpers::parseIdentifierUnitOption(const StringSegment& segment, MacroProps& macros, UErrorCode& status) { // Need to do char <-> char16_t conversion... 
U_ASSERT(U_SUCCESS(status)); CharString buffer; SKELETON_UCHAR_TO_CHAR(buffer, segment.toTempUnicodeString(), 0, segment.length(), status); ErrorCode internalStatus; macros.unit = MeasureUnit::forIdentifier(buffer.toStringPiece(), internalStatus); if (internalStatus.isFailure()) { // throw new SkeletonSyntaxException("Invalid core unit identifier", segment, e); status = U_NUMBER_SKELETON_SYNTAX_ERROR; return; } } void blueprint_helpers::parseUnitUsageOption(const StringSegment &segment, MacroProps ¯os, UErrorCode &status) { // Need to do char <-> char16_t conversion... U_ASSERT(U_SUCCESS(status)); CharString buffer; SKELETON_UCHAR_TO_CHAR(buffer, segment.toTempUnicodeString(), 0, segment.length(), status); macros.usage.set(buffer.toStringPiece()); // We do not do any validation of the usage string: it depends on the // unitPreferenceData in the units resources. } void blueprint_helpers::parseFractionStem(const StringSegment& segment, MacroProps& macros, UErrorCode& status) { U_ASSERT(segment.charAt(0) == u'.'); int32_t offset = 1; int32_t minFrac = 0; int32_t maxFrac; for (; offset < segment.length(); offset++) { if (segment.charAt(offset) == u'0') { minFrac++; } else { break; } } if (offset < segment.length()) { if (isWildcardChar(segment.charAt(offset))) { maxFrac = -1; offset++; } else { maxFrac = minFrac; for (; offset < segment.length(); offset++) { if (segment.charAt(offset) == u'#') { maxFrac++; } else { break; } } } } else { maxFrac = minFrac; } if (offset < segment.length()) { // throw new SkeletonSyntaxException("Invalid fraction stem", segment); status = U_NUMBER_SKELETON_SYNTAX_ERROR; return; } // Use the public APIs to enforce bounds checking if (maxFrac == -1) { if (minFrac == 0) { macros.precision = Precision::unlimited(); } else { macros.precision = Precision::minFraction(minFrac); } } else { macros.precision = Precision::minMaxFraction(minFrac, maxFrac); } } void blueprint_helpers::generateFractionStem(int32_t minFrac, int32_t maxFrac, 
UnicodeString& sb, UErrorCode&) { if (minFrac == 0 && maxFrac == 0) { sb.append(u"precision-integer", -1); return; } sb.append(u'.'); appendMultiple(sb, u'0', minFrac); if (maxFrac == -1) { sb.append(kWildcardChar); } else { appendMultiple(sb, u'#', maxFrac - minFrac); } } void blueprint_helpers::parseDigitsStem(const StringSegment& segment, MacroProps& macros, UErrorCode& status) { U_ASSERT(segment.charAt(0) == u'@'); int32_t offset = 0; int32_t minSig = 0; int32_t maxSig; for (; offset < segment.length(); offset++) { if (segment.charAt(offset) == u'@') { minSig++; } else { break; } } if (offset < segment.length()) { if (isWildcardChar(segment.charAt(offset))) { maxSig = -1; offset++; } else { maxSig = minSig; for (; offset < segment.length(); offset++) { if (segment.charAt(offset) == u'#') { maxSig++; } else { break; } } } } else { maxSig = minSig; } if (offset < segment.length()) { // throw new SkeletonSyntaxException("Invalid significant digits stem", segment); status = U_NUMBER_SKELETON_SYNTAX_ERROR; return; } // Use the public APIs to enforce bounds checking if (maxSig == -1) { macros.precision = Precision::minSignificantDigits(minSig); } else { macros.precision = Precision::minMaxSignificantDigits(minSig, maxSig); } } void blueprint_helpers::generateDigitsStem(int32_t minSig, int32_t maxSig, UnicodeString& sb, UErrorCode&) { appendMultiple(sb, u'@', minSig); if (maxSig == -1) { sb.append(kWildcardChar); } else { appendMultiple(sb, u'#', maxSig - minSig); } } void blueprint_helpers::parseScientificStem(const StringSegment& segment, MacroProps& macros, UErrorCode& status) { U_ASSERT(segment.charAt(0) == u'E'); { int32_t offset = 1; if (segment.length() == offset) { goto fail; } bool isEngineering = false; if (segment.charAt(offset) == u'E') { isEngineering = true; offset++; if (segment.length() == offset) { goto fail; } } UNumberSignDisplay signDisplay = UNUM_SIGN_AUTO; if (segment.charAt(offset) == u'+') { offset++; if (segment.length() == offset) { goto 
fail; } if (segment.charAt(offset) == u'!') { signDisplay = UNUM_SIGN_ALWAYS; } else if (segment.charAt(offset) == u'?') { signDisplay = UNUM_SIGN_EXCEPT_ZERO; } else { // NOTE: Other sign displays are not included because they aren't useful in this context goto fail; } offset++; if (segment.length() == offset) { goto fail; } } int32_t minDigits = 0; for (; offset < segment.length(); offset++) { if (segment.charAt(offset) != u'0') { goto fail; } minDigits++; } macros.notation = (isEngineering ? Notation::engineering() : Notation::scientific()) .withExponentSignDisplay(signDisplay) .withMinExponentDigits(minDigits); return; } fail: void(); // throw new SkeletonSyntaxException("Invalid scientific stem", segment); status = U_NUMBER_SKELETON_SYNTAX_ERROR; return; } void blueprint_helpers::parseIntegerStem(const StringSegment& segment, MacroProps& macros, UErrorCode& status) { U_ASSERT(segment.charAt(0) == u'0'); int32_t offset = 1; for (; offset < segment.length(); offset++) { if (segment.charAt(offset) != u'0') { offset--; break; } } if (offset < segment.length()) { // throw new SkeletonSyntaxException("Invalid integer stem", segment); status = U_NUMBER_SKELETON_SYNTAX_ERROR; return; } macros.integerWidth = IntegerWidth::zeroFillTo(offset); return; } bool blueprint_helpers::parseFracSigOption(const StringSegment& segment, MacroProps& macros, UErrorCode& status) { if (segment.charAt(0) != u'@') { return false; } int offset = 0; int minSig = 0; int maxSig; for (; offset < segment.length(); offset++) { if (segment.charAt(offset) == u'@') { minSig++; } else { break; } } if (offset < segment.length()) { if (isWildcardChar(segment.charAt(offset))) { // @+, @@+, @@@+ maxSig = -1; offset++; } else { // @#, @##, @### // @@#, @@##, @@@# maxSig = minSig; for (; offset < segment.length(); offset++) { if (segment.charAt(offset) == u'#') { maxSig++; } else { break; } } } } else { // @, @@, @@@ maxSig = minSig; } auto& oldPrecision = static_cast(macros.precision); if (offset < 
segment.length()) { UNumberRoundingPriority priority = UNUM_ROUNDING_PRIORITY_RELAXED; if (maxSig == -1) { // The wildcard character is not allowed with the priority annotation status = U_NUMBER_SKELETON_SYNTAX_ERROR; return false; } if (segment.codePointAt(offset) == u'r') { priority = UNUM_ROUNDING_PRIORITY_RELAXED; offset++; } else if (segment.codePointAt(offset) == u's') { priority = UNUM_ROUNDING_PRIORITY_STRICT; offset++; } else { // Invalid digits option for fraction rounder status = U_NUMBER_SKELETON_SYNTAX_ERROR; return false; } if (offset < segment.length()) { // Invalid digits option for fraction rounder status = U_NUMBER_SKELETON_SYNTAX_ERROR; return false; } macros.precision = oldPrecision.withSignificantDigits(minSig, maxSig, priority); } else if (maxSig == -1) { // withMinDigits macros.precision = oldPrecision.withMinDigits(minSig); } else if (minSig == 1) { // withMaxDigits macros.precision = oldPrecision.withMaxDigits(maxSig); } else { // Digits options with both min and max sig require the priority option status = U_NUMBER_SKELETON_SYNTAX_ERROR; return false; } return true; } bool blueprint_helpers::parseTrailingZeroOption(const StringSegment& segment, MacroProps& macros, UErrorCode&) { if (segment == u"w") { macros.precision = macros.precision.trailingZeroDisplay(UNUM_TRAILING_ZERO_HIDE_IF_WHOLE); return true; } return false; } void blueprint_helpers::parseIncrementOption(const StringSegment &segment, MacroProps ¯os, UErrorCode &status) { number::impl::parseIncrementOption(segment, macros.precision, status); } void blueprint_helpers::generateIncrementOption( uint32_t increment, digits_t incrementMagnitude, int32_t minFrac, UnicodeString& sb, UErrorCode&) { // Utilize DecimalQuantity/double_conversion to format this for us. 
DecimalQuantity dq; dq.setToLong(increment); dq.adjustMagnitude(incrementMagnitude); dq.setMinFraction(minFrac); sb.append(dq.toPlainString()); } void blueprint_helpers::parseIntegerWidthOption(const StringSegment& segment, MacroProps& macros, UErrorCode& status) { int32_t offset = 0; int32_t minInt = 0; int32_t maxInt; if (isWildcardChar(segment.charAt(0))) { maxInt = -1; offset++; } else { maxInt = 0; } for (; offset < segment.length(); offset++) { if (maxInt != -1 && segment.charAt(offset) == u'#') { maxInt++; } else { break; } } if (offset < segment.length()) { for (; offset < segment.length(); offset++) { if (segment.charAt(offset) == u'0') { minInt++; } else { break; } } } if (maxInt != -1) { maxInt += minInt; } if (offset < segment.length()) { // throw new SkeletonSyntaxException("Invalid integer width stem", segment); status = U_NUMBER_SKELETON_SYNTAX_ERROR; return; } // Use the public APIs to enforce bounds checking if (maxInt == -1) { macros.integerWidth = IntegerWidth::zeroFillTo(minInt); } else { macros.integerWidth = IntegerWidth::zeroFillTo(minInt).truncateAt(maxInt); } } void blueprint_helpers::generateIntegerWidthOption(int32_t minInt, int32_t maxInt, UnicodeString& sb, UErrorCode&) { if (maxInt == -1) { sb.append(kWildcardChar); } else { appendMultiple(sb, u'#', maxInt - minInt); } appendMultiple(sb, u'0', minInt); } void blueprint_helpers::parseNumberingSystemOption(const StringSegment& segment, MacroProps& macros, UErrorCode& status) { // Need to do char <-> char16_t conversion... 
U_ASSERT(U_SUCCESS(status)); CharString buffer; SKELETON_UCHAR_TO_CHAR(buffer, segment.toTempUnicodeString(), 0, segment.length(), status); NumberingSystem* ns = NumberingSystem::createInstanceByName(buffer.data(), status); if (ns == nullptr || U_FAILURE(status)) { // This is a skeleton syntax error; don't bubble up the low-level NumberingSystem error // throw new SkeletonSyntaxException("Unknown numbering system", segment); status = U_NUMBER_SKELETON_SYNTAX_ERROR; return; } macros.symbols.setTo(ns); } void blueprint_helpers::generateNumberingSystemOption(const NumberingSystem& ns, UnicodeString& sb, UErrorCode&) { // Need to do char <-> char16_t conversion... sb.append(UnicodeString(ns.getName(), -1, US_INV)); } void blueprint_helpers::parseScaleOption(const StringSegment& segment, MacroProps& macros, UErrorCode& status) { // Need to do char <-> char16_t conversion... U_ASSERT(U_SUCCESS(status)); CharString buffer; SKELETON_UCHAR_TO_CHAR(buffer, segment.toTempUnicodeString(), 0, segment.length(), status); LocalPointer decnum(new DecNum(), status); if (U_FAILURE(status)) { return; } decnum->setTo({buffer.data(), buffer.length()}, status); if (U_FAILURE(status) || decnum->isSpecial()) { // This is a skeleton syntax error; don't let the low-level decnum error bubble up status = U_NUMBER_SKELETON_SYNTAX_ERROR; return; } // NOTE: The constructor will optimize the decnum for us if possible. macros.scale = {0, decnum.orphan()}; } void blueprint_helpers::generateScaleOption(int32_t magnitude, const DecNum* arbitrary, UnicodeString& sb, UErrorCode& status) { // Utilize DecimalQuantity/double_conversion to format this for us. 
DecimalQuantity dq; if (arbitrary != nullptr) { dq.setToDecNum(*arbitrary, status); if (U_FAILURE(status)) { return; } } else { dq.setToInt(1); } dq.adjustMagnitude(magnitude); dq.roundToInfinity(); sb.append(dq.toPlainString()); } bool GeneratorHelpers::notation(const MacroProps& macros, UnicodeString& sb, UErrorCode& status) { if (macros.notation.fType == Notation::NTN_COMPACT) { UNumberCompactStyle style = macros.notation.fUnion.compactStyle; if (style == UNumberCompactStyle::UNUM_LONG) { sb.append(u"compact-long", -1); return true; } else if (style == UNumberCompactStyle::UNUM_SHORT) { sb.append(u"compact-short", -1); return true; } else { // Compact notation generated from custom data (not supported in skeleton) // The other compact notations are literals status = U_UNSUPPORTED_ERROR; return false; } } else if (macros.notation.fType == Notation::NTN_SCIENTIFIC) { const Notation::ScientificSettings& impl = macros.notation.fUnion.scientific; if (impl.fEngineeringInterval == 3) { sb.append(u"engineering", -1); } else { sb.append(u"scientific", -1); } if (impl.fMinExponentDigits > 1) { sb.append(u'/'); blueprint_helpers::generateExponentWidthOption(impl.fMinExponentDigits, sb, status); if (U_FAILURE(status)) { return false; } } if (impl.fExponentSignDisplay != UNUM_SIGN_AUTO) { sb.append(u'/'); enum_to_stem_string::signDisplay(impl.fExponentSignDisplay, sb); } return true; } else { // Default value is not shown in normalized form return false; } } bool GeneratorHelpers::unit(const MacroProps& macros, UnicodeString& sb, UErrorCode& status) { MeasureUnit unit = macros.unit; if (!utils::unitIsBaseUnit(macros.perUnit)) { if (utils::unitIsCurrency(macros.unit) || utils::unitIsCurrency(macros.perUnit)) { status = U_UNSUPPORTED_ERROR; return false; } unit = unit.product(macros.perUnit.reciprocal(status), status); } if (utils::unitIsCurrency(unit)) { sb.append(u"currency/", -1); CurrencyUnit currency(unit, status); if (U_FAILURE(status)) { return false; } 
blueprint_helpers::generateCurrencyOption(currency, sb, status); return true; } else if (utils::unitIsBaseUnit(unit)) { // Default value is not shown in normalized form return false; } else if (utils::unitIsPercent(unit)) { sb.append(u"percent", -1); return true; } else if (utils::unitIsPermille(unit)) { sb.append(u"permille", -1); return true; } else { sb.append(u"unit/", -1); sb.append(unit.getIdentifier()); return true; } } bool GeneratorHelpers::usage(const MacroProps& macros, UnicodeString& sb, UErrorCode& /* status */) { if (macros.usage.isSet()) { sb.append(u"usage/", -1); sb.append(UnicodeString(macros.usage.fValue, -1, US_INV)); return true; } return false; } bool GeneratorHelpers::precision(const MacroProps& macros, UnicodeString& sb, UErrorCode& status) { if (macros.precision.fType == Precision::RND_NONE) { sb.append(u"precision-unlimited", -1); } else if (macros.precision.fType == Precision::RND_FRACTION) { const Precision::FractionSignificantSettings& impl = macros.precision.fUnion.fracSig; blueprint_helpers::generateFractionStem(impl.fMinFrac, impl.fMaxFrac, sb, status); } else if (macros.precision.fType == Precision::RND_SIGNIFICANT) { const Precision::FractionSignificantSettings& impl = macros.precision.fUnion.fracSig; blueprint_helpers::generateDigitsStem(impl.fMinSig, impl.fMaxSig, sb, status); } else if (macros.precision.fType == Precision::RND_FRACTION_SIGNIFICANT) { const Precision::FractionSignificantSettings& impl = macros.precision.fUnion.fracSig; blueprint_helpers::generateFractionStem(impl.fMinFrac, impl.fMaxFrac, sb, status); sb.append(u'/'); if (impl.fRetain) { if (impl.fPriority == UNUM_ROUNDING_PRIORITY_RELAXED) { // withMinDigits blueprint_helpers::generateDigitsStem(impl.fMaxSig, -1, sb, status); } else { // withMaxDigits blueprint_helpers::generateDigitsStem(1, impl.fMaxSig, sb, status); } } else { blueprint_helpers::generateDigitsStem(impl.fMinSig, impl.fMaxSig, sb, status); if (impl.fPriority == UNUM_ROUNDING_PRIORITY_RELAXED) { 
sb.append(u'r'); } else { sb.append(u's'); } } } else if (macros.precision.fType == Precision::RND_INCREMENT || macros.precision.fType == Precision::RND_INCREMENT_ONE || macros.precision.fType == Precision::RND_INCREMENT_FIVE) { const Precision::IncrementSettings& impl = macros.precision.fUnion.increment; sb.append(u"precision-increment/", -1); blueprint_helpers::generateIncrementOption( impl.fIncrement, impl.fIncrementMagnitude, impl.fMinFrac, sb, status); } else if (macros.precision.fType == Precision::RND_CURRENCY) { UCurrencyUsage usage = macros.precision.fUnion.currencyUsage; if (usage == UCURR_USAGE_STANDARD) { sb.append(u"precision-currency-standard", -1); } else { sb.append(u"precision-currency-cash", -1); } } else { // Bogus or Error return false; } if (macros.precision.fTrailingZeroDisplay == UNUM_TRAILING_ZERO_HIDE_IF_WHOLE) { sb.append(u"/w", -1); } // NOTE: Always return true for rounding because the default value depends on other options. return true; } bool GeneratorHelpers::roundingMode(const MacroProps& macros, UnicodeString& sb, UErrorCode&) { if (macros.roundingMode == kDefaultMode) { return false; // Default } enum_to_stem_string::roundingMode(macros.roundingMode, sb); return true; } bool GeneratorHelpers::grouping(const MacroProps& macros, UnicodeString& sb, UErrorCode& status) { if (macros.grouper.isBogus()) { return false; // No value } else if (macros.grouper.fStrategy == UNUM_GROUPING_COUNT) { status = U_UNSUPPORTED_ERROR; return false; } else if (macros.grouper.fStrategy == UNUM_GROUPING_AUTO) { return false; // Default value } else { enum_to_stem_string::groupingStrategy(macros.grouper.fStrategy, sb); return true; } } bool GeneratorHelpers::integerWidth(const MacroProps& macros, UnicodeString& sb, UErrorCode& status) { if (macros.integerWidth.fHasError || macros.integerWidth.isBogus() || macros.integerWidth == IntegerWidth::standard()) { // Error or Default return false; } const auto& minMaxInt = macros.integerWidth.fUnion.minMaxInt; if 
(minMaxInt.fMinInt == 0 && minMaxInt.fMaxInt == 0) { sb.append(u"integer-width-trunc", -1); return true; } sb.append(u"integer-width/", -1); blueprint_helpers::generateIntegerWidthOption( minMaxInt.fMinInt, minMaxInt.fMaxInt, sb, status); return true; } bool GeneratorHelpers::symbols(const MacroProps& macros, UnicodeString& sb, UErrorCode& status) { if (macros.symbols.isNumberingSystem()) { const NumberingSystem& ns = *macros.symbols.getNumberingSystem(); if (uprv_strcmp(ns.getName(), "latn") == 0) { sb.append(u"latin", -1); } else { sb.append(u"numbering-system/", -1); blueprint_helpers::generateNumberingSystemOption(ns, sb, status); } return true; } else if (macros.symbols.isDecimalFormatSymbols()) { status = U_UNSUPPORTED_ERROR; return false; } else { // No custom symbols return false; } } bool GeneratorHelpers::unitWidth(const MacroProps& macros, UnicodeString& sb, UErrorCode&) { if (macros.unitWidth == UNUM_UNIT_WIDTH_SHORT || macros.unitWidth == UNUM_UNIT_WIDTH_COUNT) { return false; // Default or Bogus } enum_to_stem_string::unitWidth(macros.unitWidth, sb); return true; } bool GeneratorHelpers::sign(const MacroProps& macros, UnicodeString& sb, UErrorCode&) { if (macros.sign == UNUM_SIGN_AUTO || macros.sign == UNUM_SIGN_COUNT) { return false; // Default or Bogus } enum_to_stem_string::signDisplay(macros.sign, sb); return true; } bool GeneratorHelpers::decimal(const MacroProps& macros, UnicodeString& sb, UErrorCode&) { if (macros.decimal == UNUM_DECIMAL_SEPARATOR_AUTO || macros.decimal == UNUM_DECIMAL_SEPARATOR_COUNT) { return false; // Default or Bogus } enum_to_stem_string::decimalSeparatorDisplay(macros.decimal, sb); return true; } bool GeneratorHelpers::scale(const MacroProps& macros, UnicodeString& sb, UErrorCode& status) { if (!macros.scale.isValid()) { return false; // Default or Bogus } sb.append(u"scale/", -1); blueprint_helpers::generateScaleOption( macros.scale.fMagnitude, macros.scale.fArbitrary, sb, status); return true; } // Definitions of public 
API methods (put here for dependency disentanglement) #if (U_PF_WINDOWS <= U_PLATFORM && U_PLATFORM <= U_PF_CYGWIN) && defined(_MSC_VER) // Ignore MSVC warning 4661. This is generated for NumberFormatterSettings<>::toSkeleton() as this method // is defined elsewhere (in number_skeletons.cpp). The compiler is warning that the explicit template instantiation // inside this single translation unit (CPP file) is incomplete, and thus it isn't sure if the template class is // fully defined. However, since each translation unit explicitly instantiates all the necessary template classes, // they will all be passed to the linker, and the linker will still find and export all the class members. #pragma warning(push) #pragma warning(disable: 4661) #endif template UnicodeString NumberFormatterSettings::toSkeleton(UErrorCode& status) const { if (U_FAILURE(status)) { return ICU_Utility::makeBogusString(); } if (fMacros.copyErrorTo(status)) { return ICU_Utility::makeBogusString(); } return skeleton::generate(fMacros, status); } // Declare all classes that implement NumberFormatterSettings // See https://stackoverflow.com/a/495056/1407170 template class icu::number::NumberFormatterSettings; template class icu::number::NumberFormatterSettings; UnlocalizedNumberFormatter NumberFormatter::forSkeleton(const UnicodeString& skeleton, UErrorCode& status) { return skeleton::create(skeleton, nullptr, status); } UnlocalizedNumberFormatter NumberFormatter::forSkeleton(const UnicodeString& skeleton, UParseError& perror, UErrorCode& status) { return skeleton::create(skeleton, &perror, status); } #if (U_PF_WINDOWS <= U_PLATFORM && U_PLATFORM <= U_PF_CYGWIN) && defined(_MSC_VER) // Warning 4661. #pragma warning(pop) #endif #endif /* #if !UCONFIG_NO_FORMATTING */ stringi/src/icu74/i18n/number_patternstring.h0000644000176200001440000003231214700200761020655 0ustar liggesusers// © 2017 and later: Unicode, Inc. and others. 
// License & terms of use: http://www.unicode.org/copyright.html #include "unicode/utypes.h" #if !UCONFIG_NO_FORMATTING #ifndef __NUMBER_PATTERNSTRING_H__ #define __NUMBER_PATTERNSTRING_H__ #include #include "unicode/unum.h" #include "unicode/unistr.h" #include "number_types.h" #include "number_decimalquantity.h" #include "number_decimfmtprops.h" #include "number_affixutils.h" U_NAMESPACE_BEGIN namespace number { namespace impl { // Forward declaration class PatternParser; // Note: the order of fields in this enum matters for parsing. enum PatternSignType { /** Render using normal positive subpattern rules */ PATTERN_SIGN_TYPE_POS, /** Render using rules to force the display of a plus sign */ PATTERN_SIGN_TYPE_POS_SIGN, /** Render using negative subpattern rules */ PATTERN_SIGN_TYPE_NEG, /** Count for looping over the possibilities */ PATTERN_SIGN_TYPE_COUNT }; // Exported as U_I18N_API because it is a public member field of exported ParsedSubpatternInfo struct U_I18N_API Endpoints { int32_t start = 0; int32_t end = 0; }; // Exported as U_I18N_API because it is a public member field of exported ParsedPatternInfo struct U_I18N_API ParsedSubpatternInfo { uint64_t groupingSizes = 0x0000ffffffff0000L; int32_t integerLeadingHashSigns = 0; int32_t integerTrailingHashSigns = 0; int32_t integerNumerals = 0; int32_t integerAtSigns = 0; int32_t integerTotal = 0; // for convenience int32_t fractionNumerals = 0; int32_t fractionHashSigns = 0; int32_t fractionTotal = 0; // for convenience bool hasDecimal = false; int32_t widthExceptAffixes = 0; // Note: NullableValue causes issues here with std::move. 
bool hasPadding = false; UNumberFormatPadPosition paddingLocation = UNUM_PAD_BEFORE_PREFIX; DecimalQuantity rounding; bool exponentHasPlusSign = false; int32_t exponentZeros = 0; bool hasPercentSign = false; bool hasPerMilleSign = false; bool hasCurrencySign = false; bool hasCurrencyDecimal = false; bool hasMinusSign = false; bool hasPlusSign = false; Endpoints prefixEndpoints; Endpoints suffixEndpoints; Endpoints paddingEndpoints; }; // Exported as U_I18N_API because it is needed for the unit test PatternStringTest struct U_I18N_API ParsedPatternInfo : public AffixPatternProvider, public UMemory { UnicodeString pattern; ParsedSubpatternInfo positive; ParsedSubpatternInfo negative; ParsedPatternInfo() : state(this->pattern), currentSubpattern(nullptr) {} ~ParsedPatternInfo() override = default; // Need to declare this explicitly because of the destructor ParsedPatternInfo& operator=(ParsedPatternInfo&& src) noexcept = default; static int32_t getLengthFromEndpoints(const Endpoints& endpoints); char16_t charAt(int32_t flags, int32_t index) const override; int32_t length(int32_t flags) const override; UnicodeString getString(int32_t flags) const override; bool positiveHasPlusSign() const override; bool hasNegativeSubpattern() const override; bool negativeHasMinusSign() const override; bool hasCurrencySign() const override; bool containsSymbolType(AffixPatternType type, UErrorCode& status) const override; bool hasBody() const override; bool currencyAsDecimal() const override; private: struct U_I18N_API ParserState { const UnicodeString& pattern; // reference to the parent int32_t offset = 0; explicit ParserState(const UnicodeString& _pattern) : pattern(_pattern) {} ParserState& operator=(ParserState&& src) noexcept { // Leave pattern reference alone; it will continue to point to the same place in memory, // which gets overwritten by ParsedPatternInfo's implicit move assignment. 
offset = src.offset; return *this; } /** Returns the next code point, or -1 if string is too short. */ UChar32 peek(); /** Returns the code point after the next code point, or -1 if string is too short. */ UChar32 peek2(); /** Returns the next code point and then steps forward. */ UChar32 next(); // TODO: We don't currently do anything with the message string. // This method is here as a shell for Java compatibility. inline void toParseException(const char16_t* message) { (void) message; } } state; // NOTE: In Java, these are written as pure functions. // In C++, they're written as methods. // The behavior is the same. // Mutable transient pointer: ParsedSubpatternInfo* currentSubpattern; // In Java, "negative == null" tells us whether or not we had a negative subpattern. // In C++, we need to remember in another boolean. bool fHasNegativeSubpattern = false; const Endpoints& getEndpoints(int32_t flags) const; /** Run the recursive descent parser. */ void consumePattern(const UnicodeString& patternString, UErrorCode& status); void consumeSubpattern(UErrorCode& status); void consumePadding(PadPosition paddingLocation, UErrorCode& status); void consumeAffix(Endpoints& endpoints, UErrorCode& status); void consumeLiteral(UErrorCode& status); void consumeFormat(UErrorCode& status); void consumeIntegerFormat(UErrorCode& status); void consumeFractionFormat(UErrorCode& status); void consumeExponent(UErrorCode& status); friend class PatternParser; }; enum IgnoreRounding { IGNORE_ROUNDING_NEVER = 0, IGNORE_ROUNDING_IF_CURRENCY = 1, IGNORE_ROUNDING_ALWAYS = 2 }; class U_I18N_API PatternParser { public: /** * Runs the recursive descent parser on the given pattern string, returning a data structure with raw information * about the pattern string. * *

* To obtain a more useful form of the data, consider using {@link #parseToProperties} instead. * * TODO: Change argument type to const char16_t* instead of UnicodeString? * * @param patternString * The LDML decimal format pattern (Excel-style pattern) to parse. * @return The results of the parse. */ static void parseToPatternInfo(const UnicodeString& patternString, ParsedPatternInfo& patternInfo, UErrorCode& status); /** * Parses a pattern string into a new property bag. * * @param pattern * The pattern string, like "#,##0.00" * @param ignoreRounding * Whether to leave out rounding information (minFrac, maxFrac, and rounding increment) when parsing the * pattern. This may be desirable if a custom rounding mode, such as CurrencyUsage, is to be used * instead. * @return A property bag object. * @throws IllegalArgumentException * If there is a syntax error in the pattern string. */ static DecimalFormatProperties parseToProperties(const UnicodeString& pattern, IgnoreRounding ignoreRounding, UErrorCode& status); static DecimalFormatProperties parseToProperties(const UnicodeString& pattern, UErrorCode& status); /** * Parses a pattern string into an existing property bag. All properties that can be encoded into a pattern string * will be overwritten with either their default value or with the value coming from the pattern string. Properties * that cannot be encoded into a pattern string, such as rounding mode, are not modified. * * @param pattern * The pattern string, like "#,##0.00" * @param properties * The property bag object to overwrite. * @param ignoreRounding * See {@link #parseToProperties(String pattern, int ignoreRounding)}. * @throws IllegalArgumentException * If there was a syntax error in the pattern string. 
*/ static void parseToExistingProperties(const UnicodeString& pattern, DecimalFormatProperties& properties, IgnoreRounding ignoreRounding, UErrorCode& status); private: static void parseToExistingPropertiesImpl(const UnicodeString& pattern, DecimalFormatProperties& properties, IgnoreRounding ignoreRounding, UErrorCode& status); /** Finalizes the temporary data stored in the ParsedPatternInfo to the Properties. */ static void patternInfoToProperties(DecimalFormatProperties& properties, ParsedPatternInfo& patternInfo, IgnoreRounding _ignoreRounding, UErrorCode& status); }; class U_I18N_API PatternStringUtils { public: /** * Determine whether a given roundingIncrement should be ignored for formatting * based on the current maxFrac value (maximum fraction digits). For example a * roundingIncrement of 0.01 should be ignored if maxFrac is 1, but not if maxFrac * is 2 or more. Note that roundingIncrements are rounded up in significance, so * a roundingIncrement of 0.006 is treated like 0.01 for this determination, i.e. * it should not be ignored if maxFrac is 2 or more (but a roundingIncrement of * 0.005 is treated like 0.001 for significance). * * This test is needed for both NumberPropertyMapper::oldToNew and * PatternStringUtils::propertiesToPatternString. In Java it cannot be * exported by NumberPropertyMapper (package private) so it is in * PatternStringUtils, do the same in C. * * @param roundIncr * The roundingIncrement to be checked. Must be non-zero. * @param maxFrac * The current maximum fraction digits value. * @return true if roundIncr should be ignored for formatting. */ static bool ignoreRoundingIncrement(double roundIncr, int32_t maxFrac); /** * Creates a pattern string from a property bag. * *

* Since pattern strings support only a subset of the functionality available in a property bag, a new property bag * created from the string returned by this function may not be the same as the original property bag. * * @param properties * The property bag to serialize. * @return A pattern string approximately serializing the property bag. */ static UnicodeString propertiesToPatternString(const DecimalFormatProperties& properties, UErrorCode& status); /** * Converts a pattern between standard notation and localized notation. Localized notation means that instead of * using generic placeholders in the pattern, you use the corresponding locale-specific characters instead. For * example, in locale fr-FR, the period in the pattern "0.000" means "decimal" in standard notation (as it * does in every other locale), but it means "grouping" in localized notation. * *

* A greedy string-substitution strategy is used to substitute locale symbols. If two symbols are ambiguous or have * the same prefix, the result is not well-defined. * *

* Locale symbols are not allowed to contain the ASCII quote character. * *

* This method is provided for backwards compatibility and should not be used in any new code. * * TODO(C++): This method is not yet implemented. * * @param input * The pattern to convert. * @param symbols * The symbols corresponding to the localized pattern. * @param toLocalized * true to convert from standard to localized notation; false to convert from localized to standard * notation. * @return The pattern expressed in the other notation. */ static UnicodeString convertLocalized(const UnicodeString& input, const DecimalFormatSymbols& symbols, bool toLocalized, UErrorCode& status); /** * This method contains the heart of the logic for rendering LDML affix strings. It handles * sign-always-shown resolution, whether to use the positive or negative subpattern, permille * substitution, and plural forms for CurrencyPluralInfo. */ static void patternInfoToStringBuilder(const AffixPatternProvider& patternInfo, bool isPrefix, PatternSignType patternSignType, bool approximately, StandardPlural::Form plural, bool perMilleReplacesPercent, bool dropCurrencySymbols, UnicodeString& output); static PatternSignType resolveSignDisplay(UNumberSignDisplay signDisplay, Signum signum); private: /** @return The number of chars inserted. */ static int escapePaddingString(UnicodeString input, UnicodeString& output, int startIndex, UErrorCode& status); }; } // namespace impl } // namespace number U_NAMESPACE_END #endif //__NUMBER_PATTERNSTRING_H__ #endif /* #if !UCONFIG_NO_FORMATTING */ stringi/src/icu74/i18n/regexst.cpp0000644000176200001440000001506414700200761016422 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html // // regexst.h // // Copyright (C) 2004-2015, International Business Machines Corporation and others. // All Rights Reserved. // // This file contains class RegexStaticSets // // This class is internal to the regular expression implementation. 
// For the public Regular Expression API, see the file "unicode/regex.h" // // RegexStaticSets groups together the common UnicodeSets that are needed // for compiling or executing RegularExpressions. This grouping simplifies // the thread safe lazy creation and sharing of these sets across // all instances of regular expressions. // #include "unicode/utypes.h" #if !UCONFIG_NO_REGULAR_EXPRESSIONS #include "unicode/unistr.h" #include "unicode/uniset.h" #include "unicode/uchar.h" #include "unicode/regex.h" #include "uprops.h" #include "cmemory.h" #include "cstring.h" #include "uassert.h" #include "ucln_in.h" #include "umutex.h" #include "regexcst.h" // Contains state table for the regex pattern parser. // generated by a Perl script. #include "regexst.h" U_NAMESPACE_BEGIN // "Rule Char" Characters are those with special meaning, and therefore // need to be escaped to appear as literals in a regexp. constexpr char16_t const *gRuleSet_rule_chars = u"*?+[(){}^$|\\."; // // The backslash escape characters that ICU's unescape() function will handle. 
// constexpr char16_t const *gUnescapeChars = u"acefnrtuUx"; // // Unicode Set pattern for Regular Expression \w // constexpr char16_t const *gIsWordPattern = u"[\\p{Alphabetic}\\p{M}\\p{Nd}\\p{Pc}\\u200c\\u200d]"; // // Unicode Set Definitions for Regular Expression \s // constexpr char16_t const *gIsSpacePattern = u"[\\p{WhiteSpace}]"; // // UnicodeSets used in implementation of Grapheme Cluster detection, \X // constexpr char16_t const *gGC_ControlPattern = u"[[:Zl:][:Zp:][:Cc:][:Cf:]-[:Grapheme_Extend:]]"; constexpr char16_t const *gGC_ExtendPattern = u"[\\p{Grapheme_Extend}]"; constexpr char16_t const *gGC_LPattern = u"[\\p{Hangul_Syllable_Type=L}]"; constexpr char16_t const *gGC_VPattern = u"[\\p{Hangul_Syllable_Type=V}]"; constexpr char16_t const *gGC_TPattern = u"[\\p{Hangul_Syllable_Type=T}]"; constexpr char16_t const *gGC_LVPattern = u"[\\p{Hangul_Syllable_Type=LV}]"; constexpr char16_t const *gGC_LVTPattern = u"[\\p{Hangul_Syllable_Type=LVT}]"; RegexStaticSets *RegexStaticSets::gStaticSets = nullptr; UInitOnce gStaticSetsInitOnce {}; RegexStaticSets::RegexStaticSets(UErrorCode *status) { // Initialize the shared static sets to their correct values. 
fUnescapeCharSet.addAll(UnicodeString(true, gUnescapeChars, -1)).freeze(); fPropSets[URX_ISWORD_SET].applyPattern(UnicodeString(true, gIsWordPattern, -1), *status).freeze(); fPropSets[URX_ISSPACE_SET].applyPattern(UnicodeString(true, gIsSpacePattern, -1), *status).freeze(); fPropSets[URX_GC_EXTEND].applyPattern(UnicodeString(true, gGC_ExtendPattern, -1), *status).freeze(); fPropSets[URX_GC_CONTROL].applyPattern(UnicodeString(true, gGC_ControlPattern, -1), *status).freeze(); fPropSets[URX_GC_L].applyPattern(UnicodeString(true, gGC_LPattern, -1), *status).freeze(); fPropSets[URX_GC_V].applyPattern(UnicodeString(true, gGC_VPattern, -1), *status).freeze(); fPropSets[URX_GC_T].applyPattern(UnicodeString(true, gGC_TPattern, -1), *status).freeze(); fPropSets[URX_GC_LV].applyPattern(UnicodeString(true, gGC_LVPattern, -1), *status).freeze(); fPropSets[URX_GC_LVT].applyPattern(UnicodeString(true, gGC_LVTPattern, -1), *status).freeze(); // // "Normal" is the set of characters that don't need special handling // when finding grapheme cluster boundaries. // fPropSets[URX_GC_NORMAL].complement(); fPropSets[URX_GC_NORMAL].remove(0xac00, 0xd7a4); fPropSets[URX_GC_NORMAL].removeAll(fPropSets[URX_GC_CONTROL]); fPropSets[URX_GC_NORMAL].removeAll(fPropSets[URX_GC_L]); fPropSets[URX_GC_NORMAL].removeAll(fPropSets[URX_GC_V]); fPropSets[URX_GC_NORMAL].removeAll(fPropSets[URX_GC_T]); fPropSets[URX_GC_NORMAL].freeze(); // Initialize the 8-bit fast bit sets from the parallel full // UnicodeSets. // // TODO: 25 Oct 2019 are these fast 8-bit sets worth keeping? // Measured 3.5% gain on (non) matching with the pattern "x(?:\\S+)+x" // This runs in exponential time, making it easy to adjust the time for // convenient measuring. // // This 8 bit optimization dates from the early days of ICU, // with a less optimized UnicodeSet. At the time, the difference // was substantial. for (int32_t i=0; i #include #include // ICU PATCH: Customize header file paths for ICU. 
#include "double-conversion-utils.h"

#include "double-conversion-cached-powers.h"

// ICU PATCH: Wrap in ICU namespace
U_NAMESPACE_BEGIN

namespace double_conversion {

namespace PowersOfTenCache {

// One precomputed power of ten: a 64-bit significand with its binary
// exponent, and the decimal exponent it corresponds to.
struct CachedPower {
  uint64_t significand;
  int16_t binary_exponent;
  int16_t decimal_exponent;
};

// Table of cached powers; consecutive rows are 8 decimal exponents apart,
// starting at 10^-348 and ending at 10^340.
static const CachedPower kCachedPowers[] = {
  {DOUBLE_CONVERSION_UINT64_2PART_C(0xfa8fd5a0, 081c0288), -1220, -348},
  {DOUBLE_CONVERSION_UINT64_2PART_C(0xbaaee17f, a23ebf76), -1193, -340},
  {DOUBLE_CONVERSION_UINT64_2PART_C(0x8b16fb20, 3055ac76), -1166, -332},
  {DOUBLE_CONVERSION_UINT64_2PART_C(0xcf42894a, 5dce35ea), -1140, -324},
  {DOUBLE_CONVERSION_UINT64_2PART_C(0x9a6bb0aa, 55653b2d), -1113, -316},
  {DOUBLE_CONVERSION_UINT64_2PART_C(0xe61acf03, 3d1a45df), -1087, -308},
  {DOUBLE_CONVERSION_UINT64_2PART_C(0xab70fe17, c79ac6ca), -1060, -300},
  {DOUBLE_CONVERSION_UINT64_2PART_C(0xff77b1fc, bebcdc4f), -1034, -292},
  {DOUBLE_CONVERSION_UINT64_2PART_C(0xbe5691ef, 416bd60c), -1007, -284},
  {DOUBLE_CONVERSION_UINT64_2PART_C(0x8dd01fad, 907ffc3c), -980, -276},
  {DOUBLE_CONVERSION_UINT64_2PART_C(0xd3515c28, 31559a83), -954, -268},
  {DOUBLE_CONVERSION_UINT64_2PART_C(0x9d71ac8f, ada6c9b5), -927, -260},
  {DOUBLE_CONVERSION_UINT64_2PART_C(0xea9c2277, 23ee8bcb), -901, -252},
  {DOUBLE_CONVERSION_UINT64_2PART_C(0xaecc4991, 4078536d), -874, -244},
  {DOUBLE_CONVERSION_UINT64_2PART_C(0x823c1279, 5db6ce57), -847, -236},
  {DOUBLE_CONVERSION_UINT64_2PART_C(0xc2109436, 4dfb5637), -821, -228},
  {DOUBLE_CONVERSION_UINT64_2PART_C(0x9096ea6f, 3848984f), -794, -220},
  {DOUBLE_CONVERSION_UINT64_2PART_C(0xd77485cb, 25823ac7), -768, -212},
  {DOUBLE_CONVERSION_UINT64_2PART_C(0xa086cfcd, 97bf97f4), -741, -204},
  {DOUBLE_CONVERSION_UINT64_2PART_C(0xef340a98, 172aace5), -715, -196},
  {DOUBLE_CONVERSION_UINT64_2PART_C(0xb23867fb, 2a35b28e), -688, -188},
  {DOUBLE_CONVERSION_UINT64_2PART_C(0x84c8d4df, d2c63f3b), -661, -180},
  {DOUBLE_CONVERSION_UINT64_2PART_C(0xc5dd4427, 1ad3cdba), -635, -172},
  {DOUBLE_CONVERSION_UINT64_2PART_C(0x936b9fce, bb25c996), -608, -164},
  {DOUBLE_CONVERSION_UINT64_2PART_C(0xdbac6c24, 7d62a584), -582, -156},
  {DOUBLE_CONVERSION_UINT64_2PART_C(0xa3ab6658, 0d5fdaf6), -555, -148},
  {DOUBLE_CONVERSION_UINT64_2PART_C(0xf3e2f893, dec3f126), -529, -140},
  {DOUBLE_CONVERSION_UINT64_2PART_C(0xb5b5ada8, aaff80b8), -502, -132},
  {DOUBLE_CONVERSION_UINT64_2PART_C(0x87625f05, 6c7c4a8b), -475, -124},
  {DOUBLE_CONVERSION_UINT64_2PART_C(0xc9bcff60, 34c13053), -449, -116},
  {DOUBLE_CONVERSION_UINT64_2PART_C(0x964e858c, 91ba2655), -422, -108},
  {DOUBLE_CONVERSION_UINT64_2PART_C(0xdff97724, 70297ebd), -396, -100},
  {DOUBLE_CONVERSION_UINT64_2PART_C(0xa6dfbd9f, b8e5b88f), -369, -92},
  {DOUBLE_CONVERSION_UINT64_2PART_C(0xf8a95fcf, 88747d94), -343, -84},
  {DOUBLE_CONVERSION_UINT64_2PART_C(0xb9447093, 8fa89bcf), -316, -76},
  {DOUBLE_CONVERSION_UINT64_2PART_C(0x8a08f0f8, bf0f156b), -289, -68},
  {DOUBLE_CONVERSION_UINT64_2PART_C(0xcdb02555, 653131b6), -263, -60},
  {DOUBLE_CONVERSION_UINT64_2PART_C(0x993fe2c6, d07b7fac), -236, -52},
  {DOUBLE_CONVERSION_UINT64_2PART_C(0xe45c10c4, 2a2b3b06), -210, -44},
  {DOUBLE_CONVERSION_UINT64_2PART_C(0xaa242499, 697392d3), -183, -36},
  {DOUBLE_CONVERSION_UINT64_2PART_C(0xfd87b5f2, 8300ca0e), -157, -28},
  {DOUBLE_CONVERSION_UINT64_2PART_C(0xbce50864, 92111aeb), -130, -20},
  {DOUBLE_CONVERSION_UINT64_2PART_C(0x8cbccc09, 6f5088cc), -103, -12},
  {DOUBLE_CONVERSION_UINT64_2PART_C(0xd1b71758, e219652c), -77, -4},
  {DOUBLE_CONVERSION_UINT64_2PART_C(0x9c400000, 00000000), -50, 4},
  {DOUBLE_CONVERSION_UINT64_2PART_C(0xe8d4a510, 00000000), -24, 12},
  {DOUBLE_CONVERSION_UINT64_2PART_C(0xad78ebc5, ac620000), 3, 20},
  {DOUBLE_CONVERSION_UINT64_2PART_C(0x813f3978, f8940984), 30, 28},
  {DOUBLE_CONVERSION_UINT64_2PART_C(0xc097ce7b, c90715b3), 56, 36},
  {DOUBLE_CONVERSION_UINT64_2PART_C(0x8f7e32ce, 7bea5c70), 83, 44},
  {DOUBLE_CONVERSION_UINT64_2PART_C(0xd5d238a4, abe98068), 109, 52},
  {DOUBLE_CONVERSION_UINT64_2PART_C(0x9f4f2726, 179a2245), 136, 60},
  {DOUBLE_CONVERSION_UINT64_2PART_C(0xed63a231, d4c4fb27), 162, 68},
  {DOUBLE_CONVERSION_UINT64_2PART_C(0xb0de6538, 8cc8ada8), 189, 76},
  {DOUBLE_CONVERSION_UINT64_2PART_C(0x83c7088e, 1aab65db), 216, 84},
  {DOUBLE_CONVERSION_UINT64_2PART_C(0xc45d1df9, 42711d9a), 242, 92},
  {DOUBLE_CONVERSION_UINT64_2PART_C(0x924d692c, a61be758), 269, 100},
  {DOUBLE_CONVERSION_UINT64_2PART_C(0xda01ee64, 1a708dea), 295, 108},
  {DOUBLE_CONVERSION_UINT64_2PART_C(0xa26da399, 9aef774a), 322, 116},
  {DOUBLE_CONVERSION_UINT64_2PART_C(0xf209787b, b47d6b85), 348, 124},
  {DOUBLE_CONVERSION_UINT64_2PART_C(0xb454e4a1, 79dd1877), 375, 132},
  {DOUBLE_CONVERSION_UINT64_2PART_C(0x865b8692, 5b9bc5c2), 402, 140},
  {DOUBLE_CONVERSION_UINT64_2PART_C(0xc83553c5, c8965d3d), 428, 148},
  {DOUBLE_CONVERSION_UINT64_2PART_C(0x952ab45c, fa97a0b3), 455, 156},
  {DOUBLE_CONVERSION_UINT64_2PART_C(0xde469fbd, 99a05fe3), 481, 164},
  {DOUBLE_CONVERSION_UINT64_2PART_C(0xa59bc234, db398c25), 508, 172},
  {DOUBLE_CONVERSION_UINT64_2PART_C(0xf6c69a72, a3989f5c), 534, 180},
  {DOUBLE_CONVERSION_UINT64_2PART_C(0xb7dcbf53, 54e9bece), 561, 188},
  {DOUBLE_CONVERSION_UINT64_2PART_C(0x88fcf317, f22241e2), 588, 196},
  {DOUBLE_CONVERSION_UINT64_2PART_C(0xcc20ce9b, d35c78a5), 614, 204},
  {DOUBLE_CONVERSION_UINT64_2PART_C(0x98165af3, 7b2153df), 641, 212},
  {DOUBLE_CONVERSION_UINT64_2PART_C(0xe2a0b5dc, 971f303a), 667, 220},
  {DOUBLE_CONVERSION_UINT64_2PART_C(0xa8d9d153, 5ce3b396), 694, 228},
  {DOUBLE_CONVERSION_UINT64_2PART_C(0xfb9b7cd9, a4a7443c), 720, 236},
  {DOUBLE_CONVERSION_UINT64_2PART_C(0xbb764c4c, a7a44410), 747, 244},
  {DOUBLE_CONVERSION_UINT64_2PART_C(0x8bab8eef, b6409c1a), 774, 252},
  {DOUBLE_CONVERSION_UINT64_2PART_C(0xd01fef10, a657842c), 800, 260},
  {DOUBLE_CONVERSION_UINT64_2PART_C(0x9b10a4e5, e9913129), 827, 268},
  {DOUBLE_CONVERSION_UINT64_2PART_C(0xe7109bfb, a19c0c9d), 853, 276},
  {DOUBLE_CONVERSION_UINT64_2PART_C(0xac2820d9, 623bf429), 880, 284},
  {DOUBLE_CONVERSION_UINT64_2PART_C(0x80444b5e, 7aa7cf85), 907, 292},
  {DOUBLE_CONVERSION_UINT64_2PART_C(0xbf21e440, 03acdd2d), 933, 300},
  {DOUBLE_CONVERSION_UINT64_2PART_C(0x8e679c2f, 5e44ff8f), 960, 308},
  {DOUBLE_CONVERSION_UINT64_2PART_C(0xd433179d, 9c8cb841), 986, 316},
  {DOUBLE_CONVERSION_UINT64_2PART_C(0x9e19db92, b4e31ba9), 1013, 324},
  {DOUBLE_CONVERSION_UINT64_2PART_C(0xeb96bf6e, badf77d9), 1039, 332},
  {DOUBLE_CONVERSION_UINT64_2PART_C(0xaf87023b, 9bf0ee6b), 1066, 340},
};

static const int kCachedPowersOffset = 348;  // -1 * the first decimal_exponent.
static const double kD_1_LOG2_10 = 0.30102999566398114;  //  1 / lg(10)

// Looks up a cached power of ten whose binary exponent is expected to lie in
// [min_exponent, max_exponent] (both bounds are asserted, not enforced).
// The power is written to *power and its decimal exponent to
// *decimal_exponent.
void GetCachedPowerForBinaryExponentRange(
    int min_exponent,
    int max_exponent,
    DiyFp* power,
    int* decimal_exponent) {
  int kQ = DiyFp::kSignificandSize;
  double k = ceil((min_exponent + kQ - 1) * kD_1_LOG2_10);
  int foo = kCachedPowersOffset;
  // The <int> template arguments below had been stripped from the text.
  int index =
      (foo + static_cast<int>(k) - 1) / kDecimalExponentDistance + 1;
  DOUBLE_CONVERSION_ASSERT(0 <= index && index < static_cast<int>(DOUBLE_CONVERSION_ARRAY_SIZE(kCachedPowers)));
  CachedPower cached_power = kCachedPowers[index];
  DOUBLE_CONVERSION_ASSERT(min_exponent <= cached_power.binary_exponent);
  (void) max_exponent;  // Mark variable as used.
  DOUBLE_CONVERSION_ASSERT(cached_power.binary_exponent <= max_exponent);
  *decimal_exponent = cached_power.decimal_exponent;
  *power = DiyFp(cached_power.significand, cached_power.binary_exponent);
}


// Looks up the cached power for requested_exponent. On return,
// *found_exponent <= requested_exponent <
// *found_exponent + kDecimalExponentDistance (asserted below).
// The index is not range-checked beyond the entry assertions.
void GetCachedPowerForDecimalExponent(int requested_exponent,
                                      DiyFp* power,
                                      int* found_exponent) {
  DOUBLE_CONVERSION_ASSERT(kMinDecimalExponent <= requested_exponent);
  DOUBLE_CONVERSION_ASSERT(requested_exponent < kMaxDecimalExponent + kDecimalExponentDistance);
  int index =
      (requested_exponent + kCachedPowersOffset) / kDecimalExponentDistance;
  CachedPower cached_power = kCachedPowers[index];
  *power = DiyFp(cached_power.significand, cached_power.binary_exponent);
  *found_exponent = cached_power.decimal_exponent;
  DOUBLE_CONVERSION_ASSERT(*found_exponent <= requested_exponent);
  DOUBLE_CONVERSION_ASSERT(requested_exponent < *found_exponent + kDecimalExponentDistance);
}

}  // namespace PowersOfTenCache

}  // namespace double_conversion

// ICU PATCH: Close ICU namespace
U_NAMESPACE_END
#endif // ICU PATCH: close #if !UCONFIG_NO_FORMATTING
stringi/src/icu74/i18n/casetrn.h0000644000176200001440000000555214700200761016046 0ustar liggesusers
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
*
*   Copyright (C) 2001-2008, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
*******************************************************************************
*   file name:  casetrn.h
*   encoding:   UTF-8
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2004sep03
*   created by: Markus W. Scherer
*
*   Implementation class for lower-/upper-/title-casing transliterators.
*/

#ifndef __CASETRN_H__
#define __CASETRN_H__

#include "unicode/utypes.h"

#if !UCONFIG_NO_TRANSLITERATION

#include "unicode/translit.h"
#include "ucase.h"

U_NAMESPACE_BEGIN

/**
 * A transliterator that performs locale-sensitive
 * case mapping.
*/ class CaseMapTransliterator : public Transliterator { public: /** * Constructs a transliterator. * @param loc the given locale. * @param id the transliterator ID. * @param map the full case mapping function (see ucase.h) */ CaseMapTransliterator(const UnicodeString &id, UCaseMapFull *map); /** * Destructor. */ virtual ~CaseMapTransliterator(); /** * Copy constructor. */ CaseMapTransliterator(const CaseMapTransliterator&); /** * Transliterator API. * @return a copy of the object. */ virtual CaseMapTransliterator* clone() const override = 0; /** * ICU "poor man's RTTI", returns a UClassID for the actual class. */ //virtual UClassID getDynamicClassID() const; /** * ICU "poor man's RTTI", returns a UClassID for this class. */ U_I18N_API static UClassID U_EXPORT2 getStaticClassID(); protected: /** * Implements {@link Transliterator#handleTransliterate}. * @param text the buffer holding transliterated and * untransliterated text * @param offset the start and limit of the text, the position * of the cursor, and the start and limit of transliteration. * @param incremental if true, assume more text may be coming after * pos.contextLimit. Otherwise, assume the text is complete. */ virtual void handleTransliterate(Replaceable& text, UTransPosition& offsets, UBool isIncremental) const override; UCaseMapFull *fMap; private: /** * Assignment operator. */ CaseMapTransliterator& operator=(const CaseMapTransliterator&); }; U_NAMESPACE_END /** case context iterator using a Replaceable. This must be a C function because it is a callback. */ U_CFUNC UChar32 U_CALLCONV utrans_rep_caseContextIterator(void *context, int8_t dir); #endif /* #if !UCONFIG_NO_TRANSLITERATION */ #endif stringi/src/icu74/i18n/tridpars.cpp0000644000176200001440000007324414700200761016575 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others. 
// License & terms of use: http://www.unicode.org/copyright.html /* ********************************************************************** * Copyright (c) 2002-2014, International Business Machines Corporation * and others. All Rights Reserved. ********************************************************************** * Date Name Description * 01/14/2002 aliu Creation. ********************************************************************** */ #include "unicode/utypes.h" #if !UCONFIG_NO_TRANSLITERATION #include "tridpars.h" #include "hash.h" #include "mutex.h" #include "transreg.h" #include "uassert.h" #include "ucln_in.h" #include "unicode/parsepos.h" #include "unicode/translit.h" #include "unicode/uchar.h" #include "unicode/uniset.h" #include "unicode/unistr.h" #include "unicode/utrans.h" #include "util.h" #include "uvector.h" U_NAMESPACE_BEGIN static const char16_t ID_DELIM = 0x003B; // ; static const char16_t TARGET_SEP = 0x002D; // - static const char16_t VARIANT_SEP = 0x002F; // / static const char16_t OPEN_REV = 0x0028; // ( static const char16_t CLOSE_REV = 0x0029; // ) //static const char16_t EMPTY[] = {0}; // "" static const char16_t ANY[] = {65,110,121,0}; // "Any" static const char16_t ANY_NULL[] = {65,110,121,45,78,117,108,108,0}; // "Any-Null" static const int32_t FORWARD = UTRANS_FORWARD; static const int32_t REVERSE = UTRANS_REVERSE; static Hashtable* SPECIAL_INVERSES = nullptr; static UInitOnce gSpecialInversesInitOnce {}; /** * The mutex controlling access to SPECIAL_INVERSES */ static UMutex LOCK; TransliteratorIDParser::Specs::Specs(const UnicodeString& s, const UnicodeString& t, const UnicodeString& v, UBool sawS, const UnicodeString& f) { source = s; target = t; variant = v; sawSource = sawS; filter = f; } TransliteratorIDParser::SingleID::SingleID(const UnicodeString& c, const UnicodeString& b, const UnicodeString& f) { canonID = c; basicID = b; filter = f; } TransliteratorIDParser::SingleID::SingleID(const UnicodeString& c, const UnicodeString& b) 
{ canonID = c; basicID = b; } Transliterator* TransliteratorIDParser::SingleID::createInstance() { Transliterator* t; if (basicID.length() == 0) { t = createBasicInstance(UnicodeString(true, ANY_NULL, 8), &canonID); } else { t = createBasicInstance(basicID, &canonID); } if (t != nullptr) { if (filter.length() != 0) { UErrorCode ec = U_ZERO_ERROR; UnicodeSet *set = new UnicodeSet(filter, ec); if (U_FAILURE(ec)) { delete set; } else { t->adoptFilter(set); } } } return t; } /** * Parse a single ID, that is, an ID of the general form * "[f1] s1-t1/v1 ([f2] s2-t3/v2)", with the parenthesized element * optional, the filters optional, and the variants optional. * @param id the id to be parsed * @param pos INPUT-OUTPUT parameter. On input, the position of * the first character to parse. On output, the position after * the last character parsed. * @param dir the direction. If the direction is REVERSE then the * SingleID is constructed for the reverse direction. * @return a SingleID object or nullptr */ TransliteratorIDParser::SingleID* TransliteratorIDParser::parseSingleID(const UnicodeString& id, int32_t& pos, int32_t dir, UErrorCode& status) { int32_t start = pos; // The ID will be of the form A, A(), A(B), or (B), where // A and B are filter IDs. Specs* specsA = nullptr; Specs* specsB = nullptr; UBool sawParen = false; // On the first pass, look for (B) or (). If this fails, then // on the second pass, look for A, A(B), or A(). 
for (int32_t pass=1; pass<=2; ++pass) { if (pass == 2) { specsA = parseFilterID(id, pos, true); if (specsA == nullptr) { pos = start; return nullptr; } } if (ICU_Utility::parseChar(id, pos, OPEN_REV)) { sawParen = true; if (!ICU_Utility::parseChar(id, pos, CLOSE_REV)) { specsB = parseFilterID(id, pos, true); // Must close with a ')' if (specsB == nullptr || !ICU_Utility::parseChar(id, pos, CLOSE_REV)) { delete specsA; pos = start; return nullptr; } } break; } } // Assemble return results SingleID* single; if (sawParen) { if (dir == FORWARD) { SingleID* b = specsToID(specsB, FORWARD); single = specsToID(specsA, FORWARD); // Null pointers check if (b == nullptr || single == nullptr) { delete b; delete single; status = U_MEMORY_ALLOCATION_ERROR; return nullptr; } single->canonID.append(OPEN_REV) .append(b->canonID).append(CLOSE_REV); if (specsA != nullptr) { single->filter = specsA->filter; } delete b; } else { SingleID* a = specsToID(specsA, FORWARD); single = specsToID(specsB, FORWARD); // Check for null pointer. if (a == nullptr || single == nullptr) { delete a; delete single; status = U_MEMORY_ALLOCATION_ERROR; return nullptr; } single->canonID.append(OPEN_REV) .append(a->canonID).append(CLOSE_REV); if (specsB != nullptr) { single->filter = specsB->filter; } delete a; } } else { // assert(specsA != nullptr); if (dir == FORWARD) { single = specsToID(specsA, FORWARD); } else { single = specsToSpecialInverse(*specsA, status); if (single == nullptr) { single = specsToID(specsA, REVERSE); } } // Check for nullptr pointer if (single == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; return nullptr; } single->filter = specsA->filter; } delete specsA; delete specsB; return single; } /** * Parse a filter ID, that is, an ID of the general form * "[f1] s1-t1/v1", with the filters optional, and the variants optional. * @param id the id to be parsed * @param pos INPUT-OUTPUT parameter. On input, the position of * the first character to parse. 
On output, the position after * the last character parsed. * @return a SingleID object or null if the parse fails */ TransliteratorIDParser::SingleID* TransliteratorIDParser::parseFilterID(const UnicodeString& id, int32_t& pos) { int32_t start = pos; Specs* specs = parseFilterID(id, pos, true); if (specs == nullptr) { pos = start; return nullptr; } // Assemble return results SingleID* single = specsToID(specs, FORWARD); if (single != nullptr) { single->filter = specs->filter; } delete specs; return single; } /** * Parse a global filter of the form "[f]" or "([f])", depending * on 'withParens'. * @param id the pattern the parse * @param pos INPUT-OUTPUT parameter. On input, the position of * the first character to parse. On output, the position after * the last character parsed. * @param dir the direction. * @param withParens INPUT-OUTPUT parameter. On entry, if * withParens is 0, then parens are disallowed. If it is 1, * then parens are requires. If it is -1, then parens are * optional, and the return result will be set to 0 or 1. * @param canonID OUTPUT parameter. The pattern for the filter * added to the canonID, either at the end, if dir is FORWARD, or * at the start, if dir is REVERSE. The pattern will be enclosed * in parentheses if appropriate, and will be suffixed with an * ID_DELIM character. May be nullptr. * @return a UnicodeSet object or nullptr. A non-nullptr results * indicates a successful parse, regardless of whether the filter * applies to the given direction. The caller should discard it * if withParens != (dir == REVERSE). */ UnicodeSet* TransliteratorIDParser::parseGlobalFilter(const UnicodeString& id, int32_t& pos, int32_t dir, int32_t& withParens, UnicodeString* canonID) { UnicodeSet* filter = nullptr; int32_t start = pos; if (withParens == -1) { withParens = ICU_Utility::parseChar(id, pos, OPEN_REV) ? 
1 : 0; } else if (withParens == 1) { if (!ICU_Utility::parseChar(id, pos, OPEN_REV)) { pos = start; return nullptr; } } ICU_Utility::skipWhitespace(id, pos, true); if (UnicodeSet::resemblesPattern(id, pos)) { ParsePosition ppos(pos); UErrorCode ec = U_ZERO_ERROR; filter = new UnicodeSet(id, ppos, USET_IGNORE_SPACE, nullptr, ec); /* test for nullptr */ if (filter == 0) { pos = start; return 0; } if (U_FAILURE(ec)) { delete filter; pos = start; return nullptr; } UnicodeString pattern; id.extractBetween(pos, ppos.getIndex(), pattern); pos = ppos.getIndex(); if (withParens == 1 && !ICU_Utility::parseChar(id, pos, CLOSE_REV)) { delete filter; pos = start; return nullptr; } // In the forward direction, append the pattern to the // canonID. In the reverse, insert it at zero, and invert // the presence of parens ("A" <-> "(A)"). if (canonID != nullptr) { if (dir == FORWARD) { if (withParens == 1) { pattern.insert(0, OPEN_REV); pattern.append(CLOSE_REV); } canonID->append(pattern).append(ID_DELIM); } else { if (withParens == 0) { pattern.insert(0, OPEN_REV); pattern.append(CLOSE_REV); } canonID->insert(0, pattern); canonID->insert(pattern.length(), ID_DELIM); } } } return filter; } U_CDECL_BEGIN static void U_CALLCONV _deleteSingleID(void* obj) { delete (TransliteratorIDParser::SingleID*) obj; } static void U_CALLCONV _deleteTransliteratorTrIDPars(void* obj) { delete (Transliterator*) obj; } U_CDECL_END /** * Parse a compound ID, consisting of an optional forward global * filter, a separator, one or more single IDs delimited by * separators, an an optional reverse global filter. The * separator is a semicolon. The global filters are UnicodeSet * patterns. The reverse global filter must be enclosed in * parentheses. * @param id the pattern the parse * @param dir the direction. * @param canonID OUTPUT parameter that receives the canonical ID, * consisting of canonical IDs for all elements, as returned by * parseSingleID(), separated by semicolons. 
Previous contents * are discarded. * @param list OUTPUT parameter that receives a list of SingleID * objects representing the parsed IDs. Previous contents are * discarded. * @param globalFilter OUTPUT parameter that receives a pointer to * a newly created global filter for this ID in this direction, or * nullptr if there is none. * @return true if the parse succeeds, that is, if the entire * id is consumed without syntax error. */ UBool TransliteratorIDParser::parseCompoundID(const UnicodeString& id, int32_t dir, UnicodeString& canonID, UVector& list, UnicodeSet*& globalFilter) { UErrorCode ec = U_ZERO_ERROR; int32_t i; int32_t pos = 0; int32_t withParens = 1; list.removeAllElements(); UObjectDeleter *save = list.setDeleter(_deleteSingleID); UnicodeSet* filter; globalFilter = nullptr; canonID.truncate(0); // Parse leading global filter, if any withParens = 0; // parens disallowed filter = parseGlobalFilter(id, pos, dir, withParens, &canonID); if (filter != nullptr) { if (!ICU_Utility::parseChar(id, pos, ID_DELIM)) { // Not a global filter; backup and resume canonID.truncate(0); pos = 0; } if (dir == FORWARD) { globalFilter = filter; } else { delete filter; } filter = nullptr; } UBool sawDelimiter = true; for (;;) { SingleID* single = parseSingleID(id, pos, dir, ec); if (single == nullptr) { break; } if (dir == FORWARD) { list.adoptElement(single, ec); } else { list.insertElementAt(single, 0, ec); } if (U_FAILURE(ec)) { goto FAIL; } if (!ICU_Utility::parseChar(id, pos, ID_DELIM)) { sawDelimiter = false; break; } } if (list.size() == 0) { goto FAIL; } // Construct canonical ID for (i=0; icanonID); if (i != (list.size()-1)) { canonID.append(ID_DELIM); } } // Parse trailing global filter, if any, and only if we saw // a trailing delimiter after the IDs. 
if (sawDelimiter) { withParens = 1; // parens required filter = parseGlobalFilter(id, pos, dir, withParens, &canonID); if (filter != nullptr) { // Don't require trailing ';', but parse it if present ICU_Utility::parseChar(id, pos, ID_DELIM); if (dir == REVERSE) { globalFilter = filter; } else { delete filter; } filter = nullptr; } } // Trailing unparsed text is a syntax error ICU_Utility::skipWhitespace(id, pos, true); if (pos != id.length()) { goto FAIL; } list.setDeleter(save); return true; FAIL: list.removeAllElements(); list.setDeleter(save); delete globalFilter; globalFilter = nullptr; return false; } /** * Convert the elements of the 'list' vector, which are SingleID * objects, into actual Transliterator objects. In the course of * this, some (or all) entries may be removed. If all entries * are removed, the nullptr transliterator will be added. * * Delete entries with empty basicIDs; these are generated by * elements like "(A)" in the forward direction, or "A()" in * the reverse. THIS MAY RESULT IN AN EMPTY VECTOR. Convert * SingleID entries to actual transliterators. * * @param list vector of SingleID objects. On exit, vector * of one or more Transliterators. * @return new value of insertIndex. The index will shift if * there are empty items, like "(Lower)", with indices less than * insertIndex. 
*/ void TransliteratorIDParser::instantiateList(UVector& list, UErrorCode& ec) { UVector tlist(ec); if (U_FAILURE(ec)) { goto RETURN; } tlist.setDeleter(_deleteTransliteratorTrIDPars); Transliterator* t; int32_t i; for (i=0; i<=list.size(); ++i) { // [sic]: i<=list.size() // We run the loop too long by one, so we can // do an insert after the last element if (i==list.size()) { break; } SingleID* single = (SingleID*) list.elementAt(i); if (single->basicID.length() != 0) { t = single->createInstance(); if (t == nullptr) { ec = U_INVALID_ID; goto RETURN; } tlist.adoptElement(t, ec); if (U_FAILURE(ec)) { goto RETURN; } } } // An empty list is equivalent to a nullptr transliterator. if (tlist.size() == 0) { t = createBasicInstance(UnicodeString(true, ANY_NULL, 8), nullptr); if (t == nullptr) { // Should never happen ec = U_INTERNAL_TRANSLITERATOR_ERROR; } tlist.adoptElement(t, ec); } RETURN: UObjectDeleter *save = list.setDeleter(_deleteSingleID); list.removeAllElements(); if (U_SUCCESS(ec)) { list.setDeleter(_deleteTransliteratorTrIDPars); while (tlist.size() > 0) { t = (Transliterator*) tlist.orphanElementAt(0); list.adoptElement(t, ec); if (U_FAILURE(ec)) { list.removeAllElements(); break; } } } list.setDeleter(save); } /** * Parse an ID into pieces. Take IDs of the form T, T/V, S-T, * S-T/V, or S/V-T. If the source is missing, return a source of * ANY. * @param id the id string, in any of several forms * @return an array of 4 strings: source, target, variant, and * isSourcePresent. If the source is not present, ANY will be * given as the source, and isSourcePresent will be nullptr. Otherwise * isSourcePresent will be non-nullptr. The target may be empty if the * id is not well-formed. The variant may be empty. 
*/ void TransliteratorIDParser::IDtoSTV(const UnicodeString& id, UnicodeString& source, UnicodeString& target, UnicodeString& variant, UBool& isSourcePresent) { source.setTo(ANY, 3); target.truncate(0); variant.truncate(0); int32_t sep = id.indexOf(TARGET_SEP); int32_t var = id.indexOf(VARIANT_SEP); if (var < 0) { var = id.length(); } isSourcePresent = false; if (sep < 0) { // Form: T/V or T (or /V) id.extractBetween(0, var, target); id.extractBetween(var, id.length(), variant); } else if (sep < var) { // Form: S-T/V or S-T (or -T/V or -T) if (sep > 0) { id.extractBetween(0, sep, source); isSourcePresent = true; } id.extractBetween(++sep, var, target); id.extractBetween(var, id.length(), variant); } else { // Form: (S/V-T or /V-T) if (var > 0) { id.extractBetween(0, var, source); isSourcePresent = true; } id.extractBetween(var, sep++, variant); id.extractBetween(sep, id.length(), target); } if (variant.length() > 0) { variant.remove(0, 1); } } /** * Given source, target, and variant strings, concatenate them into a * full ID. If the source is empty, then "Any" will be used for the * source, so the ID will always be of the form s-t/v or s-t. */ void TransliteratorIDParser::STVtoID(const UnicodeString& source, const UnicodeString& target, const UnicodeString& variant, UnicodeString& id) { id = source; if (id.length() == 0) { id.setTo(ANY, 3); } id.append(TARGET_SEP).append(target); if (variant.length() != 0) { id.append(VARIANT_SEP).append(variant); } // NUL-terminate the ID string for getTerminatedBuffer. // This prevents valgrind and Purify warnings. id.append((char16_t)0); id.truncate(id.length()-1); } /** * Register two targets as being inverses of one another. For * example, calling registerSpecialInverse("NFC", "NFD", true) causes * Transliterator to form the following inverse relationships: * *

NFC => NFD
 * Any-NFC => Any-NFD
 * NFD => NFC
 * Any-NFD => Any-NFC
* * (Without the special inverse registration, the inverse of NFC * would be NFC-Any.) Note that NFD is shorthand for Any-NFD, but * that the presence or absence of "Any-" is preserved. * *

The relationship is symmetrical; registering (a, b) is * equivalent to registering (b, a). * *

The relevant IDs must still be registered separately as * factories or classes. * *

Only the targets are specified. Special inverses always * have the form Any-Target1 <=> Any-Target2. The target should * have canonical casing (the casing desired to be produced when * an inverse is formed) and should contain no whitespace or other * extraneous characters. * * @param target the target against which to register the inverse * @param inverseTarget the inverse of target, that is * Any-target.getInverse() => Any-inverseTarget * @param bidirectional if true, register the reverse relation * as well, that is, Any-inverseTarget.getInverse() => Any-target */ void TransliteratorIDParser::registerSpecialInverse(const UnicodeString& target, const UnicodeString& inverseTarget, UBool bidirectional, UErrorCode &status) { umtx_initOnce(gSpecialInversesInitOnce, init, status); if (U_FAILURE(status)) { return; } // If target == inverseTarget then force bidirectional => false if (bidirectional && 0==target.caseCompare(inverseTarget, U_FOLD_CASE_DEFAULT)) { bidirectional = false; } Mutex lock(&LOCK); UnicodeString *tempus = new UnicodeString(inverseTarget); // Used for null pointer check before usage. if (tempus == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; return; } SPECIAL_INVERSES->put(target, tempus, status); if (bidirectional) { tempus = new UnicodeString(target); if (tempus == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; return; } SPECIAL_INVERSES->put(inverseTarget, tempus, status); } } //---------------------------------------------------------------- // Private implementation //---------------------------------------------------------------- /** * Parse an ID into component pieces. Take IDs of the form T, * T/V, S-T, S-T/V, or S/V-T. If the source is missing, return a * source of ANY. * @param id the id string, in any of several forms * @param pos INPUT-OUTPUT parameter. On input, pos is the * offset of the first character to parse in id. On output, * pos is the offset after the last parsed character. If the * parse failed, pos will be unchanged. 
* @param allowFilter if true, a UnicodeSet pattern is allowed
 * at any location between specs or delimiters, and is returned
 * in the filter field of the Specs object.
 * @return a Specs object, or nullptr if the parse failed.  If
 * neither source nor target was seen in the parsed id, then the
 * parse fails.  If allowFilter is true, then the parsed filter
 * pattern is returned in the Specs object, otherwise the returned
 * filter reference is nullptr.  If the parse fails for any reason
 * nullptr is returned.
 */
TransliteratorIDParser::Specs*
TransliteratorIDParser::parseFilterID(const UnicodeString& id, int32_t& pos,
                                      UBool allowFilter) {
    UnicodeString first;
    UnicodeString source;
    UnicodeString target;
    UnicodeString variant;
    UnicodeString filter;
    char16_t delimiter = 0;
    int32_t specCount = 0;
    int32_t start = pos;  // remembered so pos can be restored on failure

    // This loop parses one of the following things with each
    // pass: a filter, a delimiter character (either '-' or '/'),
    // or a spec (source, target, or variant).
    for (;;) {
        ICU_Utility::skipWhitespace(id, pos, true);
        if (pos == id.length()) {
            break;
        }

        // Parse filters (at most one: only while 'filter' is still empty)
        if (allowFilter && filter.length() == 0 &&
            UnicodeSet::resemblesPattern(id, pos)) {

            ParsePosition ppos(pos);
            UErrorCode ec = U_ZERO_ERROR;
            // The set is constructed only to validate the pattern and to
            // find where it ends; the pattern text itself is what is kept.
            UnicodeSet set(id, ppos, USET_IGNORE_SPACE, nullptr, ec);
            if (U_FAILURE(ec)) {
                pos = start;
                return nullptr;
            }
            id.extractBetween(pos, ppos.getIndex(), filter);
            pos = ppos.getIndex();
            continue;
        }

        if (delimiter == 0) {
            char16_t c = id.charAt(pos);
            // A separator is accepted only if the spec it introduces
            // has not been seen yet.
            if ((c == TARGET_SEP && target.length() == 0) ||
                (c == VARIANT_SEP && variant.length() == 0)) {
                delimiter = c;
                ++pos;
                continue;
            }
        }

        // We are about to try to parse a spec with no delimiter
        // when we can no longer do so (we can only do so at the
        // start); break.
        if (delimiter == 0 && specCount > 0) {
            break;
        }

        UnicodeString spec = ICU_Utility::parseUnicodeIdentifier(id, pos);
        if (spec.length() == 0) {
            // Note that if there was a trailing delimiter, we
            // consume it.  So Foo-, Foo/, Foo-Bar/, and Foo/Bar-
            // are legal.
            break;
        }

        // Route the spec according to the delimiter that preceded it.
        switch (delimiter) {
        case 0:
            first = spec;
            break;
        case TARGET_SEP:
            target = spec;
            break;
        case VARIANT_SEP:
            variant = spec;
            break;
        }
        ++specCount;
        delimiter = 0;
    }

    // A spec with no prior character is either source or target,
    // depending on whether an explicit "-target" was seen.
    if (first.length() != 0) {
        if (target.length() == 0) {
            target = first;
        } else {
            source = first;
        }
    }

    // Must have either source or target
    if (source.length() == 0 && target.length() == 0) {
        pos = start;
        return nullptr;
    }

    // Empty source or target defaults to ANY
    UBool sawSource = true;
    if (source.length() == 0) {
        source.setTo(ANY, 3);
        sawSource = false;
    }
    if (target.length() == 0) {
        target.setTo(ANY, 3);
    }

    return new Specs(source, target, variant, sawSource, filter);
}

/**
 * Given a Specs object, convert it to a SingleID object.  The
 * Specs object is a more unprocessed parse result.  The SingleID
 * object contains information about canonical and basic IDs.
 * A nullptr specs yields a SingleID with empty canonical and basic IDs.
 * @param specs the parsed specs, or nullptr
 * @param dir FORWARD to emit source-target; any other value emits
 * target-source
 * @return a SingleID; never returns nullptr.  Returned object always
 * has 'filter' field of nullptr.
 */
TransliteratorIDParser::SingleID*
TransliteratorIDParser::specsToID(const Specs* specs, int32_t dir) {
    UnicodeString canonID;
    UnicodeString basicID;
    UnicodeString basicPrefix;
    if (specs != nullptr) {
        UnicodeString buf;
        if (dir == FORWARD) {
            if (specs->sawSource) {
                buf.append(specs->source).append(TARGET_SEP);
            } else {
                // Source was implied: it appears in the basic ID only,
                // not in the canonical ID.
                basicPrefix = specs->source;
                basicPrefix.append(TARGET_SEP);
            }
            buf.append(specs->target);
        } else {
            buf.append(specs->target).append(TARGET_SEP).append(specs->source);
        }
        if (specs->variant.length() != 0) {
            buf.append(VARIANT_SEP).append(specs->variant);
        }
        basicID = basicPrefix;
        basicID.append(buf);
        // The filter is prepended after basicID was taken, so it ends up
        // in the canonical ID only.
        if (specs->filter.length() != 0) {
            buf.insert(0, specs->filter);
        }
        canonID = buf;
    }
    return new SingleID(canonID, basicID);
}

/**
 * Given a Specs object, return a SingleID representing the
 * special inverse of that ID.  If there is no special inverse
 * then return nullptr.
 * @param specs the parsed specs; only IDs whose source is ANY
 * (compared case-insensitively) can have a special inverse
 * @param status input-output error code, set if the lazy
 * initialization of the special-inverse table fails
 * @return a SingleID or nullptr.  Returned object always has
 * 'filter' field of nullptr.
 */
TransliteratorIDParser::SingleID*
TransliteratorIDParser::specsToSpecialInverse(const Specs& specs, UErrorCode &status) {
    if (0!=specs.source.caseCompare(ANY, 3, U_FOLD_CASE_DEFAULT)) {
        return nullptr;
    }
    umtx_initOnce(gSpecialInversesInitOnce, init, status);
    if (U_FAILURE(status)) {
        return nullptr;
    }

    UnicodeString* inverseTarget;

    // The table lookup is done while holding LOCK.
    umtx_lock(&LOCK);
    inverseTarget = (UnicodeString*) SPECIAL_INVERSES->get(specs.target);
    umtx_unlock(&LOCK);

    if (inverseTarget != nullptr) {
        // If the original ID contained "Any-" then make the
        // special inverse "Any-Foo"; otherwise make it "Foo".
        // So "Any-NFC" => "Any-NFD" but "NFC" => "NFD".
        UnicodeString buf;
        if (specs.filter.length() != 0) {
            buf.append(specs.filter);
        }
        if (specs.sawSource) {
            buf.append(ANY, 3).append(TARGET_SEP);
        }
        buf.append(*inverseTarget);

        // The basic ID always carries the explicit "Any-" prefix.
        UnicodeString basicID(true, ANY, 3);
        basicID.append(TARGET_SEP).append(*inverseTarget);

        if (specs.variant.length() != 0) {
            buf.append(VARIANT_SEP).append(specs.variant);
            basicID.append(VARIANT_SEP).append(specs.variant);
        }
        return new SingleID(buf, basicID);
    }
    return nullptr;
}

/**
 * Glue method to get around access problems in C++.  This would
 * ideally be inline but we want to avoid a circular header
 * dependency.
 */
Transliterator* TransliteratorIDParser::createBasicInstance(const UnicodeString& id, const UnicodeString* canonID) {
    return Transliterator::createBasicInstance(id, canonID);
}

/**
 * Initialize static memory. Called through umtx_initOnce only.
 * Registers the transliterator cleanup hook and creates the
 * SPECIAL_INVERSES table, which deletes its values via uprv_deleteUObject.
 */
void U_CALLCONV TransliteratorIDParser::init(UErrorCode &status) {
    U_ASSERT(SPECIAL_INVERSES == nullptr);
    ucln_i18n_registerCleanup(UCLN_I18N_TRANSLITERATOR, utrans_transliterator_cleanup);

    SPECIAL_INVERSES = new Hashtable(true, status);
    if (SPECIAL_INVERSES == nullptr) {
        status = U_MEMORY_ALLOCATION_ERROR;
        return;
    }
    SPECIAL_INVERSES->setValueDeleter(uprv_deleteUObject);
}

/**
 * Free static memory.
*/ void TransliteratorIDParser::cleanup() { if (SPECIAL_INVERSES) { delete SPECIAL_INVERSES; SPECIAL_INVERSES = nullptr; } gSpecialInversesInitOnce.reset(); } U_NAMESPACE_END #endif /* #if !UCONFIG_NO_TRANSLITERATION */ //eof stringi/src/icu74/i18n/vzone.cpp0000644000176200001440000001314114700200761016074 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ******************************************************************************* * Copyright (C) 2009-2011, International Business Machines Corporation and * others. All Rights Reserved. ******************************************************************************* */ /** * \file * \brief C API: VTimeZone classes */ #include "unicode/utypes.h" #if !UCONFIG_NO_FORMATTING #include "unicode/uobject.h" #include "vzone.h" #include "unicode/vtzone.h" #include "cmemory.h" #include "unicode/ustring.h" #include "unicode/parsepos.h" U_NAMESPACE_USE U_CAPI VZone* U_EXPORT2 vzone_openID(const char16_t* ID, int32_t idLength){ UnicodeString s(idLength==-1, ID, idLength); return (VZone*) (VTimeZone::createVTimeZoneByID(s)); } U_CAPI VZone* U_EXPORT2 vzone_openData(const char16_t* vtzdata, int32_t vtzdataLength, UErrorCode& status) { UnicodeString s(vtzdataLength==-1, vtzdata, vtzdataLength); return (VZone*) (VTimeZone::createVTimeZone(s,status)); } U_CAPI void U_EXPORT2 vzone_close(VZone* zone) { delete (VTimeZone*)zone; } U_CAPI VZone* U_EXPORT2 vzone_clone(const VZone *zone) { return (VZone*) (((VTimeZone*)zone)->VTimeZone::clone()); } U_CAPI UBool U_EXPORT2 vzone_equals(const VZone* zone1, const VZone* zone2) { return *(const VTimeZone*)zone1 == *(const VTimeZone*)zone2; } U_CAPI UBool U_EXPORT2 vzone_getTZURL(VZone* zone, char16_t* & url, int32_t & urlLength) { UnicodeString s; UBool b = ((VTimeZone*)zone)->VTimeZone::getTZURL(s); urlLength = s.length(); memcpy(url,s.getBuffer(),urlLength); return b; } U_CAPI void U_EXPORT2 
vzone_setTZURL(VZone* zone, char16_t* url, int32_t urlLength) { UnicodeString s(urlLength==-1, url, urlLength); ((VTimeZone*)zone)->VTimeZone::setTZURL(s); } U_CAPI UBool U_EXPORT2 vzone_getLastModified(VZone* zone, UDate& lastModified) { return ((VTimeZone*)zone)->VTimeZone::getLastModified(lastModified); } U_CAPI void U_EXPORT2 vzone_setLastModified(VZone* zone, UDate lastModified) { return ((VTimeZone*)zone)->VTimeZone::setLastModified(lastModified); } U_CAPI void U_EXPORT2 vzone_write(VZone* zone, char16_t* & result, int32_t & resultLength, UErrorCode& status) { UnicodeString s; ((VTimeZone*)zone)->VTimeZone::write(s, status); resultLength = s.length(); result = (char16_t*)uprv_malloc(resultLength); memcpy(result,s.getBuffer(),resultLength); return; } U_CAPI void U_EXPORT2 vzone_writeFromStart(VZone* zone, UDate start, char16_t* & result, int32_t & resultLength, UErrorCode& status) { UnicodeString s; ((VTimeZone*)zone)->VTimeZone::write(start, s, status); resultLength = s.length(); result = (char16_t*)uprv_malloc(resultLength); memcpy(result,s.getBuffer(),resultLength); return; } U_CAPI void U_EXPORT2 vzone_writeSimple(VZone* zone, UDate time, char16_t* & result, int32_t & resultLength, UErrorCode& status) { UnicodeString s; ((VTimeZone*)zone)->VTimeZone::writeSimple(time, s, status); resultLength = s.length(); result = (char16_t*)uprv_malloc(resultLength); memcpy(result,s.getBuffer(),resultLength); return; } U_CAPI int32_t U_EXPORT2 vzone_getOffset(VZone* zone, uint8_t era, int32_t year, int32_t month, int32_t day, uint8_t dayOfWeek, int32_t millis, UErrorCode& status) { return ((VTimeZone*)zone)->VTimeZone::getOffset(era, year, month, day, dayOfWeek, millis, status); } U_CAPI int32_t U_EXPORT2 vzone_getOffset2(VZone* zone, uint8_t era, int32_t year, int32_t month, int32_t day, uint8_t dayOfWeek, int32_t millis, int32_t monthLength, UErrorCode& status) { return ((VTimeZone*)zone)->VTimeZone::getOffset(era, year, month, day, dayOfWeek, millis, monthLength, 
status); } U_CAPI void U_EXPORT2 vzone_getOffset3(VZone* zone, UDate date, UBool local, int32_t& rawOffset, int32_t& dstOffset, UErrorCode& ec) { return ((VTimeZone*)zone)->VTimeZone::getOffset(date, local, rawOffset, dstOffset, ec); } U_CAPI void U_EXPORT2 vzone_setRawOffset(VZone* zone, int32_t offsetMillis) { return ((VTimeZone*)zone)->VTimeZone::setRawOffset(offsetMillis); } U_CAPI int32_t U_EXPORT2 vzone_getRawOffset(VZone* zone) { return ((VTimeZone*)zone)->VTimeZone::getRawOffset(); } U_CAPI UBool U_EXPORT2 vzone_useDaylightTime(VZone* zone) { return ((VTimeZone*)zone)->VTimeZone::useDaylightTime(); } U_CAPI UBool U_EXPORT2 vzone_inDaylightTime(VZone* zone, UDate date, UErrorCode& status) { return ((VTimeZone*)zone)->VTimeZone::inDaylightTime(date, status); } U_CAPI UBool U_EXPORT2 vzone_hasSameRules(VZone* zone, const VZone* other) { return ((VTimeZone*)zone)->VTimeZone::hasSameRules(*(VTimeZone*)other); } U_CAPI UBool U_EXPORT2 vzone_getNextTransition(VZone* zone, UDate base, UBool inclusive, ZTrans* result) { return ((VTimeZone*)zone)->VTimeZone::getNextTransition(base, inclusive, *(TimeZoneTransition*)result); } U_CAPI UBool U_EXPORT2 vzone_getPreviousTransition(VZone* zone, UDate base, UBool inclusive, ZTrans* result) { return ((VTimeZone*)zone)->VTimeZone::getPreviousTransition(base, inclusive, *(TimeZoneTransition*)result); } U_CAPI int32_t U_EXPORT2 vzone_countTransitionRules(VZone* zone, UErrorCode& status) { return ((VTimeZone*)zone)->VTimeZone::countTransitionRules(status); } U_CAPI UClassID U_EXPORT2 vzone_getStaticClassID(VZone* zone) { return ((VTimeZone*)zone)->VTimeZone::getStaticClassID(); } U_CAPI UClassID U_EXPORT2 vzone_getDynamicClassID(VZone* zone) { return ((VTimeZone*)zone)->VTimeZone::getDynamicClassID(); } #endif stringi/src/icu74/i18n/timezone.cpp0000644000176200001440000016003114700200761016566 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others. 
// License & terms of use: http://www.unicode.org/copyright.html /* ******************************************************************************* * Copyright (C) 1997-2016, International Business Machines Corporation and * others. All Rights Reserved. ******************************************************************************* * * File TIMEZONE.CPP * * Modification History: * * Date Name Description * 12/05/96 clhuang Creation. * 04/21/97 aliu General clean-up and bug fixing. * 05/08/97 aliu Fixed Hashtable code per code review. * 07/09/97 helena Changed createInstance to createDefault. * 07/29/97 aliu Updated with all-new list of 96 UNIX-derived * TimeZones. Changed mechanism to load from static * array rather than resource bundle. * 07/07/1998 srl Bugfixes from the Java side: UTC GMT CAT NST * Added getDisplayName API * going to add custom parsing. * * ISSUES: * - should getDisplayName cache something? * - should custom time zones be cached? [probably] * 08/10/98 stephen Brought getDisplayName() API in-line w/ conventions * 08/19/98 stephen Changed createTimeZone() to never return 0 * 09/02/98 stephen Added getOffset(monthLen) and hasSameRules() * 09/15/98 stephen Added getStaticClassID() * 02/22/99 stephen Removed character literals for EBCDIC safety * 05/04/99 stephen Changed initDefault() for Mutex issues * 07/12/99 helena HPUX 11 CC Port. * 12/03/99 aliu Moved data out of static table into icudata.dll. * Substantial rewrite of zone lookup, default zone, and * available IDs code. Misc. cleanup. *********************************************************************************/ #include "utypeinfo.h" // for 'typeid' to work #include "unicode/utypes.h" #include "unicode/ustring.h" #include "uassert.h" #include "ustr_imp.h" #ifdef U_DEBUG_TZ # include # include "uresimp.h" // for debugging static void debug_tz_loc(const char *f, int32_t l) { fprintf(stderr, "%s:%d: ", f, l); } static void debug_tz_msg(const char *pat, ...) 
{ va_list ap; va_start(ap, pat); vfprintf(stderr, pat, ap); fflush(stderr); } static char gStrBuf[256]; #define U_DEBUG_TZ_STR(x) u_austrncpy(gStrBuf,x,sizeof(gStrBuf)-1) // must use double parens, i.e.: U_DEBUG_TZ_MSG(("four is: %d",4)); #define U_DEBUG_TZ_MSG(x) {debug_tz_loc(__FILE__,__LINE__);debug_tz_msg x;} #else #define U_DEBUG_TZ_MSG(x) #endif #if !UCONFIG_NO_FORMATTING #include "unicode/simpletz.h" #include "unicode/calendar.h" #include "unicode/gregocal.h" #include "unicode/ures.h" #include "unicode/tzfmt.h" #include "unicode/numfmt.h" #include "gregoimp.h" #include "uresimp.h" // struct UResourceBundle #include "olsontz.h" #include "mutex.h" #include "unicode/udata.h" #include "ucln_in.h" #include "cstring.h" #include "cmemory.h" #include "unicode/strenum.h" #include "uassert.h" #include "zonemeta.h" #define kZONEINFO "zoneinfo64" #define kREGIONS "Regions" #define kZONES "Zones" #define kRULES "Rules" #define kNAMES "Names" #define kTZVERSION "TZVersion" #define kLINKS "links" #define kMAX_CUSTOM_HOUR 23 #define kMAX_CUSTOM_MIN 59 #define kMAX_CUSTOM_SEC 59 #define MINUS 0x002D #define PLUS 0x002B #define ZERO_DIGIT 0x0030 #define COLON 0x003A // Static data and constants static const char16_t WORLD[] = {0x30, 0x30, 0x31, 0x00}; /* "001" */ static const char16_t GMT_ID[] = {0x47, 0x4D, 0x54, 0x00}; /* "GMT" */ static const char16_t UNKNOWN_ZONE_ID[] = {0x45, 0x74, 0x63, 0x2F, 0x55, 0x6E, 0x6B, 0x6E, 0x6F, 0x77, 0x6E, 0x00}; /* "Etc/Unknown" */ static const int32_t GMT_ID_LENGTH = 3; static const int32_t UNKNOWN_ZONE_ID_LENGTH = 11; static icu::TimeZone* DEFAULT_ZONE = nullptr; static icu::UInitOnce gDefaultZoneInitOnce {}; alignas(icu::SimpleTimeZone) static char gRawGMT[sizeof(icu::SimpleTimeZone)]; alignas(icu::SimpleTimeZone) static char gRawUNKNOWN[sizeof(icu::SimpleTimeZone)]; static icu::UInitOnce gStaticZonesInitOnce {}; static UBool gStaticZonesInitialized = false; // Whether the static zones are initialized and ready to use. 
static char TZDATA_VERSION[16]; static icu::UInitOnce gTZDataVersionInitOnce {}; static int32_t* MAP_SYSTEM_ZONES = nullptr; static int32_t* MAP_CANONICAL_SYSTEM_ZONES = nullptr; static int32_t* MAP_CANONICAL_SYSTEM_LOCATION_ZONES = nullptr; static int32_t LEN_SYSTEM_ZONES = 0; static int32_t LEN_CANONICAL_SYSTEM_ZONES = 0; static int32_t LEN_CANONICAL_SYSTEM_LOCATION_ZONES = 0; static icu::UInitOnce gSystemZonesInitOnce {}; static icu::UInitOnce gCanonicalZonesInitOnce {}; static icu::UInitOnce gCanonicalLocationZonesInitOnce {}; U_CDECL_BEGIN static UBool U_CALLCONV timeZone_cleanup() { U_NAMESPACE_USE delete DEFAULT_ZONE; DEFAULT_ZONE = nullptr; gDefaultZoneInitOnce.reset(); if (gStaticZonesInitialized) { reinterpret_cast(gRawGMT)->~SimpleTimeZone(); reinterpret_cast(gRawUNKNOWN)->~SimpleTimeZone(); gStaticZonesInitialized = false; gStaticZonesInitOnce.reset(); } uprv_memset(TZDATA_VERSION, 0, sizeof(TZDATA_VERSION)); gTZDataVersionInitOnce.reset(); LEN_SYSTEM_ZONES = 0; uprv_free(MAP_SYSTEM_ZONES); MAP_SYSTEM_ZONES = 0; gSystemZonesInitOnce.reset(); LEN_CANONICAL_SYSTEM_ZONES = 0; uprv_free(MAP_CANONICAL_SYSTEM_ZONES); MAP_CANONICAL_SYSTEM_ZONES = 0; gCanonicalZonesInitOnce.reset(); LEN_CANONICAL_SYSTEM_LOCATION_ZONES = 0; uprv_free(MAP_CANONICAL_SYSTEM_LOCATION_ZONES); MAP_CANONICAL_SYSTEM_LOCATION_ZONES = 0; gCanonicalLocationZonesInitOnce.reset(); return true; } U_CDECL_END U_NAMESPACE_BEGIN static int32_t findInStringArray(UResourceBundle* array, const UnicodeString& id, UErrorCode &status) { UnicodeString copy; const char16_t *u; int32_t len; int32_t start = 0; int32_t limit = ures_getSize(array); int32_t mid; int32_t lastMid = INT32_MAX; if(U_FAILURE(status) || (limit < 1)) { return -1; } U_DEBUG_TZ_MSG(("fisa: Looking for %s, between %d and %d\n", U_DEBUG_TZ_STR(UnicodeString(id).getTerminatedBuffer()), start, limit)); for (;;) { mid = (int32_t)((start + limit) / 2); if (lastMid == mid) { /* Have we moved? 
*/ break; /* We haven't moved, and it wasn't found. */ } lastMid = mid; u = ures_getStringByIndex(array, mid, &len, &status); if (U_FAILURE(status)) { break; } U_DEBUG_TZ_MSG(("tz: compare to %s, %d .. [%d] .. %d\n", U_DEBUG_TZ_STR(u), start, mid, limit)); copy.setTo(true, u, len); int r = id.compare(copy); if(r==0) { U_DEBUG_TZ_MSG(("fisa: found at %d\n", mid)); return mid; } else if(r<0) { limit = mid; } else { start = mid; } } U_DEBUG_TZ_MSG(("fisa: not found\n")); return -1; } /** * Fetch a specific zone by name. Replaces the getByKey call. * @param top Top timezone resource * @param id Time zone ID * @param oldbundle Bundle for reuse (or nullptr). see 'ures_open()' * @return the zone's bundle if found, or undefined if error. Reuses oldbundle. */ static UResourceBundle* getZoneByName(const UResourceBundle* top, const UnicodeString& id, UResourceBundle *oldbundle, UErrorCode& status) { // load the Rules object UResourceBundle *tmp = ures_getByKey(top, kNAMES, nullptr, &status); // search for the string int32_t idx = findInStringArray(tmp, id, status); if((idx == -1) && U_SUCCESS(status)) { // not found status = U_MISSING_RESOURCE_ERROR; //ures_close(oldbundle); //oldbundle = nullptr; } else { U_DEBUG_TZ_MSG(("gzbn: oldbundle= size %d, type %d, %s\n", ures_getSize(tmp), ures_getType(tmp), u_errorName(status))); tmp = ures_getByKey(top, kZONES, tmp, &status); // get Zones object from top U_DEBUG_TZ_MSG(("gzbn: loaded ZONES, size %d, type %d, path %s %s\n", ures_getSize(tmp), ures_getType(tmp), ures_getPath(tmp), u_errorName(status))); oldbundle = ures_getByIndex(tmp, idx, oldbundle, &status); // get nth Zone object U_DEBUG_TZ_MSG(("gzbn: loaded z#%d, size %d, type %d, path %s, %s\n", idx, ures_getSize(oldbundle), ures_getType(oldbundle), ures_getPath(oldbundle), u_errorName(status))); } ures_close(tmp); if(U_FAILURE(status)) { //ures_close(oldbundle); return nullptr; } else { return oldbundle; } } UResourceBundle* TimeZone::loadRule(const UResourceBundle* top, 
const UnicodeString& ruleid, UResourceBundle* oldbundle, UErrorCode& status) { char key[64]; ruleid.extract(0, sizeof(key)-1, key, (int32_t)sizeof(key)-1, US_INV); U_DEBUG_TZ_MSG(("loadRule(%s)\n", key)); UResourceBundle *r = ures_getByKey(top, kRULES, oldbundle, &status); U_DEBUG_TZ_MSG(("loadRule(%s) -> kRULES [%s]\n", key, u_errorName(status))); r = ures_getByKey(r, key, r, &status); U_DEBUG_TZ_MSG(("loadRule(%s) -> item [%s]\n", key, u_errorName(status))); return r; } /** * Given an ID, open the appropriate resource for the given time zone. * Dereference aliases if necessary. * @param id zone id * @param res resource, which must be ready for use (initialized but not open) * @param ec input-output error code * @return top-level resource bundle */ static UResourceBundle* openOlsonResource(const UnicodeString& id, UResourceBundle& res, UErrorCode& ec) { #ifdef U_DEBUG_TZ char buf[128]; id.extract(0, sizeof(buf)-1, buf, sizeof(buf), ""); #endif UResourceBundle *top = ures_openDirect(0, kZONEINFO, &ec); U_DEBUG_TZ_MSG(("pre: res sz=%d\n", ures_getSize(&res))); /* &res = */ getZoneByName(top, id, &res, ec); // Dereference if this is an alias. Docs say result should be 1 // but it is 0 in 2.8 (?). 
U_DEBUG_TZ_MSG(("Loading zone '%s' (%s, size %d) - %s\n", buf, ures_getKey((UResourceBundle*)&res), ures_getSize(&res), u_errorName(ec))); if (ures_getType(&res) == URES_INT) { int32_t deref = ures_getInt(&res, &ec) + 0; U_DEBUG_TZ_MSG(("getInt: %s - type is %d\n", u_errorName(ec), ures_getType(&res))); UResourceBundle *ares = ures_getByKey(top, kZONES, nullptr, &ec); // dereference Zones section ures_getByIndex(ares, deref, &res, &ec); ures_close(ares); U_DEBUG_TZ_MSG(("alias to #%d (%s) - %s\n", deref, "??", u_errorName(ec))); } else { U_DEBUG_TZ_MSG(("not an alias - size %d\n", ures_getSize(&res))); } U_DEBUG_TZ_MSG(("%s - final status is %s\n", buf, u_errorName(ec))); return top; } // ------------------------------------- namespace { void U_CALLCONV initStaticTimeZones() { // Initialize _GMT independently of other static data; it should // be valid even if we can't load the time zone UDataMemory. ucln_i18n_registerCleanup(UCLN_I18N_TIMEZONE, timeZone_cleanup); // new can't fail below, as we use placement new into statically allocated space. 
new(gRawGMT) SimpleTimeZone(0, UnicodeString(true, GMT_ID, GMT_ID_LENGTH)); new(gRawUNKNOWN) SimpleTimeZone(0, UnicodeString(true, UNKNOWN_ZONE_ID, UNKNOWN_ZONE_ID_LENGTH)); gStaticZonesInitialized = true; } } // anonymous namespace const TimeZone& U_EXPORT2 TimeZone::getUnknown() { umtx_initOnce(gStaticZonesInitOnce, &initStaticTimeZones); return *reinterpret_cast(gRawUNKNOWN); } const TimeZone* U_EXPORT2 TimeZone::getGMT() { umtx_initOnce(gStaticZonesInitOnce, &initStaticTimeZones); return reinterpret_cast(gRawGMT); } // ***************************************************************************** // class TimeZone // ***************************************************************************** UOBJECT_DEFINE_ABSTRACT_RTTI_IMPLEMENTATION(TimeZone) TimeZone::TimeZone() : UObject(), fID() { } // ------------------------------------- TimeZone::TimeZone(const UnicodeString &id) : UObject(), fID(id) { } // ------------------------------------- TimeZone::~TimeZone() { } // ------------------------------------- TimeZone::TimeZone(const TimeZone &source) : UObject(source), fID(source.fID) { } // ------------------------------------- TimeZone & TimeZone::operator=(const TimeZone &right) { if (this != &right) fID = right.fID; return *this; } // ------------------------------------- bool TimeZone::operator==(const TimeZone& that) const { return typeid(*this) == typeid(that) && fID == that.fID; } // ------------------------------------- namespace { TimeZone* createSystemTimeZone(const UnicodeString& id, UErrorCode& ec) { if (U_FAILURE(ec)) { return nullptr; } TimeZone* z = 0; StackUResourceBundle res; U_DEBUG_TZ_MSG(("pre-err=%s\n", u_errorName(ec))); UResourceBundle *top = openOlsonResource(id, res.ref(), ec); U_DEBUG_TZ_MSG(("post-err=%s\n", u_errorName(ec))); if (U_SUCCESS(ec)) { z = new OlsonTimeZone(top, res.getAlias(), id, ec); if (z == nullptr) { ec = U_MEMORY_ALLOCATION_ERROR; U_DEBUG_TZ_MSG(("cstz: olson time zone failed to initialize - err %s\n", u_errorName(ec))); 
} } ures_close(top); if (U_FAILURE(ec)) { U_DEBUG_TZ_MSG(("cstz: failed to create, err %s\n", u_errorName(ec))); delete z; z = nullptr; } return z; } /** * Lookup the given name in our system zone table. If found, * instantiate a new zone of that name and return it. If not * found, return 0. */ TimeZone* createSystemTimeZone(const UnicodeString& id) { UErrorCode ec = U_ZERO_ERROR; return createSystemTimeZone(id, ec); } } TimeZone* U_EXPORT2 TimeZone::createTimeZone(const UnicodeString& ID) { /* We first try to lookup the zone ID in our system list. If this * fails, we try to parse it as a custom string GMT[+-]hh:mm. If * all else fails, we return GMT, which is probably not what the * user wants, but at least is a functioning TimeZone object. * * We cannot return nullptr, because that would break compatibility * with the JDK. */ TimeZone* result = createSystemTimeZone(ID); if (result == nullptr) { U_DEBUG_TZ_MSG(("failed to load system time zone with id - falling to custom")); result = createCustomTimeZone(ID); } if (result == nullptr) { U_DEBUG_TZ_MSG(("failed to load time zone with id - falling to Etc/Unknown(GMT)")); const TimeZone& unknown = getUnknown(); // Unknown zone uses statically allocated memory, so creation of it can never fail due to OOM. result = unknown.clone(); } return result; } // ------------------------------------- TimeZone* U_EXPORT2 TimeZone::detectHostTimeZone() { // We access system timezone data through uprv_tzset(), uprv_tzname(), and others, // which have platform specific implementations in putil.cpp int32_t rawOffset = 0; const char *hostID; UBool hostDetectionSucceeded = true; // First, try to create a system timezone, based // on the string ID in tzname[0]. uprv_tzset(); // Initialize tz... system data uprv_tzname_clear_cache(); // Get the timezone ID from the host. This function should do // any required host-specific remapping; e.g., on Windows this // function maps the Windows Time Zone name to an ICU timezone ID. 
hostID = uprv_tzname(0); // Invert sign because UNIX semantics are backwards rawOffset = uprv_timezone() * -U_MILLIS_PER_SECOND; TimeZone* hostZone = nullptr; UnicodeString hostStrID(hostID, -1, US_INV); if (hostStrID.length() == 0) { // The host time zone detection (or remapping) above has failed and // we have no name at all. Fallback to using the Unknown zone. hostStrID = UnicodeString(true, UNKNOWN_ZONE_ID, UNKNOWN_ZONE_ID_LENGTH); hostDetectionSucceeded = false; } hostZone = createSystemTimeZone(hostStrID); #if U_PLATFORM_USES_ONLY_WIN32_API // hostID points to a heap-allocated location on Windows. uprv_free(const_cast(hostID)); #endif int32_t hostIDLen = hostStrID.length(); if (hostZone != nullptr && rawOffset != hostZone->getRawOffset() && (3 <= hostIDLen && hostIDLen <= 4)) { // Uh oh. This probably wasn't a good id. // It was probably an ambiguous abbreviation delete hostZone; hostZone = nullptr; } // Construct a fixed standard zone with the host's ID // and raw offset. if (hostZone == nullptr && hostDetectionSucceeded) { hostZone = new SimpleTimeZone(rawOffset, hostStrID); } // If we _still_ don't have a time zone, use the Unknown zone. // // Note: This is extremely unlikely situation. If // new SimpleTimeZone(...) above fails, the following // code may also fail. if (hostZone == nullptr) { // Unknown zone uses static allocated memory, so it must always exist. // However, clone() allocates memory and can fail. hostZone = TimeZone::getUnknown().clone(); } return hostZone; } // ------------------------------------- static UMutex gDefaultZoneMutex; /** * Initialize DEFAULT_ZONE from the system default time zone. * Upon return, DEFAULT_ZONE will not be nullptr, unless operator new() * returns nullptr. */ static void U_CALLCONV initDefault() { ucln_i18n_registerCleanup(UCLN_I18N_TIMEZONE, timeZone_cleanup); Mutex lock(&gDefaultZoneMutex); // If setDefault() has already been called we can skip getting the // default zone information from the system. 
if (DEFAULT_ZONE != nullptr) { return; } // NOTE: this code is safely single threaded, being only // run via umtx_initOnce(). // // Some of the locale/timezone OS functions may not be thread safe, // // The operating system might actually use ICU to implement timezones. // So we may have ICU calling ICU here, like on AIX. // There shouldn't be a problem with this; initOnce does not hold a mutex // while the init function is being run. // The code detecting the host time zone was separated from this // and implemented as TimeZone::detectHostTimeZone() TimeZone *default_zone = TimeZone::detectHostTimeZone(); U_ASSERT(DEFAULT_ZONE == nullptr); DEFAULT_ZONE = default_zone; } // ------------------------------------- TimeZone* U_EXPORT2 TimeZone::createDefault() { umtx_initOnce(gDefaultZoneInitOnce, initDefault); { Mutex lock(&gDefaultZoneMutex); return (DEFAULT_ZONE != nullptr) ? DEFAULT_ZONE->clone() : nullptr; } } // ------------------------------------- TimeZone* U_EXPORT2 TimeZone::forLocaleOrDefault(const Locale& locale) { char buffer[ULOC_KEYWORDS_CAPACITY] = ""; UErrorCode localStatus = U_ZERO_ERROR; int32_t count = locale.getKeywordValue("timezone", buffer, sizeof(buffer), localStatus); if (U_FAILURE(localStatus) || localStatus == U_STRING_NOT_TERMINATED_WARNING) { // the "timezone" keyword exceeds ULOC_KEYWORDS_CAPACITY; ignore and use default. 
count = 0; } if (count > 0) { return TimeZone::createTimeZone(UnicodeString(buffer, count, US_INV)); } return TimeZone::createDefault(); } // ------------------------------------- void U_EXPORT2 TimeZone::adoptDefault(TimeZone* zone) { if (zone != nullptr) { { Mutex lock(&gDefaultZoneMutex); TimeZone *old = DEFAULT_ZONE; DEFAULT_ZONE = zone; delete old; } ucln_i18n_registerCleanup(UCLN_I18N_TIMEZONE, timeZone_cleanup); } } // ------------------------------------- void U_EXPORT2 TimeZone::setDefault(const TimeZone& zone) { adoptDefault(zone.clone()); } //---------------------------------------------------------------------- static void U_CALLCONV initMap(USystemTimeZoneType type, UErrorCode& ec) { ucln_i18n_registerCleanup(UCLN_I18N_TIMEZONE, timeZone_cleanup); UResourceBundle *res = ures_openDirect(0, kZONEINFO, &ec); res = ures_getByKey(res, kNAMES, res, &ec); // dereference Zones section if (U_SUCCESS(ec)) { int32_t size = ures_getSize(res); int32_t *m = (int32_t *)uprv_malloc(size * sizeof(int32_t)); if (m == nullptr) { ec = U_MEMORY_ALLOCATION_ERROR; } else { int32_t numEntries = 0; for (int32_t i = 0; i < size; i++) { UnicodeString id = ures_getUnicodeStringByIndex(res, i, &ec); if (U_FAILURE(ec)) { break; } if (0 == id.compare(UNKNOWN_ZONE_ID, UNKNOWN_ZONE_ID_LENGTH)) { // exclude Etc/Unknown continue; } if (type == UCAL_ZONE_TYPE_CANONICAL || type == UCAL_ZONE_TYPE_CANONICAL_LOCATION) { UnicodeString canonicalID; ZoneMeta::getCanonicalCLDRID(id, canonicalID, ec); if (U_FAILURE(ec)) { break; } if (canonicalID != id) { // exclude aliases continue; } } if (type == UCAL_ZONE_TYPE_CANONICAL_LOCATION) { const char16_t *region = TimeZone::getRegion(id, ec); if (U_FAILURE(ec)) { break; } if (u_strcmp(region, WORLD) == 0) { // exclude non-location ("001") continue; } } m[numEntries++] = i; } if (U_SUCCESS(ec)) { int32_t *tmp = m; m = (int32_t *)uprv_realloc(tmp, numEntries * sizeof(int32_t)); if (m == nullptr) { // realloc failed.. 
use the original one even it has unused // area at the end m = tmp; } switch(type) { case UCAL_ZONE_TYPE_ANY: U_ASSERT(MAP_SYSTEM_ZONES == nullptr); MAP_SYSTEM_ZONES = m; LEN_SYSTEM_ZONES = numEntries; break; case UCAL_ZONE_TYPE_CANONICAL: U_ASSERT(MAP_CANONICAL_SYSTEM_ZONES == nullptr); MAP_CANONICAL_SYSTEM_ZONES = m; LEN_CANONICAL_SYSTEM_ZONES = numEntries; break; case UCAL_ZONE_TYPE_CANONICAL_LOCATION: U_ASSERT(MAP_CANONICAL_SYSTEM_LOCATION_ZONES == nullptr); MAP_CANONICAL_SYSTEM_LOCATION_ZONES = m; LEN_CANONICAL_SYSTEM_LOCATION_ZONES = numEntries; break; } } } } ures_close(res); } /** * This is the default implementation for subclasses that do not * override this method. This implementation calls through to the * 8-argument getOffset() method after suitable computations, and * correctly adjusts GMT millis to local millis when necessary. */ void TimeZone::getOffset(UDate date, UBool local, int32_t& rawOffset, int32_t& dstOffset, UErrorCode& ec) const { if (U_FAILURE(ec)) { return; } rawOffset = getRawOffset(); if (!local) { date += rawOffset; // now in local standard millis } // When local == true, date might not be in local standard // millis. getOffset taking 7 parameters used here assume // the given time in day is local standard time. // At STD->DST transition, there is a range of time which // does not exist. When 'date' is in this time range // (and local == true), this method interprets the specified // local time as DST. At DST->STD transition, there is a // range of time which occurs twice. In this case, this // method interprets the specified local time as STD. // To support the behavior above, we need to call getOffset // (with 7 args) twice when local == true and DST is // detected in the initial call. 
for (int32_t pass=0; ; ++pass) {
        int32_t year, month, dom, dow, millis;
        double day = ClockMath::floorDivide(date, U_MILLIS_PER_DAY, &millis);

        Grego::dayToFields(day, year, month, dom, dow);

        dstOffset = getOffset(GregorianCalendar::AD, year, month, dom,
                              (uint8_t) dow, millis,
                              Grego::monthLength(year, month),
                              ec) - rawOffset;

        // Recompute if local==true, dstOffset!=0.
        if (pass!=0 || !local || dstOffset == 0) {
            break;
        }
        // adjust to local standard millis
        date -= dstOffset;
    }
}

// -------------------------------------

// New available IDs API as of ICU 2.4.  Uses StringEnumeration API.

class TZEnumeration : public StringEnumeration {
private:

    // Map into to zones.  Our results are zone[map[i]] for
    // i=0..len-1, where zone[i] is the i-th Olson zone.  If map==nullptr
    // then our results are zone[i] for i=0..len-1.  Len will be zero
    // if the zone data could not be loaded.
    int32_t* map;
    // Non-null only when this instance owns its map; freed in the destructor.
    int32_t* localMap;
    int32_t  len;
    int32_t  pos;

    // adoptMapData == true transfers ownership of mapData to this object.
    TZEnumeration(int32_t* mapData, int32_t mapLen, UBool adoptMapData) : pos(0) {
        map = mapData;
        localMap = adoptMapData ? mapData : nullptr;
        len = mapLen;
    }

    // Load the i-th zone name from the "Names" array into unistr.
    // On failure unistr is emptied.  Returns whether ec indicates success.
    UBool getID(int32_t i, UErrorCode& ec) {
        int32_t idLen = 0;
        const char16_t* id = nullptr;
        UResourceBundle *top = ures_openDirect(0, kZONEINFO, &ec);
        top = ures_getByKey(top, kNAMES, top, &ec); // dereference Zones section
        id = ures_getStringByIndex(top, i, &idLen, &ec);
        if(U_FAILURE(ec)) {
            unistr.truncate(0);
        }
        else {
            unistr.fastCopyFrom(UnicodeString(true, id, idLen));
        }
        ures_close(top);
        return U_SUCCESS(ec);
    }

    // Return the shared, lazily built map for 'type' and store its length
    // in 'len'.  The returned array is one of the file-level MAP_* statics
    // and is not owned by the caller.  An unrecognized type sets
    // U_ILLEGAL_ARGUMENT_ERROR.
    static int32_t* getMap(USystemTimeZoneType type, int32_t& len, UErrorCode& ec) {
        len = 0;
        if (U_FAILURE(ec)) {
            return nullptr;
        }
        int32_t* m = nullptr;
        switch (type) {
        case UCAL_ZONE_TYPE_ANY:
            umtx_initOnce(gSystemZonesInitOnce, &initMap, type, ec);
            m = MAP_SYSTEM_ZONES;
            len = LEN_SYSTEM_ZONES;
            break;
        case UCAL_ZONE_TYPE_CANONICAL:
            umtx_initOnce(gCanonicalZonesInitOnce, &initMap, type, ec);
            m = MAP_CANONICAL_SYSTEM_ZONES;
            len = LEN_CANONICAL_SYSTEM_ZONES;
            break;
        case UCAL_ZONE_TYPE_CANONICAL_LOCATION:
            umtx_initOnce(gCanonicalLocationZonesInitOnce, &initMap, type, ec);
            m = MAP_CANONICAL_SYSTEM_LOCATION_ZONES;
            len = LEN_CANONICAL_SYSTEM_LOCATION_ZONES;
            break;
        default:
            ec = U_ILLEGAL_ARGUMENT_ERROR;
            m = nullptr;
            len = 0;
            break;
        }
        return m;
    }

public:

// Initial capacity of a filtered map, and the amount it grows by.
#define DEFAULT_FILTERED_MAP_SIZE 8
#define MAP_INCREMENT_SIZE 8

    // Create an enumeration over the zones of 'type', optionally filtered
    // by region and/or raw offset (either may be nullptr for "no filter").
    // Returns nullptr and sets ec on failure.
    static TZEnumeration* create(USystemTimeZoneType type, const char* region, const int32_t* rawOffset, UErrorCode& ec) {
        if (U_FAILURE(ec)) {
            return nullptr;
        }

        int32_t baseLen;
        int32_t *baseMap = getMap(type, baseLen, ec);

        if (U_FAILURE(ec)) {
            return nullptr;
        }

        // If any additional conditions are available,
        // create instance local map filtered by the conditions.

        int32_t *filteredMap = nullptr;
        int32_t numEntries = 0;

        if (region != nullptr || rawOffset != nullptr) {
            int32_t filteredMapSize = DEFAULT_FILTERED_MAP_SIZE;
            filteredMap = (int32_t *)uprv_malloc(filteredMapSize * sizeof(int32_t));
            if (filteredMap == nullptr) {
                ec = U_MEMORY_ALLOCATION_ERROR;
                return nullptr;
            }

            // Walk through the base map
            UResourceBundle *res = ures_openDirect(0, kZONEINFO, &ec);
            res = ures_getByKey(res, kNAMES, res, &ec); // dereference Zones section
            for (int32_t i = 0; i < baseLen; i++) {
                int32_t zidx = baseMap[i];
                UnicodeString id = ures_getUnicodeStringByIndex(res, zidx, &ec);
                if (U_FAILURE(ec)) {
                    break;
                }
                if (region != nullptr) {
                    // Filter by region
                    char tzregion[4]; // max 3 letters + null term
                    TimeZone::getRegion(id, tzregion, sizeof(tzregion), ec);
                    if (U_FAILURE(ec)) {
                        break;
                    }
                    if (uprv_stricmp(tzregion, region) != 0) {
                        // region does not match
                        continue;
                    }
                }
                if (rawOffset != nullptr) {
                    // Filter by raw offset
                    // Note: This is VERY inefficient
                    TimeZone *z = createSystemTimeZone(id, ec);
                    if (U_FAILURE(ec)) {
                        break;
                    }
                    int32_t tzoffset = z->getRawOffset();
                    delete z;

                    if (tzoffset != *rawOffset) {
                        continue;
                    }
                }

                // Grow the filtered map when it is full.
                if (filteredMapSize <= numEntries) {
                    filteredMapSize += MAP_INCREMENT_SIZE;
                    int32_t *tmp = (int32_t *)uprv_realloc(filteredMap, filteredMapSize * sizeof(int32_t));
                    if (tmp == nullptr) {
                        ec = U_MEMORY_ALLOCATION_ERROR;
                        break;
                    } else {
                        filteredMap = tmp;
                    }
                }

                filteredMap[numEntries++] = zidx;
            }

            if (U_FAILURE(ec)) {
                uprv_free(filteredMap);
                filteredMap = nullptr;
            }

            ures_close(res);
        }

        TZEnumeration *result = nullptr;
        if (U_SUCCESS(ec)) {
            // Finally, create a new enumeration instance
            if (filteredMap == nullptr) {
                // No filter: borrow the shared base map.
                result = new TZEnumeration(baseMap, baseLen, false);
            } else {
                // The enumeration adopts the filtered map.
                result = new TZEnumeration(filteredMap, numEntries, true);
                filteredMap = nullptr;
            }
            if (result == nullptr) {
                ec = U_MEMORY_ALLOCATION_ERROR;
            }
        }

        // Still set here only if it was not adopted above.
        if (filteredMap != nullptr) {
            uprv_free(filteredMap);
        }

        return result;
    }

    // Copy constructor: an owned map is deep-copied, a borrowed map is
    // shared.  If the copy cannot be allocated the new enumeration is empty.
    TZEnumeration(const TZEnumeration &other) : StringEnumeration(), map(nullptr), localMap(nullptr), len(0), pos(0) {
        if (other.localMap != nullptr) {
            localMap = (int32_t *)uprv_malloc(other.len * sizeof(int32_t));
            if (localMap != nullptr) {
                len = other.len;
                uprv_memcpy(localMap, other.localMap, len * sizeof(int32_t));
                pos = other.pos;
                map = localMap;
            } else {
                len = 0;
                pos = 0;
                map = nullptr;
            }
        } else {
            map = other.map;
            localMap = nullptr;
            len = other.len;
            pos = other.pos;
        }
    }

    virtual ~TZEnumeration();

    virtual StringEnumeration *clone() const override {
        return new TZEnumeration(*this);
    }

    virtual int32_t count(UErrorCode& status) const override {
        return U_FAILURE(status) ? 0 : len;
    }

    // Returns the next ID (stored in unistr), or 0 when the enumeration is
    // exhausted, has no map, or status already indicates failure.
    virtual const UnicodeString* snext(UErrorCode& status) override {
        if (U_SUCCESS(status) && map != nullptr && pos < len) {
            getID(map[pos], status);
            ++pos;
            return &unistr;
        }
        return 0;
    }

    virtual void reset(UErrorCode& /*status*/) override {
        pos = 0;
    }

public:
    static UClassID U_EXPORT2 getStaticClassID();
    virtual UClassID getDynamicClassID() const override;
};

TZEnumeration::~TZEnumeration() {
    if (localMap != nullptr) {
        uprv_free(localMap);
    }
}

UOBJECT_DEFINE_RTTI_IMPLEMENTATION(TZEnumeration)

StringEnumeration* U_EXPORT2
TimeZone::createTimeZoneIDEnumeration(
            USystemTimeZoneType zoneType,
            const char* region,
            const int32_t* rawOffset,
            UErrorCode& ec) {
    return TZEnumeration::create(zoneType, region, rawOffset, ec);
}

StringEnumeration* U_EXPORT2
TimeZone::createEnumeration(UErrorCode& status) {
    return TZEnumeration::create(UCAL_ZONE_TYPE_ANY, nullptr, nullptr, status);
}

StringEnumeration* U_EXPORT2
TimeZone::createEnumerationForRawOffset(int32_t rawOffset, UErrorCode& status) {
    return TZEnumeration::create(UCAL_ZONE_TYPE_ANY, nullptr, &rawOffset, status);
}

StringEnumeration* U_EXPORT2
TimeZone::createEnumerationForRegion(const char* region, UErrorCode& status) {
    return TZEnumeration::create(UCAL_ZONE_TYPE_ANY, region, nullptr, status);
}

//
// Next 3 methods are equivalent to above, but ignores UErrorCode.
// These methods were deprecated in ICU 70. StringEnumeration* U_EXPORT2 TimeZone::createEnumeration() { UErrorCode ec = U_ZERO_ERROR; return createEnumeration(ec); } StringEnumeration* U_EXPORT2 TimeZone::createEnumeration(int32_t rawOffset) { UErrorCode ec = U_ZERO_ERROR; return createEnumerationForRawOffset(rawOffset, ec); } StringEnumeration* U_EXPORT2 TimeZone::createEnumeration(const char* region) { UErrorCode ec = U_ZERO_ERROR; return createEnumerationForRegion(region, ec); } // --------------------------------------- int32_t U_EXPORT2 TimeZone::countEquivalentIDs(const UnicodeString& id) { int32_t result = 0; UErrorCode ec = U_ZERO_ERROR; StackUResourceBundle res; U_DEBUG_TZ_MSG(("countEquivalentIDs..\n")); UResourceBundle *top = openOlsonResource(id, res.ref(), ec); if (U_SUCCESS(ec)) { StackUResourceBundle r; ures_getByKey(res.getAlias(), kLINKS, r.getAlias(), &ec); ures_getIntVector(r.getAlias(), &result, &ec); } ures_close(top); return result; } // --------------------------------------- const UnicodeString U_EXPORT2 TimeZone::getEquivalentID(const UnicodeString& id, int32_t index) { U_DEBUG_TZ_MSG(("gEI(%d)\n", index)); UnicodeString result; UErrorCode ec = U_ZERO_ERROR; StackUResourceBundle res; UResourceBundle *top = openOlsonResource(id, res.ref(), ec); int32_t zone = -1; if (U_SUCCESS(ec)) { StackUResourceBundle r; int32_t size; ures_getByKey(res.getAlias(), kLINKS, r.getAlias(), &ec); const int32_t *v = ures_getIntVector(r.getAlias(), &size, &ec); if (U_SUCCESS(ec)) { if (index >= 0 && index < size) { zone = v[index]; } } } if (zone >= 0) { UResourceBundle *ares = ures_getByKey(top, kNAMES, nullptr, &ec); // dereference Zones section if (U_SUCCESS(ec)) { int32_t idLen = 0; const char16_t* id2 = ures_getStringByIndex(ares, zone, &idLen, &ec); result.fastCopyFrom(UnicodeString(true, id2, idLen)); U_DEBUG_TZ_MSG(("gei(%d) -> %d, len%d, %s\n", index, zone, result.length(), u_errorName(ec))); } ures_close(ares); } ures_close(top); #if 
defined(U_DEBUG_TZ) if(result.length() ==0) { U_DEBUG_TZ_MSG(("equiv [__, #%d] -> 0 (%s)\n", index, u_errorName(ec))); } #endif return result; } // --------------------------------------- // These methods are used by ZoneMeta class only. const char16_t* TimeZone::findID(const UnicodeString& id) { const char16_t *result = nullptr; UErrorCode ec = U_ZERO_ERROR; UResourceBundle *rb = ures_openDirect(nullptr, kZONEINFO, &ec); // resolve zone index by name UResourceBundle *names = ures_getByKey(rb, kNAMES, nullptr, &ec); int32_t idx = findInStringArray(names, id, ec); result = ures_getStringByIndex(names, idx, nullptr, &ec); if (U_FAILURE(ec)) { result = nullptr; } ures_close(names); ures_close(rb); return result; } const char16_t* TimeZone::dereferOlsonLink(const UnicodeString& id) { const char16_t *result = nullptr; UErrorCode ec = U_ZERO_ERROR; UResourceBundle *rb = ures_openDirect(nullptr, kZONEINFO, &ec); // resolve zone index by name UResourceBundle *names = ures_getByKey(rb, kNAMES, nullptr, &ec); int32_t idx = findInStringArray(names, id, ec); result = ures_getStringByIndex(names, idx, nullptr, &ec); // open the zone bundle by index ures_getByKey(rb, kZONES, rb, &ec); ures_getByIndex(rb, idx, rb, &ec); if (U_SUCCESS(ec)) { if (ures_getType(rb) == URES_INT) { // this is a link - dereference the link int32_t deref = ures_getInt(rb, &ec); const char16_t* tmp = ures_getStringByIndex(names, deref, nullptr, &ec); if (U_SUCCESS(ec)) { result = tmp; } } } ures_close(names); ures_close(rb); return result; } const char16_t* TimeZone::getRegion(const UnicodeString& id) { UErrorCode status = U_ZERO_ERROR; return getRegion(id, status); } const char16_t* TimeZone::getRegion(const UnicodeString& id, UErrorCode& status) { if (U_FAILURE(status)) { return nullptr; } const char16_t *result = nullptr; UResourceBundle *rb = ures_openDirect(nullptr, kZONEINFO, &status); // resolve zone index by name UResourceBundle *res = ures_getByKey(rb, kNAMES, nullptr, &status); int32_t idx = 
findInStringArray(res, id, status); // get region mapping ures_getByKey(rb, kREGIONS, res, &status); const char16_t *tmp = ures_getStringByIndex(res, idx, nullptr, &status); if (U_SUCCESS(status)) { result = tmp; } ures_close(res); ures_close(rb); return result; } // --------------------------------------- int32_t TimeZone::getRegion(const UnicodeString& id, char *region, int32_t capacity, UErrorCode& status) { int32_t resultLen = 0; *region = 0; if (U_FAILURE(status)) { return 0; } const char16_t *uregion = nullptr; // "Etc/Unknown" is not a system zone ID, // but in the zone data if (id.compare(UNKNOWN_ZONE_ID, UNKNOWN_ZONE_ID_LENGTH) != 0) { uregion = getRegion(id); } if (uregion == nullptr) { status = U_ILLEGAL_ARGUMENT_ERROR; return 0; } resultLen = u_strlen(uregion); // A region code is represented by invariant characters u_UCharsToChars(uregion, region, uprv_min(resultLen, capacity)); if (capacity < resultLen) { status = U_BUFFER_OVERFLOW_ERROR; return resultLen; } return u_terminateChars(region, capacity, resultLen, &status); } // --------------------------------------- UnicodeString& TimeZone::getDisplayName(UnicodeString& result) const { return getDisplayName(false,LONG,Locale::getDefault(), result); } UnicodeString& TimeZone::getDisplayName(const Locale& locale, UnicodeString& result) const { return getDisplayName(false, LONG, locale, result); } UnicodeString& TimeZone::getDisplayName(UBool inDaylight, EDisplayType style, UnicodeString& result) const { return getDisplayName(inDaylight,style, Locale::getDefault(), result); } //-------------------------------------- int32_t TimeZone::getDSTSavings()const { if (useDaylightTime()) { return 3600000; } return 0; } //--------------------------------------- UnicodeString& TimeZone::getDisplayName(UBool inDaylight, EDisplayType style, const Locale& locale, UnicodeString& result) const { UErrorCode status = U_ZERO_ERROR; UDate date = Calendar::getNow(); UTimeZoneFormatTimeType timeType = UTZFMT_TIME_TYPE_UNKNOWN; 
int32_t offset; if (style == GENERIC_LOCATION || style == LONG_GENERIC || style == SHORT_GENERIC) { LocalPointer tzfmt(TimeZoneFormat::createInstance(locale, status)); if (U_FAILURE(status)) { result.remove(); return result; } // Generic format switch (style) { case GENERIC_LOCATION: tzfmt->format(UTZFMT_STYLE_GENERIC_LOCATION, *this, date, result, &timeType); break; case LONG_GENERIC: tzfmt->format(UTZFMT_STYLE_GENERIC_LONG, *this, date, result, &timeType); break; case SHORT_GENERIC: tzfmt->format(UTZFMT_STYLE_GENERIC_SHORT, *this, date, result, &timeType); break; default: UPRV_UNREACHABLE_EXIT; } // Generic format many use Localized GMT as the final fallback. // When Localized GMT format is used, the result might not be // appropriate for the requested daylight value. if ((inDaylight && timeType == UTZFMT_TIME_TYPE_STANDARD) || (!inDaylight && timeType == UTZFMT_TIME_TYPE_DAYLIGHT)) { offset = inDaylight ? getRawOffset() + getDSTSavings() : getRawOffset(); if (style == SHORT_GENERIC) { tzfmt->formatOffsetShortLocalizedGMT(offset, result, status); } else { tzfmt->formatOffsetLocalizedGMT(offset, result, status); } } } else if (style == LONG_GMT || style == SHORT_GMT) { LocalPointer tzfmt(TimeZoneFormat::createInstance(locale, status)); if (U_FAILURE(status)) { result.remove(); return result; } offset = inDaylight && useDaylightTime() ? getRawOffset() + getDSTSavings() : getRawOffset(); switch (style) { case LONG_GMT: tzfmt->formatOffsetLocalizedGMT(offset, result, status); break; case SHORT_GMT: tzfmt->formatOffsetISO8601Basic(offset, false, false, false, result, status); break; default: UPRV_UNREACHABLE_EXIT; } } else { U_ASSERT(style == LONG || style == SHORT || style == SHORT_COMMONLY_USED); UTimeZoneNameType nameType = UTZNM_UNKNOWN; switch (style) { case LONG: nameType = inDaylight ? UTZNM_LONG_DAYLIGHT : UTZNM_LONG_STANDARD; break; case SHORT: case SHORT_COMMONLY_USED: nameType = inDaylight ? 
UTZNM_SHORT_DAYLIGHT : UTZNM_SHORT_STANDARD; break; default: UPRV_UNREACHABLE_EXIT; } LocalPointer tznames(TimeZoneNames::createInstance(locale, status)); if (U_FAILURE(status)) { result.remove(); return result; } UnicodeString canonicalID(ZoneMeta::getCanonicalCLDRID(*this)); tznames->getDisplayName(canonicalID, nameType, date, result); if (result.isEmpty()) { // Fallback to localized GMT LocalPointer tzfmt(TimeZoneFormat::createInstance(locale, status)); offset = inDaylight && useDaylightTime() ? getRawOffset() + getDSTSavings() : getRawOffset(); if (style == LONG) { tzfmt->formatOffsetLocalizedGMT(offset, result, status); } else { tzfmt->formatOffsetShortLocalizedGMT(offset, result, status); } } } if (U_FAILURE(status)) { result.remove(); } return result; } /** * Parse a custom time zone identifier and return a corresponding zone. * @param id a string of the form GMT[+-]hh:mm, GMT[+-]hhmm, or * GMT[+-]hh. * @return a newly created SimpleTimeZone with the given offset and * no Daylight Savings Time, or null if the id cannot be parsed. 
*/ TimeZone* TimeZone::createCustomTimeZone(const UnicodeString& id) { int32_t sign, hour, min, sec; if (parseCustomID(id, sign, hour, min, sec)) { UnicodeString customID; formatCustomID(hour, min, sec, (sign < 0), customID); int32_t offset = sign * ((hour * 60 + min) * 60 + sec) * 1000; return new SimpleTimeZone(offset, customID); } return nullptr; } UnicodeString& TimeZone::getCustomID(const UnicodeString& id, UnicodeString& normalized, UErrorCode& status) { normalized.remove(); if (U_FAILURE(status)) { return normalized; } int32_t sign, hour, min, sec; if (parseCustomID(id, sign, hour, min, sec)) { formatCustomID(hour, min, sec, (sign < 0), normalized); } else { status = U_ILLEGAL_ARGUMENT_ERROR; } return normalized; } UBool TimeZone::parseCustomID(const UnicodeString& id, int32_t& sign, int32_t& hour, int32_t& min, int32_t& sec) { static const int32_t kParseFailed = -99999; NumberFormat* numberFormat = 0; UnicodeString idUppercase = id; idUppercase.toUpper(""); if (id.length() > GMT_ID_LENGTH && idUppercase.startsWith(GMT_ID, GMT_ID_LENGTH)) { ParsePosition pos(GMT_ID_LENGTH); sign = 1; hour = 0; min = 0; sec = 0; if (id[pos.getIndex()] == MINUS /*'-'*/) { sign = -1; } else if (id[pos.getIndex()] != PLUS /*'+'*/) { return false; } pos.setIndex(pos.getIndex() + 1); UErrorCode success = U_ZERO_ERROR; numberFormat = NumberFormat::createInstance(success); if(U_FAILURE(success)){ return false; } numberFormat->setParseIntegerOnly(true); //numberFormat->setLenient(true); // TODO: May need to set this, depends on latest timezone parsing // Look for either hh:mm, hhmm, or hh int32_t start = pos.getIndex(); Formattable n(kParseFailed); numberFormat->parse(id, n, pos); if (pos.getIndex() == start) { delete numberFormat; return false; } hour = n.getLong(); if (pos.getIndex() < id.length()) { if (pos.getIndex() - start > 2 || id[pos.getIndex()] != COLON) { delete numberFormat; return false; } // hh:mm pos.setIndex(pos.getIndex() + 1); int32_t oldPos = pos.getIndex(); 
n.setLong(kParseFailed); numberFormat->parse(id, n, pos); if ((pos.getIndex() - oldPos) != 2) { // must be 2 digits delete numberFormat; return false; } min = n.getLong(); if (pos.getIndex() < id.length()) { if (id[pos.getIndex()] != COLON) { delete numberFormat; return false; } // [:ss] pos.setIndex(pos.getIndex() + 1); oldPos = pos.getIndex(); n.setLong(kParseFailed); numberFormat->parse(id, n, pos); if (pos.getIndex() != id.length() || (pos.getIndex() - oldPos) != 2) { delete numberFormat; return false; } sec = n.getLong(); } } else { // Supported formats are below - // // HHmmss // Hmmss // HHmm // Hmm // HH // H int32_t length = pos.getIndex() - start; if (length <= 0 || 6 < length) { // invalid length delete numberFormat; return false; } switch (length) { case 1: case 2: // already set to hour break; case 3: case 4: min = hour % 100; hour /= 100; break; case 5: case 6: sec = hour % 100; min = (hour/100) % 100; hour /= 10000; break; } } delete numberFormat; if (hour > kMAX_CUSTOM_HOUR || min > kMAX_CUSTOM_MIN || sec > kMAX_CUSTOM_SEC) { return false; } return true; } return false; } UnicodeString& TimeZone::formatCustomID(int32_t hour, int32_t min, int32_t sec, UBool negative, UnicodeString& id) { // Create time zone ID - GMT[+|-]hhmm[ss] id.setTo(GMT_ID, GMT_ID_LENGTH); if (hour | min | sec) { if (negative) { id += (char16_t)MINUS; } else { id += (char16_t)PLUS; } if (hour < 10) { id += (char16_t)ZERO_DIGIT; } else { id += (char16_t)(ZERO_DIGIT + hour/10); } id += (char16_t)(ZERO_DIGIT + hour%10); id += (char16_t)COLON; if (min < 10) { id += (char16_t)ZERO_DIGIT; } else { id += (char16_t)(ZERO_DIGIT + min/10); } id += (char16_t)(ZERO_DIGIT + min%10); if (sec) { id += (char16_t)COLON; if (sec < 10) { id += (char16_t)ZERO_DIGIT; } else { id += (char16_t)(ZERO_DIGIT + sec/10); } id += (char16_t)(ZERO_DIGIT + sec%10); } } return id; } UBool TimeZone::hasSameRules(const TimeZone& other) const { return (getRawOffset() == other.getRawOffset() && useDaylightTime() == 
other.useDaylightTime()); } static void U_CALLCONV initTZDataVersion(UErrorCode &status) { ucln_i18n_registerCleanup(UCLN_I18N_TIMEZONE, timeZone_cleanup); int32_t len = 0; StackUResourceBundle bundle; ures_openDirectFillIn(bundle.getAlias(), nullptr, kZONEINFO, &status); const char16_t *tzver = ures_getStringByKey(bundle.getAlias(), kTZVERSION, &len, &status); if (U_SUCCESS(status)) { if (len >= (int32_t)sizeof(TZDATA_VERSION)) { // Ensure that there is always space for a trailing nul in TZDATA_VERSION len = sizeof(TZDATA_VERSION) - 1; } u_UCharsToChars(tzver, TZDATA_VERSION, len); } } const char* TimeZone::getTZDataVersion(UErrorCode& status) { umtx_initOnce(gTZDataVersionInitOnce, &initTZDataVersion, status); return (const char*)TZDATA_VERSION; } UnicodeString& TimeZone::getCanonicalID(const UnicodeString& id, UnicodeString& canonicalID, UErrorCode& status) { UBool isSystemID = false; return getCanonicalID(id, canonicalID, isSystemID, status); } UnicodeString& TimeZone::getCanonicalID(const UnicodeString& id, UnicodeString& canonicalID, UBool& isSystemID, UErrorCode& status) { canonicalID.remove(); isSystemID = false; if (U_FAILURE(status)) { return canonicalID; } if (id.compare(UNKNOWN_ZONE_ID, UNKNOWN_ZONE_ID_LENGTH) == 0) { // special case - Etc/Unknown is a canonical ID, but not system ID canonicalID.fastCopyFrom(id); isSystemID = false; } else { ZoneMeta::getCanonicalCLDRID(id, canonicalID, status); if (U_SUCCESS(status)) { isSystemID = true; } else { // Not a system ID status = U_ZERO_ERROR; getCustomID(id, canonicalID, status); } } return canonicalID; } UnicodeString& TimeZone::getIanaID(const UnicodeString& id, UnicodeString& ianaID, UErrorCode& status) { ianaID.remove(); if (U_FAILURE(status)) { return ianaID; } if (id.compare(ConstChar16Ptr(UNKNOWN_ZONE_ID), UNKNOWN_ZONE_ID_LENGTH) == 0) { status = U_ILLEGAL_ARGUMENT_ERROR; ianaID.setToBogus(); } else { ZoneMeta::getIanaID(id, ianaID, status); } return ianaID; } UnicodeString& 
TimeZone::getWindowsID(const UnicodeString& id, UnicodeString& winid, UErrorCode& status) { winid.remove(); if (U_FAILURE(status)) { return winid; } // canonicalize the input ID UnicodeString canonicalID; UBool isSystemID = false; getCanonicalID(id, canonicalID, isSystemID, status); if (U_FAILURE(status) || !isSystemID) { // mapping data is only applicable to tz database IDs if (status == U_ILLEGAL_ARGUMENT_ERROR) { // getWindowsID() sets an empty string where // getCanonicalID() sets a U_ILLEGAL_ARGUMENT_ERROR. status = U_ZERO_ERROR; } return winid; } UResourceBundle *mapTimezones = ures_openDirect(nullptr, "windowsZones", &status); ures_getByKey(mapTimezones, "mapTimezones", mapTimezones, &status); if (U_FAILURE(status)) { return winid; } UResourceBundle *winzone = nullptr; UBool found = false; while (ures_hasNext(mapTimezones) && !found) { winzone = ures_getNextResource(mapTimezones, winzone, &status); if (U_FAILURE(status)) { break; } if (ures_getType(winzone) != URES_TABLE) { continue; } UResourceBundle *regionalData = nullptr; while (ures_hasNext(winzone) && !found) { regionalData = ures_getNextResource(winzone, regionalData, &status); if (U_FAILURE(status)) { break; } if (ures_getType(regionalData) != URES_STRING) { continue; } int32_t len; const char16_t *tzids = ures_getString(regionalData, &len, &status); if (U_FAILURE(status)) { break; } const char16_t *start = tzids; UBool hasNext = true; while (hasNext) { const char16_t *end = u_strchr(start, (char16_t)0x20); if (end == nullptr) { end = tzids + len; hasNext = false; } if (canonicalID.compare(start, static_cast(end - start)) == 0) { winid = UnicodeString(ures_getKey(winzone), -1 , US_INV); found = true; break; } start = end + 1; } } ures_close(regionalData); } ures_close(winzone); ures_close(mapTimezones); return winid; } #define MAX_WINDOWS_ID_SIZE 128 UnicodeString& TimeZone::getIDForWindowsID(const UnicodeString& winid, const char* region, UnicodeString& id, UErrorCode& status) { id.remove(); if 
(U_FAILURE(status)) { return id; } UResourceBundle *zones = ures_openDirect(nullptr, "windowsZones", &status); ures_getByKey(zones, "mapTimezones", zones, &status); if (U_FAILURE(status)) { ures_close(zones); return id; } UErrorCode tmperr = U_ZERO_ERROR; char winidKey[MAX_WINDOWS_ID_SIZE]; int32_t winKeyLen = winid.extract(0, winid.length(), winidKey, sizeof(winidKey) - 1, US_INV); if (winKeyLen == 0 || winKeyLen >= (int32_t)sizeof(winidKey)) { ures_close(zones); return id; } winidKey[winKeyLen] = 0; ures_getByKey(zones, winidKey, zones, &tmperr); // use tmperr, because windows mapping might not // be available by design if (U_FAILURE(tmperr)) { ures_close(zones); return id; } const char16_t *tzid = nullptr; int32_t len = 0; UBool gotID = false; if (region) { const char16_t *tzids = ures_getStringByKey(zones, region, &len, &tmperr); // use tmperr, because // regional mapping is optional if (U_SUCCESS(tmperr)) { // first ID delimited by space is the default one const char16_t *end = u_strchr(tzids, (char16_t)0x20); if (end == nullptr) { id.setTo(tzids, -1); } else { id.setTo(tzids, static_cast(end - tzids)); } gotID = true; } } if (!gotID) { tzid = ures_getStringByKey(zones, "001", &len, &status); // using status, because "001" must be // available at this point if (U_SUCCESS(status)) { id.setTo(tzid, len); } } ures_close(zones); return id; } U_NAMESPACE_END #endif /* #if !UCONFIG_NO_FORMATTING */ //eof stringi/src/icu74/i18n/ucol_imp.h0000644000176200001440000001040414700200761016206 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ******************************************************************************* * * Copyright (C) 1998-2014, International Business Machines * Corporation and others. All Rights Reserved. 
* ******************************************************************************* * * Private implementation header for C collation * file name: ucol_imp.h * encoding: UTF-8 * tab size: 8 (not used) * indentation:4 * * created on: 2000dec11 * created by: Vladimir Weinstein * * Modification history * Date Name Comments * 02/16/2001 synwee Added UCOL_GETPREVCE for the use in ucoleitr * 02/27/2001 synwee Added getMaxExpansion data structure in UCollator * 03/02/2001 synwee Added UCOL_IMPLICIT_CE * 03/12/2001 synwee Added pointer start to collIterate. */ #ifndef UCOL_IMP_H #define UCOL_IMP_H #include "unicode/utypes.h" #if !UCONFIG_NO_COLLATION // This part needs to compile as plain C code, for cintltst. #include "unicode/ucol.h" /** Check whether two collators are equal. Collators are considered equal if they * will sort strings the same. This means that both the current attributes and the * rules must be equivalent. * @param source first collator * @param target second collator * @return true or false * @internal ICU 3.0 */ U_CAPI UBool U_EXPORT2 ucol_equals(const UCollator *source, const UCollator *target); /** * Convenience string denoting the Collation data tree */ #define U_ICUDATA_COLL U_ICUDATA_NAME U_TREE_SEPARATOR_STRING "coll" #ifdef __cplusplus #include "unicode/locid.h" #include "unicode/ures.h" U_NAMESPACE_BEGIN struct CollationCacheEntry; class Locale; class UnicodeString; class UnifiedCache; /** Implemented in ucol_res.cpp. */ class CollationLoader { public: static void appendRootRules(UnicodeString &s); static void loadRules(const char *localeID, const char *collationType, UnicodeString &rules, UErrorCode &errorCode); // Adds a reference to returned value. static const CollationCacheEntry *loadTailoring(const Locale &locale, UErrorCode &errorCode); // Cache callback. Adds a reference to returned value. 
const CollationCacheEntry *createCacheEntry(UErrorCode &errorCode); private: static void U_CALLCONV loadRootRules(UErrorCode &errorCode); // The following members are used by loadTailoring() // and the cache callback. static const uint32_t TRIED_SEARCH = 1; static const uint32_t TRIED_DEFAULT = 2; static const uint32_t TRIED_STANDARD = 4; CollationLoader(const CollationCacheEntry *re, const Locale &requested, UErrorCode &errorCode); ~CollationLoader(); // All loadFromXXX methods add a reference to the returned value. const CollationCacheEntry *loadFromLocale(UErrorCode &errorCode); const CollationCacheEntry *loadFromBundle(UErrorCode &errorCode); const CollationCacheEntry *loadFromCollations(UErrorCode &errorCode); const CollationCacheEntry *loadFromData(UErrorCode &errorCode); // Adds a reference to returned value. const CollationCacheEntry *getCacheEntry(UErrorCode &errorCode); /** * Returns the rootEntry (with one addRef()) if loc==root, * or else returns a new cache entry with ref count 1 for the loc and * the root tailoring. */ const CollationCacheEntry *makeCacheEntryFromRoot( const Locale &loc, UErrorCode &errorCode) const; /** * Returns the entryFromCache as is if loc==validLocale, * or else returns a new cache entry with ref count 1 for the loc and * the same tailoring. In the latter case, a ref count is removed from * entryFromCache. */ static const CollationCacheEntry *makeCacheEntry( const Locale &loc, const CollationCacheEntry *entryFromCache, UErrorCode &errorCode); const UnifiedCache *cache; const CollationCacheEntry *rootEntry; Locale validLocale; Locale locale; char type[16]; char defaultType[16]; uint32_t typesTried; UBool typeFallback; UResourceBundle *bundle; UResourceBundle *collations; UResourceBundle *data; }; U_NAMESPACE_END #endif /* __cplusplus */ #endif /* #if !UCONFIG_NO_COLLATION */ #endif stringi/src/icu74/i18n/number_formatimpl.cpp0000644000176200001440000006244714700200761020472 0ustar liggesusers// © 2017 and later: Unicode, Inc. 
and others. // License & terms of use: http://www.unicode.org/copyright.html #include "unicode/utypes.h" #if !UCONFIG_NO_FORMATTING #include "cstring.h" #include "unicode/ures.h" #include "uresimp.h" #include "charstr.h" #include "number_formatimpl.h" #include "unicode/numfmt.h" #include "number_patternstring.h" #include "number_utils.h" #include "unicode/numberformatter.h" #include "unicode/dcfmtsym.h" #include "number_scientific.h" #include "number_compact.h" #include "uresimp.h" #include "ureslocs.h" using namespace icu; using namespace icu::number; using namespace icu::number::impl; NumberFormatterImpl::NumberFormatterImpl(const MacroProps& macros, UErrorCode& status) : NumberFormatterImpl(macros, true, status) { } int32_t NumberFormatterImpl::formatStatic(const MacroProps ¯os, UFormattedNumberData *results, UErrorCode &status) { DecimalQuantity &inValue = results->quantity; FormattedStringBuilder &outString = results->getStringRef(); NumberFormatterImpl impl(macros, false, status); MicroProps& micros = impl.preProcessUnsafe(inValue, status); if (U_FAILURE(status)) { return 0; } int32_t length = writeNumber(micros.simple, inValue, outString, 0, status); length += writeAffixes(micros, outString, 0, length, status); results->outputUnit = std::move(micros.outputUnit); results->gender = micros.gender; return length; } int32_t NumberFormatterImpl::getPrefixSuffixStatic(const MacroProps& macros, Signum signum, StandardPlural::Form plural, FormattedStringBuilder& outString, UErrorCode& status) { NumberFormatterImpl impl(macros, false, status); return impl.getPrefixSuffixUnsafe(signum, plural, outString, status); } // NOTE: C++ SPECIFIC DIFFERENCE FROM JAVA: // The "safe" apply method uses a new MicroProps. In the MicroPropsGenerator, fMicros is copied into the new instance. // The "unsafe" method simply re-uses fMicros, eliminating the extra copy operation. // See MicroProps::processQuantity() for details. 
int32_t NumberFormatterImpl::format(UFormattedNumberData *results, UErrorCode &status) const { DecimalQuantity &inValue = results->quantity; FormattedStringBuilder &outString = results->getStringRef(); MicroProps micros; preProcess(inValue, micros, status); if (U_FAILURE(status)) { return 0; } int32_t length = writeNumber(micros.simple, inValue, outString, 0, status); length += writeAffixes(micros, outString, 0, length, status); results->outputUnit = std::move(micros.outputUnit); results->gender = micros.gender; return length; } void NumberFormatterImpl::preProcess(DecimalQuantity& inValue, MicroProps& microsOut, UErrorCode& status) const { if (U_FAILURE(status)) { return; } if (fMicroPropsGenerator == nullptr) { status = U_INTERNAL_PROGRAM_ERROR; return; } fMicroPropsGenerator->processQuantity(inValue, microsOut, status); microsOut.integerWidth.apply(inValue, status); } MicroProps& NumberFormatterImpl::preProcessUnsafe(DecimalQuantity& inValue, UErrorCode& status) { if (U_FAILURE(status)) { return fMicros; // must always return a value } if (fMicroPropsGenerator == nullptr) { status = U_INTERNAL_PROGRAM_ERROR; return fMicros; // must always return a value } fMicroPropsGenerator->processQuantity(inValue, fMicros, status); fMicros.integerWidth.apply(inValue, status); return fMicros; } int32_t NumberFormatterImpl::getPrefixSuffix(Signum signum, StandardPlural::Form plural, FormattedStringBuilder& outString, UErrorCode& status) const { if (U_FAILURE(status)) { return 0; } // #13453: DecimalFormat wants the affixes from the pattern only (modMiddle, aka pattern modifier). // Safe path: use fImmutablePatternModifier. 
const Modifier* modifier = fImmutablePatternModifier->getModifier(signum, plural); modifier->apply(outString, 0, 0, status); if (U_FAILURE(status)) { return 0; } return modifier->getPrefixLength(); } int32_t NumberFormatterImpl::getPrefixSuffixUnsafe(Signum signum, StandardPlural::Form plural, FormattedStringBuilder& outString, UErrorCode& status) { if (U_FAILURE(status)) { return 0; } // #13453: DecimalFormat wants the affixes from the pattern only (modMiddle, aka pattern modifier). // Unsafe path: use fPatternModifier. fPatternModifier->setNumberProperties(signum, plural); fPatternModifier->apply(outString, 0, 0, status); if (U_FAILURE(status)) { return 0; } return fPatternModifier->getPrefixLength(); } NumberFormatterImpl::NumberFormatterImpl(const MacroProps& macros, bool safe, UErrorCode& status) { fMicroPropsGenerator = macrosToMicroGenerator(macros, safe, status); } ////////// const MicroPropsGenerator* NumberFormatterImpl::macrosToMicroGenerator(const MacroProps& macros, bool safe, UErrorCode& status) { if (U_FAILURE(status)) { return nullptr; } const MicroPropsGenerator* chain = &fMicros; // Check that macros is error-free before continuing. if (macros.copyErrorTo(status)) { return nullptr; } // TODO: Accept currency symbols from DecimalFormatSymbols? // Pre-compute a few values for efficiency. 
bool isCurrency = utils::unitIsCurrency(macros.unit); bool isBaseUnit = utils::unitIsBaseUnit(macros.unit); bool isPercent = utils::unitIsPercent(macros.unit); bool isPermille = utils::unitIsPermille(macros.unit); bool isCompactNotation = macros.notation.fType == Notation::NTN_COMPACT; bool isAccounting = macros.sign == UNUM_SIGN_ACCOUNTING || macros.sign == UNUM_SIGN_ACCOUNTING_ALWAYS || macros.sign == UNUM_SIGN_ACCOUNTING_EXCEPT_ZERO || macros.sign == UNUM_SIGN_ACCOUNTING_NEGATIVE; CurrencyUnit currency(u"", status); if (isCurrency) { currency = CurrencyUnit(macros.unit, status); // Restore CurrencyUnit from MeasureUnit } UNumberUnitWidth unitWidth = UNUM_UNIT_WIDTH_SHORT; if (macros.unitWidth != UNUM_UNIT_WIDTH_COUNT) { unitWidth = macros.unitWidth; } // Use CLDR unit data for all MeasureUnits (not currency and not // no-unit), except use the dedicated percent pattern for percent and // permille. However, use the CLDR unit data for percent/permille if a // long name was requested OR if compact notation is being used, since // compact notation overrides the middle modifier (micros.modMiddle) // normally used for the percent pattern. bool isCldrUnit = !isCurrency && !isBaseUnit && (unitWidth == UNUM_UNIT_WIDTH_FULL_NAME || !(isPercent || isPermille) || isCompactNotation ); bool isMixedUnit = isCldrUnit && (uprv_strcmp(macros.unit.getType(), "") == 0) && macros.unit.getComplexity(status) == UMEASURE_UNIT_MIXED; // Select the numbering system. LocalPointer nsLocal; const NumberingSystem* ns; if (macros.symbols.isNumberingSystem()) { ns = macros.symbols.getNumberingSystem(); } else { // TODO: Is there a way to avoid creating the NumberingSystem object? ns = NumberingSystem::createInstance(macros.locale, status); // Give ownership to the function scope. nsLocal.adoptInstead(ns); } const char* nsName = U_SUCCESS(status) ? ns->getName() : "latn"; uprv_strncpy(fMicros.nsName, nsName, 8); fMicros.nsName[8] = 0; // guarantee NUL-terminated // Default gender: none. 
fMicros.gender = ""; // Resolve the symbols. Do this here because currency may need to customize them. if (macros.symbols.isDecimalFormatSymbols()) { fMicros.simple.symbols = macros.symbols.getDecimalFormatSymbols(); } else { LocalPointer newSymbols( new DecimalFormatSymbols(macros.locale, *ns, status), status); if (U_FAILURE(status)) { return nullptr; } if (isCurrency) { newSymbols->setCurrency(currency.getISOCurrency(), status); if (U_FAILURE(status)) { return nullptr; } } fMicros.simple.symbols = newSymbols.getAlias(); fSymbols.adoptInstead(newSymbols.orphan()); } // Load and parse the pattern string. It is used for grouping sizes and affixes only. // If we are formatting currency, check for a currency-specific pattern. const char16_t* pattern = nullptr; if (isCurrency && fMicros.simple.symbols->getCurrencyPattern() != nullptr) { pattern = fMicros.simple.symbols->getCurrencyPattern(); } if (pattern == nullptr) { CldrPatternStyle patternStyle; if (isCldrUnit) { patternStyle = CLDR_PATTERN_STYLE_DECIMAL; } else if (isPercent || isPermille) { patternStyle = CLDR_PATTERN_STYLE_PERCENT; } else if (!isCurrency || unitWidth == UNUM_UNIT_WIDTH_FULL_NAME) { patternStyle = CLDR_PATTERN_STYLE_DECIMAL; } else if (isAccounting) { // NOTE: Although ACCOUNTING and ACCOUNTING_ALWAYS are only supported in currencies right now, // the API contract allows us to add support to other units in the future. 
patternStyle = CLDR_PATTERN_STYLE_ACCOUNTING; } else { patternStyle = CLDR_PATTERN_STYLE_CURRENCY; } pattern = utils::getPatternForStyle(macros.locale, nsName, patternStyle, status); if (U_FAILURE(status)) { return nullptr; } } auto patternInfo = new ParsedPatternInfo(); if (patternInfo == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; return nullptr; } fPatternInfo.adoptInstead(patternInfo); PatternParser::parseToPatternInfo(UnicodeString(pattern), *patternInfo, status); if (U_FAILURE(status)) { return nullptr; } ///////////////////////////////////////////////////////////////////////////////////// /// START POPULATING THE DEFAULT MICROPROPS AND BUILDING THE MICROPROPS GENERATOR /// ///////////////////////////////////////////////////////////////////////////////////// // Unit Preferences and Conversions as our first step if (macros.usage.isSet()) { if (!isCldrUnit) { // We only support "usage" when the input unit is specified, and is // a CLDR Unit. status = U_ILLEGAL_ARGUMENT_ERROR; return nullptr; } auto usagePrefsHandler = new UsagePrefsHandler(macros.locale, macros.unit, macros.usage.fValue, chain, status); fUsagePrefsHandler.adoptInsteadAndCheckErrorCode(usagePrefsHandler, status); chain = fUsagePrefsHandler.getAlias(); } else if (isMixedUnit) { auto unitConversionHandler = new UnitConversionHandler(macros.unit, chain, status); fUnitConversionHandler.adoptInsteadAndCheckErrorCode(unitConversionHandler, status); chain = fUnitConversionHandler.getAlias(); } // Multiplier if (macros.scale.isValid()) { fMicros.helpers.multiplier.setAndChain(macros.scale, chain); chain = &fMicros.helpers.multiplier; } // Rounding strategy Precision precision; if (!macros.precision.isBogus()) { precision = macros.precision; } else if (isCompactNotation) { precision = Precision::integer().withMinDigits(2); } else if (isCurrency) { precision = Precision::currency(UCURR_USAGE_STANDARD); } else if (macros.usage.isSet()) { // Bogus Precision - it will get set in the UsagePrefsHandler 
instead precision = Precision(); } else { precision = Precision::maxFraction(6); } UNumberFormatRoundingMode roundingMode; roundingMode = macros.roundingMode; fMicros.rounder = {precision, roundingMode, currency, status}; if (U_FAILURE(status)) { return nullptr; } // Grouping strategy if (!macros.grouper.isBogus()) { fMicros.simple.grouping = macros.grouper; } else if (isCompactNotation) { // Compact notation uses minGrouping by default since ICU 59 fMicros.simple.grouping = Grouper::forStrategy(UNUM_GROUPING_MIN2); } else { fMicros.simple.grouping = Grouper::forStrategy(UNUM_GROUPING_AUTO); } fMicros.simple.grouping.setLocaleData(*fPatternInfo, macros.locale); // Padding strategy if (!macros.padder.isBogus()) { fMicros.padding = macros.padder; } else { fMicros.padding = Padder::none(); } // Integer width if (!macros.integerWidth.isBogus()) { fMicros.integerWidth = macros.integerWidth; } else { fMicros.integerWidth = IntegerWidth::standard(); } // Sign display if (macros.sign != UNUM_SIGN_COUNT) { fMicros.sign = macros.sign; } else { fMicros.sign = UNUM_SIGN_AUTO; } // Decimal mark display if (macros.decimal != UNUM_DECIMAL_SEPARATOR_COUNT) { fMicros.simple.decimal = macros.decimal; } else { fMicros.simple.decimal = UNUM_DECIMAL_SEPARATOR_AUTO; } // Use monetary separator symbols fMicros.simple.useCurrency = isCurrency; // Inner modifier (scientific notation) if (macros.notation.fType == Notation::NTN_SCIENTIFIC) { auto newScientificHandler = new ScientificHandler(¯os.notation, fMicros.simple.symbols, chain); if (newScientificHandler == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; return nullptr; } fScientificHandler.adoptInstead(newScientificHandler); chain = fScientificHandler.getAlias(); } else { // No inner modifier required fMicros.modInner = &fMicros.helpers.emptyStrongModifier; } // Middle modifier (patterns, positive/negative, currency symbols, percent) auto patternModifier = new MutablePatternModifier(false); if (patternModifier == nullptr) { status = 
U_MEMORY_ALLOCATION_ERROR; return nullptr; } fPatternModifier.adoptInstead(patternModifier); const AffixPatternProvider* affixProvider = macros.affixProvider != nullptr && ( // For more information on this condition, see ICU-22073 !isCompactNotation || isCurrency == macros.affixProvider->hasCurrencySign()) ? macros.affixProvider : static_cast(fPatternInfo.getAlias()); patternModifier->setPatternInfo(affixProvider, kUndefinedField); patternModifier->setPatternAttributes(fMicros.sign, isPermille, macros.approximately); if (patternModifier->needsPlurals()) { patternModifier->setSymbols( fMicros.simple.symbols, currency, unitWidth, resolvePluralRules(macros.rules, macros.locale, status), status); } else { patternModifier->setSymbols(fMicros.simple.symbols, currency, unitWidth, nullptr, status); } if (safe) { fImmutablePatternModifier.adoptInsteadAndCheckErrorCode(patternModifier->createImmutable(status), status); } if (U_FAILURE(status)) { return nullptr; } // currencyAsDecimal if (affixProvider->currencyAsDecimal()) { fMicros.simple.currencyAsDecimal = patternModifier->getCurrencySymbolForUnitWidth(status); } // Outer modifier (CLDR units and currency long names) if (isCldrUnit) { const char *unitDisplayCase = ""; if (macros.unitDisplayCase.isSet()) { unitDisplayCase = macros.unitDisplayCase.fValue; } if (macros.usage.isSet()) { fLongNameMultiplexer.adoptInsteadAndCheckErrorCode( LongNameMultiplexer::forMeasureUnits( macros.locale, *fUsagePrefsHandler->getOutputUnits(), unitWidth, unitDisplayCase, resolvePluralRules(macros.rules, macros.locale, status), chain, status), status); chain = fLongNameMultiplexer.getAlias(); } else if (isMixedUnit) { fMixedUnitLongNameHandler.adoptInsteadAndCheckErrorCode(new MixedUnitLongNameHandler(), status); MixedUnitLongNameHandler::forMeasureUnit( macros.locale, macros.unit, unitWidth, unitDisplayCase, resolvePluralRules(macros.rules, macros.locale, status), chain, fMixedUnitLongNameHandler.getAlias(), status); chain = 
fMixedUnitLongNameHandler.getAlias(); } else { MeasureUnit unit = macros.unit; if (!utils::unitIsBaseUnit(macros.perUnit)) { unit = unit.product(macros.perUnit.reciprocal(status), status); // This isn't strictly necessary, but was what we specced out // when perUnit became a backward-compatibility thing: // unit/perUnit use case is only valid if both units are // built-ins, or the product is a built-in. if (uprv_strcmp(unit.getType(), "") == 0 && (uprv_strcmp(macros.unit.getType(), "") == 0 || uprv_strcmp(macros.perUnit.getType(), "") == 0)) { status = U_UNSUPPORTED_ERROR; return nullptr; } } fLongNameHandler.adoptInsteadAndCheckErrorCode(new LongNameHandler(), status); LongNameHandler::forMeasureUnit(macros.locale, unit, unitWidth, unitDisplayCase, resolvePluralRules(macros.rules, macros.locale, status), chain, fLongNameHandler.getAlias(), status); chain = fLongNameHandler.getAlias(); } } else if (isCurrency && unitWidth == UNUM_UNIT_WIDTH_FULL_NAME) { fLongNameHandler.adoptInsteadAndCheckErrorCode( LongNameHandler::forCurrencyLongNames( macros.locale, currency, resolvePluralRules(macros.rules, macros.locale, status), chain, status), status); chain = fLongNameHandler.getAlias(); } else { // No outer modifier required fMicros.modOuter = &fMicros.helpers.emptyWeakModifier; } if (U_FAILURE(status)) { return nullptr; } // Compact notation if (isCompactNotation) { CompactType compactType = (isCurrency && unitWidth != UNUM_UNIT_WIDTH_FULL_NAME) ? 
CompactType::TYPE_CURRENCY : CompactType::TYPE_DECIMAL; auto newCompactHandler = new CompactHandler( macros.notation.fUnion.compactStyle, macros.locale, nsName, compactType, resolvePluralRules(macros.rules, macros.locale, status), patternModifier, safe, chain, status); if (U_FAILURE(status)) { return nullptr; } if (newCompactHandler == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; return nullptr; } fCompactHandler.adoptInstead(newCompactHandler); chain = fCompactHandler.getAlias(); } if (U_FAILURE(status)) { return nullptr; } // Always add the pattern modifier as the last element of the chain. if (safe) { fImmutablePatternModifier->addToChain(chain); chain = fImmutablePatternModifier.getAlias(); } else { patternModifier->addToChain(chain); chain = patternModifier; } return chain; } const PluralRules* NumberFormatterImpl::resolvePluralRules( const PluralRules* rulesPtr, const Locale& locale, UErrorCode& status) { if (rulesPtr != nullptr) { return rulesPtr; } // Lazily create PluralRules if (fRules.isNull()) { fRules.adoptInstead(PluralRules::forLocale(locale, status)); } return fRules.getAlias(); } int32_t NumberFormatterImpl::writeAffixes( const MicroProps& micros, FormattedStringBuilder& string, int32_t start, int32_t end, UErrorCode& status) { U_ASSERT(micros.modOuter != nullptr); // Always apply the inner modifier (which is "strong"). 
int32_t length = micros.modInner->apply(string, start, end, status); if (micros.padding.isValid()) { length += micros.padding .padAndApply(*micros.modMiddle, *micros.modOuter, string, start, length + end, status); } else { length += micros.modMiddle->apply(string, start, length + end, status); length += micros.modOuter->apply(string, start, length + end, status); } return length; } int32_t NumberFormatterImpl::writeNumber( const SimpleMicroProps& micros, DecimalQuantity& quantity, FormattedStringBuilder& string, int32_t index, UErrorCode& status) { int32_t length = 0; if (quantity.isInfinite()) { length += string.insert( length + index, micros.symbols->getSymbol(DecimalFormatSymbols::ENumberFormatSymbol::kInfinitySymbol), {UFIELD_CATEGORY_NUMBER, UNUM_INTEGER_FIELD}, status); } else if (quantity.isNaN()) { length += string.insert( length + index, micros.symbols->getSymbol(DecimalFormatSymbols::ENumberFormatSymbol::kNaNSymbol), {UFIELD_CATEGORY_NUMBER, UNUM_INTEGER_FIELD}, status); } else { // Add the integer digits length += writeIntegerDigits( micros, quantity, string, length + index, status); // Add the decimal point if (quantity.getLowerDisplayMagnitude() < 0 || micros.decimal == UNUM_DECIMAL_SEPARATOR_ALWAYS) { if (!micros.currencyAsDecimal.isBogus()) { length += string.insert( length + index, micros.currencyAsDecimal, {UFIELD_CATEGORY_NUMBER, UNUM_CURRENCY_FIELD}, status); } else if (micros.useCurrency) { length += string.insert( length + index, micros.symbols->getSymbol( DecimalFormatSymbols::ENumberFormatSymbol::kMonetarySeparatorSymbol), {UFIELD_CATEGORY_NUMBER, UNUM_DECIMAL_SEPARATOR_FIELD}, status); } else { length += string.insert( length + index, micros.symbols->getSymbol( DecimalFormatSymbols::ENumberFormatSymbol::kDecimalSeparatorSymbol), {UFIELD_CATEGORY_NUMBER, UNUM_DECIMAL_SEPARATOR_FIELD}, status); } } // Add the fraction digits length += writeFractionDigits(micros, quantity, string, length + index, status); if (length == 0) { // Force output of 
the digit for value 0 length += utils::insertDigitFromSymbols( string, index, 0, *micros.symbols, {UFIELD_CATEGORY_NUMBER, UNUM_INTEGER_FIELD}, status); } } return length; } int32_t NumberFormatterImpl::writeIntegerDigits( const SimpleMicroProps& micros, DecimalQuantity& quantity, FormattedStringBuilder& string, int32_t index, UErrorCode& status) { int length = 0; int integerCount = quantity.getUpperDisplayMagnitude() + 1; for (int i = 0; i < integerCount; i++) { // Add grouping separator if (micros.grouping.groupAtPosition(i, quantity)) { length += string.insert( index, micros.useCurrency ? micros.symbols->getSymbol( DecimalFormatSymbols::ENumberFormatSymbol::kMonetaryGroupingSeparatorSymbol) : micros.symbols->getSymbol( DecimalFormatSymbols::ENumberFormatSymbol::kGroupingSeparatorSymbol), {UFIELD_CATEGORY_NUMBER, UNUM_GROUPING_SEPARATOR_FIELD}, status); } // Get and append the next digit value int8_t nextDigit = quantity.getDigit(i); length += utils::insertDigitFromSymbols( string, index, nextDigit, *micros.symbols, {UFIELD_CATEGORY_NUMBER, UNUM_INTEGER_FIELD}, status); } return length; } int32_t NumberFormatterImpl::writeFractionDigits( const SimpleMicroProps& micros, DecimalQuantity& quantity, FormattedStringBuilder& string, int32_t index, UErrorCode& status) { int length = 0; int fractionCount = -quantity.getLowerDisplayMagnitude(); for (int i = 0; i < fractionCount; i++) { // Get and append the next digit value int8_t nextDigit = quantity.getDigit(-i - 1); length += utils::insertDigitFromSymbols( string, length + index, nextDigit, *micros.symbols, {UFIELD_CATEGORY_NUMBER, UNUM_FRACTION_FIELD}, status); } return length; } #endif /* #if !UCONFIG_NO_FORMATTING */ stringi/src/icu74/i18n/number_scientific.h0000644000176200001440000000343614700200761020076 0ustar liggesusers// © 2017 and later: Unicode, Inc. and others. 
// License & terms of use: http://www.unicode.org/copyright.html #include "unicode/utypes.h" #if !UCONFIG_NO_FORMATTING #ifndef __NUMBER_SCIENTIFIC_H__ #define __NUMBER_SCIENTIFIC_H__ #include "number_types.h" U_NAMESPACE_BEGIN namespace number { namespace impl { // Forward-declare class ScientificHandler; class U_I18N_API ScientificModifier : public UMemory, public Modifier { public: ScientificModifier(); void set(int32_t exponent, const ScientificHandler *handler); int32_t apply(FormattedStringBuilder &output, int32_t leftIndex, int32_t rightIndex, UErrorCode &status) const override; int32_t getPrefixLength() const override; int32_t getCodePointCount() const override; bool isStrong() const override; bool containsField(Field field) const override; void getParameters(Parameters& output) const override; bool semanticallyEquivalent(const Modifier& other) const override; private: int32_t fExponent; const ScientificHandler *fHandler; }; class ScientificHandler : public UMemory, public MicroPropsGenerator, public MultiplierProducer { public: ScientificHandler(const Notation *notation, const DecimalFormatSymbols *symbols, const MicroPropsGenerator *parent); void processQuantity(DecimalQuantity &quantity, MicroProps µs, UErrorCode &status) const override; int32_t getMultiplier(int32_t magnitude) const override; private: const Notation::ScientificSettings fSettings; const DecimalFormatSymbols *fSymbols; const MicroPropsGenerator *fParent; friend class ScientificModifier; }; } // namespace impl } // namespace number U_NAMESPACE_END #endif //__NUMBER_SCIENTIFIC_H__ #endif /* #if !UCONFIG_NO_FORMATTING */ stringi/src/icu74/i18n/olsontz.h0000644000176200001440000003632514700200761016121 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ********************************************************************** * Copyright (c) 2003-2013, International Business Machines * Corporation and others. 
All Rights Reserved. ********************************************************************** * Author: Alan Liu * Created: July 21 2003 * Since: ICU 2.8 ********************************************************************** */ #ifndef OLSONTZ_H #define OLSONTZ_H #include "unicode/utypes.h" #if !UCONFIG_NO_FORMATTING #include "unicode/basictz.h" #include "umutex.h" struct UResourceBundle; U_NAMESPACE_BEGIN class SimpleTimeZone; /** * A time zone based on the Olson tz database. Olson time zones change * behavior over time. The raw offset, rules, presence or absence of * daylight savings time, and even the daylight savings amount can all * vary. * * This class uses a resource bundle named "zoneinfo". Zoneinfo is a * table containing different kinds of resources. In several places, * zones are referred to using integers. A zone's integer is a number * from 0..n-1, where n is the number of zones, with the zones sorted * in lexicographic order. * * 1. Zones. These have keys corresponding to the Olson IDs, e.g., * "Asia/Shanghai". Each resource describes the behavior of the given * zone. Zones come in two different formats. * * a. Zone (table). A zone is a table resource contains several * type of resources below: * * - typeOffsets:intvector (Required) * * Sets of UTC raw/dst offset pairs in seconds. Entries at * 2n represents raw offset and 2n+1 represents dst offset * paired with the raw offset at 2n. The very first pair represents * the initial zone offset (before the first transition) always. * * - trans:intvector (Optional) * * List of transition times represented by 32bit seconds from the * epoch (1970-01-01T00:00Z) in ascending order. * * - transPre32/transPost32:intvector (Optional) * * List of transition times before/after 32bit minimum seconds. * Each time is represented by a pair of 32bit integer. 
* * - typeMap:bin (Optional) * * Array of bytes representing the mapping between each transition * time (transPre32/trans/transPost32) and its corresponding offset * data (typeOffsets). * * - finalRule:string (Optional) * * If a recurrent transition rule is applicable to a zone forever * after the final transition time, finalRule represents the rule * in Rules data. * * - finalRaw:int (Optional) * * When finalRule is available, finalRaw is required and specifies * the raw (base) offset of the rule. * * - finalYear:int (Optional) * * When finalRule is available, finalYear is required and specifies * the start year of the rule. * * - links:intvector (Optional) * * When this zone data is shared with other zones, links specifies * all zones including the zone itself. Each zone is referenced by * integer index. * * b. Link (int, length 1). A link zone is an int resource. The * integer is the zone number of the target zone. The key of this * resource is an alternate name for the target zone. This data * is corresponding to Link data in the tz database. * * * 2. Rules. These have keys corresponding to the Olson rule IDs, * with an underscore prepended, e.g., "_EU". Each resource describes * the behavior of the given rule using an intvector, containing the * onset list, the cessation list, and the DST savings. The onset and * cessation lists consist of the month, dowim, dow, time, and time * mode. The end result is that the 11 integers describing the rule * can be passed directly into the SimpleTimeZone 13-argument * constructor (the other two arguments will be the raw offset, taken * from the complex zone element 5, and the ID string, which is not * used), with the times and the DST savings multiplied by 1000 to * scale from seconds to milliseconds. * * 3. Regions. An array specifies mapping between zones and regions. * Each item is either a 2-letter ISO country code or "001" * (UN M.49 - World). This data is generated from "zone.tab" * in the tz database. 
*/ class U_I18N_API OlsonTimeZone: public BasicTimeZone { public: /** * Construct from a resource bundle. * @param top the top-level zoneinfo resource bundle. This is used * to lookup the rule that `res' may refer to, if there is one. * @param res the resource bundle of the zone to be constructed * @param tzid the time zone ID * @param ec input-output error code */ OlsonTimeZone(const UResourceBundle* top, const UResourceBundle* res, const UnicodeString& tzid, UErrorCode& ec); /** * Copy constructor */ OlsonTimeZone(const OlsonTimeZone& other); /** * Destructor */ virtual ~OlsonTimeZone(); /** * Assignment operator */ OlsonTimeZone& operator=(const OlsonTimeZone& other); /** * Returns true if the two TimeZone objects are equal. */ virtual bool operator==(const TimeZone& other) const override; /** * TimeZone API. */ virtual OlsonTimeZone* clone() const override; /** * TimeZone API. */ static UClassID U_EXPORT2 getStaticClassID(); /** * TimeZone API. */ virtual UClassID getDynamicClassID() const override; /** * TimeZone API. Do not call this; prefer getOffset(UDate,...). */ virtual int32_t getOffset(uint8_t era, int32_t year, int32_t month, int32_t day, uint8_t dayOfWeek, int32_t millis, UErrorCode& ec) const override; /** * TimeZone API. Do not call this; prefer getOffset(UDate,...). */ virtual int32_t getOffset(uint8_t era, int32_t year, int32_t month, int32_t day, uint8_t dayOfWeek, int32_t millis, int32_t monthLength, UErrorCode& ec) const override; /** * TimeZone API. */ virtual void getOffset(UDate date, UBool local, int32_t& rawOffset, int32_t& dstOffset, UErrorCode& ec) const override; /** * BasicTimeZone API. */ virtual void getOffsetFromLocal( UDate date, UTimeZoneLocalOption nonExistingTimeOpt, UTimeZoneLocalOption duplicatedTimeOpt, int32_t& rawOffset, int32_t& dstOffset, UErrorCode& status) const override; /** * TimeZone API. This method has no effect since objects of this * class are quasi-immutable (the base class allows the ID to be * changed). 
*/ virtual void setRawOffset(int32_t offsetMillis) override; /** * TimeZone API. For a historical zone, the raw offset can change * over time, so this API is not useful. In order to approximate * expected behavior, this method returns the raw offset for the * current moment in time. */ virtual int32_t getRawOffset() const override; /** * TimeZone API. For a historical zone, whether DST is used or * not varies over time. In order to approximate expected * behavior, this method returns true if DST is observed at any * point in the current year. */ virtual UBool useDaylightTime() const override; /** * TimeZone API. */ virtual UBool inDaylightTime(UDate date, UErrorCode& ec) const override; /** * TimeZone API. */ virtual int32_t getDSTSavings() const override; /** * TimeZone API. Also comare historic transitions. */ virtual UBool hasSameRules(const TimeZone& other) const override; /** * BasicTimeZone API. * Gets the first time zone transition after the base time. * @param base The base time. * @param inclusive Whether the base time is inclusive or not. * @param result Receives the first transition after the base time. * @return true if the transition is found. */ virtual UBool getNextTransition(UDate base, UBool inclusive, TimeZoneTransition& result) const override; /** * BasicTimeZone API. * Gets the most recent time zone transition before the base time. * @param base The base time. * @param inclusive Whether the base time is inclusive or not. * @param result Receives the most recent transition before the base time. * @return true if the transition is found. */ virtual UBool getPreviousTransition(UDate base, UBool inclusive, TimeZoneTransition& result) const override; /** * BasicTimeZone API. * Returns the number of TimeZoneRules which represents time transitions, * for this time zone, that is, all TimeZoneRules for this time zone except * InitialTimeZoneRule. The return value range is 0 or any positive value. * @param status Receives error status code. 
* @return The number of TimeZoneRules representing time transitions. */ virtual int32_t countTransitionRules(UErrorCode& status) const override; /** * Gets the InitialTimeZoneRule and the set of TimeZoneRule * which represent time transitions for this time zone. On successful return, * the argument initial points to non-nullptr InitialTimeZoneRule and * the array trsrules is filled with 0 or multiple TimeZoneRule * instances up to the size specified by trscount. The results are referencing the * rule instance held by this time zone instance. Therefore, after this time zone * is destructed, they are no longer available. * @param initial Receives the initial timezone rule * @param trsrules Receives the timezone transition rules * @param trscount On input, specify the size of the array 'transitions' receiving * the timezone transition rules. On output, actual number of * rules filled in the array will be set. * @param status Receives error status code. */ virtual void getTimeZoneRules(const InitialTimeZoneRule*& initial, const TimeZoneRule* trsrules[], int32_t& trscount, UErrorCode& status) const override; /** * Internal API returning the canonical ID of this zone. * This ID won't be affected by setID(). */ const char16_t *getCanonicalID() const; private: /** * Default constructor. Creates a time zone with an empty ID and * a fixed GMT offset of zero. */ OlsonTimeZone(); private: void constructEmpty(); void getHistoricalOffset(UDate date, UBool local, int32_t NonExistingTimeOpt, int32_t DuplicatedTimeOpt, int32_t& rawoff, int32_t& dstoff) const; int16_t transitionCount() const; int64_t transitionTimeInSeconds(int16_t transIdx) const; double transitionTime(int16_t transIdx) const; /* * Following 3 methods return an offset at the given transition time index. * When the index is negative, return the initial offset. 
*/ int32_t zoneOffsetAt(int16_t transIdx) const; int32_t rawOffsetAt(int16_t transIdx) const; int32_t dstOffsetAt(int16_t transIdx) const; /* * Following methods return the initial offset. */ int32_t initialRawOffset() const; int32_t initialDstOffset() const; /** * Number of transitions in each time range */ int16_t transitionCountPre32; int16_t transitionCount32; int16_t transitionCountPost32; /** * Time of each transition in seconds from 1970 epoch before 32bit second range (<= 1900). * Each transition in this range is represented by a pair of int32_t. * Length is transitionCount int32_t's. nullptr if no transitions in this range. */ const int32_t *transitionTimesPre32; // alias into res; do not delete /** * Time of each transition in seconds from 1970 epoch in 32bit second range. * Length is transitionCount int32_t's. nullptr if no transitions in this range. */ const int32_t *transitionTimes32; // alias into res; do not delete /** * Time of each transition in seconds from 1970 epoch after 32bit second range (>= 2038). * Each transition in this range is represented by a pair of int32_t. * Length is transitionCount int32_t's. nullptr if no transitions in this range. */ const int32_t *transitionTimesPost32; // alias into res; do not delete /** * Number of types, 1..255 */ int16_t typeCount; /** * Offset from GMT in seconds for each type. * Length is typeCount int32_t's. At least one type (a pair of int32_t) * is required. */ const int32_t *typeOffsets; // alias into res; do not delete /** * Type description data, consisting of transitionCount uint8_t * type indices (from 0..typeCount-1). * Length is transitionCount int16_t's. nullptr if no transitions. */ const uint8_t *typeMapData; // alias into res; do not delete /** * A SimpleTimeZone that governs the behavior for date >= finalMillis. */ SimpleTimeZone *finalZone; // owned, may be nullptr /** * For date >= finalMillis, the finalZone will be used. 
*/ double finalStartMillis; /** * For year >= finalYear, the finalZone will be used. */ int32_t finalStartYear; /* * Canonical (CLDR) ID of this zone */ const char16_t *canonicalID; /* BasicTimeZone support */ void clearTransitionRules(); void deleteTransitionRules(); void checkTransitionRules(UErrorCode& status) const; public: // Internal, for access from plain C code void initTransitionRules(UErrorCode& status); private: InitialTimeZoneRule *initialRule; TimeZoneTransition *firstTZTransition; int16_t firstTZTransitionIdx; TimeZoneTransition *firstFinalTZTransition; TimeArrayTimeZoneRule **historicRules; int16_t historicRuleCount; SimpleTimeZone *finalZoneWithStartYear; // hack UInitOnce transitionRulesInitOnce {}; }; inline int16_t OlsonTimeZone::transitionCount() const { return transitionCountPre32 + transitionCount32 + transitionCountPost32; } inline double OlsonTimeZone::transitionTime(int16_t transIdx) const { return (double)transitionTimeInSeconds(transIdx) * U_MILLIS_PER_SECOND; } inline int32_t OlsonTimeZone::zoneOffsetAt(int16_t transIdx) const { int16_t typeIdx = (transIdx >= 0 ? typeMapData[transIdx] : 0) << 1; return typeOffsets[typeIdx] + typeOffsets[typeIdx + 1]; } inline int32_t OlsonTimeZone::rawOffsetAt(int16_t transIdx) const { int16_t typeIdx = (transIdx >= 0 ? typeMapData[transIdx] : 0) << 1; return typeOffsets[typeIdx]; } inline int32_t OlsonTimeZone::dstOffsetAt(int16_t transIdx) const { int16_t typeIdx = (transIdx >= 0 ? typeMapData[transIdx] : 0) << 1; return typeOffsets[typeIdx + 1]; } inline int32_t OlsonTimeZone::initialRawOffset() const { return typeOffsets[0]; } inline int32_t OlsonTimeZone::initialDstOffset() const { return typeOffsets[1]; } inline const char16_t* OlsonTimeZone::getCanonicalID() const { return canonicalID; } U_NAMESPACE_END #endif // !UCONFIG_NO_FORMATTING #endif // OLSONTZ_H //eof stringi/src/icu74/i18n/csrsbcs.cpp0000644000176200001440000015651414700200761016411 0ustar liggesusers// © 2016 and later: Unicode, Inc. 
and others. // License & terms of use: http://www.unicode.org/copyright.html /* ********************************************************************** * Copyright (C) 2005-2016, International Business Machines * Corporation and others. All Rights Reserved. ********************************************************************** */ #include "unicode/utypes.h" #include "cmemory.h" #if !UCONFIG_NO_CONVERSION #include "csrsbcs.h" #include "csmatch.h" #define N_GRAM_SIZE 3 #define N_GRAM_MASK 0xFFFFFF U_NAMESPACE_BEGIN NGramParser::NGramParser(const int32_t *theNgramList, const uint8_t *theCharMap) : ngram(0), byteIndex(0) { ngramList = theNgramList; charMap = theCharMap; ngramCount = hitCount = 0; } NGramParser::~NGramParser() { } /* * Binary search for value in table, which must have exactly 64 entries. */ int32_t NGramParser::search(const int32_t *table, int32_t value) { int32_t index = 0; if (table[index + 32] <= value) { index += 32; } if (table[index + 16] <= value) { index += 16; } if (table[index + 8] <= value) { index += 8; } if (table[index + 4] <= value) { index += 4; } if (table[index + 2] <= value) { index += 2; } if (table[index + 1] <= value) { index += 1; } if (table[index] > value) { index -= 1; } if (index < 0 || table[index] != value) { return -1; } return index; } void NGramParser::lookup(int32_t thisNgram) { ngramCount += 1; if (search(ngramList, thisNgram) >= 0) { hitCount += 1; } } void NGramParser::addByte(int32_t b) { ngram = ((ngram << 8) + b) & N_GRAM_MASK; lookup(ngram); } int32_t NGramParser::nextByte(InputText *det) { if (byteIndex >= det->fInputLen) { return -1; } return det->fInputBytes[byteIndex++]; } void NGramParser::parseCharacters(InputText *det) { int32_t b; bool ignoreSpace = false; while ((b = nextByte(det)) >= 0) { uint8_t mb = charMap[b]; // TODO: 0x20 might not be a space in all character sets... 
if (mb != 0) { if (!(mb == 0x20 && ignoreSpace)) { addByte(mb); } ignoreSpace = (mb == 0x20); } } } int32_t NGramParser::parse(InputText *det) { parseCharacters(det); // TODO: Is this OK? The buffer could have ended in the middle of a word... addByte(0x20); double rawPercent = (double) hitCount / (double) ngramCount; // if (rawPercent <= 2.0) { // return 0; // } // TODO - This is a bit of a hack to take care of a case // were we were getting a confidence of 135... if (rawPercent > 0.33) { return 98; } return (int32_t) (rawPercent * 300.0); } #if !UCONFIG_ONLY_HTML_CONVERSION static const uint8_t unshapeMap_IBM420[] = { /* -0 -1 -2 -3 -4 -5 -6 -7 -8 -9 -A -B -C -D -E -F */ /* 0- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* 1- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* 2- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* 3- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* 4- */ 0x40, 0x40, 0x42, 0x42, 0x44, 0x45, 0x46, 0x47, 0x47, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F, /* 5- */ 0x50, 0x49, 0x52, 0x53, 0x54, 0x55, 0x56, 0x56, 0x58, 0x58, 0x5A, 0x5B, 0x5C, 0x5D, 0x5E, 0x5F, /* 6- */ 0x60, 0x61, 0x62, 0x63, 0x63, 0x65, 0x65, 0x67, 0x67, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, /* 7- */ 0x69, 0x71, 0x71, 0x73, 0x74, 0x75, 0x76, 0x77, 0x77, 0x79, 0x7A, 0x7B, 0x7C, 0x7D, 0x7E, 0x7F, /* 8- */ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x80, 0x8B, 0x8B, 0x8D, 0x8D, 0x8F, /* 9- */ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9A, 0x9A, 0x9A, 0x9A, 0x9E, 0x9E, /* A- */ 0x9E, 0xA1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0x9E, 0xAB, 0xAB, 0xAD, 0xAD, 0xAF, /* B- */ 0xAF, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0xB6, 0xB7, 0xB8, 0xB9, 0xB1, 0xBB, 0xBB, 0xBD, 0xBD, 0xBF, /* C- */ 0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, 
0xC8, 0xC9, 0xCA, 0xBF, 0xCC, 0xBF, 0xCE, 0xCF, /* D- */ 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7, 0xD8, 0xD9, 0xDA, 0xDA, 0xDC, 0xDC, 0xDC, 0xDF, /* E- */ 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF, /* F- */ 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF, }; NGramParser_IBM420::NGramParser_IBM420(const int32_t *theNgramList, const uint8_t *theCharMap):NGramParser(theNgramList, theCharMap) { alef = 0x00; } NGramParser_IBM420::~NGramParser_IBM420() {} int32_t NGramParser_IBM420::isLamAlef(int32_t b) { if(b == 0xB2 || b == 0xB3){ return 0x47; }else if(b == 0xB4 || b == 0xB5){ return 0x49; }else if(b == 0xB8 || b == 0xB9){ return 0x56; }else return 0x00; } /* * Arabic shaping needs to be done manually. Cannot call ArabicShaping class * because CharsetDetector is dealing with bytes not Unicode code points. We could * convert the bytes to Unicode code points but that would leave us dependent * on CharsetICU which we try to avoid. IBM420 converter amongst different versions * of JDK can produce different results and therefore is also avoided. */ int32_t NGramParser_IBM420::nextByte(InputText *det) { if (byteIndex >= det->fInputLen || det->fInputBytes[byteIndex] == 0) { return -1; } int next; alef = isLamAlef(det->fInputBytes[byteIndex]); if(alef != 0x00) next = 0xB1 & 0xFF; else next = unshapeMap_IBM420[det->fInputBytes[byteIndex]& 0xFF] & 0xFF; byteIndex++; return next; } void NGramParser_IBM420::parseCharacters(InputText *det) { int32_t b; bool ignoreSpace = false; while ((b = nextByte(det)) >= 0) { uint8_t mb = charMap[b]; // TODO: 0x20 might not be a space in all character sets... if (mb != 0) { if (!(mb == 0x20 && ignoreSpace)) { addByte(mb); } ignoreSpace = (mb == 0x20); } if(alef != 0x00){ mb = charMap[alef & 0xFF]; // TODO: 0x20 might not be a space in all character sets... 
if (mb != 0) { if (!(mb == 0x20 && ignoreSpace)) { addByte(mb); } ignoreSpace = (mb == 0x20); } } } } #endif CharsetRecog_sbcs::CharsetRecog_sbcs() { // nothing else to do } CharsetRecog_sbcs::~CharsetRecog_sbcs() { // nothing to do } int32_t CharsetRecog_sbcs::match_sbcs(InputText *det, const int32_t ngrams[], const uint8_t byteMap[]) const { NGramParser parser(ngrams, byteMap); int32_t result; result = parser.parse(det); return result; } static const uint8_t charMap_8859_1[] = { 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0xAA, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0xB5, 0x20, 0x20, 0x20, 0x20, 0xBA, 0x20, 0x20, 0x20, 0x20, 0x20, 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF, 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0x20, 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xDF, 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF, 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0x20, 0xF8, 0xF9, 0xFA, 0xFB, 
0xFC, 0xFD, 0xFE, 0xFF, }; static const uint8_t charMap_8859_2[] = { 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0xB1, 0x20, 0xB3, 0x20, 0xB5, 0xB6, 0x20, 0x20, 0xB9, 0xBA, 0xBB, 0xBC, 0x20, 0xBE, 0xBF, 0x20, 0xB1, 0x20, 0xB3, 0x20, 0xB5, 0xB6, 0xB7, 0x20, 0xB9, 0xBA, 0xBB, 0xBC, 0x20, 0xBE, 0xBF, 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF, 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0x20, 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xDF, 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF, 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0x20, 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0x20, }; static const uint8_t charMap_8859_5[] = { 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0x20, 0xFE, 0xFF, 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7, 0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF, 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF, 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7, 0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF, 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF, 0x20, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0x20, 0xFE, 0xFF, }; static const uint8_t charMap_8859_6[] = { 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF, 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7, 0xD8, 0xD9, 0xDA, 0x20, 0x20, 0x20, 0x20, 0x20, 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, }; static const uint8_t charMap_8859_7[] = { 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0xA1, 0xA2, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0xDC, 0x20, 0xDD, 0xDE, 0xDF, 0x20, 0xFC, 0x20, 0xFD, 0xFE, 0xC0, 0xE1, 0xE2, 0xE3, 
0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF, 0xF0, 0xF1, 0x20, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0xFA, 0xFB, 0xDC, 0xDD, 0xDE, 0xDF, 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF, 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0x20, }; static const uint8_t charMap_8859_8[] = { 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0xB5, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF, 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0xFA, 0x20, 0x20, 0x20, 0x20, 0x20, }; static const uint8_t charMap_8859_9[] = { 0x20, 0x20, 
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0xAA, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0xB5, 0x20, 0x20, 0x20, 0x20, 0xBA, 0x20, 0x20, 0x20, 0x20, 0x20, 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF, 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0x20, 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0x69, 0xFE, 0xDF, 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF, 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0x20, 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF, }; static const int32_t ngrams_windows_1251[] = { 0x20E220, 0x20E2EE, 0x20E4EE, 0x20E7E0, 0x20E820, 0x20EAE0, 0x20EAEE, 0x20EDE0, 0x20EDE5, 0x20EEE1, 0x20EFEE, 0x20EFF0, 0x20F0E0, 0x20F1EE, 0x20F1F2, 0x20F2EE, 0x20F7F2, 0x20FDF2, 0xE0EDE8, 0xE0F2FC, 0xE3EE20, 0xE5EBFC, 0xE5EDE8, 0xE5F1F2, 0xE5F220, 0xE820EF, 0xE8E520, 0xE8E820, 0xE8FF20, 0xEBE5ED, 0xEBE820, 0xEBFCED, 0xEDE020, 0xEDE520, 0xEDE8E5, 0xEDE8FF, 0xEDEE20, 0xEDEEE2, 0xEE20E2, 0xEE20EF, 0xEE20F1, 0xEEE220, 
0xEEE2E0, 0xEEE3EE, 0xEEE920, 0xEEEBFC, 0xEEEC20, 0xEEF1F2, 0xEFEEEB, 0xEFF0E5, 0xEFF0E8, 0xEFF0EE, 0xF0E0E2, 0xF0E5E4, 0xF1F2E0, 0xF1F2E2, 0xF1F2E8, 0xF1FF20, 0xF2E5EB, 0xF2EE20, 0xF2EEF0, 0xF2FC20, 0xF7F2EE, 0xFBF520, }; static const uint8_t charMap_windows_1251[] = { 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20, 0x90, 0x83, 0x20, 0x83, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x9A, 0x20, 0x9C, 0x9D, 0x9E, 0x9F, 0x90, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x9A, 0x20, 0x9C, 0x9D, 0x9E, 0x9F, 0x20, 0xA2, 0xA2, 0xBC, 0x20, 0xB4, 0x20, 0x20, 0xB8, 0x20, 0xBA, 0x20, 0x20, 0x20, 0x20, 0xBF, 0x20, 0x20, 0xB3, 0xB3, 0xB4, 0xB5, 0x20, 0x20, 0xB8, 0x20, 0xBA, 0x20, 0xBC, 0xBE, 0xBE, 0xBF, 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF, 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF, 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF, 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF, }; static const int32_t ngrams_windows_1256[] = { 0x20C7E1, 0x20C7E4, 0x20C8C7, 0x20DAE1, 0x20DDED, 0x20E1E1, 0x20E3E4, 0x20E6C7, 0xC720C7, 0xC7C120, 0xC7CA20, 0xC7D120, 0xC7E120, 0xC7E1C3, 
0xC7E1C7, 0xC7E1C8, 0xC7E1CA, 0xC7E1CC, 0xC7E1CD, 0xC7E1CF, 0xC7E1D3, 0xC7E1DA, 0xC7E1DE, 0xC7E1E3, 0xC7E1E6, 0xC7E1ED, 0xC7E320, 0xC7E420, 0xC7E4CA, 0xC820C7, 0xC920C7, 0xC920DD, 0xC920E1, 0xC920E3, 0xC920E6, 0xCA20C7, 0xCF20C7, 0xCFC920, 0xD120C7, 0xD1C920, 0xD320C7, 0xDA20C7, 0xDAE1EC, 0xDDED20, 0xE120C7, 0xE1C920, 0xE1EC20, 0xE1ED20, 0xE320C7, 0xE3C720, 0xE3C920, 0xE3E420, 0xE420C7, 0xE520C7, 0xE5C720, 0xE6C7E1, 0xE6E420, 0xEC20C7, 0xED20C7, 0xED20E3, 0xED20E6, 0xEDC920, 0xEDD120, 0xEDE420, }; static const uint8_t charMap_windows_1256[] = { 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x81, 0x20, 0x83, 0x20, 0x20, 0x20, 0x20, 0x88, 0x20, 0x8A, 0x20, 0x9C, 0x8D, 0x8E, 0x8F, 0x90, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x98, 0x20, 0x9A, 0x20, 0x9C, 0x20, 0x20, 0x9F, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0xAA, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0xB5, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF, 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0x20, 0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF, 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF, 0x20, 
0x20, 0x20, 0x20, 0xF4, 0x20, 0x20, 0x20, 0x20, 0xF9, 0x20, 0xFB, 0xFC, 0x20, 0x20, 0xFF, }; static const int32_t ngrams_KOI8_R[] = { 0x20C4CF, 0x20C920, 0x20CBC1, 0x20CBCF, 0x20CEC1, 0x20CEC5, 0x20CFC2, 0x20D0CF, 0x20D0D2, 0x20D2C1, 0x20D3CF, 0x20D3D4, 0x20D4CF, 0x20D720, 0x20D7CF, 0x20DAC1, 0x20DCD4, 0x20DED4, 0xC1CEC9, 0xC1D4D8, 0xC5CCD8, 0xC5CEC9, 0xC5D3D4, 0xC5D420, 0xC7CF20, 0xC920D0, 0xC9C520, 0xC9C920, 0xC9D120, 0xCCC5CE, 0xCCC920, 0xCCD8CE, 0xCEC120, 0xCEC520, 0xCEC9C5, 0xCEC9D1, 0xCECF20, 0xCECFD7, 0xCF20D0, 0xCF20D3, 0xCF20D7, 0xCFC7CF, 0xCFCA20, 0xCFCCD8, 0xCFCD20, 0xCFD3D4, 0xCFD720, 0xCFD7C1, 0xD0CFCC, 0xD0D2C5, 0xD0D2C9, 0xD0D2CF, 0xD2C1D7, 0xD2C5C4, 0xD3D120, 0xD3D4C1, 0xD3D4C9, 0xD3D4D7, 0xD4C5CC, 0xD4CF20, 0xD4CFD2, 0xD4D820, 0xD9C820, 0xDED4CF, }; static const uint8_t charMap_KOI8_R[] = { 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0xA3, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0xA3, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0xC0, 0xC1, 0xC2, 0xC3, 
0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF, 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7, 0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF, 0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF, 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7, 0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF, }; #if !UCONFIG_ONLY_HTML_CONVERSION static const int32_t ngrams_IBM424_he_rtl[] = { 0x404146, 0x404148, 0x404151, 0x404171, 0x404251, 0x404256, 0x404541, 0x404546, 0x404551, 0x404556, 0x404562, 0x404569, 0x404571, 0x405441, 0x405445, 0x405641, 0x406254, 0x406954, 0x417140, 0x454041, 0x454042, 0x454045, 0x454054, 0x454056, 0x454069, 0x454641, 0x464140, 0x465540, 0x465740, 0x466840, 0x467140, 0x514045, 0x514540, 0x514671, 0x515155, 0x515540, 0x515740, 0x516840, 0x517140, 0x544041, 0x544045, 0x544140, 0x544540, 0x554041, 0x554042, 0x554045, 0x554054, 0x554056, 0x554069, 0x564540, 0x574045, 0x584540, 0x585140, 0x585155, 0x625440, 0x684045, 0x685155, 0x695440, 0x714041, 0x714042, 0x714045, 0x714054, 0x714056, 0x714069, }; static const int32_t ngrams_IBM424_he_ltr[] = { 0x404146, 0x404154, 0x404551, 0x404554, 0x404556, 0x404558, 0x405158, 0x405462, 0x405469, 0x405546, 0x405551, 0x405746, 0x405751, 0x406846, 0x406851, 0x407141, 0x407146, 0x407151, 0x414045, 0x414054, 0x414055, 0x414071, 0x414540, 0x414645, 0x415440, 0x415640, 0x424045, 0x424055, 0x424071, 0x454045, 0x454051, 0x454054, 0x454055, 0x454057, 0x454068, 0x454071, 0x455440, 0x464140, 0x464540, 0x484140, 0x514140, 0x514240, 0x514540, 0x544045, 0x544055, 0x544071, 0x546240, 0x546940, 0x555151, 0x555158, 0x555168, 0x564045, 0x564055, 0x564071, 0x564240, 0x564540, 0x624540, 0x694045, 0x694055, 0x694071, 0x694540, 0x714140, 0x714540, 0x714651, }; static const uint8_t charMap_IBM424_he[] = { /* -0 -1 -2 -3 -4 -5 -6 -7 -8 -9 -A -B -C -D -E -F */ /* 0- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* 1- */ 0x40, 
0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* 2- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* 3- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* 4- */ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* 5- */ 0x40, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* 6- */ 0x40, 0x40, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* 7- */ 0x40, 0x71, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x00, 0x40, 0x40, /* 8- */ 0x40, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* 9- */ 0x40, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* A- */ 0xA0, 0x40, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* B- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* C- */ 0x40, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* D- */ 0x40, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* E- */ 0x40, 0x40, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* F- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, }; static const int32_t ngrams_IBM420_ar_rtl[] = { 0x4056B1, 0x4056BD, 0x405856, 0x409AB1, 0x40ABDC, 0x40B1B1, 0x40BBBD, 0x40CF56, 0x564056, 0x564640, 0x566340, 0x567540, 0x56B140, 0x56B149, 0x56B156, 0x56B158, 0x56B163, 0x56B167, 0x56B169, 0x56B173, 0x56B178, 0x56B19A, 0x56B1AD, 0x56B1BB, 0x56B1CF, 0x56B1DC, 0x56BB40, 0x56BD40, 0x56BD63, 0x584056, 0x624056, 0x6240AB, 0x6240B1, 0x6240BB, 0x6240CF, 0x634056, 0x734056, 0x736240, 
0x754056, 0x756240, 0x784056, 0x9A4056, 0x9AB1DA, 0xABDC40, 0xB14056, 0xB16240, 0xB1DA40, 0xB1DC40, 0xBB4056, 0xBB5640, 0xBB6240, 0xBBBD40, 0xBD4056, 0xBF4056, 0xBF5640, 0xCF56B1, 0xCFBD40, 0xDA4056, 0xDC4056, 0xDC40BB, 0xDC40CF, 0xDC6240, 0xDC7540, 0xDCBD40, }; static const int32_t ngrams_IBM420_ar_ltr[] = { 0x404656, 0x4056BB, 0x4056BF, 0x406273, 0x406275, 0x4062B1, 0x4062BB, 0x4062DC, 0x406356, 0x407556, 0x4075DC, 0x40B156, 0x40BB56, 0x40BD56, 0x40BDBB, 0x40BDCF, 0x40BDDC, 0x40DAB1, 0x40DCAB, 0x40DCB1, 0x49B156, 0x564056, 0x564058, 0x564062, 0x564063, 0x564073, 0x564075, 0x564078, 0x56409A, 0x5640B1, 0x5640BB, 0x5640BD, 0x5640BF, 0x5640DA, 0x5640DC, 0x565840, 0x56B156, 0x56CF40, 0x58B156, 0x63B156, 0x63BD56, 0x67B156, 0x69B156, 0x73B156, 0x78B156, 0x9AB156, 0xAB4062, 0xADB156, 0xB14062, 0xB15640, 0xB156CF, 0xB19A40, 0xB1B140, 0xBB4062, 0xBB40DC, 0xBBB156, 0xBD5640, 0xBDBB40, 0xCF4062, 0xCF40DC, 0xCFB156, 0xDAB19A, 0xDCAB40, 0xDCB156 }; static const uint8_t charMap_IBM420_ar[]= { /* -0 -1 -2 -3 -4 -5 -6 -7 -8 -9 -A -B -C -D -E -F */ /* 0- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* 1- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* 2- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* 3- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* 4- */ 0x40, 0x40, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* 5- */ 0x40, 0x51, 0x52, 0x40, 0x40, 0x55, 0x56, 0x57, 0x58, 0x59, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* 6- */ 0x40, 0x40, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* 7- */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* 8- */ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8A, 0x8B, 0x8C, 0x8D, 0x8E, 0x8F, /* 
9- */ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9A, 0x9B, 0x9C, 0x9D, 0x9E, 0x9F, /* A- */ 0xA0, 0x40, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0xAA, 0xAB, 0xAC, 0xAD, 0xAE, 0xAF, /* B- */ 0xB0, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0x40, 0x40, 0xB8, 0xB9, 0xBA, 0xBB, 0xBC, 0xBD, 0xBE, 0xBF, /* C- */ 0x40, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x40, 0xCB, 0x40, 0xCD, 0x40, 0xCF, /* D- */ 0x40, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF, /* E- */ 0x40, 0x40, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0xEA, 0xEB, 0x40, 0xED, 0xEE, 0xEF, /* F- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0xFB, 0xFC, 0xFD, 0xFE, 0x40, }; #endif //ISO-8859-1,2,5,6,7,8,9 Ngrams struct NGramsPlusLang { const int32_t ngrams[64]; const char * lang; }; static const NGramsPlusLang ngrams_8859_1[] = { { { 0x206120, 0x20616E, 0x206265, 0x20636F, 0x20666F, 0x206861, 0x206865, 0x20696E, 0x206D61, 0x206F66, 0x207072, 0x207265, 0x207361, 0x207374, 0x207468, 0x20746F, 0x207768, 0x616964, 0x616C20, 0x616E20, 0x616E64, 0x617320, 0x617420, 0x617465, 0x617469, 0x642061, 0x642074, 0x652061, 0x652073, 0x652074, 0x656420, 0x656E74, 0x657220, 0x657320, 0x666F72, 0x686174, 0x686520, 0x686572, 0x696420, 0x696E20, 0x696E67, 0x696F6E, 0x697320, 0x6E2061, 0x6E2074, 0x6E6420, 0x6E6720, 0x6E7420, 0x6F6620, 0x6F6E20, 0x6F7220, 0x726520, 0x727320, 0x732061, 0x732074, 0x736169, 0x737420, 0x742074, 0x746572, 0x746861, 0x746865, 0x74696F, 0x746F20, 0x747320, }, "en" }, { { 0x206166, 0x206174, 0x206465, 0x20656E, 0x206572, 0x20666F, 0x206861, 0x206920, 0x206D65, 0x206F67, 0x2070E5, 0x207369, 0x207374, 0x207469, 0x207669, 0x616620, 0x616E20, 0x616E64, 0x617220, 0x617420, 0x646520, 0x64656E, 0x646572, 0x646574, 0x652073, 0x656420, 0x656465, 0x656E20, 0x656E64, 0x657220, 0x657265, 0x657320, 0x657420, 0x666F72, 0x676520, 0x67656E, 0x676572, 0x696765, 0x696C20, 0x696E67, 0x6B6520, 0x6B6B65, 0x6C6572, 0x6C6967, 
0x6C6C65, 0x6D6564, 0x6E6465, 0x6E6520, 0x6E6720, 0x6E6765, 0x6F6720, 0x6F6D20, 0x6F7220, 0x70E520, 0x722064, 0x722065, 0x722073, 0x726520, 0x737465, 0x742073, 0x746520, 0x746572, 0x74696C, 0x766572, }, "da" }, { { 0x20616E, 0x206175, 0x206265, 0x206461, 0x206465, 0x206469, 0x206569, 0x206765, 0x206861, 0x20696E, 0x206D69, 0x207363, 0x207365, 0x20756E, 0x207665, 0x20766F, 0x207765, 0x207A75, 0x626572, 0x636820, 0x636865, 0x636874, 0x646173, 0x64656E, 0x646572, 0x646965, 0x652064, 0x652073, 0x65696E, 0x656974, 0x656E20, 0x657220, 0x657320, 0x67656E, 0x68656E, 0x687420, 0x696368, 0x696520, 0x696E20, 0x696E65, 0x697420, 0x6C6963, 0x6C6C65, 0x6E2061, 0x6E2064, 0x6E2073, 0x6E6420, 0x6E6465, 0x6E6520, 0x6E6720, 0x6E6765, 0x6E7465, 0x722064, 0x726465, 0x726569, 0x736368, 0x737465, 0x742064, 0x746520, 0x74656E, 0x746572, 0x756E64, 0x756E67, 0x766572, }, "de" }, { { 0x206120, 0x206361, 0x20636F, 0x206465, 0x20656C, 0x20656E, 0x206573, 0x20696E, 0x206C61, 0x206C6F, 0x207061, 0x20706F, 0x207072, 0x207175, 0x207265, 0x207365, 0x20756E, 0x207920, 0x612063, 0x612064, 0x612065, 0x61206C, 0x612070, 0x616369, 0x61646F, 0x616C20, 0x617220, 0x617320, 0x6369F3, 0x636F6E, 0x646520, 0x64656C, 0x646F20, 0x652064, 0x652065, 0x65206C, 0x656C20, 0x656E20, 0x656E74, 0x657320, 0x657374, 0x69656E, 0x69F36E, 0x6C6120, 0x6C6F73, 0x6E2065, 0x6E7465, 0x6F2064, 0x6F2065, 0x6F6E20, 0x6F7220, 0x6F7320, 0x706172, 0x717565, 0x726120, 0x726573, 0x732064, 0x732065, 0x732070, 0x736520, 0x746520, 0x746F20, 0x756520, 0xF36E20, }, "es" }, { { 0x206175, 0x20636F, 0x206461, 0x206465, 0x206475, 0x20656E, 0x206574, 0x206C61, 0x206C65, 0x207061, 0x20706F, 0x207072, 0x207175, 0x207365, 0x20736F, 0x20756E, 0x20E020, 0x616E74, 0x617469, 0x636520, 0x636F6E, 0x646520, 0x646573, 0x647520, 0x652061, 0x652063, 0x652064, 0x652065, 0x65206C, 0x652070, 0x652073, 0x656E20, 0x656E74, 0x657220, 0x657320, 0x657420, 0x657572, 0x696F6E, 0x697320, 0x697420, 0x6C6120, 0x6C6520, 0x6C6573, 0x6D656E, 0x6E2064, 0x6E6520, 0x6E7320, 
0x6E7420, 0x6F6E20, 0x6F6E74, 0x6F7572, 0x717565, 0x72206C, 0x726520, 0x732061, 0x732064, 0x732065, 0x73206C, 0x732070, 0x742064, 0x746520, 0x74696F, 0x756520, 0x757220, }, "fr" }, { { 0x20616C, 0x206368, 0x20636F, 0x206465, 0x206469, 0x206520, 0x20696C, 0x20696E, 0x206C61, 0x207065, 0x207072, 0x20756E, 0x612063, 0x612064, 0x612070, 0x612073, 0x61746F, 0x636865, 0x636F6E, 0x64656C, 0x646920, 0x652061, 0x652063, 0x652064, 0x652069, 0x65206C, 0x652070, 0x652073, 0x656C20, 0x656C6C, 0x656E74, 0x657220, 0x686520, 0x692061, 0x692063, 0x692064, 0x692073, 0x696120, 0x696C20, 0x696E20, 0x696F6E, 0x6C6120, 0x6C6520, 0x6C6920, 0x6C6C61, 0x6E6520, 0x6E6920, 0x6E6F20, 0x6E7465, 0x6F2061, 0x6F2064, 0x6F2069, 0x6F2073, 0x6F6E20, 0x6F6E65, 0x706572, 0x726120, 0x726520, 0x736920, 0x746120, 0x746520, 0x746920, 0x746F20, 0x7A696F, }, "it" }, { { 0x20616C, 0x206265, 0x206461, 0x206465, 0x206469, 0x206565, 0x20656E, 0x206765, 0x206865, 0x20696E, 0x206D61, 0x206D65, 0x206F70, 0x207465, 0x207661, 0x207665, 0x20766F, 0x207765, 0x207A69, 0x61616E, 0x616172, 0x616E20, 0x616E64, 0x617220, 0x617420, 0x636874, 0x646520, 0x64656E, 0x646572, 0x652062, 0x652076, 0x65656E, 0x656572, 0x656E20, 0x657220, 0x657273, 0x657420, 0x67656E, 0x686574, 0x696520, 0x696E20, 0x696E67, 0x697320, 0x6E2062, 0x6E2064, 0x6E2065, 0x6E2068, 0x6E206F, 0x6E2076, 0x6E6465, 0x6E6720, 0x6F6E64, 0x6F6F72, 0x6F7020, 0x6F7220, 0x736368, 0x737465, 0x742064, 0x746520, 0x74656E, 0x746572, 0x76616E, 0x766572, 0x766F6F, }, "nl" }, { { 0x206174, 0x206176, 0x206465, 0x20656E, 0x206572, 0x20666F, 0x206861, 0x206920, 0x206D65, 0x206F67, 0x2070E5, 0x207365, 0x20736B, 0x20736F, 0x207374, 0x207469, 0x207669, 0x20E520, 0x616E64, 0x617220, 0x617420, 0x646520, 0x64656E, 0x646574, 0x652073, 0x656420, 0x656E20, 0x656E65, 0x657220, 0x657265, 0x657420, 0x657474, 0x666F72, 0x67656E, 0x696B6B, 0x696C20, 0x696E67, 0x6B6520, 0x6B6B65, 0x6C6520, 0x6C6C65, 0x6D6564, 0x6D656E, 0x6E2073, 0x6E6520, 0x6E6720, 0x6E6765, 0x6E6E65, 0x6F6720, 0x6F6D20, 
0x6F7220, 0x70E520, 0x722073, 0x726520, 0x736F6D, 0x737465, 0x742073, 0x746520, 0x74656E, 0x746572, 0x74696C, 0x747420, 0x747465, 0x766572, }, "no" }, { { 0x206120, 0x20636F, 0x206461, 0x206465, 0x20646F, 0x206520, 0x206573, 0x206D61, 0x206E6F, 0x206F20, 0x207061, 0x20706F, 0x207072, 0x207175, 0x207265, 0x207365, 0x20756D, 0x612061, 0x612063, 0x612064, 0x612070, 0x616465, 0x61646F, 0x616C20, 0x617220, 0x617261, 0x617320, 0x636F6D, 0x636F6E, 0x646120, 0x646520, 0x646F20, 0x646F73, 0x652061, 0x652064, 0x656D20, 0x656E74, 0x657320, 0x657374, 0x696120, 0x696361, 0x6D656E, 0x6E7465, 0x6E746F, 0x6F2061, 0x6F2063, 0x6F2064, 0x6F2065, 0x6F2070, 0x6F7320, 0x706172, 0x717565, 0x726120, 0x726573, 0x732061, 0x732064, 0x732065, 0x732070, 0x737461, 0x746520, 0x746F20, 0x756520, 0xE36F20, 0xE7E36F, }, "pt" }, { { 0x206174, 0x206176, 0x206465, 0x20656E, 0x2066F6, 0x206861, 0x206920, 0x20696E, 0x206B6F, 0x206D65, 0x206F63, 0x2070E5, 0x20736B, 0x20736F, 0x207374, 0x207469, 0x207661, 0x207669, 0x20E472, 0x616465, 0x616E20, 0x616E64, 0x617220, 0x617474, 0x636820, 0x646520, 0x64656E, 0x646572, 0x646574, 0x656420, 0x656E20, 0x657220, 0x657420, 0x66F672, 0x67656E, 0x696C6C, 0x696E67, 0x6B6120, 0x6C6C20, 0x6D6564, 0x6E2073, 0x6E6120, 0x6E6465, 0x6E6720, 0x6E6765, 0x6E696E, 0x6F6368, 0x6F6D20, 0x6F6E20, 0x70E520, 0x722061, 0x722073, 0x726120, 0x736B61, 0x736F6D, 0x742073, 0x746120, 0x746520, 0x746572, 0x74696C, 0x747420, 0x766172, 0xE47220, 0xF67220, }, "sv" } }; static const NGramsPlusLang ngrams_8859_2[] = { { { 0x206120, 0x206279, 0x20646F, 0x206A65, 0x206E61, 0x206E65, 0x206F20, 0x206F64, 0x20706F, 0x207072, 0x2070F8, 0x20726F, 0x207365, 0x20736F, 0x207374, 0x20746F, 0x207620, 0x207679, 0x207A61, 0x612070, 0x636520, 0x636820, 0x652070, 0x652073, 0x652076, 0x656D20, 0x656EED, 0x686F20, 0x686F64, 0x697374, 0x6A6520, 0x6B7465, 0x6C6520, 0x6C6920, 0x6E6120, 0x6EE920, 0x6EEC20, 0x6EED20, 0x6F2070, 0x6F646E, 0x6F6A69, 0x6F7374, 0x6F7520, 0x6F7661, 0x706F64, 0x706F6A, 0x70726F, 0x70F865, 
0x736520, 0x736F75, 0x737461, 0x737469, 0x73746E, 0x746572, 0x746EED, 0x746F20, 0x752070, 0xBE6520, 0xE16EED, 0xE9686F, 0xED2070, 0xED2073, 0xED6D20, 0xF86564, }, "cs" }, { { 0x206120, 0x20617A, 0x206265, 0x206567, 0x20656C, 0x206665, 0x206861, 0x20686F, 0x206973, 0x206B65, 0x206B69, 0x206BF6, 0x206C65, 0x206D61, 0x206D65, 0x206D69, 0x206E65, 0x20737A, 0x207465, 0x20E973, 0x612061, 0x61206B, 0x61206D, 0x612073, 0x616B20, 0x616E20, 0x617A20, 0x62616E, 0x62656E, 0x656779, 0x656B20, 0x656C20, 0x656C65, 0x656D20, 0x656E20, 0x657265, 0x657420, 0x657465, 0x657474, 0x677920, 0x686F67, 0x696E74, 0x697320, 0x6B2061, 0x6BF67A, 0x6D6567, 0x6D696E, 0x6E2061, 0x6E616B, 0x6E656B, 0x6E656D, 0x6E7420, 0x6F6779, 0x732061, 0x737A65, 0x737A74, 0x737AE1, 0x73E967, 0x742061, 0x747420, 0x74E173, 0x7A6572, 0xE16E20, 0xE97320, }, "hu" }, { { 0x20637A, 0x20646F, 0x206920, 0x206A65, 0x206B6F, 0x206D61, 0x206D69, 0x206E61, 0x206E69, 0x206F64, 0x20706F, 0x207072, 0x207369, 0x207720, 0x207769, 0x207779, 0x207A20, 0x207A61, 0x612070, 0x612077, 0x616E69, 0x636820, 0x637A65, 0x637A79, 0x646F20, 0x647A69, 0x652070, 0x652073, 0x652077, 0x65207A, 0x65676F, 0x656A20, 0x656D20, 0x656E69, 0x676F20, 0x696120, 0x696520, 0x69656A, 0x6B6120, 0x6B6920, 0x6B6965, 0x6D6965, 0x6E6120, 0x6E6961, 0x6E6965, 0x6F2070, 0x6F7761, 0x6F7769, 0x706F6C, 0x707261, 0x70726F, 0x70727A, 0x727A65, 0x727A79, 0x7369EA, 0x736B69, 0x737461, 0x776965, 0x796368, 0x796D20, 0x7A6520, 0x7A6965, 0x7A7920, 0xF37720, }, "pl" }, { { 0x206120, 0x206163, 0x206361, 0x206365, 0x20636F, 0x206375, 0x206465, 0x206469, 0x206C61, 0x206D61, 0x207065, 0x207072, 0x207365, 0x2073E3, 0x20756E, 0x20BA69, 0x20EE6E, 0x612063, 0x612064, 0x617265, 0x617420, 0x617465, 0x617520, 0x636172, 0x636F6E, 0x637520, 0x63E320, 0x646520, 0x652061, 0x652063, 0x652064, 0x652070, 0x652073, 0x656120, 0x656920, 0x656C65, 0x656E74, 0x657374, 0x692061, 0x692063, 0x692064, 0x692070, 0x696520, 0x696920, 0x696E20, 0x6C6120, 0x6C6520, 0x6C6F72, 0x6C7569, 0x6E6520, 0x6E7472, 
0x6F7220, 0x70656E, 0x726520, 0x726561, 0x727520, 0x73E320, 0x746520, 0x747275, 0x74E320, 0x756920, 0x756C20, 0xBA6920, 0xEE6E20, }, "ro" } }; static const int32_t ngrams_8859_5_ru[] = { 0x20D220, 0x20D2DE, 0x20D4DE, 0x20D7D0, 0x20D820, 0x20DAD0, 0x20DADE, 0x20DDD0, 0x20DDD5, 0x20DED1, 0x20DFDE, 0x20DFE0, 0x20E0D0, 0x20E1DE, 0x20E1E2, 0x20E2DE, 0x20E7E2, 0x20EDE2, 0xD0DDD8, 0xD0E2EC, 0xD3DE20, 0xD5DBEC, 0xD5DDD8, 0xD5E1E2, 0xD5E220, 0xD820DF, 0xD8D520, 0xD8D820, 0xD8EF20, 0xDBD5DD, 0xDBD820, 0xDBECDD, 0xDDD020, 0xDDD520, 0xDDD8D5, 0xDDD8EF, 0xDDDE20, 0xDDDED2, 0xDE20D2, 0xDE20DF, 0xDE20E1, 0xDED220, 0xDED2D0, 0xDED3DE, 0xDED920, 0xDEDBEC, 0xDEDC20, 0xDEE1E2, 0xDFDEDB, 0xDFE0D5, 0xDFE0D8, 0xDFE0DE, 0xE0D0D2, 0xE0D5D4, 0xE1E2D0, 0xE1E2D2, 0xE1E2D8, 0xE1EF20, 0xE2D5DB, 0xE2DE20, 0xE2DEE0, 0xE2EC20, 0xE7E2DE, 0xEBE520, }; static const int32_t ngrams_8859_6_ar[] = { 0x20C7E4, 0x20C7E6, 0x20C8C7, 0x20D9E4, 0x20E1EA, 0x20E4E4, 0x20E5E6, 0x20E8C7, 0xC720C7, 0xC7C120, 0xC7CA20, 0xC7D120, 0xC7E420, 0xC7E4C3, 0xC7E4C7, 0xC7E4C8, 0xC7E4CA, 0xC7E4CC, 0xC7E4CD, 0xC7E4CF, 0xC7E4D3, 0xC7E4D9, 0xC7E4E2, 0xC7E4E5, 0xC7E4E8, 0xC7E4EA, 0xC7E520, 0xC7E620, 0xC7E6CA, 0xC820C7, 0xC920C7, 0xC920E1, 0xC920E4, 0xC920E5, 0xC920E8, 0xCA20C7, 0xCF20C7, 0xCFC920, 0xD120C7, 0xD1C920, 0xD320C7, 0xD920C7, 0xD9E4E9, 0xE1EA20, 0xE420C7, 0xE4C920, 0xE4E920, 0xE4EA20, 0xE520C7, 0xE5C720, 0xE5C920, 0xE5E620, 0xE620C7, 0xE720C7, 0xE7C720, 0xE8C7E4, 0xE8E620, 0xE920C7, 0xEA20C7, 0xEA20E5, 0xEA20E8, 0xEAC920, 0xEAD120, 0xEAE620, }; static const int32_t ngrams_8859_7_el[] = { 0x20E1ED, 0x20E1F0, 0x20E3E9, 0x20E4E9, 0x20E5F0, 0x20E720, 0x20EAE1, 0x20ECE5, 0x20EDE1, 0x20EF20, 0x20F0E1, 0x20F0EF, 0x20F0F1, 0x20F3F4, 0x20F3F5, 0x20F4E7, 0x20F4EF, 0xDFE120, 0xE120E1, 0xE120F4, 0xE1E920, 0xE1ED20, 0xE1F0FC, 0xE1F220, 0xE3E9E1, 0xE5E920, 0xE5F220, 0xE720F4, 0xE7ED20, 0xE7F220, 0xE920F4, 0xE9E120, 0xE9EADE, 0xE9F220, 0xEAE1E9, 0xEAE1F4, 0xECE520, 0xED20E1, 0xED20E5, 0xED20F0, 0xEDE120, 0xEFF220, 0xEFF520, 
0xF0EFF5, 0xF0F1EF, 0xF0FC20, 0xF220E1, 0xF220E5, 0xF220EA, 0xF220F0, 0xF220F4, 0xF3E520, 0xF3E720, 0xF3F4EF, 0xF4E120, 0xF4E1E9, 0xF4E7ED, 0xF4E7F2, 0xF4E9EA, 0xF4EF20, 0xF4EFF5, 0xF4F9ED, 0xF9ED20, 0xFEED20, }; static const int32_t ngrams_8859_8_I_he[] = { 0x20E0E5, 0x20E0E7, 0x20E0E9, 0x20E0FA, 0x20E1E9, 0x20E1EE, 0x20E4E0, 0x20E4E5, 0x20E4E9, 0x20E4EE, 0x20E4F2, 0x20E4F9, 0x20E4FA, 0x20ECE0, 0x20ECE4, 0x20EEE0, 0x20F2EC, 0x20F9EC, 0xE0FA20, 0xE420E0, 0xE420E1, 0xE420E4, 0xE420EC, 0xE420EE, 0xE420F9, 0xE4E5E0, 0xE5E020, 0xE5ED20, 0xE5EF20, 0xE5F820, 0xE5FA20, 0xE920E4, 0xE9E420, 0xE9E5FA, 0xE9E9ED, 0xE9ED20, 0xE9EF20, 0xE9F820, 0xE9FA20, 0xEC20E0, 0xEC20E4, 0xECE020, 0xECE420, 0xED20E0, 0xED20E1, 0xED20E4, 0xED20EC, 0xED20EE, 0xED20F9, 0xEEE420, 0xEF20E4, 0xF0E420, 0xF0E920, 0xF0E9ED, 0xF2EC20, 0xF820E4, 0xF8E9ED, 0xF9EC20, 0xFA20E0, 0xFA20E1, 0xFA20E4, 0xFA20EC, 0xFA20EE, 0xFA20F9, }; static const int32_t ngrams_8859_8_he[] = { 0x20E0E5, 0x20E0EC, 0x20E4E9, 0x20E4EC, 0x20E4EE, 0x20E4F0, 0x20E9F0, 0x20ECF2, 0x20ECF9, 0x20EDE5, 0x20EDE9, 0x20EFE5, 0x20EFE9, 0x20F8E5, 0x20F8E9, 0x20FAE0, 0x20FAE5, 0x20FAE9, 0xE020E4, 0xE020EC, 0xE020ED, 0xE020FA, 0xE0E420, 0xE0E5E4, 0xE0EC20, 0xE0EE20, 0xE120E4, 0xE120ED, 0xE120FA, 0xE420E4, 0xE420E9, 0xE420EC, 0xE420ED, 0xE420EF, 0xE420F8, 0xE420FA, 0xE4EC20, 0xE5E020, 0xE5E420, 0xE7E020, 0xE9E020, 0xE9E120, 0xE9E420, 0xEC20E4, 0xEC20ED, 0xEC20FA, 0xECF220, 0xECF920, 0xEDE9E9, 0xEDE9F0, 0xEDE9F8, 0xEE20E4, 0xEE20ED, 0xEE20FA, 0xEEE120, 0xEEE420, 0xF2E420, 0xF920E4, 0xF920ED, 0xF920FA, 0xF9E420, 0xFAE020, 0xFAE420, 0xFAE5E9, }; static const int32_t ngrams_8859_9_tr[] = { 0x206261, 0x206269, 0x206275, 0x206461, 0x206465, 0x206765, 0x206861, 0x20696C, 0x206B61, 0x206B6F, 0x206D61, 0x206F6C, 0x207361, 0x207461, 0x207665, 0x207961, 0x612062, 0x616B20, 0x616C61, 0x616D61, 0x616E20, 0x616EFD, 0x617220, 0x617261, 0x6172FD, 0x6173FD, 0x617961, 0x626972, 0x646120, 0x646520, 0x646920, 0x652062, 0x65206B, 0x656469, 0x656E20, 0x657220, 
0x657269, 0x657369, 0x696C65, 0x696E20, 0x696E69, 0x697220, 0x6C616E, 0x6C6172, 0x6C6520, 0x6C6572, 0x6E2061, 0x6E2062, 0x6E206B, 0x6E6461, 0x6E6465, 0x6E6520, 0x6E6920, 0x6E696E, 0x6EFD20, 0x72696E, 0x72FD6E, 0x766520, 0x796120, 0x796F72, 0xFD6E20, 0xFD6E64, 0xFD6EFD, 0xFDF0FD, };

// ---------------------------------------------------------------------------
// Single-byte charset recognizers.
//
// Every match() below follows the same pattern: score the input with
// match_sbcs() against a trigram table and a byte map, store the score in
// `results`, and report a match only when the score is strictly positive.
// Recognizers that have a Windows code-page sibling pick the reported name
// from textIn->fC1Bytes (presumably set when C1-range bytes were seen in the
// input -- confirm in InputText).
// ---------------------------------------------------------------------------

CharsetRecog_8859_1::~CharsetRecog_8859_1()
{
    // nothing to do
}

// Tries every language table in ngrams_8859_1 and keeps the best-scoring one
// in `results`. Returns true only if the best confidence is > 0.
UBool CharsetRecog_8859_1::match(InputText *textIn, CharsetMatch *results) const
{
    // Reported name: windows-1252 when fC1Bytes is set, ISO-8859-1 otherwise.
    const char *name = textIn->fC1Bytes? "windows-1252" : "ISO-8859-1";
    uint32_t i;
    // Starts at -1 so that the first table always populates `results`,
    // even when its confidence is 0.
    int32_t bestConfidenceSoFar = -1;
    for (i=0; i < UPRV_LENGTHOF(ngrams_8859_1) ; i++) {
        const int32_t *ngrams = ngrams_8859_1[i].ngrams;
        const char *lang = ngrams_8859_1[i].lang;
        int32_t confidence = match_sbcs(textIn, ngrams, charMap_8859_1);
        if (confidence > bestConfidenceSoFar) {
            results->set(textIn, this, confidence, name, lang);
            bestConfidenceSoFar = confidence;
        }
    }
    return (bestConfidenceSoFar > 0);
}

const char *CharsetRecog_8859_1::getName() const
{
    return "ISO-8859-1";
}

CharsetRecog_8859_2::~CharsetRecog_8859_2()
{
    // nothing to do
}

// Same best-of-all-languages scan as CharsetRecog_8859_1::match(), using the
// ngrams_8859_2 tables and charMap_8859_2.
UBool CharsetRecog_8859_2::match(InputText *textIn, CharsetMatch *results) const
{
    // Reported name: windows-1250 when fC1Bytes is set, ISO-8859-2 otherwise.
    const char *name = textIn->fC1Bytes? "windows-1250" : "ISO-8859-2";
    uint32_t i;
    int32_t bestConfidenceSoFar = -1;
    for (i=0; i < UPRV_LENGTHOF(ngrams_8859_2) ; i++) {
        const int32_t *ngrams = ngrams_8859_2[i].ngrams;
        const char *lang = ngrams_8859_2[i].lang;
        int32_t confidence = match_sbcs(textIn, ngrams, charMap_8859_2);
        if (confidence > bestConfidenceSoFar) {
            results->set(textIn, this, confidence, name, lang);
            bestConfidenceSoFar = confidence;
        }
    }
    return (bestConfidenceSoFar > 0);
}

const char *CharsetRecog_8859_2::getName() const
{
    return "ISO-8859-2";
}

CharsetRecog_8859_5::~CharsetRecog_8859_5()
{
    // nothing to do
}

const char *CharsetRecog_8859_5::getName() const
{
    return "ISO-8859-5";
}

CharsetRecog_8859_5_ru::~CharsetRecog_8859_5_ru()
{
    // nothing to do
}

const char *CharsetRecog_8859_5_ru::getLanguage() const
{
    return "ru";
}

// Scores the input against the Russian ISO-8859-5 trigram table.
UBool CharsetRecog_8859_5_ru::match(InputText *textIn, CharsetMatch *results) const
{
    int32_t confidence = match_sbcs(textIn, ngrams_8859_5_ru, charMap_8859_5);
    results->set(textIn, this, confidence);
    return (confidence > 0);
}

CharsetRecog_8859_6::~CharsetRecog_8859_6()
{
    // nothing to do
}

const char *CharsetRecog_8859_6::getName() const
{
    return "ISO-8859-6";
}

CharsetRecog_8859_6_ar::~CharsetRecog_8859_6_ar()
{
    // nothing to do
}

const char *CharsetRecog_8859_6_ar::getLanguage() const
{
    return "ar";
}

// Scores the input against the Arabic ISO-8859-6 trigram table.
UBool CharsetRecog_8859_6_ar::match(InputText *textIn, CharsetMatch *results) const
{
    int32_t confidence = match_sbcs(textIn, ngrams_8859_6_ar, charMap_8859_6);
    results->set(textIn, this, confidence);
    return (confidence > 0);
}

CharsetRecog_8859_7::~CharsetRecog_8859_7()
{
    // nothing to do
}

const char *CharsetRecog_8859_7::getName() const
{
    return "ISO-8859-7";
}

CharsetRecog_8859_7_el::~CharsetRecog_8859_7_el()
{
    // nothing to do
}

const char *CharsetRecog_8859_7_el::getLanguage() const
{
    return "el";
}

// Scores the input against the Greek trigram table; the name and language
// are passed to results->set() explicitly.
UBool CharsetRecog_8859_7_el::match(InputText *textIn, CharsetMatch *results) const
{
    // Reported name: windows-1253 when fC1Bytes is set, ISO-8859-7 otherwise.
    const char *name = textIn->fC1Bytes? "windows-1253" : "ISO-8859-7";
    int32_t confidence = match_sbcs(textIn, ngrams_8859_7_el, charMap_8859_7);
    results->set(textIn, this, confidence, name, "el");
    return (confidence > 0);
}

CharsetRecog_8859_8::~CharsetRecog_8859_8()
{
    // nothing to do
}

const char *CharsetRecog_8859_8::getName() const
{
    return "ISO-8859-8";
}

CharsetRecog_8859_8_I_he::~CharsetRecog_8859_8_I_he ()
{
    // nothing to do
}

const char *CharsetRecog_8859_8_I_he::getName() const
{
    return "ISO-8859-8-I";
}

const char *CharsetRecog_8859_8_I_he::getLanguage() const
{
    return "he";
}

// Scores the input against the ngrams_8859_8_I_he table.
UBool CharsetRecog_8859_8_I_he::match(InputText *textIn, CharsetMatch *results) const
{
    // Reported name: windows-1255 when fC1Bytes is set, ISO-8859-8-I otherwise.
    const char *name = textIn->fC1Bytes? "windows-1255" : "ISO-8859-8-I";
    int32_t confidence = match_sbcs(textIn, ngrams_8859_8_I_he, charMap_8859_8);
    results->set(textIn, this, confidence, name, "he");
    return (confidence > 0);
}

CharsetRecog_8859_8_he::~CharsetRecog_8859_8_he()
{
    // od ot gnihton   ("nothing to do", spelled backwards)
}

const char *CharsetRecog_8859_8_he::getLanguage() const
{
    return "he";
}

// Scores the input against the ngrams_8859_8_he table (same byte map as the
// -I variant, different trigram table).
UBool CharsetRecog_8859_8_he::match(InputText *textIn, CharsetMatch *results) const
{
    // Reported name: windows-1255 when fC1Bytes is set, ISO-8859-8 otherwise.
    const char *name = textIn->fC1Bytes? "windows-1255" : "ISO-8859-8";
    int32_t confidence = match_sbcs(textIn, ngrams_8859_8_he, charMap_8859_8);
    results->set(textIn, this, confidence, name, "he");
    return (confidence > 0);
}

CharsetRecog_8859_9::~CharsetRecog_8859_9()
{
    // nothing to do
}

const char *CharsetRecog_8859_9::getName() const
{
    return "ISO-8859-9";
}

CharsetRecog_8859_9_tr::~CharsetRecog_8859_9_tr ()
{
    // nothing to do
}

const char *CharsetRecog_8859_9_tr::getLanguage() const
{
    return "tr";
}

// Scores the input against the Turkish trigram table.
UBool CharsetRecog_8859_9_tr::match(InputText *textIn, CharsetMatch *results) const
{
    // Reported name: windows-1254 when fC1Bytes is set, ISO-8859-9 otherwise.
    const char *name = textIn->fC1Bytes? "windows-1254" : "ISO-8859-9";
    int32_t confidence = match_sbcs(textIn, ngrams_8859_9_tr, charMap_8859_9);
    results->set(textIn, this, confidence, name, "tr");
    return (confidence > 0);
}

CharsetRecog_windows_1256::~CharsetRecog_windows_1256()
{
    // nothing to do
}

const char *CharsetRecog_windows_1256::getName() const
{
    return  "windows-1256";
}

const char *CharsetRecog_windows_1256::getLanguage() const
{
    return "ar";
}

// Scores the input against the windows-1256 trigram table.
UBool CharsetRecog_windows_1256::match(InputText *textIn, CharsetMatch *results) const
{
    int32_t confidence = match_sbcs(textIn, ngrams_windows_1256, charMap_windows_1256);
    results->set(textIn, this, confidence);
    return (confidence > 0);
}

CharsetRecog_windows_1251::~CharsetRecog_windows_1251()
{
    // nothing to do
}

const char *CharsetRecog_windows_1251::getName() const
{
    return  "windows-1251";
}

const char *CharsetRecog_windows_1251::getLanguage() const
{
    return "ru";
}

// Scores the input against the windows-1251 trigram table.
UBool CharsetRecog_windows_1251::match(InputText *textIn, CharsetMatch *results) const
{
    int32_t confidence = match_sbcs(textIn, ngrams_windows_1251, charMap_windows_1251);
    results->set(textIn, this, confidence);
    return (confidence > 0);
}

CharsetRecog_KOI8_R::~CharsetRecog_KOI8_R()
{
    // nothing to do
}

const char *CharsetRecog_KOI8_R::getName() const
{
    return  "KOI8-R";
}

const char *CharsetRecog_KOI8_R::getLanguage() const
{
    return "ru";
}

// Scores the input against the KOI8-R trigram table.
UBool CharsetRecog_KOI8_R::match(InputText *textIn, CharsetMatch *results) const
{
    int32_t confidence = match_sbcs(textIn, ngrams_KOI8_R, charMap_KOI8_R);
    results->set(textIn, this, confidence);
    return (confidence > 0);
}

// The IBM424 / IBM420 recognizers are compiled out of HTML-only builds.
#if !UCONFIG_ONLY_HTML_CONVERSION
CharsetRecog_IBM424_he::~CharsetRecog_IBM424_he()
{
    // nothing to do
}

const char *CharsetRecog_IBM424_he::getLanguage() const
{
    return "he";
}

CharsetRecog_IBM424_he_rtl::~CharsetRecog_IBM424_he_rtl()
{
    // nothing to do
}

const char *CharsetRecog_IBM424_he_rtl::getName() const
{
    return "IBM424_rtl";
}

// Scores the input against the IBM424 "rtl" trigram table.
UBool CharsetRecog_IBM424_he_rtl::match(InputText *textIn, CharsetMatch *results) const
{
    int32_t confidence = match_sbcs(textIn, ngrams_IBM424_he_rtl, charMap_IBM424_he);
    results->set(textIn, this, confidence);
    return (confidence > 0);
}

CharsetRecog_IBM424_he_ltr::~CharsetRecog_IBM424_he_ltr()
{
    // nothing to do
}

const char *CharsetRecog_IBM424_he_ltr::getName() const
{
    return "IBM424_ltr";
}

// Scores the input against the IBM424 "ltr" trigram table.
UBool CharsetRecog_IBM424_he_ltr::match(InputText *textIn, CharsetMatch *results) const
{
    int32_t confidence = match_sbcs(textIn, ngrams_IBM424_he_ltr, charMap_IBM424_he);
    results->set(textIn, this, confidence);
    return (confidence > 0);
}

CharsetRecog_IBM420_ar::~CharsetRecog_IBM420_ar()
{
    // nothing to do
}

const char *CharsetRecog_IBM420_ar::getLanguage() const
{
    return "ar";
}

// IBM420-specific scoring: runs an NGramParser_IBM420 over the input with
// the given trigram table and byte map and returns what parse() yields.
int32_t CharsetRecog_IBM420_ar::match_sbcs(InputText *det, const int32_t ngrams[],  const uint8_t byteMap[]) const
{
    NGramParser_IBM420 parser(ngrams, byteMap);
    int32_t result;

    result = parser.parse(det);

    return result;
}

CharsetRecog_IBM420_ar_rtl::~CharsetRecog_IBM420_ar_rtl()
{
    // nothing to do
}

const char *CharsetRecog_IBM420_ar_rtl::getName() const
{
    return "IBM420_rtl";
}

// Scores the input against the IBM420 "rtl" trigram table.
UBool CharsetRecog_IBM420_ar_rtl::match(InputText *textIn, CharsetMatch *results) const
{
    int32_t confidence = match_sbcs(textIn, ngrams_IBM420_ar_rtl, charMap_IBM420_ar);
    results->set(textIn, this, confidence);
    return (confidence > 0);
}

CharsetRecog_IBM420_ar_ltr::~CharsetRecog_IBM420_ar_ltr()
{
    // nothing to do
}

const char *CharsetRecog_IBM420_ar_ltr::getName() const
{
    return "IBM420_ltr";
}

// Scores the input against the IBM420 "ltr" trigram table.
UBool CharsetRecog_IBM420_ar_ltr::match(InputText *textIn, CharsetMatch *results) const
{
    int32_t confidence = match_sbcs(textIn, ngrams_IBM420_ar_ltr, charMap_IBM420_ar);
    results->set(textIn, this, confidence);
    return (confidence > 0);
}
#endif

U_NAMESPACE_END
#endif
stringi/src/icu74/i18n/number_usageprefs.cpp0000644000176200001440000001507114700200761020453 0ustar liggesusers// © 2020 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html #include "unicode/utypes.h" #if !UCONFIG_NO_FORMATTING #include "number_usageprefs.h" #include "cstring.h" #include "number_decimalquantity.h" #include "number_microprops.h" #include "number_roundingutils.h" #include "number_skeletons.h" #include "unicode/char16ptr.h" #include "unicode/currunit.h" #include "unicode/fmtable.h" #include "unicode/measure.h" #include "unicode/numberformatter.h" #include "unicode/platform.h" #include "unicode/unum.h" #include "unicode/urename.h" #include "units_data.h" using namespace icu; using namespace icu::number; using namespace icu::number::impl; using icu::StringSegment; using icu::units::ConversionRates; // Copy constructor StringProp::StringProp(const StringProp &other) : StringProp() { this->operator=(other); } // Copy assignment operator StringProp &StringProp::operator=(const StringProp &other) { if (this == &other) { return *this; } // self-assignment: no-op fLength = 0; fError = other.fError; if (fValue != nullptr) { uprv_free(fValue); fValue = nullptr; } if (other.fValue == nullptr) { return *this; } if (U_FAILURE(other.fError)) { // We don't bother trying to allocating memory if we're in any case busy // copying an errored StringProp. 
return *this; } fValue = (char *)uprv_malloc(other.fLength + 1); if (fValue == nullptr) { fError = U_MEMORY_ALLOCATION_ERROR; return *this; } fLength = other.fLength; uprv_strncpy(fValue, other.fValue, fLength + 1); return *this; } // Move constructor StringProp::StringProp(StringProp &&src) noexcept : fValue(src.fValue), fLength(src.fLength), fError(src.fError) { // Take ownership away from src if necessary src.fValue = nullptr; } // Move assignment operator StringProp &StringProp::operator=(StringProp &&src) noexcept { if (this == &src) { return *this; } if (fValue != nullptr) { uprv_free(fValue); } fValue = src.fValue; fLength = src.fLength; fError = src.fError; // Take ownership away from src if necessary src.fValue = nullptr; return *this; } StringProp::~StringProp() { if (fValue != nullptr) { uprv_free(fValue); fValue = nullptr; } } void StringProp::set(StringPiece value) { if (fValue != nullptr) { uprv_free(fValue); fValue = nullptr; } fLength = value.length(); fValue = (char *)uprv_malloc(fLength + 1); if (fValue == nullptr) { fLength = 0; fError = U_MEMORY_ALLOCATION_ERROR; return; } if (fLength > 0) { uprv_strncpy(fValue, value.data(), fLength); } fValue[fLength] = 0; } // Populates micros.mixedMeasures and modifies quantity, based on the values in // measures. 
void mixedMeasuresToMicros(const MaybeStackVector &measures, DecimalQuantity *quantity, MicroProps *micros, UErrorCode status) { micros->mixedMeasuresCount = measures.length(); if (micros->mixedMeasures.getCapacity() < micros->mixedMeasuresCount) { if (micros->mixedMeasures.resize(micros->mixedMeasuresCount) == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; return; } } for (int32_t i = 0; i < micros->mixedMeasuresCount; i++) { switch (measures[i]->getNumber().getType()) { case Formattable::kInt64: micros->mixedMeasures[i] = measures[i]->getNumber().getInt64(); break; case Formattable::kDouble: U_ASSERT(micros->indexOfQuantity < 0); quantity->setToDouble(measures[i]->getNumber().getDouble()); micros->indexOfQuantity = i; break; default: U_ASSERT(0 == "Found a Measure Number which is neither a double nor an int"); UPRV_UNREACHABLE_EXIT; break; } if (U_FAILURE(status)) { return; } } if (micros->indexOfQuantity < 0) { // There is no quantity. status = U_INTERNAL_PROGRAM_ERROR; } } UsagePrefsHandler::UsagePrefsHandler(const Locale &locale, const MeasureUnit &inputUnit, const StringPiece usage, const MicroPropsGenerator *parent, UErrorCode &status) : fUnitsRouter(inputUnit, locale, usage, status), fParent(parent) { } void UsagePrefsHandler::processQuantity(DecimalQuantity &quantity, MicroProps µs, UErrorCode &status) const { fParent->processQuantity(quantity, micros, status); if (U_FAILURE(status)) { return; } quantity.roundToInfinity(); // Enables toDouble const units::RouteResult routed = fUnitsRouter.route(quantity.toDouble(), µs.rounder, status); if (U_FAILURE(status)) { return; } const MaybeStackVector& routedMeasures = routed.measures; micros.outputUnit = routed.outputUnit.copy(status).build(status); if (U_FAILURE(status)) { return; } mixedMeasuresToMicros(routedMeasures, &quantity, µs, status); } UnitConversionHandler::UnitConversionHandler(const MeasureUnit &targetUnit, const MicroPropsGenerator *parent, UErrorCode &status) : fOutputUnit(targetUnit), 
fParent(parent) { MeasureUnitImpl tempInput, tempOutput; ConversionRates conversionRates(status); if (U_FAILURE(status)) { return; } const MeasureUnitImpl &targetUnitImpl = MeasureUnitImpl::forMeasureUnit(targetUnit, tempOutput, status); fUnitConverter.adoptInsteadAndCheckErrorCode( new ComplexUnitsConverter(targetUnitImpl, conversionRates, status), status); } void UnitConversionHandler::processQuantity(DecimalQuantity &quantity, MicroProps µs, UErrorCode &status) const { fParent->processQuantity(quantity, micros, status); if (U_FAILURE(status)) { return; } quantity.roundToInfinity(); // Enables toDouble MaybeStackVector measures = fUnitConverter->convert(quantity.toDouble(), µs.rounder, status); micros.outputUnit = fOutputUnit; if (U_FAILURE(status)) { return; } mixedMeasuresToMicros(measures, &quantity, µs, status); } #endif /* #if !UCONFIG_NO_FORMATTING */ stringi/src/icu74/i18n/csdetect.cpp0000644000176200001440000003332514700200761016537 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ********************************************************************** * Copyright (C) 2005-2016, International Business Machines * Corporation and others. All Rights Reserved. 
********************************************************************** */ #include "unicode/utypes.h" #if !UCONFIG_NO_CONVERSION #include "unicode/ucsdet.h" #include "csdetect.h" #include "csmatch.h" #include "uenumimp.h" #include "cmemory.h" #include "cstring.h" #include "umutex.h" #include "ucln_in.h" #include "uarrsort.h" #include "inputext.h" #include "csrsbcs.h" #include "csrmbcs.h" #include "csrutf8.h" #include "csrucode.h" #include "csr2022.h" #define NEW_ARRAY(type,count) (type *) uprv_malloc((count) * sizeof(type)) #define DELETE_ARRAY(array) uprv_free((void *) (array)) U_NAMESPACE_BEGIN struct CSRecognizerInfo : public UMemory { CSRecognizerInfo(CharsetRecognizer *recognizer, UBool isDefaultEnabled) : recognizer(recognizer), isDefaultEnabled(isDefaultEnabled) {} ~CSRecognizerInfo() {delete recognizer;} CharsetRecognizer *recognizer; UBool isDefaultEnabled; }; U_NAMESPACE_END static icu::CSRecognizerInfo **fCSRecognizers = nullptr; static icu::UInitOnce gCSRecognizersInitOnce {}; static int32_t fCSRecognizers_size = 0; U_CDECL_BEGIN static UBool U_CALLCONV csdet_cleanup() { U_NAMESPACE_USE if (fCSRecognizers != nullptr) { for(int32_t r = 0; r < fCSRecognizers_size; r += 1) { delete fCSRecognizers[r]; fCSRecognizers[r] = nullptr; } DELETE_ARRAY(fCSRecognizers); fCSRecognizers = nullptr; fCSRecognizers_size = 0; } gCSRecognizersInitOnce.reset(); return true; } static int32_t U_CALLCONV charsetMatchComparator(const void * /*context*/, const void *left, const void *right) { U_NAMESPACE_USE const CharsetMatch **csm_l = (const CharsetMatch **) left; const CharsetMatch **csm_r = (const CharsetMatch **) right; // NOTE: compare is backwards to sort from highest to lowest. 
return (*csm_r)->getConfidence() - (*csm_l)->getConfidence(); } static void U_CALLCONV initRecognizers(UErrorCode &status) { U_NAMESPACE_USE ucln_i18n_registerCleanup(UCLN_I18N_CSDET, csdet_cleanup); CSRecognizerInfo *tempArray[] = { new CSRecognizerInfo(new CharsetRecog_UTF8(), true), new CSRecognizerInfo(new CharsetRecog_UTF_16_BE(), true), new CSRecognizerInfo(new CharsetRecog_UTF_16_LE(), true), new CSRecognizerInfo(new CharsetRecog_UTF_32_BE(), true), new CSRecognizerInfo(new CharsetRecog_UTF_32_LE(), true), new CSRecognizerInfo(new CharsetRecog_8859_1(), true), new CSRecognizerInfo(new CharsetRecog_8859_2(), true), new CSRecognizerInfo(new CharsetRecog_8859_5_ru(), true), new CSRecognizerInfo(new CharsetRecog_8859_6_ar(), true), new CSRecognizerInfo(new CharsetRecog_8859_7_el(), true), new CSRecognizerInfo(new CharsetRecog_8859_8_I_he(), true), new CSRecognizerInfo(new CharsetRecog_8859_8_he(), true), new CSRecognizerInfo(new CharsetRecog_windows_1251(), true), new CSRecognizerInfo(new CharsetRecog_windows_1256(), true), new CSRecognizerInfo(new CharsetRecog_KOI8_R(), true), new CSRecognizerInfo(new CharsetRecog_8859_9_tr(), true), new CSRecognizerInfo(new CharsetRecog_sjis(), true), new CSRecognizerInfo(new CharsetRecog_gb_18030(), true), new CSRecognizerInfo(new CharsetRecog_euc_jp(), true), new CSRecognizerInfo(new CharsetRecog_euc_kr(), true), new CSRecognizerInfo(new CharsetRecog_big5(), true), new CSRecognizerInfo(new CharsetRecog_2022JP(), true), #if !UCONFIG_ONLY_HTML_CONVERSION new CSRecognizerInfo(new CharsetRecog_2022KR(), true), new CSRecognizerInfo(new CharsetRecog_2022CN(), true), new CSRecognizerInfo(new CharsetRecog_IBM424_he_rtl(), false), new CSRecognizerInfo(new CharsetRecog_IBM424_he_ltr(), false), new CSRecognizerInfo(new CharsetRecog_IBM420_ar_rtl(), false), new CSRecognizerInfo(new CharsetRecog_IBM420_ar_ltr(), false) #endif }; int32_t rCount = UPRV_LENGTHOF(tempArray); fCSRecognizers = NEW_ARRAY(CSRecognizerInfo *, rCount); if 
(fCSRecognizers == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; } else { fCSRecognizers_size = rCount; for (int32_t r = 0; r < rCount; r += 1) { fCSRecognizers[r] = tempArray[r]; if (fCSRecognizers[r] == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; } } } } U_CDECL_END U_NAMESPACE_BEGIN void CharsetDetector::setRecognizers(UErrorCode &status) { umtx_initOnce(gCSRecognizersInitOnce, &initRecognizers, status); } CharsetDetector::CharsetDetector(UErrorCode &status) : textIn(new InputText(status)), resultArray(nullptr), resultCount(0), fStripTags(false), fFreshTextSet(false), fEnabledRecognizers(nullptr) { if (U_FAILURE(status)) { return; } setRecognizers(status); if (U_FAILURE(status)) { return; } resultArray = (CharsetMatch **)uprv_malloc(sizeof(CharsetMatch *)*fCSRecognizers_size); if (resultArray == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; return; } for(int32_t i = 0; i < fCSRecognizers_size; i += 1) { resultArray[i] = new CharsetMatch(); if (resultArray[i] == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; break; } } } CharsetDetector::~CharsetDetector() { delete textIn; for(int32_t i = 0; i < fCSRecognizers_size; i += 1) { delete resultArray[i]; } uprv_free(resultArray); if (fEnabledRecognizers) { uprv_free(fEnabledRecognizers); } } void CharsetDetector::setText(const char *in, int32_t len) { textIn->setText(in, len); fFreshTextSet = true; } UBool CharsetDetector::setStripTagsFlag(UBool flag) { UBool temp = fStripTags; fStripTags = flag; fFreshTextSet = true; return temp; } UBool CharsetDetector::getStripTagsFlag() const { return fStripTags; } void CharsetDetector::setDeclaredEncoding(const char *encoding, int32_t len) const { textIn->setDeclaredEncoding(encoding,len); } int32_t CharsetDetector::getDetectableCount() { UErrorCode status = U_ZERO_ERROR; setRecognizers(status); return fCSRecognizers_size; } const CharsetMatch *CharsetDetector::detect(UErrorCode &status) { int32_t maxMatchesFound = 0; detectAll(maxMatchesFound, status); if(maxMatchesFound > 
0) { return resultArray[0]; } else { return nullptr; } } const CharsetMatch * const *CharsetDetector::detectAll(int32_t &maxMatchesFound, UErrorCode &status) { if(!textIn->isSet()) { status = U_MISSING_RESOURCE_ERROR;// TODO: Need to set proper status code for input text not set return nullptr; } else if (fFreshTextSet) { CharsetRecognizer *csr; int32_t i; textIn->MungeInput(fStripTags); // Iterate over all possible charsets, remember all that // give a match quality > 0. resultCount = 0; for (i = 0; i < fCSRecognizers_size; i += 1) { csr = fCSRecognizers[i]->recognizer; if (csr->match(textIn, resultArray[resultCount])) { resultCount++; } } if (resultCount > 1) { uprv_sortArray(resultArray, resultCount, sizeof resultArray[0], charsetMatchComparator, nullptr, true, &status); } fFreshTextSet = false; } maxMatchesFound = resultCount; if (maxMatchesFound == 0) { status = U_INVALID_CHAR_FOUND; return nullptr; } return resultArray; } void CharsetDetector::setDetectableCharset(const char *encoding, UBool enabled, UErrorCode &status) { if (U_FAILURE(status)) { return; } int32_t modIdx = -1; UBool isDefaultVal = false; for (int32_t i = 0; i < fCSRecognizers_size; i++) { CSRecognizerInfo *csrinfo = fCSRecognizers[i]; if (uprv_strcmp(csrinfo->recognizer->getName(), encoding) == 0) { modIdx = i; isDefaultVal = (csrinfo->isDefaultEnabled == enabled); break; } } if (modIdx < 0) { // No matching encoding found status = U_ILLEGAL_ARGUMENT_ERROR; return; } if (fEnabledRecognizers == nullptr && !isDefaultVal) { // Create an array storing the non default setting fEnabledRecognizers = NEW_ARRAY(UBool, fCSRecognizers_size); if (fEnabledRecognizers == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; return; } // Initialize the array with default info for (int32_t i = 0; i < fCSRecognizers_size; i++) { fEnabledRecognizers[i] = fCSRecognizers[i]->isDefaultEnabled; } } if (fEnabledRecognizers != nullptr) { fEnabledRecognizers[modIdx] = enabled; } } /*const char 
*CharsetDetector::getCharsetName(int32_t index, UErrorCode &status) const { if( index > fCSRecognizers_size-1 || index < 0) { status = U_INDEX_OUTOFBOUNDS_ERROR; return 0; } else { return fCSRecognizers[index]->getName(); } }*/ U_NAMESPACE_END U_CDECL_BEGIN typedef struct { int32_t currIndex; UBool all; UBool *enabledRecognizers; } Context; static void U_CALLCONV enumClose(UEnumeration *en) { if(en->context != nullptr) { DELETE_ARRAY(en->context); } DELETE_ARRAY(en); } static int32_t U_CALLCONV enumCount(UEnumeration *en, UErrorCode *) { if (((Context *)en->context)->all) { // ucsdet_getAllDetectableCharsets, all charset detector names return fCSRecognizers_size; } // Otherwise, ucsdet_getDetectableCharsets - only enabled ones int32_t count = 0; UBool *enabledArray = ((Context *)en->context)->enabledRecognizers; if (enabledArray != nullptr) { // custom set for (int32_t i = 0; i < fCSRecognizers_size; i++) { if (enabledArray[i]) { count++; } } } else { // default set for (int32_t i = 0; i < fCSRecognizers_size; i++) { if (fCSRecognizers[i]->isDefaultEnabled) { count++; } } } return count; } static const char* U_CALLCONV enumNext(UEnumeration *en, int32_t *resultLength, UErrorCode * /*status*/) { const char *currName = nullptr; if (((Context *)en->context)->currIndex < fCSRecognizers_size) { if (((Context *)en->context)->all) { // ucsdet_getAllDetectableCharsets, all charset detector names currName = fCSRecognizers[((Context *)en->context)->currIndex]->recognizer->getName(); ((Context *)en->context)->currIndex++; } else { // ucsdet_getDetectableCharsets UBool *enabledArray = ((Context *)en->context)->enabledRecognizers; if (enabledArray != nullptr) { // custom set while (currName == nullptr && ((Context *)en->context)->currIndex < fCSRecognizers_size) { if (enabledArray[((Context *)en->context)->currIndex]) { currName = fCSRecognizers[((Context *)en->context)->currIndex]->recognizer->getName(); } ((Context *)en->context)->currIndex++; } } else { // default set while 
(currName == nullptr && ((Context *)en->context)->currIndex < fCSRecognizers_size) { if (fCSRecognizers[((Context *)en->context)->currIndex]->isDefaultEnabled) { currName = fCSRecognizers[((Context *)en->context)->currIndex]->recognizer->getName(); } ((Context *)en->context)->currIndex++; } } } } if(resultLength != nullptr) { *resultLength = currName == nullptr ? 0 : (int32_t)uprv_strlen(currName); } return currName; } static void U_CALLCONV enumReset(UEnumeration *en, UErrorCode *) { ((Context *)en->context)->currIndex = 0; } static const UEnumeration gCSDetEnumeration = { nullptr, nullptr, enumClose, enumCount, uenum_unextDefault, enumNext, enumReset }; U_CDECL_END U_NAMESPACE_BEGIN UEnumeration * CharsetDetector::getAllDetectableCharsets(UErrorCode &status) { /* Initialize recognized charsets. */ setRecognizers(status); if(U_FAILURE(status)) { return 0; } UEnumeration *en = NEW_ARRAY(UEnumeration, 1); if (en == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; return 0; } memcpy(en, &gCSDetEnumeration, sizeof(UEnumeration)); en->context = (void*)NEW_ARRAY(Context, 1); if (en->context == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; DELETE_ARRAY(en); return 0; } uprv_memset(en->context, 0, sizeof(Context)); ((Context*)en->context)->all = true; return en; } UEnumeration * CharsetDetector::getDetectableCharsets(UErrorCode &status) const { if(U_FAILURE(status)) { return 0; } UEnumeration *en = NEW_ARRAY(UEnumeration, 1); if (en == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; return 0; } memcpy(en, &gCSDetEnumeration, sizeof(UEnumeration)); en->context = (void*)NEW_ARRAY(Context, 1); if (en->context == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; DELETE_ARRAY(en); return 0; } uprv_memset(en->context, 0, sizeof(Context)); ((Context*)en->context)->all = false; ((Context*)en->context)->enabledRecognizers = fEnabledRecognizers; return en; } U_NAMESPACE_END #endif stringi/src/icu74/i18n/chnsecal.cpp0000644000176200001440000010726114700200761016522 0ustar liggesusers// © 
2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ****************************************************************************** * Copyright (C) 2007-2014, International Business Machines Corporation * and others. All Rights Reserved. ****************************************************************************** * * File CHNSECAL.CPP * * Modification History: * * Date Name Description * 9/18/2007 ajmacher ported from java ChineseCalendar ***************************************************************************** */ #include "chnsecal.h" #include #if !UCONFIG_NO_FORMATTING #include "umutex.h" #include #include "gregoimp.h" // Math #include "astro.h" // CalendarAstronomer #include "unicode/simpletz.h" #include "uhash.h" #include "ucln_in.h" #include "cstring.h" // Debugging #ifdef U_DEBUG_CHNSECAL # include # include static void debug_chnsecal_loc(const char *f, int32_t l) { fprintf(stderr, "%s:%d: ", f, l); } static void debug_chnsecal_msg(const char *pat, ...) { va_list ap; va_start(ap, pat); vfprintf(stderr, pat, ap); fflush(stderr); } // must use double parens, i.e.: U_DEBUG_CHNSECAL_MSG(("four is: %d",4)); #define U_DEBUG_CHNSECAL_MSG(x) {debug_chnsecal_loc(__FILE__,__LINE__);debug_chnsecal_msg x;} #else #define U_DEBUG_CHNSECAL_MSG(x) #endif // --- The cache -- static icu::UMutex astroLock; static icu::CalendarAstronomer *gChineseCalendarAstro = nullptr; // Lazy Creation & Access synchronized by class CalendarCache with a mutex. static icu::CalendarCache *gChineseCalendarWinterSolsticeCache = nullptr; static icu::CalendarCache *gChineseCalendarNewYearCache = nullptr; static icu::TimeZone *gChineseCalendarZoneAstroCalc = nullptr; static icu::UInitOnce gChineseCalendarZoneAstroCalcInitOnce {}; /** * The start year of the Chinese calendar, the 61st year of the reign * of Huang Di. 
Some sources use the first year of his reign, * resulting in EXTENDED_YEAR values 60 years greater and ERA (cycle) * values one greater. */ static const int32_t CHINESE_EPOCH_YEAR = -2636; // Gregorian year /** * The offset from GMT in milliseconds at which we perform astronomical * computations. Some sources use a different historically accurate * offset of GMT+7:45:40 for years before 1929; we do not do this. */ static const int32_t CHINA_OFFSET = 8 * kOneHour; /** * Value to be added or subtracted from the local days of a new moon to * get close to the next or prior new moon, but not cross it. Must be * >= 1 and < CalendarAstronomer.SYNODIC_MONTH. */ static const int32_t SYNODIC_GAP = 25; U_CDECL_BEGIN static UBool calendar_chinese_cleanup() { if (gChineseCalendarAstro) { delete gChineseCalendarAstro; gChineseCalendarAstro = nullptr; } if (gChineseCalendarWinterSolsticeCache) { delete gChineseCalendarWinterSolsticeCache; gChineseCalendarWinterSolsticeCache = nullptr; } if (gChineseCalendarNewYearCache) { delete gChineseCalendarNewYearCache; gChineseCalendarNewYearCache = nullptr; } if (gChineseCalendarZoneAstroCalc) { delete gChineseCalendarZoneAstroCalc; gChineseCalendarZoneAstroCalc = nullptr; } gChineseCalendarZoneAstroCalcInitOnce.reset(); return true; } U_CDECL_END U_NAMESPACE_BEGIN // Implementation of the ChineseCalendar class //------------------------------------------------------------------------- // Constructors... //------------------------------------------------------------------------- ChineseCalendar* ChineseCalendar::clone() const { return new ChineseCalendar(*this); } ChineseCalendar::ChineseCalendar(const Locale& aLocale, UErrorCode& success) : Calendar(TimeZone::forLocaleOrDefault(aLocale), aLocale, success), hasLeapMonthBetweenWinterSolstices(false), fEpochYear(CHINESE_EPOCH_YEAR), fZoneAstroCalc(getChineseCalZoneAstroCalc()) { setTimeInMillis(getNow(), success); // Call this again now that the vtable is set up properly. 
} ChineseCalendar::ChineseCalendar(const Locale& aLocale, int32_t epochYear, const TimeZone* zoneAstroCalc, UErrorCode &success) : Calendar(TimeZone::forLocaleOrDefault(aLocale), aLocale, success), hasLeapMonthBetweenWinterSolstices(false), fEpochYear(epochYear), fZoneAstroCalc(zoneAstroCalc) { setTimeInMillis(getNow(), success); // Call this again now that the vtable is set up properly. } ChineseCalendar::ChineseCalendar(const ChineseCalendar& other) : Calendar(other) { hasLeapMonthBetweenWinterSolstices = other.hasLeapMonthBetweenWinterSolstices; fEpochYear = other.fEpochYear; fZoneAstroCalc = other.fZoneAstroCalc; } ChineseCalendar::~ChineseCalendar() { } const char *ChineseCalendar::getType() const { return "chinese"; } static void U_CALLCONV initChineseCalZoneAstroCalc() { gChineseCalendarZoneAstroCalc = new SimpleTimeZone(CHINA_OFFSET, UNICODE_STRING_SIMPLE("CHINA_ZONE") ); ucln_i18n_registerCleanup(UCLN_I18N_CHINESE_CALENDAR, calendar_chinese_cleanup); } const TimeZone* ChineseCalendar::getChineseCalZoneAstroCalc() const { umtx_initOnce(gChineseCalendarZoneAstroCalcInitOnce, &initChineseCalZoneAstroCalc); return gChineseCalendarZoneAstroCalc; } //------------------------------------------------------------------------- // Minimum / Maximum access functions //------------------------------------------------------------------------- static const int32_t LIMITS[UCAL_FIELD_COUNT][4] = { // Minimum Greatest Least Maximum // Minimum Maximum { 1, 1, 83333, 83333}, // ERA { 1, 1, 60, 60}, // YEAR { 0, 0, 11, 11}, // MONTH { 1, 1, 50, 55}, // WEEK_OF_YEAR {/*N/A*/-1,/*N/A*/-1,/*N/A*/-1,/*N/A*/-1}, // WEEK_OF_MONTH { 1, 1, 29, 30}, // DAY_OF_MONTH { 1, 1, 353, 385}, // DAY_OF_YEAR {/*N/A*/-1,/*N/A*/-1,/*N/A*/-1,/*N/A*/-1}, // DAY_OF_WEEK { -1, -1, 5, 5}, // DAY_OF_WEEK_IN_MONTH {/*N/A*/-1,/*N/A*/-1,/*N/A*/-1,/*N/A*/-1}, // AM_PM {/*N/A*/-1,/*N/A*/-1,/*N/A*/-1,/*N/A*/-1}, // HOUR {/*N/A*/-1,/*N/A*/-1,/*N/A*/-1,/*N/A*/-1}, // HOUR_OF_DAY 
{/*N/A*/-1,/*N/A*/-1,/*N/A*/-1,/*N/A*/-1}, // MINUTE {/*N/A*/-1,/*N/A*/-1,/*N/A*/-1,/*N/A*/-1}, // SECOND {/*N/A*/-1,/*N/A*/-1,/*N/A*/-1,/*N/A*/-1}, // MILLISECOND {/*N/A*/-1,/*N/A*/-1,/*N/A*/-1,/*N/A*/-1}, // ZONE_OFFSET {/*N/A*/-1,/*N/A*/-1,/*N/A*/-1,/*N/A*/-1}, // DST_OFFSET { -5000000, -5000000, 5000000, 5000000}, // YEAR_WOY {/*N/A*/-1,/*N/A*/-1,/*N/A*/-1,/*N/A*/-1}, // DOW_LOCAL { -5000000, -5000000, 5000000, 5000000}, // EXTENDED_YEAR {/*N/A*/-1,/*N/A*/-1,/*N/A*/-1,/*N/A*/-1}, // JULIAN_DAY {/*N/A*/-1,/*N/A*/-1,/*N/A*/-1,/*N/A*/-1}, // MILLISECONDS_IN_DAY { 0, 0, 1, 1}, // IS_LEAP_MONTH { 0, 0, 11, 12}, // ORDINAL_MONTH }; /** * @draft ICU 2.4 */ int32_t ChineseCalendar::handleGetLimit(UCalendarDateFields field, ELimitType limitType) const { return LIMITS[field][limitType]; } //---------------------------------------------------------------------- // Calendar framework //---------------------------------------------------------------------- /** * Implement abstract Calendar method to return the extended year * defined by the current fields. This will use either the ERA and * YEAR field as the cycle and year-of-cycle, or the EXTENDED_YEAR * field as the continuous year count, depending on which is newer. * @stable ICU 2.8 */ int32_t ChineseCalendar::handleGetExtendedYear() { int32_t year; if (newestStamp(UCAL_ERA, UCAL_YEAR, kUnset) <= fStamp[UCAL_EXTENDED_YEAR]) { year = internalGet(UCAL_EXTENDED_YEAR, 1); // Default to year 1 } else { int32_t cycle = internalGet(UCAL_ERA, 1) - 1; // 0-based cycle // adjust to the instance specific epoch year = cycle * 60 + internalGet(UCAL_YEAR, 1) - (fEpochYear - CHINESE_EPOCH_YEAR); } return year; } /** * Override Calendar method to return the number of days in the given * extended year and month. * *

Note: This method also reads the IS_LEAP_MONTH field to determine * whether or not the given month is a leap month. * @stable ICU 2.8 */ int32_t ChineseCalendar::handleGetMonthLength(int32_t extendedYear, int32_t month) const { int32_t thisStart = handleComputeMonthStart(extendedYear, month, true) - kEpochStartAsJulianDay + 1; // Julian day -> local days int32_t nextStart = newMoonNear(thisStart + SYNODIC_GAP, true); return nextStart - thisStart; } /** * Override Calendar to compute several fields specific to the Chinese * calendar system. These are: * *

  • ERA *
  • YEAR *
  • MONTH *
  • DAY_OF_MONTH *
  • DAY_OF_YEAR *
  • EXTENDED_YEAR
* * The DAY_OF_WEEK and DOW_LOCAL fields are already set when this * method is called. The getGregorianXxx() methods return Gregorian * calendar equivalents for the given Julian day. * *

Compute the ChineseCalendar-specific field IS_LEAP_MONTH. * @stable ICU 2.8 */ void ChineseCalendar::handleComputeFields(int32_t julianDay, UErrorCode &/*status*/) { computeChineseFields(julianDay - kEpochStartAsJulianDay, // local days getGregorianYear(), getGregorianMonth(), true); // set all fields } /** * Field resolution table that incorporates IS_LEAP_MONTH. */ const UFieldResolutionTable ChineseCalendar::CHINESE_DATE_PRECEDENCE[] = { { { UCAL_DAY_OF_MONTH, kResolveSTOP }, { UCAL_WEEK_OF_YEAR, UCAL_DAY_OF_WEEK, kResolveSTOP }, { UCAL_WEEK_OF_MONTH, UCAL_DAY_OF_WEEK, kResolveSTOP }, { UCAL_DAY_OF_WEEK_IN_MONTH, UCAL_DAY_OF_WEEK, kResolveSTOP }, { UCAL_WEEK_OF_YEAR, UCAL_DOW_LOCAL, kResolveSTOP }, { UCAL_WEEK_OF_MONTH, UCAL_DOW_LOCAL, kResolveSTOP }, { UCAL_DAY_OF_WEEK_IN_MONTH, UCAL_DOW_LOCAL, kResolveSTOP }, { UCAL_DAY_OF_YEAR, kResolveSTOP }, { kResolveRemap | UCAL_DAY_OF_MONTH, UCAL_IS_LEAP_MONTH, kResolveSTOP }, { kResolveSTOP } }, { { UCAL_WEEK_OF_YEAR, kResolveSTOP }, { UCAL_WEEK_OF_MONTH, kResolveSTOP }, { UCAL_DAY_OF_WEEK_IN_MONTH, kResolveSTOP }, { kResolveRemap | UCAL_DAY_OF_WEEK_IN_MONTH, UCAL_DAY_OF_WEEK, kResolveSTOP }, { kResolveRemap | UCAL_DAY_OF_WEEK_IN_MONTH, UCAL_DOW_LOCAL, kResolveSTOP }, { kResolveSTOP } }, {{kResolveSTOP}} }; /** * Override Calendar to add IS_LEAP_MONTH to the field resolution * table. * @stable ICU 2.8 */ const UFieldResolutionTable* ChineseCalendar::getFieldResolutionTable() const { return CHINESE_DATE_PRECEDENCE; } /** * Return the Julian day number of day before the first day of the * given month in the given extended year. * *

Note: This method reads the IS_LEAP_MONTH field to determine * whether the given month is a leap month. * @param eyear the extended year * @param month the zero-based month. The month is also determined * by reading the IS_LEAP_MONTH field. * @return the Julian day number of the day before the first * day of the given month and year * @stable ICU 2.8 */ int32_t ChineseCalendar::handleComputeMonthStart(int32_t eyear, int32_t month, UBool useMonth) const { ChineseCalendar *nonConstThis = (ChineseCalendar*)this; // cast away const // If the month is out of range, adjust it into range, and // modify the extended year value accordingly. if (month < 0 || month > 11) { double m = month; eyear += (int32_t)ClockMath::floorDivide(m, 12.0, &m); month = (int32_t)m; } int32_t gyear = eyear + fEpochYear - 1; // Gregorian year int32_t theNewYear = newYear(gyear); int32_t newMoon = newMoonNear(theNewYear + month * 29, true); int32_t julianDay = newMoon + kEpochStartAsJulianDay; // Save fields for later restoration int32_t saveMonth = internalGet(UCAL_MONTH); int32_t saveOrdinalMonth = internalGet(UCAL_ORDINAL_MONTH); int32_t saveIsLeapMonth = internalGet(UCAL_IS_LEAP_MONTH); // Ignore IS_LEAP_MONTH field if useMonth is false int32_t isLeapMonth = useMonth ? 
saveIsLeapMonth : 0; UErrorCode status = U_ZERO_ERROR; nonConstThis->computeGregorianFields(julianDay, status); if (U_FAILURE(status)) return 0; // This will modify the MONTH and IS_LEAP_MONTH fields (only) nonConstThis->computeChineseFields(newMoon, getGregorianYear(), getGregorianMonth(), false); if (month != internalGet(UCAL_MONTH) || isLeapMonth != internalGet(UCAL_IS_LEAP_MONTH)) { newMoon = newMoonNear(newMoon + SYNODIC_GAP, true); julianDay = newMoon + kEpochStartAsJulianDay; } nonConstThis->internalSet(UCAL_MONTH, saveMonth); nonConstThis->internalSet(UCAL_ORDINAL_MONTH, saveOrdinalMonth); nonConstThis->internalSet(UCAL_IS_LEAP_MONTH, saveIsLeapMonth); return julianDay - 1; } /** * Override Calendar to handle leap months properly. * @stable ICU 2.8 */ void ChineseCalendar::add(UCalendarDateFields field, int32_t amount, UErrorCode& status) { switch (field) { case UCAL_MONTH: case UCAL_ORDINAL_MONTH: if (amount != 0) { int32_t dom = get(UCAL_DAY_OF_MONTH, status); if (U_FAILURE(status)) break; int32_t day = get(UCAL_JULIAN_DAY, status) - kEpochStartAsJulianDay; // Get local day if (U_FAILURE(status)) break; int32_t moon = day - dom + 1; // New moon offsetMonth(moon, dom, amount, status); } break; default: Calendar::add(field, amount, status); break; } } /** * Override Calendar to handle leap months properly. * @stable ICU 2.8 */ void ChineseCalendar::add(EDateFields field, int32_t amount, UErrorCode& status) { add((UCalendarDateFields)field, amount, status); } /** * Override Calendar to handle leap months properly. 
* @stable ICU 2.8 */ void ChineseCalendar::roll(UCalendarDateFields field, int32_t amount, UErrorCode& status) { switch (field) { case UCAL_MONTH: case UCAL_ORDINAL_MONTH: if (amount != 0) { int32_t dom = get(UCAL_DAY_OF_MONTH, status); if (U_FAILURE(status)) break; int32_t day = get(UCAL_JULIAN_DAY, status) - kEpochStartAsJulianDay; // Get local day if (U_FAILURE(status)) break; int32_t moon = day - dom + 1; // New moon (start of this month) // Note throughout the following: Months 12 and 1 are never // followed by a leap month (D&R p. 185). // Compute the adjusted month number m. This is zero-based // value from 0..11 in a non-leap year, and from 0..12 in a // leap year. int32_t m = get(UCAL_MONTH, status); // 0-based month if (U_FAILURE(status)) break; if (hasLeapMonthBetweenWinterSolstices) { // (member variable) if (get(UCAL_IS_LEAP_MONTH, status) == 1) { ++m; } else { // Check for a prior leap month. (In the // following, month 0 is the first month of the // year.) Month 0 is never followed by a leap // month, and we know month m is not a leap month. // moon1 will be the start of month 0 if there is // no leap month between month 0 and month m; // otherwise it will be the start of month 1. int moon1 = moon - (int) (CalendarAstronomer::SYNODIC_MONTH * (m - 0.5)); moon1 = newMoonNear(moon1, true); if (isLeapMonthBetween(moon1, moon)) { ++m; } } if (U_FAILURE(status)) break; } // Now do the standard roll computation on m, with the // allowed range of 0..n-1, where n is 12 or 13. int32_t n = hasLeapMonthBetweenWinterSolstices ? 
13 : 12; // Months in this year int32_t newM = (m + amount) % n; if (newM < 0) { newM += n; } if (newM != m) { offsetMonth(moon, dom, newM - m, status); } } break; default: Calendar::roll(field, amount, status); break; } } void ChineseCalendar::roll(EDateFields field, int32_t amount, UErrorCode& status) { roll((UCalendarDateFields)field, amount, status); } //------------------------------------------------------------------ // Support methods and constants //------------------------------------------------------------------ /** * Convert local days to UTC epoch milliseconds. * This is not an accurate conversion in that getTimezoneOffset * takes the milliseconds in GMT (not local time). In theory, more * accurate algorithm can be implemented but practically we do not need * to go through that complication as long as the historical timezone * changes did not happen around the 'tricky' new moon (new moon around * midnight). * * @param days days after January 1, 1970 0:00 in the astronomical base zone * @return milliseconds after January 1, 1970 0:00 GMT */ double ChineseCalendar::daysToMillis(double days) const { double millis = days * (double)kOneDay; if (fZoneAstroCalc != nullptr) { int32_t rawOffset, dstOffset; UErrorCode status = U_ZERO_ERROR; fZoneAstroCalc->getOffset(millis, false, rawOffset, dstOffset, status); if (U_SUCCESS(status)) { return millis - (double)(rawOffset + dstOffset); } } return millis - (double)CHINA_OFFSET; } /** * Convert UTC epoch milliseconds to local days. 
* @param millis milliseconds after January 1, 1970 0:00 GMT * @return days after January 1, 1970 0:00 in the astronomical base zone */ double ChineseCalendar::millisToDays(double millis) const { if (fZoneAstroCalc != nullptr) { int32_t rawOffset, dstOffset; UErrorCode status = U_ZERO_ERROR; fZoneAstroCalc->getOffset(millis, false, rawOffset, dstOffset, status); if (U_SUCCESS(status)) { return ClockMath::floorDivide(millis + (double)(rawOffset + dstOffset), kOneDay); } } return ClockMath::floorDivide(millis + (double)CHINA_OFFSET, kOneDay); } //------------------------------------------------------------------ // Astronomical computations //------------------------------------------------------------------ /** * Return the major solar term on or after December 15 of the given * Gregorian year, that is, the winter solstice of the given year. * Computations are relative to Asia/Shanghai time zone. * @param gyear a Gregorian year * @return days after January 1, 1970 0:00 Asia/Shanghai of the * winter solstice of the given year */ int32_t ChineseCalendar::winterSolstice(int32_t gyear) const { UErrorCode status = U_ZERO_ERROR; int32_t cacheValue = CalendarCache::get(&gChineseCalendarWinterSolsticeCache, gyear, status); if (cacheValue == 0) { // In books December 15 is used, but it fails for some years // using our algorithms, e.g.: 1298 1391 1492 1553 1560. That // is, winterSolstice(1298) starts search at Dec 14 08:00:00 // PST 1298 with a final result of Dec 14 10:31:59 PST 1299. 
double ms = daysToMillis(Grego::fieldsToDay(gyear, UCAL_DECEMBER, 1)); umtx_lock(&astroLock); if(gChineseCalendarAstro == nullptr) { gChineseCalendarAstro = new CalendarAstronomer(); ucln_i18n_registerCleanup(UCLN_I18N_CHINESE_CALENDAR, calendar_chinese_cleanup); } gChineseCalendarAstro->setTime(ms); UDate solarLong = gChineseCalendarAstro->getSunTime(CalendarAstronomer::WINTER_SOLSTICE(), true); umtx_unlock(&astroLock); // Winter solstice is 270 degrees solar longitude aka Dongzhi cacheValue = (int32_t)millisToDays(solarLong); CalendarCache::put(&gChineseCalendarWinterSolsticeCache, gyear, cacheValue, status); } if(U_FAILURE(status)) { cacheValue = 0; } return cacheValue; } /** * Return the closest new moon to the given date, searching either * forward or backward in time. * @param days days after January 1, 1970 0:00 Asia/Shanghai * @param after if true, search for a new moon on or after the given * date; otherwise, search for a new moon before it * @return days after January 1, 1970 0:00 Asia/Shanghai of the nearest * new moon after or before days */ int32_t ChineseCalendar::newMoonNear(double days, UBool after) const { umtx_lock(&astroLock); if(gChineseCalendarAstro == nullptr) { gChineseCalendarAstro = new CalendarAstronomer(); ucln_i18n_registerCleanup(UCLN_I18N_CHINESE_CALENDAR, calendar_chinese_cleanup); } gChineseCalendarAstro->setTime(daysToMillis(days)); UDate newMoon = gChineseCalendarAstro->getMoonTime(CalendarAstronomer::NEW_MOON(), after); umtx_unlock(&astroLock); return (int32_t) millisToDays(newMoon); } /** * Return the nearest integer number of synodic months between * two dates. 
* @param day1 days after January 1, 1970 0:00 Asia/Shanghai * @param day2 days after January 1, 1970 0:00 Asia/Shanghai * @return the nearest integer number of months between day1 and day2 */ int32_t ChineseCalendar::synodicMonthsBetween(int32_t day1, int32_t day2) const { double roundme = ((day2 - day1) / CalendarAstronomer::SYNODIC_MONTH); return (int32_t) (roundme + (roundme >= 0 ? .5 : -.5)); } /** * Return the major solar term on or before a given date. This * will be an integer from 1..12, with 1 corresponding to 330 degrees, * 2 to 0 degrees, 3 to 30 degrees,..., and 12 to 300 degrees. * @param days days after January 1, 1970 0:00 Asia/Shanghai */ int32_t ChineseCalendar::majorSolarTerm(int32_t days) const { umtx_lock(&astroLock); if(gChineseCalendarAstro == nullptr) { gChineseCalendarAstro = new CalendarAstronomer(); ucln_i18n_registerCleanup(UCLN_I18N_CHINESE_CALENDAR, calendar_chinese_cleanup); } gChineseCalendarAstro->setTime(daysToMillis(days)); UDate solarLongitude = gChineseCalendarAstro->getSunLongitude(); umtx_unlock(&astroLock); // Compute (floor(solarLongitude / (pi/6)) + 2) % 12 int32_t term = ( ((int32_t)(6 * solarLongitude / CalendarAstronomer::PI)) + 2 ) % 12; if (term < 1) { term += 12; } return term; } /** * Return true if the given month lacks a major solar term. * @param newMoon days after January 1, 1970 0:00 Asia/Shanghai of a new * moon */ UBool ChineseCalendar::hasNoMajorSolarTerm(int32_t newMoon) const { return majorSolarTerm(newMoon) == majorSolarTerm(newMoonNear(newMoon + SYNODIC_GAP, true)); } //------------------------------------------------------------------ // Time to fields //------------------------------------------------------------------ /** * Return true if there is a leap month on or after month newMoon1 and * at or before month newMoon2. 
* @param newMoon1 days after January 1, 1970 0:00 astronomical base zone * of a new moon * @param newMoon2 days after January 1, 1970 0:00 astronomical base zone * of a new moon */ UBool ChineseCalendar::isLeapMonthBetween(int32_t newMoon1, int32_t newMoon2) const { #ifdef U_DEBUG_CHNSECAL // This is only needed to debug the timeOfAngle divergence bug. // Remove this later. Liu 11/9/00 if (synodicMonthsBetween(newMoon1, newMoon2) >= 50) { U_DEBUG_CHNSECAL_MSG(( "isLeapMonthBetween(%d, %d): Invalid parameters", newMoon1, newMoon2 )); } #endif while (newMoon2 >= newMoon1) { if (hasNoMajorSolarTerm(newMoon2)) { return true; } newMoon2 = newMoonNear(newMoon2 - SYNODIC_GAP, false); } return false; } /** * Compute fields for the Chinese calendar system. This method can * either set all relevant fields, as required by * handleComputeFields(), or it can just set the MONTH and * IS_LEAP_MONTH fields, as required by * handleComputeMonthStart(). * *

As a side effect, this method sets {@link #hasLeapMonthBetweenWinterSolstices}. * @param days days after January 1, 1970 0:00 astronomical base zone * of the date to compute fields for * @param gyear the Gregorian year of the given date * @param gmonth the Gregorian month of the given date * @param setAllFields if true, set the EXTENDED_YEAR, ERA, YEAR, * DAY_OF_MONTH, and DAY_OF_YEAR fields. In either case set the MONTH * and IS_LEAP_MONTH fields. */ void ChineseCalendar::computeChineseFields(int32_t days, int32_t gyear, int32_t gmonth, UBool setAllFields) { // Find the winter solstices before and after the target date. // These define the boundaries of this Chinese year, specifically, // the position of month 11, which always contains the solstice. // We want solsticeBefore <= date < solsticeAfter. int32_t solsticeBefore; int32_t solsticeAfter = winterSolstice(gyear); if (days < solsticeAfter) { solsticeBefore = winterSolstice(gyear - 1); } else { solsticeBefore = solsticeAfter; solsticeAfter = winterSolstice(gyear + 1); } // Find the start of the month after month 11. This will be either // the prior month 12 or leap month 11 (very rare). Also find the // start of the following month 11. 
int32_t firstMoon = newMoonNear(solsticeBefore + 1, true); int32_t lastMoon = newMoonNear(solsticeAfter + 1, false); int32_t thisMoon = newMoonNear(days + 1, false); // Start of this month // Note: hasLeapMonthBetweenWinterSolstices is a member variable hasLeapMonthBetweenWinterSolstices = synodicMonthsBetween(firstMoon, lastMoon) == 12; int32_t month = synodicMonthsBetween(firstMoon, thisMoon); int32_t theNewYear = newYear(gyear); if (days < theNewYear) { theNewYear = newYear(gyear-1); } if (hasLeapMonthBetweenWinterSolstices && isLeapMonthBetween(firstMoon, thisMoon)) { month--; } if (month < 1) { month += 12; } int32_t ordinalMonth = synodicMonthsBetween(theNewYear, thisMoon); if (ordinalMonth < 0) { ordinalMonth += 12; } UBool isLeapMonth = hasLeapMonthBetweenWinterSolstices && hasNoMajorSolarTerm(thisMoon) && !isLeapMonthBetween(firstMoon, newMoonNear(thisMoon - SYNODIC_GAP, false)); internalSet(UCAL_MONTH, month-1); // Convert from 1-based to 0-based internalSet(UCAL_ORDINAL_MONTH, ordinalMonth); // Convert from 1-based to 0-based internalSet(UCAL_IS_LEAP_MONTH, isLeapMonth?1:0); if (setAllFields) { // Extended year and cycle year is based on the epoch year int32_t extended_year = gyear - fEpochYear; int cycle_year = gyear - CHINESE_EPOCH_YEAR; if (month < 11 || gmonth >= UCAL_JULY) { extended_year++; cycle_year++; } int32_t dayOfMonth = days - thisMoon + 1; internalSet(UCAL_EXTENDED_YEAR, extended_year); // 0->0,60 1->1,1 60->1,60 61->2,1 etc. int32_t yearOfCycle; int32_t cycle = ClockMath::floorDivide(cycle_year - 1, 60, &yearOfCycle); internalSet(UCAL_ERA, cycle + 1); internalSet(UCAL_YEAR, yearOfCycle + 1); internalSet(UCAL_DAY_OF_MONTH, dayOfMonth); // Days will be before the first new year we compute if this // date is in month 11, leap 11, 12. There is never a leap 12. // New year computations are cached so this should be cheap in // the long run. 
int32_t theNewYear = newYear(gyear); if (days < theNewYear) { theNewYear = newYear(gyear-1); } internalSet(UCAL_DAY_OF_YEAR, days - theNewYear + 1); } } //------------------------------------------------------------------ // Fields to time //------------------------------------------------------------------ /** * Return the Chinese new year of the given Gregorian year. * @param gyear a Gregorian year * @return days after January 1, 1970 0:00 astronomical base zone of the * Chinese new year of the given year (this will be a new moon) */ int32_t ChineseCalendar::newYear(int32_t gyear) const { UErrorCode status = U_ZERO_ERROR; int32_t cacheValue = CalendarCache::get(&gChineseCalendarNewYearCache, gyear, status); if (cacheValue == 0) { int32_t solsticeBefore= winterSolstice(gyear - 1); int32_t solsticeAfter = winterSolstice(gyear); int32_t newMoon1 = newMoonNear(solsticeBefore + 1, true); int32_t newMoon2 = newMoonNear(newMoon1 + SYNODIC_GAP, true); int32_t newMoon11 = newMoonNear(solsticeAfter + 1, false); if (synodicMonthsBetween(newMoon1, newMoon11) == 12 && (hasNoMajorSolarTerm(newMoon1) || hasNoMajorSolarTerm(newMoon2))) { cacheValue = newMoonNear(newMoon2 + SYNODIC_GAP, true); } else { cacheValue = newMoon2; } CalendarCache::put(&gChineseCalendarNewYearCache, gyear, cacheValue, status); } if(U_FAILURE(status)) { cacheValue = 0; } return cacheValue; } /** * Adjust this calendar to be delta months before or after a given * start position, pinning the day of month if necessary. The start * position is given as a local days number for the start of the month * and a day-of-month. Used by add() and roll(). * @param newMoon the local days of the first day of the month of the * start position (days after January 1, 1970 0:00 Asia/Shanghai) * @param dom the 1-based day-of-month of the start position * @param delta the number of months to move forward or backward from * the start position * @param status The status. 
*/ void ChineseCalendar::offsetMonth(int32_t newMoon, int32_t dom, int32_t delta, UErrorCode& status) { if (U_FAILURE(status)) { return; } // Move to the middle of the month before our target month. double value = newMoon; value += (CalendarAstronomer::SYNODIC_MONTH * (static_cast(delta) - 0.5)); if (value < INT32_MIN || value > INT32_MAX) { status = U_ILLEGAL_ARGUMENT_ERROR; return; } newMoon = static_cast(value); // Search forward to the target month's new moon newMoon = newMoonNear(newMoon, true); // Find the target dom int32_t jd = newMoon + kEpochStartAsJulianDay - 1 + dom; // Pin the dom. In this calendar all months are 29 or 30 days // so pinning just means handling dom 30. if (dom > 29) { set(UCAL_JULIAN_DAY, jd-1); // TODO Fix this. We really shouldn't ever have to // explicitly call complete(). This is either a bug in // this method, in ChineseCalendar, or in // Calendar.getActualMaximum(). I suspect the last. complete(status); if (U_FAILURE(status)) return; if (getActualMaximum(UCAL_DAY_OF_MONTH, status) >= dom) { if (U_FAILURE(status)) return; set(UCAL_JULIAN_DAY, jd); } } else { set(UCAL_JULIAN_DAY, jd); } } constexpr uint32_t kChineseRelatedYearDiff = -2637; int32_t ChineseCalendar::getRelatedYear(UErrorCode &status) const { int32_t year = get(UCAL_EXTENDED_YEAR, status); if (U_FAILURE(status)) { return 0; } return year + kChineseRelatedYearDiff; } void ChineseCalendar::setRelatedYear(int32_t year) { // set extended year set(UCAL_EXTENDED_YEAR, year - kChineseRelatedYearDiff); } // default century static UDate gSystemDefaultCenturyStart = DBL_MIN; static int32_t gSystemDefaultCenturyStartYear = -1; static icu::UInitOnce gSystemDefaultCenturyInitOnce {}; UBool ChineseCalendar::haveDefaultCentury() const { return true; } UDate ChineseCalendar::defaultCenturyStart() const { return internalGetDefaultCenturyStart(); } int32_t ChineseCalendar::defaultCenturyStartYear() const { return internalGetDefaultCenturyStartYear(); } static void U_CALLCONV 
initializeSystemDefaultCentury() { // initialize systemDefaultCentury and systemDefaultCenturyYear based // on the current time. They'll be set to 80 years before // the current time. UErrorCode status = U_ZERO_ERROR; ChineseCalendar calendar(Locale("@calendar=chinese"),status); if (U_SUCCESS(status)) { calendar.setTime(Calendar::getNow(), status); calendar.add(UCAL_YEAR, -80, status); gSystemDefaultCenturyStart = calendar.getTime(status); gSystemDefaultCenturyStartYear = calendar.get(UCAL_YEAR, status); } // We have no recourse upon failure unless we want to propagate the failure // out. } UDate ChineseCalendar::internalGetDefaultCenturyStart() const { // lazy-evaluate systemDefaultCenturyStart umtx_initOnce(gSystemDefaultCenturyInitOnce, &initializeSystemDefaultCentury); return gSystemDefaultCenturyStart; } int32_t ChineseCalendar::internalGetDefaultCenturyStartYear() const { // lazy-evaluate systemDefaultCenturyStartYear umtx_initOnce(gSystemDefaultCenturyInitOnce, &initializeSystemDefaultCentury); return gSystemDefaultCenturyStartYear; } bool ChineseCalendar::inTemporalLeapYear(UErrorCode &status) const { int32_t days = getActualMaximum(UCAL_DAY_OF_YEAR, status); if (U_FAILURE(status)) return false; return days > 360; } UOBJECT_DEFINE_RTTI_IMPLEMENTATION(ChineseCalendar) static const char * const gTemporalLeapMonthCodes[] = { "M01L", "M02L", "M03L", "M04L", "M05L", "M06L", "M07L", "M08L", "M09L", "M10L", "M11L", "M12L", nullptr }; const char* ChineseCalendar::getTemporalMonthCode(UErrorCode &status) const { // We need to call get, not internalGet, to force the calculation // from UCAL_ORDINAL_MONTH. 
int32_t is_leap = get(UCAL_IS_LEAP_MONTH, status); if (U_FAILURE(status)) return nullptr; if (is_leap != 0) { int32_t month = get(UCAL_MONTH, status); if (U_FAILURE(status)) return nullptr; return gTemporalLeapMonthCodes[month]; } return Calendar::getTemporalMonthCode(status); } void ChineseCalendar::setTemporalMonthCode(const char* code, UErrorCode& status ) { if (U_FAILURE(status)) return; int32_t len = static_cast(uprv_strlen(code)); if (len != 4 || code[0] != 'M' || code[3] != 'L') { set(UCAL_IS_LEAP_MONTH, 0); return Calendar::setTemporalMonthCode(code, status); } for (int m = 0; gTemporalLeapMonthCodes[m] != nullptr; m++) { if (uprv_strcmp(code, gTemporalLeapMonthCodes[m]) == 0) { set(UCAL_MONTH, m); set(UCAL_IS_LEAP_MONTH, 1); return; } } status = U_ILLEGAL_ARGUMENT_ERROR; } int32_t ChineseCalendar::internalGetMonth() const { if (resolveFields(kMonthPrecedence) == UCAL_MONTH) { return internalGet(UCAL_MONTH); } LocalPointer temp(this->clone()); temp->set(UCAL_MONTH, 0); temp->set(UCAL_IS_LEAP_MONTH, 0); temp->set(UCAL_DATE, 1); // Calculate the UCAL_MONTH and UCAL_IS_LEAP_MONTH by adding number of // months. UErrorCode status = U_ZERO_ERROR; temp->roll(UCAL_MONTH, internalGet(UCAL_ORDINAL_MONTH), status); U_ASSERT(U_SUCCESS(status)); ChineseCalendar *nonConstThis = (ChineseCalendar*)this; // cast away const nonConstThis->internalSet(UCAL_IS_LEAP_MONTH, temp->get(UCAL_IS_LEAP_MONTH, status)); U_ASSERT(U_SUCCESS(status)); int32_t month = temp->get(UCAL_MONTH, status); U_ASSERT(U_SUCCESS(status)); nonConstThis->internalSet(UCAL_MONTH, month); return month; } int32_t ChineseCalendar::internalGetMonth(int32_t defaultValue) const { if (resolveFields(kMonthPrecedence) == UCAL_MONTH) { return internalGet(UCAL_MONTH, defaultValue); } return internalGetMonth(); } U_NAMESPACE_END #endif stringi/src/icu74/i18n/collation.h0000644000176200001440000004556714700200761016405 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others. 
// License & terms of use: http://www.unicode.org/copyright.html /* ******************************************************************************* * Copyright (C) 2010-2015, International Business Machines * Corporation and others. All Rights Reserved. ******************************************************************************* * collation.h * * created on: 2010oct27 * created by: Markus W. Scherer */ #ifndef __COLLATION_H__ #define __COLLATION_H__ #include "unicode/utypes.h" #if !UCONFIG_NO_COLLATION U_NAMESPACE_BEGIN /** * Collation v2 basic definitions and static helper functions. * * Data structures except for expansion tables store 32-bit CEs which are * either specials (see tags below) or are compact forms of 64-bit CEs. */ class U_I18N_API Collation { public: // Special sort key bytes for all levels. static const uint8_t TERMINATOR_BYTE = 0; static const uint8_t LEVEL_SEPARATOR_BYTE = 1; /** The secondary/tertiary lower limit for tailoring before any root elements. */ static const uint32_t BEFORE_WEIGHT16 = 0x0100; /** * Merge-sort-key separator. * Same as the unique primary and identical-level weights of U+FFFE. * Must not be used as primary compression low terminator. * Otherwise usable. */ static const uint8_t MERGE_SEPARATOR_BYTE = 2; static const uint32_t MERGE_SEPARATOR_PRIMARY = 0x02000000; // U+FFFE static const uint32_t MERGE_SEPARATOR_CE32 = 0x02000505; // U+FFFE /** * Primary compression low terminator, must be greater than MERGE_SEPARATOR_BYTE. * Reserved value in primary second byte if the lead byte is compressible. * Otherwise usable in all CE weight bytes. */ static const uint8_t PRIMARY_COMPRESSION_LOW_BYTE = 3; /** * Primary compression high terminator. * Reserved value in primary second byte if the lead byte is compressible. * Otherwise usable in all CE weight bytes. */ static const uint8_t PRIMARY_COMPRESSION_HIGH_BYTE = 0xff; /** Default secondary/tertiary weight lead byte. 
*/ static const uint8_t COMMON_BYTE = 5; static const uint32_t COMMON_WEIGHT16 = 0x0500; /** Middle 16 bits of a CE with a common secondary weight. */ static const uint32_t COMMON_SECONDARY_CE = 0x05000000; /** Lower 16 bits of a CE with a common tertiary weight. */ static const uint32_t COMMON_TERTIARY_CE = 0x0500; /** Lower 32 bits of a CE with common secondary and tertiary weights. */ static const uint32_t COMMON_SEC_AND_TER_CE = 0x05000500; static const uint32_t SECONDARY_MASK = 0xffff0000; static const uint32_t CASE_MASK = 0xc000; static const uint32_t SECONDARY_AND_CASE_MASK = SECONDARY_MASK | CASE_MASK; /** Only the 2*6 bits for the pure tertiary weight. */ static const uint32_t ONLY_TERTIARY_MASK = 0x3f3f; /** Only the secondary & tertiary bits; no case, no quaternary. */ static const uint32_t ONLY_SEC_TER_MASK = SECONDARY_MASK | ONLY_TERTIARY_MASK; /** Case bits and tertiary bits. */ static const uint32_t CASE_AND_TERTIARY_MASK = CASE_MASK | ONLY_TERTIARY_MASK; static const uint32_t QUATERNARY_MASK = 0xc0; /** Case bits and quaternary bits. */ static const uint32_t CASE_AND_QUATERNARY_MASK = CASE_MASK | QUATERNARY_MASK; static const uint8_t UNASSIGNED_IMPLICIT_BYTE = 0xfe; // compressible /** * First unassigned: AlphabeticIndex overflow boundary. * We want a 3-byte primary so that it fits into the root elements table. * * This 3-byte primary will not collide with * any unassigned-implicit 4-byte primaries because * the first few hundred Unicode code points all have real mappings. */ static const uint32_t FIRST_UNASSIGNED_PRIMARY = 0xfe040200; static const uint8_t TRAIL_WEIGHT_BYTE = 0xff; // not compressible static const uint32_t FIRST_TRAILING_PRIMARY = 0xff020200; // [first trailing] static const uint32_t MAX_PRIMARY = 0xffff0000; // U+FFFF static const uint32_t MAX_REGULAR_CE32 = 0xffff0505; // U+FFFF // CE32 value for U+FFFD as well as illegal UTF-8 byte sequences (which behave like U+FFFD). 
// We use the third-highest primary weight for U+FFFD (as in UCA 6.3+). static const uint32_t FFFD_PRIMARY = MAX_PRIMARY - 0x20000; static const uint32_t FFFD_CE32 = MAX_REGULAR_CE32 - 0x20000; /** * A CE32 is special if its low byte is this or greater. * Impossible case bits 11 mark special CE32s. * This value itself is used to indicate a fallback to the base collator. */ static const uint8_t SPECIAL_CE32_LOW_BYTE = 0xc0; static const uint32_t FALLBACK_CE32 = SPECIAL_CE32_LOW_BYTE; /** * Low byte of a long-primary special CE32. */ static const uint8_t LONG_PRIMARY_CE32_LOW_BYTE = 0xc1; // SPECIAL_CE32_LOW_BYTE | LONG_PRIMARY_TAG static const uint32_t UNASSIGNED_CE32 = 0xffffffff; // Compute an unassigned-implicit CE. static const uint32_t NO_CE32 = 1; /** No CE: End of input. Only used in runtime code, not stored in data. */ static const uint32_t NO_CE_PRIMARY = 1; // not a left-adjusted weight static const uint32_t NO_CE_WEIGHT16 = 0x0100; // weight of LEVEL_SEPARATOR_BYTE static const int64_t NO_CE = INT64_C(0x101000100); // NO_CE_PRIMARY, NO_CE_WEIGHT16, NO_CE_WEIGHT16 /** Sort key levels. */ enum Level { /** Unspecified level. */ NO_LEVEL, PRIMARY_LEVEL, SECONDARY_LEVEL, CASE_LEVEL, TERTIARY_LEVEL, QUATERNARY_LEVEL, IDENTICAL_LEVEL, /** Beyond sort key bytes. */ ZERO_LEVEL }; /** * Sort key level flags: xx_FLAG = 1 << xx_LEVEL. * In Java, use enum Level with flag() getters, or use EnumSet rather than hand-made bit sets. */ static const uint32_t NO_LEVEL_FLAG = 1; static const uint32_t PRIMARY_LEVEL_FLAG = 2; static const uint32_t SECONDARY_LEVEL_FLAG = 4; static const uint32_t CASE_LEVEL_FLAG = 8; static const uint32_t TERTIARY_LEVEL_FLAG = 0x10; static const uint32_t QUATERNARY_LEVEL_FLAG = 0x20; static const uint32_t IDENTICAL_LEVEL_FLAG = 0x40; static const uint32_t ZERO_LEVEL_FLAG = 0x80; /** * Special-CE32 tags, from bits 3..0 of a special 32-bit CE. * Bits 31..8 are available for tag-specific data. * Bits 5..4: Reserved. 
May be used in the future to indicate lccc!=0 and tccc!=0. */ enum { /** * Fall back to the base collator. * This is the tag value in SPECIAL_CE32_LOW_BYTE and FALLBACK_CE32. * Bits 31..8: Unused, 0. */ FALLBACK_TAG = 0, /** * Long-primary CE with COMMON_SEC_AND_TER_CE. * Bits 31..8: Three-byte primary. */ LONG_PRIMARY_TAG = 1, /** * Long-secondary CE with zero primary. * Bits 31..16: Secondary weight. * Bits 15.. 8: Tertiary weight. */ LONG_SECONDARY_TAG = 2, /** * Unused. * May be used in the future for single-byte secondary CEs (SHORT_SECONDARY_TAG), * storing the secondary in bits 31..24, the ccc in bits 23..16, * and the tertiary in bits 15..8. */ RESERVED_TAG_3 = 3, /** * Latin mini expansions of two simple CEs [pp, 05, tt] [00, ss, 05]. * Bits 31..24: Single-byte primary weight pp of the first CE. * Bits 23..16: Tertiary weight tt of the first CE. * Bits 15.. 8: Secondary weight ss of the second CE. */ LATIN_EXPANSION_TAG = 4, /** * Points to one or more simple/long-primary/long-secondary 32-bit CE32s. * Bits 31..13: Index into uint32_t table. * Bits 12.. 8: Length=1..31. */ EXPANSION32_TAG = 5, /** * Points to one or more 64-bit CEs. * Bits 31..13: Index into CE table. * Bits 12.. 8: Length=1..31. */ EXPANSION_TAG = 6, /** * Builder data, used only in the CollationDataBuilder, not in runtime data. * * If bit 8 is 0: Builder context, points to a list of context-sensitive mappings. * Bits 31..13: Index to the builder's list of ConditionalCE32 for this character. * Bits 12.. 9: Unused, 0. * * If bit 8 is 1 (IS_BUILDER_JAMO_CE32): Builder-only jamoCE32 value. * The builder fetches the Jamo CE32 from the trie. * Bits 31..13: Jamo code point. * Bits 12.. 9: Unused, 0. */ BUILDER_DATA_TAG = 7, /** * Points to prefix trie. * Bits 31..13: Index into prefix/contraction data. * Bits 12.. 8: Unused, 0. */ PREFIX_TAG = 8, /** * Points to contraction data. * Bits 31..13: Index into prefix/contraction data. * Bit 12: Unused, 0. * Bit 11: CONTRACT_HAS_STARTER flag. 
(Used by ICU4X only.) * Bit 10: CONTRACT_TRAILING_CCC flag. * Bit 9: CONTRACT_NEXT_CCC flag. * Bit 8: CONTRACT_SINGLE_CP_NO_MATCH flag. */ CONTRACTION_TAG = 9, /** * Decimal digit. * Bits 31..13: Index into uint32_t table for non-numeric-collation CE32. * Bit 12: Unused, 0. * Bits 11.. 8: Digit value 0..9. */ DIGIT_TAG = 10, /** * Tag for U+0000, for moving the NUL-termination handling * from the regular fastpath into specials-handling code. * Bits 31..8: Unused, 0. */ U0000_TAG = 11, /** * Tag for a Hangul syllable. * Bits 31..9: Unused, 0. * Bit 8: HANGUL_NO_SPECIAL_JAMO flag. */ HANGUL_TAG = 12, /** * Tag for a lead surrogate code unit. * Optional optimization for UTF-16 string processing. * Bits 31..10: Unused, 0. * 9.. 8: =0: All associated supplementary code points are unassigned-implicit. * =1: All associated supplementary code points fall back to the base data. * else: (Normally 2) Look up the data for the supplementary code point. */ LEAD_SURROGATE_TAG = 13, /** * Tag for CEs with primary weights in code point order. * Bits 31..13: Index into CE table, for one data "CE". * Bits 12.. 8: Unused, 0. * * This data "CE" has the following bit fields: * Bits 63..32: Three-byte primary pppppp00. * 31.. 8: Start/base code point of the in-order range. * 7: Flag isCompressible primary. * 6.. 0: Per-code point primary-weight increment. */ OFFSET_TAG = 14, /** * Implicit CE tag. Compute an unassigned-implicit CE. * All bits are set (UNASSIGNED_CE32=0xffffffff). */ IMPLICIT_TAG = 15 }; static UBool isAssignedCE32(uint32_t ce32) { return ce32 != FALLBACK_CE32 && ce32 != UNASSIGNED_CE32; } /** * We limit the number of CEs in an expansion * so that we can use a small number of length bits in the data structure, * and so that an implementation can copy CEs at runtime without growing a destination buffer. */ static const int32_t MAX_EXPANSION_LENGTH = 31; static const int32_t MAX_INDEX = 0x7ffff; /** * Set if there is no match for the single (no-suffix) character itself. 
* This is only possible if there is a prefix. * In this case, discontiguous contraction matching cannot add combining marks * starting from an empty suffix. * The default CE32 is used anyway if there is no suffix match. */ static const uint32_t CONTRACT_SINGLE_CP_NO_MATCH = 0x100; /** Set if the first character of every contraction suffix has lccc!=0. */ static const uint32_t CONTRACT_NEXT_CCC = 0x200; /** Set if any contraction suffix ends with lccc!=0. */ static const uint32_t CONTRACT_TRAILING_CCC = 0x400; /** Set if any contraction suffix contains a starter. (Used by ICU4X only.) */ static const uint32_t CONTRACT_HAS_STARTER = 0x800; /** For HANGUL_TAG: None of its Jamo CE32s isSpecialCE32(). */ static const uint32_t HANGUL_NO_SPECIAL_JAMO = 0x100; static const uint32_t LEAD_ALL_UNASSIGNED = 0; static const uint32_t LEAD_ALL_FALLBACK = 0x100; static const uint32_t LEAD_MIXED = 0x200; static const uint32_t LEAD_TYPE_MASK = 0x300; static uint32_t makeLongPrimaryCE32(uint32_t p) { return p | LONG_PRIMARY_CE32_LOW_BYTE; } /** Turns the long-primary CE32 into a primary weight pppppp00. */ static inline uint32_t primaryFromLongPrimaryCE32(uint32_t ce32) { return ce32 & 0xffffff00; } static inline int64_t ceFromLongPrimaryCE32(uint32_t ce32) { return ((int64_t)(ce32 & 0xffffff00) << 32) | COMMON_SEC_AND_TER_CE; } static uint32_t makeLongSecondaryCE32(uint32_t lower32) { return lower32 | SPECIAL_CE32_LOW_BYTE | LONG_SECONDARY_TAG; } static inline int64_t ceFromLongSecondaryCE32(uint32_t ce32) { return ce32 & 0xffffff00; } /** Makes a special CE32 with tag, index and length. */ static uint32_t makeCE32FromTagIndexAndLength(int32_t tag, int32_t index, int32_t length) { return (index << 13) | (length << 8) | SPECIAL_CE32_LOW_BYTE | tag; } /** Makes a special CE32 with only tag and index. 
*/ static uint32_t makeCE32FromTagAndIndex(int32_t tag, int32_t index) { return (index << 13) | SPECIAL_CE32_LOW_BYTE | tag; } static inline UBool isSpecialCE32(uint32_t ce32) { return (ce32 & 0xff) >= SPECIAL_CE32_LOW_BYTE; } static inline int32_t tagFromCE32(uint32_t ce32) { return (int32_t)(ce32 & 0xf); } static inline UBool hasCE32Tag(uint32_t ce32, int32_t tag) { return isSpecialCE32(ce32) && tagFromCE32(ce32) == tag; } static inline UBool isLongPrimaryCE32(uint32_t ce32) { return hasCE32Tag(ce32, LONG_PRIMARY_TAG); } static UBool isSimpleOrLongCE32(uint32_t ce32) { return !isSpecialCE32(ce32) || tagFromCE32(ce32) == LONG_PRIMARY_TAG || tagFromCE32(ce32) == LONG_SECONDARY_TAG; } /** * @return true if the ce32 yields one or more CEs without further data lookups */ static UBool isSelfContainedCE32(uint32_t ce32) { return !isSpecialCE32(ce32) || tagFromCE32(ce32) == LONG_PRIMARY_TAG || tagFromCE32(ce32) == LONG_SECONDARY_TAG || tagFromCE32(ce32) == LATIN_EXPANSION_TAG; } static inline UBool isPrefixCE32(uint32_t ce32) { return hasCE32Tag(ce32, PREFIX_TAG); } static inline UBool isContractionCE32(uint32_t ce32) { return hasCE32Tag(ce32, CONTRACTION_TAG); } static inline UBool ce32HasContext(uint32_t ce32) { return isSpecialCE32(ce32) && (tagFromCE32(ce32) == PREFIX_TAG || tagFromCE32(ce32) == CONTRACTION_TAG); } /** * Get the first of the two Latin-expansion CEs encoded in ce32. * @see LATIN_EXPANSION_TAG */ static inline int64_t latinCE0FromCE32(uint32_t ce32) { return ((int64_t)(ce32 & 0xff000000) << 32) | COMMON_SECONDARY_CE | ((ce32 & 0xff0000) >> 8); } /** * Get the second of the two Latin-expansion CEs encoded in ce32. * @see LATIN_EXPANSION_TAG */ static inline int64_t latinCE1FromCE32(uint32_t ce32) { return ((ce32 & 0xff00) << 16) | COMMON_TERTIARY_CE; } /** * Returns the data index from a special CE32. */ static inline int32_t indexFromCE32(uint32_t ce32) { return (int32_t)(ce32 >> 13); } /** * Returns the data length from a ce32. 
*/ static inline int32_t lengthFromCE32(uint32_t ce32) { return (ce32 >> 8) & 31; } /** * Returns the digit value from a DIGIT_TAG ce32. */ static inline char digitFromCE32(uint32_t ce32) { return (char)((ce32 >> 8) & 0xf); } /** Returns a 64-bit CE from a simple CE32 (not special). */ static inline int64_t ceFromSimpleCE32(uint32_t ce32) { // normal form ppppsstt -> pppp0000ss00tt00 // assert (ce32 & 0xff) < SPECIAL_CE32_LOW_BYTE return ((int64_t)(ce32 & 0xffff0000) << 32) | ((ce32 & 0xff00) << 16) | ((ce32 & 0xff) << 8); } /** Returns a 64-bit CE from a simple/long-primary/long-secondary CE32. */ static inline int64_t ceFromCE32(uint32_t ce32) { uint32_t tertiary = ce32 & 0xff; if(tertiary < SPECIAL_CE32_LOW_BYTE) { // normal form ppppsstt -> pppp0000ss00tt00 return ((int64_t)(ce32 & 0xffff0000) << 32) | ((ce32 & 0xff00) << 16) | (tertiary << 8); } else { ce32 -= tertiary; if((tertiary & 0xf) == LONG_PRIMARY_TAG) { // long-primary form ppppppC1 -> pppppp00050000500 return ((int64_t)ce32 << 32) | COMMON_SEC_AND_TER_CE; } else { // long-secondary form ssssttC2 -> 00000000sssstt00 // assert (tertiary & 0xf) == LONG_SECONDARY_TAG return ce32; } } } /** Creates a CE from a primary weight. */ static inline int64_t makeCE(uint32_t p) { return ((int64_t)p << 32) | COMMON_SEC_AND_TER_CE; } /** * Creates a CE from a primary weight, * 16-bit secondary/tertiary weights, and a 2-bit quaternary. */ static inline int64_t makeCE(uint32_t p, uint32_t s, uint32_t t, uint32_t q) { return ((int64_t)p << 32) | (s << 16) | t | (q << 6); } /** * Increments a 2-byte primary by a code point offset. */ static uint32_t incTwoBytePrimaryByOffset(uint32_t basePrimary, UBool isCompressible, int32_t offset); /** * Increments a 3-byte primary by a code point offset. */ static uint32_t incThreeBytePrimaryByOffset(uint32_t basePrimary, UBool isCompressible, int32_t offset); /** * Decrements a 2-byte primary by one range step (1..0x7f). 
*/ static uint32_t decTwoBytePrimaryByOneStep(uint32_t basePrimary, UBool isCompressible, int32_t step); /** * Decrements a 3-byte primary by one range step (1..0x7f). */ static uint32_t decThreeBytePrimaryByOneStep(uint32_t basePrimary, UBool isCompressible, int32_t step); /** * Computes a 3-byte primary for c's OFFSET_TAG data "CE". */ static uint32_t getThreeBytePrimaryForOffsetData(UChar32 c, int64_t dataCE); /** * Returns the unassigned-character implicit primary weight for any valid code point c. */ static uint32_t unassignedPrimaryFromCodePoint(UChar32 c); static inline int64_t unassignedCEFromCodePoint(UChar32 c) { return makeCE(unassignedPrimaryFromCodePoint(c)); } private: Collation() = delete; // No instantiation. }; U_NAMESPACE_END #endif // !UCONFIG_NO_COLLATION #endif // __COLLATION_H__ stringi/src/icu74/i18n/number_microprops.h0000644000176200001440000001536314700200761020155 0ustar liggesusers// © 2017 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html #include "unicode/utypes.h" #if !UCONFIG_NO_FORMATTING #ifndef __NUMBER_MICROPROPS_H__ #define __NUMBER_MICROPROPS_H__ // TODO: minimize includes #include "unicode/numberformatter.h" #include "number_types.h" #include "number_decimalquantity.h" #include "number_scientific.h" #include "number_patternstring.h" #include "number_modifiers.h" #include "number_multiplier.h" #include "number_roundingutils.h" #include "decNumber.h" #include "charstr.h" #include "util.h" U_NAMESPACE_BEGIN namespace number { namespace impl { /** * A copyable container for the integer values of mixed unit measurements. * * If memory allocation fails during copying, no values are copied and status is * set to U_MEMORY_ALLOCATION_ERROR. */ class IntMeasures : public MaybeStackArray { public: /** * Default constructor initializes with internal T[stackCapacity] buffer. 
* * Stack Capacity: most mixed units are expected to consist of two or three * subunits, so one or two integer measures should be enough. */ IntMeasures() : MaybeStackArray() {} /** * Copy constructor. * * If memory allocation fails during copying, no values are copied and * status is set to U_MEMORY_ALLOCATION_ERROR. */ IntMeasures(const IntMeasures &other) : MaybeStackArray() { this->operator=(other); } // Assignment operator IntMeasures &operator=(const IntMeasures &rhs) { if (this == &rhs) { return *this; } copyFrom(rhs, status); return *this; } /** Move constructor */ IntMeasures(IntMeasures &&src) = default; /** Move assignment */ IntMeasures &operator=(IntMeasures &&src) = default; UErrorCode status = U_ZERO_ERROR; }; struct SimpleMicroProps : public UMemory { Grouper grouping; bool useCurrency = false; UNumberDecimalSeparatorDisplay decimal = UNUM_DECIMAL_SEPARATOR_AUTO; // Currency symbol to be used as the decimal separator UnicodeString currencyAsDecimal = ICU_Utility::makeBogusString(); // Note: This struct has no direct ownership of the following pointer. const DecimalFormatSymbols* symbols = nullptr; }; /** * MicroProps is the first MicroPropsGenerator that should be should be called, * producing an initialized MicroProps instance that will be passed on and * modified throughout the rest of the chain of MicroPropsGenerator instances. */ struct MicroProps : public MicroPropsGenerator { SimpleMicroProps simple; // NOTE: All of these fields are properly initialized in NumberFormatterImpl. RoundingImpl rounder; Padder padding; IntegerWidth integerWidth; UNumberSignDisplay sign; char nsName[9]; // No ownership: must point at a string which will outlive MicroProps // instances, e.g. a string with static storage duration, or just a string // that will never be deallocated or modified. const char *gender; // Note: This struct has no direct ownership of the following pointers. 
// Pointers to Modifiers provided by the number formatting pipeline (when // the value is known): // A Modifier provided by LongNameHandler, used for currency long names and // units. If there is no LongNameHandler needed, this should be an // EmptyModifier. (This is typically the third modifier applied.) const Modifier* modOuter; // A Modifier for short currencies and compact notation. (This is typically // the second modifier applied.) const Modifier* modMiddle = nullptr; // A Modifier provided by ScientificHandler, used for scientific notation. // This is typically the first modifier applied. const Modifier* modInner; // The following "helper" fields may optionally be used during the MicroPropsGenerator. // They live here to retain memory. struct { // The ScientificModifier for which ScientificHandler is responsible. // ScientificHandler::processQuantity() modifies this Modifier. ScientificModifier scientificModifier; // EmptyModifier used for modOuter EmptyModifier emptyWeakModifier{false}; // EmptyModifier used for modInner EmptyModifier emptyStrongModifier{true}; MultiplierFormatHandler multiplier; // A Modifier used for Mixed Units. When formatting mixed units, // LongNameHandler assigns this Modifier. SimpleModifier mixedUnitModifier; } helpers; // The MeasureUnit with which the output is represented. May also have // UMEASURE_UNIT_MIXED complexity, in which case mixedMeasures comes into // play. MeasureUnit outputUnit; // Contains all the values of each unit in mixed units. For quantity (which is the floating value of // the smallest unit in the mixed unit), the value stores in `quantity`. // NOTE: the value of quantity in `mixedMeasures` will be left unset. IntMeasures mixedMeasures; // Points to quantity position, -1 if the position is not set yet. 
int32_t indexOfQuantity = -1; // Number of mixedMeasures that have been populated int32_t mixedMeasuresCount = 0; MicroProps() = default; MicroProps(const MicroProps& other) = default; MicroProps& operator=(const MicroProps& other) = default; /** * As MicroProps is the "base instance", this implementation of * MicroPropsGenerator::processQuantity() just ensures that the output * `micros` is correctly initialized. * * For the "safe" invocation of this function, micros must not be *this, * such that a copy of the base instance is made. For the "unsafe" path, * this function can be used only once, because the base MicroProps instance * will be modified and thus not be available for re-use. * * @param quantity The quantity for consideration and optional mutation. * @param micros The MicroProps instance to populate. If this parameter is * not already `*this`, it will be overwritten with a copy of `*this`. */ void processQuantity(DecimalQuantity &quantity, MicroProps µs, UErrorCode &status) const override { (void) quantity; (void) status; if (this == µs) { // Unsafe path: no need to perform a copy. U_ASSERT(!exhausted); micros.exhausted = true; U_ASSERT(exhausted); } else { // Safe path: copy self into the output micros. U_ASSERT(!exhausted); micros = *this; } } private: // Internal fields: bool exhausted = false; }; } // namespace impl } // namespace number U_NAMESPACE_END #endif // __NUMBER_MICROPROPS_H__ #endif /* #if !UCONFIG_NO_FORMATTING */ stringi/src/icu74/i18n/uspoof_conf.cpp0000644000176200001440000004227414700200761017264 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ****************************************************************************** * * Copyright (C) 2008-2015, International Business Machines * Corporation and others. All Rights Reserved. 
* ****************************************************************************** * file name: uspoof_conf.cpp * encoding: UTF-8 * tab size: 8 (not used) * indentation:4 * * created on: 2009Jan05 (refactoring earlier files) * created by: Andy Heninger * * Internal classes for compiling confusable data into its binary (runtime) form. */ #include "unicode/utypes.h" #include "unicode/uspoof.h" #if !UCONFIG_NO_REGULAR_EXPRESSIONS #if !UCONFIG_NO_NORMALIZATION #include "unicode/unorm.h" #include "unicode/uregex.h" #include "unicode/ustring.h" #include "cmemory.h" #include "uspoof_impl.h" #include "uhash.h" #include "uvector.h" #include "uassert.h" #include "uarrsort.h" #include "uspoof_conf.h" U_NAMESPACE_USE //--------------------------------------------------------------------- // // buildConfusableData Compile the source confusable data, as defined by // the Unicode data file confusables.txt, into the binary // structures used by the confusable detector. // // The binary structures are described in uspoof_impl.h // // 1. Parse the data, making a hash table mapping from a UChar32 to a String. // // 2. Sort all of the strings encountered by length, since they will need to // be stored in that order in the final string table. // TODO: Sorting these strings by length is no longer needed since the removal of // the string lengths table. This logic can be removed to save processing time // when building confusables data. // // 3. Build a list of keys (UChar32s) from the four mapping tables. Sort the // list because that will be the ordering of our runtime table. // // 4. Generate the run time string table. This is generated before the key & value // tables because we need the string indexes when building those tables. // // 5. Build the run-time key and value tables. 
These are parallel tables, and are built // at the same time // SPUString::SPUString(LocalPointer s) { fStr = std::move(s); fCharOrStrTableIndex = 0; } SPUString::~SPUString() { } SPUStringPool::SPUStringPool(UErrorCode &status) : fVec(nullptr), fHash(nullptr) { LocalPointer vec(new UVector(status), status); if (U_FAILURE(status)) { return; } vec->setDeleter( [](void *obj) {delete (SPUString *)obj;}); fVec = vec.orphan(); fHash = uhash_open(uhash_hashUnicodeString, // key hash function uhash_compareUnicodeString, // Key Comparator nullptr, // Value Comparator &status); } SPUStringPool::~SPUStringPool() { delete fVec; uhash_close(fHash); } int32_t SPUStringPool::size() { return fVec->size(); } SPUString *SPUStringPool::getByIndex(int32_t index) { SPUString *retString = (SPUString *)fVec->elementAt(index); return retString; } // Comparison function for ordering strings in the string pool. // Compare by length first, then, within a group of the same length, // by code point order. // Conforms to the type signature for a USortComparator in uvector.h static int32_t U_CALLCONV SPUStringCompare(UHashTok left, UHashTok right) { const SPUString *sL = const_cast( static_cast(left.pointer)); const SPUString *sR = const_cast( static_cast(right.pointer)); int32_t lenL = sL->fStr->length(); int32_t lenR = sR->fStr->length(); if (lenL < lenR) { return -1; } else if (lenL > lenR) { return 1; } else { return sL->fStr->compare(*(sR->fStr)); } } void SPUStringPool::sort(UErrorCode &status) { fVec->sort(SPUStringCompare, status); } SPUString *SPUStringPool::addString(UnicodeString *src, UErrorCode &status) { LocalPointer lpSrc(src); if (U_FAILURE(status)) { return nullptr; } SPUString *hashedString = static_cast(uhash_get(fHash, src)); if (hashedString != nullptr) { return hashedString; } LocalPointer spuStr(new SPUString(std::move(lpSrc)), status); hashedString = spuStr.getAlias(); fVec->adoptElement(spuStr.orphan(), status); if (U_FAILURE(status)) { return nullptr; } 
uhash_put(fHash, src, hashedString, &status); return hashedString; } ConfusabledataBuilder::ConfusabledataBuilder(SpoofImpl *spImpl, UErrorCode &status) : fSpoofImpl(spImpl), fInput(nullptr), fTable(nullptr), fKeySet(nullptr), fKeyVec(nullptr), fValueVec(nullptr), fStringTable(nullptr), stringPool(nullptr), fParseLine(nullptr), fParseHexNum(nullptr), fLineNum(0) { if (U_FAILURE(status)) { return; } fTable = uhash_open(uhash_hashLong, uhash_compareLong, nullptr, &status); fKeySet = new UnicodeSet(); if (fKeySet == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; return; } fKeyVec = new UVector(status); if (fKeyVec == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; return; } fValueVec = new UVector(status); if (fValueVec == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; return; } stringPool = new SPUStringPool(status); if (stringPool == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; return; } } ConfusabledataBuilder::~ConfusabledataBuilder() { uprv_free(fInput); uregex_close(fParseLine); uregex_close(fParseHexNum); uhash_close(fTable); delete fKeySet; delete fKeyVec; delete fStringTable; delete fValueVec; delete stringPool; } void ConfusabledataBuilder::buildConfusableData(SpoofImpl * spImpl, const char * confusables, int32_t confusablesLen, int32_t *errorType, UParseError *pe, UErrorCode &status) { if (U_FAILURE(status)) { return; } ConfusabledataBuilder builder(spImpl, status); builder.build(confusables, confusablesLen, status); if (U_FAILURE(status) && errorType != nullptr) { *errorType = USPOOF_SINGLE_SCRIPT_CONFUSABLE; pe->line = builder.fLineNum; } } void ConfusabledataBuilder::build(const char * confusables, int32_t confusablesLen, UErrorCode &status) { // Convert the user input data from UTF-8 to char16_t (UTF-16) int32_t inputLen = 0; if (U_FAILURE(status)) { return; } u_strFromUTF8(nullptr, 0, &inputLen, confusables, confusablesLen, &status); if (status != U_BUFFER_OVERFLOW_ERROR) { return; } status = U_ZERO_ERROR; fInput = static_cast(uprv_malloc((inputLen+1) 
* sizeof(char16_t))); if (fInput == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; return; } u_strFromUTF8(fInput, inputLen+1, nullptr, confusables, confusablesLen, &status); // Regular Expression to parse a line from Confusables.txt. The expression will match // any line. What was matched is determined by examining which capture groups have a match. // Capture Group 1: the source char // Capture Group 2: the replacement chars // Capture Group 3-6 the table type, SL, SA, ML, or MA (deprecated) // Capture Group 7: A blank or comment only line. // Capture Group 8: A syntactically invalid line. Anything that didn't match before. // Example Line from the confusables.txt source file: // "1D702 ; 006E 0329 ; SL # MATHEMATICAL ITALIC SMALL ETA ... " UnicodeString pattern( "(?m)^[ \\t]*([0-9A-Fa-f]+)[ \\t]+;" // Match the source char "[ \\t]*([0-9A-Fa-f]+" // Match the replacement char(s) "(?:[ \\t]+[0-9A-Fa-f]+)*)[ \\t]*;" // (continued) "\\s*(?:(SL)|(SA)|(ML)|(MA))" // Match the table type "[ \\t]*(?:#.*?)?$" // Match any trailing #comment "|^([ \\t]*(?:#.*?)?)$" // OR match empty lines or lines with only a #comment "|^(.*?)$", -1, US_INV); // OR match any line, which catches illegal lines. // TODO: Why are we using the regex C API here? C++ would just take UnicodeString... fParseLine = uregex_open(pattern.getBuffer(), pattern.length(), 0, nullptr, &status); // Regular expression for parsing a hex number out of a space-separated list of them. // Capture group 1 gets the number, with spaces removed. pattern = UNICODE_STRING_SIMPLE("\\s*([0-9A-F]+)"); fParseHexNum = uregex_open(pattern.getBuffer(), pattern.length(), 0, nullptr, &status); // Zap any Byte Order Mark at the start of input. Changing it to a space is benign // given the syntax of the input. if (*fInput == 0xfeff) { *fInput = 0x20; } // Parse the input, one line per iteration of this loop. 
uregex_setText(fParseLine, fInput, inputLen, &status); while (uregex_findNext(fParseLine, &status)) { fLineNum++; if (uregex_start(fParseLine, 7, &status) >= 0) { // this was a blank or comment line. continue; } if (uregex_start(fParseLine, 8, &status) >= 0) { // input file syntax error. status = U_PARSE_ERROR; return; } // We have a good input line. Extract the key character and mapping string, and // put them into the appropriate mapping table. UChar32 keyChar = SpoofImpl::ScanHex(fInput, uregex_start(fParseLine, 1, &status), uregex_end(fParseLine, 1, &status), status); int32_t mapStringStart = uregex_start(fParseLine, 2, &status); int32_t mapStringLength = uregex_end(fParseLine, 2, &status) - mapStringStart; uregex_setText(fParseHexNum, &fInput[mapStringStart], mapStringLength, &status); UnicodeString *mapString = new UnicodeString(); if (mapString == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; return; } while (uregex_findNext(fParseHexNum, &status)) { UChar32 c = SpoofImpl::ScanHex(&fInput[mapStringStart], uregex_start(fParseHexNum, 1, &status), uregex_end(fParseHexNum, 1, &status), status); mapString->append(c); } U_ASSERT(mapString->length() >= 1); // Put the map (value) string into the string pool // This a little like a Java intern() - any duplicates will be eliminated. SPUString *smapString = stringPool->addString(mapString, status); // Add the UChar32 -> string mapping to the table. // For Unicode 8, the SL, SA and ML tables have been discontinued. // All input data from confusables.txt is tagged MA. uhash_iput(fTable, keyChar, smapString, &status); if (U_FAILURE(status)) { return; } fKeySet->add(keyChar); } // Input data is now all parsed and collected. // Now create the run-time binary form of the data. // // This is done in two steps. First the data is assembled into vectors and strings, // for ease of construction, then the contents of these collections are dumped // into the actual raw-bytes data storage. 
// Build up the string array, and record the index of each string therein // in the (build time only) string pool. // Strings of length one are not entered into the strings array. // (Strings in the table are sorted by length) stringPool->sort(status); fStringTable = new UnicodeString(); int32_t poolSize = stringPool->size(); int32_t i; for (i=0; igetByIndex(i); int32_t strLen = s->fStr->length(); int32_t strIndex = fStringTable->length(); if (strLen == 1) { // strings of length one do not get an entry in the string table. // Keep the single string character itself here, which is the same // convention that is used in the final run-time string table index. s->fCharOrStrTableIndex = s->fStr->charAt(0); } else { s->fCharOrStrTableIndex = strIndex; fStringTable->append(*(s->fStr)); } } // Construct the compile-time Key and Value tables // // For each key code point, check which mapping tables it applies to, // and create the final data for the key & value structures. // // The four logical mapping tables are conflated into one combined table. // If multiple logical tables have the same mapping for some key, they // share a single entry in the combined table. // If more than one mapping exists for the same key code point, multiple // entries will be created in the table for (int32_t range=0; rangegetRangeCount(); range++) { // It is an oddity of the UnicodeSet API that simply enumerating the contained // code points requires a nested loop. for (UChar32 keyChar=fKeySet->getRangeStart(range); keyChar <= fKeySet->getRangeEnd(range); keyChar++) { SPUString *targetMapping = static_cast(uhash_iget(fTable, keyChar)); U_ASSERT(targetMapping != nullptr); // Set an error code if trying to consume a long string. Otherwise, // codePointAndLengthToKey will abort on a U_ASSERT. 
if (targetMapping->fStr->length() > 256) { status = U_ILLEGAL_ARGUMENT_ERROR; return; } int32_t key = ConfusableDataUtils::codePointAndLengthToKey(keyChar, targetMapping->fStr->length()); int32_t value = targetMapping->fCharOrStrTableIndex; fKeyVec->addElement(key, status); fValueVec->addElement(value, status); } } // Put the assembled data into the flat runtime array outputData(status); // All of the intermediate allocated data belongs to the ConfusabledataBuilder // object (this), and is deleted in the destructor. return; } // // outputData The confusable data has been compiled and stored in intermediate // collections and strings. Copy it from there to the final flat // binary array. // // Note that as each section is added to the output data, the // expand (reserveSpace() function will likely relocate it in memory. // Be careful with pointers. // void ConfusabledataBuilder::outputData(UErrorCode &status) { U_ASSERT(fSpoofImpl->fSpoofData->fDataOwned); // The Key Table // While copying the keys to the runtime array, // also sanity check that they are sorted. int32_t numKeys = fKeyVec->size(); int32_t *keys = static_cast(fSpoofImpl->fSpoofData->reserveSpace(numKeys*sizeof(int32_t), status)); if (U_FAILURE(status)) { return; } int i; UChar32 previousCodePoint = 0; for (i=0; ielementAti(i); UChar32 codePoint = ConfusableDataUtils::keyToCodePoint(key); (void)previousCodePoint; // Suppress unused variable warning. 
// strictly greater because there can be only one entry per code point U_ASSERT(codePoint > previousCodePoint); keys[i] = key; previousCodePoint = codePoint; } SpoofDataHeader *rawData = fSpoofImpl->fSpoofData->fRawData; rawData->fCFUKeys = (int32_t)((char *)keys - (char *)rawData); rawData->fCFUKeysSize = numKeys; fSpoofImpl->fSpoofData->fCFUKeys = keys; // The Value Table, parallels the key table int32_t numValues = fValueVec->size(); U_ASSERT(numKeys == numValues); uint16_t *values = static_cast(fSpoofImpl->fSpoofData->reserveSpace(numKeys*sizeof(uint16_t), status)); if (U_FAILURE(status)) { return; } for (i=0; i(fValueVec->elementAti(i)); U_ASSERT(value < 0xffff); values[i] = static_cast(value); } rawData = fSpoofImpl->fSpoofData->fRawData; rawData->fCFUStringIndex = (int32_t)((char *)values - (char *)rawData); rawData->fCFUStringIndexSize = numValues; fSpoofImpl->fSpoofData->fCFUValues = values; // The Strings Table. uint32_t stringsLength = fStringTable->length(); // Reserve an extra space so the string will be nul-terminated. This is // only a convenience, for when debugging; it is not needed otherwise. char16_t *strings = static_cast(fSpoofImpl->fSpoofData->reserveSpace(stringsLength*sizeof(char16_t)+2, status)); if (U_FAILURE(status)) { return; } fStringTable->extract(strings, stringsLength+1, status); rawData = fSpoofImpl->fSpoofData->fRawData; U_ASSERT(rawData->fCFUStringTable == 0); rawData->fCFUStringTable = (int32_t)((char *)strings - (char *)rawData); rawData->fCFUStringTableLen = stringsLength; fSpoofImpl->fSpoofData->fCFUStrings = strings; } #endif #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS stringi/src/icu74/i18n/scriptset.h0000644000176200001440000000603014700200761016417 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others. 
// License & terms of use: http://www.unicode.org/copyright.html
/*
**********************************************************************
*   Copyright (C) 2013, International Business Machines
*   Corporation and others.  All Rights Reserved.
**********************************************************************
*
* scriptset.h
*
* created on: 2013 Jan 7
* created by: Andy Heninger
*/

#ifndef __SCRIPTSET_H__
#define __SCRIPTSET_H__

#include "unicode/utypes.h"
#include "unicode/uobject.h"
#include "unicode/uscript.h"

#include "uelement.h"

U_NAMESPACE_BEGIN

//-------------------------------------------------------------------------------
//
//  ScriptSet - A bit set representing a set of scripts.
//
//              This class was originally used exclusively with script sets appearing
//              as part of the spoof check whole script confusable binary data. Its
//              use has since become more general, but the continued use to wrap
//              prebuilt binary data does constrain the design.
//
//              Storage is a fixed array of SCRIPT_LIMIT / 32 words of type
//              uint32_t (see `bits` below); the class declares no other
//              data member.
//
//-------------------------------------------------------------------------------
class U_I18N_API ScriptSet: public UMemory {
  public:
    // Capacity of the set in bits. It has to remain a multiple of 32 because
    // the storage array is dimensioned as SCRIPT_LIMIT / 32 32-bit words.
    static constexpr int32_t SCRIPT_LIMIT = 224; // multiple of 32!

    ScriptSet();
    ScriptSet(const ScriptSet &other);
    ~ScriptSet();

    bool operator == (const ScriptSet &other) const;
    // Defined inline as the negation of operator==.
    bool operator != (const ScriptSet &other) const {return !(*this == other);}
    ScriptSet & operator = (const ScriptSet &other);

    // Single-script operations. Each takes a UErrorCode; presumably it is set
    // when `script` is outside the representable range -- TODO confirm in
    // scriptset.cpp.
    UBool      test(UScriptCode script, UErrorCode &status) const;
    ScriptSet &Union(const ScriptSet &other);
    ScriptSet &set(UScriptCode script, UErrorCode &status);
    ScriptSet &reset(UScriptCode script, UErrorCode &status);
    ScriptSet &intersect(const ScriptSet &other);
    ScriptSet &intersect(UScriptCode script, UErrorCode &status);
    UBool      intersects(const ScriptSet &other) const;  // Sets contain at least one script in common.
    UBool      contains(const ScriptSet &other) const;    // All set bits in other are also set in this.

    // Whole-set operations. The mutators return ScriptSet&, which allows
    // calls to be chained.
    ScriptSet &setAll();
    ScriptSet &resetAll();
    int32_t countMembers() const;
    int32_t hashCode() const;
    // NOTE(review): the behaviour at the end of the set (return value when no
    // further bit is set) is not visible in this header -- confirm in the .cpp.
    int32_t nextSetBit(int32_t script) const;

    UBool isEmpty() const;

    UnicodeString &displayScripts(UnicodeString &dest) const; // append script names to dest string.
    ScriptSet & parseScripts(const UnicodeString &scriptsString, UErrorCode &status);  // Replaces ScriptSet contents.

    // Wraps around UScript::getScriptExtensions() and adds the corresponding scripts to this instance.
    void setScriptExtensions(UChar32 codePoint, UErrorCode& status);

  private:
    // SCRIPT_LIMIT / 32 words of 32 bits each (7 words for SCRIPT_LIMIT == 224).
    uint32_t  bits[SCRIPT_LIMIT / 32];
};

U_NAMESPACE_END

// C-linkage callbacks taking UElement keys / void* objects; by their names
// they are intended for use with ICU's uhash tables holding ScriptSet objects.
U_CAPI UBool U_EXPORT2
uhash_compareScriptSet(const UElement key1, const UElement key2);

U_CAPI int32_t U_EXPORT2
uhash_hashScriptSet(const UElement key);

U_CAPI void U_EXPORT2
uhash_deleteScriptSet(void *obj);

U_CAPI UBool U_EXPORT2
uhash_equalsScriptSet(const UElement key1, const UElement key2);

#endif // __SCRIPTSET_H__
stringi/src/icu74/i18n/numparse_parsednumber.cpp0000644000176200001440000000644714700200761021347 0ustar liggesusers// © 2018 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html

#include "unicode/utypes.h"

#if !UCONFIG_NO_FORMATTING

// Allow implicit conversion from char16_t* to UnicodeString for this file:
// Helpful in toString methods and elsewhere.
#define UNISTR_FROM_STRING_EXPLICIT #include "numparse_types.h" #include "number_decimalquantity.h" #include "string_segment.h" #include "putilimp.h" #include using namespace icu; using namespace icu::number; using namespace icu::number::impl; using namespace icu::numparse; using namespace icu::numparse::impl; ParsedNumber::ParsedNumber() { clear(); } void ParsedNumber::clear() { quantity.bogus = true; charEnd = 0; flags = 0; prefix.setToBogus(); suffix.setToBogus(); currencyCode[0] = 0; } void ParsedNumber::setCharsConsumed(const StringSegment& segment) { charEnd = segment.getOffset(); } void ParsedNumber::postProcess() { if (!quantity.bogus && 0 != (flags & FLAG_NEGATIVE)) { quantity.negate(); } } bool ParsedNumber::success() const { return charEnd > 0 && 0 == (flags & FLAG_FAIL); } bool ParsedNumber::seenNumber() const { return !quantity.bogus || 0 != (flags & FLAG_NAN) || 0 != (flags & FLAG_INFINITY); } double ParsedNumber::getDouble(UErrorCode& status) const { bool sawNaN = 0 != (flags & FLAG_NAN); bool sawInfinity = 0 != (flags & FLAG_INFINITY); // Check for NaN, infinity, and -0.0 if (sawNaN) { // Can't use NAN or std::nan because the byte pattern is platform-dependent; // MSVC sets the sign bit, but Clang and GCC do not return uprv_getNaN(); } if (sawInfinity) { if (0 != (flags & FLAG_NEGATIVE)) { return -INFINITY; } else { return INFINITY; } } if (quantity.bogus) { status = U_INVALID_STATE_ERROR; return 0.0; } if (quantity.isZeroish() && quantity.isNegative()) { return -0.0; } if (quantity.fitsInLong()) { return static_cast(quantity.toLong()); } else { return quantity.toDouble(); } } void ParsedNumber::populateFormattable(Formattable& output, parse_flags_t parseFlags) const { bool sawNaN = 0 != (flags & FLAG_NAN); bool sawInfinity = 0 != (flags & FLAG_INFINITY); bool integerOnly = 0 != (parseFlags & PARSE_FLAG_INTEGER_ONLY); // Check for NaN, infinity, and -0.0 if (sawNaN) { // Can't use NAN or std::nan because the byte pattern is platform-dependent; // 
MSVC sets the sign bit, but Clang and GCC do not output.setDouble(uprv_getNaN()); return; } if (sawInfinity) { if (0 != (flags & FLAG_NEGATIVE)) { output.setDouble(-INFINITY); return; } else { output.setDouble(INFINITY); return; } } U_ASSERT(!quantity.bogus); if (quantity.isZeroish() && quantity.isNegative() && !integerOnly) { output.setDouble(-0.0); return; } // All other numbers output.adoptDecimalQuantity(new DecimalQuantity(quantity)); } bool ParsedNumber::isBetterThan(const ParsedNumber& other) { // Favor results with strictly more characters consumed. return charEnd > other.charEnd; } #endif /* #if !UCONFIG_NO_FORMATTING */ stringi/src/icu74/i18n/transreg.h0000644000176200001440000004201514700200761016227 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ********************************************************************** * Copyright (c) 2001-2014, International Business Machines * Corporation and others. All Rights Reserved. ********************************************************************** * Date Name Description * 08/10/2001 aliu Creation. ********************************************************************** */ #ifndef _TRANSREG_H #define _TRANSREG_H #include "unicode/utypes.h" #if !UCONFIG_NO_TRANSLITERATION #include "unicode/uobject.h" #include "unicode/translit.h" #include "hash.h" #include "uvector.h" U_NAMESPACE_BEGIN class TransliteratorEntry; class TransliteratorSpec; class UnicodeString; //------------------------------------------------------------------ // TransliteratorAlias //------------------------------------------------------------------ /** * A TransliteratorAlias object is returned by get() if the given ID * actually translates into something else. The caller then invokes * the create() method on the alias to create the actual * transliterator, and deletes the alias. * * Why all the shenanigans? 
To prevent circular calls between
 * the registry code and the transliterator code that deadlocks.
 */
class TransliteratorAlias : public UMemory {
 public:
    /**
     * Construct a simple alias (type == SIMPLE)
     * @param aliasID the given id.
     */
    TransliteratorAlias(const UnicodeString& aliasID, const UnicodeSet* compoundFilter);

    /**
     * Construct a compound RBT alias (type == COMPOUND)
     */
    TransliteratorAlias(const UnicodeString& ID, const UnicodeString& idBlocks,
                        UVector* adoptedTransliterators,
                        const UnicodeSet* compoundFilter);

    /**
     * Construct a rules alias (type = RULES)
     */
    TransliteratorAlias(const UnicodeString& theID,
                        const UnicodeString& rules,
                        UTransDirection dir);

    ~TransliteratorAlias();

    /**
     * The whole point of create() is that the caller must invoke
     * it when the registry mutex is NOT held, to prevent deadlock.
     * It may only be called once.
     *
     * Note: Only call create() if isRuleBased() returns false.
     *
     * This method must be called *outside* of the TransliteratorRegistry
     * mutex.
     */
    Transliterator* create(UParseError&, UErrorCode&);

    /**
     * Return true if this alias is rule-based.  If so, the caller
     * must call parse() on it, then call TransliteratorRegistry::reget().
     */
    UBool isRuleBased() const;

    /**
     * If isRuleBased() returns true, then the caller must call this
     * method, followed by TransliteratorRegistry::reget().  The latter
     * method must be called inside the TransliteratorRegistry mutex.
     *
     * Note: Only call parse() if isRuleBased() returns true.
     *
     * This method must be called *outside* of the TransliteratorRegistry
     * mutex, because it can instantiate Transliterators embedded in
     * the rules via the "&Latin-Arabic()" syntax.
     */
    void parse(TransliteratorParser& parser,
               UParseError& pe, UErrorCode& ec) const;

 private:
    // We actually come in three flavors:
    // 1. Simple alias
    //    Here aliasID is the alias string.  Everything else is
    //    null, zero, empty.
    // 2. CompoundRBT
    //    Here ID is the ID, aliasID is the idBlock, trans is the
    //    contained RBT, and idSplitPoint is the offset in aliasID
    //    where the contained RBT goes.  compoundFilter is the
    //    compound filter, and it is _not_ owned.
    // 3. Rules
    //    Here ID is the ID, aliasID is the rules string.
    //    idSplitPoint is the UTransDirection.
    //
    // NOTE(review): the names used above (aliasID, trans, idSplitPoint) are
    // not declared in this class.  The declared members are aliasesOrRules,
    // transes and direction; presumably they play the roles of aliasID,
    // trans and "idSplitPoint as UTransDirection" respectively -- confirm
    // against transreg.cpp before relying on this description.
    UnicodeString ID;
    UnicodeString aliasesOrRules;
    UVector* transes; // owned
    const UnicodeSet* compoundFilter; // alias
    UTransDirection direction;
    // Which of the three flavors described above this object is.
    enum { SIMPLE, COMPOUND, RULES } type;

    TransliteratorAlias(const TransliteratorAlias &other); // forbid copying of this class
    TransliteratorAlias &operator=(const TransliteratorAlias &other); // forbid copying of this class
};


/**
 * A registry of system transliterators.  This is the data structure
 * that implements the mapping between transliterator IDs and the data
 * or function pointers used to create the corresponding
 * transliterators.  There is one instance of the registry that is
 * created statically.
 *
 * The registry consists of a dynamic component -- a hashtable -- and
 * a static component -- locale resource bundles.  The dynamic store
 * is semantically overlaid on the static store, so the static mapping
 * can be dynamically overridden.
 *
 * This is an internal class that is only used by Transliterator.
 * Transliterator maintains one static instance of this class and
 * delegates all registry-related operations to it.
 *
 * @author Alan Liu
 */
class TransliteratorRegistry : public UMemory {

 public:

    /**
     * Constructor
     * @param status Output param set to success/failure code.
     */
    TransliteratorRegistry(UErrorCode& status);

    /**
     * Nonvirtual destructor -- this class is not subclassable.
     */
    ~TransliteratorRegistry();

    //------------------------------------------------------------------
    // Basic public API
    //------------------------------------------------------------------

    /**
     * Given a simple ID (forward direction, no inline filter, not
     * compound) attempt to instantiate it from the registry.  Return
     * 0 on failure.
     *
     * Return a non-nullptr aliasReturn value if the ID points to an alias.
     * We cannot instantiate it ourselves because the alias may contain
     * filters or compounds, which we do not understand.  Caller should
     * make aliasReturn nullptr before calling.
     * @param ID          the given ID
     * @param aliasReturn output param to receive TransliteratorAlias;
     *                    should be nullptr on entry
     * @param status      Output param set to success/failure code.
     *
     * NOTE(review): earlier documentation listed a "parseError" parameter
     * (struct to receive the position of an error); the declaration below
     * takes no such argument.
     */
    Transliterator* get(const UnicodeString& ID,
                        TransliteratorAlias*& aliasReturn,
                        UErrorCode& status);

    /**
     * The caller must call this after calling get(), if [a] calling get()
     * returns an alias, and [b] the alias is rule based.  In that
     * situation the caller must call alias->parse() to do the parsing
     * OUTSIDE THE REGISTRY MUTEX, then call this method to retry
     * instantiating the transliterator.
     *
     * Note: Another alias might be returned by this method.
     *
     * This method (like all public methods of this class) must be called
     * from within the TransliteratorRegistry mutex.
     *
     * @param aliasReturn output param to receive TransliteratorAlias;
     *                    should be nullptr on entry
     */
    Transliterator* reget(const UnicodeString& ID,
                          TransliteratorParser& parser,
                          TransliteratorAlias*& aliasReturn,
                          UErrorCode& status);

    /**
     * Register a prototype (adopted).  This adds an entry to the
     * dynamic store, or replaces an existing entry.  Any entry in the
     * underlying static locale resource store is masked.
     */
    void put(Transliterator* adoptedProto,
             UBool visible,
             UErrorCode& ec);

    /**
     * Register an ID and a factory function pointer.  This adds an
     * entry to the dynamic store, or replaces an existing entry.  Any
     * entry in the underlying static locale resource store is masked.
     */
    void put(const UnicodeString& ID,
             Transliterator::Factory factory,
             Transliterator::Token context,
             UBool visible,
             UErrorCode& ec);

    /**
     * Register an ID and a resource name.  This adds an entry to the
     * dynamic store, or replaces an existing entry.  Any entry in the
     * underlying static locale resource store is masked.
     */
    void put(const UnicodeString& ID,
             const UnicodeString& resourceName,
             UTransDirection dir,
             UBool readonlyResourceAlias,
             UBool visible,
             UErrorCode& ec);

    /**
     * Register an ID and an alias ID.  This adds an entry to the
     * dynamic store, or replaces an existing entry.  Any entry in the
     * underlying static locale resource store is masked.
     */
    void put(const UnicodeString& ID,
             const UnicodeString& alias,
             UBool readonlyAliasAlias,
             UBool visible,
             UErrorCode& ec);

    /**
     * Unregister an ID.  This removes an entry from the dynamic store
     * if there is one.  The static locale resource store is
     * unaffected.
     * @param ID    the given ID.
     */
    void remove(const UnicodeString& ID);

    //------------------------------------------------------------------
    // Public ID and spec management
    //------------------------------------------------------------------

    /**
     * Return a StringEnumeration over the IDs currently registered
     * with the system.
     * @internal
     */
    StringEnumeration* getAvailableIDs() const;

    /**
     * == OBSOLETE - remove in ICU 3.4 ==
     * Return the number of IDs currently registered with the system.
     * To retrieve the actual IDs, call getAvailableID(i) with
     * i from 0 to countAvailableIDs() - 1.
     * @return the number of IDs currently registered with the system.
     * @internal
     */
    int32_t countAvailableIDs() const;

    /**
     * == OBSOLETE - remove in ICU 3.4 ==
     * Return the index-th available ID.  index must be between 0
     * and countAvailableIDs() - 1, inclusive.  If index is out of
     * range, the result of getAvailableID(0) is returned.
     * @param index the given index.
     * @return the index-th available ID.  index must be between 0
     *         and countAvailableIDs() - 1, inclusive.  If index is out of
     *         range, the result of getAvailableID(0) is returned.
     * @internal
     */
    const UnicodeString& getAvailableID(int32_t index) const;

    /**
     * Return the number of registered source specifiers.
     * @return the number of registered source specifiers.
     */
    int32_t countAvailableSources() const;

    /**
     * Return a registered source specifier.
     * @param index which specifier to return, from 0 to n-1, where
     * n = countAvailableSources()
     * @param result fill-in parameter to receive the source specifier.
     * If index is out of range, result will be empty.
     * @return reference to result
     */
    UnicodeString& getAvailableSource(int32_t index,
                                      UnicodeString& result) const;

    /**
     * Return the number of registered target specifiers for a given
     * source specifier.
     * @param source the given source specifier.
     * @return the number of registered target specifiers for a given
     *         source specifier.
     */
    int32_t countAvailableTargets(const UnicodeString& source) const;

    /**
     * Return a registered target specifier for a given source.
     * @param index which specifier to return, from 0 to n-1, where
     * n = countAvailableTargets(source)
     * @param source the source specifier
     * @param result fill-in parameter to receive the target specifier.
     * If source is invalid or if index is out of range, result will
     * be empty.
     * @return reference to result
     */
    UnicodeString& getAvailableTarget(int32_t index,
                                      const UnicodeString& source,
                                      UnicodeString& result) const;

    /**
     * Return the number of registered variant specifiers for a given
     * source-target pair.  There is always at least one variant: If
     * just source-target is registered, then the single variant
     * NO_VARIANT is returned.  If source-target/variant is registered
     * then that variant is returned.
     * @param source the source specifiers
     * @param target the target specifiers
     * @return the number of registered variant specifiers for a given
     *         source-target pair.
     */
    int32_t countAvailableVariants(const UnicodeString& source,
                                   const UnicodeString& target) const;

    /**
     * Return a registered variant specifier for a given source-target
     * pair.  If NO_VARIANT is one of the variants, then it will be
     * at index 0.
     * @param index which specifier to return, from 0 to n-1, where
     * n = countAvailableVariants(source, target)
     * @param source the source specifier
     * @param target the target specifier
     * @param result fill-in parameter to receive the variant
     * specifier.  If source is invalid or if target is invalid or if
     * index is out of range, result will be empty.
     * @return reference to result
     */
    UnicodeString& getAvailableVariant(int32_t index,
                                       const UnicodeString& source,
                                       const UnicodeString& target,
                                       UnicodeString& result) const;

 private:

    //----------------------------------------------------------------
    // Private implementation
    //----------------------------------------------------------------

    // Lookup helpers.  NOTE(review): the search order between the dynamic
    // and the static store is not visible in this header; the class comment
    // only states that the dynamic store overlays the static one.
    TransliteratorEntry* find(const UnicodeString& ID);

    TransliteratorEntry* find(UnicodeString& source,
                              UnicodeString& target,
                              UnicodeString& variant);

    TransliteratorEntry* findInDynamicStore(const TransliteratorSpec& src,
                                            const TransliteratorSpec& trg,
                                            const UnicodeString& variant) const;

    TransliteratorEntry* findInStaticStore(const TransliteratorSpec& src,
                                           const TransliteratorSpec& trg,
                                           const UnicodeString& variant);

    static TransliteratorEntry* findInBundle(const TransliteratorSpec& specToOpen,
                                             const TransliteratorSpec& specToFind,
                                             const UnicodeString& variant,
                                             UTransDirection direction);

    // Registration helpers; the `adopted` parameter name indicates that
    // ownership of the entry passes to the registry.
    void registerEntry(const UnicodeString& source,
                       const UnicodeString& target,
                       const UnicodeString& variant,
                       TransliteratorEntry* adopted,
                       UBool visible);

    void registerEntry(const UnicodeString& ID,
                       TransliteratorEntry* adopted,
                       UBool visible);

    void registerEntry(const UnicodeString& ID,
                       const UnicodeString& source,
                       const UnicodeString& target,
                       const UnicodeString& variant,
                       TransliteratorEntry* adopted,
                       UBool visible);

    // STV = source / target / variant, as the parameter lists show.
    void registerSTV(const UnicodeString& source,
                     const UnicodeString& target,
                     const UnicodeString& variant);

    void removeSTV(const UnicodeString& source,
                   const UnicodeString& target,
                   const UnicodeString& variant);

    Transliterator* instantiateEntry(const UnicodeString& ID,
                                     TransliteratorEntry *entry,
                                     TransliteratorAlias*& aliasReturn,
                                     UErrorCode& status);

    /**
     * A StringEnumeration over the registered IDs in this object.
     */
    class Enumeration : public StringEnumeration {
    public:
        Enumeration(const TransliteratorRegistry& reg);
        virtual ~Enumeration();
        virtual int32_t count(UErrorCode& status) const override;
        virtual const UnicodeString* snext(UErrorCode& status) override;
        virtual void reset(UErrorCode& status) override;
        static UClassID U_EXPORT2 getStaticClassID();
        virtual UClassID getDynamicClassID() const override;
    private:
        // presumably the iteration cursor and the number of IDs captured
        // when the enumeration was created -- TODO confirm in transreg.cpp
        int32_t pos;
        int32_t size;
        // The registry being enumerated; held by reference, so the registry
        // must outlive the enumeration.
        const TransliteratorRegistry& reg;
    };
    friend class Enumeration;

 private:

    /**
     * Dynamic registry mapping full IDs to Entry objects.  This
     * contains both public and internal entities.  The visibility is
     * controlled by whether an entry is listed in availableIDs and
     * specDAG or not.
     */
    Hashtable registry;

    /**
     * DAG of visible IDs by spec.  Hashtable: source => (Hashtable:
     * target => variant bitmask)
     */
    Hashtable specDAG;

    /**
     * Vector of all variant names
     */
    UVector variantList;

    /**
     * Public full IDs.  (Earlier documentation called this a vector;
     * the member is declared as a Hashtable.)
     */
    Hashtable availableIDs;

    TransliteratorRegistry(const TransliteratorRegistry &other); // forbid copying of this class
    TransliteratorRegistry &operator=(const TransliteratorRegistry &other); // forbid copying of this class
};

U_NAMESPACE_END

U_CFUNC UBool utrans_transliterator_cleanup();

#endif /* #if !UCONFIG_NO_TRANSLITERATION */

#endif
//eof
stringi/src/icu74/i18n/udatpg.cpp0000644000176200001440000003201014700200761016213 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
*
*   Copyright (C) 2009-2015, International Business Machines
*   Corporation and others.  All Rights Reserved.
* ******************************************************************************* * file name: udatpg.cpp * encoding: UTF-8 * tab size: 8 (not used) * indentation:4 * * created on: 2007jul30 * created by: Markus W. Scherer */ #include "unicode/utypes.h" #if !UCONFIG_NO_FORMATTING #include "unicode/udatpg.h" #include "unicode/uenum.h" #include "unicode/strenum.h" #include "unicode/dtptngen.h" #include "ustrenum.h" U_NAMESPACE_USE U_CAPI UDateTimePatternGenerator * U_EXPORT2 udatpg_open(const char *locale, UErrorCode *pErrorCode) { if(locale==nullptr) { return (UDateTimePatternGenerator *)DateTimePatternGenerator::createInstance(*pErrorCode); } else { return (UDateTimePatternGenerator *)DateTimePatternGenerator::createInstance(Locale(locale), *pErrorCode); } } U_CAPI UDateTimePatternGenerator * U_EXPORT2 udatpg_openEmpty(UErrorCode *pErrorCode) { return (UDateTimePatternGenerator *)DateTimePatternGenerator::createEmptyInstance(*pErrorCode); } U_CAPI void U_EXPORT2 udatpg_close(UDateTimePatternGenerator *dtpg) { delete (DateTimePatternGenerator *)dtpg; } U_CAPI UDateTimePatternGenerator * U_EXPORT2 udatpg_clone(const UDateTimePatternGenerator *dtpg, UErrorCode *pErrorCode) { if(U_FAILURE(*pErrorCode)) { return nullptr; } return (UDateTimePatternGenerator *)(((const DateTimePatternGenerator *)dtpg)->clone()); } U_CAPI int32_t U_EXPORT2 udatpg_getBestPattern(UDateTimePatternGenerator *dtpg, const char16_t *skeleton, int32_t length, char16_t *bestPattern, int32_t capacity, UErrorCode *pErrorCode) { return udatpg_getBestPatternWithOptions(dtpg, skeleton, length, UDATPG_MATCH_NO_OPTIONS, bestPattern, capacity, pErrorCode); } U_CAPI int32_t U_EXPORT2 udatpg_getBestPatternWithOptions(UDateTimePatternGenerator *dtpg, const char16_t *skeleton, int32_t length, UDateTimePatternMatchOptions options, char16_t *bestPattern, int32_t capacity, UErrorCode *pErrorCode) { if(U_FAILURE(*pErrorCode)) { return 0; } if(skeleton==nullptr && length!=0) { *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 
return 0; } UnicodeString skeletonString((UBool)(length<0), skeleton, length); UnicodeString result=((DateTimePatternGenerator *)dtpg)->getBestPattern(skeletonString, options, *pErrorCode); return result.extract(bestPattern, capacity, *pErrorCode); } U_CAPI int32_t U_EXPORT2 udatpg_getSkeleton(UDateTimePatternGenerator * /* dtpg */, const char16_t *pattern, int32_t length, char16_t *skeleton, int32_t capacity, UErrorCode *pErrorCode) { if(U_FAILURE(*pErrorCode)) { return 0; } if(pattern==nullptr && length!=0) { *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; return 0; } UnicodeString patternString((UBool)(length<0), pattern, length); UnicodeString result=DateTimePatternGenerator::staticGetSkeleton( patternString, *pErrorCode); return result.extract(skeleton, capacity, *pErrorCode); } U_CAPI int32_t U_EXPORT2 udatpg_getBaseSkeleton(UDateTimePatternGenerator * /* dtpg */, const char16_t *pattern, int32_t length, char16_t *skeleton, int32_t capacity, UErrorCode *pErrorCode) { if(U_FAILURE(*pErrorCode)) { return 0; } if(pattern==nullptr && length!=0) { *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; return 0; } UnicodeString patternString((UBool)(length<0), pattern, length); UnicodeString result=DateTimePatternGenerator::staticGetBaseSkeleton( patternString, *pErrorCode); return result.extract(skeleton, capacity, *pErrorCode); } U_CAPI UDateTimePatternConflict U_EXPORT2 udatpg_addPattern(UDateTimePatternGenerator *dtpg, const char16_t *pattern, int32_t patternLength, UBool override, char16_t *conflictingPattern, int32_t capacity, int32_t *pLength, UErrorCode *pErrorCode) { if(U_FAILURE(*pErrorCode)) { return UDATPG_NO_CONFLICT; } if(pattern==nullptr && patternLength!=0) { *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; return UDATPG_NO_CONFLICT; } UnicodeString patternString((UBool)(patternLength<0), pattern, patternLength); UnicodeString conflictingPatternString; UDateTimePatternConflict result=((DateTimePatternGenerator *)dtpg)-> addPattern(patternString, override, conflictingPatternString, 
*pErrorCode); int32_t length=conflictingPatternString.extract(conflictingPattern, capacity, *pErrorCode); if(pLength!=nullptr) { *pLength=length; } return result; } U_CAPI void U_EXPORT2 udatpg_setAppendItemFormat(UDateTimePatternGenerator *dtpg, UDateTimePatternField field, const char16_t *value, int32_t length) { UnicodeString valueString((UBool)(length<0), value, length); ((DateTimePatternGenerator *)dtpg)->setAppendItemFormat(field, valueString); } U_CAPI const char16_t * U_EXPORT2 udatpg_getAppendItemFormat(const UDateTimePatternGenerator *dtpg, UDateTimePatternField field, int32_t *pLength) { const UnicodeString &result=((const DateTimePatternGenerator *)dtpg)->getAppendItemFormat(field); if(pLength!=nullptr) { *pLength=result.length(); } return result.getBuffer(); } U_CAPI void U_EXPORT2 udatpg_setAppendItemName(UDateTimePatternGenerator *dtpg, UDateTimePatternField field, const char16_t *value, int32_t length) { UnicodeString valueString((UBool)(length<0), value, length); ((DateTimePatternGenerator *)dtpg)->setAppendItemName(field, valueString); } U_CAPI const char16_t * U_EXPORT2 udatpg_getAppendItemName(const UDateTimePatternGenerator *dtpg, UDateTimePatternField field, int32_t *pLength) { const UnicodeString &result=((const DateTimePatternGenerator *)dtpg)->getAppendItemName(field); if(pLength!=nullptr) { *pLength=result.length(); } return result.getBuffer(); } U_CAPI int32_t U_EXPORT2 udatpg_getFieldDisplayName(const UDateTimePatternGenerator *dtpg, UDateTimePatternField field, UDateTimePGDisplayWidth width, char16_t *fieldName, int32_t capacity, UErrorCode *pErrorCode) { if (U_FAILURE(*pErrorCode)) return -1; if (fieldName == nullptr ? 
capacity != 0 : capacity < 0) { *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; return -1; } UnicodeString result = ((const DateTimePatternGenerator *)dtpg)->getFieldDisplayName(field,width); if (fieldName == nullptr) { return result.length(); } return result.extract(fieldName, capacity, *pErrorCode); } U_CAPI void U_EXPORT2 udatpg_setDateTimeFormat(const UDateTimePatternGenerator *dtpg, const char16_t *dtFormat, int32_t length) { UnicodeString dtFormatString((UBool)(length<0), dtFormat, length); ((DateTimePatternGenerator *)dtpg)->setDateTimeFormat(dtFormatString); } U_CAPI const char16_t * U_EXPORT2 udatpg_getDateTimeFormat(const UDateTimePatternGenerator *dtpg, int32_t *pLength) { UErrorCode status = U_ZERO_ERROR; return udatpg_getDateTimeFormatForStyle(dtpg, UDAT_MEDIUM, pLength, &status); } U_CAPI void U_EXPORT2 udatpg_setDateTimeFormatForStyle(UDateTimePatternGenerator *udtpg, UDateFormatStyle style, const char16_t *dateTimeFormat, int32_t length, UErrorCode *pErrorCode) { if (U_FAILURE(*pErrorCode)) { return; } else if (dateTimeFormat==nullptr) { *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; return; } DateTimePatternGenerator *dtpg = reinterpret_cast(udtpg); UnicodeString dtFormatString((UBool)(length<0), dateTimeFormat, length); dtpg->setDateTimeFormat(style, dtFormatString, *pErrorCode); } U_CAPI const char16_t* U_EXPORT2 udatpg_getDateTimeFormatForStyle(const UDateTimePatternGenerator *udtpg, UDateFormatStyle style, int32_t *pLength, UErrorCode *pErrorCode) { static const char16_t emptyString[] = { (char16_t)0 }; if (U_FAILURE(*pErrorCode)) { if (pLength !=nullptr) { *pLength = 0; } return emptyString; } const DateTimePatternGenerator *dtpg = reinterpret_cast(udtpg); const UnicodeString &result = dtpg->getDateTimeFormat(style, *pErrorCode); if (pLength != nullptr) { *pLength=result.length(); } // Note: The UnicodeString for the dateTimeFormat string in the DateTimePatternGenerator // was NUL-terminated what it was set, to avoid doing it here which could re-allocate // 
the buffe and affect and cont references to the string or its buffer. return result.getBuffer(); } U_CAPI void U_EXPORT2 udatpg_setDecimal(UDateTimePatternGenerator *dtpg, const char16_t *decimal, int32_t length) { UnicodeString decimalString((UBool)(length<0), decimal, length); ((DateTimePatternGenerator *)dtpg)->setDecimal(decimalString); } U_CAPI const char16_t * U_EXPORT2 udatpg_getDecimal(const UDateTimePatternGenerator *dtpg, int32_t *pLength) { const UnicodeString &result=((const DateTimePatternGenerator *)dtpg)->getDecimal(); if(pLength!=nullptr) { *pLength=result.length(); } return result.getBuffer(); } U_CAPI int32_t U_EXPORT2 udatpg_replaceFieldTypes(UDateTimePatternGenerator *dtpg, const char16_t *pattern, int32_t patternLength, const char16_t *skeleton, int32_t skeletonLength, char16_t *dest, int32_t destCapacity, UErrorCode *pErrorCode) { return udatpg_replaceFieldTypesWithOptions(dtpg, pattern, patternLength, skeleton, skeletonLength, UDATPG_MATCH_NO_OPTIONS, dest, destCapacity, pErrorCode); } U_CAPI int32_t U_EXPORT2 udatpg_replaceFieldTypesWithOptions(UDateTimePatternGenerator *dtpg, const char16_t *pattern, int32_t patternLength, const char16_t *skeleton, int32_t skeletonLength, UDateTimePatternMatchOptions options, char16_t *dest, int32_t destCapacity, UErrorCode *pErrorCode) { if(U_FAILURE(*pErrorCode)) { return 0; } if((pattern==nullptr && patternLength!=0) || (skeleton==nullptr && skeletonLength!=0)) { *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; return 0; } UnicodeString patternString((UBool)(patternLength<0), pattern, patternLength); UnicodeString skeletonString((UBool)(skeletonLength<0), skeleton, skeletonLength); UnicodeString result=((DateTimePatternGenerator *)dtpg)->replaceFieldTypes(patternString, skeletonString, options, *pErrorCode); return result.extract(dest, destCapacity, *pErrorCode); } U_CAPI UEnumeration * U_EXPORT2 udatpg_openSkeletons(const UDateTimePatternGenerator *dtpg, UErrorCode *pErrorCode) { return 
uenum_openFromStringEnumeration( ((DateTimePatternGenerator *)dtpg)->getSkeletons(*pErrorCode), pErrorCode); } U_CAPI UEnumeration * U_EXPORT2 udatpg_openBaseSkeletons(const UDateTimePatternGenerator *dtpg, UErrorCode *pErrorCode) { return uenum_openFromStringEnumeration( ((DateTimePatternGenerator *)dtpg)->getBaseSkeletons(*pErrorCode), pErrorCode); } U_CAPI const char16_t * U_EXPORT2 udatpg_getPatternForSkeleton(const UDateTimePatternGenerator *dtpg, const char16_t *skeleton, int32_t skeletonLength, int32_t *pLength) { UnicodeString skeletonString((UBool)(skeletonLength<0), skeleton, skeletonLength); const UnicodeString &result=((const DateTimePatternGenerator *)dtpg)->getPatternForSkeleton(skeletonString); if(pLength!=nullptr) { *pLength=result.length(); } return result.getBuffer(); } U_CAPI UDateFormatHourCycle U_EXPORT2 udatpg_getDefaultHourCycle(const UDateTimePatternGenerator *dtpg, UErrorCode* pErrorCode) { return ((const DateTimePatternGenerator *)dtpg)->getDefaultHourCycle(*pErrorCode); } #endif stringi/src/icu74/i18n/sharednumberformat.h0000644000176200001440000000240714700200761020273 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ****************************************************************************** * Copyright (C) 2014, International Business Machines * Corporation and others. All Rights Reserved. 
****************************************************************************** * sharednumberformat.h */ #ifndef __SHARED_NUMBERFORMAT_H__ #define __SHARED_NUMBERFORMAT_H__ #include "unicode/utypes.h" #include "sharedobject.h" #include "unifiedcache.h" U_NAMESPACE_BEGIN class NumberFormat; class U_I18N_API SharedNumberFormat : public SharedObject { public: SharedNumberFormat(NumberFormat *nfToAdopt) : ptr(nfToAdopt) { } virtual ~SharedNumberFormat(); const NumberFormat *get() const { return ptr; } const NumberFormat *operator->() const { return ptr; } const NumberFormat &operator*() const { return *ptr; } private: NumberFormat *ptr; SharedNumberFormat(const SharedNumberFormat &) = delete; SharedNumberFormat &operator=(const SharedNumberFormat &) = delete; }; template<> U_I18N_API const SharedNumberFormat *LocaleCacheKey::createObject( const void * /*unused*/, UErrorCode &status) const; U_NAMESPACE_END #endif stringi/src/icu74/i18n/ztrans.cpp0000644000176200001440000000556114700200761016263 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ******************************************************************************* * Copyright (C) 2009-2010, International Business Machines Corporation and * * others. All Rights Reserved. 
* ******************************************************************************* */ /** * \file * \brief C API: Time zone transition classes */ #include "unicode/utypes.h" #if !UCONFIG_NO_FORMATTING #include "unicode/uobject.h" #include "ztrans.h" #include "unicode/tztrans.h" #include "cmemory.h" #include "unicode/ustring.h" #include "unicode/parsepos.h" U_NAMESPACE_USE U_CAPI ZTrans* U_EXPORT2 ztrans_open(UDate time, const void* from, const void* to){ return (ZTrans*) new TimeZoneTransition(time,*(TimeZoneRule*)from,*(TimeZoneRule*)to); } U_CAPI ZTrans* U_EXPORT2 ztrans_openEmpty() { return (ZTrans*) new TimeZoneTransition(); } U_CAPI void U_EXPORT2 ztrans_close(ZTrans *trans) { delete (TimeZoneTransition*)trans; } U_CAPI ZTrans* U_EXPORT2 ztrans_clone(ZTrans *trans) { return (ZTrans*) (((TimeZoneTransition*)trans)->TimeZoneTransition::clone()); } U_CAPI UBool U_EXPORT2 ztrans_equals(const ZTrans* trans1, const ZTrans* trans2){ return *(const TimeZoneTransition*)trans1 == *(const TimeZoneTransition*)trans2; } U_CAPI UDate U_EXPORT2 ztrans_getTime(ZTrans* trans) { return ((TimeZoneTransition*)trans)->TimeZoneTransition::getTime(); } U_CAPI void U_EXPORT2 ztrans_setTime(ZTrans* trans, UDate time) { return ((TimeZoneTransition*)trans)->TimeZoneTransition::setTime(time); } U_CAPI void* U_EXPORT2 ztrans_getFrom(ZTrans* & trans) { return (void*) (((TimeZoneTransition*)trans)->TimeZoneTransition::getFrom()); } U_CAPI void U_EXPORT2 ztrans_setFrom(ZTrans* trans, const void* from) { return ((TimeZoneTransition*)trans)->TimeZoneTransition::setFrom(*(TimeZoneRule*)from); } U_CAPI void U_EXPORT2 ztrans_adoptFrom(ZTrans* trans, void* from) { return ((TimeZoneTransition*)trans)->TimeZoneTransition::adoptFrom((TimeZoneRule*)from); } U_CAPI void* U_EXPORT2 ztrans_getTo(ZTrans* trans){ return (void*) (((TimeZoneTransition*)trans)->TimeZoneTransition::getTo()); } U_CAPI void U_EXPORT2 ztrans_setTo(ZTrans* trans, const void* to) { return 
((TimeZoneTransition*)trans)->TimeZoneTransition::setTo(*(TimeZoneRule*)to); }

/**
 * Forwards to TimeZoneTransition::adoptTo() with `to` cast to TimeZoneRule*.
 * NOTE(review): "adopt" presumably means ownership of `to` passes to the
 * transition -- confirm against the TimeZoneTransition documentation.
 */
U_CAPI void U_EXPORT2
ztrans_adoptTo(ZTrans* trans, void* to) {
    return ((TimeZoneTransition*)trans)->TimeZoneTransition::adoptTo((TimeZoneRule*)to);
}

/** Returns TimeZoneTransition::getStaticClassID(), invoked through the handle. */
U_CAPI UClassID U_EXPORT2
ztrans_getStaticClassID(ZTrans* trans) {
    return ((TimeZoneTransition*)trans)->TimeZoneTransition::getStaticClassID();
}

/**
 * Returns TimeZoneTransition::getDynamicClassID(). The call is class-qualified,
 * so it is not dispatched virtually.
 */
U_CAPI UClassID U_EXPORT2
ztrans_getDynamicClassID(ZTrans* trans){
    return ((TimeZoneTransition*)trans)->TimeZoneTransition::getDynamicClassID();
}

#endif
stringi/src/icu74/i18n/csrutf8.cpp0000644000176200001440000000571514700200761016341 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
 **********************************************************************
 * Copyright (C) 2005-2014, International Business Machines
 * Corporation and others. All Rights Reserved.
 **********************************************************************
 */

#include "unicode/utypes.h"

#if !UCONFIG_NO_CONVERSION

#include "csrutf8.h"
#include "csmatch.h"

U_NAMESPACE_BEGIN

CharsetRecog_UTF8::~CharsetRecog_UTF8()
{
    // nothing to do
}

/** Returns the charset name reported by this recognizer, "UTF-8". */
const char *CharsetRecog_UTF8::getName() const
{
    return "UTF-8";
}

/**
 * Scores how likely the raw input is UTF-8 and records the score in `results`.
 *
 * The input bytes are scanned once, counting well-formed multi-byte sequences
 * (numValid) and malformed ones (numInvalid). The confidence is then chosen
 * from a fixed table:
 *   - BOM present and no invalid sequences:                      100
 *   - BOM present and numValid > 10 * numInvalid:                 80
 *   - more than 3 valid sequences and no invalid ones:           100
 *   - at least 1 valid sequence and no invalid ones:              80
 *   - no multi-byte sequences at all (plain ASCII):               15
 *   - numValid > 10 * numInvalid:                                 25
 *   - otherwise:                                                   0
 *
 * @param input   supplies fRawInput / fRawLength, the bytes to examine
 * @param results receives (input, this, confidence) via set()
 * @return true if the computed confidence is greater than 0
 */
UBool CharsetRecog_UTF8::match(InputText* input, CharsetMatch *results) const {
    bool hasBOM = false;
    int32_t numValid = 0;
    int32_t numInvalid = 0;
    const uint8_t *inputBytes = input->fRawInput;
    int32_t i;
    int32_t trailBytes = 0;
    int32_t confidence;

    // EF BB BF is the UTF-8 encoding of the byte order mark.
    if (input->fRawLength >= 3 &&
        inputBytes[0] == 0xEF && inputBytes[1] == 0xBB && inputBytes[2] == 0xBF) {
        hasBOM = true;
    }

    // Scan for multi-byte sequences
    for (i=0; i < input->fRawLength; i += 1) {
        int32_t b = inputBytes[i];

        if ((b & 0x80) == 0) {
            continue; // ASCII
        }

        // Hi bit on char found. Figure out how long the sequence should be
        if ((b & 0x0E0) == 0x0C0) {
            trailBytes = 1;
        } else if ((b & 0x0F0) == 0x0E0) {
            trailBytes = 2;
        } else if ((b & 0x0F8) == 0xF0) {
            trailBytes = 3;
        } else {
            // Not a recognized lead byte (e.g. a stray trail byte).
            numInvalid += 1;
            continue;
        }

        // Verify that we've got the right number of trail bytes in the sequence
        for (;;) {
            i += 1;
            if (i >= input->fRawLength) {
                // Sequence cut off by end of input: counted as neither valid nor invalid.
                break;
            }
            b = inputBytes[i];
            if ((b & 0xC0) != 0x080) {
                // The offending byte is consumed: the outer loop's increment steps past it.
                numInvalid += 1;
                break;
            }
            if (--trailBytes == 0) {
                numValid += 1;
                break;
            }
        }
    }

    // Cook up some sort of confidence score, based on presence of a BOM
    // and the existence of valid and/or invalid multi-byte sequences.
    confidence = 0;
    if (hasBOM && numInvalid == 0) {
        confidence = 100;
    } else if (hasBOM && numValid > numInvalid*10) {
        confidence = 80;
    } else if (numValid > 3 && numInvalid == 0) {
        confidence = 100;
    } else if (numValid > 0 && numInvalid == 0) {
        confidence = 80;
    } else if (numValid == 0 && numInvalid == 0) {
        // Plain ASCII. Confidence must be > 10, it's more likely than UTF-16, which
        // accepts ASCII with confidence = 10.
        confidence = 15;
    } else if (numValid > numInvalid*10) {
        // Probably corrupt utf-8 data. Valid sequences aren't likely by chance.
        confidence = 25;
    }

    results->set(input, this, confidence);
    return (confidence > 0);
}

U_NAMESPACE_END
#endif
stringi/src/icu74/i18n/iso8601cal.cpp0000644000176200001440000000203114700200761016520 0ustar liggesusers// © 2022 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html #include "unicode/utypes.h" #if !UCONFIG_NO_FORMATTING #include "iso8601cal.h" #include "unicode/gregocal.h" U_NAMESPACE_BEGIN UOBJECT_DEFINE_RTTI_IMPLEMENTATION(ISO8601Calendar) ISO8601Calendar::ISO8601Calendar(const Locale& aLocale, UErrorCode& success) : GregorianCalendar(aLocale, success) { UErrorCode fwStatus = U_ZERO_ERROR; int32_t fwLength = aLocale.getKeywordValue("fw", nullptr, 0, fwStatus); // Do not set first day of week for iso8601 to Monday if we have fw keyword // and let the value set by the Calendar constructor to take care of it. if (U_SUCCESS(fwStatus) && fwLength == 0) { setFirstDayOfWeek(UCAL_MONDAY); } setMinimalDaysInFirstWeek(4); } ISO8601Calendar::~ISO8601Calendar() { } ISO8601Calendar* ISO8601Calendar::clone() const { return new ISO8601Calendar(*this); } const char *ISO8601Calendar::getType() const { return "iso8601"; } U_NAMESPACE_END #endif stringi/src/icu74/i18n/chnsecal.h0000644000176200001440000003137314700200761016167 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ***************************************************************************** * Copyright (C) 2007-2013, International Business Machines Corporation * and others. All Rights Reserved. ***************************************************************************** * * File CHNSECAL.H * * Modification History: * * Date Name Description * 9/18/2007 ajmacher ported from java ChineseCalendar ***************************************************************************** */ #ifndef CHNSECAL_H #define CHNSECAL_H #include "unicode/utypes.h" #if !UCONFIG_NO_FORMATTING #include "unicode/calendar.h" #include "unicode/timezone.h" U_NAMESPACE_BEGIN /** * ChineseCalendar is a concrete subclass of {@link Calendar} * that implements a traditional Chinese calendar. 
The traditional Chinese * calendar is a lunisolar calendar: Each month starts on a new moon, and * the months are numbered according to solar events, specifically, to * guarantee that month 11 always contains the winter solstice. In order * to accomplish this, leap months are inserted in certain years. Leap * months are numbered the same as the month they follow. The decision of * which month is a leap month depends on the relative movements of the sun * and moon. * *

This class defines one additional field beyond those defined by * Calendar: The IS_LEAP_MONTH field takes the * value of 0 for normal months, or 1 for leap months. * *

All astronomical computations are performed with respect to a time * zone of GMT+8:00 and a longitude of 120 degrees east. Although some * calendars implement a historically more accurate convention of using * Beijing's local longitude (116 degrees 25 minutes east) and time zone * (GMT+7:45:40) for dates before 1929, we do not implement this here. * *

Years are counted in two different ways in the Chinese calendar. The * first method is by sequential numbering from the 61st year of the reign * of Huang Di, 2637 BCE, which is designated year 1 on the Chinese * calendar. The second method uses 60-year cycles from the same starting * point, which is designated year 1 of cycle 1. In this class, the * EXTENDED_YEAR field contains the sequential year count. * The ERA field contains the cycle number, and the * YEAR field contains the year of the cycle, a value between * 1 and 60. * *

There is some variation in what is considered the starting point of * the calendar, with some sources starting in the first year of the reign * of Huang Di, rather than the 61st. This gives continuous year numbers * 60 years greater and cycle numbers one greater than what this class * implements. * *

Because ChineseCalendar defines an additional field and * redefines the way the ERA field is used, it requires a new * format class, ChineseDateFormat. As always, use the * methods DateFormat.getXxxInstance(Calendar cal,...) to * obtain a formatter for this calendar. * *

References:

    * *
  • Dershowitz and Reingold, Calendrical Calculations, * Cambridge University Press, 1997
  • * *
  • The * Calendar FAQ
  • * *
* *

* This class should only be subclassed to implement variants of the Chinese lunar calendar.

*

* ChineseCalendar usually should be instantiated using * {@link com.ibm.icu.util.Calendar#getInstance(ULocale)} passing in a ULocale * with the tag "@calendar=chinese".

*
 * @see com.ibm.icu.text.ChineseDateFormat
 * @see com.ibm.icu.util.Calendar
 * @author Alan Liu
 * @internal
 */
class U_I18N_API ChineseCalendar : public Calendar {
 public:
  //-------------------------------------------------------------------------
  // Constructors...
  //-------------------------------------------------------------------------

  /**
   * Constructs a ChineseCalendar based on the current time in the default time zone
   * with the given locale.
   *
   * @param aLocale  The given locale.
   * @param success  Indicates the status of ChineseCalendar object construction.
   *                 Returns U_ZERO_ERROR if constructed successfully.
   * @internal
   */
  ChineseCalendar(const Locale& aLocale, UErrorCode &success);

  /**
   * Returns true if the date is in a leap year.
   *
   * @param status  ICU Error Code
   * @return       True if the date in the fields is in a Temporal proposal
   *               defined leap year. False otherwise.
   */
  virtual bool inTemporalLeapYear(UErrorCode &status) const override;

  /**
   * Gets the Temporal monthCode value corresponding to the month for the date.
   * The value is a string identifier that starts with the literal grapheme
   * "M" followed by two graphemes representing the zero-padded month number
   * of the current month in a normal (non-leap) year and suffixed by an
   * optional literal grapheme "L" if this is a leap month in a lunisolar
   * calendar. For Chinese calendars (including Dangi), the values are
   * "M01" .. "M12" for non-leap year, and "M01" .. "M12" with one of
   * "M01L" .. "M12L" for leap year.
   *
   * @param status  ICU Error Code
   * @return       One of 24 possible strings in
   *               {"M01" .. "M12", "M01L" .. "M12L"}.
   * @draft ICU 73
   */
  virtual const char* getTemporalMonthCode(UErrorCode &status) const override;

  /**
   * Sets the Temporal monthCode which is a string identifier that starts
   * with the literal grapheme "M" followed by two graphemes representing
   * the zero-padded month number of the current month in a normal
   * (non-leap) year and suffixed by an optional literal grapheme "L" if this
   * is a leap month in a lunisolar calendar. For Chinese calendars, the values
   * are "M01" .. "M12" for non-leap years, and "M01" .. "M12" plus one in
   * "M01L" .. "M12L" for leap year.
   *
   * @param code    The value to be set for temporal monthCode. One of
   *                24 possible strings in {"M01" .. "M12", "M01L" .. "M12L"}.
   * @param status  ICU Error Code
   *
   * @draft ICU 73
   */
  virtual void setTemporalMonthCode(const char* code, UErrorCode& status) override;

 protected:

  /**
   * Constructs a ChineseCalendar based on the current time in the default time zone
   * with the given locale, using the specified epoch year and time zone for
   * astronomical calculations.
   *
   * @param aLocale         The given locale.
   * @param epochYear       The epoch year to use for calculation.
   * @param zoneAstroCalc   The TimeZone to use for astronomical calculations. If null,
   *                        will be set appropriately for Chinese calendar (UTC + 8:00).
   * @param success         Indicates the status of ChineseCalendar object construction;
   *                        if successful, will not be changed to an error value.
   * @internal
   */
  ChineseCalendar(const Locale& aLocale, int32_t epochYear, const TimeZone* zoneAstroCalc, UErrorCode &success);

 public:
  /**
   * Copy Constructor
   * @internal
   */
  ChineseCalendar(const ChineseCalendar& other);

  /**
   * Destructor.
   * @internal
   */
  virtual ~ChineseCalendar();

  // clone
  virtual ChineseCalendar* clone() const override;

 private:

  //-------------------------------------------------------------------------
  // Internal data....
  //-------------------------------------------------------------------------

  // There is a leap month between the Winter Solstice before and after the
  // current date. This is different from leap year because in some years, such as
  // 1813 and 2033, the leap month is after the Winter Solstice of that year. So
  // this value could be false for a date prior to the Winter Solstice of that
  // year but that year still has a leap month and therefore is a leap year.
  UBool hasLeapMonthBetweenWinterSolstices;
  int32_t fEpochYear;   // Start year of this Chinese calendar instance.
  const TimeZone* fZoneAstroCalc;   // Zone used for the astronomical calculation
                                    // of this Chinese calendar instance.

  //----------------------------------------------------------------------
  // Calendar framework
  //----------------------------------------------------------------------

 protected:
  virtual int32_t handleGetLimit(UCalendarDateFields field, ELimitType limitType) const override;
  virtual int32_t handleGetMonthLength(int32_t extendedYear, int32_t month) const override;
  virtual int32_t handleComputeMonthStart(int32_t eyear, int32_t month, UBool useMonth) const override;
  virtual int32_t handleGetExtendedYear() override;
  virtual void handleComputeFields(int32_t julianDay, UErrorCode &status) override;
  virtual const UFieldResolutionTable* getFieldResolutionTable() const override;

 public:
  virtual void add(UCalendarDateFields field, int32_t amount, UErrorCode &status) override;
  virtual void add(EDateFields field, int32_t amount, UErrorCode &status) override;
  virtual void roll(UCalendarDateFields field, int32_t amount, UErrorCode &status) override;
  virtual void roll(EDateFields field, int32_t amount, UErrorCode &status) override;

  /**
   * @return      The related Gregorian year; will be obtained by modifying the value
   *              obtained by get from UCAL_EXTENDED_YEAR field
   * @internal
   */
  virtual int32_t getRelatedYear(UErrorCode &status) const override;

  /**
   * @param year  The related Gregorian year to set; will be modified as necessary then
   *              set in UCAL_EXTENDED_YEAR field
   * @internal
   */
  virtual void setRelatedYear(int32_t year) override;

  //----------------------------------------------------------------------
  // Internal methods & astronomical calculations
  //----------------------------------------------------------------------

 private:

  static const UFieldResolutionTable CHINESE_DATE_PRECEDENCE[];

  double daysToMillis(double days) const;
  double millisToDays(double millis) const;
  virtual int32_t winterSolstice(int32_t gyear) const;
  virtual int32_t newMoonNear(double days, UBool after) const;
  virtual int32_t synodicMonthsBetween(int32_t day1, int32_t day2) const;
  virtual int32_t majorSolarTerm(int32_t days) const;
  virtual UBool hasNoMajorSolarTerm(int32_t newMoon) const;
  virtual UBool isLeapMonthBetween(int32_t newMoon1, int32_t newMoon2) const;
  virtual void computeChineseFields(int32_t days, int32_t gyear, int32_t gmonth, UBool setAllFields);
  virtual int32_t newYear(int32_t gyear) const;
  virtual void offsetMonth(int32_t newMoon, int32_t dom, int32_t delta, UErrorCode& status);
  const TimeZone* getChineseCalZoneAstroCalc() const;

  // UObject stuff
 public:
  /**
   * @return   The class ID for this object. All objects of a given class have the
   *           same class ID. Objects of other classes have different class IDs.
   * @internal
   */
  virtual UClassID getDynamicClassID() const override;

  /**
   * Return the class ID for this class. This is useful only for comparing to a return
   * value from getDynamicClassID(). For example:
   *
   *      Base* polymorphic_pointer = createPolymorphicObject();
   *      if (polymorphic_pointer->getDynamicClassID() ==
   *          Derived::getStaticClassID()) ...
   *
   * @return   The class ID for all objects of this class.
   * @internal
   */
  static UClassID U_EXPORT2 getStaticClassID();

  /**
   * Return the calendar type, "chinese".
   *
   * @return calendar type
   * @internal
   */
  virtual const char * getType() const override;

 protected:
  virtual int32_t internalGetMonth(int32_t defaultValue) const override;

  virtual int32_t internalGetMonth() const override;

 protected:
  /**
   * Reports whether this calendar has a default century.
   * NOTE(review): the comment here previously read "Returns true because the
   * Islamic Calendar does have a default century", which looks copied from
   * another calendar header; the implementation is not visible in this file,
   * so confirm the return value there.
   * @internal
   */
  virtual UBool haveDefaultCentury() const override;

  /**
   * Returns the date of the start of the default century
   * @return start of century - in milliseconds since epoch, 1970
   * @internal
   */
  virtual UDate defaultCenturyStart() const override;

  /**
   * Returns the year in which the default century begins
   * @internal
   */
  virtual int32_t defaultCenturyStartYear() const override;

 private: // default century stuff.

  /**
   * Returns the beginning date of the 100-year window that dates
   * with 2-digit years are considered to fall within.
   */
  UDate internalGetDefaultCenturyStart() const;

  /**
   * Returns the first year of the 100-year window that dates with
   * 2-digit years are considered to fall within.
   */
  int32_t internalGetDefaultCenturyStartYear() const;

  ChineseCalendar() = delete; // default constructor not implemented
};

U_NAMESPACE_END

#endif
#endif
stringi/src/icu74/i18n/strrepl.cpp0000644000176200001440000002536514700200761016441 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ********************************************************************** * Copyright (c) 2002-2012, International Business Machines Corporation * and others. All Rights Reserved. ********************************************************************** * Date Name Description * 01/21/2002 aliu Creation.
********************************************************************** */ #include "unicode/utypes.h" #if !UCONFIG_NO_TRANSLITERATION #include "unicode/uniset.h" #include "unicode/utf16.h" #include "strrepl.h" #include "rbt_data.h" #include "util.h" U_NAMESPACE_BEGIN UnicodeReplacer::~UnicodeReplacer() {} UOBJECT_DEFINE_RTTI_IMPLEMENTATION(StringReplacer) /** * Construct a StringReplacer that sets the emits the given output * text and sets the cursor to the given position. * @param theOutput text that will replace input text when the * replace() method is called. May contain stand-in characters * that represent nested replacers. * @param theCursorPos cursor position that will be returned by * the replace() method * @param theData transliterator context object that translates * stand-in characters to UnicodeReplacer objects */ StringReplacer::StringReplacer(const UnicodeString& theOutput, int32_t theCursorPos, const TransliterationRuleData* theData) { output = theOutput; cursorPos = theCursorPos; hasCursor = true; data = theData; isComplex = true; } /** * Construct a StringReplacer that sets the emits the given output * text and does not modify the cursor. * @param theOutput text that will replace input text when the * replace() method is called. May contain stand-in characters * that represent nested replacers. * @param theData transliterator context object that translates * stand-in characters to UnicodeReplacer objects */ StringReplacer::StringReplacer(const UnicodeString& theOutput, const TransliterationRuleData* theData) { output = theOutput; cursorPos = 0; hasCursor = false; data = theData; isComplex = true; } /** * Copy constructor. 
*/ StringReplacer::StringReplacer(const StringReplacer& other) : UnicodeFunctor(other), UnicodeReplacer(other) { output = other.output; cursorPos = other.cursorPos; hasCursor = other.hasCursor; data = other.data; isComplex = other.isComplex; } /** * Destructor */ StringReplacer::~StringReplacer() { } /** * Implement UnicodeFunctor */ StringReplacer* StringReplacer::clone() const { return new StringReplacer(*this); } /** * Implement UnicodeFunctor */ UnicodeReplacer* StringReplacer::toReplacer() const { return const_cast(this); } /** * UnicodeReplacer API */ int32_t StringReplacer::replace(Replaceable& text, int32_t start, int32_t limit, int32_t& cursor) { int32_t outLen; int32_t newStart = 0; // NOTE: It should be possible to _always_ run the complex // processing code; just slower. If not, then there is a bug // in the complex processing code. // Simple (no nested replacers) Processing Code : if (!isComplex) { text.handleReplaceBetween(start, limit, output); outLen = output.length(); // Setup default cursor position (for cursorPos within output) newStart = cursorPos; } // Complex (nested replacers) Processing Code : else { /* When there are segments to be copied, use the Replaceable.copy() * API in order to retain out-of-band data. Copy everything to the * end of the string, then copy them back over the key. This preserves * the integrity of indices into the key and surrounding context while * generating the output text. */ UnicodeString buf; int32_t oOutput; // offset into 'output' isComplex = false; // The temporary buffer starts at tempStart, and extends // to destLimit. The start of the buffer has a single // character from before the key. This provides style // data when addition characters are filled into the // temporary buffer. If there is nothing to the left, use // the non-character U+FFFF, which Replaceable subclasses // should treat specially as a "no-style character." 
// destStart points to the point after the style context // character, so it is tempStart+1 or tempStart+2. int32_t tempStart = text.length(); // start of temp buffer int32_t destStart = tempStart; // copy new text to here if (start > 0) { int32_t len = U16_LENGTH(text.char32At(start-1)); text.copy(start-len, start, tempStart); destStart += len; } else { UnicodeString str((char16_t) 0xFFFF); text.handleReplaceBetween(tempStart, tempStart, str); destStart++; } int32_t destLimit = destStart; for (oOutput=0; oOutputlookupReplacer(c); if (r == nullptr) { // Accumulate straight (non-segment) text. buf.append(c); } else { isComplex = true; // Insert any accumulated straight text. if (buf.length() > 0) { text.handleReplaceBetween(destLimit, destLimit, buf); destLimit += buf.length(); buf.truncate(0); } // Delegate output generation to replacer object int32_t len = r->replace(text, destLimit, destLimit, cursor); destLimit += len; } oOutput += U16_LENGTH(c); } // Insert any accumulated straight text. if (buf.length() > 0) { text.handleReplaceBetween(destLimit, destLimit, buf); destLimit += buf.length(); } if (oOutput == cursorPos) { // Record the position of the cursor newStart = destLimit - destStart; // relative to start } outLen = destLimit - destStart; // Copy new text to start, and delete it text.copy(destStart, destLimit, start); text.handleReplaceBetween(tempStart + outLen, destLimit + outLen, UnicodeString()); // Delete the old text (the key) text.handleReplaceBetween(start + outLen, limit + outLen, UnicodeString()); } if (hasCursor) { // Adjust the cursor for positions outside the key. These // refer to code points rather than code units. If cursorPos // is within the output string, then use newStart, which has // already been set above. 
if (cursorPos < 0) { newStart = start; int32_t n = cursorPos; // Outside the output string, cursorPos counts code points while (n < 0 && newStart > 0) { newStart -= U16_LENGTH(text.char32At(newStart-1)); ++n; } newStart += n; } else if (cursorPos > output.length()) { newStart = start + outLen; int32_t n = cursorPos - output.length(); // Outside the output string, cursorPos counts code points while (n > 0 && newStart < text.length()) { newStart += U16_LENGTH(text.char32At(newStart)); --n; } newStart += n; } else { // Cursor is within output string. It has been set up above // to be relative to start. newStart += start; } cursor = newStart; } return outLen; } /** * UnicodeReplacer API */ UnicodeString& StringReplacer::toReplacerPattern(UnicodeString& rule, UBool escapeUnprintable) const { rule.truncate(0); UnicodeString quoteBuf; int32_t cursor = cursorPos; // Handle a cursor preceding the output if (hasCursor && cursor < 0) { while (cursor++ < 0) { ICU_Utility::appendToRule(rule, (char16_t)0x0040 /*@*/, true, escapeUnprintable, quoteBuf); } // Fall through and append '|' below } for (int32_t i=0; ilookupReplacer(c); if (r == nullptr) { ICU_Utility::appendToRule(rule, c, false, escapeUnprintable, quoteBuf); } else { UnicodeString buf; r->toReplacerPattern(buf, escapeUnprintable); buf.insert(0, (char16_t)0x20); buf.append((char16_t)0x20); ICU_Utility::appendToRule(rule, buf, true, escapeUnprintable, quoteBuf); } } // Handle a cursor after the output. Use > rather than >= because // if cursor == output.length() it is at the end of the output, // which is the default position, so we need not emit it. 
if (hasCursor && cursor > output.length()) { cursor -= output.length(); while (cursor-- > 0) { ICU_Utility::appendToRule(rule, (char16_t)0x0040 /*@*/, true, escapeUnprintable, quoteBuf); } ICU_Utility::appendToRule(rule, (char16_t)0x007C /*|*/, true, escapeUnprintable, quoteBuf); } // Flush quoteBuf out to result ICU_Utility::appendToRule(rule, -1, true, escapeUnprintable, quoteBuf); return rule; } /** * Implement UnicodeReplacer */ void StringReplacer::addReplacementSetTo(UnicodeSet& toUnionTo) const { UChar32 ch; for (int32_t i=0; ilookupReplacer(ch); if (r == nullptr) { toUnionTo.add(ch); } else { r->addReplacementSetTo(toUnionTo); } } } /** * UnicodeFunctor API */ void StringReplacer::setData(const TransliterationRuleData* d) { data = d; int32_t i = 0; while (ilookup(c); if (f != nullptr) { f->setData(data); } i += U16_LENGTH(c); } } U_NAMESPACE_END #endif /* #if !UCONFIG_NO_TRANSLITERATION */ //eof stringi/src/icu74/i18n/esctrn.h0000644000176200001440000001053114700200761015676 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ********************************************************************** * Copyright (c) 2001-2007, International Business Machines * Corporation and others. All Rights Reserved. ********************************************************************** * Date Name Description * 11/20/2001 aliu Creation. ********************************************************************** */ #ifndef ESCTRN_H #define ESCTRN_H #include "unicode/utypes.h" #if !UCONFIG_NO_TRANSLITERATION #include "unicode/translit.h" U_NAMESPACE_BEGIN /** * A transliterator that converts Unicode characters to an escape * form. Examples of escape forms are "U+4E01" and "􏿿". 
* Escape forms have a prefix and suffix, either of which may be * empty, a radix, typically 16 or 10, a minimum digit count, * typically 1, 4, or 8, and a boolean that specifies whether * supplemental characters are handled as 32-bit code points or as two * 16-bit code units. Most escape forms handle 32-bit code points, * but some, such as the Java form, intentionally break them into the * two 16-bit code units of a surrogate pair, for backward compatibility. * *

<p>Some escape forms actually have two different patterns, one for * BMP characters (0..FFFF) and one for supplementals (>FFFF). To * handle this, a second EscapeTransliterator may be defined that * specifies the pattern to be produced for supplementals. An example * of a form that requires this is the C form, which uses "\\uFFFF" * for BMP characters and "\\U0010FFFF" for supplementals. * *

This class is package private. It registers several standard * variants with the system which are then accessed via their IDs. * * @author Alan Liu */ class EscapeTransliterator : public Transliterator { private: /** * The prefix of the escape form; may be empty, but usually isn't. */ UnicodeString prefix; /** * The prefix of the escape form; often empty. */ UnicodeString suffix; /** * The radix to display the number in. Typically 16 or 10. Must * be in the range 2 to 36. */ int32_t radix; /** * The minimum number of digits. Typically 1, 4, or 8. Values * less than 1 are equivalent to 1. */ int32_t minDigits; /** * If true, supplementals are handled as 32-bit code points. If * false, they are handled as two 16-bit code units. */ UBool grokSupplementals; /** * The form to be used for supplementals. If this is null then * the same form is used for BMP characters and supplementals. If * this is not null and if grokSupplementals is true then the * prefix, suffix, radix, and minDigits of this object are used * for supplementals. This pointer is owned. */ EscapeTransliterator* supplementalHandler; public: /** * Registers standard variants with the system. Called by * Transliterator during initialization. */ static void registerIDs(); /** * Constructs an escape transliterator with the given ID and * parameters. See the class member documentation for details. */ EscapeTransliterator(const UnicodeString& ID, const UnicodeString& prefix, const UnicodeString& suffix, int32_t radix, int32_t minDigits, UBool grokSupplementals, EscapeTransliterator* adoptedSupplementalHandler); /** * Copy constructor. */ EscapeTransliterator(const EscapeTransliterator&); /** * Destructor. */ virtual ~EscapeTransliterator(); /** * Transliterator API. */ virtual EscapeTransliterator* clone() const override; /** * ICU "poor man's RTTI", returns a UClassID for the actual class. */ virtual UClassID getDynamicClassID() const override; /** * ICU "poor man's RTTI", returns a UClassID for this class. 
*/ U_I18N_API static UClassID U_EXPORT2 getStaticClassID(); protected: /** * Implements {@link Transliterator#handleTransliterate}. */ virtual void handleTransliterate(Replaceable& text, UTransPosition& offset, UBool isIncremental) const override; }; U_NAMESPACE_END #endif /* #if !UCONFIG_NO_TRANSLITERATION */ #endif stringi/src/icu74/i18n/double-conversion-strtod.cpp0000644000176200001440000005707714700200761021725 0ustar liggesusers// © 2018 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html // // From the double-conversion library. Original license: // // Copyright 2010 the V8 project authors. All rights reserved. // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are // met: // // * Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // * Redistributions in binary form must reproduce the above // copyright notice, this list of conditions and the following // disclaimer in the documentation and/or other materials provided // with the distribution. // * Neither the name of Google Inc. nor the names of its // contributors may be used to endorse or promote products derived // from this software without specific prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR // A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // ICU PATCH: ifdef around UCONFIG_NO_FORMATTING #include "unicode/utypes.h" #if !UCONFIG_NO_FORMATTING #include #include // ICU PATCH: Customize header file paths for ICU. #include "double-conversion-bignum.h" #include "double-conversion-cached-powers.h" #include "double-conversion-ieee.h" #include "double-conversion-strtod.h" // ICU PATCH: Wrap in ICU namespace U_NAMESPACE_BEGIN namespace double_conversion { #if defined(DOUBLE_CONVERSION_CORRECT_DOUBLE_OPERATIONS) // 2^53 = 9007199254740992. // Any integer with at most 15 decimal digits will hence fit into a double // (which has a 53bit significand) without loss of precision. static const int kMaxExactDoubleIntegerDecimalDigits = 15; #endif // #if defined(DOUBLE_CONVERSION_CORRECT_DOUBLE_OPERATIONS) // 2^64 = 18446744073709551616 > 10^19 static const int kMaxUint64DecimalDigits = 19; // Max double: 1.7976931348623157 x 10^308 // Min non-zero double: 4.9406564584124654 x 10^-324 // Any x >= 10^309 is interpreted as +infinity. // Any x <= 10^-324 is interpreted as 0. // Note that 2.5e-324 (despite being smaller than the min double) will be read // as non-zero (equal to the min non-zero double). 
static const int kMaxDecimalPower = 309; static const int kMinDecimalPower = -324; // 2^64 = 18446744073709551616 static const uint64_t kMaxUint64 = DOUBLE_CONVERSION_UINT64_2PART_C(0xFFFFFFFF, FFFFFFFF); #if defined(DOUBLE_CONVERSION_CORRECT_DOUBLE_OPERATIONS) static const double exact_powers_of_ten[] = { 1.0, // 10^0 10.0, 100.0, 1000.0, 10000.0, 100000.0, 1000000.0, 10000000.0, 100000000.0, 1000000000.0, 10000000000.0, // 10^10 100000000000.0, 1000000000000.0, 10000000000000.0, 100000000000000.0, 1000000000000000.0, 10000000000000000.0, 100000000000000000.0, 1000000000000000000.0, 10000000000000000000.0, 100000000000000000000.0, // 10^20 1000000000000000000000.0, // 10^22 = 0x21e19e0c9bab2400000 = 0x878678326eac9 * 2^22 10000000000000000000000.0 }; static const int kExactPowersOfTenSize = DOUBLE_CONVERSION_ARRAY_SIZE(exact_powers_of_ten); #endif // #if defined(DOUBLE_CONVERSION_CORRECT_DOUBLE_OPERATIONS) // Maximum number of significant digits in the decimal representation. // In fact the value is 772 (see conversions.cc), but to give us some margin // we round up to 780. static const int kMaxSignificantDecimalDigits = 780; static Vector TrimLeadingZeros(Vector buffer) { for (int i = 0; i < buffer.length(); i++) { if (buffer[i] != '0') { return buffer.SubVector(i, buffer.length()); } } return Vector(buffer.start(), 0); } static void CutToMaxSignificantDigits(Vector buffer, int exponent, char* significant_buffer, int* significant_exponent) { for (int i = 0; i < kMaxSignificantDecimalDigits - 1; ++i) { significant_buffer[i] = buffer[i]; } // The input buffer has been trimmed. Therefore the last digit must be // different from '0'. DOUBLE_CONVERSION_ASSERT(buffer[buffer.length() - 1] != '0'); // Set the last digit to be non-zero. This is sufficient to guarantee // correct rounding. 
significant_buffer[kMaxSignificantDecimalDigits - 1] = '1'; *significant_exponent = exponent + (buffer.length() - kMaxSignificantDecimalDigits); } // Trims the buffer and cuts it to at most kMaxSignificantDecimalDigits. // If possible the input-buffer is reused, but if the buffer needs to be // modified (due to cutting), then the input needs to be copied into the // buffer_copy_space. static void TrimAndCut(Vector buffer, int exponent, char* buffer_copy_space, int space_size, Vector* trimmed, int* updated_exponent) { Vector left_trimmed = TrimLeadingZeros(buffer); Vector right_trimmed = TrimTrailingZeros(left_trimmed); exponent += left_trimmed.length() - right_trimmed.length(); if (right_trimmed.length() > kMaxSignificantDecimalDigits) { (void) space_size; // Mark variable as used. DOUBLE_CONVERSION_ASSERT(space_size >= kMaxSignificantDecimalDigits); CutToMaxSignificantDigits(right_trimmed, exponent, buffer_copy_space, updated_exponent); *trimmed = Vector(buffer_copy_space, kMaxSignificantDecimalDigits); } else { *trimmed = right_trimmed; *updated_exponent = exponent; } } // Reads digits from the buffer and converts them to a uint64. // Reads in as many digits as fit into a uint64. // When the string starts with "1844674407370955161" no further digit is read. // Since 2^64 = 18446744073709551616 it would still be possible read another // digit if it was less or equal than 6, but this would complicate the code. static uint64_t ReadUint64(Vector buffer, int* number_of_read_digits) { uint64_t result = 0; int i = 0; while (i < buffer.length() && result <= (kMaxUint64 / 10 - 1)) { int digit = buffer[i++] - '0'; DOUBLE_CONVERSION_ASSERT(0 <= digit && digit <= 9); result = 10 * result + digit; } *number_of_read_digits = i; return result; } // Reads a DiyFp from the buffer. // The returned DiyFp is not necessarily normalized. // If remaining_decimals is zero then the returned DiyFp is accurate. // Otherwise it has been rounded and has error of at most 1/2 ulp. 
static void ReadDiyFp(Vector buffer, DiyFp* result, int* remaining_decimals) { int read_digits; uint64_t significand = ReadUint64(buffer, &read_digits); if (buffer.length() == read_digits) { *result = DiyFp(significand, 0); *remaining_decimals = 0; } else { // Round the significand. if (buffer[read_digits] >= '5') { significand++; } // Compute the binary exponent. int exponent = 0; *result = DiyFp(significand, exponent); *remaining_decimals = buffer.length() - read_digits; } } static bool DoubleStrtod(Vector trimmed, int exponent, double* result) { #if !defined(DOUBLE_CONVERSION_CORRECT_DOUBLE_OPERATIONS) // Avoid "unused parameter" warnings (void) trimmed; (void) exponent; (void) result; // On x86 the floating-point stack can be 64 or 80 bits wide. If it is // 80 bits wide (as is the case on Linux) then double-rounding occurs and the // result is not accurate. // We know that Windows32 uses 64 bits and is therefore accurate. return false; #else if (trimmed.length() <= kMaxExactDoubleIntegerDecimalDigits) { int read_digits; // The trimmed input fits into a double. // If the 10^exponent (resp. 10^-exponent) fits into a double too then we // can compute the result-double simply by multiplying (resp. dividing) the // two numbers. // This is possible because IEEE guarantees that floating-point operations // return the best possible approximation. if (exponent < 0 && -exponent < kExactPowersOfTenSize) { // 10^-exponent fits into a double. *result = static_cast(ReadUint64(trimmed, &read_digits)); DOUBLE_CONVERSION_ASSERT(read_digits == trimmed.length()); *result /= exact_powers_of_ten[-exponent]; return true; } if (0 <= exponent && exponent < kExactPowersOfTenSize) { // 10^exponent fits into a double. 
*result = static_cast(ReadUint64(trimmed, &read_digits)); DOUBLE_CONVERSION_ASSERT(read_digits == trimmed.length()); *result *= exact_powers_of_ten[exponent]; return true; } int remaining_digits = kMaxExactDoubleIntegerDecimalDigits - trimmed.length(); if ((0 <= exponent) && (exponent - remaining_digits < kExactPowersOfTenSize)) { // The trimmed string was short and we can multiply it with // 10^remaining_digits. As a result the remaining exponent now fits // into a double too. *result = static_cast(ReadUint64(trimmed, &read_digits)); DOUBLE_CONVERSION_ASSERT(read_digits == trimmed.length()); *result *= exact_powers_of_ten[remaining_digits]; *result *= exact_powers_of_ten[exponent - remaining_digits]; return true; } } return false; #endif } // Returns 10^exponent as an exact DiyFp. // The given exponent must be in the range [1; kDecimalExponentDistance[. static DiyFp AdjustmentPowerOfTen(int exponent) { DOUBLE_CONVERSION_ASSERT(0 < exponent); DOUBLE_CONVERSION_ASSERT(exponent < PowersOfTenCache::kDecimalExponentDistance); // Simply hardcode the remaining powers for the given decimal exponent // distance. DOUBLE_CONVERSION_ASSERT(PowersOfTenCache::kDecimalExponentDistance == 8); switch (exponent) { case 1: return DiyFp(DOUBLE_CONVERSION_UINT64_2PART_C(0xa0000000, 00000000), -60); case 2: return DiyFp(DOUBLE_CONVERSION_UINT64_2PART_C(0xc8000000, 00000000), -57); case 3: return DiyFp(DOUBLE_CONVERSION_UINT64_2PART_C(0xfa000000, 00000000), -54); case 4: return DiyFp(DOUBLE_CONVERSION_UINT64_2PART_C(0x9c400000, 00000000), -50); case 5: return DiyFp(DOUBLE_CONVERSION_UINT64_2PART_C(0xc3500000, 00000000), -47); case 6: return DiyFp(DOUBLE_CONVERSION_UINT64_2PART_C(0xf4240000, 00000000), -44); case 7: return DiyFp(DOUBLE_CONVERSION_UINT64_2PART_C(0x98968000, 00000000), -40); default: DOUBLE_CONVERSION_UNREACHABLE(); } } // If the function returns true then the result is the correct double. 
// Otherwise it is either the correct double or the double that is just below // the correct double. static bool DiyFpStrtod(Vector buffer, int exponent, double* result) { DiyFp input; int remaining_decimals; ReadDiyFp(buffer, &input, &remaining_decimals); // Since we may have dropped some digits the input is not accurate. // If remaining_decimals is different than 0 than the error is at most // .5 ulp (unit in the last place). // We don't want to deal with fractions and therefore keep a common // denominator. const int kDenominatorLog = 3; const int kDenominator = 1 << kDenominatorLog; // Move the remaining decimals into the exponent. exponent += remaining_decimals; uint64_t error = (remaining_decimals == 0 ? 0 : kDenominator / 2); int old_e = input.e(); input.Normalize(); error <<= old_e - input.e(); DOUBLE_CONVERSION_ASSERT(exponent <= PowersOfTenCache::kMaxDecimalExponent); if (exponent < PowersOfTenCache::kMinDecimalExponent) { *result = 0.0; return true; } DiyFp cached_power; int cached_decimal_exponent; PowersOfTenCache::GetCachedPowerForDecimalExponent(exponent, &cached_power, &cached_decimal_exponent); if (cached_decimal_exponent != exponent) { int adjustment_exponent = exponent - cached_decimal_exponent; DiyFp adjustment_power = AdjustmentPowerOfTen(adjustment_exponent); input.Multiply(adjustment_power); if (kMaxUint64DecimalDigits - buffer.length() >= adjustment_exponent) { // The product of input with the adjustment power fits into a 64 bit // integer. DOUBLE_CONVERSION_ASSERT(DiyFp::kSignificandSize == 64); } else { // The adjustment power is exact. There is hence only an error of 0.5. 
error += kDenominator / 2; } } input.Multiply(cached_power); // The error introduced by a multiplication of a*b equals // error_a + error_b + error_a*error_b/2^64 + 0.5 // Substituting a with 'input' and b with 'cached_power' we have // error_b = 0.5 (all cached powers have an error of less than 0.5 ulp), // error_ab = 0 or 1 / kDenominator > error_a*error_b/ 2^64 int error_b = kDenominator / 2; int error_ab = (error == 0 ? 0 : 1); // We round up to 1. int fixed_error = kDenominator / 2; error += error_b + error_ab + fixed_error; old_e = input.e(); input.Normalize(); error <<= old_e - input.e(); // See if the double's significand changes if we add/subtract the error. int order_of_magnitude = DiyFp::kSignificandSize + input.e(); int effective_significand_size = Double::SignificandSizeForOrderOfMagnitude(order_of_magnitude); int precision_digits_count = DiyFp::kSignificandSize - effective_significand_size; if (precision_digits_count + kDenominatorLog >= DiyFp::kSignificandSize) { // This can only happen for very small denormals. In this case the // half-way multiplied by the denominator exceeds the range of an uint64. // Simply shift everything to the right. int shift_amount = (precision_digits_count + kDenominatorLog) - DiyFp::kSignificandSize + 1; input.set_f(input.f() >> shift_amount); input.set_e(input.e() + shift_amount); // We add 1 for the lost precision of error, and kDenominator for // the lost precision of input.f(). error = (error >> shift_amount) + 1 + kDenominator; precision_digits_count -= shift_amount; } // We use uint64_ts now. This only works if the DiyFp uses uint64_ts too. 
DOUBLE_CONVERSION_ASSERT(DiyFp::kSignificandSize == 64); DOUBLE_CONVERSION_ASSERT(precision_digits_count < 64); uint64_t one64 = 1; uint64_t precision_bits_mask = (one64 << precision_digits_count) - 1; uint64_t precision_bits = input.f() & precision_bits_mask; uint64_t half_way = one64 << (precision_digits_count - 1); precision_bits *= kDenominator; half_way *= kDenominator; DiyFp rounded_input(input.f() >> precision_digits_count, input.e() + precision_digits_count); if (precision_bits >= half_way + error) { rounded_input.set_f(rounded_input.f() + 1); } // If the last_bits are too close to the half-way case than we are too // inaccurate and round down. In this case we return false so that we can // fall back to a more precise algorithm. *result = Double(rounded_input).value(); if (half_way - error < precision_bits && precision_bits < half_way + error) { // Too imprecise. The caller will have to fall back to a slower version. // However the returned number is guaranteed to be either the correct // double, or the next-lower double. return false; } else { return true; } } // Returns // - -1 if buffer*10^exponent < diy_fp. // - 0 if buffer*10^exponent == diy_fp. // - +1 if buffer*10^exponent > diy_fp. // Preconditions: // buffer.length() + exponent <= kMaxDecimalPower + 1 // buffer.length() + exponent > kMinDecimalPower // buffer.length() <= kMaxDecimalSignificantDigits static int CompareBufferWithDiyFp(Vector buffer, int exponent, DiyFp diy_fp) { DOUBLE_CONVERSION_ASSERT(buffer.length() + exponent <= kMaxDecimalPower + 1); DOUBLE_CONVERSION_ASSERT(buffer.length() + exponent > kMinDecimalPower); DOUBLE_CONVERSION_ASSERT(buffer.length() <= kMaxSignificantDecimalDigits); // Make sure that the Bignum will be able to hold all our numbers. // Our Bignum implementation has a separate field for exponents. Shifts will // consume at most one bigit (< 64 bits). // ln(10) == 3.3219... 
DOUBLE_CONVERSION_ASSERT(((kMaxDecimalPower + 1) * 333 / 100) < Bignum::kMaxSignificantBits); Bignum buffer_bignum; Bignum diy_fp_bignum; buffer_bignum.AssignDecimalString(buffer); diy_fp_bignum.AssignUInt64(diy_fp.f()); if (exponent >= 0) { buffer_bignum.MultiplyByPowerOfTen(exponent); } else { diy_fp_bignum.MultiplyByPowerOfTen(-exponent); } if (diy_fp.e() > 0) { diy_fp_bignum.ShiftLeft(diy_fp.e()); } else { buffer_bignum.ShiftLeft(-diy_fp.e()); } return Bignum::Compare(buffer_bignum, diy_fp_bignum); } // Returns true if the guess is the correct double. // Returns false, when guess is either correct or the next-lower double. static bool ComputeGuess(Vector trimmed, int exponent, double* guess) { if (trimmed.length() == 0) { *guess = 0.0; return true; } if (exponent + trimmed.length() - 1 >= kMaxDecimalPower) { *guess = Double::Infinity(); return true; } if (exponent + trimmed.length() <= kMinDecimalPower) { *guess = 0.0; return true; } if (DoubleStrtod(trimmed, exponent, guess) || DiyFpStrtod(trimmed, exponent, guess)) { return true; } if (*guess == Double::Infinity()) { return true; } return false; } #if U_DEBUG // needed for ICU only in debug mode static bool IsDigit(const char d) { return ('0' <= d) && (d <= '9'); } static bool IsNonZeroDigit(const char d) { return ('1' <= d) && (d <= '9'); } #ifdef __has_cpp_attribute #if __has_cpp_attribute(maybe_unused) [[maybe_unused]] #endif #endif static bool AssertTrimmedDigits(const Vector& buffer) { for(int i = 0; i < buffer.length(); ++i) { if(!IsDigit(buffer[i])) { return false; } } return (buffer.length() == 0) || (IsNonZeroDigit(buffer[0]) && IsNonZeroDigit(buffer[buffer.length()-1])); } #endif // needed for ICU only in debug mode double StrtodTrimmed(Vector trimmed, int exponent) { DOUBLE_CONVERSION_ASSERT(trimmed.length() <= kMaxSignificantDecimalDigits); DOUBLE_CONVERSION_ASSERT(AssertTrimmedDigits(trimmed)); double guess; const bool is_correct = ComputeGuess(trimmed, exponent, &guess); if (is_correct) { return 
guess; } DiyFp upper_boundary = Double(guess).UpperBoundary(); int comparison = CompareBufferWithDiyFp(trimmed, exponent, upper_boundary); if (comparison < 0) { return guess; } else if (comparison > 0) { return Double(guess).NextDouble(); } else if ((Double(guess).Significand() & 1) == 0) { // Round towards even. return guess; } else { return Double(guess).NextDouble(); } } double Strtod(Vector buffer, int exponent) { char copy_buffer[kMaxSignificantDecimalDigits]; Vector trimmed; int updated_exponent; TrimAndCut(buffer, exponent, copy_buffer, kMaxSignificantDecimalDigits, &trimmed, &updated_exponent); return StrtodTrimmed(trimmed, updated_exponent); } static float SanitizedDoubletof(double d) { DOUBLE_CONVERSION_ASSERT(d >= 0.0); // ASAN has a sanitize check that disallows casting doubles to floats if // they are too big. // https://clang.llvm.org/docs/UndefinedBehaviorSanitizer.html#available-checks // The behavior should be covered by IEEE 754, but some projects use this // flag, so work around it. float max_finite = 3.4028234663852885981170418348451692544e+38; // The half-way point between the max-finite and infinity value. // Since infinity has an even significand everything equal or greater than // this value should become infinity. 
double half_max_finite_infinity = 3.40282356779733661637539395458142568448e+38; if (d >= max_finite) { if (d >= half_max_finite_infinity) { return Single::Infinity(); } else { return max_finite; } } else { return static_cast(d); } } float Strtof(Vector buffer, int exponent) { char copy_buffer[kMaxSignificantDecimalDigits]; Vector trimmed; int updated_exponent; TrimAndCut(buffer, exponent, copy_buffer, kMaxSignificantDecimalDigits, &trimmed, &updated_exponent); exponent = updated_exponent; return StrtofTrimmed(trimmed, exponent); } float StrtofTrimmed(Vector trimmed, int exponent) { DOUBLE_CONVERSION_ASSERT(trimmed.length() <= kMaxSignificantDecimalDigits); DOUBLE_CONVERSION_ASSERT(AssertTrimmedDigits(trimmed)); double double_guess; bool is_correct = ComputeGuess(trimmed, exponent, &double_guess); float float_guess = SanitizedDoubletof(double_guess); if (float_guess == double_guess) { // This shortcut triggers for integer values. return float_guess; } // We must catch double-rounding. Say the double has been rounded up, and is // now a boundary of a float, and rounds up again. This is why we have to // look at previous too. // Example (in decimal numbers): // input: 12349 // high-precision (4 digits): 1235 // low-precision (3 digits): // when read from input: 123 // when rounded from high precision: 124. // To do this we simply look at the neighbors of the correct result and see // if they would round to the same float. If the guess is not correct we have // to look at four values (since two different doubles could be the correct // double). double double_next = Double(double_guess).NextDouble(); double double_previous = Double(double_guess).PreviousDouble(); float f1 = SanitizedDoubletof(double_previous); float f2 = float_guess; float f3 = SanitizedDoubletof(double_next); float f4; if (is_correct) { f4 = f3; } else { double double_next2 = Double(double_next).NextDouble(); f4 = SanitizedDoubletof(double_next2); } (void) f2; // Mark variable as used. 
DOUBLE_CONVERSION_ASSERT(f1 <= f2 && f2 <= f3 && f3 <= f4); // If the guess doesn't lie near a single-precision boundary we can simply // return its float-value. if (f1 == f4) { return float_guess; } DOUBLE_CONVERSION_ASSERT((f1 != f2 && f2 == f3 && f3 == f4) || (f1 == f2 && f2 != f3 && f3 == f4) || (f1 == f2 && f2 == f3 && f3 != f4)); // guess and next are the two possible candidates (in the same way that // double_guess was the lower candidate for a double-precision guess). float guess = f1; float next = f4; DiyFp upper_boundary; if (guess == 0.0f) { float min_float = 1e-45f; upper_boundary = Double(static_cast(min_float) / 2).AsDiyFp(); } else { upper_boundary = Single(guess).UpperBoundary(); } int comparison = CompareBufferWithDiyFp(trimmed, exponent, upper_boundary); if (comparison < 0) { return guess; } else if (comparison > 0) { return next; } else if ((Single(guess).Significand() & 1) == 0) { // Round towards even. return guess; } else { return next; } } } // namespace double_conversion // ICU PATCH: Close ICU namespace U_NAMESPACE_END #endif // ICU PATCH: close #if !UCONFIG_NO_FORMATTING stringi/src/icu74/i18n/collationfastlatinbuilder.h0000644000176200001440000000627014700200761021646 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ******************************************************************************* * Copyright (C) 2013-2016, International Business Machines * Corporation and others. All Rights Reserved. ******************************************************************************* * collationfastlatinbuilder.h * * created on: 2013aug09 * created by: Markus W. 
Scherer */ #ifndef __COLLATIONFASTLATINBUILDER_H__ #define __COLLATIONFASTLATINBUILDER_H__ #include "unicode/utypes.h" #if !UCONFIG_NO_COLLATION #include "unicode/ucol.h" #include "unicode/unistr.h" #include "unicode/uobject.h" #include "collation.h" #include "collationfastlatin.h" #include "uvectr64.h" U_NAMESPACE_BEGIN struct CollationData; class U_I18N_API CollationFastLatinBuilder : public UObject { public: CollationFastLatinBuilder(UErrorCode &errorCode); ~CollationFastLatinBuilder(); UBool forData(const CollationData &data, UErrorCode &errorCode); const uint16_t *getTable() const { return reinterpret_cast(result.getBuffer()); } int32_t lengthOfTable() const { return result.length(); } private: // space, punct, symbol, currency (not digit) enum { NUM_SPECIAL_GROUPS = UCOL_REORDER_CODE_CURRENCY - UCOL_REORDER_CODE_FIRST + 1 }; UBool loadGroups(const CollationData &data, UErrorCode &errorCode); UBool inSameGroup(uint32_t p, uint32_t q) const; void resetCEs(); void getCEs(const CollationData &data, UErrorCode &errorCode); UBool getCEsFromCE32(const CollationData &data, UChar32 c, uint32_t ce32, UErrorCode &errorCode); UBool getCEsFromContractionCE32(const CollationData &data, uint32_t ce32, UErrorCode &errorCode); void addContractionEntry(int32_t x, int64_t cce0, int64_t cce1, UErrorCode &errorCode); void addUniqueCE(int64_t ce, UErrorCode &errorCode); uint32_t getMiniCE(int64_t ce) const; UBool encodeUniqueCEs(UErrorCode &errorCode); UBool encodeCharCEs(UErrorCode &errorCode); UBool encodeContractions(UErrorCode &errorCode); uint32_t encodeTwoCEs(int64_t first, int64_t second) const; static UBool isContractionCharCE(int64_t ce) { return (uint32_t)(ce >> 32) == Collation::NO_CE_PRIMARY && ce != Collation::NO_CE; } static const uint32_t CONTRACTION_FLAG = 0x80000000; // temporary "buffer" int64_t ce0, ce1; int64_t charCEs[CollationFastLatin::NUM_FAST_CHARS][2]; UVector64 contractionCEs; UVector64 uniqueCEs; /** One 16-bit mini CE per unique CE. 
*/ uint16_t *miniCEs; // These are constant for a given root collator. uint32_t lastSpecialPrimaries[NUM_SPECIAL_GROUPS]; uint32_t firstDigitPrimary; uint32_t firstLatinPrimary; uint32_t lastLatinPrimary; // This determines the first normal primary weight which is mapped to // a short mini primary. It must be >=firstDigitPrimary. uint32_t firstShortPrimary; UBool shortPrimaryOverflow; UnicodeString result; int32_t headerLength; }; U_NAMESPACE_END #endif // !UCONFIG_NO_COLLATION #endif // __COLLATIONFASTLATINBUILDER_H__ stringi/src/icu74/i18n/number_usageprefs.h0000644000176200001440000001007214700200761020114 0ustar liggesusers// © 2020 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html #include "unicode/utypes.h" #if !UCONFIG_NO_FORMATTING #ifndef __NUMBER_USAGEPREFS_H__ #define __NUMBER_USAGEPREFS_H__ #include "cmemory.h" #include "number_types.h" #include "unicode/listformatter.h" #include "unicode/localpointer.h" #include "unicode/locid.h" #include "unicode/measunit.h" #include "unicode/stringpiece.h" #include "unicode/uobject.h" #include "units_converter.h" #include "units_router.h" U_NAMESPACE_BEGIN using ::icu::units::ComplexUnitsConverter; using ::icu::units::UnitsRouter; namespace number { namespace impl { /** * A MicroPropsGenerator which uses UnitsRouter to produce output converted to a * MeasureUnit appropriate for a particular localized usage: see * NumberFormatterSettings::usage(). */ class U_I18N_API UsagePrefsHandler : public MicroPropsGenerator, public UMemory { public: UsagePrefsHandler(const Locale &locale, const MeasureUnit &inputUnit, const StringPiece usage, const MicroPropsGenerator *parent, UErrorCode &status); /** * Obtains the appropriate output value, MeasureUnit and * rounding/precision behaviour from the UnitsRouter. * * The output unit is passed on to the LongNameHandler via * micros.outputUnit. 
*/ void processQuantity(DecimalQuantity &quantity, MicroProps µs, UErrorCode &status) const override; /** * Returns the list of possible output units, i.e. the full set of * preferences, for the localized, usage-specific unit preferences. * * The returned pointer should be valid for the lifetime of the * UsagePrefsHandler instance. */ const MaybeStackVector *getOutputUnits() const { return fUnitsRouter.getOutputUnits(); } private: UnitsRouter fUnitsRouter; const MicroPropsGenerator *fParent; }; } // namespace impl } // namespace number // Export explicit template instantiations of LocalPointerBase and LocalPointer. // This is required when building DLLs for Windows. (See datefmt.h, // collationiterator.h, erarules.h and others for similar examples.) // // Note: These need to be outside of the number::impl namespace, or Clang will // generate a compile error. #if U_PF_WINDOWS <= U_PLATFORM && U_PLATFORM <= U_PF_CYGWIN #if defined(_MSC_VER) // Ignore warning 4661 as LocalPointerBase does not use operator== or operator!= #pragma warning(push) #pragma warning(disable: 4661) #endif template class U_I18N_API LocalPointerBase; template class U_I18N_API LocalPointer; #if defined(_MSC_VER) #pragma warning(pop) #endif #endif namespace number { namespace impl { /** * A MicroPropsGenerator which converts a measurement from one MeasureUnit to * another. In particular, the output MeasureUnit may be a mixed unit. (The * input unit may not be a mixed unit.) */ class U_I18N_API UnitConversionHandler : public MicroPropsGenerator, public UMemory { public: /** * Constructor. * * @param targetUnit Specifies the output MeasureUnit. The input MeasureUnit * is derived from it: in case of a mixed unit, the biggest unit is * taken as the input unit. If not a mixed unit, the input unit will be * the same as the output unit and no unit conversion takes place. * @param parent The parent MicroPropsGenerator. * @param status Receives status. 
*/ UnitConversionHandler(const MeasureUnit &targetUnit, const MicroPropsGenerator *parent, UErrorCode &status); /** * Obtains the appropriate output values from the Unit Converter. */ void processQuantity(DecimalQuantity &quantity, MicroProps µs, UErrorCode &status) const override; private: MeasureUnit fOutputUnit; LocalPointer fUnitConverter; const MicroPropsGenerator *fParent; }; } // namespace impl } // namespace number U_NAMESPACE_END #endif // __NUMBER_USAGEPREFS_H__ #endif /* #if !UCONFIG_NO_FORMATTING */ stringi/src/icu74/i18n/funcrepl.cpp0000644000176200001440000000651414700200761016557 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ********************************************************************** * Copyright (c) 2002-2012, International Business Machines Corporation * and others. All Rights Reserved. ********************************************************************** * Date Name Description * 02/04/2002 aliu Creation. ********************************************************************** */ #include "unicode/utypes.h" #if !UCONFIG_NO_TRANSLITERATION #include "unicode/translit.h" #include "unicode/uniset.h" #include "funcrepl.h" static const char16_t AMPERSAND = 38; // '&' static const char16_t OPEN[] = {40,32,0}; // "( " static const char16_t CLOSE[] = {32,41,0}; // " )" U_NAMESPACE_BEGIN UOBJECT_DEFINE_RTTI_IMPLEMENTATION(FunctionReplacer) /** * Construct a replacer that takes the output of the given * replacer, passes it through the given transliterator, and emits * the result as output. */ FunctionReplacer::FunctionReplacer(Transliterator* adoptedTranslit, UnicodeFunctor* adoptedReplacer) { translit = adoptedTranslit; replacer = adoptedReplacer; } /** * Copy constructor. 
*/ FunctionReplacer::FunctionReplacer(const FunctionReplacer& other) : UnicodeFunctor(other), UnicodeReplacer(other) { translit = other.translit->clone(); replacer = other.replacer->clone(); } /** * Destructor */ FunctionReplacer::~FunctionReplacer() { delete translit; delete replacer; } /** * Implement UnicodeFunctor */ FunctionReplacer* FunctionReplacer::clone() const { return new FunctionReplacer(*this); } /** * UnicodeFunctor API. Cast 'this' to a UnicodeReplacer* pointer * and return the pointer. */ UnicodeReplacer* FunctionReplacer::toReplacer() const { FunctionReplacer *nonconst_this = const_cast(this); UnicodeReplacer *nonconst_base = static_cast(nonconst_this); return nonconst_base; } /** * UnicodeReplacer API */ int32_t FunctionReplacer::replace(Replaceable& text, int32_t start, int32_t limit, int32_t& cursor) { // First delegate to subordinate replacer int32_t len = replacer->toReplacer()->replace(text, start, limit, cursor); limit = start + len; // Now transliterate limit = translit->transliterate(text, start, limit); return limit - start; } /** * UnicodeReplacer API */ UnicodeString& FunctionReplacer::toReplacerPattern(UnicodeString& rule, UBool escapeUnprintable) const { UnicodeString str; rule.truncate(0); rule.append(AMPERSAND); rule.append(translit->getID()); rule.append(OPEN, 2); rule.append(replacer->toReplacer()->toReplacerPattern(str, escapeUnprintable)); rule.append(CLOSE, 2); return rule; } /** * Implement UnicodeReplacer */ void FunctionReplacer::addReplacementSetTo(UnicodeSet& toUnionTo) const { UnicodeSet set; toUnionTo.addAll(translit->getTargetSet(set)); } /** * UnicodeFunctor API */ void FunctionReplacer::setData(const TransliterationRuleData* d) { replacer->setData(d); } U_NAMESPACE_END #endif /* #if !UCONFIG_NO_TRANSLITERATION */ //eof stringi/src/icu74/i18n/uitercollationiterator.h0000644000176200001440000001135114700200761021210 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others. 
// License & terms of use: http://www.unicode.org/copyright.html /* ******************************************************************************* * Copyright (C) 2012-2016, International Business Machines * Corporation and others. All Rights Reserved. ******************************************************************************* * uitercollationiterator.h * * created on: 2012sep23 (from utf16collationiterator.h) * created by: Markus W. Scherer */ #ifndef __UITERCOLLATIONITERATOR_H__ #define __UITERCOLLATIONITERATOR_H__ #include "unicode/utypes.h" #if !UCONFIG_NO_COLLATION #include "unicode/uiter.h" #include "cmemory.h" #include "collation.h" #include "collationdata.h" #include "collationiterator.h" #include "normalizer2impl.h" U_NAMESPACE_BEGIN /** * UCharIterator-based collation element and character iterator. * Handles normalized text inline, with length or NUL-terminated. * Unnormalized text is handled by a subclass. */ class U_I18N_API UIterCollationIterator : public CollationIterator { public: UIterCollationIterator(const CollationData *d, UBool numeric, UCharIterator &ui) : CollationIterator(d, numeric), iter(ui) {} virtual ~UIterCollationIterator(); virtual void resetToOffset(int32_t newOffset) override; virtual int32_t getOffset() const override; virtual UChar32 nextCodePoint(UErrorCode &errorCode) override; virtual UChar32 previousCodePoint(UErrorCode &errorCode) override; protected: virtual uint32_t handleNextCE32(UChar32 &c, UErrorCode &errorCode) override; virtual char16_t handleGetTrailSurrogate() override; virtual void forwardNumCodePoints(int32_t num, UErrorCode &errorCode) override; virtual void backwardNumCodePoints(int32_t num, UErrorCode &errorCode) override; UCharIterator &iter; }; /** * Incrementally checks the input text for FCD and normalizes where necessary. 
*/ class U_I18N_API FCDUIterCollationIterator : public UIterCollationIterator { public: FCDUIterCollationIterator(const CollationData *data, UBool numeric, UCharIterator &ui, int32_t startIndex) : UIterCollationIterator(data, numeric, ui), state(ITER_CHECK_FWD), start(startIndex), nfcImpl(data->nfcImpl) {} virtual ~FCDUIterCollationIterator(); virtual void resetToOffset(int32_t newOffset) override; virtual int32_t getOffset() const override; virtual UChar32 nextCodePoint(UErrorCode &errorCode) override; virtual UChar32 previousCodePoint(UErrorCode &errorCode) override; protected: virtual uint32_t handleNextCE32(UChar32 &c, UErrorCode &errorCode) override; virtual char16_t handleGetTrailSurrogate() override; virtual void forwardNumCodePoints(int32_t num, UErrorCode &errorCode) override; virtual void backwardNumCodePoints(int32_t num, UErrorCode &errorCode) override; private: /** * Switches to forward checking if possible. */ void switchToForward(); /** * Extends the FCD text segment forward or normalizes around pos. * @return true if success */ UBool nextSegment(UErrorCode &errorCode); /** * Switches to backward checking. */ void switchToBackward(); /** * Extends the FCD text segment backward or normalizes around pos. * @return true if success */ UBool previousSegment(UErrorCode &errorCode); UBool normalize(const UnicodeString &s, UErrorCode &errorCode); enum State { /** * The input text [start..(iter index)[ passes the FCD check. * Moving forward checks incrementally. * pos & limit are undefined. */ ITER_CHECK_FWD, /** * The input text [(iter index)..limit[ passes the FCD check. * Moving backward checks incrementally. * start & pos are undefined. */ ITER_CHECK_BWD, /** * The input text [start..limit[ passes the FCD check. * pos tracks the current text index. */ ITER_IN_FCD_SEGMENT, /** * The input text [start..limit[ failed the FCD check and was normalized. * pos tracks the current index in the normalized string. * The text iterator is at the limit index. 
*/ IN_NORM_ITER_AT_LIMIT, /** * The input text [start..limit[ failed the FCD check and was normalized. * pos tracks the current index in the normalized string. * The text iterator is at the start index. */ IN_NORM_ITER_AT_START }; State state; int32_t start; int32_t pos; int32_t limit; const Normalizer2Impl &nfcImpl; UnicodeString normalized; }; U_NAMESPACE_END #endif // !UCONFIG_NO_COLLATION #endif // __UITERCOLLATIONITERATOR_H__ stringi/src/icu74/i18n/udat.cpp0000644000176200001440000013044514700200761015677 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ******************************************************************************* * Copyright (C) 1996-2015, International Business Machines * Corporation and others. All Rights Reserved. ******************************************************************************* */ #include "unicode/utypes.h" #if !UCONFIG_NO_FORMATTING #include "unicode/udat.h" #include "unicode/uloc.h" #include "unicode/datefmt.h" #include "unicode/timezone.h" #include "unicode/smpdtfmt.h" #include "unicode/fieldpos.h" #include "unicode/parsepos.h" #include "unicode/calendar.h" #include "unicode/numfmt.h" #include "unicode/dtfmtsym.h" #include "unicode/ustring.h" #include "unicode/udisplaycontext.h" #include "unicode/ufieldpositer.h" #include "cpputils.h" #include "reldtfmt.h" #include "umutex.h" U_NAMESPACE_USE /** * Verify that fmt is a SimpleDateFormat. Invalid error if not. * @param fmt the UDateFormat, definitely a DateFormat, maybe something else * @param status error code, will be set to failure if there is a failure or the fmt is nullptr. 
*/ static void verifyIsSimpleDateFormat(const UDateFormat* fmt, UErrorCode *status) { if(U_SUCCESS(*status) && dynamic_cast(reinterpret_cast(fmt))==nullptr) { *status = U_ILLEGAL_ARGUMENT_ERROR; } } // This mirrors the correspondence between the // SimpleDateFormat::fgPatternIndexToDateFormatField and // SimpleDateFormat::fgPatternIndexToCalendarField arrays. static UCalendarDateFields gDateFieldMapping[] = { UCAL_ERA, // UDAT_ERA_FIELD = 0 UCAL_YEAR, // UDAT_YEAR_FIELD = 1 UCAL_MONTH, // UDAT_MONTH_FIELD = 2 UCAL_DATE, // UDAT_DATE_FIELD = 3 UCAL_HOUR_OF_DAY, // UDAT_HOUR_OF_DAY1_FIELD = 4 UCAL_HOUR_OF_DAY, // UDAT_HOUR_OF_DAY0_FIELD = 5 UCAL_MINUTE, // UDAT_MINUTE_FIELD = 6 UCAL_SECOND, // UDAT_SECOND_FIELD = 7 UCAL_MILLISECOND, // UDAT_FRACTIONAL_SECOND_FIELD = 8 UCAL_DAY_OF_WEEK, // UDAT_DAY_OF_WEEK_FIELD = 9 UCAL_DAY_OF_YEAR, // UDAT_DAY_OF_YEAR_FIELD = 10 UCAL_DAY_OF_WEEK_IN_MONTH, // UDAT_DAY_OF_WEEK_IN_MONTH_FIELD = 11 UCAL_WEEK_OF_YEAR, // UDAT_WEEK_OF_YEAR_FIELD = 12 UCAL_WEEK_OF_MONTH, // UDAT_WEEK_OF_MONTH_FIELD = 13 UCAL_AM_PM, // UDAT_AM_PM_FIELD = 14 UCAL_HOUR, // UDAT_HOUR1_FIELD = 15 UCAL_HOUR, // UDAT_HOUR0_FIELD = 16 UCAL_ZONE_OFFSET, // UDAT_TIMEZONE_FIELD = 17 UCAL_YEAR_WOY, // UDAT_YEAR_WOY_FIELD = 18 UCAL_DOW_LOCAL, // UDAT_DOW_LOCAL_FIELD = 19 UCAL_EXTENDED_YEAR, // UDAT_EXTENDED_YEAR_FIELD = 20 UCAL_JULIAN_DAY, // UDAT_JULIAN_DAY_FIELD = 21 UCAL_MILLISECONDS_IN_DAY, // UDAT_MILLISECONDS_IN_DAY_FIELD = 22 UCAL_ZONE_OFFSET, // UDAT_TIMEZONE_RFC_FIELD = 23 (also UCAL_DST_OFFSET) UCAL_ZONE_OFFSET, // UDAT_TIMEZONE_GENERIC_FIELD = 24 (also UCAL_DST_OFFSET) UCAL_DOW_LOCAL, // UDAT_STANDALONE_DAY_FIELD = 25 UCAL_MONTH, // UDAT_STANDALONE_MONTH_FIELD = 26 UCAL_MONTH, // UDAT_QUARTER_FIELD = 27 UCAL_MONTH, // UDAT_STANDALONE_QUARTER_FIELD = 28 UCAL_ZONE_OFFSET, // UDAT_TIMEZONE_SPECIAL_FIELD = 29 (also UCAL_DST_OFFSET) UCAL_YEAR, // UDAT_YEAR_NAME_FIELD = 30 UCAL_ZONE_OFFSET, // UDAT_TIMEZONE_LOCALIZED_GMT_OFFSET_FIELD = 31 (also UCAL_DST_OFFSET) 
UCAL_ZONE_OFFSET, // UDAT_TIMEZONE_ISO_FIELD = 32 (also UCAL_DST_OFFSET) UCAL_ZONE_OFFSET, // UDAT_TIMEZONE_ISO_LOCAL_FIELD = 33 (also UCAL_DST_OFFSET) UCAL_EXTENDED_YEAR, // UDAT_RELATED_YEAR_FIELD = 34 (not an exact match) UCAL_FIELD_COUNT, // UDAT_AM_PM_MIDNIGHT_NOON_FIELD=35 (no match) UCAL_FIELD_COUNT, // UDAT_FLEXIBLE_DAY_PERIOD_FIELD=36 (no match) UCAL_FIELD_COUNT, // UDAT_TIME_SEPARATOR_FIELD = 37 (no match) // UDAT_FIELD_COUNT = 38 as of ICU 67 // UCAL_IS_LEAP_MONTH is not the target of a mapping }; U_CAPI UCalendarDateFields U_EXPORT2 udat_toCalendarDateField(UDateFormatField field) UPRV_NO_SANITIZE_UNDEFINED { static_assert(UDAT_FIELD_COUNT == UPRV_LENGTHOF(gDateFieldMapping), "UDateFormatField and gDateFieldMapping should have the same number of entries and be kept in sync."); return (field >= UDAT_ERA_FIELD && field < UPRV_LENGTHOF(gDateFieldMapping))? gDateFieldMapping[field]: UCAL_FIELD_COUNT; } /* For now- one opener. */ static UDateFormatOpener gOpener = nullptr; U_CAPI void U_EXPORT2 udat_registerOpener(UDateFormatOpener opener, UErrorCode *status) { if(U_FAILURE(*status)) return; umtx_lock(nullptr); if(gOpener==nullptr) { gOpener = opener; } else { *status = U_ILLEGAL_ARGUMENT_ERROR; } umtx_unlock(nullptr); } U_CAPI UDateFormatOpener U_EXPORT2 udat_unregisterOpener(UDateFormatOpener opener, UErrorCode *status) { if(U_FAILURE(*status)) return nullptr; UDateFormatOpener oldOpener = nullptr; umtx_lock(nullptr); if(gOpener==nullptr || gOpener!=opener) { *status = U_ILLEGAL_ARGUMENT_ERROR; } else { oldOpener=gOpener; gOpener=nullptr; } umtx_unlock(nullptr); return oldOpener; } U_CAPI UDateFormat* U_EXPORT2 udat_open(UDateFormatStyle timeStyle, UDateFormatStyle dateStyle, const char *locale, const char16_t *tzID, int32_t tzIDLength, const char16_t *pattern, int32_t patternLength, UErrorCode *status) { DateFormat *fmt; if(U_FAILURE(*status)) { return 0; } if(gOpener!=nullptr) { // if it's registered fmt = (DateFormat*) 
(*gOpener)(timeStyle,dateStyle,locale,tzID,tzIDLength,pattern,patternLength,status); if(fmt!=nullptr) { return (UDateFormat*)fmt; } // else fall through. } if(timeStyle != UDAT_PATTERN) { if(locale == 0) { fmt = DateFormat::createDateTimeInstance((DateFormat::EStyle)dateStyle, (DateFormat::EStyle)timeStyle); } else { fmt = DateFormat::createDateTimeInstance((DateFormat::EStyle)dateStyle, (DateFormat::EStyle)timeStyle, Locale(locale)); } } else { UnicodeString pat((UBool)(patternLength == -1), pattern, patternLength); if(locale == 0) { fmt = new SimpleDateFormat(pat, *status); } else { fmt = new SimpleDateFormat(pat, Locale(locale), *status); } } if(fmt == nullptr) { *status = U_MEMORY_ALLOCATION_ERROR; return nullptr; } if (U_FAILURE(*status)) { delete fmt; return nullptr; } if(tzID != 0) { TimeZone *zone = TimeZone::createTimeZone(UnicodeString((UBool)(tzIDLength == -1), tzID, tzIDLength)); if(zone == 0) { *status = U_MEMORY_ALLOCATION_ERROR; delete fmt; return 0; } fmt->adoptTimeZone(zone); } return (UDateFormat*)fmt; } U_CAPI void U_EXPORT2 udat_close(UDateFormat* format) { if (format == nullptr) return; delete (DateFormat*)format; } U_CAPI UDateFormat* U_EXPORT2 udat_clone(const UDateFormat *fmt, UErrorCode *status) { if(U_FAILURE(*status)) return 0; Format *res = ((DateFormat*)fmt)->clone(); if(res == 0) { *status = U_MEMORY_ALLOCATION_ERROR; return 0; } return (UDateFormat*) res; } U_CAPI int32_t U_EXPORT2 udat_format( const UDateFormat* format, UDate dateToFormat, char16_t* result, int32_t resultLength, UFieldPosition* position, UErrorCode* status) { if(U_FAILURE(*status)) { return -1; } if (result == nullptr ? 
resultLength != 0 : resultLength < 0) { *status = U_ILLEGAL_ARGUMENT_ERROR; return -1; } UnicodeString res; if (result != nullptr) { // nullptr destination for pure preflighting: empty dummy string // otherwise, alias the destination buffer res.setTo(result, 0, resultLength); } FieldPosition fp; if(position != 0) fp.setField(position->field); ((DateFormat*)format)->format(dateToFormat, res, fp); if(position != 0) { position->beginIndex = fp.getBeginIndex(); position->endIndex = fp.getEndIndex(); } return res.extract(result, resultLength, *status); } U_CAPI int32_t U_EXPORT2 udat_formatCalendar(const UDateFormat* format, UCalendar* calendar, char16_t* result, int32_t resultLength, UFieldPosition* position, UErrorCode* status) { if(U_FAILURE(*status)) { return -1; } if (result == nullptr ? resultLength != 0 : resultLength < 0) { *status = U_ILLEGAL_ARGUMENT_ERROR; return -1; } UnicodeString res; if (result != nullptr) { // nullptr destination for pure preflighting: empty dummy string // otherwise, alias the destination buffer res.setTo(result, 0, resultLength); } FieldPosition fp; if(position != 0) fp.setField(position->field); ((DateFormat*)format)->format(*(Calendar*)calendar, res, fp); if(position != 0) { position->beginIndex = fp.getBeginIndex(); position->endIndex = fp.getEndIndex(); } return res.extract(result, resultLength, *status); } U_CAPI int32_t U_EXPORT2 udat_formatForFields( const UDateFormat* format, UDate dateToFormat, char16_t* result, int32_t resultLength, UFieldPositionIterator* fpositer, UErrorCode* status) { if(U_FAILURE(*status)) { return -1; } if (result == nullptr ? 
resultLength != 0 : resultLength < 0) { *status = U_ILLEGAL_ARGUMENT_ERROR; return -1; } UnicodeString res; if (result != nullptr) { // nullptr destination for pure preflighting: empty dummy string // otherwise, alias the destination buffer res.setTo(result, 0, resultLength); } ((DateFormat*)format)->format(dateToFormat, res, (FieldPositionIterator*)fpositer, *status); return res.extract(result, resultLength, *status); } U_CAPI int32_t U_EXPORT2 udat_formatCalendarForFields(const UDateFormat* format, UCalendar* calendar, char16_t* result, int32_t resultLength, UFieldPositionIterator* fpositer, UErrorCode* status) { if(U_FAILURE(*status)) { return -1; } if (result == nullptr ? resultLength != 0 : resultLength < 0) { *status = U_ILLEGAL_ARGUMENT_ERROR; return -1; } UnicodeString res; if (result != nullptr) { // nullptr destination for pure preflighting: empty dummy string // otherwise, alias the destination buffer res.setTo(result, 0, resultLength); } ((DateFormat*)format)->format(*(Calendar*)calendar, res, (FieldPositionIterator*)fpositer, *status); return res.extract(result, resultLength, *status); } U_CAPI UDate U_EXPORT2 udat_parse( const UDateFormat* format, const char16_t* text, int32_t textLength, int32_t *parsePos, UErrorCode *status) { if(U_FAILURE(*status)) return (UDate)0; const UnicodeString src((UBool)(textLength == -1), text, textLength); ParsePosition pp; int32_t stackParsePos = 0; UDate res; if(parsePos == nullptr) { parsePos = &stackParsePos; } pp.setIndex(*parsePos); res = ((DateFormat*)format)->parse(src, pp); if(pp.getErrorIndex() == -1) *parsePos = pp.getIndex(); else { *parsePos = pp.getErrorIndex(); *status = U_PARSE_ERROR; } return res; } U_CAPI void U_EXPORT2 udat_parseCalendar(const UDateFormat* format, UCalendar* calendar, const char16_t* text, int32_t textLength, int32_t *parsePos, UErrorCode *status) { if(U_FAILURE(*status)) return; const UnicodeString src((UBool)(textLength == -1), text, textLength); ParsePosition pp; int32_t 
stackParsePos = 0; if(parsePos == nullptr) { parsePos = &stackParsePos; } pp.setIndex(*parsePos); ((DateFormat*)format)->parse(src, *(Calendar*)calendar, pp); if(pp.getErrorIndex() == -1) *parsePos = pp.getIndex(); else { *parsePos = pp.getErrorIndex(); *status = U_PARSE_ERROR; } } U_CAPI UBool U_EXPORT2 udat_isLenient(const UDateFormat* fmt) { return ((DateFormat*)fmt)->isLenient(); } U_CAPI void U_EXPORT2 udat_setLenient( UDateFormat* fmt, UBool isLenient) { ((DateFormat*)fmt)->setLenient(isLenient); } U_CAPI UBool U_EXPORT2 udat_getBooleanAttribute(const UDateFormat* fmt, UDateFormatBooleanAttribute attr, UErrorCode* status) { if(U_FAILURE(*status)) return false; return ((DateFormat*)fmt)->getBooleanAttribute(attr, *status); //return false; } U_CAPI void U_EXPORT2 udat_setBooleanAttribute(UDateFormat *fmt, UDateFormatBooleanAttribute attr, UBool newValue, UErrorCode* status) { if(U_FAILURE(*status)) return; ((DateFormat*)fmt)->setBooleanAttribute(attr, newValue, *status); } U_CAPI const UCalendar* U_EXPORT2 udat_getCalendar(const UDateFormat* fmt) { return (const UCalendar*) ((DateFormat*)fmt)->getCalendar(); } U_CAPI void U_EXPORT2 udat_setCalendar(UDateFormat* fmt, const UCalendar* calendarToSet) { ((DateFormat*)fmt)->setCalendar(*((Calendar*)calendarToSet)); } U_CAPI const UNumberFormat* U_EXPORT2 udat_getNumberFormatForField(const UDateFormat* fmt, char16_t field) { UErrorCode status = U_ZERO_ERROR; verifyIsSimpleDateFormat(fmt, &status); if (U_FAILURE(status)) return (const UNumberFormat*) ((DateFormat*)fmt)->getNumberFormat(); return (const UNumberFormat*) ((SimpleDateFormat*)fmt)->getNumberFormatForField(field); } U_CAPI const UNumberFormat* U_EXPORT2 udat_getNumberFormat(const UDateFormat* fmt) { return (const UNumberFormat*) ((DateFormat*)fmt)->getNumberFormat(); } U_CAPI void U_EXPORT2 udat_adoptNumberFormatForFields( UDateFormat* fmt, const char16_t* fields, UNumberFormat* numberFormatToSet, UErrorCode* status) { verifyIsSimpleDateFormat(fmt, status); 
if (U_FAILURE(*status)) return;

    if (fields!=nullptr) {
        UnicodeString overrideFields(fields);
        ((SimpleDateFormat*)fmt)->adoptNumberFormat(overrideFields, (NumberFormat*)numberFormatToSet, *status);
    }
}

/**
 * Passes numberFormatToSet (by reference) to DateFormat::setNumberFormat on fmt.
 */
U_CAPI void U_EXPORT2
udat_setNumberFormat(            UDateFormat*    fmt,
                        const   UNumberFormat*  numberFormatToSet)
{
    ((DateFormat*)fmt)->setNumberFormat(*((NumberFormat*)numberFormatToSet));
}

/**
 * Passes the numberFormatToAdopt pointer to DateFormat::adoptNumberFormat on fmt.
 * NOTE(review): going by the callee's name, ownership presumably moves to fmt.
 */
U_CAPI void U_EXPORT2
udat_adoptNumberFormat(          UDateFormat*    fmt,
                                 UNumberFormat*  numberFormatToAdopt)
{
    ((DateFormat*)fmt)->adoptNumberFormat((NumberFormat*)numberFormatToAdopt);
}

/** Forwards to uloc_getAvailable(index). */
U_CAPI const char* U_EXPORT2
udat_getAvailable(int32_t index)
{
    return uloc_getAvailable(index);
}

/** Forwards to uloc_countAvailable(). */
U_CAPI int32_t U_EXPORT2
udat_countAvailable()
{
    return uloc_countAvailable();
}

/**
 * Returns the two-digit-year start of a SimpleDateFormat.
 * @return (UDate)0 if status is, or becomes, a failure (fmt failed the
 *         SimpleDateFormat check).
 */
U_CAPI UDate U_EXPORT2
udat_get2DigitYearStart(    const   UDateFormat     *fmt,
                                    UErrorCode      *status)
{
    verifyIsSimpleDateFormat(fmt, status);
    if(U_FAILURE(*status)) return (UDate)0;
    return ((SimpleDateFormat*)fmt)->get2DigitYearStart(*status);
}

/**
 * Sets the two-digit-year start of a SimpleDateFormat; does nothing when
 * status is, or becomes, a failure.
 */
U_CAPI void U_EXPORT2
udat_set2DigitYearStart(    UDateFormat     *fmt,
                            UDate           d,
                            UErrorCode      *status)
{
    verifyIsSimpleDateFormat(fmt, status);
    if(U_FAILURE(*status)) return;
    ((SimpleDateFormat*)fmt)->set2DigitYearStart(d, *status);
}

/**
 * Extracts the pattern of fmt into result.
 *
 * A SimpleDateFormat yields its plain or localized pattern; a
 * RelativeDateFormat is accepted only when localized is false.
 *
 * @param result       destination buffer, or nullptr (with resultLength 0)
 *                     for pure preflighting
 * @param resultLength capacity of result; must be >= 0
 * @return the value of UnicodeString::extract, or -1 when status was already
 *         a failure, the buffer arguments are inconsistent, or fmt is of an
 *         unsupported kind (the latter two set U_ILLEGAL_ARGUMENT_ERROR)
 */
U_CAPI int32_t U_EXPORT2
udat_toPattern(    const   UDateFormat     *fmt,
        UBool          localized,
        char16_t       *result,
        int32_t         resultLength,
        UErrorCode      *status)
{
    if(U_FAILURE(*status)) {
        return -1;
    }
    if (result == nullptr ? resultLength != 0 : resultLength < 0) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return -1;
    }

    UnicodeString res;
    if (result != nullptr) {
        // nullptr destination for pure preflighting: empty dummy string
        // otherwise, alias the destination buffer
        res.setTo(result, 0, resultLength);
    }

    const DateFormat *df=reinterpret_cast<const DateFormat *>(fmt);
    const SimpleDateFormat *sdtfmt=dynamic_cast<const SimpleDateFormat *>(df);
    const RelativeDateFormat *reldtfmt;
    if (sdtfmt!=nullptr) {
        if(localized)
            sdtfmt->toLocalizedPattern(res, *status);
        else
            sdtfmt->toPattern(res);
    } else if (!localized && (reldtfmt=dynamic_cast<const RelativeDateFormat *>(df))!=nullptr) {
        reldtfmt->toPattern(res, *status);
    } else {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return -1;
    }

    return res.extract(result, resultLength, *status);
}

// TODO: should this take an UErrorCode?
// A: Yes. Of course.
/**
 * Applies pattern to a SimpleDateFormat.
 *
 * There is no error-code parameter: a local status is used, so a format that
 * fails the SimpleDateFormat check, or a localized pattern that fails to
 * apply, is dropped silently.
 */
U_CAPI void U_EXPORT2
udat_applyPattern(  UDateFormat     *format,
                    UBool           localized,
                    const char16_t  *pattern,
                    int32_t         patternLength)
{
    // First constructor argument is true exactly when patternLength is -1.
    const UnicodeString pat((UBool)(patternLength == -1), pattern, patternLength);
    UErrorCode status = U_ZERO_ERROR;

    verifyIsSimpleDateFormat(format, &status);
    if(U_FAILURE(status)) {
        return;
    }

    if(localized)
        ((SimpleDateFormat*)format)->applyLocalizedPattern(pat, status);
    else
        ((SimpleDateFormat*)format)->applyPattern(pat);
}

/**
 * Extracts one date-format symbol string of the given type into result.
 *
 * @param fmt          a SimpleDateFormat or RelativeDateFormat
 * @param type         which symbol array to read
 * @param index        position within that array (ignored for
 *                     UDAT_LOCALIZED_CHARS)
 * @param result       destination buffer (may be nullptr with resultLength 0
 *                     to preflight)
 * @return the value of UnicodeString::extract; 0 when index is outside
 *         [0, count) or type is not handled; -1 when fmt is neither of the
 *         two supported classes (status is left untouched in that case)
 */
U_CAPI int32_t U_EXPORT2
udat_getSymbols(const   UDateFormat             *fmt,
                        UDateFormatSymbolType   type,
                        int32_t                 index,
                        char16_t                *result,
                        int32_t                 resultLength,
                        UErrorCode              *status)
{
    const DateFormatSymbols *syms;
    const SimpleDateFormat* sdtfmt;
    const RelativeDateFormat* rdtfmt;
    if ((sdtfmt = dynamic_cast<const SimpleDateFormat*>(reinterpret_cast<const DateFormat*>(fmt))) != nullptr) {
        syms = sdtfmt->getDateFormatSymbols();
    } else if ((rdtfmt = dynamic_cast<const RelativeDateFormat*>(reinterpret_cast<const DateFormat*>(fmt))) != nullptr) {
        syms = rdtfmt->getDateFormatSymbols();
    } else {
        return -1;
    }
    int32_t count = 0;
    const UnicodeString *res = nullptr;

    switch(type) {
    case UDAT_ERAS:
        res = syms->getEras(count);
        break;

    case UDAT_ERA_NAMES:
        res = syms->getEraNames(count);
        break;

    case UDAT_MONTHS:
        res = syms->getMonths(count);
        break;

    case UDAT_SHORT_MONTHS:
        res = syms->getShortMonths(count);
        break;

    case UDAT_WEEKDAYS:
        res = syms->getWeekdays(count);
        break;

    case UDAT_SHORT_WEEKDAYS:
        res = syms->getShortWeekdays(count);
        break;

    case UDAT_AM_PMS:
        res = syms->getAmPmStrings(count);
        break;

    case UDAT_LOCALIZED_CHARS:
        {
            // Single string rather than an array: extract it directly.
            UnicodeString res1;
            if(!(result==nullptr && resultLength==0)) {
                // nullptr destination for pure preflighting: empty dummy string
                // otherwise, alias the destination buffer
                res1.setTo(result, 0, resultLength);
            }
            syms->getLocalPatternChars(res1);
            return res1.extract(result, resultLength, *status);
        }

    case UDAT_NARROW_MONTHS:
        res = syms->getMonths(count, DateFormatSymbols::FORMAT, DateFormatSymbols::NARROW);
        break;

    case UDAT_SHORTER_WEEKDAYS:
        res = syms->getWeekdays(count, DateFormatSymbols::FORMAT, DateFormatSymbols::SHORT);
        break;

    case UDAT_NARROW_WEEKDAYS:
        res = syms->getWeekdays(count, DateFormatSymbols::FORMAT, DateFormatSymbols::NARROW);
        break;

    case UDAT_STANDALONE_MONTHS:
        res = syms->getMonths(count, DateFormatSymbols::STANDALONE, DateFormatSymbols::WIDE);
        break;

    case UDAT_STANDALONE_SHORT_MONTHS:
        res = syms->getMonths(count, DateFormatSymbols::STANDALONE, DateFormatSymbols::ABBREVIATED);
        break;

    case UDAT_STANDALONE_NARROW_MONTHS:
        res = syms->getMonths(count, DateFormatSymbols::STANDALONE, DateFormatSymbols::NARROW);
        break;

    case UDAT_STANDALONE_WEEKDAYS:
        res = syms->getWeekdays(count, DateFormatSymbols::STANDALONE, DateFormatSymbols::WIDE);
        break;

    case UDAT_STANDALONE_SHORT_WEEKDAYS:
        res = syms->getWeekdays(count, DateFormatSymbols::STANDALONE, DateFormatSymbols::ABBREVIATED);
        break;

    case UDAT_STANDALONE_SHORTER_WEEKDAYS:
        res = syms->getWeekdays(count, DateFormatSymbols::STANDALONE, DateFormatSymbols::SHORT);
        break;

    case UDAT_STANDALONE_NARROW_WEEKDAYS:
        res = syms->getWeekdays(count, DateFormatSymbols::STANDALONE, DateFormatSymbols::NARROW);
        break;

    case UDAT_QUARTERS:
        res = syms->getQuarters(count, DateFormatSymbols::FORMAT, DateFormatSymbols::WIDE);
        break;

    case UDAT_SHORT_QUARTERS:
        res = syms->getQuarters(count, DateFormatSymbols::FORMAT, DateFormatSymbols::ABBREVIATED);
        break;

    case UDAT_NARROW_QUARTERS:
        res = syms->getQuarters(count, DateFormatSymbols::FORMAT, DateFormatSymbols::NARROW);
        break;

    case UDAT_STANDALONE_QUARTERS:
        res = syms->getQuarters(count, DateFormatSymbols::STANDALONE, DateFormatSymbols::WIDE);
        break;

    case UDAT_STANDALONE_SHORT_QUARTERS:
        res = syms->getQuarters(count, DateFormatSymbols::STANDALONE, DateFormatSymbols::ABBREVIATED);
        break;

    case UDAT_STANDALONE_NARROW_QUARTERS:
        res = syms->getQuarters(count, DateFormatSymbols::STANDALONE, DateFormatSymbols::NARROW);
        break;

    case UDAT_CYCLIC_YEARS_WIDE:
        res = syms->getYearNames(count, DateFormatSymbols::FORMAT, DateFormatSymbols::WIDE);
        break;

    case UDAT_CYCLIC_YEARS_ABBREVIATED:
        res = syms->getYearNames(count, DateFormatSymbols::FORMAT, DateFormatSymbols::ABBREVIATED);
        break;

    case UDAT_CYCLIC_YEARS_NARROW:
        res = syms->getYearNames(count, DateFormatSymbols::FORMAT, DateFormatSymbols::NARROW);
        break;

    case UDAT_ZODIAC_NAMES_WIDE:
        res = syms->getZodiacNames(count, DateFormatSymbols::FORMAT, DateFormatSymbols::WIDE);
        break;

    case UDAT_ZODIAC_NAMES_ABBREVIATED:
        res = syms->getZodiacNames(count, DateFormatSymbols::FORMAT, DateFormatSymbols::ABBREVIATED);
        break;

    case UDAT_ZODIAC_NAMES_NARROW:
        res = syms->getZodiacNames(count, DateFormatSymbols::FORMAT, DateFormatSymbols::NARROW);
        break;

    }

    // Reject negative indices as well as too-large ones; a negative index
    // would otherwise read before the start of the array (or through a null
    // res when type matched no case above).
    if(res != nullptr && index >= 0 && index < count) {
        return res[index].extract(result, resultLength, *status);
    }
    return 0;
}

// TODO: also needs an errorCode.
/**
 * Returns how many symbol strings of the given type fmt holds.
 *
 * @param fmt  a SimpleDateFormat or RelativeDateFormat
 * @param type which symbol array to count
 * @return the array length reported by the matching DateFormatSymbols getter;
 *         1 for UDAT_LOCALIZED_CHARS; 0 when fmt is neither supported class
 *         or type is not handled by the switch
 */
U_CAPI int32_t U_EXPORT2
udat_countSymbols(    const    UDateFormat                *fmt,
            UDateFormatSymbolType    type)
{
    const DateFormatSymbols *syms;
    const SimpleDateFormat* sdtfmt;
    const RelativeDateFormat* rdtfmt;
    if ((sdtfmt = dynamic_cast<const SimpleDateFormat*>(reinterpret_cast<const DateFormat*>(fmt))) != nullptr) {
        syms = sdtfmt->getDateFormatSymbols();
    } else if ((rdtfmt = dynamic_cast<const RelativeDateFormat*>(reinterpret_cast<const DateFormat*>(fmt))) != nullptr) {
        syms = rdtfmt->getDateFormatSymbols();
    } else {
        return 0;
    }
    int32_t count = 0;

    // Each getter is called only for its count out-parameter; the returned
    // array pointer is deliberately ignored.
    switch(type) {
    case UDAT_ERAS:
        syms->getEras(count);
        break;

    case UDAT_MONTHS:
        syms->getMonths(count);
        break;

    case UDAT_SHORT_MONTHS:
        syms->getShortMonths(count);
        break;

    case UDAT_WEEKDAYS:
        syms->getWeekdays(count);
        break;

    case UDAT_SHORT_WEEKDAYS:
        syms->getShortWeekdays(count);
        break;

    case UDAT_AM_PMS:
        syms->getAmPmStrings(count);
        break;

    case UDAT_LOCALIZED_CHARS:
        count = 1;
        break;

    case UDAT_ERA_NAMES:
        syms->getEraNames(count);
        break;

    case UDAT_NARROW_MONTHS:
        syms->getMonths(count, DateFormatSymbols::FORMAT, DateFormatSymbols::NARROW);
        break;

    case UDAT_SHORTER_WEEKDAYS:
        syms->getWeekdays(count, DateFormatSymbols::FORMAT, DateFormatSymbols::SHORT);
        break;

    case UDAT_NARROW_WEEKDAYS:
        syms->getWeekdays(count, DateFormatSymbols::FORMAT, DateFormatSymbols::NARROW);
        break;

    case UDAT_STANDALONE_MONTHS:
        syms->getMonths(count, DateFormatSymbols::STANDALONE, DateFormatSymbols::WIDE);
        break;

    case UDAT_STANDALONE_SHORT_MONTHS:
        syms->getMonths(count, DateFormatSymbols::STANDALONE, DateFormatSymbols::ABBREVIATED);
        break;

    case UDAT_STANDALONE_NARROW_MONTHS:
        syms->getMonths(count, DateFormatSymbols::STANDALONE, DateFormatSymbols::NARROW);
        break;

    case UDAT_STANDALONE_WEEKDAYS:
        syms->getWeekdays(count, DateFormatSymbols::STANDALONE, DateFormatSymbols::WIDE);
        break;

    case UDAT_STANDALONE_SHORT_WEEKDAYS:
        syms->getWeekdays(count, DateFormatSymbols::STANDALONE, DateFormatSymbols::ABBREVIATED);
        break;

    case UDAT_STANDALONE_SHORTER_WEEKDAYS:
        syms->getWeekdays(count, DateFormatSymbols::STANDALONE, DateFormatSymbols::SHORT);
        break;

    case UDAT_STANDALONE_NARROW_WEEKDAYS:
        syms->getWeekdays(count, DateFormatSymbols::STANDALONE, DateFormatSymbols::NARROW);
        break;

    case UDAT_QUARTERS:
        syms->getQuarters(count, DateFormatSymbols::FORMAT, DateFormatSymbols::WIDE);
        break;

    case UDAT_SHORT_QUARTERS:
        syms->getQuarters(count, DateFormatSymbols::FORMAT, DateFormatSymbols::ABBREVIATED);
        break;

    case UDAT_NARROW_QUARTERS:
        syms->getQuarters(count, DateFormatSymbols::FORMAT, DateFormatSymbols::NARROW);
        break;

    case UDAT_STANDALONE_QUARTERS:
        syms->getQuarters(count, DateFormatSymbols::STANDALONE, DateFormatSymbols::WIDE);
        break;

    case UDAT_STANDALONE_SHORT_QUARTERS:
        syms->getQuarters(count, DateFormatSymbols::STANDALONE, DateFormatSymbols::ABBREVIATED);
        break;

    case UDAT_STANDALONE_NARROW_QUARTERS:
        syms->getQuarters(count, DateFormatSymbols::STANDALONE, DateFormatSymbols::NARROW);
        break;

    case UDAT_CYCLIC_YEARS_WIDE:
        syms->getYearNames(count, DateFormatSymbols::FORMAT, DateFormatSymbols::WIDE);
        break;

    case UDAT_CYCLIC_YEARS_ABBREVIATED:
        syms->getYearNames(count, DateFormatSymbols::FORMAT, DateFormatSymbols::ABBREVIATED);
        break;

    case UDAT_CYCLIC_YEARS_NARROW:
        syms->getYearNames(count, DateFormatSymbols::FORMAT, DateFormatSymbols::NARROW);
        break;

    case UDAT_ZODIAC_NAMES_WIDE:
        syms->getZodiacNames(count, DateFormatSymbols::FORMAT, DateFormatSymbols::WIDE);
        break;

    case UDAT_ZODIAC_NAMES_ABBREVIATED:
        syms->getZodiacNames(count, DateFormatSymbols::FORMAT, DateFormatSymbols::ABBREVIATED);
        break;

    case UDAT_ZODIAC_NAMES_NARROW:
        syms->getZodiacNames(count, DateFormatSymbols::FORMAT, DateFormatSymbols::NARROW);
        break;

    }

    return count;
}

U_NAMESPACE_BEGIN

/*
 * This DateFormatSymbolsSingleSetter class is a friend of DateFormatSymbols
 * solely for the purpose of avoiding to clone the array of strings
 * just to modify one of them and then setting all of them back.
* For example, the old code looked like this: * case UDAT_MONTHS: * res = syms->getMonths(count); * array = new UnicodeString[count]; * if(array == 0) { * *status = U_MEMORY_ALLOCATION_ERROR; * return; * } * uprv_arrayCopy(res, array, count); * if(index < count) * array[index] = val; * syms->setMonths(array, count); * break; * * Even worse, the old code actually cloned the entire DateFormatSymbols object, * cloned one value array, changed one value, and then made the SimpleDateFormat * replace its DateFormatSymbols object with the new one. * * markus 2002-oct-14 */ class DateFormatSymbolsSingleSetter /* not : public UObject because all methods are static */ { public: static void setSymbol(UnicodeString *array, int32_t count, int32_t index, const char16_t *value, int32_t valueLength, UErrorCode &errorCode) { if(array!=nullptr) { if(index>=count) { errorCode=U_INDEX_OUTOFBOUNDS_ERROR; } else if(value==nullptr) { errorCode=U_ILLEGAL_ARGUMENT_ERROR; } else { array[index].setTo(value, valueLength); } } } static void setEra(DateFormatSymbols *syms, int32_t index, const char16_t *value, int32_t valueLength, UErrorCode &errorCode) { setSymbol(syms->fEras, syms->fErasCount, index, value, valueLength, errorCode); } static void setEraName(DateFormatSymbols *syms, int32_t index, const char16_t *value, int32_t valueLength, UErrorCode &errorCode) { setSymbol(syms->fEraNames, syms->fEraNamesCount, index, value, valueLength, errorCode); } static void setMonth(DateFormatSymbols *syms, int32_t index, const char16_t *value, int32_t valueLength, UErrorCode &errorCode) { setSymbol(syms->fMonths, syms->fMonthsCount, index, value, valueLength, errorCode); } static void setShortMonth(DateFormatSymbols *syms, int32_t index, const char16_t *value, int32_t valueLength, UErrorCode &errorCode) { setSymbol(syms->fShortMonths, syms->fShortMonthsCount, index, value, valueLength, errorCode); } static void setNarrowMonth(DateFormatSymbols *syms, int32_t index, const char16_t *value, int32_t 
valueLength, UErrorCode &errorCode) { setSymbol(syms->fNarrowMonths, syms->fNarrowMonthsCount, index, value, valueLength, errorCode); } static void setStandaloneMonth(DateFormatSymbols *syms, int32_t index, const char16_t *value, int32_t valueLength, UErrorCode &errorCode) { setSymbol(syms->fStandaloneMonths, syms->fStandaloneMonthsCount, index, value, valueLength, errorCode); } static void setStandaloneShortMonth(DateFormatSymbols *syms, int32_t index, const char16_t *value, int32_t valueLength, UErrorCode &errorCode) { setSymbol(syms->fStandaloneShortMonths, syms->fStandaloneShortMonthsCount, index, value, valueLength, errorCode); } static void setStandaloneNarrowMonth(DateFormatSymbols *syms, int32_t index, const char16_t *value, int32_t valueLength, UErrorCode &errorCode) { setSymbol(syms->fStandaloneNarrowMonths, syms->fStandaloneNarrowMonthsCount, index, value, valueLength, errorCode); } static void setWeekday(DateFormatSymbols *syms, int32_t index, const char16_t *value, int32_t valueLength, UErrorCode &errorCode) { setSymbol(syms->fWeekdays, syms->fWeekdaysCount, index, value, valueLength, errorCode); } static void setShortWeekday(DateFormatSymbols *syms, int32_t index, const char16_t *value, int32_t valueLength, UErrorCode &errorCode) { setSymbol(syms->fShortWeekdays, syms->fShortWeekdaysCount, index, value, valueLength, errorCode); } static void setShorterWeekday(DateFormatSymbols *syms, int32_t index, const char16_t *value, int32_t valueLength, UErrorCode &errorCode) { setSymbol(syms->fShorterWeekdays, syms->fShorterWeekdaysCount, index, value, valueLength, errorCode); } static void setNarrowWeekday(DateFormatSymbols *syms, int32_t index, const char16_t *value, int32_t valueLength, UErrorCode &errorCode) { setSymbol(syms->fNarrowWeekdays, syms->fNarrowWeekdaysCount, index, value, valueLength, errorCode); } static void setStandaloneWeekday(DateFormatSymbols *syms, int32_t index, const char16_t *value, int32_t valueLength, UErrorCode &errorCode) { 
setSymbol(syms->fStandaloneWeekdays, syms->fStandaloneWeekdaysCount, index, value, valueLength, errorCode); } static void setStandaloneShortWeekday(DateFormatSymbols *syms, int32_t index, const char16_t *value, int32_t valueLength, UErrorCode &errorCode) { setSymbol(syms->fStandaloneShortWeekdays, syms->fStandaloneShortWeekdaysCount, index, value, valueLength, errorCode); } static void setStandaloneShorterWeekday(DateFormatSymbols *syms, int32_t index, const char16_t *value, int32_t valueLength, UErrorCode &errorCode) { setSymbol(syms->fStandaloneShorterWeekdays, syms->fStandaloneShorterWeekdaysCount, index, value, valueLength, errorCode); } static void setStandaloneNarrowWeekday(DateFormatSymbols *syms, int32_t index, const char16_t *value, int32_t valueLength, UErrorCode &errorCode) { setSymbol(syms->fStandaloneNarrowWeekdays, syms->fStandaloneNarrowWeekdaysCount, index, value, valueLength, errorCode); } static void setQuarter(DateFormatSymbols *syms, int32_t index, const char16_t *value, int32_t valueLength, UErrorCode &errorCode) { setSymbol(syms->fQuarters, syms->fQuartersCount, index, value, valueLength, errorCode); } static void setShortQuarter(DateFormatSymbols *syms, int32_t index, const char16_t *value, int32_t valueLength, UErrorCode &errorCode) { setSymbol(syms->fShortQuarters, syms->fShortQuartersCount, index, value, valueLength, errorCode); } static void setNarrowQuarter(DateFormatSymbols *syms, int32_t index, const char16_t *value, int32_t valueLength, UErrorCode &errorCode) { setSymbol(syms->fNarrowQuarters, syms->fNarrowQuartersCount, index, value, valueLength, errorCode); } static void setStandaloneQuarter(DateFormatSymbols *syms, int32_t index, const char16_t *value, int32_t valueLength, UErrorCode &errorCode) { setSymbol(syms->fStandaloneQuarters, syms->fStandaloneQuartersCount, index, value, valueLength, errorCode); } static void setStandaloneShortQuarter(DateFormatSymbols *syms, int32_t index, const char16_t *value, int32_t valueLength, 
UErrorCode &errorCode) { setSymbol(syms->fStandaloneShortQuarters, syms->fStandaloneShortQuartersCount, index, value, valueLength, errorCode); } static void setStandaloneNarrowQuarter(DateFormatSymbols *syms, int32_t index, const char16_t *value, int32_t valueLength, UErrorCode &errorCode) { setSymbol(syms->fStandaloneNarrowQuarters, syms->fStandaloneNarrowQuartersCount, index, value, valueLength, errorCode); } static void setShortYearNames(DateFormatSymbols *syms, int32_t index, const char16_t *value, int32_t valueLength, UErrorCode &errorCode) { setSymbol(syms->fShortYearNames, syms->fShortYearNamesCount, index, value, valueLength, errorCode); } static void setShortZodiacNames(DateFormatSymbols *syms, int32_t index, const char16_t *value, int32_t valueLength, UErrorCode &errorCode) { setSymbol(syms->fShortZodiacNames, syms->fShortZodiacNamesCount, index, value, valueLength, errorCode); } static void setAmPm(DateFormatSymbols *syms, int32_t index, const char16_t *value, int32_t valueLength, UErrorCode &errorCode) { setSymbol(syms->fAmPms, syms->fAmPmsCount, index, value, valueLength, errorCode); } static void setLocalPatternChars(DateFormatSymbols *syms, const char16_t *value, int32_t valueLength, UErrorCode &errorCode) { setSymbol(&syms->fLocalPatternChars, 1, 0, value, valueLength, errorCode); } }; U_NAMESPACE_END U_CAPI void U_EXPORT2 udat_setSymbols( UDateFormat *format, UDateFormatSymbolType type, int32_t index, char16_t *value, int32_t valueLength, UErrorCode *status) { verifyIsSimpleDateFormat(format, status); if(U_FAILURE(*status)) return; DateFormatSymbols *syms = (DateFormatSymbols *)((SimpleDateFormat *)format)->getDateFormatSymbols(); switch(type) { case UDAT_ERAS: DateFormatSymbolsSingleSetter::setEra(syms, index, value, valueLength, *status); break; case UDAT_ERA_NAMES: DateFormatSymbolsSingleSetter::setEraName(syms, index, value, valueLength, *status); break; case UDAT_MONTHS: DateFormatSymbolsSingleSetter::setMonth(syms, index, value, valueLength, 
*status); break; case UDAT_SHORT_MONTHS: DateFormatSymbolsSingleSetter::setShortMonth(syms, index, value, valueLength, *status); break; case UDAT_NARROW_MONTHS: DateFormatSymbolsSingleSetter::setNarrowMonth(syms, index, value, valueLength, *status); break; case UDAT_STANDALONE_MONTHS: DateFormatSymbolsSingleSetter::setStandaloneMonth(syms, index, value, valueLength, *status); break; case UDAT_STANDALONE_SHORT_MONTHS: DateFormatSymbolsSingleSetter::setStandaloneShortMonth(syms, index, value, valueLength, *status); break; case UDAT_STANDALONE_NARROW_MONTHS: DateFormatSymbolsSingleSetter::setStandaloneNarrowMonth(syms, index, value, valueLength, *status); break; case UDAT_WEEKDAYS: DateFormatSymbolsSingleSetter::setWeekday(syms, index, value, valueLength, *status); break; case UDAT_SHORT_WEEKDAYS: DateFormatSymbolsSingleSetter::setShortWeekday(syms, index, value, valueLength, *status); break; case UDAT_SHORTER_WEEKDAYS: DateFormatSymbolsSingleSetter::setShorterWeekday(syms, index, value, valueLength, *status); break; case UDAT_NARROW_WEEKDAYS: DateFormatSymbolsSingleSetter::setNarrowWeekday(syms, index, value, valueLength, *status); break; case UDAT_STANDALONE_WEEKDAYS: DateFormatSymbolsSingleSetter::setStandaloneWeekday(syms, index, value, valueLength, *status); break; case UDAT_STANDALONE_SHORT_WEEKDAYS: DateFormatSymbolsSingleSetter::setStandaloneShortWeekday(syms, index, value, valueLength, *status); break; case UDAT_STANDALONE_SHORTER_WEEKDAYS: DateFormatSymbolsSingleSetter::setStandaloneShorterWeekday(syms, index, value, valueLength, *status); break; case UDAT_STANDALONE_NARROW_WEEKDAYS: DateFormatSymbolsSingleSetter::setStandaloneNarrowWeekday(syms, index, value, valueLength, *status); break; case UDAT_QUARTERS: DateFormatSymbolsSingleSetter::setQuarter(syms, index, value, valueLength, *status); break; case UDAT_SHORT_QUARTERS: DateFormatSymbolsSingleSetter::setShortQuarter(syms, index, value, valueLength, *status); break; case UDAT_NARROW_QUARTERS: 
DateFormatSymbolsSingleSetter::setNarrowQuarter(syms, index, value, valueLength, *status); break; case UDAT_STANDALONE_QUARTERS: DateFormatSymbolsSingleSetter::setStandaloneQuarter(syms, index, value, valueLength, *status); break; case UDAT_STANDALONE_SHORT_QUARTERS: DateFormatSymbolsSingleSetter::setStandaloneShortQuarter(syms, index, value, valueLength, *status); break; case UDAT_STANDALONE_NARROW_QUARTERS: DateFormatSymbolsSingleSetter::setStandaloneNarrowQuarter(syms, index, value, valueLength, *status); break; case UDAT_CYCLIC_YEARS_ABBREVIATED: DateFormatSymbolsSingleSetter::setShortYearNames(syms, index, value, valueLength, *status); break; case UDAT_ZODIAC_NAMES_ABBREVIATED: DateFormatSymbolsSingleSetter::setShortZodiacNames(syms, index, value, valueLength, *status); break; case UDAT_AM_PMS: DateFormatSymbolsSingleSetter::setAmPm(syms, index, value, valueLength, *status); break; case UDAT_LOCALIZED_CHARS: DateFormatSymbolsSingleSetter::setLocalPatternChars(syms, value, valueLength, *status); break; default: *status = U_UNSUPPORTED_ERROR; break; } } U_CAPI const char* U_EXPORT2 udat_getLocaleByType(const UDateFormat *fmt, ULocDataLocaleType type, UErrorCode* status) { if (fmt == nullptr) { if (U_SUCCESS(*status)) { *status = U_ILLEGAL_ARGUMENT_ERROR; } return nullptr; } return ((Format*)fmt)->getLocaleID(type, *status); } U_CAPI void U_EXPORT2 udat_setContext(UDateFormat* fmt, UDisplayContext value, UErrorCode* status) { if (U_FAILURE(*status)) { return; } ((DateFormat*)fmt)->setContext(value, *status); return; } U_CAPI UDisplayContext U_EXPORT2 udat_getContext(const UDateFormat* fmt, UDisplayContextType type, UErrorCode* status) { if (U_FAILURE(*status)) { return (UDisplayContext)0; } return ((const DateFormat*)fmt)->getContext(type, *status); } /** * Verify that fmt is a RelativeDateFormat. Invalid error if not. 
* @param fmt the UDateFormat, definitely a DateFormat, maybe something else * @param status error code, will be set to failure if there is a failure or the fmt is nullptr. */ static void verifyIsRelativeDateFormat(const UDateFormat* fmt, UErrorCode *status) { if(U_SUCCESS(*status) && dynamic_cast(reinterpret_cast(fmt))==nullptr) { *status = U_ILLEGAL_ARGUMENT_ERROR; } } U_CAPI int32_t U_EXPORT2 udat_toPatternRelativeDate(const UDateFormat *fmt, char16_t *result, int32_t resultLength, UErrorCode *status) { verifyIsRelativeDateFormat(fmt, status); if(U_FAILURE(*status)) { return -1; } if (result == nullptr ? resultLength != 0 : resultLength < 0) { *status = U_ILLEGAL_ARGUMENT_ERROR; return -1; } UnicodeString datePattern; if (result != nullptr) { // nullptr destination for pure preflighting: empty dummy string // otherwise, alias the destination buffer datePattern.setTo(result, 0, resultLength); } ((RelativeDateFormat*)fmt)->toPatternDate(datePattern, *status); return datePattern.extract(result, resultLength, *status); } U_CAPI int32_t U_EXPORT2 udat_toPatternRelativeTime(const UDateFormat *fmt, char16_t *result, int32_t resultLength, UErrorCode *status) { verifyIsRelativeDateFormat(fmt, status); if(U_FAILURE(*status)) { return -1; } if (result == nullptr ? 
resultLength != 0 : resultLength < 0) { *status = U_ILLEGAL_ARGUMENT_ERROR; return -1; } UnicodeString timePattern; if (result != nullptr) { // nullptr destination for pure preflighting: empty dummy string // otherwise, alias the destination buffer timePattern.setTo(result, 0, resultLength); } ((RelativeDateFormat*)fmt)->toPatternTime(timePattern, *status); return timePattern.extract(result, resultLength, *status); } U_CAPI void U_EXPORT2 udat_applyPatternRelative(UDateFormat *format, const char16_t *datePattern, int32_t datePatternLength, const char16_t *timePattern, int32_t timePatternLength, UErrorCode *status) { verifyIsRelativeDateFormat(format, status); if(U_FAILURE(*status)) return; const UnicodeString datePat((UBool)(datePatternLength == -1), datePattern, datePatternLength); const UnicodeString timePat((UBool)(timePatternLength == -1), timePattern, timePatternLength); ((RelativeDateFormat*)format)->applyPatterns(datePat, timePat, *status); } #endif /* #if !UCONFIG_NO_FORMATTING */ stringi/src/icu74/i18n/rbt_pars.cpp0000644000176200001440000017627014700200761016564 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ********************************************************************** * Copyright (C) 1999-2016, International Business Machines * Corporation and others. All Rights Reserved. ********************************************************************** * Date Name Description * 11/17/99 aliu Creation. 
**********************************************************************
*/

#include "unicode/utypes.h"

#if !UCONFIG_NO_TRANSLITERATION

#include "unicode/uobject.h"
#include "unicode/parseerr.h"
#include "unicode/parsepos.h"
#include "unicode/putil.h"
#include "unicode/uchar.h"
#include "unicode/ustring.h"
#include "unicode/uniset.h"
#include "unicode/utf16.h"
#include "cstring.h"
#include "funcrepl.h"
#include "hash.h"
#include "quant.h"
#include "rbt.h"
#include "rbt_data.h"
#include "rbt_pars.h"
#include "rbt_rule.h"
#include "strmatch.h"
#include "strrepl.h"
#include "unicode/symtable.h"
#include "tridpars.h"
#include "uvector.h"
#include "hash.h"
#include "patternprops.h"
#include "util.h"
#include "cmemory.h"
#include "uprops.h"
#include "putilimp.h"

// Operators
#define VARIABLE_DEF_OP ((char16_t)0x003D) /*=*/
#define FORWARD_RULE_OP ((char16_t)0x003E) /*>*/
#define REVERSE_RULE_OP ((char16_t)0x003C) /*<*/
#define FWDREV_RULE_OP  ((char16_t)0x007E) /*~*/ // internal rep of <> op

// Other special characters
#define QUOTE             ((char16_t)0x0027) /*'*/
#define ESCAPE            ((char16_t)0x005C) /*\*/
#define END_OF_RULE       ((char16_t)0x003B) /*;*/
#define RULE_COMMENT_CHAR ((char16_t)0x0023) /*#*/

#define SEGMENT_OPEN       ((char16_t)0x0028) /*(*/
#define SEGMENT_CLOSE      ((char16_t)0x0029) /*)*/
#define CONTEXT_ANTE       ((char16_t)0x007B) /*{*/
#define CONTEXT_POST       ((char16_t)0x007D) /*}*/
#define CURSOR_POS         ((char16_t)0x007C) /*|*/
#define CURSOR_OFFSET      ((char16_t)0x0040) /*@*/
#define ANCHOR_START       ((char16_t)0x005E) /*^*/
#define KLEENE_STAR        ((char16_t)0x002A) /***/
#define ONE_OR_MORE        ((char16_t)0x002B) /*+*/
#define ZERO_OR_ONE        ((char16_t)0x003F) /*?*/

#define DOT                ((char16_t)46)     /*.*/

// NUL-terminated set pattern, spelled as decimal code units.
static const char16_t DOT_SET[] = { // "[^[:Zp:][:Zl:]\r\n$]";
    91, 94, 91, 58, 90, 112, 58, 93, 91, 58, 90,
    108, 58, 93, 92, 114, 92, 110, 36, 93, 0
};

// A function is denoted &Source-Target/Variant(text)
#define FUNCTION           ((char16_t)38)     /*&*/

// Aliases for some of the syntax characters. These are provided so
// transliteration rules can be expressed in XML without clashing with
// XML syntax characters '<', '>', and '&'.
#define ALT_REVERSE_RULE_OP ((char16_t)0x2190) // Left Arrow
#define ALT_FORWARD_RULE_OP ((char16_t)0x2192) // Right Arrow
#define ALT_FWDREV_RULE_OP  ((char16_t)0x2194) // Left Right Arrow
#define ALT_FUNCTION        ((char16_t)0x2206) // Increment (~Greek Capital Delta)

// Special characters disallowed at the top level
static const char16_t ILLEGAL_TOP[] = {41,0}; // ")"

// Special characters disallowed within a segment
static const char16_t ILLEGAL_SEG[] = {123,125,124,64,0}; // "{}|@"

// Special characters disallowed within a function argument
static const char16_t ILLEGAL_FUNC[] = {94,40,46,42,43,63,123,125,124,64,0}; // "^(.*+?{}|@"

// By definition, the ANCHOR_END special character is a
// trailing SymbolTable.SYMBOL_REF character.
// private static final char ANCHOR_END       = '$';

// NUL-terminated list of operator characters: "=><" followed by the three
// ALT_* arrow aliases defined above.
static const char16_t gOPERATORS[] = { // "=><"
    VARIABLE_DEF_OP, FORWARD_RULE_OP, REVERSE_RULE_OP,
    ALT_FORWARD_RULE_OP, ALT_REVERSE_RULE_OP, ALT_FWDREV_RULE_OP,
    0
};

// Same characters as gOPERATORS plus END_OF_RULE (';'), NUL-terminated.
static const char16_t HALF_ENDERS[] = { // "=><;"
    VARIABLE_DEF_OP, FORWARD_RULE_OP, REVERSE_RULE_OP,
    ALT_FORWARD_RULE_OP, ALT_REVERSE_RULE_OP, ALT_FWDREV_RULE_OP,
    END_OF_RULE,
    0
};

// These are also used in Transliterator::toRules()
static const int32_t ID_TOKEN_LEN = 2;
static const char16_t   ID_TOKEN[]   = { 0x3A, 0x3A }; // ':', ':'

/*
commented out until we do real ::BEGIN/::END functionality
static const int32_t BEGIN_TOKEN_LEN = 5;
static const char16_t BEGIN_TOKEN[] = { 0x42, 0x45, 0x47, 0x49, 0x4e }; // 'BEGIN'

static const int32_t END_TOKEN_LEN = 3;
static const char16_t END_TOKEN[] = { 0x45, 0x4e, 0x44 }; // 'END'
*/

U_NAMESPACE_BEGIN

//----------------------------------------------------------------------
// BEGIN ParseData
//----------------------------------------------------------------------

/**
 * This class implements the SymbolTable interface.
It is used * during parsing to give UnicodeSet access to variables that * have been defined so far. Note that it uses variablesVector, * _not_ data.setVariables. */ class ParseData : public UMemory, public SymbolTable { public: const TransliterationRuleData* data; // alias const UVector* variablesVector; // alias const Hashtable* variableNames; // alias ParseData(const TransliterationRuleData* data = 0, const UVector* variablesVector = 0, const Hashtable* variableNames = 0); virtual ~ParseData(); virtual const UnicodeString* lookup(const UnicodeString& s) const override; virtual const UnicodeFunctor* lookupMatcher(UChar32 ch) const override; virtual UnicodeString parseReference(const UnicodeString& text, ParsePosition& pos, int32_t limit) const override; /** * Return true if the given character is a matcher standin or a plain * character (non standin). */ UBool isMatcher(UChar32 ch); /** * Return true if the given character is a replacer standin or a plain * character (non standin). */ UBool isReplacer(UChar32 ch); private: ParseData(const ParseData &other); // forbid copying of this class ParseData &operator=(const ParseData &other); // forbid copying of this class }; ParseData::ParseData(const TransliterationRuleData* d, const UVector* sets, const Hashtable* vNames) : data(d), variablesVector(sets), variableNames(vNames) {} ParseData::~ParseData() {} /** * Implement SymbolTable API. */ const UnicodeString* ParseData::lookup(const UnicodeString& name) const { return (const UnicodeString*) variableNames->get(name); } /** * Implement SymbolTable API. */ const UnicodeFunctor* ParseData::lookupMatcher(UChar32 ch) const { // Note that we cannot use data.lookupSet() because the // set array has not been constructed yet. const UnicodeFunctor* set = nullptr; int32_t i = ch - data->variablesBase; if (i >= 0 && i < variablesVector->size()) { int32_t j = ch - data->variablesBase; set = (j < variablesVector->size()) ? 
(UnicodeFunctor*) variablesVector->elementAt(j) : 0; } return set; } /** * Implement SymbolTable API. Parse out a symbol reference * name. */ UnicodeString ParseData::parseReference(const UnicodeString& text, ParsePosition& pos, int32_t limit) const { int32_t start = pos.getIndex(); int32_t i = start; UnicodeString result; while (i < limit) { char16_t c = text.charAt(i); if ((i==start && !u_isIDStart(c)) || !u_isIDPart(c)) { break; } ++i; } if (i == start) { // No valid name chars return result; // Indicate failure with empty string } pos.setIndex(i); text.extractBetween(start, i, result); return result; } UBool ParseData::isMatcher(UChar32 ch) { // Note that we cannot use data.lookup() because the // set array has not been constructed yet. int32_t i = ch - data->variablesBase; if (i >= 0 && i < variablesVector->size()) { UnicodeFunctor *f = (UnicodeFunctor*) variablesVector->elementAt(i); return f != nullptr && f->toMatcher() != nullptr; } return true; } /** * Return true if the given character is a replacer standin or a plain * character (non standin). */ UBool ParseData::isReplacer(UChar32 ch) { // Note that we cannot use data.lookup() because the // set array has not been constructed yet. int i = ch - data->variablesBase; if (i >= 0 && i < variablesVector->size()) { UnicodeFunctor *f = (UnicodeFunctor*) variablesVector->elementAt(i); return f != nullptr && f->toReplacer() != nullptr; } return true; } //---------------------------------------------------------------------- // BEGIN RuleHalf //---------------------------------------------------------------------- /** * A class representing one side of a rule. This class knows how to * parse half of a rule. It is tightly coupled to the method * RuleBasedTransliterator.Parser.parseRule(). 
*/ class RuleHalf : public UMemory { public: UnicodeString text; int32_t cursor; // position of cursor in text int32_t ante; // position of ante context marker '{' in text int32_t post; // position of post context marker '}' in text // Record the offset to the cursor either to the left or to the // right of the key. This is indicated by characters on the output // side that allow the cursor to be positioned arbitrarily within // the matching text. For example, abc{def} > | @@@ xyz; changes // def to xyz and moves the cursor to before abc. Offset characters // must be at the start or end, and they cannot move the cursor past // the ante- or postcontext text. Placeholders are only valid in // output text. The length of the ante and post context is // determined at runtime, because of supplementals and quantifiers. int32_t cursorOffset; // only nonzero on output side // Position of first CURSOR_OFFSET on _right_. This will be -1 // for |@, -2 for |@@, etc., and 1 for @|, 2 for @@|, etc. int32_t cursorOffsetPos; UBool anchorStart; UBool anchorEnd; /** * The segment number from 1..n of the next '(' we see * during parsing; 1-based. */ int32_t nextSegmentNumber; TransliteratorParser& parser; //-------------------------------------------------- // Methods RuleHalf(TransliteratorParser& parser); ~RuleHalf(); int32_t parse(const UnicodeString& rule, int32_t pos, int32_t limit, UErrorCode& status); int32_t parseSection(const UnicodeString& rule, int32_t pos, int32_t limit, UnicodeString& buf, const UnicodeString& illegal, UBool isSegment, UErrorCode& status); /** * Remove context. */ void removeContext(); /** * Return true if this half looks like valid output, that is, does not * contain quantifiers or other special input-only elements. */ UBool isValidOutput(TransliteratorParser& parser); /** * Return true if this half looks like valid input, that is, does not * contain functions or other special output-only elements. 
*/ UBool isValidInput(TransliteratorParser& parser); int syntaxError(UErrorCode code, const UnicodeString& rule, int32_t start, UErrorCode& status) { return parser.syntaxError(code, rule, start, status); } private: // Disallowed methods; no impl. RuleHalf(const RuleHalf&); RuleHalf& operator=(const RuleHalf&); }; RuleHalf::RuleHalf(TransliteratorParser& p) : parser(p) { cursor = -1; ante = -1; post = -1; cursorOffset = 0; cursorOffsetPos = 0; anchorStart = anchorEnd = false; nextSegmentNumber = 1; } RuleHalf::~RuleHalf() { } /** * Parse one side of a rule, stopping at either the limit, * the END_OF_RULE character, or an operator. * @return the index after the terminating character, or * if limit was reached, limit */ int32_t RuleHalf::parse(const UnicodeString& rule, int32_t pos, int32_t limit, UErrorCode& status) { int32_t start = pos; text.truncate(0); pos = parseSection(rule, pos, limit, text, UnicodeString(true, ILLEGAL_TOP, -1), false, status); if (cursorOffset > 0 && cursor != cursorOffsetPos) { return syntaxError(U_MISPLACED_CURSOR_OFFSET, rule, start, status); } return pos; } /** * Parse a section of one side of a rule, stopping at either * the limit, the END_OF_RULE character, an operator, or a * segment close character. This method parses both a * top-level rule half and a segment within such a rule half. * It calls itself recursively to parse segments and nested * segments. * @param buf buffer into which to accumulate the rule pattern * characters, either literal characters from the rule or * standins for UnicodeMatcher objects including segments. * @param illegal the set of special characters that is illegal during * this parse. * @param isSegment if true, then we've already seen a '(' and * pos on entry points right after it. Accumulate everything * up to the closing ')', put it in a segment matcher object, * generate a standin for it, and add the standin to buf. As * a side effect, update the segments vector with a reference * to the segment matcher. 
This works recursively for nested * segments. If isSegment is false, just accumulate * characters into buf. * @return the index after the terminating character, or * if limit was reached, limit */ int32_t RuleHalf::parseSection(const UnicodeString& rule, int32_t pos, int32_t limit, UnicodeString& buf, const UnicodeString& illegal, UBool isSegment, UErrorCode& status) { int32_t start = pos; ParsePosition pp; UnicodeString scratch; UBool done = false; int32_t quoteStart = -1; // Most recent 'single quoted string' int32_t quoteLimit = -1; int32_t varStart = -1; // Most recent $variableReference int32_t varLimit = -1; int32_t bufStart = buf.length(); while (pos < limit && !done) { // Since all syntax characters are in the BMP, fetching // 16-bit code units suffices here. char16_t c = rule.charAt(pos++); if (PatternProps::isWhiteSpace(c)) { // Ignore whitespace. Note that this is not Unicode // spaces, but Java spaces -- a subset, representing // whitespace likely to be seen in code. continue; } if (u_strchr(HALF_ENDERS, c) != nullptr) { if (isSegment) { // Unclosed segment return syntaxError(U_UNCLOSED_SEGMENT, rule, start, status); } break; } if (anchorEnd) { // Text after a presumed end anchor is a syntax err return syntaxError(U_MALFORMED_VARIABLE_REFERENCE, rule, start, status); } if (UnicodeSet::resemblesPattern(rule, pos-1)) { pp.setIndex(pos-1); // Backup to opening '[' buf.append(parser.parseSet(rule, pp, status)); if (U_FAILURE(status)) { return syntaxError(U_MALFORMED_SET, rule, start, status); } pos = pp.getIndex(); continue; } // Handle escapes if (c == ESCAPE) { if (pos == limit) { return syntaxError(U_TRAILING_BACKSLASH, rule, start, status); } UChar32 escaped = rule.unescapeAt(pos); // pos is already past '\\' if (escaped == (UChar32) -1) { return syntaxError(U_MALFORMED_UNICODE_ESCAPE, rule, start, status); } if (!parser.checkVariableRange(escaped)) { return syntaxError(U_VARIABLE_RANGE_OVERLAP, rule, start, status); } buf.append(escaped); continue; } 
// Handle quoted matter if (c == QUOTE) { int32_t iq = rule.indexOf(QUOTE, pos); if (iq == pos) { buf.append(c); // Parse [''] outside quotes as ['] ++pos; } else { /* This loop picks up a run of quoted text of the * form 'aaaa' each time through. If this run * hasn't really ended ('aaaa''bbbb') then it keeps * looping, each time adding on a new run. When it * reaches the final quote it breaks. */ quoteStart = buf.length(); for (;;) { if (iq < 0) { return syntaxError(U_UNTERMINATED_QUOTE, rule, start, status); } scratch.truncate(0); rule.extractBetween(pos, iq, scratch); buf.append(scratch); pos = iq+1; if (pos < limit && rule.charAt(pos) == QUOTE) { // Parse [''] inside quotes as ['] iq = rule.indexOf(QUOTE, pos+1); // Continue looping } else { break; } } quoteLimit = buf.length(); for (iq=quoteStart; iq= 0) { syntaxError(U_ILLEGAL_CHARACTER, rule, start, status); } switch (c) { //------------------------------------------------------ // Elements allowed within and out of segments //------------------------------------------------------ case ANCHOR_START: if (buf.length() == 0 && !anchorStart) { anchorStart = true; } else { return syntaxError(U_MISPLACED_ANCHOR_START, rule, start, status); } break; case SEGMENT_OPEN: { // bufSegStart is the offset in buf to the first // character of the segment we are parsing. int32_t bufSegStart = buf.length(); // Record segment number now, since nextSegmentNumber // will be incremented during the call to parseSection // if there are nested segments. int32_t segmentNumber = nextSegmentNumber++; // 1-based // Parse the segment pos = parseSection(rule, pos, limit, buf, UnicodeString(true, ILLEGAL_SEG, -1), true, status); // After parsing a segment, the relevant characters are // in buf, starting at offset bufSegStart. Extract them // into a string matcher, and replace them with a // standin for that matcher. 
StringMatcher* m = new StringMatcher(buf, bufSegStart, buf.length(), segmentNumber, *parser.curData); if (m == nullptr) { return syntaxError(U_MEMORY_ALLOCATION_ERROR, rule, start, status); } // Record and associate object and segment number parser.setSegmentObject(segmentNumber, m, status); buf.truncate(bufSegStart); buf.append(parser.getSegmentStandin(segmentNumber, status)); } break; case FUNCTION: case ALT_FUNCTION: { int32_t iref = pos; TransliteratorIDParser::SingleID* single = TransliteratorIDParser::parseFilterID(rule, iref); // The next character MUST be a segment open if (single == nullptr || !ICU_Utility::parseChar(rule, iref, SEGMENT_OPEN)) { return syntaxError(U_INVALID_FUNCTION, rule, start, status); } Transliterator *t = single->createInstance(); delete single; if (t == nullptr) { return syntaxError(U_INVALID_FUNCTION, rule, start, status); } // bufSegStart is the offset in buf to the first // character of the segment we are parsing. int32_t bufSegStart = buf.length(); // Parse the segment pos = parseSection(rule, iref, limit, buf, UnicodeString(true, ILLEGAL_FUNC, -1), true, status); // After parsing a segment, the relevant characters are // in buf, starting at offset bufSegStart. UnicodeString output; buf.extractBetween(bufSegStart, buf.length(), output); FunctionReplacer *r = new FunctionReplacer(t, new StringReplacer(output, parser.curData)); if (r == nullptr) { return syntaxError(U_MEMORY_ALLOCATION_ERROR, rule, start, status); } // Replace the buffer contents with a stand-in buf.truncate(bufSegStart); buf.append(parser.generateStandInFor(r, status)); } break; case SymbolTable::SYMBOL_REF: // Handle variable references and segment references "$1" .. "$9" { // A variable reference must be followed immediately // by a Unicode identifier start and zero or more // Unicode identifier part characters, or by a digit // 1..9 if it is a segment reference. 
if (pos == limit) { // A variable ref character at the end acts as // an anchor to the context limit, as in perl. anchorEnd = true; break; } // Parse "$1" "$2" .. "$9" .. (no upper limit) c = rule.charAt(pos); int32_t r = u_digit(c, 10); if (r >= 1 && r <= 9) { r = ICU_Utility::parseNumber(rule, pos, 10); if (r < 0) { return syntaxError(U_UNDEFINED_SEGMENT_REFERENCE, rule, start, status); } buf.append(parser.getSegmentStandin(r, status)); } else { pp.setIndex(pos); UnicodeString name = parser.parseData-> parseReference(rule, pp, limit); if (name.length() == 0) { // This means the '$' was not followed by a // valid name. Try to interpret it as an // end anchor then. If this also doesn't work // (if we see a following character) then signal // an error. anchorEnd = true; break; } pos = pp.getIndex(); // If this is a variable definition statement, // then the LHS variable will be undefined. In // that case appendVariableDef() will append the // special placeholder char variableLimit-1. varStart = buf.length(); parser.appendVariableDef(name, buf, status); varLimit = buf.length(); } } break; case DOT: buf.append(parser.getDotStandIn(status)); break; case KLEENE_STAR: case ONE_OR_MORE: case ZERO_OR_ONE: // Quantifiers. We handle single characters, quoted strings, // variable references, and segments. 
// a+ matches aaa // 'foo'+ matches foofoofoo // $v+ matches xyxyxy if $v == xy // (seg)+ matches segsegseg { if (isSegment && buf.length() == bufStart) { // The */+ immediately follows '(' return syntaxError(U_MISPLACED_QUANTIFIER, rule, start, status); } int32_t qstart, qlimit; // The */+ follows an isolated character or quote // or variable reference if (buf.length() == quoteLimit) { // The */+ follows a 'quoted string' qstart = quoteStart; qlimit = quoteLimit; } else if (buf.length() == varLimit) { // The */+ follows a $variableReference qstart = varStart; qlimit = varLimit; } else { // The */+ follows a single character, possibly // a segment standin qstart = buf.length() - 1; qlimit = qstart + 1; } UnicodeFunctor *m = new StringMatcher(buf, qstart, qlimit, 0, *parser.curData); if (m == nullptr) { return syntaxError(U_MEMORY_ALLOCATION_ERROR, rule, start, status); } int32_t min = 0; int32_t max = Quantifier::MAX; switch (c) { case ONE_OR_MORE: min = 1; break; case ZERO_OR_ONE: min = 0; max = 1; break; // case KLEENE_STAR: // do nothing -- min, max already set } m = new Quantifier(m, min, max); if (m == nullptr) { return syntaxError(U_MEMORY_ALLOCATION_ERROR, rule, start, status); } buf.truncate(qstart); buf.append(parser.generateStandInFor(m, status)); } break; //------------------------------------------------------ // Elements allowed ONLY WITHIN segments //------------------------------------------------------ case SEGMENT_CLOSE: // assert(isSegment); // We're done parsing a segment. 
done = true; break; //------------------------------------------------------ // Elements allowed ONLY OUTSIDE segments //------------------------------------------------------ case CONTEXT_ANTE: if (ante >= 0) { return syntaxError(U_MULTIPLE_ANTE_CONTEXTS, rule, start, status); } ante = buf.length(); break; case CONTEXT_POST: if (post >= 0) { return syntaxError(U_MULTIPLE_POST_CONTEXTS, rule, start, status); } post = buf.length(); break; case CURSOR_POS: if (cursor >= 0) { return syntaxError(U_MULTIPLE_CURSORS, rule, start, status); } cursor = buf.length(); break; case CURSOR_OFFSET: if (cursorOffset < 0) { if (buf.length() > 0) { return syntaxError(U_MISPLACED_CURSOR_OFFSET, rule, start, status); } --cursorOffset; } else if (cursorOffset > 0) { if (buf.length() != cursorOffsetPos || cursor >= 0) { return syntaxError(U_MISPLACED_CURSOR_OFFSET, rule, start, status); } ++cursorOffset; } else { if (cursor == 0 && buf.length() == 0) { cursorOffset = -1; } else if (cursor < 0) { cursorOffsetPos = buf.length(); cursorOffset = 1; } else { return syntaxError(U_MISPLACED_CURSOR_OFFSET, rule, start, status); } } break; //------------------------------------------------------ // Non-special characters //------------------------------------------------------ default: // Disallow unquoted characters other than [0-9A-Za-z] // in the printable ASCII range. These characters are // reserved for possible future use. if (c >= 0x0021 && c <= 0x007E && !((c >= 0x0030/*'0'*/ && c <= 0x0039/*'9'*/) || (c >= 0x0041/*'A'*/ && c <= 0x005A/*'Z'*/) || (c >= 0x0061/*'a'*/ && c <= 0x007A/*'z'*/))) { return syntaxError(U_UNQUOTED_SPECIAL, rule, start, status); } buf.append(c); break; } } return pos; } /** * Remove context. */ void RuleHalf::removeContext() { //text = text.substring(ante < 0 ? 0 : ante, // post < 0 ? 
text.length() : post); if (post >= 0) { text.remove(post); } if (ante >= 0) { text.removeBetween(0, ante); } ante = post = -1; anchorStart = anchorEnd = false; } /** * Return true if this half looks like valid output, that is, does not * contain quantifiers or other special input-only elements. */ UBool RuleHalf::isValidOutput(TransliteratorParser& transParser) { for (int32_t i=0; iisReplacer(c)) { return false; } } return true; } /** * Return true if this half looks like valid input, that is, does not * contain functions or other special output-only elements. */ UBool RuleHalf::isValidInput(TransliteratorParser& transParser) { for (int32_t i=0; iisMatcher(c)) { return false; } } return true; } //---------------------------------------------------------------------- // PUBLIC API //---------------------------------------------------------------------- /** * Constructor. */ TransliteratorParser::TransliteratorParser(UErrorCode &statusReturn) : dataVector(statusReturn), idBlockVector(statusReturn), variablesVector(statusReturn), segmentObjects(statusReturn) { idBlockVector.setDeleter(uprv_deleteUObject); curData = nullptr; compoundFilter = nullptr; parseData = nullptr; variableNames.setValueDeleter(uprv_deleteUObject); } /** * Destructor. */ TransliteratorParser::~TransliteratorParser() { while (!dataVector.isEmpty()) delete (TransliterationRuleData*)(dataVector.orphanElementAt(0)); delete compoundFilter; delete parseData; while (!variablesVector.isEmpty()) delete (UnicodeFunctor*)variablesVector.orphanElementAt(0); } void TransliteratorParser::parse(const UnicodeString& rules, UTransDirection transDirection, UParseError& pe, UErrorCode& ec) { if (U_SUCCESS(ec)) { parseRules(rules, transDirection, ec); pe = parseError; } } /** * Return the compound filter parsed by parse(). Caller owns result. 
*/ UnicodeSet* TransliteratorParser::orphanCompoundFilter() { UnicodeSet* f = compoundFilter; compoundFilter = nullptr; return f; } //---------------------------------------------------------------------- // Private implementation //---------------------------------------------------------------------- /** * Parse the given string as a sequence of rules, separated by newline * characters ('\n'), and cause this object to implement those rules. Any * previous rules are discarded. Typically this method is called exactly * once, during construction. * @exception IllegalArgumentException if there is a syntax error in the * rules */ void TransliteratorParser::parseRules(const UnicodeString& rule, UTransDirection theDirection, UErrorCode& status) { // Clear error struct uprv_memset(&parseError, 0, sizeof(parseError)); parseError.line = parseError.offset = -1; UBool parsingIDs = true; int32_t ruleCount = 0; while (!dataVector.isEmpty()) { delete (TransliterationRuleData*)(dataVector.orphanElementAt(0)); } if (U_FAILURE(status)) { return; } idBlockVector.removeAllElements(); curData = nullptr; direction = theDirection; ruleCount = 0; delete compoundFilter; compoundFilter = nullptr; while (!variablesVector.isEmpty()) { delete (UnicodeFunctor*)variablesVector.orphanElementAt(0); } variableNames.removeAll(); parseData = new ParseData(0, &variablesVector, &variableNames); if (parseData == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; return; } dotStandIn = (char16_t) -1; UnicodeString *tempstr = nullptr; // used for memory allocation error checking UnicodeString str; // scratch UnicodeString idBlockResult; int32_t pos = 0; int32_t limit = rule.length(); // The compound filter offset is an index into idBlockResult. // If it is 0, then the compound filter occurred at the start, // and it is the offset to the _start_ of the compound filter // pattern. Otherwise it is the offset to the _limit_ of the // compound filter pattern within idBlockResult. 
compoundFilter = nullptr; int32_t compoundFilterOffset = -1; while (pos < limit && U_SUCCESS(status)) { char16_t c = rule.charAt(pos++); if (PatternProps::isWhiteSpace(c)) { // Ignore leading whitespace. continue; } // Skip lines starting with the comment character if (c == RULE_COMMENT_CHAR) { pos = rule.indexOf((char16_t)0x000A /*\n*/, pos) + 1; if (pos == 0) { break; // No "\n" found; rest of rule is a comment } continue; // Either fall out or restart with next line } // skip empty rules if (c == END_OF_RULE) continue; // keep track of how many rules we've seen ++ruleCount; // We've found the start of a rule or ID. c is its first // character, and pos points past c. --pos; // Look for an ID token. Must have at least ID_TOKEN_LEN + 1 // chars left. if ((pos + ID_TOKEN_LEN + 1) <= limit && rule.compare(pos, ID_TOKEN_LEN, ID_TOKEN) == 0) { pos += ID_TOKEN_LEN; c = rule.charAt(pos); while (PatternProps::isWhiteSpace(c) && pos < limit) { ++pos; c = rule.charAt(pos); } int32_t p = pos; if (!parsingIDs) { if (curData != nullptr) { U_ASSERT(!dataVector.hasDeleter()); if (direction == UTRANS_FORWARD) dataVector.addElement(curData, status); else dataVector.insertElementAt(curData, 0, status); if (U_FAILURE(status)) { delete curData; } curData = nullptr; } parsingIDs = true; } TransliteratorIDParser::SingleID* id = TransliteratorIDParser::parseSingleID(rule, p, direction, status); if (p != pos && ICU_Utility::parseChar(rule, p, END_OF_RULE)) { // Successful ::ID parse. if (direction == UTRANS_FORWARD) { idBlockResult.append(id->canonID).append(END_OF_RULE); } else { idBlockResult.insert(0, END_OF_RULE); idBlockResult.insert(0, id->canonID); } } else { // Couldn't parse an ID. 
Try to parse a global filter int32_t withParens = -1; UnicodeSet* f = TransliteratorIDParser::parseGlobalFilter(rule, p, direction, withParens, nullptr); if (f != nullptr) { if (ICU_Utility::parseChar(rule, p, END_OF_RULE) && (direction == UTRANS_FORWARD) == (withParens == 0)) { if (compoundFilter != nullptr) { // Multiple compound filters syntaxError(U_MULTIPLE_COMPOUND_FILTERS, rule, pos, status); delete f; } else { compoundFilter = f; compoundFilterOffset = ruleCount; } } else { delete f; } } else { // Invalid ::id // Can be parsed as neither an ID nor a global filter syntaxError(U_INVALID_ID, rule, pos, status); } } delete id; pos = p; } else { if (parsingIDs) { tempstr = new UnicodeString(idBlockResult); // nullptr pointer check if (tempstr == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; return; } U_ASSERT(idBlockVector.hasDeleter()); if (direction == UTRANS_FORWARD) idBlockVector.adoptElement(tempstr, status); else idBlockVector.insertElementAt(tempstr, 0, status); if (U_FAILURE(status)) { return; } idBlockResult.remove(); parsingIDs = false; curData = new TransliterationRuleData(status); // nullptr pointer check if (curData == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; return; } parseData->data = curData; // By default, rules use part of the private use area // E000..F8FF for variables and other stand-ins. Currently // the range F000..F8FF is typically sufficient. The 'use // variable range' pragma allows rule sets to modify this. setVariableRange(0xF000, 0xF8FF, status); } if (resemblesPragma(rule, pos, limit)) { int32_t ppp = parsePragma(rule, pos, limit, status); if (ppp < 0) { syntaxError(U_MALFORMED_PRAGMA, rule, pos, status); } pos = ppp; // Parse a rule } else { pos = parseRule(rule, pos, limit, status); } } } if (parsingIDs && idBlockResult.length() > 0) { tempstr = new UnicodeString(idBlockResult); // nullptr pointer check if (tempstr == nullptr) { // TODO: Testing, forcing this path, shows many memory leaks. 
ICU-21701 // intltest translit/TransliteratorTest/TestInstantiation status = U_MEMORY_ALLOCATION_ERROR; return; } if (direction == UTRANS_FORWARD) idBlockVector.adoptElement(tempstr, status); else idBlockVector.insertElementAt(tempstr, 0, status); if (U_FAILURE(status)) { return; } } else if (!parsingIDs && curData != nullptr) { if (direction == UTRANS_FORWARD) { dataVector.addElement(curData, status); } else { dataVector.insertElementAt(curData, 0, status); } if (U_FAILURE(status)) { delete curData; curData = nullptr; } } if (U_SUCCESS(status)) { // Convert the set vector to an array int32_t i, dataVectorSize = dataVector.size(); for (i = 0; i < dataVectorSize; i++) { TransliterationRuleData* data = (TransliterationRuleData*)dataVector.elementAt(i); data->variablesLength = variablesVector.size(); if (data->variablesLength == 0) { data->variables = 0; } else { data->variables = (UnicodeFunctor**)uprv_malloc(data->variablesLength * sizeof(UnicodeFunctor*)); // nullptr pointer check if (data->variables == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; return; } data->variablesAreOwned = (i == 0); } for (int32_t j = 0; j < data->variablesLength; j++) { data->variables[j] = static_cast(variablesVector.elementAt(j)); } data->variableNames.removeAll(); int32_t p = UHASH_FIRST; const UHashElement* he = variableNames.nextElement(p); while (he != nullptr) { UnicodeString* tempus = ((UnicodeString*)(he->value.pointer))->clone(); if (tempus == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; return; } data->variableNames.put(*((UnicodeString*)(he->key.pointer)), tempus, status); he = variableNames.nextElement(p); } } variablesVector.removeAllElements(); // keeps them from getting deleted when we succeed // Index the rules if (compoundFilter != nullptr) { if ((direction == UTRANS_FORWARD && compoundFilterOffset != 1) || (direction == UTRANS_REVERSE && compoundFilterOffset != ruleCount)) { status = U_MISPLACED_COMPOUND_FILTER; } } for (i = 0; i < dataVectorSize; i++) { 
TransliterationRuleData* data = (TransliterationRuleData*)dataVector.elementAt(i); data->ruleSet.freeze(parseError, status); } if (idBlockVector.size() == 1 && ((UnicodeString*)idBlockVector.elementAt(0))->isEmpty()) { idBlockVector.removeElementAt(0); } } } /** * Set the variable range to [start, end] (inclusive). */ void TransliteratorParser::setVariableRange(int32_t start, int32_t end, UErrorCode& status) { if (start > end || start < 0 || end > 0xFFFF) { status = U_MALFORMED_PRAGMA; return; } curData->variablesBase = (char16_t) start; if (dataVector.size() == 0) { variableNext = (char16_t) start; variableLimit = (char16_t) (end + 1); } } /** * Assert that the given character is NOT within the variable range. * If it is, return false. This is necessary to ensure that the * variable range does not overlap characters used in a rule. */ UBool TransliteratorParser::checkVariableRange(UChar32 ch) const { return !(ch >= curData->variablesBase && ch < variableLimit); } /** * Set the maximum backup to 'backup', in response to a pragma * statement. */ void TransliteratorParser::pragmaMaximumBackup(int32_t /*backup*/) { //TODO Finish } /** * Begin normalizing all rules using the given mode, in response * to a pragma statement. 
*/ void TransliteratorParser::pragmaNormalizeRules(UNormalizationMode /*mode*/) { //TODO Finish } static const char16_t PRAGMA_USE[] = {0x75,0x73,0x65,0x20,0}; // "use " static const char16_t PRAGMA_VARIABLE_RANGE[] = {0x7E,0x76,0x61,0x72,0x69,0x61,0x62,0x6C,0x65,0x20,0x72,0x61,0x6E,0x67,0x65,0x20,0x23,0x20,0x23,0x7E,0x3B,0}; // "~variable range # #~;" static const char16_t PRAGMA_MAXIMUM_BACKUP[] = {0x7E,0x6D,0x61,0x78,0x69,0x6D,0x75,0x6D,0x20,0x62,0x61,0x63,0x6B,0x75,0x70,0x20,0x23,0x7E,0x3B,0}; // "~maximum backup #~;" static const char16_t PRAGMA_NFD_RULES[] = {0x7E,0x6E,0x66,0x64,0x20,0x72,0x75,0x6C,0x65,0x73,0x7E,0x3B,0}; // "~nfd rules~;" static const char16_t PRAGMA_NFC_RULES[] = {0x7E,0x6E,0x66,0x63,0x20,0x72,0x75,0x6C,0x65,0x73,0x7E,0x3B,0}; // "~nfc rules~;" /** * Return true if the given rule looks like a pragma. * @param pos offset to the first non-whitespace character * of the rule. * @param limit pointer past the last character of the rule. */ UBool TransliteratorParser::resemblesPragma(const UnicodeString& rule, int32_t pos, int32_t limit) { // Must start with /use\s/i return ICU_Utility::parsePattern(rule, pos, limit, UnicodeString(true, PRAGMA_USE, 4), nullptr) >= 0; } /** * Parse a pragma. This method assumes resemblesPragma() has * already returned true. * @param pos offset to the first non-whitespace character * of the rule. * @param limit pointer past the last character of the rule. * @return the position index after the final ';' of the pragma, * or -1 on failure. 
*/ int32_t TransliteratorParser::parsePragma(const UnicodeString& rule, int32_t pos, int32_t limit, UErrorCode& status) { int32_t array[2]; // resemblesPragma() has already returned true, so we // know that pos points to /use\s/i; we can skip 4 characters // immediately pos += 4; // Here are the pragmas we recognize: // use variable range 0xE000 0xEFFF; // use maximum backup 16; // use nfd rules; // use nfc rules; int p = ICU_Utility::parsePattern(rule, pos, limit, UnicodeString(true, PRAGMA_VARIABLE_RANGE, -1), array); if (p >= 0) { setVariableRange(array[0], array[1], status); return p; } p = ICU_Utility::parsePattern(rule, pos, limit, UnicodeString(true, PRAGMA_MAXIMUM_BACKUP, -1), array); if (p >= 0) { pragmaMaximumBackup(array[0]); return p; } p = ICU_Utility::parsePattern(rule, pos, limit, UnicodeString(true, PRAGMA_NFD_RULES, -1), nullptr); if (p >= 0) { pragmaNormalizeRules(UNORM_NFD); return p; } p = ICU_Utility::parsePattern(rule, pos, limit, UnicodeString(true, PRAGMA_NFC_RULES, -1), nullptr); if (p >= 0) { pragmaNormalizeRules(UNORM_NFC); return p; } // Syntax error: unable to parse pragma return -1; } /** * MAIN PARSER. Parse the next rule in the given rule string, starting * at pos. Return the index after the last character parsed. Do not * parse characters at or after limit. * * Important: The character at pos must be a non-whitespace character * that is not the comment character. * * This method handles quoting, escaping, and whitespace removal. It * parses the end-of-rule character. It recognizes context and cursor * indicators. Once it does a lexical breakdown of the rule at pos, it * creates a rule object and adds it to our rule list. 
*/ int32_t TransliteratorParser::parseRule(const UnicodeString& rule, int32_t pos, int32_t limit, UErrorCode& status) { // Locate the left side, operator, and right side int32_t start = pos; char16_t op = 0; int32_t i; // Set up segments data segmentStandins.truncate(0); segmentObjects.removeAllElements(); // Use pointers to automatics to make swapping possible. RuleHalf _left(*this), _right(*this); RuleHalf* left = &_left; RuleHalf* right = &_right; undefinedVariableName.remove(); pos = left->parse(rule, pos, limit, status); if (U_FAILURE(status)) { return start; } if (pos == limit || u_strchr(gOPERATORS, (op = rule.charAt(--pos))) == nullptr) { return syntaxError(U_MISSING_OPERATOR, rule, start, status); } ++pos; // Found an operator char. Check for forward-reverse operator. if (op == REVERSE_RULE_OP && (pos < limit && rule.charAt(pos) == FORWARD_RULE_OP)) { ++pos; op = FWDREV_RULE_OP; } // Translate alternate op characters. switch (op) { case ALT_FORWARD_RULE_OP: op = FORWARD_RULE_OP; break; case ALT_REVERSE_RULE_OP: op = REVERSE_RULE_OP; break; case ALT_FWDREV_RULE_OP: op = FWDREV_RULE_OP; break; } pos = right->parse(rule, pos, limit, status); if (U_FAILURE(status)) { return start; } if (pos < limit) { if (rule.charAt(--pos) == END_OF_RULE) { ++pos; } else { // RuleHalf parser must have terminated at an operator return syntaxError(U_UNQUOTED_SPECIAL, rule, start, status); } } if (op == VARIABLE_DEF_OP) { // LHS is the name. RHS is a single character, either a literal // or a set (already parsed). If RHS is longer than one // character, it is either a multi-character string, or multiple // sets, or a mixture of chars and sets -- syntax error. // We expect to see a single undefined variable (the one being // defined). 
if (undefinedVariableName.length() == 0) { // "Missing '$' or duplicate definition" return syntaxError(U_BAD_VARIABLE_DEFINITION, rule, start, status); } if (left->text.length() != 1 || left->text.charAt(0) != variableLimit) { // "Malformed LHS" return syntaxError(U_MALFORMED_VARIABLE_DEFINITION, rule, start, status); } if (left->anchorStart || left->anchorEnd || right->anchorStart || right->anchorEnd) { return syntaxError(U_MALFORMED_VARIABLE_DEFINITION, rule, start, status); } // We allow anything on the right, including an empty string. UnicodeString* value = new UnicodeString(right->text); // nullptr pointer check if (value == nullptr) { return syntaxError(U_MEMORY_ALLOCATION_ERROR, rule, start, status); } variableNames.put(undefinedVariableName, value, status); ++variableLimit; return pos; } // If this is not a variable definition rule, we shouldn't have // any undefined variable names. if (undefinedVariableName.length() != 0) { return syntaxError(// "Undefined variable $" + undefinedVariableName, U_UNDEFINED_VARIABLE, rule, start, status); } // Verify segments if (segmentStandins.length() > segmentObjects.size()) { syntaxError(U_UNDEFINED_SEGMENT_REFERENCE, rule, start, status); } for (i=0; iremoveContext(); left->cursor = -1; left->cursorOffset = 0; } // Normalize context if (left->ante < 0) { left->ante = 0; } if (left->post < 0) { left->post = left->text.length(); } // Context is only allowed on the input side. Cursors are only // allowed on the output side. Segment delimiters can only appear // on the left, and references on the right. Cursor offset // cannot appear without an explicit cursor. Cursor offset // cannot place the cursor outside the limits of the context. // Anchors are only allowed on the input side. if (right->ante >= 0 || right->post >= 0 || left->cursor >= 0 || (right->cursorOffset != 0 && right->cursor < 0) || // - The following two checks were used to ensure that the // - the cursor offset stayed within the ante- or postcontext. 
// - However, with the addition of quantifiers, we have to // - allow arbitrary cursor offsets and do runtime checking. //(right->cursorOffset > (left->text.length() - left->post)) || //(-right->cursorOffset > left->ante) || right->anchorStart || right->anchorEnd || !left->isValidInput(*this) || !right->isValidOutput(*this) || left->ante > left->post) { return syntaxError(U_MALFORMED_RULE, rule, start, status); } // Flatten segment objects vector to an array UnicodeFunctor** segmentsArray = nullptr; if (segmentObjects.size() > 0) { segmentsArray = (UnicodeFunctor **)uprv_malloc(segmentObjects.size() * sizeof(UnicodeFunctor *)); // Null pointer check if (segmentsArray == nullptr) { return syntaxError(U_MEMORY_ALLOCATION_ERROR, rule, start, status); } segmentObjects.toArray((void**) segmentsArray); } TransliterationRule* temptr = new TransliterationRule( left->text, left->ante, left->post, right->text, right->cursor, right->cursorOffset, segmentsArray, segmentObjects.size(), left->anchorStart, left->anchorEnd, curData, status); //Null pointer check if (temptr == nullptr) { uprv_free(segmentsArray); return syntaxError(U_MEMORY_ALLOCATION_ERROR, rule, start, status); } curData->ruleSet.addRule(temptr, status); return pos; } /** * Called by main parser upon syntax error. Search the rule string * for the probable end of the rule. Of course, if the error is that * the end of rule marker is missing, then the rule end will not be found. * In any case the rule start will be correctly reported. 
* @param msg error description * @param rule pattern string * @param start position of first character of current rule */ int32_t TransliteratorParser::syntaxError(UErrorCode parseErrorCode, const UnicodeString& rule, int32_t pos, UErrorCode& status) { parseError.offset = pos; parseError.line = 0 ; /* we are not using line numbers */ // for pre-context const int32_t LEN = U_PARSE_CONTEXT_LEN - 1; int32_t start = uprv_max(pos - LEN, 0); int32_t stop = pos; rule.extract(start,stop-start,parseError.preContext); //null terminate the buffer parseError.preContext[stop-start] = 0; //for post-context start = pos; stop = uprv_min(pos + LEN, rule.length()); rule.extract(start,stop-start,parseError.postContext); //null terminate the buffer parseError.postContext[stop-start]= 0; status = (UErrorCode)parseErrorCode; return pos; } /** * Parse a UnicodeSet out, store it, and return the stand-in character * used to represent it. */ char16_t TransliteratorParser::parseSet(const UnicodeString& rule, ParsePosition& pos, UErrorCode& status) { UnicodeSet* set = new UnicodeSet(rule, pos, USET_IGNORE_SPACE, parseData, status); // Null pointer check if (set == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; return (char16_t)0x0000; // Return empty character with error. } set->compact(); return generateStandInFor(set, status); } /** * Generate and return a stand-in for a new UnicodeFunctor. Store * the matcher (adopt it). */ char16_t TransliteratorParser::generateStandInFor(UnicodeFunctor* adopted, UErrorCode& status) { // assert(obj != null); // Look up previous stand-in, if any. This is a short list // (typical n is 0, 1, or 2); linear search is optimal. for (int32_t i=0; ivariablesBase + i); } } if (variableNext >= variableLimit) { delete adopted; status = U_VARIABLE_RANGE_EXHAUSTED; return 0; } variablesVector.addElement(adopted, status); if (U_FAILURE(status)) { delete adopted; return 0; } return variableNext++; } /** * Return the standin for segment seg (1-based). 
*/ char16_t TransliteratorParser::getSegmentStandin(int32_t seg, UErrorCode& status) { // Special character used to indicate an empty spot char16_t empty = curData->variablesBase - 1; while (segmentStandins.length() < seg) { segmentStandins.append(empty); } char16_t c = segmentStandins.charAt(seg-1); if (c == empty) { if (variableNext >= variableLimit) { status = U_VARIABLE_RANGE_EXHAUSTED; return 0; } c = variableNext++; // Set a placeholder in the primary variables vector that will be // filled in later by setSegmentObject(). We know that we will get // called first because setSegmentObject() will call us. variablesVector.addElement((void*) nullptr, status); segmentStandins.setCharAt(seg-1, c); } return c; } /** * Set the object for segment seg (1-based). */ void TransliteratorParser::setSegmentObject(int32_t seg, StringMatcher* adopted, UErrorCode& status) { // Since we call parseSection() recursively, nested // segments will result in segment i+1 getting parsed // and stored before segment i; be careful with the // vector handling here. if (segmentObjects.size() < seg) { segmentObjects.setSize(seg, status); } if (U_FAILURE(status)) { return; } int32_t index = getSegmentStandin(seg, status) - curData->variablesBase; if (segmentObjects.elementAt(seg-1) != nullptr || variablesVector.elementAt(index) != nullptr) { // should never happen if (U_SUCCESS(status)) {status = U_INTERNAL_TRANSLITERATOR_ERROR;} return; } // Note: neither segmentObjects or variablesVector has an object deleter function. segmentObjects.setElementAt(adopted, seg-1); variablesVector.setElementAt(adopted, index); } /** * Return the stand-in for the dot set. It is allocated the first * time and reused thereafter. */ char16_t TransliteratorParser::getDotStandIn(UErrorCode& status) { if (dotStandIn == (char16_t) -1) { UnicodeSet* tempus = new UnicodeSet(UnicodeString(true, DOT_SET, -1), status); // Null pointer check. 
if (tempus == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; return (char16_t)0x0000; } dotStandIn = generateStandInFor(tempus, status); } return dotStandIn; } /** * Append the value of the given variable name to the given * UnicodeString. */ void TransliteratorParser::appendVariableDef(const UnicodeString& name, UnicodeString& buf, UErrorCode& status) { const UnicodeString* s = (const UnicodeString*) variableNames.get(name); if (s == nullptr) { // We allow one undefined variable so that variable definition // statements work. For the first undefined variable we return // the special placeholder variableLimit-1, and save the variable // name. if (undefinedVariableName.length() == 0) { undefinedVariableName = name; if (variableNext >= variableLimit) { // throw new RuntimeException("Private use variables exhausted"); status = U_ILLEGAL_ARGUMENT_ERROR; return; } buf.append((char16_t) --variableLimit); } else { //throw new IllegalArgumentException("Undefined variable $" // + name); status = U_ILLEGAL_ARGUMENT_ERROR; return; } } else { buf.append(*s); } } /** * Glue method to get around access restrictions in C++. 
 */
/*Transliterator* TransliteratorParser::createBasicInstance(const UnicodeString& id, const UnicodeString* canonID) {
    return Transliterator::createBasicInstance(id, canonID);
}*/

U_NAMESPACE_END

/*
 * Copy transliterator rule text from source to target while stripping what
 * the loop below skips: rule comments (and the spaces before them), line
 * breaks, and the spaces that lead the following line.
 *
 * source/sourceLen  UTF-16 rule text and its length in code units.
 * target            output buffer; sourceLen code units of it are zeroed up
 *                   front, so it must hold at least that many.
 * status            set to U_PARSE_ERROR when a \uXXXX escape cannot be
 *                   unescaped; it is not examined on entry.
 *
 * Returns the number of code units written to target, or 0 on a parse error.
 * The output is NUL-terminated only if it is shorter than sourceLen.
 */
U_CAPI int32_t
utrans_stripRules(const char16_t *source, int32_t sourceLen, char16_t *target, UErrorCode *status) {
    U_NAMESPACE_USE

    //const char16_t *sourceStart = source;
    const char16_t *targetStart = target;
    const char16_t *sourceLimit = source+sourceLen;
    char16_t *targetLimit = target+sourceLen;
    UChar32 c = 0;
    UBool quoted = false;   /* true while inside a QUOTE ... QUOTE run */
    int32_t index;

    uprv_memset(target, 0, sourceLen*U_SIZEOF_UCHAR);

    /* read the rules into the buffer */
    while (source < sourceLimit)
    {
        /* Decode one code point at source and step past it. */
        index=0;
        U16_NEXT_UNSAFE(source, index, c);
        source+=index;
        if(c == QUOTE) {
            quoted = (UBool)!quoted;
        }
        else if (!quoted) {
            if (c == RULE_COMMENT_CHAR) {
                /* skip comments and all preceding spaces */
                while (targetStart < target && *(target - 1) == 0x0020) {
                    target--;
                }
                /* Consume the comment up to and including the line break;
                   c ends up as CR/LF, or U_SENTINEL if the input ran out. */
                do {
                    if (source == sourceLimit) {
                        c = U_SENTINEL;
                        break;
                    }
                    c = *(source++);
                }
                while (c != CR && c != LF);
                if (c < 0) {
                    /* The comment ran to the end of the input. */
                    break;
                }
            }
            else if (c == ESCAPE && source < sourceLimit) {
                UChar32   c2 = *source;
                if (c2 == CR || c2 == LF) {
                    /* A backslash at the end of a line. */
                    /* Since we're stripping lines, ignore the backslash. */
                    source++;
                    continue;
                }
                if (c2 == 0x0075 && source+5 < sourceLimit) {
                    /* \u seen. \U isn't unescaped. */
                    int32_t escapeOffset = 0;
                    /* 5 code units: the 'u' followed by four hex digits. */
                    UnicodeString escapedStr(source, 5);
                    c2 = escapedStr.unescapeAt(escapeOffset);

                    if (c2 == (UChar32)0xFFFFFFFF || escapeOffset == 0)
                    {
                        *status = U_PARSE_ERROR;
                        return 0;
                    }
                    if (!PatternProps::isWhiteSpace(c2) && !u_iscntrl(c2) && !u_ispunct(c2)) {
                        /* It was escaped for a reason. Write what it was supposed to be. */
                        /* Whitespace, control and punctuation characters stay escaped:
                           the backslash is emitted and the rest is copied verbatim. */
                        source+=5;
                        c = c2;
                    }
                }
                else if (c2 == QUOTE) {
                    /* \' seen. Make sure we don't do anything when we see it again. */
                    quoted = (UBool)!quoted;
                }
            }
        }
        if (c == CR || c == LF)
        {
            /* ignore spaces carriage returns, and all leading spaces on the next line.
            * and line feed unless in the form \uXXXX
            */
            quoted = false;     /* a quoted run never spans a line break */
            while (source < sourceLimit) {
                c = *(source);
                if (c != CR && c != LF && c != 0x0020) {
                    break;
                }
                source++;
            }
            continue;
        }

        /* Append c, split into a surrogate pair if c > 0xffff. */
        index=0;
        U16_APPEND_UNSAFE(target, index, c);
        target+=index;
    }
    if (target < targetLimit) {
        *target = 0;
    }
    return (int32_t)(target-targetStart);
}

#endif /* #if !UCONFIG_NO_TRANSLITERATION */
stringi/src/icu74/i18n/csr2022.cpp0000644000176200001440000001417614700200761016041 0ustar liggesusers
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
 **********************************************************************
 *   Copyright (C) 2005-2016, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 **********************************************************************
 */

#include "unicode/utypes.h"

#if !UCONFIG_NO_CONVERSION

#include "cmemory.h"
#include "cstring.h"

#include "csr2022.h"
#include "csmatch.h"

U_NAMESPACE_BEGIN

/**
 * Matching function shared among the 2022 detectors JP, CN and KR.
 * Counts up the number of legal and unrecognized escape sequences in
 * the sample of text, and computes a score based on the total number &
 * the proportion that fit the encoding.
 *
 *
 * @param text the byte buffer containing text to analyse
 * @param textLen  the size of the text in bytes.
 * @param escapeSequences the byte escape sequences to test for.
 * @param escapeSequences_length the number of entries in escapeSequences.
 * @return match quality, in the range of 0-100.
*/ int32_t CharsetRecog_2022::match_2022(const uint8_t *text, int32_t textLen, const uint8_t escapeSequences[][5], int32_t escapeSequences_length) const { int32_t i, j; int32_t escN; int32_t hits = 0; int32_t misses = 0; int32_t shifts = 0; int32_t quality; i = 0; while(i < textLen) { if(text[i] == 0x1B) { escN = 0; while(escN < escapeSequences_length) { const uint8_t *seq = escapeSequences[escN]; int32_t seq_length = (int32_t)uprv_strlen((const char *) seq); if (textLen-i >= seq_length) { j = 1; while(j < seq_length) { if(seq[j] != text[i+j]) { goto checkEscapes; } j += 1; } hits += 1; i += seq_length-1; goto scanInput; } // else we ran out of string to compare this time. checkEscapes: escN += 1; } misses += 1; } if( text[i]== 0x0e || text[i] == 0x0f){ shifts += 1; } scanInput: i += 1; } if (hits == 0) { return 0; } // // Initial quality is based on relative proportion of recognized vs. // unrecognized escape sequences. // All good: quality = 100; // half or less good: quality = 0; // linear inbetween. quality = (100*hits - 100*misses) / (hits + misses); // Back off quality if there were too few escape sequences seen. // Include shifts in this computation, so that KR does not get penalized // for having only a single Escape sequence, but many shifts. 
if (hits+shifts < 5) { quality -= (5-(hits+shifts))*10; } if (quality < 0) { quality = 0; } return quality; } static const uint8_t escapeSequences_2022JP[][5] = { {0x1b, 0x24, 0x28, 0x43, 0x00}, // KS X 1001:1992 {0x1b, 0x24, 0x28, 0x44, 0x00}, // JIS X 212-1990 {0x1b, 0x24, 0x40, 0x00, 0x00}, // JIS C 6226-1978 {0x1b, 0x24, 0x41, 0x00, 0x00}, // GB 2312-80 {0x1b, 0x24, 0x42, 0x00, 0x00}, // JIS X 208-1983 {0x1b, 0x26, 0x40, 0x00, 0x00}, // JIS X 208 1990, 1997 {0x1b, 0x28, 0x42, 0x00, 0x00}, // ASCII {0x1b, 0x28, 0x48, 0x00, 0x00}, // JIS-Roman {0x1b, 0x28, 0x49, 0x00, 0x00}, // Half-width katakana {0x1b, 0x28, 0x4a, 0x00, 0x00}, // JIS-Roman {0x1b, 0x2e, 0x41, 0x00, 0x00}, // ISO 8859-1 {0x1b, 0x2e, 0x46, 0x00, 0x00} // ISO 8859-7 }; #if !UCONFIG_ONLY_HTML_CONVERSION static const uint8_t escapeSequences_2022KR[][5] = { {0x1b, 0x24, 0x29, 0x43, 0x00} }; static const uint8_t escapeSequences_2022CN[][5] = { {0x1b, 0x24, 0x29, 0x41, 0x00}, // GB 2312-80 {0x1b, 0x24, 0x29, 0x47, 0x00}, // CNS 11643-1992 Plane 1 {0x1b, 0x24, 0x2A, 0x48, 0x00}, // CNS 11643-1992 Plane 2 {0x1b, 0x24, 0x29, 0x45, 0x00}, // ISO-IR-165 {0x1b, 0x24, 0x2B, 0x49, 0x00}, // CNS 11643-1992 Plane 3 {0x1b, 0x24, 0x2B, 0x4A, 0x00}, // CNS 11643-1992 Plane 4 {0x1b, 0x24, 0x2B, 0x4B, 0x00}, // CNS 11643-1992 Plane 5 {0x1b, 0x24, 0x2B, 0x4C, 0x00}, // CNS 11643-1992 Plane 6 {0x1b, 0x24, 0x2B, 0x4D, 0x00}, // CNS 11643-1992 Plane 7 {0x1b, 0x4e, 0x00, 0x00, 0x00}, // SS2 {0x1b, 0x4f, 0x00, 0x00, 0x00}, // SS3 }; #endif CharsetRecog_2022JP::~CharsetRecog_2022JP() {} const char *CharsetRecog_2022JP::getName() const { return "ISO-2022-JP"; } UBool CharsetRecog_2022JP::match(InputText *textIn, CharsetMatch *results) const { int32_t confidence = match_2022(textIn->fInputBytes, textIn->fInputLen, escapeSequences_2022JP, UPRV_LENGTHOF(escapeSequences_2022JP)); results->set(textIn, this, confidence); return (confidence > 0); } #if !UCONFIG_ONLY_HTML_CONVERSION CharsetRecog_2022KR::~CharsetRecog_2022KR() {} 
const char *CharsetRecog_2022KR::getName() const { return "ISO-2022-KR"; } UBool CharsetRecog_2022KR::match(InputText *textIn, CharsetMatch *results) const { int32_t confidence = match_2022(textIn->fInputBytes, textIn->fInputLen, escapeSequences_2022KR, UPRV_LENGTHOF(escapeSequences_2022KR)); results->set(textIn, this, confidence); return (confidence > 0); } CharsetRecog_2022CN::~CharsetRecog_2022CN() {} const char *CharsetRecog_2022CN::getName() const { return "ISO-2022-CN"; } UBool CharsetRecog_2022CN::match(InputText *textIn, CharsetMatch *results) const { int32_t confidence = match_2022(textIn->fInputBytes, textIn->fInputLen, escapeSequences_2022CN, UPRV_LENGTHOF(escapeSequences_2022CN)); results->set(textIn, this, confidence); return (confidence > 0); } #endif CharsetRecog_2022::~CharsetRecog_2022() { // nothing to do } U_NAMESPACE_END #endif stringi/src/icu74/i18n/repattrn.cpp0000644000176200001440000006163414700200761016604 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html // // file: repattrn.cpp // /* *************************************************************************** * Copyright (C) 2002-2016 International Business Machines Corporation * and others. All rights reserved. *************************************************************************** */ #include "unicode/utypes.h" #if !UCONFIG_NO_REGULAR_EXPRESSIONS #include "unicode/regex.h" #include "unicode/uclean.h" #include "cmemory.h" #include "cstr.h" #include "uassert.h" #include "uhash.h" #include "uvector.h" #include "uvectr32.h" #include "uvectr64.h" #include "regexcmp.h" #include "regeximp.h" #include "regexst.h" U_NAMESPACE_BEGIN //-------------------------------------------------------------------------- // // RegexPattern Default Constructor // //-------------------------------------------------------------------------- RegexPattern::RegexPattern() { // Init all of this instances data. 
init(); } //-------------------------------------------------------------------------- // // Copy Constructor Note: This is a rather inefficient implementation, // but it probably doesn't matter. // //-------------------------------------------------------------------------- RegexPattern::RegexPattern(const RegexPattern &other) : UObject(other) { init(); *this = other; } //-------------------------------------------------------------------------- // // Assignment Operator // //-------------------------------------------------------------------------- RegexPattern &RegexPattern::operator = (const RegexPattern &other) { if (this == &other) { // Source and destination are the same. Don't do anything. return *this; } // Clean out any previous contents of object being assigned to. zap(); // Give target object a default initialization init(); // Copy simple fields fDeferredStatus = other.fDeferredStatus; if (U_FAILURE(fDeferredStatus)) { return *this; } if (other.fPatternString == nullptr) { fPatternString = nullptr; fPattern = utext_clone(fPattern, other.fPattern, false, true, &fDeferredStatus); } else { fPatternString = new UnicodeString(*(other.fPatternString)); if (fPatternString == nullptr) { fDeferredStatus = U_MEMORY_ALLOCATION_ERROR; } else { fPattern = utext_openConstUnicodeString(nullptr, fPatternString, &fDeferredStatus); } } if (U_FAILURE(fDeferredStatus)) { return *this; } fFlags = other.fFlags; fLiteralText = other.fLiteralText; fMinMatchLen = other.fMinMatchLen; fFrameSize = other.fFrameSize; fDataSize = other.fDataSize; fStartType = other.fStartType; fInitialStringIdx = other.fInitialStringIdx; fInitialStringLen = other.fInitialStringLen; *fInitialChars = *other.fInitialChars; fInitialChar = other.fInitialChar; *fInitialChars8 = *other.fInitialChars8; fNeedsAltInput = other.fNeedsAltInput; // Copy the pattern. It's just values, nothing deep to copy. 
fCompiledPat->assign(*other.fCompiledPat, fDeferredStatus); fGroupMap->assign(*other.fGroupMap, fDeferredStatus); // Copy the Unicode Sets. // Could be made more efficient if the sets were reference counted and shared, // but I doubt that pattern copying will be particularly common. // Note: init() already added an empty element zero to fSets int32_t i; int32_t numSets = other.fSets->size(); fSets8 = new Regex8BitSet[numSets]; if (fSets8 == nullptr) { fDeferredStatus = U_MEMORY_ALLOCATION_ERROR; return *this; } for (i=1; ielementAt(i); UnicodeSet *newSet = new UnicodeSet(*sourceSet); if (newSet == nullptr) { fDeferredStatus = U_MEMORY_ALLOCATION_ERROR; break; } fSets->addElement(newSet, fDeferredStatus); fSets8[i] = other.fSets8[i]; } // Copy the named capture group hash map. if (other.fNamedCaptureMap != nullptr && initNamedCaptureMap()) { int32_t hashPos = UHASH_FIRST; while (const UHashElement *hashEl = uhash_nextElement(other.fNamedCaptureMap, &hashPos)) { if (U_FAILURE(fDeferredStatus)) { break; } const UnicodeString *name = (const UnicodeString *)hashEl->key.pointer; UnicodeString *key = new UnicodeString(*name); int32_t val = hashEl->value.integer; if (key == nullptr) { fDeferredStatus = U_MEMORY_ALLOCATION_ERROR; } else { uhash_puti(fNamedCaptureMap, key, val, &fDeferredStatus); } } } return *this; } //-------------------------------------------------------------------------- // // init Shared initialization for use by constructors. // Bring an uninitialized RegexPattern up to a default state. 
// //-------------------------------------------------------------------------- void RegexPattern::init() { fFlags = 0; fCompiledPat = 0; fLiteralText.remove(); fSets = nullptr; fSets8 = nullptr; fDeferredStatus = U_ZERO_ERROR; fMinMatchLen = 0; fFrameSize = 0; fDataSize = 0; fGroupMap = nullptr; fStartType = START_NO_INFO; fInitialStringIdx = 0; fInitialStringLen = 0; fInitialChars = nullptr; fInitialChar = 0; fInitialChars8 = nullptr; fNeedsAltInput = false; fNamedCaptureMap = nullptr; fPattern = nullptr; // will be set later fPatternString = nullptr; // may be set later fCompiledPat = new UVector64(fDeferredStatus); fGroupMap = new UVector32(fDeferredStatus); fSets = new UVector(fDeferredStatus); fInitialChars = new UnicodeSet; fInitialChars8 = new Regex8BitSet; if (U_FAILURE(fDeferredStatus)) { return; } if (fCompiledPat == nullptr || fGroupMap == nullptr || fSets == nullptr || fInitialChars == nullptr || fInitialChars8 == nullptr) { fDeferredStatus = U_MEMORY_ALLOCATION_ERROR; return; } // Slot zero of the vector of sets is reserved. Fill it here. fSets->addElement((int32_t)0, fDeferredStatus); } bool RegexPattern::initNamedCaptureMap() { if (fNamedCaptureMap) { return true; } fNamedCaptureMap = uhash_openSize(uhash_hashUnicodeString, // Key hash function uhash_compareUnicodeString, // Key comparator function uhash_compareLong, // Value comparator function 7, // Initial table capacity &fDeferredStatus); if (U_FAILURE(fDeferredStatus)) { return false; } // fNamedCaptureMap owns its key strings, type (UnicodeString *) uhash_setKeyDeleter(fNamedCaptureMap, uprv_deleteUObject); return true; } //-------------------------------------------------------------------------- // // zap Delete everything owned by this RegexPattern. 
// //-------------------------------------------------------------------------- void RegexPattern::zap() { delete fCompiledPat; fCompiledPat = nullptr; int i; for (i=1; isize(); i++) { UnicodeSet *s; s = (UnicodeSet *)fSets->elementAt(i); if (s != nullptr) { delete s; } } delete fSets; fSets = nullptr; delete[] fSets8; fSets8 = nullptr; delete fGroupMap; fGroupMap = nullptr; delete fInitialChars; fInitialChars = nullptr; delete fInitialChars8; fInitialChars8 = nullptr; if (fPattern != nullptr) { utext_close(fPattern); fPattern = nullptr; } if (fPatternString != nullptr) { delete fPatternString; fPatternString = nullptr; } if (fNamedCaptureMap != nullptr) { uhash_close(fNamedCaptureMap); fNamedCaptureMap = nullptr; } } //-------------------------------------------------------------------------- // // Destructor // //-------------------------------------------------------------------------- RegexPattern::~RegexPattern() { zap(); } //-------------------------------------------------------------------------- // // Clone // //-------------------------------------------------------------------------- RegexPattern *RegexPattern::clone() const { RegexPattern *copy = new RegexPattern(*this); return copy; } //-------------------------------------------------------------------------- // // operator == (comparison) Consider to patterns to be == if the // pattern strings and the flags are the same. // Note that pattern strings with the same // characters can still be considered different. 
// //-------------------------------------------------------------------------- bool RegexPattern::operator ==(const RegexPattern &other) const { if (this->fFlags == other.fFlags && this->fDeferredStatus == other.fDeferredStatus) { if (this->fPatternString != nullptr && other.fPatternString != nullptr) { return *(this->fPatternString) == *(other.fPatternString); } else if (this->fPattern == nullptr) { if (other.fPattern == nullptr) { return true; } } else if (other.fPattern != nullptr) { UTEXT_SETNATIVEINDEX(this->fPattern, 0); UTEXT_SETNATIVEINDEX(other.fPattern, 0); return utext_equals(this->fPattern, other.fPattern); } } return false; } //--------------------------------------------------------------------- // // compile // //--------------------------------------------------------------------- RegexPattern * U_EXPORT2 RegexPattern::compile(const UnicodeString ®ex, uint32_t flags, UParseError &pe, UErrorCode &status) { if (U_FAILURE(status)) { return nullptr; } const uint32_t allFlags = UREGEX_CANON_EQ | UREGEX_CASE_INSENSITIVE | UREGEX_COMMENTS | UREGEX_DOTALL | UREGEX_MULTILINE | UREGEX_UWORD | UREGEX_ERROR_ON_UNKNOWN_ESCAPES | UREGEX_UNIX_LINES | UREGEX_LITERAL; if ((flags & ~allFlags) != 0) { status = U_REGEX_INVALID_FLAG; return nullptr; } if ((flags & UREGEX_CANON_EQ) != 0) { status = U_REGEX_UNIMPLEMENTED; return nullptr; } RegexPattern *This = new RegexPattern; if (This == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; return nullptr; } if (U_FAILURE(This->fDeferredStatus)) { status = This->fDeferredStatus; delete This; return nullptr; } This->fFlags = flags; RegexCompile compiler(This, status); compiler.compile(regex, pe, status); if (U_FAILURE(status)) { delete This; This = nullptr; } return This; } // // compile, UText mode // RegexPattern * U_EXPORT2 RegexPattern::compile(UText *regex, uint32_t flags, UParseError &pe, UErrorCode &status) { if (U_FAILURE(status)) { return nullptr; } const uint32_t allFlags = UREGEX_CANON_EQ | UREGEX_CASE_INSENSITIVE | 
UREGEX_COMMENTS | UREGEX_DOTALL | UREGEX_MULTILINE | UREGEX_UWORD | UREGEX_ERROR_ON_UNKNOWN_ESCAPES | UREGEX_UNIX_LINES | UREGEX_LITERAL; if ((flags & ~allFlags) != 0) { status = U_REGEX_INVALID_FLAG; return nullptr; } if ((flags & UREGEX_CANON_EQ) != 0) { status = U_REGEX_UNIMPLEMENTED; return nullptr; } RegexPattern *This = new RegexPattern; if (This == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; return nullptr; } if (U_FAILURE(This->fDeferredStatus)) { status = This->fDeferredStatus; delete This; return nullptr; } This->fFlags = flags; RegexCompile compiler(This, status); compiler.compile(regex, pe, status); if (U_FAILURE(status)) { delete This; This = nullptr; } return This; } // // compile with default flags. // RegexPattern * U_EXPORT2 RegexPattern::compile(const UnicodeString ®ex, UParseError &pe, UErrorCode &err) { return compile(regex, 0, pe, err); } // // compile with default flags, UText mode // RegexPattern * U_EXPORT2 RegexPattern::compile(UText *regex, UParseError &pe, UErrorCode &err) { return compile(regex, 0, pe, err); } // // compile with no UParseErr parameter. 
// RegexPattern * U_EXPORT2 RegexPattern::compile(const UnicodeString ®ex, uint32_t flags, UErrorCode &err) { UParseError pe; return compile(regex, flags, pe, err); } // // compile with no UParseErr parameter, UText mode // RegexPattern * U_EXPORT2 RegexPattern::compile(UText *regex, uint32_t flags, UErrorCode &err) { UParseError pe; return compile(regex, flags, pe, err); } //--------------------------------------------------------------------- // // flags // //--------------------------------------------------------------------- uint32_t RegexPattern::flags() const { return fFlags; } //--------------------------------------------------------------------- // // matcher(UnicodeString, err) // //--------------------------------------------------------------------- RegexMatcher *RegexPattern::matcher(const UnicodeString &input, UErrorCode &status) const { RegexMatcher *retMatcher = matcher(status); if (retMatcher != nullptr) { retMatcher->fDeferredStatus = status; retMatcher->reset(input); } return retMatcher; } //--------------------------------------------------------------------- // // matcher(status) // //--------------------------------------------------------------------- RegexMatcher *RegexPattern::matcher(UErrorCode &status) const { RegexMatcher *retMatcher = nullptr; if (U_FAILURE(status)) { return nullptr; } if (U_FAILURE(fDeferredStatus)) { status = fDeferredStatus; return nullptr; } retMatcher = new RegexMatcher(this); if (retMatcher == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; return nullptr; } return retMatcher; } //--------------------------------------------------------------------- // // matches Convenience function to test for a match, starting // with a pattern string and a data string. 
// //--------------------------------------------------------------------- UBool U_EXPORT2 RegexPattern::matches(const UnicodeString ®ex, const UnicodeString &input, UParseError &pe, UErrorCode &status) { if (U_FAILURE(status)) {return false;} UBool retVal; RegexPattern *pat = nullptr; RegexMatcher *matcher = nullptr; pat = RegexPattern::compile(regex, 0, pe, status); matcher = pat->matcher(input, status); retVal = matcher->matches(status); delete matcher; delete pat; return retVal; } // // matches, UText mode // UBool U_EXPORT2 RegexPattern::matches(UText *regex, UText *input, UParseError &pe, UErrorCode &status) { if (U_FAILURE(status)) {return false;} UBool retVal = false; RegexPattern *pat = nullptr; RegexMatcher *matcher = nullptr; pat = RegexPattern::compile(regex, 0, pe, status); matcher = pat->matcher(status); if (U_SUCCESS(status)) { matcher->reset(input); retVal = matcher->matches(status); } delete matcher; delete pat; return retVal; } //--------------------------------------------------------------------- // // pattern // //--------------------------------------------------------------------- UnicodeString RegexPattern::pattern() const { if (fPatternString != nullptr) { return *fPatternString; } else if (fPattern == nullptr) { return UnicodeString(); } else { UErrorCode status = U_ZERO_ERROR; int64_t nativeLen = utext_nativeLength(fPattern); int32_t len16 = utext_extract(fPattern, 0, nativeLen, nullptr, 0, &status); // buffer overflow error UnicodeString result; status = U_ZERO_ERROR; char16_t *resultChars = result.getBuffer(len16); utext_extract(fPattern, 0, nativeLen, resultChars, len16, &status); // unterminated warning result.releaseBuffer(len16); return result; } } //--------------------------------------------------------------------- // // patternText // //--------------------------------------------------------------------- UText *RegexPattern::patternText(UErrorCode &status) const { if (U_FAILURE(status)) {return nullptr;} status = U_ZERO_ERROR; 
if (fPattern != nullptr) { return fPattern; } else { RegexStaticSets::initGlobals(&status); return RegexStaticSets::gStaticSets->fEmptyText; } } //-------------------------------------------------------------------------------- // // groupNumberFromName() // //-------------------------------------------------------------------------------- int32_t RegexPattern::groupNumberFromName(const UnicodeString &groupName, UErrorCode &status) const { if (U_FAILURE(status)) { return 0; } // No need to explicitly check for syntactically valid names. // Invalid ones will never be in the map, and the lookup will fail. int32_t number = fNamedCaptureMap ? uhash_geti(fNamedCaptureMap, &groupName) : 0; if (number == 0) { status = U_REGEX_INVALID_CAPTURE_GROUP_NAME; } return number; } int32_t RegexPattern::groupNumberFromName(const char *groupName, int32_t nameLength, UErrorCode &status) const { if (U_FAILURE(status)) { return 0; } UnicodeString name(groupName, nameLength, US_INV); return groupNumberFromName(name, status); } //--------------------------------------------------------------------- // // split // //--------------------------------------------------------------------- int32_t RegexPattern::split(const UnicodeString &input, UnicodeString dest[], int32_t destCapacity, UErrorCode &status) const { if (U_FAILURE(status)) { return 0; } RegexMatcher m(this); int32_t r = 0; // Check m's status to make sure all is ok. if (U_SUCCESS(m.fDeferredStatus)) { r = m.split(input, dest, destCapacity, status); } return r; } // // split, UText mode // int32_t RegexPattern::split(UText *input, UText *dest[], int32_t destCapacity, UErrorCode &status) const { if (U_FAILURE(status)) { return 0; } RegexMatcher m(this); int32_t r = 0; // Check m's status to make sure all is ok. if (U_SUCCESS(m.fDeferredStatus)) { r = m.split(input, dest, destCapacity, status); } return r; } //--------------------------------------------------------------------- // // dump Output the compiled form of the pattern. 
// Debugging function only. // //--------------------------------------------------------------------- void RegexPattern::dumpOp(int32_t index) const { (void)index; // Suppress warnings in non-debug build. #if defined(REGEX_DEBUG) static const char * const opNames[] = {URX_OPCODE_NAMES}; int32_t op = fCompiledPat->elementAti(index); int32_t val = URX_VAL(op); int32_t type = URX_TYPE(op); int32_t pinnedType = type; if ((uint32_t)pinnedType >= UPRV_LENGTHOF(opNames)) { pinnedType = 0; } printf("%4d %08x %-15s ", index, op, opNames[pinnedType]); switch (type) { case URX_NOP: case URX_DOTANY: case URX_DOTANY_ALL: case URX_FAIL: case URX_CARET: case URX_DOLLAR: case URX_BACKSLASH_G: case URX_BACKSLASH_X: case URX_END: case URX_DOLLAR_M: case URX_CARET_M: // Types with no operand field of interest. break; case URX_RESERVED_OP: case URX_START_CAPTURE: case URX_END_CAPTURE: case URX_STATE_SAVE: case URX_JMP: case URX_JMP_SAV: case URX_JMP_SAV_X: case URX_BACKSLASH_B: case URX_BACKSLASH_BU: case URX_BACKSLASH_D: case URX_BACKSLASH_Z: case URX_STRING_LEN: case URX_CTR_INIT: case URX_CTR_INIT_NG: case URX_CTR_LOOP: case URX_CTR_LOOP_NG: case URX_RELOC_OPRND: case URX_STO_SP: case URX_LD_SP: case URX_BACKREF: case URX_STO_INP_LOC: case URX_JMPX: case URX_LA_START: case URX_LA_END: case URX_BACKREF_I: case URX_LB_START: case URX_LB_CONT: case URX_LB_END: case URX_LBN_CONT: case URX_LBN_END: case URX_LOOP_C: case URX_LOOP_DOT_I: case URX_BACKSLASH_H: case URX_BACKSLASH_R: case URX_BACKSLASH_V: // types with an integer operand field. 
printf("%d", val); break; case URX_ONECHAR: case URX_ONECHAR_I: if (val < 0x20) { printf("%#x", val); } else { printf("'%s'", CStr(UnicodeString(val))()); } break; case URX_STRING: case URX_STRING_I: { int32_t lengthOp = fCompiledPat->elementAti(index+1); U_ASSERT(URX_TYPE(lengthOp) == URX_STRING_LEN); int32_t length = URX_VAL(lengthOp); UnicodeString str(fLiteralText, val, length); printf("%s", CStr(str)()); } break; case URX_SETREF: case URX_LOOP_SR_I: { UnicodeString s; UnicodeSet *set = (UnicodeSet *)fSets->elementAt(val); set->toPattern(s, true); printf("%s", CStr(s)()); } break; case URX_STATIC_SETREF: case URX_STAT_SETREF_N: { UnicodeString s; if (val & URX_NEG_SET) { printf("NOT "); val &= ~URX_NEG_SET; } UnicodeSet &set = RegexStaticSets::gStaticSets->fPropSets[val]; set.toPattern(s, true); printf("%s", CStr(s)()); } break; default: printf("??????"); break; } printf("\n"); #endif } void RegexPattern::dumpPattern() const { #if defined(REGEX_DEBUG) int index; UnicodeString patStr; for (UChar32 c = utext_next32From(fPattern, 0); c != U_SENTINEL; c = utext_next32(fPattern)) { patStr.append(c); } printf("Original Pattern: \"%s\"\n", CStr(patStr)()); printf(" Min Match Length: %d\n", fMinMatchLen); printf(" Match Start Type: %s\n", START_OF_MATCH_STR(fStartType)); if (fStartType == START_STRING) { UnicodeString initialString(fLiteralText,fInitialStringIdx, fInitialStringLen); printf(" Initial match string: \"%s\"\n", CStr(initialString)()); } else if (fStartType == START_SET) { UnicodeString s; fInitialChars->toPattern(s, true); printf(" Match First Chars: %s\n", CStr(s)()); } else if (fStartType == START_CHAR) { printf(" First char of Match: "); if (fInitialChar > 0x20) { printf("'%s'\n", CStr(UnicodeString(fInitialChar))()); } else { printf("%#x\n", fInitialChar); } } printf("Named Capture Groups:\n"); if (!fNamedCaptureMap || uhash_count(fNamedCaptureMap) == 0) { printf(" None\n"); } else { int32_t pos = UHASH_FIRST; const UHashElement *el = nullptr; while 
((el = uhash_nextElement(fNamedCaptureMap, &pos))) { const UnicodeString *name = (const UnicodeString *)el->key.pointer; int32_t number = el->value.integer; printf(" %d\t%s\n", number, CStr(*name)()); } } printf("\nIndex Binary Type Operand\n" \ "-------------------------------------------\n"); for (index = 0; indexsize(); index++) { dumpOp(index); } printf("\n\n"); #endif } UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RegexPattern) U_NAMESPACE_END #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS stringi/src/icu74/i18n/numparse_types.h0000644000176200001440000002222114700200761017455 0ustar liggesusers// © 2018 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html #include "unicode/utypes.h" #if !UCONFIG_NO_FORMATTING #ifndef __NUMPARSE_TYPES_H__ #define __NUMPARSE_TYPES_H__ #include "unicode/uobject.h" #include "number_decimalquantity.h" #include "string_segment.h" U_NAMESPACE_BEGIN namespace numparse { namespace impl { // Forward-declarations class ParsedNumber; typedef int32_t result_flags_t; typedef int32_t parse_flags_t; /** Flags for the type result_flags_t */ enum ResultFlags { FLAG_NEGATIVE = 0x0001, FLAG_PERCENT = 0x0002, FLAG_PERMILLE = 0x0004, FLAG_HAS_EXPONENT = 0x0008, // FLAG_HAS_DEFAULT_CURRENCY = 0x0010, // no longer used FLAG_HAS_DECIMAL_SEPARATOR = 0x0020, FLAG_NAN = 0x0040, FLAG_INFINITY = 0x0080, FLAG_FAIL = 0x0100, }; /** Flags for the type parse_flags_t */ enum ParseFlags { PARSE_FLAG_IGNORE_CASE = 0x0001, PARSE_FLAG_MONETARY_SEPARATORS = 0x0002, PARSE_FLAG_STRICT_SEPARATORS = 0x0004, PARSE_FLAG_STRICT_GROUPING_SIZE = 0x0008, PARSE_FLAG_INTEGER_ONLY = 0x0010, PARSE_FLAG_GROUPING_DISABLED = 0x0020, // PARSE_FLAG_FRACTION_GROUPING_ENABLED = 0x0040, // see #10794 PARSE_FLAG_INCLUDE_UNPAIRED_AFFIXES = 0x0080, PARSE_FLAG_USE_FULL_AFFIXES = 0x0100, PARSE_FLAG_EXACT_AFFIX = 0x0200, PARSE_FLAG_PLUS_SIGN_ALLOWED = 0x0400, // PARSE_FLAG_OPTIMIZE = 0x0800, // no longer used // PARSE_FLAG_FORCE_BIG_DECIMAL = 0x1000, // not 
used in ICU4C PARSE_FLAG_NO_FOREIGN_CURRENCY = 0x2000, PARSE_FLAG_ALLOW_INFINITE_RECURSION = 0x4000, PARSE_FLAG_STRICT_IGNORABLES = 0x8000, }; // TODO: Is this class worthwhile? template class CompactUnicodeString { public: CompactUnicodeString() { static_assert(stackCapacity > 0, "cannot have zero space on stack"); fBuffer[0] = 0; } CompactUnicodeString(const UnicodeString& text, UErrorCode& status) : fBuffer(text.length() + 1, status) { if (U_FAILURE(status)) { return; } uprv_memcpy(fBuffer.getAlias(), text.getBuffer(), sizeof(char16_t) * text.length()); fBuffer[text.length()] = 0; } inline UnicodeString toAliasedUnicodeString() const { return UnicodeString(true, fBuffer.getAlias(), -1); } bool operator==(const CompactUnicodeString& other) const { // Use the alias-only constructor and then call UnicodeString operator== return toAliasedUnicodeString() == other.toAliasedUnicodeString(); } private: MaybeStackArray fBuffer; }; /** * Struct-like class to hold the results of a parsing routine. * * @author sffc */ // Exported as U_I18N_API for tests class U_I18N_API ParsedNumber { public: /** * The numerical value that was parsed. */ ::icu::number::impl::DecimalQuantity quantity; /** * The index of the last char consumed during parsing. If parsing started at index 0, this is equal * to the number of chars consumed. This is NOT necessarily the same as the StringSegment offset; * "weak" chars, like whitespace, change the offset, but the charsConsumed is not touched until a * "strong" char is encountered. */ int32_t charEnd; /** * Boolean flags (see constants above). */ result_flags_t flags; /** * The pattern string corresponding to the prefix that got consumed. */ UnicodeString prefix; /** * The pattern string corresponding to the suffix that got consumed. */ UnicodeString suffix; /** * The currency that got consumed. 
*/ char16_t currencyCode[4]; ParsedNumber(); ParsedNumber(const ParsedNumber& other) = default; ParsedNumber& operator=(const ParsedNumber& other) = default; void clear(); /** * Call this method to register that a "strong" char was consumed. This should be done after calling * {@link StringSegment#setOffset} or {@link StringSegment#adjustOffset} except when the char is * "weak", like whitespace. * *

* What is a strong versus weak char? The behavior of number parsing is to "stop" * after reading the number, even if there is other content following the number. For example, after * parsing the string "123 " (123 followed by a space), the cursor should be set to 3, not 4, even * though there are matchers that accept whitespace. In this example, the digits are strong, whereas * the whitespace is weak. Grouping separators are weak, whereas decimal separators are strong. Most * other chars are strong. * * @param segment * The current StringSegment, usually immediately following a call to setOffset. */ void setCharsConsumed(const StringSegment& segment); /** Apply certain number-related flags to the DecimalQuantity. */ void postProcess(); /** * Returns whether this the parse was successful. To be successful, at least one char must have been * consumed, and the failure flag must not be set. */ bool success() const; bool seenNumber() const; double getDouble(UErrorCode& status) const; void populateFormattable(Formattable& output, parse_flags_t parseFlags) const; bool isBetterThan(const ParsedNumber& other); }; /** * The core interface implemented by all matchers used for number parsing. * * Given a string, there should NOT be more than one way to consume the string with the same matcher * applied multiple times. If there is, the non-greedy parsing algorithm will be unhappy and may enter an * exponential-time loop. For example, consider the "A Matcher" that accepts "any number of As". Given * the string "AAAA", there are 2^N = 8 ways to apply the A Matcher to this string: you could have the A * Matcher apply 4 times to each character; you could have it apply just once to all the characters; you * could have it apply to the first 2 characters and the second 2 characters; and so on. A better version * of the "A Matcher" would be for it to accept exactly one A, and allow the algorithm to run it * repeatedly to consume a string of multiple As. 
/**
 * The core interface implemented by all matchers used for number parsing.
 *
 * Given a string, there should NOT be more than one way to consume the string with the same matcher
 * applied multiple times. If there is, the non-greedy parsing algorithm will be unhappy and may enter an
 * exponential-time loop. For example, consider the "A Matcher" that accepts "any number of As". Given
 * the string "AAAA" (N = 4), there are 2^(N-1) = 8 ways to apply the A Matcher to this string: you could
 * have the A Matcher apply 4 times, once to each character; you could have it apply just once to all the
 * characters; you could have it apply to the first 2 characters and the second 2 characters; and so on.
 * A better version of the "A Matcher" would be for it to accept exactly one A, and allow the algorithm
 * to run it repeatedly to consume a string of multiple As. The A Matcher can implement the Flexible
 * interface below to signal that it can be applied multiple times in a row.
 *
 * @author sffc
 */
// Exported as U_I18N_API for tests
class U_I18N_API NumberParseMatcher {
  public:
    virtual ~NumberParseMatcher();

    /**
     * Matchers can override this method to return true to indicate that they are optional and can be run
     * repeatedly. Used by SeriesMatcher, primarily in the context of IgnorablesMatcher.
     *
     * @return false in this default implementation.
     */
    virtual bool isFlexible() const {
        return false;
    }

    /**
     * Runs this matcher starting at the beginning of the given StringSegment. If this matcher finds
     * something interesting in the StringSegment, it should update the offset of the StringSegment
     * corresponding to how many chars were matched.
     *
     * This method is thread-safe.
     *
     * @param segment
     *            The StringSegment to match against. Matches always start at the beginning of the
     *            segment. The segment is guaranteed to contain at least one char.
     * @param result
     *            The data structure to store results if the match succeeds.
     * @param status
     *            ICU error code in/out parameter; presumably set by implementations on failure
     *            (not visible from this declaration).
     * @return Whether this matcher thinks there may be more interesting chars beyond the end of the
     *         string segment.
     */
    virtual bool match(StringSegment& segment, ParsedNumber& result, UErrorCode& status) const = 0;

    /**
     * Performs a fast "smoke check" for whether or not this matcher could possibly match against the
     * given string segment. The test should be as fast as possible but also as restrictive as possible.
     * For example, matchers can maintain a UnicodeSet of all code points that could possibly start a
     * match. Matchers should use the {@link StringSegment#startsWith} method in order to correctly
     * handle case folding.
     *
     * @param segment
     *            The segment to check against.
     * @return true if the matcher might be able to match against this segment; false if it definitely
     *         will not be able to match.
     */
    virtual bool smokeTest(const StringSegment& segment) const = 0;

    /**
     * Method called at the end of a parse, after all matchers have failed to consume any more chars.
     * Allows a matcher to make final modifications to the result given the knowledge that no more
     * matches are possible.
     *
     * @param result
     *            The data structure to store results.
     */
    virtual void postProcess(ParsedNumber&) const {
        // Default implementation: no-op
    }

    // String for debugging
    virtual UnicodeString toString() const = 0;

  protected:
    // No construction except by subclasses!
    NumberParseMatcher() = default;
};


/**
 * Interface for use in arguments: a sink to which matchers can be added.
 */
// Exported as U_I18N_API for tests
class U_I18N_API MutableMatcherCollection {
  public:
    virtual ~MutableMatcherCollection() = default;

    /** Adds the given matcher to this collection. */
    virtual void addMatcher(NumberParseMatcher& matcher) = 0;
};
***************************************************************************************** */ #include "unicode/utypes.h" #if !UCONFIG_NO_FORMATTING #include "unicode/ufieldpositer.h" #include "unicode/fpositer.h" #include "unicode/localpointer.h" U_NAMESPACE_USE U_CAPI UFieldPositionIterator* U_EXPORT2 ufieldpositer_open(UErrorCode* status) { if (U_FAILURE(*status)) { return nullptr; } FieldPositionIterator* fpositer = new FieldPositionIterator(); if (fpositer == nullptr) { *status = U_MEMORY_ALLOCATION_ERROR; } return (UFieldPositionIterator*)fpositer; } U_CAPI void U_EXPORT2 ufieldpositer_close(UFieldPositionIterator *fpositer) { delete (FieldPositionIterator*)fpositer; } U_CAPI int32_t U_EXPORT2 ufieldpositer_next(UFieldPositionIterator *fpositer, int32_t *beginIndex, int32_t *endIndex) { FieldPosition fp; int32_t field = -1; if (((FieldPositionIterator*)fpositer)->next(fp)) { field = fp.getField(); if (beginIndex) { *beginIndex = fp.getBeginIndex(); } if (endIndex) { *endIndex = fp.getEndIndex(); } } return field; } #endif /* #if !UCONFIG_NO_FORMATTING */ stringi/src/icu74/i18n/double-conversion-utils.h0000644000176200001440000003731414700200761021203 0ustar liggesusers// © 2018 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html // // From the double-conversion library. Original license: // // Copyright 2010 the V8 project authors. All rights reserved. // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are // met: // // * Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // * Redistributions in binary form must reproduce the above // copyright notice, this list of conditions and the following // disclaimer in the documentation and/or other materials provided // with the distribution. // * Neither the name of Google Inc. 
nor the names of its // contributors may be used to endorse or promote products derived // from this software without specific prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // ICU PATCH: ifdef around UCONFIG_NO_FORMATTING #include "unicode/utypes.h" #if !UCONFIG_NO_FORMATTING #ifndef DOUBLE_CONVERSION_UTILS_H_ #define DOUBLE_CONVERSION_UTILS_H_ // Use DOUBLE_CONVERSION_NON_PREFIXED_MACROS to get unprefixed macros as was // the case in double-conversion releases prior to 3.1.6 #include #include // For pre-C++11 compatibility #if __cplusplus >= 201103L #define DOUBLE_CONVERSION_NULLPTR nullptr #else #define DOUBLE_CONVERSION_NULLPTR NULL #endif // ICU PATCH: Use U_ASSERT instead of #include "uassert.h" #ifndef DOUBLE_CONVERSION_ASSERT #define DOUBLE_CONVERSION_ASSERT(condition) \ U_ASSERT(condition) #endif #if defined(DOUBLE_CONVERSION_NON_PREFIXED_MACROS) && !defined(ASSERT) #define ASSERT DOUBLE_CONVERSION_ASSERT #endif #ifndef DOUBLE_CONVERSION_UNIMPLEMENTED #define DOUBLE_CONVERSION_UNIMPLEMENTED() (abort()) #endif #if defined(DOUBLE_CONVERSION_NON_PREFIXED_MACROS) && !defined(UNIMPLEMENTED) #define UNIMPLEMENTED DOUBLE_CONVERSION_UNIMPLEMENTED #endif #ifndef DOUBLE_CONVERSION_NO_RETURN #ifdef _MSC_VER 
#define DOUBLE_CONVERSION_NO_RETURN __declspec(noreturn) #else #define DOUBLE_CONVERSION_NO_RETURN __attribute__((noreturn)) #endif #endif #if defined(DOUBLE_CONVERSION_NON_PREFIXED_MACROS) && !defined(NO_RETURN) #define NO_RETURN DOUBLE_CONVERSION_NO_RETURN #endif #ifndef DOUBLE_CONVERSION_UNREACHABLE #ifdef _MSC_VER void DOUBLE_CONVERSION_NO_RETURN abort_noreturn(); inline void abort_noreturn() { abort(); } #define DOUBLE_CONVERSION_UNREACHABLE() (abort_noreturn()) #else #define DOUBLE_CONVERSION_UNREACHABLE() (abort()) #endif #endif #if defined(DOUBLE_CONVERSION_NON_PREFIXED_MACROS) && !defined(UNREACHABLE) #define UNREACHABLE DOUBLE_CONVERSION_UNREACHABLE #endif // Not all compilers support __has_attribute and combining a check for both // ifdef and __has_attribute on the same preprocessor line isn't portable. #ifdef __has_attribute # define DOUBLE_CONVERSION_HAS_ATTRIBUTE(x) __has_attribute(x) #else # define DOUBLE_CONVERSION_HAS_ATTRIBUTE(x) 0 #endif #ifndef DOUBLE_CONVERSION_UNUSED #if DOUBLE_CONVERSION_HAS_ATTRIBUTE(unused) #define DOUBLE_CONVERSION_UNUSED __attribute__((unused)) #else #define DOUBLE_CONVERSION_UNUSED #endif #endif #if defined(DOUBLE_CONVERSION_NON_PREFIXED_MACROS) && !defined(UNUSED) #define UNUSED DOUBLE_CONVERSION_UNUSED #endif #if DOUBLE_CONVERSION_HAS_ATTRIBUTE(uninitialized) #define DOUBLE_CONVERSION_STACK_UNINITIALIZED __attribute__((uninitialized)) #else #define DOUBLE_CONVERSION_STACK_UNINITIALIZED #endif #if defined(DOUBLE_CONVERSION_NON_PREFIXED_MACROS) && !defined(STACK_UNINITIALIZED) #define STACK_UNINITIALIZED DOUBLE_CONVERSION_STACK_UNINITIALIZED #endif // Double operations detection based on target architecture. // Linux uses a 80bit wide floating point stack on x86. This induces double // rounding, which in turn leads to wrong results. // An easy way to test if the floating-point operations are correct is to // evaluate: 89255.0/1e22. If the floating-point stack is 64 bits wide then // the result is equal to 89255e-22. 
// The best way to test this, is to create a division-function and to compare // the output of the division with the expected result. (Inlining must be // disabled.) // On Linux,x86 89255e-22 != Div_double(89255.0/1e22) // // For example: /* // -- in div.c double Div_double(double x, double y) { return x / y; } // -- in main.c double Div_double(double x, double y); // Forward declaration. int main(int argc, char** argv) { return Div_double(89255.0, 1e22) == 89255e-22; } */ // Run as follows ./main || echo "correct" // // If it prints "correct" then the architecture should be here, in the "correct" section. #if defined(_M_X64) || defined(__x86_64__) || \ defined(__ARMEL__) || defined(__avr32__) || defined(_M_ARM) || defined(_M_ARM64) || \ defined(__hppa__) || defined(__ia64__) || \ defined(__mips__) || \ defined(__loongarch__) || \ defined(__nios2__) || defined(__ghs) || \ defined(__powerpc__) || defined(__ppc__) || defined(__ppc64__) || \ defined(_POWER) || defined(_ARCH_PPC) || defined(_ARCH_PPC64) || \ defined(__sparc__) || defined(__sparc) || defined(__s390__) || \ defined(__SH4__) || defined(__alpha__) || \ defined(_MIPS_ARCH_MIPS32R2) || defined(__ARMEB__) ||\ defined(__AARCH64EL__) || defined(__aarch64__) || defined(__AARCH64EB__) || \ defined(__riscv) || defined(__e2k__) || \ defined(__or1k__) || defined(__arc__) || defined(__ARC64__) || \ defined(__microblaze__) || defined(__XTENSA__) || \ defined(__EMSCRIPTEN__) || defined(__wasm32__) || defined(__loongarch__) #define DOUBLE_CONVERSION_CORRECT_DOUBLE_OPERATIONS 1 #elif defined(__mc68000__) || \ defined(__pnacl__) || defined(__native_client__) #undef DOUBLE_CONVERSION_CORRECT_DOUBLE_OPERATIONS #elif defined(_M_IX86) || defined(__i386__) || defined(__i386) #if defined(_WIN32) // Windows uses a 64bit wide floating point stack. 
#define DOUBLE_CONVERSION_CORRECT_DOUBLE_OPERATIONS 1 #else #undef DOUBLE_CONVERSION_CORRECT_DOUBLE_OPERATIONS #endif // _WIN32 #else #error Target architecture was not detected as supported by Double-Conversion. #endif #if defined(DOUBLE_CONVERSION_NON_PREFIXED_MACROS) && !defined(CORRECT_DOUBLE_OPERATIONS) #define CORRECT_DOUBLE_OPERATIONS DOUBLE_CONVERSION_CORRECT_DOUBLE_OPERATIONS #endif #if defined(_WIN32) && !defined(__MINGW32__) typedef signed char int8_t; typedef unsigned char uint8_t; typedef short int16_t; // NOLINT typedef unsigned short uint16_t; // NOLINT typedef int int32_t; typedef unsigned int uint32_t; typedef __int64 int64_t; typedef unsigned __int64 uint64_t; // intptr_t and friends are defined in crtdefs.h through stdio.h. #else #include #endif typedef uint16_t uc16; // The following macro works on both 32 and 64-bit platforms. // Usage: instead of writing 0x1234567890123456 // write DOUBLE_CONVERSION_UINT64_2PART_C(0x12345678,90123456); #define DOUBLE_CONVERSION_UINT64_2PART_C(a, b) (((static_cast(a) << 32) + 0x##b##u)) #if defined(DOUBLE_CONVERSION_NON_PREFIXED_MACROS) && !defined(UINT64_2PART_C) #define UINT64_2PART_C DOUBLE_CONVERSION_UINT64_2PART_C #endif // The expression DOUBLE_CONVERSION_ARRAY_SIZE(a) is a compile-time constant of type // size_t which represents the number of elements of the given // array. You should only use DOUBLE_CONVERSION_ARRAY_SIZE on statically allocated // arrays. 
#ifndef DOUBLE_CONVERSION_ARRAY_SIZE #define DOUBLE_CONVERSION_ARRAY_SIZE(a) \ ((sizeof(a) / sizeof(*(a))) / \ static_cast(!(sizeof(a) % sizeof(*(a))))) #endif #if defined(DOUBLE_CONVERSION_NON_PREFIXED_MACROS) && !defined(ARRAY_SIZE) #define ARRAY_SIZE DOUBLE_CONVERSION_ARRAY_SIZE #endif // A macro to disallow the evil copy constructor and operator= functions // This should be used in the private: declarations for a class #ifndef DOUBLE_CONVERSION_DISALLOW_COPY_AND_ASSIGN #define DOUBLE_CONVERSION_DISALLOW_COPY_AND_ASSIGN(TypeName) \ TypeName(const TypeName&); \ void operator=(const TypeName&) #endif #if defined(DOUBLE_CONVERSION_NON_PREFIXED_MACROS) && !defined(DC_DISALLOW_COPY_AND_ASSIGN) #define DC_DISALLOW_COPY_AND_ASSIGN DOUBLE_CONVERSION_DISALLOW_COPY_AND_ASSIGN #endif // A macro to disallow all the implicit constructors, namely the // default constructor, copy constructor and operator= functions. // // This should be used in the private: declarations for a class // that wants to prevent anyone from instantiating it. This is // especially useful for classes containing only static methods. #ifndef DOUBLE_CONVERSION_DISALLOW_IMPLICIT_CONSTRUCTORS #define DOUBLE_CONVERSION_DISALLOW_IMPLICIT_CONSTRUCTORS(TypeName) \ TypeName(); \ DOUBLE_CONVERSION_DISALLOW_COPY_AND_ASSIGN(TypeName) #endif #if defined(DOUBLE_CONVERSION_NON_PREFIXED_MACROS) && !defined(DC_DISALLOW_IMPLICIT_CONSTRUCTORS) #define DC_DISALLOW_IMPLICIT_CONSTRUCTORS DOUBLE_CONVERSION_DISALLOW_IMPLICIT_CONSTRUCTORS #endif // ICU PATCH: Wrap in ICU namespace U_NAMESPACE_BEGIN namespace double_conversion { inline int StrLength(const char* string) { size_t length = strlen(string); DOUBLE_CONVERSION_ASSERT(length == static_cast(static_cast(length))); return static_cast(length); } // This is a simplified version of V8's Vector class. 
template class Vector { public: Vector() : start_(DOUBLE_CONVERSION_NULLPTR), length_(0) {} Vector(T* data, int len) : start_(data), length_(len) { DOUBLE_CONVERSION_ASSERT(len == 0 || (len > 0 && data != DOUBLE_CONVERSION_NULLPTR)); } // Returns a vector using the same backing storage as this one, // spanning from and including 'from', to but not including 'to'. Vector SubVector(int from, int to) { DOUBLE_CONVERSION_ASSERT(to <= length_); DOUBLE_CONVERSION_ASSERT(from < to); DOUBLE_CONVERSION_ASSERT(0 <= from); return Vector(start() + from, to - from); } // Returns the length of the vector. int length() const { return length_; } // Returns whether or not the vector is empty. bool is_empty() const { return length_ == 0; } // Returns the pointer to the start of the data in the vector. T* start() const { return start_; } // Access individual vector elements - checks bounds in debug mode. T& operator[](int index) const { DOUBLE_CONVERSION_ASSERT(0 <= index && index < length_); return start_[index]; } T& first() { return start_[0]; } T& last() { return start_[length_ - 1]; } void pop_back() { DOUBLE_CONVERSION_ASSERT(!is_empty()); --length_; } private: T* start_; int length_; }; // Helper class for building result strings in a character buffer. The // purpose of the class is to use safe operations that checks the // buffer bounds on all operations in debug mode. class StringBuilder { public: StringBuilder(char* buffer, int buffer_size) : buffer_(buffer, buffer_size), position_(0) { } ~StringBuilder() { if (!is_finalized()) Finalize(); } int size() const { return buffer_.length(); } // Get the current position in the builder. int position() const { DOUBLE_CONVERSION_ASSERT(!is_finalized()); return position_; } // Reset the position. void Reset() { position_ = 0; } // Add a single character to the builder. It is not allowed to add // 0-characters; use the Finalize() method to terminate the string // instead. 
void AddCharacter(char c) { DOUBLE_CONVERSION_ASSERT(c != '\0'); DOUBLE_CONVERSION_ASSERT(!is_finalized() && position_ < buffer_.length()); buffer_[position_++] = c; } // Add an entire string to the builder. Uses strlen() internally to // compute the length of the input string. void AddString(const char* s) { AddSubstring(s, StrLength(s)); } // Add the first 'n' characters of the given string 's' to the // builder. The input string must have enough characters. void AddSubstring(const char* s, int n) { DOUBLE_CONVERSION_ASSERT(!is_finalized() && position_ + n < buffer_.length()); DOUBLE_CONVERSION_ASSERT(static_cast(n) <= strlen(s)); memmove(&buffer_[position_], s, static_cast(n)); position_ += n; } // Add character padding to the builder. If count is non-positive, // nothing is added to the builder. void AddPadding(char c, int count) { for (int i = 0; i < count; i++) { AddCharacter(c); } } // Finalize the string by 0-terminating it and returning the buffer. char* Finalize() { DOUBLE_CONVERSION_ASSERT(!is_finalized() && position_ < buffer_.length()); buffer_[position_] = '\0'; // Make sure nobody managed to add a 0-character to the // buffer while building the string. DOUBLE_CONVERSION_ASSERT(strlen(buffer_.start()) == static_cast(position_)); position_ = -1; DOUBLE_CONVERSION_ASSERT(is_finalized()); return buffer_.start(); } private: Vector buffer_; int position_; bool is_finalized() const { return position_ < 0; } DOUBLE_CONVERSION_DISALLOW_IMPLICIT_CONSTRUCTORS(StringBuilder); }; // The type-based aliasing rule allows the compiler to assume that pointers of // different types (for some definition of different) never alias each other. // Thus the following code does not work: // // float f = foo(); // int fbits = *(int*)(&f); // // The compiler 'knows' that the int pointer can't refer to f since the types // don't match, so the compiler may cache f in a register, leaving random data // in fbits. 
// BitCast copies the object representation of 'source' into a value of type
// Dest using memmove, which avoids the undefined behavior of type-punning
// through incompatible pointer casts.
//
// Dest must be given explicitly; Source is deduced from the argument. Both
// types must have the same size, which is checked at compile time.
template <class Dest, class Source>
Dest BitCast(const Source& source) {
  // Compile time assertion: sizeof(Dest) == sizeof(Source)
  // A compile error here means your Dest and Source have different sizes.
#if __cplusplus >= 201103L
  static_assert(sizeof(Dest) == sizeof(Source),
                "source and destination size mismatch");
#else
  DOUBLE_CONVERSION_UNUSED
  typedef char VerifySizesAreEqual[sizeof(Dest) == sizeof(Source) ? 1 : -1];
#endif

  Dest dest;
  memmove(&dest, &source, sizeof(dest));
  return dest;
}

// Pointer overload: converts the pointer value to uintptr_t and bit-casts
// that integer, so Dest must have the size of a pointer.
template <class Dest, class Source>
Dest BitCast(Source* source) {
  return BitCast<Dest>(reinterpret_cast<uintptr_t>(source));
}
********************************************************************** * Date Name Description * 11/19/2001 aliu Creation. ********************************************************************** */ #include "unicode/utypes.h" #if !UCONFIG_NO_TRANSLITERATION #include "unicode/uchar.h" #include "unicode/utf16.h" #include "unesctrn.h" #include "util.h" #include "cmemory.h" U_NAMESPACE_BEGIN /** * Special character marking the end of the spec[] array. */ static const char16_t END = 0xFFFF; // Unicode: "U+10FFFF" hex, min=4, max=6 static const char16_t SPEC_Unicode[] = { 2, 0, 16, 4, 6, 85/*U*/, 43/*+*/, END }; // Java: "\\uFFFF" hex, min=4, max=4 static const char16_t SPEC_Java[] = { 2, 0, 16, 4, 4, 92/*\*/, 117/*u*/, END }; // C: "\\uFFFF" hex, min=4, max=4; \\U0010FFFF hex, min=8, max=8 static const char16_t SPEC_C[] = { 2, 0, 16, 4, 4, 92/*\*/, 117/*u*/, 2, 0, 16, 8, 8, 92/*\*/, 85/*U*/, END }; // XML: "􏿿" hex, min=1, max=6 static const char16_t SPEC_XML[] = { 3, 1, 16, 1, 6, 38/*&*/, 35/*#*/, 120/*x*/, 59/*;*/, END }; // XML10: "􏿿" dec, min=1, max=7 (not really "Hex-Any") static const char16_t SPEC_XML10[] = { 2, 1, 10, 1, 7, 38/*&*/, 35/*#*/, 59/*;*/, END }; // Perl: "\\x{263A}" hex, min=1, max=6 static const char16_t SPEC_Perl[] = { 3, 1, 16, 1, 6, 92/*\*/, 120/*x*/, 123/*{*/, 125/*}*/, END }; // All: Java, C, Perl, XML, XML10, Unicode static const char16_t SPEC_Any[] = { 2, 0, 16, 4, 6, 85/*U*/, 43/*+*/, // Unicode 2, 0, 16, 4, 4, 92/*\*/, 117/*u*/, // Java 2, 0, 16, 8, 8, 92/*\*/, 85/*U*/, // C (surrogates) 3, 1, 16, 1, 6, 38/*&*/, 35/*#*/, 120/*x*/, 59/*;*/, // XML 2, 1, 10, 1, 7, 38/*&*/, 35/*#*/, 59/*;*/, // XML10 3, 1, 16, 1, 6, 92/*\*/, 120/*x*/, 123/*{*/, 125/*}*/, // Perl END }; UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnescapeTransliterator) static char16_t* copySpec(const char16_t* spec) { int32_t len = 0; while (spec[len] != END) { ++len; } ++len; char16_t *result = (char16_t *)uprv_malloc(len*sizeof(char16_t)); // Check for memory allocation error. 
if (result != nullptr) { uprv_memcpy(result, spec, (size_t)len*sizeof(result[0])); } return result; } /** * Factory methods. Ignore the context. */ static Transliterator* _createUnicode(const UnicodeString& ID, Transliterator::Token /*context*/) { return new UnescapeTransliterator(ID, SPEC_Unicode); } static Transliterator* _createJava(const UnicodeString& ID, Transliterator::Token /*context*/) { return new UnescapeTransliterator(ID, SPEC_Java); } static Transliterator* _createC(const UnicodeString& ID, Transliterator::Token /*context*/) { return new UnescapeTransliterator(ID, SPEC_C); } static Transliterator* _createXML(const UnicodeString& ID, Transliterator::Token /*context*/) { return new UnescapeTransliterator(ID, SPEC_XML); } static Transliterator* _createXML10(const UnicodeString& ID, Transliterator::Token /*context*/) { return new UnescapeTransliterator(ID, SPEC_XML10); } static Transliterator* _createPerl(const UnicodeString& ID, Transliterator::Token /*context*/) { return new UnescapeTransliterator(ID, SPEC_Perl); } static Transliterator* _createAny(const UnicodeString& ID, Transliterator::Token /*context*/) { return new UnescapeTransliterator(ID, SPEC_Any); } /** * Registers standard variants with the system. Called by * Transliterator during initialization. 
*/ void UnescapeTransliterator::registerIDs() { Token t = integerToken(0); Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/Unicode"), _createUnicode, t); Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/Java"), _createJava, t); Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/C"), _createC, t); Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/XML"), _createXML, t); Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/XML10"), _createXML10, t); Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/Perl"), _createPerl, t); Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any"), _createAny, t); } /** * Constructor. Takes the encoded spec array. */ UnescapeTransliterator::UnescapeTransliterator(const UnicodeString& newID, const char16_t *newSpec) : Transliterator(newID, nullptr) { this->spec = copySpec(newSpec); } /** * Copy constructor. */ UnescapeTransliterator::UnescapeTransliterator(const UnescapeTransliterator& o) : Transliterator(o) { this->spec = copySpec(o.spec); } UnescapeTransliterator::~UnescapeTransliterator() { uprv_free(spec); } /** * Transliterator API. */ UnescapeTransliterator* UnescapeTransliterator::clone() const { return new UnescapeTransliterator(*this); } /** * Implements {@link Transliterator#handleTransliterate}. */ void UnescapeTransliterator::handleTransliterate(Replaceable& text, UTransPosition& pos, UBool isIncremental) const { int32_t start = pos.start; int32_t limit = pos.limit; int32_t i, ipat; while (start < limit) { // Loop over the forms in spec[]. Exit this loop when we // match one of the specs. Exit the outer loop if a // partial match is detected and isIncremental is true. 
for (ipat=0; spec[ipat] != END;) { // Read the header int32_t prefixLen = spec[ipat++]; int32_t suffixLen = spec[ipat++]; int8_t radix = (int8_t) spec[ipat++]; int32_t minDigits = spec[ipat++]; int32_t maxDigits = spec[ipat++]; // s is a copy of start that is advanced over the // characters as we parse them. int32_t s = start; UBool match = true; for (i=0; i= limit) { if (i > 0) { // We've already matched a character. This is // a partial match, so we return if in // incremental mode. In non-incremental mode, // go to the next spec. if (isIncremental) { goto exit; } match = false; break; } } char16_t c = text.charAt(s++); if (c != spec[ipat + i]) { match = false; break; } } if (match) { UChar32 u = 0; int32_t digitCount = 0; for (;;) { if (s >= limit) { // Check for partial match in incremental mode. if (s > start && isIncremental) { goto exit; } break; } UChar32 ch = text.char32At(s); int32_t digit = u_digit(ch, radix); if (digit < 0) { break; } s += U16_LENGTH(ch); u = (u * radix) + digit; if (++digitCount == maxDigits) { break; } } match = (digitCount >= minDigits); if (match) { for (i=0; i= limit) { // Check for partial match in incremental mode. if (s > start && isIncremental) { goto exit; } match = false; break; } char16_t c = text.charAt(s++); if (c != spec[ipat + prefixLen + i]) { match = false; break; } } if (match) { // At this point, we have a match UnicodeString str(u); text.handleReplaceBetween(start, s, str); limit -= s - start - str.length(); // The following break statement leaves the // loop that is traversing the forms in // spec[]. We then parse the next input // character. 
break; } } } ipat += prefixLen + suffixLen; } if (start < limit) { start += U16_LENGTH(text.char32At(start)); } } exit: pos.contextLimit += limit - pos.limit; pos.limit = limit; pos.start = start; } U_NAMESPACE_END #endif /* #if !UCONFIG_NO_TRANSLITERATION */ //eof stringi/src/icu74/i18n/utmscale.cpp0000644000176200001440000001517714700200761016563 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ******************************************************************************* * Copyright (C) 2004-2012, International Business Machines Corporation and * others. All Rights Reserved. ******************************************************************************* */ #include "unicode/utypes.h" #if !UCONFIG_NO_FORMATTING #include "unicode/utmscale.h" #define ticks INT64_C(1) #define microseconds (ticks * 10) #define milliseconds (microseconds * 1000) #define seconds (milliseconds * 1000) #define minutes (seconds * 60) #define hours (minutes * 60) #define days (hours * 24) /* Constants generated by ICU4J com.ibm.icu.dev.tool.timescale.GenerateCTimeScaleData. 
*/ static const int64_t timeScaleTable[UDTS_MAX_SCALE][UTSV_MAX_SCALE_VALUE] = { /* units epochOffset fromMin fromMax toMin toMax epochOffsetP1 epochOffsetM1 unitsRound minRound maxRound */ {milliseconds, INT64_C(62135596800000), INT64_C(-984472800485477), INT64_C(860201606885477), INT64_C(-9223372036854774999), INT64_C(9223372036854774999), INT64_C(62135596800001), INT64_C(62135596799999), INT64_C(5000), INT64_C(-9223372036854770808), INT64_C(9223372036854770807)}, {seconds, INT64_C(62135596800), INT64_C(-984472800485), INT64_C(860201606885), U_INT64_MIN, U_INT64_MAX, INT64_C(62135596801), INT64_C(62135596799), INT64_C(5000000), INT64_C(-9223372036849775808), INT64_C(9223372036849775807)}, {milliseconds, INT64_C(62135596800000), INT64_C(-984472800485477), INT64_C(860201606885477), INT64_C(-9223372036854774999), INT64_C(9223372036854774999), INT64_C(62135596800001), INT64_C(62135596799999), INT64_C(5000), INT64_C(-9223372036854770808), INT64_C(9223372036854770807)}, {ticks, INT64_C(504911232000000000), U_INT64_MIN, INT64_C(8718460804854775807), INT64_C(-8718460804854775808), U_INT64_MAX, INT64_C(504911232000000000), INT64_C(504911232000000000), INT64_C(0), U_INT64_MIN, U_INT64_MAX}, {ticks, INT64_C(0), U_INT64_MIN, U_INT64_MAX, U_INT64_MIN, U_INT64_MAX, INT64_C(0), INT64_C(0), INT64_C(0), U_INT64_MIN, U_INT64_MAX}, {seconds, INT64_C(60052752000), INT64_C(-982389955685), INT64_C(862284451685), U_INT64_MIN, U_INT64_MAX, INT64_C(60052752001), INT64_C(60052751999), INT64_C(5000000), INT64_C(-9223372036849775808), INT64_C(9223372036849775807)}, {seconds, INT64_C(63113904000), INT64_C(-985451107685), INT64_C(859223299685), U_INT64_MIN, U_INT64_MAX, INT64_C(63113904001), INT64_C(63113903999), INT64_C(5000000), INT64_C(-9223372036849775808), INT64_C(9223372036849775807)}, {days, INT64_C(693594), INT64_C(-11368793), INT64_C(9981605), U_INT64_MIN, U_INT64_MAX, INT64_C(693595), INT64_C(693593), INT64_C(432000000000), INT64_C(-9223371604854775808), 
INT64_C(9223371604854775807)}, {days, INT64_C(693594), INT64_C(-11368793), INT64_C(9981605), U_INT64_MIN, U_INT64_MAX, INT64_C(693595), INT64_C(693593), INT64_C(432000000000), INT64_C(-9223371604854775808), INT64_C(9223371604854775807)}, {microseconds, INT64_C(62135596800000000), INT64_C(-984472800485477580), INT64_C(860201606885477580), INT64_C(-9223372036854775804), INT64_C(9223372036854775804), INT64_C(62135596800000001), INT64_C(62135596799999999), INT64_C(5), INT64_C(-9223372036854775803), INT64_C(9223372036854775802)}, }; U_CAPI int64_t U_EXPORT2 utmscale_getTimeScaleValue(UDateTimeScale timeScale, UTimeScaleValue value, UErrorCode *status) { if (status == nullptr || U_FAILURE(*status)) { return 0; } if (timeScale < UDTS_JAVA_TIME || UDTS_MAX_SCALE <= timeScale || value < UTSV_UNITS_VALUE || UTSV_MAX_SCALE_VALUE <= value) { *status = U_ILLEGAL_ARGUMENT_ERROR; return 0; } return timeScaleTable[timeScale][value]; } U_CAPI int64_t U_EXPORT2 utmscale_fromInt64(int64_t otherTime, UDateTimeScale timeScale, UErrorCode *status) UPRV_NO_SANITIZE_UNDEFINED { const int64_t *data; if (status == nullptr || U_FAILURE(*status)) { return 0; } if ((int32_t)timeScale < 0 || timeScale >= UDTS_MAX_SCALE) { *status = U_ILLEGAL_ARGUMENT_ERROR; return 0; } data = (const int64_t *)(&timeScaleTable[timeScale]); if (otherTime < data[UTSV_FROM_MIN_VALUE] || otherTime > data[UTSV_FROM_MAX_VALUE]) { *status = U_ILLEGAL_ARGUMENT_ERROR; return 0; } return (otherTime + data[UTSV_EPOCH_OFFSET_VALUE]) * data[UTSV_UNITS_VALUE]; } U_CAPI int64_t U_EXPORT2 utmscale_toInt64(int64_t universalTime, UDateTimeScale timeScale, UErrorCode *status) UPRV_NO_SANITIZE_UNDEFINED { const int64_t *data; if (status == nullptr || U_FAILURE(*status)) { return 0; } if ((int32_t)timeScale < 0 || timeScale >= UDTS_MAX_SCALE) { *status = U_ILLEGAL_ARGUMENT_ERROR; return 0; } data = (const int64_t *)(&timeScaleTable[timeScale]); if (universalTime < data[UTSV_TO_MIN_VALUE] || universalTime > data[UTSV_TO_MAX_VALUE]) { 
*status = U_ILLEGAL_ARGUMENT_ERROR; return 0; } if (universalTime < 0) { if (universalTime < data[UTSV_MIN_ROUND_VALUE]) { return (universalTime + data[UTSV_UNITS_ROUND_VALUE]) / data[UTSV_UNITS_VALUE] - data[UTSV_EPOCH_OFFSET_PLUS_1_VALUE]; } return (universalTime - data[UTSV_UNITS_ROUND_VALUE]) / data[UTSV_UNITS_VALUE] - data[UTSV_EPOCH_OFFSET_VALUE]; } if (universalTime > data[UTSV_MAX_ROUND_VALUE]) { return (universalTime - data[UTSV_UNITS_ROUND_VALUE]) / data[UTSV_UNITS_VALUE] - data[UTSV_EPOCH_OFFSET_MINUS_1_VALUE]; } return (universalTime + data[UTSV_UNITS_ROUND_VALUE]) / data[UTSV_UNITS_VALUE] - data[UTSV_EPOCH_OFFSET_VALUE]; } #endif /* #if !UCONFIG_NO_FORMATTING */ stringi/src/icu74/i18n/utf8collationiterator.cpp0000644000176200001440000004116114700200761021303 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ******************************************************************************* * Copyright (C) 2012-2014, International Business Machines * Corporation and others. All Rights Reserved. ******************************************************************************* * utf8collationiterator.cpp * * created on: 2012nov12 (from utf16collationiterator.cpp & uitercollationiterator.cpp) * created by: Markus W. 
Scherer */ #include "unicode/utypes.h" #if !UCONFIG_NO_COLLATION #include "unicode/utf8.h" #include "charstr.h" #include "cmemory.h" #include "collation.h" #include "collationdata.h" #include "collationfcd.h" #include "collationiterator.h" #include "normalizer2impl.h" #include "uassert.h" #include "utf8collationiterator.h" U_NAMESPACE_BEGIN UTF8CollationIterator::~UTF8CollationIterator() {} void UTF8CollationIterator::resetToOffset(int32_t newOffset) { reset(); pos = newOffset; } int32_t UTF8CollationIterator::getOffset() const { return pos; } uint32_t UTF8CollationIterator::handleNextCE32(UChar32 &c, UErrorCode & /*errorCode*/) { if(pos == length) { c = U_SENTINEL; return Collation::FALLBACK_CE32; } // Optimized combination of U8_NEXT_OR_FFFD() and UTRIE2_U8_NEXT32(). c = u8[pos++]; if(U8_IS_SINGLE(c)) { // ASCII 00..7F return trie->data32[c]; } uint8_t t1, t2; if(0xe0 <= c && c < 0xf0 && ((pos + 1) < length || length < 0) && U8_IS_VALID_LEAD3_AND_T1(c, t1 = u8[pos]) && (t2 = (u8[pos + 1] - 0x80)) <= 0x3f) { // U+0800..U+FFFF except surrogates c = (((c & 0xf) << 12) | ((t1 & 0x3f) << 6) | t2); pos += 2; return UTRIE2_GET32_FROM_U16_SINGLE_LEAD(trie, c); } else if(c < 0xe0 && c >= 0xc2 && pos != length && (t1 = (u8[pos] - 0x80)) <= 0x3f) { // U+0080..U+07FF uint32_t ce32 = trie->data32[trie->index[(UTRIE2_UTF8_2B_INDEX_2_OFFSET - 0xc0) + c] + t1]; c = ((c & 0x1f) << 6) | t1; ++pos; return ce32; } else { // Function call for supplementary code points and error cases. // Illegal byte sequences yield U+FFFD. 
c = utf8_nextCharSafeBody(u8, &pos, length, c, -3); return data->getCE32(c); } } UBool UTF8CollationIterator::foundNULTerminator() { if(length < 0) { length = --pos; return true; } else { return false; } } UBool UTF8CollationIterator::forbidSurrogateCodePoints() const { return true; } UChar32 UTF8CollationIterator::nextCodePoint(UErrorCode & /*errorCode*/) { if(pos == length) { return U_SENTINEL; } if(u8[pos] == 0 && length < 0) { length = pos; return U_SENTINEL; } UChar32 c; U8_NEXT_OR_FFFD(u8, pos, length, c); return c; } UChar32 UTF8CollationIterator::previousCodePoint(UErrorCode & /*errorCode*/) { if(pos == 0) { return U_SENTINEL; } UChar32 c; U8_PREV_OR_FFFD(u8, 0, pos, c); return c; } void UTF8CollationIterator::forwardNumCodePoints(int32_t num, UErrorCode & /*errorCode*/) { U8_FWD_N(u8, pos, length, num); } void UTF8CollationIterator::backwardNumCodePoints(int32_t num, UErrorCode & /*errorCode*/) { U8_BACK_N(u8, 0, pos, num); } // FCDUTF8CollationIterator ------------------------------------------------ *** FCDUTF8CollationIterator::~FCDUTF8CollationIterator() {} void FCDUTF8CollationIterator::resetToOffset(int32_t newOffset) { reset(); start = pos = newOffset; state = CHECK_FWD; } int32_t FCDUTF8CollationIterator::getOffset() const { if(state != IN_NORMALIZED) { return pos; } else if(pos == 0) { return start; } else { return limit; } } uint32_t FCDUTF8CollationIterator::handleNextCE32(UChar32 &c, UErrorCode &errorCode) { for(;;) { if(state == CHECK_FWD) { // Combination of UTF8CollationIterator::handleNextCE32() with FCD check fastpath. 
if(pos == length) { c = U_SENTINEL; return Collation::FALLBACK_CE32; } c = u8[pos++]; if(U8_IS_SINGLE(c)) { // ASCII 00..7F return trie->data32[c]; } uint8_t t1, t2; if(0xe0 <= c && c < 0xf0 && ((pos + 1) < length || length < 0) && U8_IS_VALID_LEAD3_AND_T1(c, t1 = u8[pos]) && (t2 = (u8[pos + 1] - 0x80)) <= 0x3f) { // U+0800..U+FFFF except surrogates c = (((c & 0xf) << 12) | ((t1 & 0x3f) << 6) | t2); pos += 2; if(CollationFCD::hasTccc(c) && (CollationFCD::maybeTibetanCompositeVowel(c) || (pos != length && nextHasLccc()))) { pos -= 3; } else { break; // return CE32(BMP) } } else if(c < 0xe0 && c >= 0xc2 && pos != length && (t1 = (u8[pos] - 0x80)) <= 0x3f) { // U+0080..U+07FF uint32_t ce32 = trie->data32[trie->index[(UTRIE2_UTF8_2B_INDEX_2_OFFSET - 0xc0) + c] + t1]; c = ((c & 0x1f) << 6) | t1; ++pos; if(CollationFCD::hasTccc(c) && pos != length && nextHasLccc()) { pos -= 2; } else { return ce32; } } else { // Function call for supplementary code points and error cases. // Illegal byte sequences yield U+FFFD. c = utf8_nextCharSafeBody(u8, &pos, length, c, -3); if(c == 0xfffd) { return Collation::FFFD_CE32; } else { U_ASSERT(c > 0xffff); if(CollationFCD::hasTccc(U16_LEAD(c)) && pos != length && nextHasLccc()) { pos -= 4; } else { return data->getCE32FromSupplementary(c); } } } if(!nextSegment(errorCode)) { c = U_SENTINEL; return Collation::FALLBACK_CE32; } continue; } else if(state == IN_FCD_SEGMENT && pos != limit) { return UTF8CollationIterator::handleNextCE32(c, errorCode); } else if(state == IN_NORMALIZED && pos != normalized.length()) { c = normalized[pos++]; break; } else { switchToForward(); } } return UTRIE2_GET32_FROM_U16_SINGLE_LEAD(trie, c); } UBool FCDUTF8CollationIterator::nextHasLccc() const { U_ASSERT(state == CHECK_FWD && pos != length); // The lowest code point with ccc!=0 is U+0300 which is CC 80 in UTF-8. // CJK U+4000..U+DFFF except U+Axxx are also FCD-inert. (Lead bytes E4..ED except EA.) 
UChar32 c = u8[pos]; if(c < 0xcc || (0xe4 <= c && c <= 0xed && c != 0xea)) { return false; } int32_t i = pos; U8_NEXT_OR_FFFD(u8, i, length, c); if(c > 0xffff) { c = U16_LEAD(c); } return CollationFCD::hasLccc(c); } UBool FCDUTF8CollationIterator::previousHasTccc() const { U_ASSERT(state == CHECK_BWD && pos != 0); UChar32 c = u8[pos - 1]; if(U8_IS_SINGLE(c)) { return false; } int32_t i = pos; U8_PREV_OR_FFFD(u8, 0, i, c); if(c > 0xffff) { c = U16_LEAD(c); } return CollationFCD::hasTccc(c); } char16_t FCDUTF8CollationIterator::handleGetTrailSurrogate() { if(state != IN_NORMALIZED) { return 0; } U_ASSERT(pos < normalized.length()); char16_t trail; if(U16_IS_TRAIL(trail = normalized[pos])) { ++pos; } return trail; } UBool FCDUTF8CollationIterator::foundNULTerminator() { if(state == CHECK_FWD && length < 0) { length = --pos; return true; } else { return false; } } UChar32 FCDUTF8CollationIterator::nextCodePoint(UErrorCode &errorCode) { UChar32 c; for(;;) { if(state == CHECK_FWD) { if(pos == length || ((c = u8[pos]) == 0 && length < 0)) { return U_SENTINEL; } if(U8_IS_SINGLE(c)) { ++pos; return c; } U8_NEXT_OR_FFFD(u8, pos, length, c); if(CollationFCD::hasTccc(c <= 0xffff ? c : U16_LEAD(c)) && (CollationFCD::maybeTibetanCompositeVowel(c) || (pos != length && nextHasLccc()))) { // c is not FCD-inert, therefore it is not U+FFFD and it has a valid byte sequence // and we can use U8_LENGTH() rather than a previous-position variable. 
pos -= U8_LENGTH(c); if(!nextSegment(errorCode)) { return U_SENTINEL; } continue; } return c; } else if(state == IN_FCD_SEGMENT && pos != limit) { U8_NEXT_OR_FFFD(u8, pos, length, c); return c; } else if(state == IN_NORMALIZED && pos != normalized.length()) { c = normalized.char32At(pos); pos += U16_LENGTH(c); return c; } else { switchToForward(); } } } UChar32 FCDUTF8CollationIterator::previousCodePoint(UErrorCode &errorCode) { UChar32 c; for(;;) { if(state == CHECK_BWD) { if(pos == 0) { return U_SENTINEL; } if(U8_IS_SINGLE(c = u8[pos - 1])) { --pos; return c; } U8_PREV_OR_FFFD(u8, 0, pos, c); if(CollationFCD::hasLccc(c <= 0xffff ? c : U16_LEAD(c)) && (CollationFCD::maybeTibetanCompositeVowel(c) || (pos != 0 && previousHasTccc()))) { // c is not FCD-inert, therefore it is not U+FFFD and it has a valid byte sequence // and we can use U8_LENGTH() rather than a previous-position variable. pos += U8_LENGTH(c); if(!previousSegment(errorCode)) { return U_SENTINEL; } continue; } return c; } else if(state == IN_FCD_SEGMENT && pos != start) { U8_PREV_OR_FFFD(u8, 0, pos, c); return c; } else if(state >= IN_NORMALIZED && pos != 0) { c = normalized.char32At(pos - 1); pos -= U16_LENGTH(c); return c; } else { switchToBackward(); } } } void FCDUTF8CollationIterator::forwardNumCodePoints(int32_t num, UErrorCode &errorCode) { // Specify the class to avoid a virtual-function indirection. // In Java, we would declare this class final. while(num > 0 && FCDUTF8CollationIterator::nextCodePoint(errorCode) >= 0) { --num; } } void FCDUTF8CollationIterator::backwardNumCodePoints(int32_t num, UErrorCode &errorCode) { // Specify the class to avoid a virtual-function indirection. // In Java, we would declare this class final. 
while(num > 0 && FCDUTF8CollationIterator::previousCodePoint(errorCode) >= 0) { --num; } } void FCDUTF8CollationIterator::switchToForward() { U_ASSERT(state == CHECK_BWD || (state == IN_FCD_SEGMENT && pos == limit) || (state == IN_NORMALIZED && pos == normalized.length())); if(state == CHECK_BWD) { // Turn around from backward checking. start = pos; if(pos == limit) { state = CHECK_FWD; // Check forward. } else { // pos < limit state = IN_FCD_SEGMENT; // Stay in FCD segment. } } else { // Reached the end of the FCD segment. if(state == IN_FCD_SEGMENT) { // The input text segment is FCD, extend it forward. } else { // The input text segment needed to be normalized. // Switch to checking forward from it. start = pos = limit; } state = CHECK_FWD; } } UBool FCDUTF8CollationIterator::nextSegment(UErrorCode &errorCode) { if(U_FAILURE(errorCode)) { return false; } U_ASSERT(state == CHECK_FWD && pos != length); // The input text [start..pos[ passes the FCD check. int32_t segmentStart = pos; // Collect the characters being checked, in case they need to be normalized. UnicodeString s; uint8_t prevCC = 0; for(;;) { // Fetch the next character and its fcd16 value. int32_t cpStart = pos; UChar32 c; U8_NEXT_OR_FFFD(u8, pos, length, c); uint16_t fcd16 = nfcImpl.getFCD16(c); uint8_t leadCC = (uint8_t)(fcd16 >> 8); if(leadCC == 0 && cpStart != segmentStart) { // FCD boundary before this character. pos = cpStart; break; } s.append(c); if(leadCC != 0 && (prevCC > leadCC || CollationFCD::isFCD16OfTibetanCompositeVowel(fcd16))) { // Fails FCD check. Find the next FCD boundary and normalize. while(pos != length) { cpStart = pos; U8_NEXT_OR_FFFD(u8, pos, length, c); if(nfcImpl.getFCD16(c) <= 0xff) { pos = cpStart; break; } s.append(c); } if(!normalize(s, errorCode)) { return false; } start = segmentStart; limit = pos; state = IN_NORMALIZED; pos = 0; return true; } prevCC = (uint8_t)fcd16; if(pos == length || prevCC == 0) { // FCD boundary after the last character. 
break; } } limit = pos; pos = segmentStart; U_ASSERT(pos != limit); state = IN_FCD_SEGMENT; return true; } void FCDUTF8CollationIterator::switchToBackward() { U_ASSERT(state == CHECK_FWD || (state == IN_FCD_SEGMENT && pos == start) || (state >= IN_NORMALIZED && pos == 0)); if(state == CHECK_FWD) { // Turn around from forward checking. limit = pos; if(pos == start) { state = CHECK_BWD; // Check backward. } else { // pos > start state = IN_FCD_SEGMENT; // Stay in FCD segment. } } else { // Reached the start of the FCD segment. if(state == IN_FCD_SEGMENT) { // The input text segment is FCD, extend it backward. } else { // The input text segment needed to be normalized. // Switch to checking backward from it. limit = pos = start; } state = CHECK_BWD; } } UBool FCDUTF8CollationIterator::previousSegment(UErrorCode &errorCode) { if(U_FAILURE(errorCode)) { return false; } U_ASSERT(state == CHECK_BWD && pos != 0); // The input text [pos..limit[ passes the FCD check. int32_t segmentLimit = pos; // Collect the characters being checked, in case they need to be normalized. UnicodeString s; uint8_t nextCC = 0; for(;;) { // Fetch the previous character and its fcd16 value. int32_t cpLimit = pos; UChar32 c; U8_PREV_OR_FFFD(u8, 0, pos, c); uint16_t fcd16 = nfcImpl.getFCD16(c); uint8_t trailCC = (uint8_t)fcd16; if(trailCC == 0 && cpLimit != segmentLimit) { // FCD boundary after this character. pos = cpLimit; break; } s.append(c); if(trailCC != 0 && ((nextCC != 0 && trailCC > nextCC) || CollationFCD::isFCD16OfTibetanCompositeVowel(fcd16))) { // Fails FCD check. Find the previous FCD boundary and normalize. 
while(fcd16 > 0xff && pos != 0) { cpLimit = pos; U8_PREV_OR_FFFD(u8, 0, pos, c); fcd16 = nfcImpl.getFCD16(c); if(fcd16 == 0) { pos = cpLimit; break; } s.append(c); } s.reverse(); if(!normalize(s, errorCode)) { return false; } limit = segmentLimit; start = pos; state = IN_NORMALIZED; pos = normalized.length(); return true; } nextCC = (uint8_t)(fcd16 >> 8); if(pos == 0 || nextCC == 0) { // FCD boundary before the following character. break; } } start = pos; pos = segmentLimit; U_ASSERT(pos != start); state = IN_FCD_SEGMENT; return true; } UBool FCDUTF8CollationIterator::normalize(const UnicodeString &s, UErrorCode &errorCode) { // NFD without argument checking. U_ASSERT(U_SUCCESS(errorCode)); nfcImpl.decompose(s, normalized, errorCode); return U_SUCCESS(errorCode); } U_NAMESPACE_END #endif // !UCONFIG_NO_COLLATION stringi/src/icu74/i18n/cecal.cpp0000644000176200001440000001251514700200761016006 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ******************************************************************************* * Copyright (C) 2003 - 2009, International Business Machines Corporation and * * others. All Rights Reserved. 
* ******************************************************************************* */ #include "unicode/utypes.h" #if !UCONFIG_NO_FORMATTING #include "cecal.h" #include "gregoimp.h" //Math #include "cstring.h" U_NAMESPACE_BEGIN static const int32_t LIMITS[UCAL_FIELD_COUNT][4] = { // Minimum Greatest Least Maximum // Minimum Maximum { 0, 0, 1, 1}, // ERA { 1, 1, 5000000, 5000000}, // YEAR { 0, 0, 12, 12}, // MONTH { 1, 1, 52, 53}, // WEEK_OF_YEAR {/*N/A*/-1,/*N/A*/-1,/*N/A*/-1,/*N/A*/-1}, // WEEK_OF_MONTH { 1, 1, 5, 30}, // DAY_OF_MONTH { 1, 1, 365, 366}, // DAY_OF_YEAR {/*N/A*/-1,/*N/A*/-1,/*N/A*/-1,/*N/A*/-1}, // DAY_OF_WEEK { -1, -1, 1, 5}, // DAY_OF_WEEK_IN_MONTH {/*N/A*/-1,/*N/A*/-1,/*N/A*/-1,/*N/A*/-1}, // AM_PM {/*N/A*/-1,/*N/A*/-1,/*N/A*/-1,/*N/A*/-1}, // HOUR {/*N/A*/-1,/*N/A*/-1,/*N/A*/-1,/*N/A*/-1}, // HOUR_OF_DAY {/*N/A*/-1,/*N/A*/-1,/*N/A*/-1,/*N/A*/-1}, // MINUTE {/*N/A*/-1,/*N/A*/-1,/*N/A*/-1,/*N/A*/-1}, // SECOND {/*N/A*/-1,/*N/A*/-1,/*N/A*/-1,/*N/A*/-1}, // MILLISECOND {/*N/A*/-1,/*N/A*/-1,/*N/A*/-1,/*N/A*/-1}, // ZONE_OFFSET {/*N/A*/-1,/*N/A*/-1,/*N/A*/-1,/*N/A*/-1}, // DST_OFFSET { -5000000, -5000000, 5000000, 5000000}, // YEAR_WOY {/*N/A*/-1,/*N/A*/-1,/*N/A*/-1,/*N/A*/-1}, // DOW_LOCAL { -5000000, -5000000, 5000000, 5000000}, // EXTENDED_YEAR {/*N/A*/-1,/*N/A*/-1,/*N/A*/-1,/*N/A*/-1}, // JULIAN_DAY {/*N/A*/-1,/*N/A*/-1,/*N/A*/-1,/*N/A*/-1}, // MILLISECONDS_IN_DAY {/*N/A*/-1,/*N/A*/-1,/*N/A*/-1,/*N/A*/-1}, // IS_LEAP_MONTH { 0, 0, 12, 12}, // ORDINAL_MONTH }; //------------------------------------------------------------------------- // Constructors... 
//------------------------------------------------------------------------- CECalendar::CECalendar(const Locale& aLocale, UErrorCode& success) : Calendar(TimeZone::forLocaleOrDefault(aLocale), aLocale, success) { setTimeInMillis(getNow(), success); } CECalendar::CECalendar (const CECalendar& other) : Calendar(other) { } CECalendar::~CECalendar() { } CECalendar& CECalendar::operator=(const CECalendar& right) { Calendar::operator=(right); return *this; } //------------------------------------------------------------------------- // Calendar framework //------------------------------------------------------------------------- int32_t CECalendar::handleComputeMonthStart(int32_t eyear,int32_t emonth, UBool /*useMonth*/) const { return ceToJD(eyear, emonth, 0, getJDEpochOffset()); } int32_t CECalendar::handleGetLimit(UCalendarDateFields field, ELimitType limitType) const { return LIMITS[field][limitType]; } UBool CECalendar::haveDefaultCentury() const { return true; } //------------------------------------------------------------------------- // Calendar system Conversion methods... //------------------------------------------------------------------------- int32_t CECalendar::ceToJD(int32_t year, int32_t month, int32_t date, int32_t jdEpochOffset) { // handle month > 12, < 0 (e.g. 
from add/set) if ( month >= 0 ) { year += month/13; month %= 13; } else { ++month; year += month/13 - 1; month = month%13 + 12; } return (int32_t) ( jdEpochOffset // difference from Julian epoch to 1,1,1 + 365 * year // number of days from years + ClockMath::floorDivide(year, 4) // extra day of leap year + 30 * month // number of days from months (months are 0-based) + date - 1 // number of days for present month (1 based) ); } void CECalendar::jdToCE(int32_t julianDay, int32_t jdEpochOffset, int32_t& year, int32_t& month, int32_t& day) { int32_t c4; // number of 4 year cycle (1461 days) int32_t r4; // remainder of 4 year cycle, always positive c4 = ClockMath::floorDivide(julianDay - jdEpochOffset, 1461, &r4); year = 4 * c4 + (r4/365 - r4/1460); // 4 * + int32_t doy = (r4 == 1460) ? 365 : (r4 % 365); // days in present year month = doy / 30; // 30 -> Coptic/Ethiopic month length up to 12th month day = (doy % 30) + 1; // 1-based days in a month } static const char* kMonthCode13 = "M13"; const char* CECalendar::getTemporalMonthCode(UErrorCode& status) const { if (get(UCAL_MONTH, status) == 12) return kMonthCode13; return Calendar::getTemporalMonthCode(status); } void CECalendar::setTemporalMonthCode(const char* code, UErrorCode& status) { if (U_FAILURE(status)) return; if (uprv_strcmp(code, kMonthCode13) == 0) { set(UCAL_MONTH, 12); set(UCAL_IS_LEAP_MONTH, 0); return; } Calendar::setTemporalMonthCode(code, status); } U_NAMESPACE_END #endif /* #if !UCONFIG_NO_FORMATTING */ //eof stringi/src/icu74/i18n/ulocdata.cpp0000644000176200001440000002666414700200761016545 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ****************************************************************************** * * * Copyright (C) 2003-2016, International Business Machines * * Corporation and others. All Rights Reserved. 
* * * ****************************************************************************** * file name: ulocdata.c * encoding: UTF-8 * tab size: 8 (not used) * indentation:4 * * created on: 2003Oct21 * created by: Ram Viswanadha,John Emmons */ #include "cmemory.h" #include "unicode/ustring.h" #include "unicode/ures.h" #include "unicode/uloc.h" #include "unicode/ulocdata.h" #include "uresimp.h" #include "ureslocs.h" #include "ulocimp.h" #define MEASUREMENT_SYSTEM "MeasurementSystem" #define PAPER_SIZE "PaperSize" /** A locale data object. * For usage in C programs. * @draft ICU 3.4 */ struct ULocaleData { /** * Controls the "No Substitute" behavior of this locale data object */ UBool noSubstitute; /** * Pointer to the resource bundle associated with this locale data object */ UResourceBundle *bundle; /** * Pointer to the lang resource bundle associated with this locale data object */ UResourceBundle *langBundle; }; U_CAPI ULocaleData* U_EXPORT2 ulocdata_open(const char *localeID, UErrorCode *status) { ULocaleData *uld; if (U_FAILURE(*status)) { return nullptr; } uld = (ULocaleData *)uprv_malloc(sizeof(ULocaleData)); if (uld == nullptr) { *status = U_MEMORY_ALLOCATION_ERROR; return(nullptr); } uld->langBundle = nullptr; uld->noSubstitute = false; uld->bundle = ures_open(nullptr, localeID, status); if (U_FAILURE(*status)) { uprv_free(uld); return nullptr; } // ICU-22149: not all functions require lang data, so fail gracefully if it is not present UErrorCode oldStatus = *status; uld->langBundle = ures_open(U_ICUDATA_LANG, localeID, status); if (*status == U_MISSING_RESOURCE_ERROR) { *status = oldStatus; } return uld; } U_CAPI void U_EXPORT2 ulocdata_close(ULocaleData *uld) { if ( uld != nullptr ) { ures_close(uld->langBundle); ures_close(uld->bundle); uprv_free(uld); } } U_CAPI void U_EXPORT2 ulocdata_setNoSubstitute(ULocaleData *uld, UBool setting) { uld->noSubstitute = setting; } U_CAPI UBool U_EXPORT2 ulocdata_getNoSubstitute(ULocaleData *uld) { return uld->noSubstitute; 
}

/**
 * Build the exemplar character set of the requested type for uld's locale.
 *
 * @param uld     locale data object
 * @param fillIn  set to fill, or nullptr to have a new set opened
 * @param options pattern options, combined with USET_IGNORE_SPACE
 * @param extype  which exemplar set to fetch; must index exemplarSetTypes
 * @param status  in/out error code; U_ILLEGAL_ARGUMENT_ERROR for an
 *                out-of-range extype, U_MISSING_RESOURCE_ERROR when only
 *                default data exists and noSubstitute is set
 * @return fillIn (or the newly opened set), or nullptr on failure
 */
U_CAPI USet* U_EXPORT2
ulocdata_getExemplarSet(ULocaleData *uld, USet *fillIn,
                        uint32_t options, ULocaleDataExemplarSetType extype, UErrorCode *status){

    static const char* const exemplarSetTypes[] = { "ExemplarCharacters",
                                                    "AuxExemplarCharacters",
                                                    "ExemplarCharactersIndex",
                                                    "ExemplarCharactersPunctuation"};
    const char16_t *exemplarChars = nullptr;
    int32_t len = 0;
    UErrorCode localStatus = U_ZERO_ERROR;

    if (U_FAILURE(*status))
        return nullptr;

    // Reject selectors that would index past the key table.
    if (static_cast<int32_t>(extype) < 0 ||
            static_cast<int32_t>(extype) >= UPRV_LENGTHOF(exemplarSetTypes)) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return nullptr;
    }

    exemplarChars = ures_getStringByKey(uld->bundle, exemplarSetTypes[extype], &len, &localStatus);
    if ( (localStatus == U_USING_DEFAULT_WARNING) && uld->noSubstitute ) {
        localStatus = U_MISSING_RESOURCE_ERROR;
    }

    if (localStatus != U_ZERO_ERROR) {
        *status = localStatus;
    }

    if (U_FAILURE(*status))
        return nullptr;

    if(fillIn != nullptr)
        uset_applyPattern(fillIn, exemplarChars, len,
                          USET_IGNORE_SPACE | options, status);
    else
        fillIn = uset_openPatternOptions(exemplarChars, len,
                                         USET_IGNORE_SPACE | options, status);

    return fillIn;

}

U_CAPI int32_t U_EXPORT2
ulocdata_getDelimiter(ULocaleData *uld, ULocaleDataDelimiterType type,
                      char16_t *result, int32_t resultLength, UErrorCode *status){

    static const char* const delimiterKeys[] =  {
        "quotationStart",
        "quotationEnd",
        "alternateQuotationStart",
        "alternateQuotationEnd"
    };

    UResourceBundle *delimiterBundle;
    int32_t len = 0;
    const char16_t *delimiter = nullptr;
    UErrorCode localStatus = U_ZERO_ERROR;

    if (U_FAILURE(*status))
        return 0;

    delimiterBundle = ures_getByKey(uld->bundle, "delimiters", nullptr, &localStatus);

    if ( (localStatus == U_USING_DEFAULT_WARNING) && uld->noSubstitute ) {
        localStatus = U_MISSING_RESOURCE_ERROR;
    }

    if (localStatus != U_ZERO_ERROR) {
        *status = localStatus;
    }

    if (U_FAILURE(*status)){
        ures_close(delimiterBundle);
        return 0;
    }

    delimiter = ures_getStringByKeyWithFallback(delimiterBundle, delimiterKeys[type], &len, &localStatus);
    ures_close(delimiterBundle);

    if ( (localStatus == U_USING_DEFAULT_WARNING) && uld->noSubstitute ) {
        localStatus = U_MISSING_RESOURCE_ERROR;
    }

    if
(localStatus != U_ZERO_ERROR) { *status = localStatus; } if (U_FAILURE(*status)){ return 0; } u_strncpy(result,delimiter, resultLength); return len; } static UResourceBundle * measurementTypeBundleForLocale(const char *localeID, const char *measurementType, UErrorCode *status){ char region[ULOC_COUNTRY_CAPACITY]; UResourceBundle *rb; UResourceBundle *measTypeBundle = nullptr; ulocimp_getRegionForSupplementalData(localeID, true, region, ULOC_COUNTRY_CAPACITY, status); rb = ures_openDirect(nullptr, "supplementalData", status); ures_getByKey(rb, "measurementData", rb, status); if (rb != nullptr) { UResourceBundle *measDataBundle = ures_getByKey(rb, region, nullptr, status); if (U_SUCCESS(*status)) { measTypeBundle = ures_getByKey(measDataBundle, measurementType, nullptr, status); } if (*status == U_MISSING_RESOURCE_ERROR) { *status = U_ZERO_ERROR; if (measDataBundle != nullptr) { ures_close(measDataBundle); } measDataBundle = ures_getByKey(rb, "001", nullptr, status); measTypeBundle = ures_getByKey(measDataBundle, measurementType, nullptr, status); } ures_close(measDataBundle); } ures_close(rb); return measTypeBundle; } U_CAPI UMeasurementSystem U_EXPORT2 ulocdata_getMeasurementSystem(const char *localeID, UErrorCode *status){ UResourceBundle* measurement=nullptr; UMeasurementSystem system = UMS_LIMIT; if(status == nullptr || U_FAILURE(*status)){ return system; } measurement = measurementTypeBundleForLocale(localeID, MEASUREMENT_SYSTEM, status); int32_t result = ures_getInt(measurement, status); if (U_SUCCESS(*status)) { system = static_cast(result); } ures_close(measurement); return system; } U_CAPI void U_EXPORT2 ulocdata_getPaperSize(const char* localeID, int32_t *height, int32_t *width, UErrorCode *status){ UResourceBundle* paperSizeBundle = nullptr; const int32_t* paperSize=nullptr; int32_t len = 0; if(status == nullptr || U_FAILURE(*status)){ return; } paperSizeBundle = measurementTypeBundleForLocale(localeID, PAPER_SIZE, status); paperSize = 
ures_getIntVector(paperSizeBundle, &len, status); if(U_SUCCESS(*status)){ if(len < 2){ *status = U_INTERNAL_PROGRAM_ERROR; }else{ *height = paperSize[0]; *width = paperSize[1]; } } ures_close(paperSizeBundle); } U_CAPI void U_EXPORT2 ulocdata_getCLDRVersion(UVersionInfo versionArray, UErrorCode *status) { UResourceBundle *rb = nullptr; rb = ures_openDirect(nullptr, "supplementalData", status); ures_getVersionByKey(rb, "cldrVersion", versionArray, status); ures_close(rb); } U_CAPI int32_t U_EXPORT2 ulocdata_getLocaleDisplayPattern(ULocaleData *uld, char16_t *result, int32_t resultCapacity, UErrorCode *status) { UResourceBundle *patternBundle; int32_t len = 0; const char16_t *pattern = nullptr; UErrorCode localStatus = U_ZERO_ERROR; if (U_FAILURE(*status)) return 0; if (uld->langBundle == nullptr) { *status = U_MISSING_RESOURCE_ERROR; return 0; } patternBundle = ures_getByKey(uld->langBundle, "localeDisplayPattern", nullptr, &localStatus); if ( (localStatus == U_USING_DEFAULT_WARNING) && uld->noSubstitute ) { localStatus = U_MISSING_RESOURCE_ERROR; } if (localStatus != U_ZERO_ERROR) { *status = localStatus; } if (U_FAILURE(*status)){ ures_close(patternBundle); return 0; } pattern = ures_getStringByKey(patternBundle, "pattern", &len, &localStatus); ures_close(patternBundle); if ( (localStatus == U_USING_DEFAULT_WARNING) && uld->noSubstitute ) { localStatus = U_MISSING_RESOURCE_ERROR; } if (localStatus != U_ZERO_ERROR) { *status = localStatus; } if (U_FAILURE(*status)){ return 0; } u_strncpy(result, pattern, resultCapacity); return len; } U_CAPI int32_t U_EXPORT2 ulocdata_getLocaleSeparator(ULocaleData *uld, char16_t *result, int32_t resultCapacity, UErrorCode *status) { UResourceBundle *separatorBundle; int32_t len = 0; const char16_t *separator = nullptr; UErrorCode localStatus = U_ZERO_ERROR; char16_t *p0, *p1; static const char16_t sub0[4] = { 0x007b, 0x0030, 0x007d , 0x0000 }; /* {0} */ static const char16_t sub1[4] = { 0x007b, 0x0031, 0x007d , 0x0000 }; /* {1} */ 
static const int32_t subLen = 3; if (U_FAILURE(*status)) return 0; if (uld->langBundle == nullptr) { *status = U_MISSING_RESOURCE_ERROR; return 0; } separatorBundle = ures_getByKey(uld->langBundle, "localeDisplayPattern", nullptr, &localStatus); if ( (localStatus == U_USING_DEFAULT_WARNING) && uld->noSubstitute ) { localStatus = U_MISSING_RESOURCE_ERROR; } if (localStatus != U_ZERO_ERROR) { *status = localStatus; } if (U_FAILURE(*status)){ ures_close(separatorBundle); return 0; } separator = ures_getStringByKey(separatorBundle, "separator", &len, &localStatus); ures_close(separatorBundle); if ( (localStatus == U_USING_DEFAULT_WARNING) && uld->noSubstitute ) { localStatus = U_MISSING_RESOURCE_ERROR; } if (localStatus != U_ZERO_ERROR) { *status = localStatus; } if (U_FAILURE(*status)){ return 0; } /* For backwards compatibility, if we have a pattern, return the portion between {0} and {1} */ p0=u_strstr(separator, sub0); p1=u_strstr(separator, sub1); if (p0!=nullptr && p1!=nullptr && p0<=p1) { separator = (const char16_t *)p0 + subLen; len = static_cast(p1 - separator); /* Desired separator is no longer zero-terminated; handle that if necessary */ if (len < resultCapacity) { u_strncpy(result, separator, len); result[len] = 0; return len; } } u_strncpy(result, separator, resultCapacity); return len; } stringi/src/icu74/i18n/dtitv_impl.h0000644000176200001440000000656514700200761016567 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ******************************************************************************* * Copyright (C) 2007-2016, International Business Machines Corporation and * others. All Rights Reserved. 
******************************************************************************* * * File DTITV_IMPL.H * ******************************************************************************* */ #ifndef DTITV_IMPL_H__ #define DTITV_IMPL_H__ /** * \file * \brief C++ API: Defines macros for interval format implementation */ #if !UCONFIG_NO_FORMATTING #include "unicode/unistr.h" #define QUOTE ((char16_t)0x0027) #define LOW_LINE ((char16_t)0x005F) #define COLON ((char16_t)0x003A) #define LEFT_CURLY_BRACKET ((char16_t)0x007B) #define RIGHT_CURLY_BRACKET ((char16_t)0x007D) #define SPACE ((char16_t)0x0020) #define EN_DASH ((char16_t)0x2013) #define SOLIDUS ((char16_t)0x002F) #define DIGIT_ZERO ((char16_t)0x0030) #define DIGIT_ONE ((char16_t)0x0031) #define LOW_A ((char16_t)0x0061) #define LOW_B ((char16_t)0x0062) #define LOW_C ((char16_t)0x0063) #define LOW_D ((char16_t)0x0064) #define LOW_E ((char16_t)0x0065) #define LOW_F ((char16_t)0x0066) #define LOW_G ((char16_t)0x0067) #define LOW_H ((char16_t)0x0068) #define LOW_I ((char16_t)0x0069) #define LOW_J ((char16_t)0x006a) #define LOW_K ((char16_t)0x006B) #define LOW_L ((char16_t)0x006C) #define LOW_M ((char16_t)0x006D) #define LOW_N ((char16_t)0x006E) #define LOW_O ((char16_t)0x006F) #define LOW_P ((char16_t)0x0070) #define LOW_Q ((char16_t)0x0071) #define LOW_R ((char16_t)0x0072) #define LOW_S ((char16_t)0x0073) #define LOW_T ((char16_t)0x0074) #define LOW_U ((char16_t)0x0075) #define LOW_V ((char16_t)0x0076) #define LOW_W ((char16_t)0x0077) #define LOW_Y ((char16_t)0x0079) #define LOW_Z ((char16_t)0x007A) #define CAP_A ((char16_t)0x0041) #define CAP_B ((char16_t)0x0042) #define CAP_C ((char16_t)0x0043) #define CAP_D ((char16_t)0x0044) #define CAP_E ((char16_t)0x0045) #define CAP_F ((char16_t)0x0046) #define CAP_G ((char16_t)0x0047) #define CAP_J ((char16_t)0x004A) #define CAP_H ((char16_t)0x0048) #define CAP_K ((char16_t)0x004B) #define CAP_L ((char16_t)0x004C) #define CAP_M ((char16_t)0x004D) #define CAP_O ((char16_t)0x004F) 
#define CAP_Q ((char16_t)0x0051) #define CAP_S ((char16_t)0x0053) #define CAP_T ((char16_t)0x0054) #define CAP_U ((char16_t)0x0055) #define CAP_V ((char16_t)0x0056) #define CAP_W ((char16_t)0x0057) #define CAP_Y ((char16_t)0x0059) #define CAP_Z ((char16_t)0x005A) //#define MINIMUM_SUPPORTED_CALENDAR_FIELD UCAL_MINUTE #define MAX_E_COUNT 5 #define MAX_M_COUNT 5 //#define MAX_INTERVAL_INDEX 4 #define MAX_POSITIVE_INT 56632 #endif /* #if !UCONFIG_NO_FORMATTING */ #endif //eof stringi/src/icu74/i18n/regextxt.cpp0000644000176200001440000000271014700200761016605 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /******************************************************************** * COPYRIGHT: * Copyright (c) 2008-2011, International Business Machines Corporation and * others. All Rights Reserved. ********************************************************************/ // // file: regextxt.cpp // // This file contains utility code for supporting UText in the regular expression engine. 
// #include "unicode/utf.h" #include "regextxt.h" U_NAMESPACE_BEGIN U_CFUNC char16_t U_CALLCONV uregex_utext_unescape_charAt(int32_t offset, void *ct) { struct URegexUTextUnescapeCharContext *context = (struct URegexUTextUnescapeCharContext *)ct; UChar32 c; if (offset == context->lastOffset + 1) { c = UTEXT_NEXT32(context->text); context->lastOffset++; } else if (offset == context->lastOffset) { c = UTEXT_PREVIOUS32(context->text); UTEXT_NEXT32(context->text); } else { utext_moveIndex32(context->text, offset - context->lastOffset - 1); c = UTEXT_NEXT32(context->text); context->lastOffset = offset; } // !!!: Doesn't handle characters outside BMP if (U_IS_BMP(c)) { return (char16_t)c; } else { return 0; } } U_CFUNC char16_t U_CALLCONV uregex_ucstr_unescape_charAt(int32_t offset, void *context) { return ((char16_t *)context)[offset]; } U_NAMESPACE_END stringi/src/icu74/i18n/gregoimp.cpp0000644000176200001440000001503414700200761016547 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ********************************************************************** * Copyright (c) 2003-2008, International Business Machines * Corporation and others. All Rights Reserved. ********************************************************************** * Author: Alan Liu * Created: September 2 2003 * Since: ICU 2.8 ********************************************************************** */ #include "gregoimp.h" #if !UCONFIG_NO_FORMATTING #include "unicode/ucal.h" #include "uresimp.h" #include "cstring.h" #include "uassert.h" U_NAMESPACE_BEGIN int32_t ClockMath::floorDivide(int32_t numerator, int32_t denominator) { return (numerator >= 0) ? numerator / denominator : ((numerator + 1) / denominator) - 1; } int64_t ClockMath::floorDivide(int64_t numerator, int64_t denominator) { return (numerator >= 0) ? 
numerator / denominator : ((numerator + 1) / denominator) - 1; } int32_t ClockMath::floorDivide(double numerator, int32_t denominator, int32_t* remainder) { // For an integer n and representable ⌊x/n⌋, ⌊RN(x/n)⌋=⌊x/n⌋, where RN is // rounding to nearest. double quotient = uprv_floor(numerator / denominator); if (remainder != nullptr) { // For doubles x and n, where n is an integer and ⌊x+n⌋ < 2³¹, the // expression `(int32_t) (x + n)` evaluated with rounding to nearest // differs from ⌊x+n⌋ if 0 < ⌈x⌉−x ≪ x+n, as `x + n` is rounded up to // n+⌈x⌉ = ⌊x+n⌋ + 1. Rewriting it as ⌊x⌋+n makes the addition exact. *remainder = (int32_t) (uprv_floor(numerator) - (quotient * denominator)); } return (int32_t) quotient; } double ClockMath::floorDivide(double dividend, double divisor, double* remainder) { // Only designed to work for positive divisors U_ASSERT(divisor > 0); double quotient = floorDivide(dividend, divisor); double r = dividend - (quotient * divisor); // N.B. For certain large dividends, on certain platforms, there // is a bug such that the quotient is off by one. If you doubt // this to be true, set a breakpoint below and run cintltst. if (r < 0 || r >= divisor) { // E.g. 6.7317038241449352e+022 / 86400000.0 is wrong on my // machine (too high by one). 4.1792057231752762e+024 / // 86400000.0 is wrong the other way (too low). double q = quotient; quotient += (r < 0) ? -1 : +1; if (q == quotient) { // For quotients > ~2^53, we won't be able to add or // subtract one, since the LSB of the mantissa will be > // 2^0; that is, the exponent (base 2) will be larger than // the length, in bits, of the mantissa. In that case, we // can't give a correct answer, so we set the remainder to // zero. This has the desired effect of making extreme // values give back an approximate answer rather than // crashing. For example, UDate values above a ~10^25 // might all have a time of midnight. 
r = 0; } else { r = dividend - (quotient * divisor); } } U_ASSERT(0 <= r && r < divisor); if (remainder != nullptr) { *remainder = r; } return quotient; } const int32_t JULIAN_1_CE = 1721426; // January 1, 1 CE Gregorian const int32_t JULIAN_1970_CE = 2440588; // January 1, 1970 CE Gregorian const int16_t Grego::DAYS_BEFORE[24] = {0,31,59,90,120,151,181,212,243,273,304,334, 0,31,60,91,121,152,182,213,244,274,305,335}; const int8_t Grego::MONTH_LENGTH[24] = {31,28,31,30,31,30,31,31,30,31,30,31, 31,29,31,30,31,30,31,31,30,31,30,31}; double Grego::fieldsToDay(int32_t year, int32_t month, int32_t dom) { int32_t y = year - 1; double julian = 365 * y + ClockMath::floorDivide(y, 4) + (JULIAN_1_CE - 3) + // Julian cal ClockMath::floorDivide(y, 400) - ClockMath::floorDivide(y, 100) + 2 + // => Gregorian cal DAYS_BEFORE[month + (isLeapYear(year) ? 12 : 0)] + dom; // => month/dom return julian - JULIAN_1970_CE; // JD => epoch day } void Grego::dayToFields(double day, int32_t& year, int32_t& month, int32_t& dom, int32_t& dow, int32_t& doy) { // Convert from 1970 CE epoch to 1 CE epoch (Gregorian calendar) day += JULIAN_1970_CE - JULIAN_1_CE; // Convert from the day number to the multiple radix // representation. We use 400-year, 100-year, and 4-year cycles. // For example, the 4-year cycle has 4 years + 1 leap day; giving // 1461 == 365*4 + 1 days. int32_t n400 = ClockMath::floorDivide(day, 146097, &doy); // 400-year cycle length int32_t n100 = ClockMath::floorDivide(doy, 36524, &doy); // 100-year cycle length int32_t n4 = ClockMath::floorDivide(doy, 1461, &doy); // 4-year cycle length int32_t n1 = ClockMath::floorDivide(doy, 365, &doy); year = 400*n400 + 100*n100 + 4*n4 + n1; if (n100 == 4 || n1 == 4) { doy = 365; // Dec 31 at end of 4- or 400-year cycle } else { ++year; } UBool isLeap = isLeapYear(year); // Gregorian day zero is a Monday. dow = (int32_t) uprv_fmod(day + 1, 7); dow += (dow < 0) ? 
(UCAL_SUNDAY + 7) : UCAL_SUNDAY; // Common Julian/Gregorian calculation int32_t correction = 0; int32_t march1 = isLeap ? 60 : 59; // zero-based DOY for March 1 if (doy >= march1) { correction = isLeap ? 1 : 2; } month = (12 * (doy + correction) + 6) / 367; // zero-based month dom = doy - DAYS_BEFORE[month + (isLeap ? 12 : 0)] + 1; // one-based DOM doy++; // one-based doy } void Grego::timeToFields(UDate time, int32_t& year, int32_t& month, int32_t& dom, int32_t& dow, int32_t& doy, int32_t& mid) { double millisInDay; double day = ClockMath::floorDivide((double)time, (double)U_MILLIS_PER_DAY, &millisInDay); mid = (int32_t)millisInDay; dayToFields(day, year, month, dom, dow, doy); } int32_t Grego::dayOfWeek(double day) { int32_t dow; ClockMath::floorDivide(day + int{UCAL_THURSDAY}, 7, &dow); return (dow == 0) ? UCAL_SATURDAY : dow; } int32_t Grego::dayOfWeekInMonth(int32_t year, int32_t month, int32_t dom) { int32_t weekInMonth = (dom + 6)/7; if (weekInMonth == 4) { if (dom + 7 > monthLength(year, month)) { weekInMonth = -1; } } else if (weekInMonth == 5) { weekInMonth = -1; } return weekInMonth; } U_NAMESPACE_END #endif //eof stringi/src/icu74/i18n/collation.cpp0000644000176200001440000001224514700200761016723 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ******************************************************************************* * Copyright (C) 2010-2014, International Business Machines * Corporation and others. All Rights Reserved. ******************************************************************************* * collation.cpp * * created on: 2010oct27 * created by: Markus W. 
Scherer */ #include "unicode/utypes.h" #if !UCONFIG_NO_COLLATION #include "collation.h" #include "uassert.h" U_NAMESPACE_BEGIN uint32_t Collation::incTwoBytePrimaryByOffset(uint32_t basePrimary, UBool isCompressible, int32_t offset) { // Extract the second byte, minus the minimum byte value, // plus the offset, modulo the number of usable byte values, plus the minimum. // Reserve the PRIMARY_COMPRESSION_LOW_BYTE and high byte if necessary. uint32_t primary; if(isCompressible) { offset += ((int32_t)(basePrimary >> 16) & 0xff) - 4; primary = (uint32_t)((offset % 251) + 4) << 16; offset /= 251; } else { offset += ((int32_t)(basePrimary >> 16) & 0xff) - 2; primary = (uint32_t)((offset % 254) + 2) << 16; offset /= 254; } // First byte, assume no further overflow. return primary | ((basePrimary & 0xff000000) + (uint32_t)(offset << 24)); } uint32_t Collation::incThreeBytePrimaryByOffset(uint32_t basePrimary, UBool isCompressible, int32_t offset) { // Extract the third byte, minus the minimum byte value, // plus the offset, modulo the number of usable byte values, plus the minimum. offset += ((int32_t)(basePrimary >> 8) & 0xff) - 2; uint32_t primary = (uint32_t)((offset % 254) + 2) << 8; offset /= 254; // Same with the second byte, // but reserve the PRIMARY_COMPRESSION_LOW_BYTE and high byte if necessary. if(isCompressible) { offset += ((int32_t)(basePrimary >> 16) & 0xff) - 4; primary |= (uint32_t)((offset % 251) + 4) << 16; offset /= 251; } else { offset += ((int32_t)(basePrimary >> 16) & 0xff) - 2; primary |= (uint32_t)((offset % 254) + 2) << 16; offset /= 254; } // First byte, assume no further overflow. return primary | ((basePrimary & 0xff000000) + (uint32_t)(offset << 24)); } uint32_t Collation::decTwoBytePrimaryByOneStep(uint32_t basePrimary, UBool isCompressible, int32_t step) { // Extract the second byte, minus the minimum byte value, // minus the step, modulo the number of usable byte values, plus the minimum. 
// Reserve the PRIMARY_COMPRESSION_LOW_BYTE and high byte if necessary. // Assume no further underflow for the first byte. U_ASSERT(0 < step && step <= 0x7f); int32_t byte2 = ((int32_t)(basePrimary >> 16) & 0xff) - step; if(isCompressible) { if(byte2 < 4) { byte2 += 251; basePrimary -= 0x1000000; } } else { if(byte2 < 2) { byte2 += 254; basePrimary -= 0x1000000; } } return (basePrimary & 0xff000000) | ((uint32_t)byte2 << 16); } uint32_t Collation::decThreeBytePrimaryByOneStep(uint32_t basePrimary, UBool isCompressible, int32_t step) { // Extract the third byte, minus the minimum byte value, // minus the step, modulo the number of usable byte values, plus the minimum. U_ASSERT(0 < step && step <= 0x7f); int32_t byte3 = ((int32_t)(basePrimary >> 8) & 0xff) - step; if(byte3 >= 2) { return (basePrimary & 0xffff0000) | ((uint32_t)byte3 << 8); } byte3 += 254; // Same with the second byte, // but reserve the PRIMARY_COMPRESSION_LOW_BYTE and high byte if necessary. int32_t byte2 = ((int32_t)(basePrimary >> 16) & 0xff) - 1; if(isCompressible) { if(byte2 < 4) { byte2 = 0xfe; basePrimary -= 0x1000000; } } else { if(byte2 < 2) { byte2 = 0xff; basePrimary -= 0x1000000; } } // First byte, assume no further underflow. return (basePrimary & 0xff000000) | ((uint32_t)byte2 << 16) | ((uint32_t)byte3 << 8); } uint32_t Collation::getThreeBytePrimaryForOffsetData(UChar32 c, int64_t dataCE) { uint32_t p = (uint32_t)(dataCE >> 32); // three-byte primary pppppp00 int32_t lower32 = (int32_t)dataCE; // base code point b & step s: bbbbbbss (bit 7: isCompressible) int32_t offset = (c - (lower32 >> 8)) * (lower32 & 0x7f); // delta * increment UBool isCompressible = (lower32 & 0x80) != 0; return Collation::incThreeBytePrimaryByOffset(p, isCompressible, offset); } uint32_t Collation::unassignedPrimaryFromCodePoint(UChar32 c) { // Create a gap before U+0000. Use c=-1 for [first unassigned]. ++c; // Fourth byte: 18 values, every 14th byte value (gap of 13). 
uint32_t primary = 2 + (c % 18) * 14;
    c /= 18;
    // Third byte: 254 values.
    primary |= (2 + (c % 254)) << 8;
    c /= 254;
    // Second byte: 251 values 04..FE excluding the primary compression bytes.
    primary |= (4 + (c % 251)) << 16;
    // One lead byte covers all code points (c < 0x1182B4 = 1*251*254*18).
    return primary | (UNASSIGNED_IMPLICIT_BYTE << 24);
}

U_NAMESPACE_END

#endif  // !UCONFIG_NO_COLLATION
stringi/src/icu74/i18n/reldtfmt.cpp0000644000176200001440000005523514700200761016566 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
* Copyright (C) 2007-2016, International Business Machines Corporation and
* others. All Rights Reserved.
*******************************************************************************
*/

#include "unicode/utypes.h"

#if !UCONFIG_NO_FORMATTING

#include <stdlib.h>

#include "unicode/datefmt.h"
#include "unicode/reldatefmt.h"
#include "unicode/simpleformatter.h"
#include "unicode/smpdtfmt.h"
#include "unicode/udisplaycontext.h"
#include "unicode/uchar.h"
#include "unicode/brkiter.h"
#include "unicode/ucasemap.h"
#include "reldtfmt.h"
#include "cmemory.h"
#include "uresimp.h"

U_NAMESPACE_BEGIN


/**
 * An array of URelativeString structs is used to store the resource data loaded out of the bundle.
*/ struct URelativeString { int32_t offset; /** offset of this item, such as, the relative date **/ int32_t len; /** length of the string **/ const char16_t* string; /** string, or nullptr if not set **/ }; UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RelativeDateFormat) RelativeDateFormat::RelativeDateFormat(const RelativeDateFormat& other) : DateFormat(other), fDateTimeFormatter(nullptr), fDatePattern(other.fDatePattern), fTimePattern(other.fTimePattern), fCombinedFormat(nullptr), fDateStyle(other.fDateStyle), fLocale(other.fLocale), fDatesLen(other.fDatesLen), fDates(nullptr), fCombinedHasDateAtStart(other.fCombinedHasDateAtStart), fCapitalizationInfoSet(other.fCapitalizationInfoSet), fCapitalizationOfRelativeUnitsForUIListMenu(other.fCapitalizationOfRelativeUnitsForUIListMenu), fCapitalizationOfRelativeUnitsForStandAlone(other.fCapitalizationOfRelativeUnitsForStandAlone), fCapitalizationBrkIter(nullptr) { if(other.fDateTimeFormatter != nullptr) { fDateTimeFormatter = other.fDateTimeFormatter->clone(); } if(other.fCombinedFormat != nullptr) { fCombinedFormat = new SimpleFormatter(*other.fCombinedFormat); } if (fDatesLen > 0) { fDates = (URelativeString*) uprv_malloc(sizeof(fDates[0])*(size_t)fDatesLen); uprv_memcpy(fDates, other.fDates, sizeof(fDates[0])*(size_t)fDatesLen); } #if !UCONFIG_NO_BREAK_ITERATION if (other.fCapitalizationBrkIter != nullptr) { fCapitalizationBrkIter = (other.fCapitalizationBrkIter)->clone(); } #endif } RelativeDateFormat::RelativeDateFormat( UDateFormatStyle timeStyle, UDateFormatStyle dateStyle, const Locale& locale, UErrorCode& status) : DateFormat(), fDateTimeFormatter(nullptr), fDatePattern(), fTimePattern(), fCombinedFormat(nullptr), fDateStyle(dateStyle), fLocale(locale), fDatesLen(0), fDates(nullptr), fCombinedHasDateAtStart(false), fCapitalizationInfoSet(false), fCapitalizationOfRelativeUnitsForUIListMenu(false), fCapitalizationOfRelativeUnitsForStandAlone(false), fCapitalizationBrkIter(nullptr) { if(U_FAILURE(status) ) { return; } if 
(dateStyle != UDAT_FULL_RELATIVE && dateStyle != UDAT_LONG_RELATIVE && dateStyle != UDAT_MEDIUM_RELATIVE && dateStyle != UDAT_SHORT_RELATIVE && dateStyle != UDAT_RELATIVE) { status = U_ILLEGAL_ARGUMENT_ERROR; return; } if (timeStyle < UDAT_NONE || timeStyle > UDAT_SHORT) { // don't support other time styles (e.g. relative styles), for now status = U_ILLEGAL_ARGUMENT_ERROR; return; } UDateFormatStyle baseDateStyle = (dateStyle > UDAT_SHORT)? (UDateFormatStyle)(dateStyle & ~UDAT_RELATIVE): dateStyle; DateFormat * df; // Get fDateTimeFormatter from either date or time style (does not matter, we will override the pattern). // We do need to get separate patterns for the date & time styles. if (baseDateStyle != UDAT_NONE) { df = createDateInstance((EStyle)baseDateStyle, locale); fDateTimeFormatter=dynamic_cast(df); if (fDateTimeFormatter == nullptr) { status = U_UNSUPPORTED_ERROR; return; } fDateTimeFormatter->toPattern(fDatePattern); if (timeStyle != UDAT_NONE) { df = createTimeInstance((EStyle)timeStyle, locale); SimpleDateFormat *sdf = dynamic_cast(df); if (sdf != nullptr) { sdf->toPattern(fTimePattern); delete sdf; } } } else { // does not matter whether timeStyle is UDAT_NONE, we need something for fDateTimeFormatter df = createTimeInstance((EStyle)timeStyle, locale); fDateTimeFormatter=dynamic_cast(df); if (fDateTimeFormatter == nullptr) { status = U_UNSUPPORTED_ERROR; delete df; return; } fDateTimeFormatter->toPattern(fTimePattern); } // Initialize the parent fCalendar, so that parse() works correctly. 
initializeCalendar(nullptr, locale, status); loadDates(status); } RelativeDateFormat::~RelativeDateFormat() { delete fDateTimeFormatter; delete fCombinedFormat; uprv_free(fDates); #if !UCONFIG_NO_BREAK_ITERATION delete fCapitalizationBrkIter; #endif } RelativeDateFormat* RelativeDateFormat::clone() const { return new RelativeDateFormat(*this); } bool RelativeDateFormat::operator==(const Format& other) const { if(DateFormat::operator==(other)) { // The DateFormat::operator== check for fCapitalizationContext equality above // is sufficient to check equality of all derived context-related data. // DateFormat::operator== guarantees following cast is safe RelativeDateFormat* that = (RelativeDateFormat*)&other; return (fDateStyle==that->fDateStyle && fDatePattern==that->fDatePattern && fTimePattern==that->fTimePattern && fLocale==that->fLocale ); } return false; } static const char16_t APOSTROPHE = (char16_t)0x0027; UnicodeString& RelativeDateFormat::format( Calendar& cal, UnicodeString& appendTo, FieldPosition& pos) const { UErrorCode status = U_ZERO_ERROR; UnicodeString relativeDayString; UDisplayContext capitalizationContext = getContext(UDISPCTX_TYPE_CAPITALIZATION, status); // calculate the difference, in days, between 'cal' and now. 
int dayDiff = dayDifference(cal, status); // look up string int32_t len = 0; const char16_t *theString = getStringForDay(dayDiff, len, status); if(U_SUCCESS(status) && (theString!=nullptr)) { // found a relative string relativeDayString.setTo(theString, len); } if ( relativeDayString.length() > 0 && !fDatePattern.isEmpty() && (fTimePattern.isEmpty() || fCombinedFormat == nullptr || fCombinedHasDateAtStart)) { #if !UCONFIG_NO_BREAK_ITERATION // capitalize relativeDayString according to context for relative, set formatter no context if ( u_islower(relativeDayString.char32At(0)) && fCapitalizationBrkIter!= nullptr && ( capitalizationContext==UDISPCTX_CAPITALIZATION_FOR_BEGINNING_OF_SENTENCE || (capitalizationContext==UDISPCTX_CAPITALIZATION_FOR_UI_LIST_OR_MENU && fCapitalizationOfRelativeUnitsForUIListMenu) || (capitalizationContext==UDISPCTX_CAPITALIZATION_FOR_STANDALONE && fCapitalizationOfRelativeUnitsForStandAlone) ) ) { // titlecase first word of relativeDayString relativeDayString.toTitle(fCapitalizationBrkIter, fLocale, U_TITLECASE_NO_LOWERCASE | U_TITLECASE_NO_BREAK_ADJUSTMENT); } #endif fDateTimeFormatter->setContext(UDISPCTX_CAPITALIZATION_NONE, status); } else { // set our context for the formatter fDateTimeFormatter->setContext(capitalizationContext, status); } if (fDatePattern.isEmpty()) { fDateTimeFormatter->applyPattern(fTimePattern); fDateTimeFormatter->format(cal,appendTo,pos); } else if (fTimePattern.isEmpty() || fCombinedFormat == nullptr) { if (relativeDayString.length() > 0) { appendTo.append(relativeDayString); } else { fDateTimeFormatter->applyPattern(fDatePattern); fDateTimeFormatter->format(cal,appendTo,pos); } } else { UnicodeString datePattern; if (relativeDayString.length() > 0) { // Need to quote the relativeDayString to make it a legal date pattern relativeDayString.findAndReplace(UNICODE_STRING("'", 1), UNICODE_STRING("''", 2)); // double any existing APOSTROPHE relativeDayString.insert(0, APOSTROPHE); // add APOSTROPHE at beginning... 
relativeDayString.append(APOSTROPHE); // and at end datePattern.setTo(relativeDayString); } else { datePattern.setTo(fDatePattern); } UnicodeString combinedPattern; fCombinedFormat->format(fTimePattern, datePattern, combinedPattern, status); fDateTimeFormatter->applyPattern(combinedPattern); fDateTimeFormatter->format(cal,appendTo,pos); } return appendTo; } UnicodeString& RelativeDateFormat::format(const Formattable& obj, UnicodeString& appendTo, FieldPosition& pos, UErrorCode& status) const { // this is just here to get around the hiding problem // (the previous format() override would hide the version of // format() on DateFormat that this function correspond to, so we // have to redefine it here) return DateFormat::format(obj, appendTo, pos, status); } void RelativeDateFormat::parse( const UnicodeString& text, Calendar& cal, ParsePosition& pos) const { int32_t startIndex = pos.getIndex(); if (fDatePattern.isEmpty()) { // no date pattern, try parsing as time fDateTimeFormatter->applyPattern(fTimePattern); fDateTimeFormatter->parse(text,cal,pos); } else if (fTimePattern.isEmpty() || fCombinedFormat == nullptr) { // no time pattern or way to combine, try parsing as date // first check whether text matches a relativeDayString UBool matchedRelative = false; for (int n=0; n < fDatesLen && !matchedRelative; n++) { if (fDates[n].string != nullptr && text.compare(startIndex, fDates[n].len, fDates[n].string) == 0) { // it matched, handle the relative day string UErrorCode status = U_ZERO_ERROR; matchedRelative = true; // Set the calendar to now+offset cal.setTime(Calendar::getNow(),status); cal.add(UCAL_DATE,fDates[n].offset, status); if(U_FAILURE(status)) { // failure in setting calendar field, set offset to beginning of rel day string pos.setErrorIndex(startIndex); } else { pos.setIndex(startIndex + fDates[n].len); } } } if (!matchedRelative) { // just parse as normal date fDateTimeFormatter->applyPattern(fDatePattern); fDateTimeFormatter->parse(text,cal,pos); } } else 
{ // Here we replace any relativeDayString in text with the equivalent date // formatted per fDatePattern, then parse text normally using the combined pattern. UnicodeString modifiedText(text); FieldPosition fPos; int32_t dateStart = 0, origDateLen = 0, modDateLen = 0; UErrorCode status = U_ZERO_ERROR; for (int n=0; n < fDatesLen; n++) { int32_t relativeStringOffset; if (fDates[n].string != nullptr && (relativeStringOffset = modifiedText.indexOf(fDates[n].string, fDates[n].len, startIndex)) >= startIndex) { // it matched, replace the relative date with a real one for parsing UnicodeString dateString; Calendar * tempCal = cal.clone(); // Set the calendar to now+offset tempCal->setTime(Calendar::getNow(),status); tempCal->add(UCAL_DATE,fDates[n].offset, status); if(U_FAILURE(status)) { pos.setErrorIndex(startIndex); delete tempCal; return; } fDateTimeFormatter->applyPattern(fDatePattern); fDateTimeFormatter->format(*tempCal, dateString, fPos); dateStart = relativeStringOffset; origDateLen = fDates[n].len; modDateLen = dateString.length(); modifiedText.replace(dateStart, origDateLen, dateString); delete tempCal; break; } } UnicodeString combinedPattern; fCombinedFormat->format(fTimePattern, fDatePattern, combinedPattern, status); fDateTimeFormatter->applyPattern(combinedPattern); fDateTimeFormatter->parse(modifiedText,cal,pos); // Adjust offsets UBool noError = (pos.getErrorIndex() < 0); int32_t offset = (noError)? pos.getIndex(): pos.getErrorIndex(); if (offset >= dateStart + modDateLen) { // offset at or after the end of the replaced text, // correct by the difference between original and replacement offset -= (modDateLen - origDateLen); } else if (offset >= dateStart) { // offset in the replaced text, set it to the beginning of that text // (i.e. 
the beginning of the relative day string) offset = dateStart; } if (noError) { pos.setIndex(offset); } else { pos.setErrorIndex(offset); } } } UDate RelativeDateFormat::parse( const UnicodeString& text, ParsePosition& pos) const { // redefined here because the other parse() function hides this function's // counterpart on DateFormat return DateFormat::parse(text, pos); } UDate RelativeDateFormat::parse(const UnicodeString& text, UErrorCode& status) const { // redefined here because the other parse() function hides this function's // counterpart on DateFormat return DateFormat::parse(text, status); } const char16_t *RelativeDateFormat::getStringForDay(int32_t day, int32_t &len, UErrorCode &status) const { if(U_FAILURE(status)) { return nullptr; } // Is it inside the resource bundle's range? int n = day + UDAT_DIRECTION_THIS; if (n >= 0 && n < fDatesLen) { if (fDates[n].offset == day && fDates[n].string != nullptr) { len = fDates[n].len; return fDates[n].string; } } return nullptr; // not found. 
} UnicodeString& RelativeDateFormat::toPattern(UnicodeString& result, UErrorCode& status) const { if (!U_FAILURE(status)) { result.remove(); if (fDatePattern.isEmpty()) { result.setTo(fTimePattern); } else if (fTimePattern.isEmpty() || fCombinedFormat == nullptr) { result.setTo(fDatePattern); } else { fCombinedFormat->format(fTimePattern, fDatePattern, result, status); } } return result; } UnicodeString& RelativeDateFormat::toPatternDate(UnicodeString& result, UErrorCode& status) const { if (!U_FAILURE(status)) { result.remove(); result.setTo(fDatePattern); } return result; } UnicodeString& RelativeDateFormat::toPatternTime(UnicodeString& result, UErrorCode& status) const { if (!U_FAILURE(status)) { result.remove(); result.setTo(fTimePattern); } return result; } void RelativeDateFormat::applyPatterns(const UnicodeString& datePattern, const UnicodeString& timePattern, UErrorCode &status) { if (!U_FAILURE(status)) { fDatePattern.setTo(datePattern); fTimePattern.setTo(timePattern); } } const DateFormatSymbols* RelativeDateFormat::getDateFormatSymbols() const { return fDateTimeFormatter->getDateFormatSymbols(); } // override the DateFormat implementation in order to // lazily initialize relevant items void RelativeDateFormat::setContext(UDisplayContext value, UErrorCode& status) { DateFormat::setContext(value, status); if (U_SUCCESS(status)) { if (!fCapitalizationInfoSet && (value==UDISPCTX_CAPITALIZATION_FOR_UI_LIST_OR_MENU || value==UDISPCTX_CAPITALIZATION_FOR_STANDALONE)) { initCapitalizationContextInfo(fLocale); fCapitalizationInfoSet = true; } #if !UCONFIG_NO_BREAK_ITERATION if ( fCapitalizationBrkIter == nullptr && (value==UDISPCTX_CAPITALIZATION_FOR_BEGINNING_OF_SENTENCE || (value==UDISPCTX_CAPITALIZATION_FOR_UI_LIST_OR_MENU && fCapitalizationOfRelativeUnitsForUIListMenu) || (value==UDISPCTX_CAPITALIZATION_FOR_STANDALONE && fCapitalizationOfRelativeUnitsForStandAlone)) ) { status = U_ZERO_ERROR; fCapitalizationBrkIter = 
BreakIterator::createSentenceInstance(fLocale, status); if (U_FAILURE(status)) { delete fCapitalizationBrkIter; fCapitalizationBrkIter = nullptr; } } #endif } } void RelativeDateFormat::initCapitalizationContextInfo(const Locale& thelocale) { #if !UCONFIG_NO_BREAK_ITERATION const char * localeID = (thelocale != nullptr)? thelocale.getBaseName(): nullptr; UErrorCode status = U_ZERO_ERROR; LocalUResourceBundlePointer rb(ures_open(nullptr, localeID, &status)); ures_getByKeyWithFallback(rb.getAlias(), "contextTransforms/relative", rb.getAlias(), &status); if (U_SUCCESS(status) && rb != nullptr) { int32_t len = 0; const int32_t * intVector = ures_getIntVector(rb.getAlias(), &len, &status); if (U_SUCCESS(status) && intVector != nullptr && len >= 2) { fCapitalizationOfRelativeUnitsForUIListMenu = static_cast(intVector[0]); fCapitalizationOfRelativeUnitsForStandAlone = static_cast(intVector[1]); } } #endif } namespace { /** * Sink for getting data from fields/day/relative data. * For loading relative day names, e.g., "yesterday", "today". */ struct RelDateFmtDataSink : public ResourceSink { URelativeString *fDatesPtr; int32_t fDatesLen; RelDateFmtDataSink(URelativeString* fDates, int32_t len) : fDatesPtr(fDates), fDatesLen(len) { for (int32_t i = 0; i < fDatesLen; ++i) { fDatesPtr[i].offset = 0; fDatesPtr[i].string = nullptr; fDatesPtr[i].len = -1; } } virtual ~RelDateFmtDataSink(); virtual void put(const char *key, ResourceValue &value, UBool /*noFallback*/, UErrorCode &errorCode) override { ResourceTable relDayTable = value.getTable(errorCode); int32_t n = 0; int32_t len = 0; for (int32_t i = 0; relDayTable.getKeyAndValue(i, key, value); ++i) { // Find the relative offset. int32_t offset = atoi(key); // Put in the proper spot, but don't override existing data. n = offset + UDAT_DIRECTION_THIS; // Converts to index in UDAT_R if (n < fDatesLen && fDatesPtr[n].string == nullptr) { // Not found and n is an empty slot. 
fDatesPtr[n].offset = offset; fDatesPtr[n].string = value.getString(len, errorCode); fDatesPtr[n].len = len; } } } }; // Virtual destructors must be defined out of line. RelDateFmtDataSink::~RelDateFmtDataSink() {} } // Namespace static const char16_t patItem1[] = {0x7B,0x31,0x7D}; // "{1}" static const int32_t patItem1Len = 3; void RelativeDateFormat::loadDates(UErrorCode &status) { UResourceBundle *rb = ures_open(nullptr, fLocale.getBaseName(), &status); LocalUResourceBundlePointer dateTimePatterns( ures_getByKeyWithFallback(rb, "calendar/gregorian/DateTimePatterns", (UResourceBundle*)nullptr, &status)); if(U_SUCCESS(status)) { int32_t patternsSize = ures_getSize(dateTimePatterns.getAlias()); if (patternsSize > kDateTime) { int32_t resStrLen = 0; int32_t glueIndex = kDateTime; if (patternsSize >= (kDateTimeOffset + kShort + 1)) { int32_t offsetIncrement = (fDateStyle & ~kRelative); // Remove relative bit. if (offsetIncrement >= (int32_t)kFull && offsetIncrement <= (int32_t)kShortRelative) { glueIndex = kDateTimeOffset + offsetIncrement; } } const char16_t *resStr = ures_getStringByIndex(dateTimePatterns.getAlias(), glueIndex, &resStrLen, &status); if (U_SUCCESS(status) && resStrLen >= patItem1Len && u_strncmp(resStr,patItem1,patItem1Len)==0) { fCombinedHasDateAtStart = true; } fCombinedFormat = new SimpleFormatter(UnicodeString(true, resStr, resStrLen), 2, 2, status); } } // Data loading for relative names, e.g., "yesterday", "today", "tomorrow". fDatesLen = UDAT_DIRECTION_COUNT; // Maximum defined by data. fDates = (URelativeString*) uprv_malloc(sizeof(fDates[0])*fDatesLen); RelDateFmtDataSink sink(fDates, fDatesLen); ures_getAllItemsWithFallback(rb, "fields/day/relative", sink, status); ures_close(rb); if(U_FAILURE(status)) { fDatesLen=0; return; } } //---------------------------------------------------------------------- // this should to be in DateFormat, instead it was copied from SimpleDateFormat. 
Calendar* RelativeDateFormat::initializeCalendar(TimeZone* adoptZone, const Locale& locale, UErrorCode& status) { if(!U_FAILURE(status)) { fCalendar = Calendar::createInstance(adoptZone?adoptZone:TimeZone::createDefault(), locale, status); } if (U_SUCCESS(status) && fCalendar == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; } return fCalendar; } int32_t RelativeDateFormat::dayDifference(Calendar &cal, UErrorCode &status) { if(U_FAILURE(status)) { return 0; } // TODO: Cache the nowCal to avoid heap allocs? Would be difficult, don't know the calendar type Calendar *nowCal = cal.clone(); nowCal->setTime(Calendar::getNow(), status); // For the day difference, we are interested in the difference in the (modified) julian day number // which is midnight to midnight. Using fieldDifference() is NOT correct here, because // 6pm Jan 4th to 10am Jan 5th should be considered "tomorrow". int32_t dayDiff = cal.get(UCAL_JULIAN_DAY, status) - nowCal->get(UCAL_JULIAN_DAY, status); delete nowCal; return dayDiff; } U_NAMESPACE_END #endif /* !UCONFIG_NO_FORMATTING */ stringi/src/icu74/i18n/numrange_capi.cpp0000644000176200001440000001430314700200761017544 0ustar liggesusers// © 2018 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html #include "unicode/utypes.h" #if !UCONFIG_NO_FORMATTING // Allow implicit conversion from char16_t* to UnicodeString for this file: // Helpful in toString methods and elsewhere. #define UNISTR_FROM_STRING_EXPLICIT #include "fphdlimp.h" #include "number_utypes.h" #include "numparse_types.h" #include "formattedval_impl.h" #include "numrange_impl.h" #include "number_decnum.h" #include "unicode/numberrangeformatter.h" #include "unicode/unumberrangeformatter.h" using namespace icu; using namespace icu::number; using namespace icu::number::impl; U_NAMESPACE_BEGIN namespace number { namespace impl { /** * Implementation class for UNumberRangeFormatter. Wraps a LocalizedRangeNumberFormatter. 
*/
// NOTE(review): IcuCApiHelper appears below without template arguments (here and in the
// typedef further down); they may have been lost when this text was extracted — confirm
// against the upstream ICU file before compiling.
struct UNumberRangeFormatterData : public UMemory,
        // Magic number as ASCII == "NRF" (NumberRangeFormatter)
        public IcuCApiHelper {
    // The C++ formatter that the C API functions in this file delegate to.
    LocalizedNumberRangeFormatter fFormatter;
};

struct UFormattedNumberRangeImpl;

// Magic number as ASCII == "FDN" (FormatteDNumber)
typedef IcuCApiHelper UFormattedNumberRangeApiHelper;

/**
 * Backing object for a C UFormattedNumberRange handle: owns the formatted data (fData)
 * and a FormattedNumberRange (fImpl) that is constructed pointing at that data.
 */
struct UFormattedNumberRangeImpl : public UFormattedValueImpl, public UFormattedNumberRangeApiHelper {
    UFormattedNumberRangeImpl();
    ~UFormattedNumberRangeImpl();

    FormattedNumberRange fImpl;
    UFormattedNumberRangeData fData;
};

// Points fImpl at the embedded fData and publishes fImpl as this object's formatted value.
UFormattedNumberRangeImpl::UFormattedNumberRangeImpl()
        : fImpl(&fData) {
    fFormattedValue = &fImpl;
}

UFormattedNumberRangeImpl::~UFormattedNumberRangeImpl() {
    // Disown the data from fImpl so it doesn't get deleted twice
    fImpl.fData = nullptr;
}

} // namespace impl
} // namespace number
U_NAMESPACE_END


UPRV_FORMATTED_VALUE_CAPI_NO_IMPLTYPE_AUTO_IMPL(
    UFormattedNumberRange,
    UFormattedNumberRangeImpl,
    UFormattedNumberRangeApiHelper,
    unumrf)


/**
 * Validates a C result handle and exposes its formatted data.
 *
 * @param uresult Handle to validate.
 * @param status  In/out error code, passed to the helper's validate().
 * @return Pointer to the handle's fData, or nullptr if status indicates failure.
 */
const UFormattedNumberRangeData* number::impl::validateUFormattedNumberRange(
        const UFormattedNumberRange* uresult, UErrorCode& status) {
    auto* result = UFormattedNumberRangeApiHelper::validate(uresult, status);
    if (U_FAILURE(status)) {
        return nullptr;
    }
    return &result->fData;
}


// Creates a range formatter from a number skeleton, collapse and identity-fallback options
// and a locale. Parse errors go to *perror when it is non-null, otherwise to a local
// UParseError that is discarded. Returns nullptr (with *ec set) only when the
// implementation object itself cannot be allocated.
U_CAPI UNumberRangeFormatter* U_EXPORT2
unumrf_openForSkeletonWithCollapseAndIdentityFallback(
        const char16_t* skeleton,
        int32_t skeletonLen,
        UNumberRangeCollapse collapse,
        UNumberRangeIdentityFallback identityFallback,
        const char* locale,
        UParseError* perror,
        UErrorCode* ec) {
    auto* impl = new UNumberRangeFormatterData();
    if (impl == nullptr) {
        *ec = U_MEMORY_ALLOCATION_ERROR;
        return nullptr;
    }
    // Readonly-alias constructor (first argument is whether we are NUL-terminated)
    UnicodeString skeletonString(skeletonLen == -1, skeleton, skeletonLen);
    UParseError tempParseError;
    impl->fFormatter = NumberRangeFormatter::withLocale(locale)
        .numberFormatterBoth(NumberFormatter::forSkeleton(skeletonString, (perror == nullptr) ?
tempParseError : *perror, *ec))
        .collapse(collapse)
        .identityFallback(identityFallback);
    return impl->exportForC();
}

/**
 * Formats the range [first, second] given as doubles into uresult.
 *
 * @param uformatter Formatter handle; validated before use.
 * @param first      First number of the range.
 * @param second     Second number of the range.
 * @param uresult    Result handle; validated before use, then reset and overwritten.
 * @param ec         In/out error code. If either handle fails validation the function
 *                   returns without touching the result.
 */
U_CAPI void U_EXPORT2
unumrf_formatDoubleRange(
        const UNumberRangeFormatter* uformatter,
        double first,
        double second,
        UFormattedNumberRange* uresult,
        UErrorCode* ec) {
    const UNumberRangeFormatterData* formatter = UNumberRangeFormatterData::validate(uformatter, *ec);
    auto* result = UFormattedNumberRangeApiHelper::validate(uresult, *ec);
    if (U_FAILURE(*ec)) { return; }

    result->fData.resetString();
    result->fData.quantity1.clear();
    result->fData.quantity2.clear();
    result->fData.quantity1.setToDouble(first);
    result->fData.quantity2.setToDouble(second);
    // The flag tells formatImpl whether the two inputs are equal as supplied.
    formatter->fFormatter.formatImpl(result->fData, first == second, *ec);
}

/**
 * Formats the range [first, second] given as decimal number strings into uresult.
 *
 * @param uformatter Formatter handle; validated before use.
 * @param first      Characters of the first decimal number.
 * @param firstLen   Length passed together with first to setToDecNumber().
 * @param second     Characters of the second decimal number.
 * @param secondLen  Length passed together with second to setToDecNumber().
 * @param uresult    Result handle; validated before use, then reset and overwritten.
 * @param ec         In/out error code. If either handle fails validation the function
 *                   returns without touching the result.
 */
U_CAPI void U_EXPORT2
unumrf_formatDecimalRange(
        const UNumberRangeFormatter* uformatter,
        const char* first,
        int32_t firstLen,
        const char* second,
        int32_t secondLen,
        UFormattedNumberRange* uresult,
        UErrorCode* ec) {
    const UNumberRangeFormatterData* formatter = UNumberRangeFormatterData::validate(uformatter, *ec);
    auto* result = UFormattedNumberRangeApiHelper::validate(uresult, *ec);
    if (U_FAILURE(*ec)) { return; }

    result->fData.resetString();
    result->fData.quantity1.clear();
    result->fData.quantity2.clear();
    result->fData.quantity1.setToDecNumber({first, firstLen}, *ec);
    result->fData.quantity2.setToDecNumber({second, secondLen}, *ec);
    // Compare the parsed numbers, not the caller's buffers: `first == second` on two
    // `const char*` only tests pointer identity, so equal decimals held in different
    // buffers were reported as unequal inputs.
    const bool inputsEqual = (result->fData.quantity1 == result->fData.quantity2);
    formatter->fFormatter.formatImpl(result->fData, inputsEqual, *ec);
}

/**
 * Returns the identity result recorded in a formatted range.
 *
 * @param uresult Result handle; validated before use.
 * @param ec      In/out error code.
 * @return The stored identityResult, or UNUM_IDENTITY_RESULT_COUNT if validation fails.
 */
U_CAPI UNumberRangeIdentityResult U_EXPORT2
unumrf_resultGetIdentityResult(
        const UFormattedNumberRange* uresult,
        UErrorCode* ec) {
    auto* result = UFormattedNumberRangeApiHelper::validate(uresult, *ec);
    if (U_FAILURE(*ec)) {
        return UNUM_IDENTITY_RESULT_COUNT;
    }
    return result->fData.identityResult;
}

U_CAPI int32_t U_EXPORT2
unumrf_resultGetFirstDecimalNumber(
        const UFormattedNumberRange* uresult,
        char* dest,
        int32_t destCapacity,
        UErrorCode* ec) {
    const auto* result = UFormattedNumberRangeApiHelper::validate(uresult, *ec);
    if
(U_FAILURE(*ec)) {
        return 0;
    }
    DecNum decnum;
    return result->fData.quantity1.toDecNum(decnum, *ec)
        .toCharString(*ec)
        .extract(dest, destCapacity, *ec);
}

/**
 * Writes the second number of a formatted range to dest as a decimal number string.
 *
 * @param uresult      Result handle; validated before use.
 * @param dest         Output buffer handed to extract().
 * @param destCapacity Capacity of dest handed to extract().
 * @param ec           In/out error code.
 * @return 0 if validation fails; otherwise whatever extract() returns
 *         (presumably the string length — confirm against CharString::extract).
 */
U_CAPI int32_t U_EXPORT2
unumrf_resultGetSecondDecimalNumber(
        const UFormattedNumberRange* uresult,
        char* dest,
        int32_t destCapacity,
        UErrorCode* ec) {
    const auto* result = UFormattedNumberRangeApiHelper::validate(uresult, *ec);
    if (U_FAILURE(*ec)) {
        return 0;
    }
    DecNum decnum;
    return result->fData.quantity2
        .toDecNum(decnum, *ec)
        .toCharString(*ec)
        .extract(dest, destCapacity, *ec);
}

/**
 * Destroys a formatter created by this API.
 *
 * The handle is validated with a local status that is then discarded, so an invalid
 * handle is not reported to the caller.
 * NOTE(review): this relies on validate() yielding nullptr for a null/invalid handle so
 * that `delete` is a no-op — confirm in IcuCApiHelper.
 */
U_CAPI void U_EXPORT2
unumrf_close(UNumberRangeFormatter* f) {
    UErrorCode localStatus = U_ZERO_ERROR;
    const UNumberRangeFormatterData* impl = UNumberRangeFormatterData::validate(f, localStatus);
    delete impl;
}


#endif /* #if !UCONFIG_NO_FORMATTING */
stringi/src/icu74/i18n/scientificnumberformatter.cpp0000644000176200001440000002304214700200761022211 0ustar liggesusers// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
**********************************************************************
* Copyright (c) 2014, International Business Machines
* Corporation and others.  All Rights Reserved.
**********************************************************************
*/
#include "unicode/utypes.h"

#if !UCONFIG_NO_FORMATTING

#include "unicode/scientificnumberformatter.h"
#include "unicode/dcfmtsym.h"
#include "unicode/fpositer.h"
#include "unicode/utf16.h"
#include "unicode/uniset.h"
#include "unicode/decimfmt.h"
#include "static_unicode_sets.h"

U_NAMESPACE_BEGIN

// Superscript code units indexed by digit value 0..9 (used by copyAsSuperscript below).
static const char16_t kSuperscriptDigits[] = {
        0x2070,
        0xB9,
        0xB2,
        0xB3,
        0x2074,
        0x2075,
        0x2076,
        0x2077,
        0x2078,
        0x2079};

static const char16_t kSuperscriptPlusSign = 0x207A;
static const char16_t kSuperscriptMinusSign = 0x207B;

/**
 * Appends to result the superscript form of every digit in s[beginIndex, endIndex).
 *
 * The range is walked one code point at a time; each code point's digit value selects
 * the entry of kSuperscriptDigits to append.
 *
 * @param s          Source text.
 * @param beginIndex Index of the first code unit to convert.
 * @param endIndex   Index one past the last code unit to convert.
 * @param result     String appended to; digits converted before a failure stay appended.
 * @param status     In/out error code. Set to U_INVALID_CHAR_FOUND when a code point has
 *                   no digit value (u_charDigitValue() < 0).
 * @return true on success; false if status indicated failure on entry or a non-digit
 *         was encountered.
 */
static UBool copyAsSuperscript(
        const UnicodeString &s,
        int32_t beginIndex,
        int32_t endIndex,
        UnicodeString &result,
        UErrorCode &status) {
    if (U_FAILURE(status)) {
        return false;
    }
    for (int32_t i = beginIndex; i < endIndex;) {
        UChar32 c = s.char32At(i);
        int32_t digit = u_charDigitValue(c);
        if (digit < 0) {
            status = U_INVALID_CHAR_FOUND;
            return false;
        }
        result.append(kSuperscriptDigits[digit]);
        // Advance by one or two code units depending on the code point just read.
        i += U16_LENGTH(c);
    }
    return true;
}

// Builds a formatter around fmtToAdopt using a newly allocated SuperscriptStyle.
ScientificNumberFormatter *ScientificNumberFormatter::createSuperscriptInstance(
            DecimalFormat *fmtToAdopt, UErrorCode &status) {
    return createInstance(fmtToAdopt, new SuperscriptStyle(), status);
}

// Same as above, but the number format comes from
// DecimalFormat::createScientificInstance(locale, status).
// NOTE(review): the static_cast below has no visible target type; it may have been lost
// when this text was extracted — confirm against the upstream ICU file.
ScientificNumberFormatter *ScientificNumberFormatter::createSuperscriptInstance(
            const Locale &locale, UErrorCode &status) {
    return createInstance(
            static_cast(
                    DecimalFormat::createScientificInstance(locale, status)),
            new SuperscriptStyle(),
            status);
}

// Builds a formatter around fmtToAdopt using a newly allocated MarkupStyle constructed
// from beginMarkup and endMarkup.
ScientificNumberFormatter *ScientificNumberFormatter::createMarkupInstance(
        DecimalFormat *fmtToAdopt,
        const UnicodeString &beginMarkup,
        const UnicodeString &endMarkup,
        UErrorCode &status) {
    return createInstance(
            fmtToAdopt,
            new MarkupStyle(beginMarkup, endMarkup),
            status);
}

ScientificNumberFormatter *ScientificNumberFormatter::createMarkupInstance(
        const Locale &locale,
        const UnicodeString &beginMarkup,
        const UnicodeString &endMarkup,
        UErrorCode &status) {
    return createInstance(
            static_cast(
DecimalFormat::createScientificInstance(locale, status)), new MarkupStyle(beginMarkup, endMarkup), status); } ScientificNumberFormatter *ScientificNumberFormatter::createInstance( DecimalFormat *fmtToAdopt, Style *styleToAdopt, UErrorCode &status) { LocalPointer fmt(fmtToAdopt); LocalPointer