stringi/0000755000175100001440000000000012612241663011752 5ustar hornikusersstringi/inst/0000755000175100001440000000000012612166246012732 5ustar hornikusersstringi/inst/CITATION0000644000175100001440000000103712543603515014066 0ustar hornikuserscitHeader("To cite package 'stringi' in publications use:") citEntry(entry = "Manual", title = "R package stringi: Character string processing facilities", author = personList(as.person("Marek Gagolewski"), as.person("Bartek Tartanus")), year = "2015", url = "http://stringi.rexamine.com/", doi = "10.5281/zenodo.19071", textVersion = paste("Gagolewski M., Tartanus B. (2015).", "R package stringi: Character string processing facilities.", "http://stringi.rexamine.com/.", "DOI:10.5281/zenodo.19071") ) stringi/inst/AUTHORS0000644000175100001440000000316512507305453014005 0ustar hornikusers** stringi authors and contributors ** * Marek Gagolewski (gagolews at rexamine dot com) [aut,cre] * Bartek Tartanus (tartanus at rexamine dot com) [aut] * Marcin Bujarski [ctb] ** ICU source code contributors ** The overwhelming majority of all ICU4C code has been contributed by IBM employees, or by people under contract to IBM. The following lists the contributions from non-IBMers. Except for some minor contributions, these are all covered by Joint Copyright Assignment letters between IBM and the contributors. * Apple Computer * Open Forum of Cambodia, represented by Javier Sola * Software Ventures, Inc., represented by Scott Duchin * Ge'ez Frontier Foundation, represented by Daniel Yacob * Language Analysis Systems, Inc., represented by Richard T. Gillam and Leonard A. Shaefer * PalmSource, Inc., represented by Vivek Magotra; contribution by Ken Krugler under PalmSource contract * Department of Information Technology (DIT) - Royal Government of Bhutan, represented by Pema Geyleg * Adobe Systems Incorporated, represented by Niti Hantaweepant * Yahoo! 
Inc., represented by Badi Kumar Sudhakaran and Tex Texin * Dominic Ludlam * Jonas Utterstrom * Yves Arrouye * Carl Brown * Robert Buck * Sean Hunter * Michael Lecuyer * and others for more details see: http://source.icu-project.org/repos/icu/icuhtml/trunk/legal/contributions/code_contributions.html as well as the source files in the package's src/icu55/ directory and http://site.icu-project.org/ for general information on the ICU project. ** Unicode Character Database ** UCD is provided and maintained by Unicode, Inc. See http://www.unicode.org/copyright.html stringi/src/0000755000175100001440000000000012612166245012543 5ustar hornikusersstringi/src/stri_search_fixed_extract.cpp0000644000175100001440000002042512612166246020472 0ustar hornikusers/* This file is part of the 'stringi' package for R. * Copyright (C) 2013-2015, Marek Gagolewski and Bartek Tartanus * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "stri_stringi.h" #include "stri_container_utf8.h" #include "stri_container_bytesearch.h" #include #include using namespace std; /** * Extract first or last occurrences of pattern in a string [exact byte search] * * @param str character vector * @param pattern character vector * @param first looking for first or last match? * @return character vector * * @version 0.1-?? (Marek Gagolewski, 2013-06-24) * * @version 0.2-3 (Marek Gagolewski, 2014-05-08) * stri_extract_fixed now uses byte search only * * @version 0.4-1 (Marek Gagolewski, 2014-12-08) * new args: opts_fixed * * @version 0.5-1 (Marek Gagolewski, 2015-02-14) * use StriByteSearchMatcher */ SEXP stri__extract_firstlast_fixed(SEXP str, SEXP pattern, SEXP opts_fixed, bool first) { uint32_t pattern_flags = StriContainerByteSearch::getByteSearchFlags(opts_fixed); PROTECT(str = stri_prepare_arg_string(str, "str")); // prepare string argument PROTECT(pattern = stri_prepare_arg_string(pattern, "pattern")); // prepare string argument STRI__ERROR_HANDLER_BEGIN(2) int vectorize_length = stri__recycling_rule(true, 2, LENGTH(str), LENGTH(pattern)); StriContainerUTF8 str_cont(str, vectorize_length); StriContainerByteSearch pattern_cont(pattern, vectorize_length, pattern_flags); SEXP ret; STRI__PROTECT(ret = Rf_allocVector(STRSXP, vectorize_length)); for (R_len_t i = pattern_cont.vectorize_init(); i != pattern_cont.vectorize_end(); i = pattern_cont.vectorize_next(i)) { 
STRI__CONTINUE_ON_EMPTY_OR_NA_STR_PATTERN(str_cont, pattern_cont, SET_STRING_ELT(ret, i, NA_STRING);, SET_STRING_ELT(ret, i, NA_STRING);) StriByteSearchMatcher* matcher = pattern_cont.getMatcher(i); matcher->reset(str_cont.get(i).c_str(), str_cont.get(i).length()); int start, len; if (first) { start = matcher->findFirst(); } else { start = matcher->findLast(); } if (start == USEARCH_DONE) { SET_STRING_ELT(ret, i, NA_STRING); continue; } len = matcher->getMatchedLength(); SET_STRING_ELT(ret, i, Rf_mkCharLenCE(str_cont.get(i).c_str()+start, len, CE_UTF8)); } STRI__UNPROTECT_ALL return ret; STRI__ERROR_HANDLER_END({ /* no-op */ }) } /** * Extract first occurrence of a fixed pattern in each string * * @param str character vector * @param pattern character vector * @return character vector * * @version 0.1-?? (Marek Gagolewski, 2013-06-24) * * @version 0.2-3 (Marek Gagolewski, 2014-05-08) * stri_extract_fixed now uses byte search only * * @version 0.4-1 (Marek Gagolewski, 2014-12-08) * new args: opts_fixed */ SEXP stri_extract_first_fixed(SEXP str, SEXP pattern, SEXP opts_fixed) { return stri__extract_firstlast_fixed(str, pattern, opts_fixed, true); } /** * Extract last occurrence of a fixed pattern in each string * * @param str character vector * @param pattern character vector * @return character vector * * @version 0.1-?? (Marek Gagolewski, 2013-06-24) * * @version 0.2-3 (Marek Gagolewski, 2014-05-08) * stri_extract_fixed now uses byte search only * * @version 0.4-1 (Marek Gagolewski, 2014-12-08) * new args: opts_fixed */ SEXP stri_extract_last_fixed(SEXP str, SEXP pattern, SEXP opts_fixed) { return stri__extract_firstlast_fixed(str, pattern, opts_fixed, false); } /** * Extract all occurrences of pattern in a string [exact byte search] * * @param str character vector * @param pattern character vector * @return character vector * * @version 0.1-?? 
(Marek Gagolewski, 2013-06-24) * * @version 0.2-3 (Marek Gagolewski, 2014-05-08) * stri_extract_fixed now uses byte search only * * @version 0.4-1 (Marek Gagolewski, 2014-12-08) * new args: opts_fixed, omit_no_match, simplify * * @version 0.5-1 (Marek Gagolewski, 2015-02-14) * use StriByteSearchMatcher */ SEXP stri_extract_all_fixed(SEXP str, SEXP pattern, SEXP simplify, SEXP omit_no_match, SEXP opts_fixed) { uint32_t pattern_flags = StriContainerByteSearch::getByteSearchFlags(opts_fixed, /*allow_overlap*/true); bool omit_no_match1 = stri__prepare_arg_logical_1_notNA(omit_no_match, "omit_no_match"); PROTECT(simplify = stri_prepare_arg_logical_1(simplify, "simplify")); PROTECT(str = stri_prepare_arg_string(str, "str")); // prepare string argument PROTECT(pattern = stri_prepare_arg_string(pattern, "pattern")); // prepare string argument STRI__ERROR_HANDLER_BEGIN(3) R_len_t vectorize_length = stri__recycling_rule(true, 2, LENGTH(str), LENGTH(pattern)); StriContainerUTF8 str_cont(str, vectorize_length); StriContainerByteSearch pattern_cont(pattern, vectorize_length, pattern_flags); SEXP ret; STRI__PROTECT(ret = Rf_allocVector(VECSXP, vectorize_length)); for (R_len_t i = pattern_cont.vectorize_init(); i != pattern_cont.vectorize_end(); i = pattern_cont.vectorize_next(i)) { STRI__CONTINUE_ON_EMPTY_OR_NA_STR_PATTERN(str_cont, pattern_cont, SET_VECTOR_ELT(ret, i, stri__vector_NA_strings(1));, SET_VECTOR_ELT(ret, i, stri__vector_NA_strings(omit_no_match1?0:1));) StriByteSearchMatcher* matcher = pattern_cont.getMatcher(i); matcher->reset(str_cont.get(i).c_str(), str_cont.get(i).length()); int start = matcher->findFirst(); deque< pair > occurrences; while (start != USEARCH_DONE) { occurrences.push_back(pair(start, start+matcher->getMatchedLength())); start = matcher->findNext(); } R_len_t noccurrences = (R_len_t)occurrences.size(); if (noccurrences <= 0) { SET_VECTOR_ELT(ret, i, stri__vector_NA_strings(omit_no_match1?0:1)); continue; } const char* str_cur_s = 
str_cont.get(i).c_str(); SEXP cur_res; STRI__PROTECT(cur_res = Rf_allocVector(STRSXP, noccurrences)); deque< pair >::iterator iter = occurrences.begin(); for (R_len_t j = 0; iter != occurrences.end(); ++iter, ++j) { pair curo = *iter; SET_STRING_ELT(cur_res, j, Rf_mkCharLenCE(str_cur_s+curo.first, curo.second-curo.first, CE_UTF8)); } SET_VECTOR_ELT(ret, i, cur_res); STRI__UNPROTECT(1); } if (LOGICAL(simplify)[0] == NA_LOGICAL) { STRI__PROTECT(ret = stri_list2matrix(ret, Rf_ScalarLogical(TRUE), stri__vector_NA_strings(1), Rf_ScalarInteger(0))) } else if (LOGICAL(simplify)[0]) { STRI__PROTECT(ret = stri_list2matrix(ret, Rf_ScalarLogical(TRUE), stri__vector_empty_strings(1), Rf_ScalarInteger(0))) } STRI__UNPROTECT_ALL return ret; STRI__ERROR_HANDLER_END({/* no-op */}) } stringi/src/stri_messages.h0000644000175100001440000002173112612166246015571 0ustar hornikusers/* This file is part of the 'stringi' package for R. * Copyright (C) 2013-2015, Marek Gagolewski and Bartek Tartanus * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT
 * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
 * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef __stri_messages_h
#define __stri_messages_h

/* Message texts shared across the package.
 * Entries that contain conversion specifications (%s, %d, %x) are format
 * strings and need matching arguments.
 * NOTE(review): the call sites are not part of this header -- presumably
 * these are passed to printf-style warning/error routines; confirm against
 * the callers before changing the number or type of any conversion.
 */

/// incorrect uchar class id, see stri_char_getpropertyid and stri_char_getcategoryid
#define MSG__INCORRECT_UCHAR_CLASS_ID \
   "incorrect class identifier"

#define MSG__INCORRECT_MATCH_OPTION \
   "incorrect option for `%s`"

#define MSG__INCORRECT_COLLATOR_OPTION \
   "incorrect opts_collator setting: `%s`. ignoring"

#define MSG__INCORRECT_COLLATOR_OPTION_SPEC \
   "incorrect collator option specifier. see ?stri_opts_collator"

#define MSG__INCORRECT_BRKITER_OPTION_SPEC \
   "incorrect break iterator option specifier. see ?stri_opts_brkiter"

#define MSG__INCORRECT_FIXED_OPTION \
   "incorrect opts_fixed setting: `%s`. ignoring"

#define MSG__INCORRECT_REGEX_OPTION \
   "incorrect opts_regex setting: `%s`. ignoring"

#define MSG__INVALID_CODE_POINT \
   "invalid Unicode codepoint \\U%08.8x"

#define MSG__INVALID_CODE_POINT_FIXING \
   "invalid UTF-8 codepoint definition. fixing"

#define MSG__INVALID_CODE_POINT_REPLNA \
   "invalid UTF-8 codepoint definition. setting string to NA. see also stri_enc_toutf8()"

#define MSG__INVALID_UTF8 \
   "invalid UTF-8 byte sequence detected. perhaps you should try calling stri_enc_toutf8()"

#define MSG__INVALID_ESCAPE \
   "invalid escape sequence detected. Setting NA"

#define MSG__UNCONVERTABLE_CODE_POINT \
   "the Unicode codepoint \\U%08.8x cannot be converted to destination encoding"

#define MSG__UNCONVERTABLE_BINARY_1 \
   "input data \\x%02.2x in current source encoding could not be converted to Unicode"

#define MSG__UNCONVERTABLE_BINARY_2 \
   "input data \\x%02.2x\\x%02.2x in current source encoding could not be converted to Unicode"

#define MSG__UNCONVERTABLE_BINARY_3 \
   "input data \\x%02.2x\\x%02.2x\\x%02.2x in current source encoding could not be converted to Unicode"

#define MSG__UNCONVERTABLE_BINARY_4 \
   "input data \\x%02.2x\\x%02.2x\\x%02.2x\\x%02.2x in current source encoding could not be converted to Unicode"

#define MSG__UNCONVERTABLE_BINARY_n \
   "some input data in current source encoding could not be converted to Unicode"

/// warning when applying recycling rule to not fully recycled args
#define MSG__WARN_RECYCLING_RULE \
   "longer object length is not a multiple of shorter object length"

#define MSG__WARN_RECYCLING_RULE2 \
   "vector length not consistent with other arguments"

#define MSG__INCORRECT_INTERNAL_ARG \
   "incorrect argument"

#define MSG__INTERNAL_ERROR \
   "internal error"

#define MSG__ICU_ERROR \
   "%s (%s)"

#define MSG__ICU_WARNING \
   "%s (%s)"

#define MSG__EXPECTED_NONNEGATIVE \
   "argument `%s`: expected a nonnegative numeric value"

#define MSG__EXPECTED_POSITIVE \
   "argument `%s`: expected a positive numeric value"

#define MSG__EXPECTED_SMALLER \
   "argument `%s`: value too large"

#define MSG__EXPECTED_ASCII \
   "incorrect argument: the string contains non-ASCII characters"

#define MSG__TIMEZONE_INCORRECT_ID \
   "incorrect time zone identifier"

#define MSG__LOCALE_ERROR_SET \
   "could not set or select given locale"

#define MSG__ENC_ERROR_GETNAME \
   "could not fetch name of the character encoding from the ICU converter"

#define MSG__ENC_ERROR_SET \
   "could not set, query or select given character encoding"

#define MSG__ENC_ERROR_CONVERT \
   "could not convert string encoding"

#define MSG__LOCALE_INCORRECT_ID \
   "incorrect locale identifier"

#define MSG__ENC_INCORRECT_ID \
   "incorrect character encoding identifier"

#define MSG__ENC_INCORRECT_ID_WHAT \
   "incorrect character encoding identifier: %s"

#define MSG__ENC_NOT8BIT \
   "encoding %s is not an 8-bit encoding"

#define MSG__BYTESENC \
   "bytes encoding is not supported by this function"

#define MSG__REGEXP_FAILED \
   "regexp search failed"

#define MSG__REGEXP_CONFIG_FAILED \
   "regexp engine config failed"

#define MSG__FIXED_CONFIG_FAILED \
   "fixed search engine config failed"

#define MSG__REGEXP_FAILED_DETAILS \
   "regexp search failed: %s"

#define MSG__STRSEARCH_FAILED \
   "string search failed"

#define MSG__RESOURCE_ERROR_GET \
   "required ICU resource unavailable"

#define MSG__RESOURCE_ERROR_APPLY \
   "error while applying operation"

#define MSG__LOCATE_DIM_START \
   "start"

#define MSG__LOCATE_DIM_END \
   "end"

#define MSG__NEWLINE_FOUND \
   "newline character found in a string"

#define MSG__NOT_EQ_N_CODEPOINTS \
   "each string in `%s` should consist of exactly %d code points"

#define MSG__NOT_EQ_N_WIDTH \
   "each string in `%s` should consist of code points of total width %d"

#define MSG__CHARCLASS_INCORRECT_WHICH \
   "unknown charclass `%s`. assuming NA"

#define MSG__CHARCLASS_INCORRECT \
   "unknown charclass"

#define MSG__ARG_EXPECTED_NOT_NA \
   "missing value in argument `%s` is not supported"

#define MSG__ARG_EXPECTED_NOT_EMPTY \
   "argument `%s` should be a non-empty vector"

#define MSG__ARG_EXPECTED_1_STRING \
   "argument `%s` should be one character string; taking the first one"

#define MSG__ARG_EXPECTED_1_LOGICAL \
   "argument `%s` should be one logical value; taking the first one"

#define MSG__ARG_EXPECTED_1_INTEGER \
   "argument `%s` should be one integer value; taking the first one"

#define MSG__ARG_EXPECTED_1_NUMERIC \
   "argument `%s` should be one numeric value; taking the first one"

#define MSG__ARG_EXPECTED_STRING \
   "argument `%s` should be a character vector (or an object coercible to)"

#define MSG__ARG_EXPECTED_LIST \
   "argument `%s` should be a list"

#define MSG__ARG_EXPECTED_LIST_STRING \
   "argument `%s` should be a list of character vectors (or an object coercible to)"

#define MSG__ARG_EXPECTED_LIST_INTEGER \
   "argument `%s` should be a list of integer vectors or an integer vector (or an object coercible to)"

#define MSG__ARG_EXPECTED_RAW \
   "argument `%s` should be a raw vector (or an object coercible to)"

#define MSG__ARG_EXPECTED_LOGICAL \
   "argument `%s` should be a logical vector (or an object coercible to)"

#define MSG__ARG_EXPECTED_INTEGER \
   "argument `%s` should be an integer vector (or an object coercible to)"

#define MSG__ARG_EXPECTED_NUMERIC \
   "argument `%s` should be a numeric vector (or an object coercible to)"

#define MSG__ARG_EXPECTED_POSIXct \
   "argument `%s` should be an object of class POSIXct (or an object coercible to)"

#define MSG__ARG_EXPECTED_STRING_NO_COERCION \
   "argument `%s` should be a character vector"

#define MSG__ARG_EXPECTED_RAW_IN_LIST_NO_COERCION \
   "all elements in `%s` should be a raw vectors"

#define MSG__ARG_EXPECTED_RAW_NO_COERCION \
   "argument `%s` should be a raw vector"

#define MSG__ARG_EXPECTED_LOGICAL_NO_COERCION \
   "argument `%s` should be a logical vector"

#define MSG__ARG_EXPECTED_INTEGER_NO_COERCION \
   "argument `%s` should be an integer vector"

#define MSG__ARG_EXPECTED_NUMERIC_NO_COERCION \
   "argument `%s` should be a numeric vector"

#define MSG__ARG_EXPECTED_MATRIX_WITH_GIVEN_COLUMNS \
   "argument `%s` should be a matrix with %d columns"

#define MSG__ARG_EXPECTED_NOT_MATRIX \
   "argument `%s` is a matrix, which is not supported in given context"

#define MSG__ARG_IGNORING \
   "ignoring argument `%s` in given context"

#define MSG__ARG_EXCLUSIVE \
   "arguments `%s` and `%s` are mutually exclusive in given context"

#define MSG__EMPTY_SEARCH_PATTERN_UNSUPPORTED \
   "empty search patterns are not supported"

#define MSG__OVERLAPPING_PATTERN_UNSUPPORTED \
   "overlapping pattern matches are not supported"

#define MSG__MEM_ALLOC_ERROR \
   "memory allocation error"

#endif
stringi/src/stri_search_class_count.cpp0000644000175100001440000000747412612166246020167 0ustar hornikusers/* This file is part of the 'stringi' package for R.
 * Copyright (C) 2013-2015, Marek Gagolewski and Bartek Tartanus
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 *    contributors may be used to endorse or promote products derived from
 *    this software without specific prior written permission.
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "stri_stringi.h" #include "stri_container_utf8.h" #include "stri_container_charclass.h" /** * Count the number of occurrences of a character class * * @param str character vector * @param pattern character vector * @return integer vector * * @version 0.1-?? (Marek Gagolewski, 2013-06-02) * * @version 0.1-?? (Marek Gagolewski, 2013-06-15) * Use StrContainerCharClass * * @version 0.1-?? 
(Marek Gagolewski, 2013-06-16) * make StriException-friendly * * @version 0.2-1 (Marek Gagolewski, 2014-04-03) * detects invalid UTF-8 byte stream * * @version 0.2-1 (Marek Gagolewski, 2014-04-05) * StriContainerCharClass now relies on UnicodeSet * * @version 0.3-1 (Marek Gagolewski, 2014-11-04) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc */ SEXP stri_count_charclass(SEXP str, SEXP pattern) { PROTECT(str = stri_prepare_arg_string(str, "str")); PROTECT(pattern = stri_prepare_arg_string(pattern, "pattern")); R_len_t vectorize_length = stri__recycling_rule(true, 2, LENGTH(str), LENGTH(pattern)); STRI__ERROR_HANDLER_BEGIN(2) StriContainerUTF8 str_cont(str, vectorize_length); StriContainerCharClass pattern_cont(pattern, vectorize_length); SEXP ret; STRI__PROTECT(ret = Rf_allocVector(INTSXP, vectorize_length)); int* ret_tab = INTEGER(ret); for (R_len_t i = pattern_cont.vectorize_init(); i != pattern_cont.vectorize_end(); i = pattern_cont.vectorize_next(i)) { if (str_cont.isNA(i) || pattern_cont.isNA(i)) { ret_tab[i] = NA_INTEGER; continue; } const UnicodeSet* pattern_cur = &pattern_cont.get(i); R_len_t str_cur_n = str_cont.get(i).length(); const char* str_cur_s = str_cont.get(i).c_str(); UChar32 chr = 0; R_len_t count = 0; for (R_len_t j=0; jcontains(chr)) ++count; } ret_tab[i] = count; } STRI__UNPROTECT_ALL return ret; STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */) } stringi/src/stri_stringi.cpp0000644000175100001440000005304012612166246015772 0ustar hornikusers/* This file is part of the 'stringi' package for R. * Copyright (C) 2013-2015, Marek Gagolewski and Bartek Tartanus * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. 
Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 *    contributors may be used to endorse or promote products derived from
 *    this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING,
 * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
 * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */


#include "stri_stringi.h"
// NOTE(review): the targets of the next three #include directives are missing
// in this copy of the file (apparently lost during text extraction) --
// restore them from the upstream source before building.
#include
#include
#include

#ifndef STRI_ICU_FOUND
#include "uconfig_local.h"
#endif

/** Expands to one brace-enclosed initializer of the form
 *  {symb, (DL_FUNC)&name, args}, used for the entries of cCallMethods
 */
#define STRI__MK_CALL(symb, name, args) \
   {symb, (DL_FUNC)&name, args}

/**
 * List of functions available via .Call() in R
 *
 * Form:
 * \code{\{"method_name", (DL_FUNC)pointer, number_of_args\}} -
 * this is generated by the STRI__MK_CALL macro.
*/ const R_CallMethodDef cCallMethods[] = { // STRI__MK_CALL("C_stri_charcategories", stri_charcategories, 0), // TO BE >= 0.6 // STRI__MK_CALL("C_stri_chartype", stri_chartype, 1), // TO BE >= 0.6 // STRI__MK_CALL("C_stri_c_posixst", stri_c_posixst, 1), // internal STRI__MK_CALL("C_stri_cmp_eq", stri_cmp_eq, 2), STRI__MK_CALL("C_stri_cmp_neq", stri_cmp_neq, 2), STRI__MK_CALL("C_stri_cmp", stri_cmp, 3), STRI__MK_CALL("C_stri_cmp_lt", stri_cmp_lt, 3), STRI__MK_CALL("C_stri_cmp_le", stri_cmp_le, 3), STRI__MK_CALL("C_stri_cmp_gt", stri_cmp_gt, 3), STRI__MK_CALL("C_stri_cmp_ge", stri_cmp_ge, 3), STRI__MK_CALL("C_stri_cmp_equiv", stri_cmp_equiv, 3), STRI__MK_CALL("C_stri_cmp_nequiv", stri_cmp_nequiv, 3), STRI__MK_CALL("C_stri_count_boundaries", stri_count_boundaries, 2), STRI__MK_CALL("C_stri_count_charclass", stri_count_charclass, 2), STRI__MK_CALL("C_stri_count_fixed", stri_count_fixed, 3), STRI__MK_CALL("C_stri_count_coll", stri_count_coll, 3), STRI__MK_CALL("C_stri_count_regex", stri_count_regex, 3), STRI__MK_CALL("C_stri_datetime_symbols", stri_datetime_symbols, 3), STRI__MK_CALL("C_stri_datetime_fields", stri_datetime_fields, 3), STRI__MK_CALL("C_stri_datetime_now", stri_datetime_now, 0), STRI__MK_CALL("C_stri_datetime_create", stri_datetime_create, 9), STRI__MK_CALL("C_stri_datetime_format", stri_datetime_format, 4), STRI__MK_CALL("C_stri_datetime_parse", stri_datetime_parse, 5), STRI__MK_CALL("C_stri_datetime_add", stri_datetime_add, 5), STRI__MK_CALL("C_stri_detect_charclass", stri_detect_charclass, 2), STRI__MK_CALL("C_stri_detect_coll", stri_detect_coll, 3), STRI__MK_CALL("C_stri_detect_fixed", stri_detect_fixed, 3), STRI__MK_CALL("C_stri_detect_regex", stri_detect_regex, 3), STRI__MK_CALL("C_stri_dup", stri_dup, 2), STRI__MK_CALL("C_stri_duplicated", stri_duplicated, 3), STRI__MK_CALL("C_stri_duplicated_any", stri_duplicated_any, 3), STRI__MK_CALL("C_stri_enc_detect", stri_enc_detect, 2), STRI__MK_CALL("C_stri_enc_detect2", stri_enc_detect2, 2), 
STRI__MK_CALL("C_stri_enc_isutf8", stri_enc_isutf8, 1), STRI__MK_CALL("C_stri_enc_isutf16le", stri_enc_isutf16le, 1), STRI__MK_CALL("C_stri_enc_isutf16be", stri_enc_isutf16be, 1), STRI__MK_CALL("C_stri_enc_isutf32le", stri_enc_isutf32le, 1), STRI__MK_CALL("C_stri_enc_isutf32be", stri_enc_isutf32be, 1), STRI__MK_CALL("C_stri_enc_isascii", stri_enc_isascii, 1), STRI__MK_CALL("C_stri_enc_info", stri_enc_info, 1), STRI__MK_CALL("C_stri_enc_list", stri_enc_list, 0), STRI__MK_CALL("C_stri_enc_mark", stri_enc_mark, 1), STRI__MK_CALL("C_stri_enc_set", stri_enc_set, 1), STRI__MK_CALL("C_stri_enc_fromutf32", stri_enc_fromutf32, 1), STRI__MK_CALL("C_stri_enc_toascii", stri_enc_toascii, 1), STRI__MK_CALL("C_stri_enc_toutf8", stri_enc_toutf8, 3), STRI__MK_CALL("C_stri_enc_toutf32", stri_enc_toutf32, 1), STRI__MK_CALL("C_stri_encode", stri_encode, 4), // STRI__MK_CALL("C_stri_encode_from_marked", stri_encode_from_marked, 3), // internal STRI__MK_CALL("C_stri_endswith_charclass", stri_endswith_charclass, 3), STRI__MK_CALL("C_stri_endswith_coll", stri_endswith_coll, 4), STRI__MK_CALL("C_stri_endswith_fixed", stri_endswith_fixed, 4), STRI__MK_CALL("C_stri_escape_unicode", stri_escape_unicode, 1), STRI__MK_CALL("C_stri_extract_first_boundaries", stri_extract_first_boundaries, 2), STRI__MK_CALL("C_stri_extract_last_boundaries", stri_extract_last_boundaries, 2), STRI__MK_CALL("C_stri_extract_all_boundaries", stri_extract_all_boundaries, 4), STRI__MK_CALL("C_stri_extract_first_charclass", stri_extract_first_charclass, 2), STRI__MK_CALL("C_stri_extract_last_charclass", stri_extract_last_charclass, 2), STRI__MK_CALL("C_stri_extract_all_charclass", stri_extract_all_charclass, 5), STRI__MK_CALL("C_stri_extract_first_coll", stri_extract_first_coll, 3), STRI__MK_CALL("C_stri_extract_last_coll", stri_extract_last_coll, 3), STRI__MK_CALL("C_stri_extract_all_coll", stri_extract_all_coll, 5), STRI__MK_CALL("C_stri_extract_first_fixed", stri_extract_first_fixed, 3), 
STRI__MK_CALL("C_stri_extract_last_fixed", stri_extract_last_fixed, 3), STRI__MK_CALL("C_stri_extract_all_fixed", stri_extract_all_fixed, 5), STRI__MK_CALL("C_stri_extract_first_regex", stri_extract_first_regex, 3), STRI__MK_CALL("C_stri_extract_last_regex", stri_extract_last_regex, 3), STRI__MK_CALL("C_stri_extract_all_regex", stri_extract_all_regex, 5), STRI__MK_CALL("C_stri_flatten", stri_flatten, 2), // STRI__MK_CALL("C_stri_in_fixed", stri_in_fixed, 3), // TODO: version >= 0.6 STRI__MK_CALL("C_stri_info", stri_info, 0), STRI__MK_CALL("C_stri_isempty", stri_isempty, 1), STRI__MK_CALL("C_stri_join", stri_join, 4), STRI__MK_CALL("C_stri_join2", stri_join2, 2), // STRI__MK_CALL("C_stri_justify", stri_justify, 2), // TODO: version >= 0.6 STRI__MK_CALL("C_stri_length", stri_length, 1), STRI__MK_CALL("C_stri_list2matrix", stri_list2matrix, 4), STRI__MK_CALL("C_stri_locale_info", stri_locale_info, 1), STRI__MK_CALL("C_stri_locale_list", stri_locale_list, 0), STRI__MK_CALL("C_stri_locale_set", stri_locale_set, 1), STRI__MK_CALL("C_stri_locate_all_boundaries", stri_locate_all_boundaries, 3), STRI__MK_CALL("C_stri_locate_first_boundaries", stri_locate_first_boundaries, 2), STRI__MK_CALL("C_stri_locate_last_boundaries", stri_locate_last_boundaries, 2), STRI__MK_CALL("C_stri_locate_first_charclass", stri_locate_first_charclass, 2), STRI__MK_CALL("C_stri_locate_last_charclass", stri_locate_last_charclass, 2), STRI__MK_CALL("C_stri_locate_all_charclass", stri_locate_all_charclass, 4), STRI__MK_CALL("C_stri_locate_last_fixed", stri_locate_last_fixed, 3), STRI__MK_CALL("C_stri_locate_first_fixed", stri_locate_first_fixed, 3), STRI__MK_CALL("C_stri_locate_all_fixed", stri_locate_all_fixed, 4), STRI__MK_CALL("C_stri_locate_last_coll", stri_locate_last_coll, 3), STRI__MK_CALL("C_stri_locate_first_coll", stri_locate_first_coll, 3), STRI__MK_CALL("C_stri_locate_all_coll", stri_locate_all_coll, 4), STRI__MK_CALL("C_stri_locate_all_regex", stri_locate_all_regex, 4), 
STRI__MK_CALL("C_stri_locate_first_regex", stri_locate_first_regex, 3), STRI__MK_CALL("C_stri_locate_last_regex", stri_locate_last_regex, 3), STRI__MK_CALL("C_stri_match_first_regex", stri_match_first_regex, 4), STRI__MK_CALL("C_stri_match_last_regex", stri_match_last_regex, 4), STRI__MK_CALL("C_stri_match_all_regex", stri_match_all_regex, 5), STRI__MK_CALL("C_stri_numbytes", stri_numbytes, 1), STRI__MK_CALL("C_stri_order", stri_order, 4), STRI__MK_CALL("C_stri_sort", stri_sort, 4), STRI__MK_CALL("C_stri_pad", stri_pad, 5), STRI__MK_CALL("C_stri_prepare_arg_string", stri_prepare_arg_string, 2), STRI__MK_CALL("C_stri_prepare_arg_POSIXct", stri_prepare_arg_POSIXct, 2), STRI__MK_CALL("C_stri_prepare_arg_double", stri_prepare_arg_double, 2), STRI__MK_CALL("C_stri_prepare_arg_integer", stri_prepare_arg_integer, 2), STRI__MK_CALL("C_stri_prepare_arg_logical", stri_prepare_arg_logical, 2), STRI__MK_CALL("C_stri_prepare_arg_raw", stri_prepare_arg_raw, 2), STRI__MK_CALL("C_stri_prepare_arg_string_1", stri_prepare_arg_string_1, 2), STRI__MK_CALL("C_stri_prepare_arg_double_1", stri_prepare_arg_double_1, 2), STRI__MK_CALL("C_stri_prepare_arg_integer_1", stri_prepare_arg_integer_1, 2), STRI__MK_CALL("C_stri_prepare_arg_logical_1", stri_prepare_arg_logical_1, 2), STRI__MK_CALL("C_stri_rand_shuffle", stri_rand_shuffle, 1), STRI__MK_CALL("C_stri_rand_strings", stri_rand_strings, 3), STRI__MK_CALL("C_stri_replace_na", stri_replace_na, 2), STRI__MK_CALL("C_stri_replace_all_fixed", stri_replace_all_fixed, 5), STRI__MK_CALL("C_stri_replace_first_fixed", stri_replace_first_fixed, 4), STRI__MK_CALL("C_stri_replace_last_fixed", stri_replace_last_fixed, 4), STRI__MK_CALL("C_stri_replace_all_coll", stri_replace_all_coll, 5), STRI__MK_CALL("C_stri_replace_first_coll", stri_replace_first_coll, 4), STRI__MK_CALL("C_stri_replace_last_coll", stri_replace_last_coll, 4), STRI__MK_CALL("C_stri_replace_all_regex", stri_replace_all_regex, 5), STRI__MK_CALL("C_stri_replace_first_regex", 
stri_replace_first_regex, 4), STRI__MK_CALL("C_stri_replace_last_regex", stri_replace_last_regex, 4), STRI__MK_CALL("C_stri_replace_all_charclass", stri_replace_all_charclass, 5), STRI__MK_CALL("C_stri_replace_first_charclass", stri_replace_first_charclass, 3), STRI__MK_CALL("C_stri_replace_last_charclass", stri_replace_last_charclass, 3), STRI__MK_CALL("C_stri_reverse", stri_reverse, 1), STRI__MK_CALL("C_stri_split_boundaries", stri_split_boundaries, 5), STRI__MK_CALL("C_stri_split_charclass", stri_split_charclass, 6), STRI__MK_CALL("C_stri_split_coll", stri_split_coll, 7), STRI__MK_CALL("C_stri_split_fixed", stri_split_fixed, 7), STRI__MK_CALL("C_stri_split_lines", stri_split_lines, 2), STRI__MK_CALL("C_stri_split_lines1", stri_split_lines1, 1), STRI__MK_CALL("C_stri_split_regex", stri_split_regex, 7), STRI__MK_CALL("C_stri_startswith_charclass", stri_startswith_charclass, 3), STRI__MK_CALL("C_stri_startswith_coll", stri_startswith_coll, 4), STRI__MK_CALL("C_stri_startswith_fixed", stri_startswith_fixed, 4), STRI__MK_CALL("C_stri_stats_general", stri_stats_general, 1), STRI__MK_CALL("C_stri_stats_latex", stri_stats_latex, 1), STRI__MK_CALL("C_stri_sub", stri_sub, 4), STRI__MK_CALL("C_stri_sub_replacement", stri_sub_replacement, 5), STRI__MK_CALL("C_stri_subset_charclass", stri_subset_charclass, 3), STRI__MK_CALL("C_stri_subset_coll", stri_subset_coll, 4), STRI__MK_CALL("C_stri_subset_fixed", stri_subset_fixed, 4), STRI__MK_CALL("C_stri_subset_regex", stri_subset_regex, 4), STRI__MK_CALL("C_stri_test_Rmark", stri_test_Rmark, 1), STRI__MK_CALL("C_stri_test_returnasis", stri_test_returnasis, 1), STRI__MK_CALL("C_stri_test_UnicodeContainer16", stri_test_UnicodeContainer16, 1), STRI__MK_CALL("C_stri_test_UnicodeContainer16b", stri_test_UnicodeContainer16b, 1), STRI__MK_CALL("C_stri_test_UnicodeContainer8", stri_test_UnicodeContainer8, 1), STRI__MK_CALL("C_stri_timezone_list", stri_timezone_list, 2), STRI__MK_CALL("C_stri_timezone_set", stri_timezone_set, 1), 
STRI__MK_CALL("C_stri_timezone_info", stri_timezone_info, 3), STRI__MK_CALL("C_stri_trans_char", stri_trans_char, 3), STRI__MK_CALL("C_stri_trans_isnfc", stri_trans_isnfc, 1), STRI__MK_CALL("C_stri_trans_isnfd", stri_trans_isnfd, 1), STRI__MK_CALL("C_stri_trans_isnfkc", stri_trans_isnfkc, 1), STRI__MK_CALL("C_stri_trans_isnfkd", stri_trans_isnfkd, 1), STRI__MK_CALL("C_stri_trans_isnfkc_casefold", stri_trans_isnfkc_casefold, 1), STRI__MK_CALL("C_stri_trans_general", stri_trans_general, 2), STRI__MK_CALL("C_stri_trans_list", stri_trans_list, 0), STRI__MK_CALL("C_stri_trans_nfc", stri_trans_nfc, 1), STRI__MK_CALL("C_stri_trans_nfd", stri_trans_nfd, 1), STRI__MK_CALL("C_stri_trans_nfkc", stri_trans_nfkc, 1), STRI__MK_CALL("C_stri_trans_nfkd", stri_trans_nfkd, 1), STRI__MK_CALL("C_stri_trans_nfkc_casefold", stri_trans_nfkc_casefold, 1), STRI__MK_CALL("C_stri_trans_totitle", stri_trans_totitle, 2), STRI__MK_CALL("C_stri_trans_tolower", stri_trans_tolower, 2), STRI__MK_CALL("C_stri_trans_toupper", stri_trans_toupper, 2), STRI__MK_CALL("C_stri_trim_both", stri_trim_both, 2), STRI__MK_CALL("C_stri_trim_left", stri_trim_left, 2), STRI__MK_CALL("C_stri_trim_right", stri_trim_right, 2), STRI__MK_CALL("C_stri_unescape_unicode", stri_unescape_unicode, 1), STRI__MK_CALL("C_stri_unique", stri_unique, 2), STRI__MK_CALL("C_stri_width", stri_width, 1), STRI__MK_CALL("C_stri_wrap", stri_wrap, 10), // STRI__MK_CALL("C_stri_trim_double", stri_trim_double, 3), // TODO: version >= 0.6 // the list must be NULL-terminated: {NULL, NULL, 0} }; /** Sets ICU data dir * * @param libpath */ void stri_set_icu_data_directory(const char* libpath) { // libpath == "...../libs" -> "...../libs" // libpath == "...../libs/i386" -> "...../libs" // libpath == "...../libs/x64" -> "...../libs" string dir(libpath); size_t idx = dir.rfind("libs"); if (idx == string::npos) { // this shouldn't happen u_setDataDirectory(libpath); // just use the libpath return; } // idx+5 -> if the string is shorter, as many 
characters as possible are used dir = dir.substr(0, idx+4); // 4 == strlen("libs") u_setDataDirectory(dir.c_str()); // #ifndef NDEBUG // fprintf(stderr, "ICU data directory=%s\n", dir.c_str()); // #endif // anyway, if .dat file will not be found, // ICU will use system data (may be stub) // 1. Examine the contents of the default ICU data shared library. // If it contains data, use that data. // If the data library is empty, a stub library, proceed to the next step. // 2. Dynamically load (memory map, typically) a common format (.dat) file // containing the default ICU data. } /** * Library initialization. * * R calls this automatically on lib load/attach. */ extern "C" void R_init_stringi(DllInfo* dll) { #if STRI_ICU_FOUND == 0 stri_set_icu_data_directory((char*)*(char**)(dll) /* dll->path */); #endif /* BTW: u_init: It is OK to simply use ICU services and functions without first having initialized ICU by calling u_init(). u_init() will attempt to load some part of ICU's data, and is useful as a test for configuration or installation problems that leave the ICU data inaccessible. A successful invocation of u_init() does not, however, guarantee that all ICU data is accessible. 
*/ UErrorCode status = U_ZERO_ERROR; u_init(&status); if (U_FAILURE(status)) Rf_error("ICU init failed: %s", u_errorName(status)); R_registerRoutines(dll, NULL, cCallMethods, NULL, NULL); // R_useDynamicSymbols(dll, Rboolean(FALSE)); // slower const R_CallMethodDef* methods = cCallMethods; while (methods->name) { R_RegisterCCallable("stringi", methods->name, methods->fun); methods++; } if (!SUPPORT_UTF8) { /* Rconfig.h states that all R platforms supports that */ Rf_error("R does not support UTF-8 encoding."); } #ifndef NDEBUG // fprintf(stdout, "!NDEBUG: ************************************************\n"); // fprintf(stdout, "!NDEBUG: Dynamic library `stringi` loaded\n"); // fprintf(stdout, "!NDEBUG: Check out http://stringi.rexamine.com\n"); // fprintf(stdout, "!NDEBUG: \n"); // fprintf(stdout, "!NDEBUG: Please send bug reports to stringi@rexamine.com \n"); // fprintf(stdout, "!NDEBUG: or at https://github.com/Rexamine/stringi/issues\n"); // fprintf(stdout, "!NDEBUG: \n"); // fprintf(stdout, "!NDEBUG: Have fun testing! :-)\n"); // fprintf(stdout, "!NDEBUG: ************************************************\n"); #endif } #ifndef NDEBUG #include /** * Library cleanup */ extern "C" void R_unload_stringi(DllInfo*) { // see http://bugs.icu-project.org/trac/ticket/10897 // and https://github.com/Rexamine/stringi/issues/78 // fprintf(stdout, "!NDEBUG: ************************************************\n"); // fprintf(stdout, "!NDEBUG: Dynamic library 'stringi' unloaded.\n"); // fprintf(stdout, "!NDEBUG: ************************************************\n"); u_cleanup(); } #endif stringi/src/stri_container_double.h0000644000175100001440000000651212612166246017276 0ustar hornikusers/* This file is part of the 'stringi' package for R. * Copyright (C) 2013-2015, Marek Gagolewski and Bartek Tartanus * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. 
Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #ifndef __stri_container_double_h #define __stri_container_double_h #include "stri_container_base.h" /** * A wrapper-class for R double vectors * * @version 0.5-1 (Marek Gagolewski, 2014-12-30) */ class StriContainerDouble : public StriContainerBase { private: double* data; public: StriContainerDouble() : StriContainerBase() { data = NULL; } StriContainerDouble(SEXP rvec, R_len_t _nrecycle) { this->data = NULL; #ifndef NDEBUG if (!isReal(rvec)) throw StriException("DEBUG: !isReal in StriContainerDouble"); #endif R_len_t ndata = LENGTH(rvec); this->init_Base(ndata, _nrecycle, true); this->data = REAL(rvec); } // StriContainerDouble(StriContainerDouble& container); // default-shallow // ~StriContainerDouble(); // default-shallow // StriContainerDouble& operator=(StriContainerDouble& container); // default-shallow /** check if the vectorized ith element is NA * @param i index * @return true if is NA */ inline bool isNA(R_len_t i) const { #ifndef NDEBUG if (i < 0 || i >= nrecycle) throw StriException("StriContainerDouble::isNA(): INDEX OUT OF BOUNDS"); #endif return (ISNA(data[i%n])); } /** get the vectorized ith element * @param i index * @return double */ inline double get(R_len_t i) const { #ifndef NDEBUG if (i < 0 || i >= nrecycle) throw StriException("StriContainerDouble::get(): INDEX OUT OF BOUNDS"); if (ISNA(data[i%n])) throw StriException("StriContainerDouble::get(): isNA"); #endif return (data[i%n]); } }; #endif stringi/src/stri_sub.cpp0000644000175100001440000003122112612166246015101 0ustar hornikusers/* This file is part of the 'stringi' package for R. * Copyright (C) 2013-2015, Marek Gagolewski and Bartek Tartanus * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. 
Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "stri_stringi.h" #include "stri_container_utf8_indexable.h" #include "stri_string8buf.h" #define STRI__SUB_PREPARE_FROM_TO_LENGTH \ bool from_ismatrix = Rf_isMatrix(from); \ if (from_ismatrix) { \ SEXP t = Rf_getAttrib(from, R_DimSymbol); \ if (INTEGER(t)[1] == 1) \ from_ismatrix = false; /* it's a column vector */ \ else if (INTEGER(t)[1] > 2) { \ /* error() is allowed here */ \ Rf_error(MSG__ARG_EXPECTED_MATRIX_WITH_GIVEN_COLUMNS, "from", 2); \ } \ } \ PROTECT(from = stri_prepare_arg_integer(from, "from")); \ /* may remove R_DimSymbol */ \ \ if (from_ismatrix) { \ from_len = LENGTH(from)/2; \ to_len = from_len; \ from_tab = INTEGER(from); \ to_tab = from_tab+from_len; \ PROTECT(to); /* fake - not to provoke stack imbalance */ \ PROTECT(length); /* fake - not to provoke stack imbalance */ \ } \ else if (isNull(length)) { \ PROTECT(to = stri_prepare_arg_integer(to, "to")); \ from_len = LENGTH(from); \ from_tab = INTEGER(from); \ to_len = LENGTH(to); \ to_tab = INTEGER(to); \ PROTECT(length); /* fake - not to provoke stack imbalance */ \ } \ else { \ PROTECT(length= stri_prepare_arg_integer(length, "length")); \ from_len = LENGTH(from); \ from_tab = INTEGER(from); \ length_len = LENGTH(length); \ length_tab = INTEGER(length); \ PROTECT(to); /* fake - not to provoke stack imbalance */ \ } #define STRI__SUB_GET_INDICES(cur_from, cur_to, cur_from2, cur_to2) \ \ if (cur_from >= 0) { \ cur_from--; /* 1-based -> 0-based index */ \ cur_from2 = str_cont.UChar32_to_UTF8_index_fwd(i, cur_from); \ } \ else { \ cur_from = -cur_from; \ cur_from2 = str_cont.UChar32_to_UTF8_index_back(i, cur_from); \ } \ if (cur_to >= 0) { \ ; /* do nothing with cur_to ; 1-based -> 0-based index */ \ /* but +1 as we need the next one (bound) */ \ cur_to2 = str_cont.UChar32_to_UTF8_index_fwd(i, cur_to); \ } \ else { \ cur_to = -cur_to - 1; \ cur_to2 = str_cont.UChar32_to_UTF8_index_back(i, cur_to); \ } /** * Get substring * * * @param str character vector * @param from integer vector 
(possibly with negative indices) * @param to integer vector (possibly with negative indices) or NULL * @param length integer vector or NULL * @return character vector * * @version 0.1-?? (Bartek Tartanus) * stri_sub * * @version 0.1-?? (Marek Gagolewski) * use StriContainerUTF8 and stri__UChar32_to_UTF8_index * * @version 0.1-?? (Marek Gagolewski, 2013-06-01) * use StriContainerUTF8's UChar32-to-UTF8 index * * @version 0.1-?? (Marek Gagolewski, 2013-06-16) * make StriException-friendly * * @version 0.2-1 (Marek Gagolewski, 2014-03-20) * Use StriContainerUTF8_indexable * * @version 0.2-1 (Marek Gagolewski, 2014-04-03) * Use stri__sub_prepare_from_to_length() * * @version 0.3-1 (Marek Gagolewski, 2014-11-04) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc * * @version 0.5-9003 (Marek Gagolewski, 2015-08-05) * Bugfix #183: floating point exception when to or length is an empty vector */ SEXP stri_sub(SEXP str, SEXP from, SEXP to, SEXP length) { PROTECT(str = stri_prepare_arg_string(str, "str")); R_len_t str_len = LENGTH(str); R_len_t from_len = 0; R_len_t to_len = 0; R_len_t length_len = 0; int* from_tab = 0; int* to_tab = 0; int* length_tab = 0; STRI__SUB_PREPARE_FROM_TO_LENGTH /* does 3 PROTECTs */ R_len_t vectorize_len = stri__recycling_rule(true, 3, str_len, from_len, (to_len>length_len)?to_len:length_len); if (vectorize_len <= 0) { UNPROTECT(4); return Rf_allocVector(STRSXP, 0); } STRI__ERROR_HANDLER_BEGIN(4) StriContainerUTF8_indexable str_cont(str, vectorize_len); SEXP ret; STRI__PROTECT(ret = Rf_allocVector(STRSXP, vectorize_len)); for (R_len_t i = str_cont.vectorize_init(); i != str_cont.vectorize_end(); i = str_cont.vectorize_next(i)) { R_len_t cur_from = from_tab[i % from_len]; R_len_t cur_to = (to_tab)?to_tab[i % to_len]:length_tab[i % length_len]; if (str_cont.isNA(i) || cur_from == NA_INTEGER || cur_to == NA_INTEGER) { SET_STRING_ELT(ret, i, NA_STRING); continue; } if (length_tab) { if (cur_to <= 0) { SET_STRING_ELT(ret, i, 
R_BlankString); continue; } cur_to = cur_from + cur_to - 1; if (cur_from < 0 && cur_to >= 0) cur_to = -1; } const char* str_cur_s = str_cont.get(i).c_str(); R_len_t cur_from2; // UTF-8 byte incices R_len_t cur_to2; // UTF-8 byte incices STRI__SUB_GET_INDICES(cur_from, cur_to, cur_from2, cur_to2) if (cur_to2 > cur_from2) { // just copy SET_STRING_ELT(ret, i, Rf_mkCharLenCE(str_cur_s+cur_from2, cur_to2-cur_from2, CE_UTF8)); } else { // maybe a warning here? SET_STRING_ELT(ret, i, Rf_mkCharLen(NULL, 0)); } } STRI__UNPROTECT_ALL return ret; STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */) } /** * Substring replacement function * * * @param str character vector * @param from integer vector (possibly with negative indices) * @param to integer vector (possibly with negative indices) or NULL * @param length integer vector or NULL * @param value character vector replacement * @return character vector * * @version 0.1-?? (Bartek Tartanus) * * @version 0.1-?? (Marek Gagolewski) * use StriContainerUTF8 and stri__UChar32_to_UTF8_index * * @version 0.1-?? (Marek Gagolewski, 2013-06-01) * use StriContainerUTF8's UChar32-to-UTF8 index * * @version 0.1-?? 
(Marek Gagolewski, 2013-06-16) * make StriException-friendly * * @version 0.2-1 (Marek Gagolewski, 2014-03-20) * Use StriContainerUTF8_indexable * * @version 0.2-1 (Marek Gagolewski, 2014-04-03) * Use stri__sub_prepare_from_to_length() * * @version 0.3-1 (Marek Gagolewski, 2014-11-04) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc * * @version 0.5-9003 (Marek Gagolewski, 2015-08-05) * Bugfix #183: floating point exception when to or length is an empty vector */ SEXP stri_sub_replacement(SEXP str, SEXP from, SEXP to, SEXP length, SEXP value) { PROTECT(str = stri_prepare_arg_string(str, "str")); PROTECT(value = stri_prepare_arg_string(value, "value")); R_len_t value_len = LENGTH(value); R_len_t str_len = LENGTH(str); R_len_t from_len = 0; // see below R_len_t to_len = 0; // see below R_len_t length_len = 0; // see below int* from_tab = 0; // see below int* to_tab = 0; // see below int* length_tab = 0; // see below STRI__SUB_PREPARE_FROM_TO_LENGTH /* does 3 PROTECTs */ R_len_t vectorize_len = stri__recycling_rule(true, 4, str_len, value_len, from_len, (to_len>length_len)?to_len:length_len); if (vectorize_len <= 0) { UNPROTECT(5); return Rf_allocVector(STRSXP, 0); } STRI__ERROR_HANDLER_BEGIN(5) StriContainerUTF8_indexable str_cont(str, vectorize_len); StriContainerUTF8 value_cont(value, vectorize_len); SEXP ret; STRI__PROTECT(ret = Rf_allocVector(STRSXP, vectorize_len)); String8buf buf(0); // @TODO: estimate bufsize a priori for (R_len_t i = str_cont.vectorize_init(); i != str_cont.vectorize_end(); i = str_cont.vectorize_next(i)) { R_len_t cur_from = from_tab[i % from_len]; R_len_t cur_to = (to_tab)?to_tab[i % to_len]:length_tab[i % length_len]; if (str_cont.isNA(i) || cur_from == NA_INTEGER || cur_to == NA_INTEGER || value_cont.isNA(i)) { SET_STRING_ELT(ret, i, NA_STRING); continue; } if (length_tab) { if (cur_to <= 0) { SET_STRING_ELT(ret, i, R_BlankString); continue; } cur_to = cur_from + cur_to - 1; if (cur_from < 0 && cur_to >= 0) cur_to = -1; } 
const char* str_cur_s = str_cont.get(i).c_str(); R_len_t str_cur_n = str_cont.get(i).length(); const char* value_cur_s = value_cont.get(i).c_str(); R_len_t value_cur_n = value_cont.get(i).length(); R_len_t cur_from2; // UTF-8 byte incices R_len_t cur_to2; // UTF-8 byte incices STRI__SUB_GET_INDICES(cur_from, cur_to, cur_from2, cur_to2) R_len_t buflen = str_cur_n-(cur_to2-cur_from2)+value_cur_n; buf.resize(buflen, false/*destroy contents*/); memcpy(buf.data(), str_cur_s, (size_t)cur_from2); memcpy(buf.data()+cur_from2, value_cur_s, (size_t)value_cur_n); memcpy(buf.data()+cur_from2+value_cur_n, str_cur_s+cur_to2, (size_t)str_cur_n-cur_to2); SET_STRING_ELT(ret, i, Rf_mkCharLenCE(buf.data(), buflen, CE_UTF8)); } STRI__UNPROTECT_ALL return ret; STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */) } stringi/src/stri_container_listint.cpp0000644000175100001440000000763412612166246020053 0ustar hornikusers/* This file is part of the 'stringi' package for R. * Copyright (C) 2013-2015, Marek Gagolewski and Bartek Tartanus * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "stri_stringi.h" #include "stri_container_listint.h" /** * Default constructor * */ StriContainerListInt::StriContainerListInt() : StriContainerBase() { data = NULL; } /** * Construct Container from R cobject * @param rstr R object * * if you want nrecycle > n, call set_nrecycle */ StriContainerListInt::StriContainerListInt(SEXP rstr) { this->data = NULL; if (isNull(rstr)) { this->init_Base(1, 1, true); this->data = new IntVec[this->n]; // 1 vector, NA/NULL if (!this->data) throw StriException(MSG__MEM_ALLOC_ERROR); } else if (Rf_isInteger(rstr)) { this->init_Base(1, 1, true); this->data = new IntVec[this->n]; if (!this->data) throw StriException(MSG__MEM_ALLOC_ERROR); this->data[0].initialize((const int*)INTEGER(rstr), LENGTH(rstr)); // shallow copy } else // if (Rf_isVectorList(rstr)) -- args already checked { R_len_t nv = LENGTH(rstr); this->init_Base(nv, nv, true); this->data = new IntVec[this->n]; if (!this->data) throw StriException(MSG__MEM_ALLOC_ERROR); for (R_len_t i=0; in; ++i) { SEXP cur = VECTOR_ELT(rstr, i); if (!isNull(cur)) this->data[i].initialize((const int*)INTEGER(cur), LENGTH(cur)); // shallow copy // else leave as-is, i.e. 
NULL/NA } } } StriContainerListInt::StriContainerListInt(StriContainerListInt& container) : StriContainerBase((StriContainerBase&)container) { if (container.data) { this->data = new IntVec[this->n]; if (!this->data) throw StriException(MSG__MEM_ALLOC_ERROR); for (int i=0; in; ++i) { this->data[i] = container.data[i]; } } else { this->data = NULL; } } StriContainerListInt& StriContainerListInt::operator=(StriContainerListInt& container) { this->~StriContainerListInt(); (StriContainerBase&) (*this) = (StriContainerBase&)container; if (container.data) { this->data = new IntVec[this->n]; if (!this->data) throw StriException(MSG__MEM_ALLOC_ERROR); for (int i=0; in; ++i) { this->data[i] = container.data[i]; } } else { this->data = NULL; } return *this; } StriContainerListInt::~StriContainerListInt() { if (data) { delete [] data; data = NULL; } } stringi/src/stri_macros.h0000644000175100001440000001275412612166246015253 0ustar hornikusers/* This file is part of the 'stringi' package for R. * Copyright (C) 2013-2015, Marek Gagolewski and Bartek Tartanus * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef __stri_macros_h #define __stri_macros_h // undef R's length macro (conflicts with std::string.length()) // use LENGTH instead #undef length #define STRI__CONTINUE_ON_EMPTY_OR_NA_STR_PATTERN(str_cont, pattern_cont, naset, zeroset) \ if ((str_cont).isNA(i) || (pattern_cont).isNA(i) || (pattern_cont).get(i).length() <= 0) { \ if ((!(pattern_cont).isNA(i)) && (pattern_cont).get(i).length() <= 0) { \ Rf_warning(MSG__EMPTY_SEARCH_PATTERN_UNSUPPORTED); \ } \ naset; \ continue; \ } \ else if ((str_cont).get(i).length() <= 0) { \ zeroset; \ continue; \ } \ #define STRI__GET_INT32_BE(input, index) \ uint32_t(((uint8_t*)input)[index+0] << 24 | ((uint8_t*)input)[index+1] << 16 | ((uint8_t*)input)[index+2] << 8 | ((uint8_t*)input)[index+3]) #define STRI__GET_INT32_LE(input, index) \ uint32_t(((uint8_t*)input)[index+3] << 24 | ((uint8_t*)input)[index+2] << 16 | ((uint8_t*)input)[index+1] << 8 | ((uint8_t*)input)[index+0]) #define STRI__GET_INT16_BE(input, index) \ uint16_t(((uint8_t*)input)[index+0] << 8 | ((uint8_t*)input)[index+1]) #define STRI__GET_INT16_LE(input, index) \ uint16_t(((uint8_t*)input)[index+1] << 8 | ((uint8_t*)input)[index+0]) #define STRI__ENC_HAS_BOM_UTF8(s, n) \ bool(n >= 3 && \ 
(uint8_t)(s[0]) == (uint8_t)0xEF && \ (uint8_t)(s[1]) == (uint8_t)0xBB && \ (uint8_t)(s[2]) == (uint8_t)0xBF) #define STRI__ENC_HAS_BOM_UTF16LE(s, n) \ bool(n >= 2 && \ (uint8_t)(s[0]) == (uint8_t)0xFF && \ (uint8_t)(s[1]) == (uint8_t)0xFE && \ (n < 4 || ((uint8_t)(s[2]) != (uint8_t)0x00 || \ (uint8_t)(s[3]) != (uint8_t)0x00))) #define STRI__ENC_HAS_BOM_UTF16BE(s, n) \ bool(n >= 2 && \ (uint8_t)(s[0]) == (uint8_t)0xFE && \ (uint8_t)(s[1]) == (uint8_t)0xFF) #define STRI__ENC_HAS_BOM_UTF32BE(s, n) \ bool(n >= 4 && \ (STRI__GET_INT32_BE(str_cur_s, 0) == 0x0000FEFFUL)) #define STRI__ENC_HAS_BOM_UTF32LE(s, n) \ bool(n >= 4 && \ (STRI__GET_INT32_LE(str_cur_s, 0) == 0x0000FEFFUL)) // taken from R's Defn.h - sorry, this is needed // CHARSXP charset bits #define BYTES_MASK (1<<1) #define LATIN1_MASK (1<<2) #define UTF8_MASK (1<<3) #define ASCII_MASK (1<<6) #define IS_BYTES(x) ((x)->sxpinfo.gp & BYTES_MASK) #define IS_LATIN1(x) ((x)->sxpinfo.gp & LATIN1_MASK) #define IS_ASCII(x) ((x)->sxpinfo.gp & ASCII_MASK) #define IS_UTF8(x) ((x)->sxpinfo.gp & UTF8_MASK) #define ENC_KNOWN(x) ((x)->sxpinfo.gp & (LATIN1_MASK | UTF8_MASK | ASCII_MASK)) #define isRaw(x) (TYPEOF(x) == RAWSXP) /// Unicode replacement character #define UCHAR_REPLACEMENT 0xFFFD #define ASCII_SUBSTITUTE 0x1A #define ASCII_MAXCHARCODE 127 #define UCHAR_REPLACEMENT_UTF8_BYTE1 0xef #define UCHAR_REPLACEMENT_UTF8_BYTE2 0xbf #define UCHAR_REPLACEMENT_UTF8_BYTE3 0xbd #define UTF8_BOM_BYTE1 ((uint8_t)0xef) #define UTF8_BOM_BYTE2 ((uint8_t)0xbb) #define UTF8_BOM_BYTE3 ((uint8_t)0xbf) #define ASCII_CR 0x0D #define ASCII_LF 0x0A #define ASCII_FF 0x0C #define ASCII_VT 0x0B #define UCHAR_NEL 0x0085 #define UCHAR_LS 0x2028 #define UCHAR_PS 0x2029 #endif stringi/src/stri_search_fixed_split.cpp0000644000175100001440000001770512612166246020162 0ustar hornikusers/* This file is part of the 'stringi' package for R. * Copyright (C) 2013-2015, Marek Gagolewski and Bartek Tartanus * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "stri_stringi.h" #include "stri_container_utf8.h" #include "stri_container_bytesearch.h" #include "stri_container_integer.h" #include "stri_container_logical.h" #include #include using namespace std; /** * Split a string into parts [byte compare] * * The pattern matches identify delimiters that separate the input into fields. * The input data between the matches becomes the fields themselves. 
* * @param str character vector * @param pattern character vector * @param n integer vector * @param omit_empty logical vector * @param tokens_only single logical value * @param simplify single logical value * * @return list of character vectors or character matrix * * @version 0.1-?? (Bartek Tartanus) * * @version 0.1-?? (Marek Gagolewski, 2013-06-25) * StriException friendly, use StriContainerUTF8 * * @version 0.1-?? (Marek Gagolewski, 2013-07-10) * BUGFIX: wrong behavior on empty str * * @version 0.2-3 (Marek Gagolewski, 2014-05-08) * stri_split_fixed now uses byte search only * * @version 0.3-1 (Marek Gagolewski, 2014-10-19) * added tokens_only param * * @version 0.3-1 (Marek Gagolewski, 2014-10-23) * added split param * * @version 0.3-1 (Marek Gagolewski, 2014-10-24) * allow omit_empty=NA * * @version 0.3-1 (Marek Gagolewski, 2014-11-05) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc * * @version 0.4-1 (Marek Gagolewski, 2014-12-04) * allow `simplify=NA`; FR #126: pass n to stri_list2matrix * * @version 0.4-1 (Marek Gagolewski, 2014-12-07) * FR #110, #23: opts_fixed arg added * * @version 0.5-1 (Marek Gagolewski, 2015-02-14) * use StriByteSearchMatcher */ SEXP stri_split_fixed(SEXP str, SEXP pattern, SEXP n, SEXP omit_empty, SEXP tokens_only, SEXP simplify, SEXP opts_fixed) { uint32_t pattern_flags = StriContainerByteSearch::getByteSearchFlags(opts_fixed); bool tokens_only1 = stri__prepare_arg_logical_1_notNA(tokens_only, "tokens_only"); PROTECT(simplify = stri_prepare_arg_logical_1(simplify, "simplify")); PROTECT(str = stri_prepare_arg_string(str, "str")); PROTECT(pattern = stri_prepare_arg_string(pattern, "pattern")); PROTECT(n = stri_prepare_arg_integer(n, "n")); PROTECT(omit_empty = stri_prepare_arg_logical(omit_empty, "omit_empty")); STRI__ERROR_HANDLER_BEGIN(5) R_len_t vectorize_length = stri__recycling_rule(true, 4, LENGTH(str), LENGTH(pattern), LENGTH(n), LENGTH(omit_empty)); StriContainerUTF8 str_cont(str, vectorize_length); 
StriContainerByteSearch pattern_cont(pattern, vectorize_length, pattern_flags); StriContainerInteger n_cont(n, vectorize_length); StriContainerLogical omit_empty_cont(omit_empty, vectorize_length); SEXP ret; STRI__PROTECT(ret = Rf_allocVector(VECSXP, vectorize_length)); for (R_len_t i = pattern_cont.vectorize_init(); i != pattern_cont.vectorize_end(); i = pattern_cont.vectorize_next(i)) { if (n_cont.isNA(i)) { SET_VECTOR_ELT(ret, i, stri__vector_NA_strings(1)); continue; } int n_cur = n_cont.get(i); int omit_empty_cur = !omit_empty_cont.isNA(i) && omit_empty_cont.get(i); STRI__CONTINUE_ON_EMPTY_OR_NA_STR_PATTERN(str_cont, pattern_cont, SET_VECTOR_ELT(ret, i, stri__vector_NA_strings(1));, SET_VECTOR_ELT(ret, i, (omit_empty_cont.isNA(i))?stri__vector_NA_strings(1): stri__vector_empty_strings((omit_empty_cur || n_cur == 0)?0:1));) R_len_t str_cur_n = str_cont.get(i).length(); const char* str_cur_s = str_cont.get(i).c_str(); if (n_cur >= INT_MAX-1) throw StriException(MSG__EXPECTED_SMALLER, "n"); else if (n_cur < 0) n_cur = INT_MAX; else if (n_cur == 0) { SET_VECTOR_ELT(ret, i, Rf_allocVector(STRSXP, 0)); continue; } else if (tokens_only1) n_cur++; // we need to do one split ahead here StriByteSearchMatcher* matcher = pattern_cont.getMatcher(i); matcher->reset(str_cont.get(i).c_str(), str_cont.get(i).length()); R_len_t k; deque< pair > fields; // byte based-indices fields.push_back(pair(0,0)); for (k=1; k < n_cur && USEARCH_DONE != matcher->findNext(); ) { R_len_t s1 = (R_len_t)matcher->getMatchedStart(); R_len_t s2 = (R_len_t)matcher->getMatchedLength() + s1; if (omit_empty_cur && fields.back().first == s1) fields.back().first = s2; // don't start any new field else { fields.back().second = s1; fields.push_back(pair(s2, s2)); // start a new field here ++k; // another field } } fields.back().second = str_cur_n; if (omit_empty_cur && fields.back().first == fields.back().second) fields.pop_back(); if (tokens_only1 && n_cur < INT_MAX) { n_cur--; // one split ahead could 
have been made, see above while (fields.size() > (size_t)n_cur) fields.pop_back(); // get rid of the remainder } SEXP ans; STRI__PROTECT(ans = Rf_allocVector(STRSXP, fields.size())); deque< pair >::iterator iter = fields.begin(); for (k = 0; iter != fields.end(); ++iter, ++k) { pair curoccur = *iter; if (curoccur.second == curoccur.first && omit_empty_cont.isNA(i)) SET_STRING_ELT(ans, k, NA_STRING); else SET_STRING_ELT(ans, k, Rf_mkCharLenCE(str_cur_s+curoccur.first, curoccur.second-curoccur.first, CE_UTF8)); } SET_VECTOR_ELT(ret, i, ans); STRI__UNPROTECT(1); } if (LOGICAL(simplify)[0] == NA_LOGICAL || LOGICAL(simplify)[0]) { R_len_t n_min = 0; R_len_t n_length = LENGTH(n); int* n_tab = INTEGER(n); for (R_len_t i=0; i /** * Detect if a pattern occurs in a string [with collation] * * @param str character vector * @param pattern character vector * @param omit_na single logical value * @param opts_collator passed to stri__ucol_open(), * if \code{NA}, then \code{stri_detect_fixed_byte} is called * @return character vector * * @version 0.3-1 (Bartek Tartanus, 2014-07-25) * * @version 0.3-1 (Marek Gagolewski, 2014-10-17) * using std::vector to avoid mem-leaks * * @version 0.3-1 (Marek Gagolewski, 2014-11-04) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc * * @version 0.3-1 (Marek Gagolewski, 2014-11-06) * Added missing ucol_close * * @version 0.4-1 (Marek Gagolewski, 2014-12-04) * FR #122: omit_na arg added */ SEXP stri_subset_coll(SEXP str, SEXP pattern, SEXP omit_na, SEXP opts_collator) { bool omit_na1 = stri__prepare_arg_logical_1_notNA(omit_na, "omit_na"); PROTECT(str = stri_prepare_arg_string(str, "str")); PROTECT(pattern = stri_prepare_arg_string(pattern, "pattern")); // call stri__ucol_open after prepare_arg: // if prepare_arg had failed, we would have a mem leak UCollator* collator = NULL; collator = stri__ucol_open(opts_collator); STRI__ERROR_HANDLER_BEGIN(2) R_len_t vectorize_length = stri__recycling_rule(true, 2, LENGTH(str), 
LENGTH(pattern)); StriContainerUTF16 str_cont(str, vectorize_length); StriContainerUStringSearch pattern_cont(pattern, vectorize_length, collator); // collator is not owned by pattern_cont // BT: this cannot be done with deque, because pattern is reused so i does not // go like 0,1,2...n but 0,pat_len,2*pat_len,1,pat_len+1 and so on // MG: agreed std::vector which(vectorize_length); int result_counter = 0; for (R_len_t i = pattern_cont.vectorize_init(); i != pattern_cont.vectorize_end(); i = pattern_cont.vectorize_next(i)) { STRI__CONTINUE_ON_EMPTY_OR_NA_STR_PATTERN(str_cont, pattern_cont, {if (omit_na1) which[i] = FALSE; else {which[i] = NA_LOGICAL; result_counter++;} }, {which[i] = FALSE; }) UStringSearch *matcher = pattern_cont.getMatcher(i, str_cont.get(i)); usearch_reset(matcher); UErrorCode status = U_ZERO_ERROR; which[i] = ((int)usearch_first(matcher, &status) != USEARCH_DONE); // this is F*G slow! :-( if (which[i]) result_counter++; STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) } if (collator) { ucol_close(collator); collator = NULL; } SEXP ret; STRI__PROTECT(ret = stri__subset_by_logical(str_cont, which, result_counter)); STRI__UNPROTECT_ALL return ret; STRI__ERROR_HANDLER_END( if (collator) { ucol_close(collator); collator = NULL; } ) } stringi/src/stri_TODO_justify.cpp0000644000175100001440000000562412612166246016642 0ustar hornikusers/* This file is part of the 'stringi' package for R. * Copyright (C) 2013-2015, Marek Gagolewski and Bartek Tartanus * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. 
Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "stri_stringi.h" ///** // vectorized over s // if s is NA the result will be NA // //*/ //SEXP stri_justify(SEXP s, SEXP width) //{ // s = stri_prepare_arg_string(s, "str"); // prepare string argument // width = stri_prepare_arg_integer(width, "width"); // // int ns = LENGTH(s); // int nwidth = LENGTH(width); // int nmax = ns; // if(nwidth > nmax) nmax = nwidth; // if(ns == 0 || nwidth == 0) // return allocVector(STRSXP, 0); // if(nmax % ns != 0 || nmax % nwidth != 0) // warning(MSG__WARN_RECYCLING_RULE); // // SEXP e, curs; // PROTECT(e = allocVector(STRSXP, nmax)); // int j=0,k=0,curwidth; // // for (int i=0; i < nmax; ++i) // { // curs = STRING_ELT(s, i % ns); // curwidth = INTEGER(width)[i % nwidth]; // if (curs == NA_STRING || curwidth == NA_INTEGER){ // SET_STRING_ELT(e, i, NA_STRING); // continue; // } // const char* string = CHAR(curs); // int nstring = LENGTH(curs); // for(j=0; j < nstring ; ++j){ // if(string[j] != ' ') // break; // } // for(k=0; k < nstring ; ++k){ // if(string[nstring-1-k] != ' ') // break; // } // SET_STRING_ELT(e, i, mkCharLen(string+j, max(0,nstring-k-j))); // // } // UNPROTECT(1); // return e; //} stringi/src/stri_container_listutf8.cpp0000644000175100001440000001116112612166246020135 0ustar hornikusers/* This file is part of the 'stringi' package for R. * Copyright (C) 2013-2015, Marek Gagolewski and Bartek Tartanus * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. 
Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "stri_stringi.h" #include "stri_container_listutf8.h" /** * Default constructor * */ StriContainerListUTF8::StriContainerListUTF8() : StriContainerBase() { data = NULL; } /** * Construct String Container from R character vector * @param rvec R list vector * @param nrecycle extend length of each character vector stored [vectorization] * @param shallowrecycle will stored character vectors be ever modified? 
*/ StriContainerListUTF8::StriContainerListUTF8(SEXP rvec, R_len_t _nrecycle, bool _shallowrecycle) { this->data = NULL; #ifndef NDEBUG if (!Rf_isVectorList(rvec)) throw StriException("DEBUG: !isVectorList in StriContainerListUTF8::StriContainerListUTF8(SEXP rvec)"); #endif R_len_t rvec_length = LENGTH(rvec); this->init_Base(rvec_length, rvec_length, true); if (this->n > 0) { this->data = new StriContainerUTF8*[this->n]; if (!this->data) throw StriException(MSG__MEM_ALLOC_ERROR); for (R_len_t i=0; in; ++i) this->data[i] = NULL; // in case it fails during conversion (this is "NA") for (R_len_t i=0; in; ++i) { R_len_t strlist_cur_length = LENGTH(VECTOR_ELT(rvec, i)); if (_nrecycle % strlist_cur_length != 0) { Rf_warning(MSG__WARN_RECYCLING_RULE); break; } } for (R_len_t i=0; in; ++i) { this->data[i] = new StriContainerUTF8(VECTOR_ELT(rvec, i), _nrecycle, _shallowrecycle); if (!this->data[i]) throw StriException(MSG__MEM_ALLOC_ERROR); } } } StriContainerListUTF8::StriContainerListUTF8(StriContainerListUTF8& container) : StriContainerBase((StriContainerBase&)container) { if (container.data) { this->data = new StriContainerUTF8*[this->n]; if (!this->data) throw StriException(MSG__MEM_ALLOC_ERROR); for (int i=0; idata[i] = new StriContainerUTF8(*container.data[i]); if (!this->data[i]) throw StriException(MSG__MEM_ALLOC_ERROR); } else this->data[i] = NULL; } } else { this->data = NULL; } } StriContainerListUTF8& StriContainerListUTF8::operator=(StriContainerListUTF8& container) { this->~StriContainerListUTF8(); (StriContainerBase&) (*this) = (StriContainerBase&)container; if (container.data) { this->data = new StriContainerUTF8*[this->n]; if (!this->data) throw StriException(MSG__MEM_ALLOC_ERROR); for (int i=0; idata[i] = new StriContainerUTF8(*container.data[i]); if (!this->data[i]) throw StriException(MSG__MEM_ALLOC_ERROR); } else this->data[i] = NULL; } } else { this->data = NULL; } return *this; } StriContainerListUTF8::~StriContainerListUTF8() { if (data) { for (int 
i=0; i= nrecycle) throw StriException("StriContainerUTF16::isNA(): INDEX OUT OF BOUNDS"); #endif return str[i%n].isBogus(); } /** get the vectorized ith element * @param i index * @return string */ inline const UnicodeString& get(R_len_t i) const { #ifndef NDEBUG if (isNA(i)) throw StriException("StriContainerUTF16::get(): isNA"); #endif return str[i%n]; } /** get the vectorized ith element * @param i index * @return string */ inline UnicodeString& getWritable(R_len_t i) { #ifndef NDEBUG if (isShallow) throw StriException("StriContainerUTF16::getWritable(): shallow StriContainerUTF16"); if (n != nrecycle) throw StriException("StriContainerUTF16::getWritable(): n!=nrecycle"); if (i < 0 || i >= n) throw StriException("StriContainerUTF16::getWritable(): INDEX OUT OF BOUNDS"); if (isNA(i)) throw StriException("StriContainerUTF16::getWritable(): isNA"); #endif return str[i%n]; // in fact, "%n" is not necessary } /** set NA * @param i index */ inline void setNA(R_len_t i) { #ifndef NDEBUG if (isShallow) throw StriException("StriContainerUTF16::getWritable(): shallow StriContainerUTF16"); if (n != nrecycle) throw StriException("StriContainerUTF16::getWritable(): n!=nrecycle"); if (i < 0 || i >= n) throw StriException("StriContainerUTF16::getWritable(): INDEX OUT OF BOUNDS"); #endif str[i%n].setToBogus(); } /** set the vectorized ith element * @param i index * @param s string to be copied */ inline void set(R_len_t i, const UnicodeString& s) { #ifndef NDEBUG if (isShallow) throw StriException("StriContainerUTF16::set(): shallow StriContainerUTF16"); if (n != nrecycle) throw StriException("StriContainerUTF16::set(): n!=nrecycle"); if (i < 0 || i >= n) throw StriException("StriContainerUTF16::set(): INDEX OUT OF BOUNDS"); if (str[i%n].isBogus()) throw StriException("StriContainerUTF16::set(): isNA"); #endif str[i%n].setTo(s); // in fact, "%n" is not necessary } // @QUESTION: separate StriContainerUTF16_indexable? 
void UChar16_to_UChar32_index(R_len_t i, int* i1, int* i2, const int ni, int adj1, int adj2); }; #endif stringi/src/stri_external.h0000644000175100001440000000474112612166246015606 0ustar hornikusers/* This file is part of the 'stringi' package for R. * Copyright (C) 2013-2015, Marek Gagolewski and Bartek Tartanus * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #ifndef __stri_external_h #define __stri_external_h #ifdef U_CHARSET_IS_UTF8 // do not enable this (must be unset before including ICU headers): #undef U_CHARSET_IS_UTF8 #endif #ifndef NDEBUG //#define U_HIDE_DRAFT_API #define U_HIDE_DEPRECATED_API #endif #define UNISTR_FROM_CHAR_EXPLICIT explicit #define UNISTR_FROM_STRING_EXPLICIT explicit #include #include #include #include #include #include #include #include #include #include #include using namespace icu; #define USE_RINTERNALS #define R_NO_REMAP #include #include #include #include #include #include #include #endif stringi/src/stri_container_listraw.cpp0000644000175100001440000001104712612166246020043 0ustar hornikusers/* This file is part of the 'stringi' package for R. * Copyright (C) 2013-2015, Marek Gagolewski and Bartek Tartanus * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "stri_stringi.h" #include "stri_container_listraw.h" /** * Default constructor * */ StriContainerListRaw::StriContainerListRaw() : StriContainerBase() { data = NULL; } /** * Construct String Container from R object * @param rstr R object * * if you want nrecycle > n, call set_nrecycle */ StriContainerListRaw::StriContainerListRaw(SEXP rstr) { this->data = NULL; if (isNull(rstr)) { this->init_Base(1, 1, true); this->data = new String8[this->n]; // 1 string, NA if (!this->data) throw StriException(MSG__MEM_ALLOC_ERROR); } else if (isRaw(rstr)) { this->init_Base(1, 1, true); this->data = new String8[this->n]; if (!this->data) throw StriException(MSG__MEM_ALLOC_ERROR); this->data[0].initialize((const char*)RAW(rstr), LENGTH(rstr), false/*memalloc*/, false/*killbom*/, false/*isASCII*/); // shallow copy } else if (Rf_isVectorList(rstr)) { R_len_t nv = LENGTH(rstr); this->init_Base(nv, nv, true); this->data = new String8[this->n]; if (!this->data) throw StriException(MSG__MEM_ALLOC_ERROR); for (R_len_t i=0; in; ++i) { SEXP cur = VECTOR_ELT(rstr, i); if (!isNull(cur)) this->data[i].initialize((const char*)RAW(cur), LENGTH(cur), false/*memalloc*/, false/*killbom*/, false/*isASCII*/); // shallow copy // else leave as-is, i.e. 
NA } } else { // it's surely a character vector (args have been checked) R_len_t nv = LENGTH(rstr); this->init_Base(nv, nv, true); this->data = new String8[this->n]; if (!this->data) throw StriException(MSG__MEM_ALLOC_ERROR); for (R_len_t i=0; in; ++i) { SEXP cur = STRING_ELT(rstr, i); if (cur != NA_STRING) this->data[i].initialize(CHAR(cur), LENGTH(cur), false/*memalloc*/, false/*killbom*/, false/*isASCII*/); // shallow copy // else leave as-is, i.e. NA } } } StriContainerListRaw::StriContainerListRaw(StriContainerListRaw& container) : StriContainerBase((StriContainerBase&)container) { if (container.data) { this->data = new String8[this->n]; if (!this->data) throw StriException(MSG__MEM_ALLOC_ERROR); for (int i=0; in; ++i) { this->data[i] = container.data[i]; } } else { this->data = NULL; } } StriContainerListRaw& StriContainerListRaw::operator=(StriContainerListRaw& container) { this->~StriContainerListRaw(); (StriContainerBase&) (*this) = (StriContainerBase&)container; if (container.data) { this->data = new String8[this->n]; if (!this->data) throw StriException(MSG__MEM_ALLOC_ERROR); for (int i=0; in; ++i) { this->data[i] = container.data[i]; } } else { this->data = NULL; } return *this; } StriContainerListRaw::~StriContainerListRaw() { if (data) { delete [] data; data = NULL; } } stringi/src/stri_search_class_subset.cpp0000644000175100001440000001013312612166246020326 0ustar hornikusers/* This file is part of the 'stringi' package for R. * Copyright (C) 2013-2015, Marek Gagolewski and Bartek Tartanus * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. 
Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "stri_stringi.h" #include "stri_container_utf8.h" #include "stri_container_charclass.h" /** * Detect if a character class occurs in a string * * @param str character vector * @param pattern character vector * @param omit_na single logical value * @return logical vector * * @version 0.3-1 (Bartek Tartanus, 2014-07-25) * * @version 0.3-1 (Marek Gagolewski, 2014-10-17) * using std::vector to avoid mem-leaks * * @version 0.3-1 (Marek Gagolewski, 2014-11-04) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc * * @version 0.4-1 (Marek Gagolewski, 2014-12-04) * FR #122: omit_na arg added */ SEXP stri_subset_charclass(SEXP str, SEXP pattern, SEXP omit_na) { bool omit_na1 = stri__prepare_arg_logical_1_notNA(omit_na, "omit_na"); PROTECT(str = stri_prepare_arg_string(str, "str")); PROTECT(pattern = stri_prepare_arg_string(pattern, "pattern")); R_len_t vectorize_length = stri__recycling_rule(true, 2, LENGTH(str), LENGTH(pattern)); STRI__ERROR_HANDLER_BEGIN(2) StriContainerUTF8 str_cont(str, vectorize_length); StriContainerCharClass pattern_cont(pattern, vectorize_length); // BT: this cannot be done with deque, because pattern is reused so i does not // go like 0,1,2...n but 0,pat_len,2*pat_len,1,pat_len+1 and so on // MG: agreed std::vector which(vectorize_length); int result_counter = 0; for (R_len_t i = pattern_cont.vectorize_init(); i != pattern_cont.vectorize_end(); i = pattern_cont.vectorize_next(i)) { if (str_cont.isNA(i) || pattern_cont.isNA(i)) { if (omit_na1) which[i] = FALSE; else { which[i] = NA_LOGICAL; result_counter++; } continue; } const UnicodeSet* pattern_cur = &pattern_cont.get(i); R_len_t str_cur_n = str_cont.get(i).length(); const char* str_cur_s = str_cont.get(i).c_str(); UChar32 chr = 0; which[i] = FALSE; for (R_len_t j=0; jcontains(chr)) { which[i] = TRUE; result_counter++; break; } } } SEXP ret; STRI__PROTECT(ret = stri__subset_by_logical(str_cont, which, result_counter)); STRI__UNPROTECT_ALL return ret; 
STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */) } stringi/src/stri_container_listraw.h0000644000175100001440000000637112612166246017514 0ustar hornikusers/* This file is part of the 'stringi' package for R. * Copyright (C) 2013-2015, Marek Gagolewski and Bartek Tartanus * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #ifndef __stri_container_listraw_h #define __stri_container_listraw_h #include "stri_container_base.h" /** * Contains R lists of raw vectors, single raw vectors, * or character string vectors treated as "byte"-encoded. * Useful for encoding conversion or detection. * Each string is represented by the String8 class, * with shallow copy of byte data. * * @version 0.1-?? (Marek Gagolewski, 2013-08-08) * * @version 0.2-1 (Marek Gagolewski, 2014-03-25) * data as String8* and not String8** (performance gain) */ class StriContainerListRaw : public StriContainerBase { private: String8* data; public: StriContainerListRaw(); StriContainerListRaw(SEXP rlist); StriContainerListRaw(StriContainerListRaw& container); ~StriContainerListRaw(); StriContainerListRaw& operator=(StriContainerListRaw& container); /** check if the vectorized ith element is NA * @param i index * @return true if is NA */ inline bool isNA(R_len_t i) const { #ifndef NDEBUG if (i < 0 || i >= nrecycle) throw StriException("StriContainerListRaw::isNA(): INDEX OUT OF BOUNDS"); #endif return (data[i%n].isNA()); } /** get the vectorized ith element * @param i index * @return string, read only */ const String8& get(R_len_t i) const { #ifndef NDEBUG if (i < 0 || i >= nrecycle) throw StriException("StriContainerListRaw::get(): INDEX OUT OF BOUNDS"); if (data[i%n].isNA()) throw StriException("StriContainerListRaw::get(): isNA"); #endif return data[i%n]; } }; #endif stringi/src/stri_interval.h0000644000175100001440000000404212612166246015602 0ustar hornikusers/* This file is part of the 'stringi' package for R. * Copyright (C) 2013-2015, Marek Gagolewski and Bartek Tartanus * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. 
/* Interval [a, b] with an attached payload. */

#ifndef __stri_interval_h
#define __stri_interval_h


/**
 * An interval described by two integer end points, with user data attached.
 *
 * @tparam T type of the attached data; must be copy-constructible
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-01)
 */
template <class T>
struct StriInterval {
   int a;    ///< first end point
   int b;    ///< second end point
   T data;   ///< payload carried along with the interval

   /**
    * @param _a first end point
    * @param _b second end point
    * @param _data payload (copied)
    */
   StriInterval(int _a, int _b, const T& _data)
      : a(_a), b(_b), data(_data)   // initializer list: T needs no default ctor
   {
   }
};


/**
 * Order intervals by their first end point only
 * (b and data do not take part in the comparison).
 */
template <class T>
bool operator<(const StriInterval<T>& i1, const StriInterval<T>& i2) {
   return (i1.a < i2.a);
}

#endif
Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef __stri_container_regex_h #define __stri_container_regex_h #include #include "stri_container_utf16.h" /** * A class to handle regex searches * * @version 0.1-?? 
/**
 * A class to handle regex searches
 *
 * Derives from StriContainerUTF16 and additionally stores the regex flags
 * together with the most recently used RegexMatcher.
 *
 * @version 0.1-?? (Marek Gagolewski, 2013-06-17)
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-04-18)
 *          BUGFIX: memleaks on StriException
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-05-27)
 *          BUGFIX: invalid matcher reuse on empty search string
 */
class StriContainerRegexPattern : public StriContainerUTF16 {

   private:

      uint32_t flags; ///< RegexMatcher flags
      RegexMatcher* lastMatcher; ///< recently used \code{RegexMatcher}
      R_len_t lastMatcherIndex; ///< used by vectorize_getMatcher

   public:

      /** translate an R-level options object into a flag word;
       *  the result has the same type as the \code{flags} ctor argument */
      static uint32_t getRegexFlags(SEXP opts_regex);

      StriContainerRegexPattern();

      /** @param rstr R object holding the patterns
       *  @param nrecycle recycled length [vectorization]
       *  @param flags RegexMatcher flags */
      StriContainerRegexPattern(SEXP rstr, R_len_t nrecycle, uint32_t flags);

      StriContainerRegexPattern(StriContainerRegexPattern& container);
      ~StriContainerRegexPattern();
      StriContainerRegexPattern& operator=(StriContainerRegexPattern& container);

      /** matcher for the vectorized ith pattern.
       *  NOTE(review): lastMatcher/lastMatcherIndex suggest that the matcher
       *  of the most recently requested index is kept for reuse -- confirm
       *  against the implementation file. */
      RegexMatcher* getMatcher(R_len_t i);
};

#endif
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "stri_stringi.h" #include "stri_container_utf16.h" #include "stri_container_usearch.h" #include /** * Detect if a pattern occurs in a string [with collation] * * @param str character vector * @param pattern character vector * @param opts_collator passed to stri__ucol_open(), * if \code{NA}, then \code{stri_detect_fixed_byte} is called * @return logical vector * * @version 0.1-?? (Marek Gagolewski) * * @version 0.1-?? (Marek Gagolewski) * corrected behavior on empty str/pattern * * @version 0.1-?? 
(Marek Gagolewski, 2013-06-22) * make StriException-friendly, use StriContainerUStringSearch * * @version 0.2-3 (Marek Gagolewski, 2014-05-08) * new fun: stri_detect_coll (opts_collator == NA not allowed) * * @version 0.3-1 (Marek Gagolewski, 2014-11-04) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc */ SEXP stri_detect_coll(SEXP str, SEXP pattern, SEXP opts_collator) { PROTECT(str = stri_prepare_arg_string(str, "str")); PROTECT(pattern = stri_prepare_arg_string(pattern, "pattern")); // call stri__ucol_open after prepare_arg: // if prepare_arg had failed, we would have a mem leak UCollator* collator = NULL; collator = stri__ucol_open(opts_collator); STRI__ERROR_HANDLER_BEGIN(2) R_len_t vectorize_length = stri__recycling_rule(true, 2, LENGTH(str), LENGTH(pattern)); StriContainerUTF16 str_cont(str, vectorize_length); StriContainerUStringSearch pattern_cont(pattern, vectorize_length, collator); // collator is not owned by pattern_cont SEXP ret; STRI__PROTECT(ret = Rf_allocVector(LGLSXP, vectorize_length)); int* ret_tab = LOGICAL(ret); for (R_len_t i = pattern_cont.vectorize_init(); i != pattern_cont.vectorize_end(); i = pattern_cont.vectorize_next(i)) { STRI__CONTINUE_ON_EMPTY_OR_NA_STR_PATTERN(str_cont, pattern_cont, ret_tab[i] = NA_LOGICAL, ret_tab[i] = FALSE) UStringSearch *matcher = pattern_cont.getMatcher(i, str_cont.get(i)); usearch_reset(matcher); UErrorCode status = U_ZERO_ERROR; ret_tab[i] = ((int)usearch_first(matcher, &status) != USEARCH_DONE); // this is F*G slow! :-( STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) } if (collator) { ucol_close(collator); collator=NULL; } STRI__UNPROTECT_ALL return ret; STRI__ERROR_HANDLER_END( if (collator) ucol_close(collator); ) } stringi/src/stri_wrap.cpp0000644000175100001440000004203512612166246015266 0ustar hornikusers/* This file is part of the 'stringi' package for R. * Copyright (C) 2013-2015, Marek Gagolewski and Bartek Tartanus * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "stri_stringi.h" #include "stri_container_utf8_indexable.h" #include #include #include #include #include /** Greedy word wrap algorithm * * @param wrap_after [out] * @param nwords number of "words" * @param width_val maximal desired out line width * @param widths_orig ith word width original * @param widths_trim ith word width trimmed * @param add_para_1 * @param add_para_n * * @version 0.1-?? 
(Bartek Tartanus)
 *          original implementation
 *
 * @version 0.2-2 (Marek Gagolewski, 2014-04-28)
 *          BreakIterator usage mods
 *
 * @version 0.4-1 (Marek Gagolewski, 2014-12-06)
 *          new args: add_para_1, add_para_n
 */
void stri__wrap_greedy(std::deque<R_len_t>& wrap_after,
   R_len_t nwords, int width_val,
   const std::vector<R_len_t>& widths_orig,
   const std::vector<R_len_t>& widths_trim,
   int add_para_1, int add_para_n)
{
   // Nothing to wrap; also guards the widths_orig[0] access below.
   if (nwords <= 0)
      return;

   // cur_len: length of the line being filled. The first line starts
   // with add_para_1 extra units, every following line with add_para_n.
   R_len_t cur_len = add_para_1+widths_orig[0];

   for (R_len_t j = 1; j < nwords; ++j) {
      // Whether word j still fits is decided on its trimmed width ...
      if (cur_len + widths_trim[j] > width_val) {
         // ... it does not: break after word j-1 and open a new line with j
         cur_len = add_para_n+widths_orig[j];
         wrap_after.push_back(j-1);
      }
      else {
         // ... it does: the line grows by the word's original width
         cur_len += widths_orig[j];
      }
   }
}


/** Dynamic word wrap algorithm
 * (Knuth's word wrapping algorithm that minimizes raggedness of formatted text)
 *
 * @param wrap_after [out]
 * @param nwords number of "words"
 * @param width_val maximal desired out line width
 * @param exponent_val cost function exponent
 * @param widths_orig ith word width original
 * @param widths_trim ith word width trimmed
 * @param add_para_1
 * @param add_para_n
 *
 * @version 0.1-??
(Bartek Tartanus) * original implementation * * @version 0.2-2 (Marek Gagolewski, 2014-04-30) * BreakIterator usage mods * * @version 0.4-1 (Marek Gagolewski, 2014-12-06) * new args: add_para_1, add_para_n, * cost of the last line is zero */ void stri__wrap_dynamic(std::deque& wrap_after, R_len_t nwords, int width_val, double exponent_val, const std::vector& widths_orig, const std::vector& widths_trim, int add_para_1, int add_para_n) { #define IDX(i,j) (i)*nwords+(j) vector cost(nwords*nwords); // where cost[IDX(i,j)] == cost of printing words i..j in a single line, i<=j // calculate costs: // there is some "punishment" for leaving blanks at the end of each line // (number of "blank" codepoints ^ exponent_val) for (int i=0; i i) { if (cost[IDX(i,j-1)] < 0.0) { // already Inf cost[IDX(i,j)] = -1.0; // Inf continue; } else { sum -= widths_trim[j-1]; sum += widths_orig[j-1]; } } sum += widths_trim[j]; int ct = width_val - sum; if (i == 0) ct -= add_para_1; else ct -= add_para_n; if (j == nwords-1) { // last line == cost 0 if (j == i || ct >= 0) cost[IDX(i,j)] = 0.0; else cost[IDX(i,j)] = -1.0/*Inf*/; } else if (j == i) // some words don't fit in a line at all -> cost 0.0 cost[IDX(i,j)] = (ct < 0) ? 0.0 : pow((double)ct, exponent_val); else cost[IDX(i,j)] = (ct < 0) ? 
-1.0/*"Inf"*/ : pow((double)ct, exponent_val); } } vector f(nwords); // f[j] == total cost of (optimally) printing words 0..j vector where(nwords*nwords, false); // where[IDX(i,j)] == false iff // we don't wrap after i-th word, i<=j // when (optimally) printing words 0..j for (int j=0; j= 0.0) { // no breaking needed: words 0..j fit in one line f[j] = cost[IDX(0,j)]; continue; } // let i = optimal way of printing of words 0..i + printing i+1..j int i = 0; while (i <= j) { if (cost[IDX(i+1,j)] >= 0.0) break; ++i; } double best_i = f[i] + cost[IDX(i+1,j)]; for (int k=i+1; k 0): prefix +exdent // 1st line, nth para (i> 0, u==0): prefix +indent // nth line, nth para (i> 0, u> 0): prefix +exdent StriWrapLineStart ii(initial_cont.get(0), indent_val); StriWrapLineStart pi(prefix_cont.get(0), indent_val); StriWrapLineStart pe(prefix_cont.get(0), exdent_val); status = U_ZERO_ERROR; //Unicode Newline Guidelines - Unicode Technical Report #13 UnicodeSet uset_linebreaks(UnicodeString::fromUTF8("[\\u000A-\\u000D\\u0085\\u2028\\u2029]"), status); STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) uset_linebreaks.freeze(); status = U_ZERO_ERROR; UnicodeSet uset_whitespaces(UnicodeString::fromUTF8("\\p{White_space}"), status); STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) uset_whitespaces.freeze(); SEXP ret; STRI__PROTECT(ret = Rf_allocVector(VECSXP, str_length)); for (R_len_t i = 0; i < str_length; ++i) { if (str_cont.isNA(i) || prefix_cont.isNA(0) || initial_cont.isNA(0)) { SET_VECTOR_ELT(ret, i, stri__vector_NA_strings(1)); continue; } status = U_ZERO_ERROR; const char* str_cur_s = str_cont.get(i).c_str(); R_len_t str_cur_n = str_cont.get(i).length(); str_text = utext_openUTF8(str_text, str_cur_s, str_cont.get(i).length(), &status); STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) status = U_ZERO_ERROR; briter->setText(str_text, status); STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) // all 
right, first let's generate a list of places at which we may do line breaks deque< R_len_t > occurrences_list; // this could be an R_len_t queue R_len_t match = briter->first(); while (match != BreakIterator::DONE) { if (!whitespace_only_val) occurrences_list.push_back(match); else { if (match > 0 && match < str_cur_n) { UChar32 c; U8_GET((const uint8_t*)str_cur_s, 0, match-1, str_cur_n, c); if (uset_whitespaces.contains(c)) occurrences_list.push_back(match); } else occurrences_list.push_back(match); } match = briter->next(); } R_len_t noccurrences = (R_len_t)occurrences_list.size(); // number of boundaries if (noccurrences <= 1) { // no match (1 boundary == 0) SET_VECTOR_ELT(ret, i, Rf_ScalarString(str_cont.toR(i))); continue; } // the number of "words" is: R_len_t nwords = noccurrences - 1; // convert occurrences_list to a vector // in order to obtain end positions (in a string) of each "words", // noting that occurrences_list.at(0) == 0 #ifndef NDEBUG if (occurrences_list.at(0) != 0) throw StriException("NDEBUG: stri_wrap: (occurrences_list.at(0) != 0)"); #endif std::vector end_pos_orig(nwords); deque::iterator iter = ++(occurrences_list.begin()); for (R_len_t j = 0; iter != occurrences_list.end(); ++iter, ++j) { end_pos_orig[j] = (*iter); // this is a UTF-8 index } // now: // we'll get the total widths/number of code points in each "word" std::vector widths_orig(nwords); // we'll get the total widths/number of code points without trailing whitespaces std::vector widths_trim(nwords); // we'll get the end positions without trailing whitespaces std::vector end_pos_trim(nwords); // detect line endings (fail on a match) UChar32 c = 0; R_len_t j = 0; R_len_t cur_block = 0; R_len_t cur_width_orig = 0; R_len_t cur_width_trim = 0; R_len_t cur_count_orig = 0; R_len_t cur_count_trim = 0; R_len_t cur_end_pos_trim = 0; while (j < str_cur_n) { R_len_t jlast = j; U8_NEXT(str_cur_s, j, str_cur_n, c); if (c < 0) // invalid utf-8 sequence throw StriException(MSG__INVALID_UTF8); 
if (uset_linebreaks.contains(c)) throw StriException(MSG__NEWLINE_FOUND); cur_width_orig += stri__width_char(c); ++cur_count_orig; if (uset_whitespaces.contains(c)) { // OLD: trim all white spaces from the end: // ++cur_count_trim; // [we have the normalize arg for that] // NEW: trim just one white space at the end: cur_width_trim = stri__width_char(c); cur_count_trim = 1; cur_end_pos_trim = jlast; } else { cur_width_trim = 0; cur_count_trim = 0; cur_end_pos_trim = j; } if (j >= str_cur_n || end_pos_orig[cur_block] <= j) { // we'll start a new block in a moment if (use_length_val) { widths_orig[cur_block] = cur_count_orig; widths_trim[cur_block] = cur_count_orig-cur_count_trim; } else { widths_orig[cur_block] = cur_width_orig; widths_trim[cur_block] = cur_width_orig-cur_width_trim; } end_pos_trim[cur_block] = cur_end_pos_trim; cur_block++; cur_width_orig = 0; cur_width_trim = 0; cur_count_orig = 0; cur_count_trim = 0; cur_end_pos_trim = j; } } // do wrap std::deque wrap_after; // wrap line after which word in {0..nwords-1}? 
if (exponent_val <= 0.0) { stri__wrap_greedy(wrap_after, nwords, width_val, widths_orig, widths_trim, (use_length_val)?((i==0)?ii.count:pi.count):((i==0)?ii.width:pi.width), (use_length_val)?pe.count:pe.width); } else { stri__wrap_dynamic(wrap_after, nwords, width_val, exponent_val, widths_orig, widths_trim, (use_length_val)?((i==0)?ii.count:pi.count):((i==0)?ii.width:pi.width), (use_length_val)?pe.count:pe.width); } // wrap_after.size() line breaks => wrap_after.size()+1 lines R_len_t nlines = (R_len_t)wrap_after.size()+1; R_len_t last_pos = 0; SEXP ans; STRI__PROTECT(ans = Rf_allocVector(STRSXP, nlines)); deque::iterator iter_wrap = wrap_after.begin(); for (R_len_t u = 0; iter_wrap != wrap_after.end(); ++iter_wrap, ++u) { R_len_t wrap_after_cur = *iter_wrap; R_len_t cur_pos = end_pos_trim[wrap_after_cur]; std::string cs; if (i == 0 && u == 0) cs = ii.str; else if (i > 0 && u == 0) cs = pi.str; else cs = pe.str; cs.append(str_cur_s+last_pos, cur_pos-last_pos); SET_STRING_ELT(ans, u, Rf_mkCharLenCE(cs.c_str(), cs.size(), CE_UTF8)); last_pos = end_pos_orig[wrap_after_cur]; } // last line goes here: std::string cs; if (i == 0 && nlines-1 == 0) cs = ii.str; else if (i > 0 && nlines-1 == 0) cs = pi.str; else cs = pe.str; cs.append(str_cur_s+last_pos, end_pos_trim[nwords-1]-last_pos); SET_STRING_ELT(ans, nlines-1, Rf_mkCharLenCE(cs.c_str(), cs.size(), CE_UTF8)); SET_VECTOR_ELT(ret, i, ans); STRI__UNPROTECT(1); } if (briter) { delete briter; briter = NULL; } if (str_text) { utext_close(str_text); str_text = NULL; } STRI__UNPROTECT_ALL return ret; STRI__ERROR_HANDLER_END({ if (briter) { delete briter; briter = NULL; } if (str_text) { utext_close(str_text); str_text = NULL; } }) } stringi/src/stri_search_class_extract.cpp0000644000175100001440000002165512612166246020506 0ustar hornikusers/* This file is part of the 'stringi' package for R. * Copyright (C) 2013-2015, Marek Gagolewski and Bartek Tartanus * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "stri_stringi.h" #include "stri_container_utf8.h" #include "stri_container_charclass.h" #include "stri_container_logical.h" #include #include using namespace std; /** * Extract first or last occurrences of a character class in each string * * @param str character vector * @param pattern character vector * @return character vector * * @version 0.1-?? (Marek Gagolewski, 2013-06-08) * * @version 0.1-?? (Marek Gagolewski, 2013-06-15) * Use StrContainerCharClass * * @version 0.1-?? 
(Marek Gagolewski, 2013-06-16) * make StriException-friendly * * @version 0.2-1 (Marek Gagolewski, 2014-04-03) * detects invalid UTF-8 byte stream * * @version 0.2-1 (Marek Gagolewski, 2014-04-05) * StriContainerCharClass now relies on UnicodeSet * * @version 0.3-1 (Marek Gagolewski, 2014-11-04) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc */ SEXP stri__extract_firstlast_charclass(SEXP str, SEXP pattern, bool first) { PROTECT(str = stri_prepare_arg_string(str, "str")); PROTECT(pattern = stri_prepare_arg_string(pattern, "pattern")); R_len_t vectorize_length = stri__recycling_rule(true, 2, LENGTH(str), LENGTH(pattern)); STRI__ERROR_HANDLER_BEGIN(2) StriContainerUTF8 str_cont(str, vectorize_length); StriContainerCharClass pattern_cont(pattern, vectorize_length); SEXP ret; STRI__PROTECT(ret = Rf_allocVector(STRSXP, vectorize_length)); for (R_len_t i = pattern_cont.vectorize_init(); i != pattern_cont.vectorize_end(); i = pattern_cont.vectorize_next(i)) { SET_STRING_ELT(ret, i, NA_STRING); if (str_cont.isNA(i) || pattern_cont.isNA(i)) continue; const UnicodeSet* pattern_cur = &pattern_cont.get(i); R_len_t str_cur_n = str_cont.get(i).length(); const char* str_cur_s = str_cont.get(i).c_str(); R_len_t j, jlast; UChar32 chr; if (first) { for (jlast=j=0; jcontains(chr)) { SET_STRING_ELT(ret, i, Rf_mkCharLenCE(str_cur_s+jlast, j-jlast, CE_UTF8)); break; // that's enough for first } jlast = j; } } else { for (jlast=j=str_cur_n; j>0; ) { U8_PREV(str_cur_s, 0, j, chr); // go backwards if (chr < 0) // invalid utf-8 sequence throw StriException(MSG__INVALID_UTF8); if (pattern_cur->contains(chr)) { SET_STRING_ELT(ret, i, Rf_mkCharLenCE(str_cur_s+j, jlast-j, CE_UTF8)); break; // that's enough for last } jlast = j; } } } STRI__UNPROTECT_ALL return ret; STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */) } /** * Extract first occurrence of a character class in each string * * @param str character vector * @param pattern character vector * @return 
character vector * * @version 0.1-?? (Marek Gagolewski, 2013-06-08) */ SEXP stri_extract_first_charclass(SEXP str, SEXP pattern) { return stri__extract_firstlast_charclass(str, pattern, true); } /** * Extract last occurrence of a character class in each string * * @param str character vector * @param pattern character vector * @return character vector * * @version 0.1-?? (Marek Gagolewski, 2013-06-08) */ SEXP stri_extract_last_charclass(SEXP str, SEXP pattern) { return stri__extract_firstlast_charclass(str, pattern, false); } /** * Extract all occurrences of a character class in each string * * @param str character vector * @param pattern character vector * @param simplify single logical value * * @return list of character vectors or character matrix * * @version 0.1-?? (Marek Gagolewski, 2013-06-08) * * @version 0.1-?? (Marek Gagolewski, 2013-06-15) * Use StrContainerCharClass * * @version 0.1-?? (Marek Gagolewski, 2013-06-16) * make StriException-friendly * * @version 0.2-1 (Marek Gagolewski, 2014-04-03) * detects invalid UTF-8 byte stream * * @version 0.2-1 (Marek Gagolewski, 2014-04-05) * StriContainerCharClass now relies on UnicodeSet * * @version 0.3-1 (Marek Gagolewski, 2014-10-24) * added simplify param * * @version 0.3-1 (Marek Gagolewski, 2014-11-02) * using StriContainerCharClass::locateAll; * no longer vectorized over merge * * @version 0.3-1 (Marek Gagolewski, 2014-11-04) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc * * @version 0.4-1 (Marek Gagolewski, 2014-11-27) * FR #117: omit_no_match arg added * * @version 0.4-1 (Marek Gagolewski, 2014-12-04) * allow `simplify=NA` */ SEXP stri_extract_all_charclass(SEXP str, SEXP pattern, SEXP merge, SEXP simplify, SEXP omit_no_match) { bool merge_cur = stri__prepare_arg_logical_1_notNA(merge, "merge"); bool omit_no_match1 = stri__prepare_arg_logical_1_notNA(omit_no_match, "omit_no_match"); PROTECT(simplify = stri_prepare_arg_logical_1(simplify, "simplify")); PROTECT(str = 
stri_prepare_arg_string(str, "str")); PROTECT(pattern = stri_prepare_arg_string(pattern, "pattern")); R_len_t vectorize_length = stri__recycling_rule(true, 2, LENGTH(str), LENGTH(pattern)); STRI__ERROR_HANDLER_BEGIN(3) StriContainerUTF8 str_cont(str, vectorize_length); StriContainerCharClass pattern_cont(pattern, vectorize_length); SEXP ret; STRI__PROTECT(ret = Rf_allocVector(VECSXP, vectorize_length)); for (R_len_t i = pattern_cont.vectorize_init(); i != pattern_cont.vectorize_end(); i = pattern_cont.vectorize_next(i)) { if (pattern_cont.isNA(i) || str_cont.isNA(i)) { SET_VECTOR_ELT(ret, i, stri__vector_NA_strings(1)); continue; } R_len_t str_cur_n = str_cont.get(i).length(); const char* str_cur_s = str_cont.get(i).c_str(); deque< pair > occurrences; StriContainerCharClass::locateAll( occurrences, &pattern_cont.get(i), str_cur_s, str_cur_n, merge_cur, false /* byte-based indices */ ); R_len_t noccurrences = (R_len_t)occurrences.size(); if (noccurrences == 0) { SET_VECTOR_ELT(ret, i, stri__vector_NA_strings(omit_no_match1?0:1)); continue; } SEXP cur_res; STRI__PROTECT(cur_res = Rf_allocVector(STRSXP, noccurrences)); deque< pair >::iterator iter = occurrences.begin(); for (R_len_t f = 0; iter != occurrences.end(); ++iter, ++f) { pair curo = *iter; SET_STRING_ELT(cur_res, f, Rf_mkCharLenCE(str_cur_s+curo.first, curo.second-curo.first, CE_UTF8)); } SET_VECTOR_ELT(ret, i, cur_res); STRI__UNPROTECT(1) } if (LOGICAL(simplify)[0] == NA_LOGICAL) { STRI__PROTECT(ret = stri_list2matrix(ret, Rf_ScalarLogical(TRUE), stri__vector_NA_strings(1), Rf_ScalarInteger(0))) } else if (LOGICAL(simplify)[0]) { STRI__PROTECT(ret = stri_list2matrix(ret, Rf_ScalarLogical(TRUE), stri__vector_empty_strings(1), Rf_ScalarInteger(0))) } STRI__UNPROTECT_ALL return ret; STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */) } stringi/src/stri_search_fixed_startsendswith.cpp0000644000175100001440000001467412612166246022117 0ustar hornikusers/* This file is part of the 'stringi' package 
for R. * Copyright (C) 2013-2015, Marek Gagolewski and Bartek Tartanus * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "stri_stringi.h" #include "stri_container_utf8_indexable.h" #include "stri_container_bytesearch.h" #include "stri_container_integer.h" /** * Detect if a string starts with a pattern match * * @param str character vector * @param pattern character vector * @param from integer vector * @return logical vector * * @version 0.3-1 (Marek Gagolewski, 2014-06-03) * * @version 0.3-1 (Marek Gagolewski, 2014-11-05) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc * * @version 0.4-1 (Marek Gagolewski, 2014-12-07) * FR #110, #23: opts_fixed arg added; * use StriContainerByteSearch::startsWith() and endsWith() * * @version 0.5-1 (Marek Gagolewski, 2015-02-14) * use String8::startsWith() and endsWith() */ SEXP stri_startswith_fixed(SEXP str, SEXP pattern, SEXP from, SEXP opts_fixed) { uint32_t pattern_flags = StriContainerByteSearch::getByteSearchFlags(opts_fixed); PROTECT(str = stri_prepare_arg_string(str, "str")); PROTECT(pattern = stri_prepare_arg_string(pattern, "pattern")); PROTECT(from = stri_prepare_arg_integer(from, "from")); STRI__ERROR_HANDLER_BEGIN(3) int vectorize_length = stri__recycling_rule(true, 3, LENGTH(str), LENGTH(pattern), LENGTH(from)); StriContainerUTF8_indexable str_cont(str, vectorize_length); StriContainerByteSearch pattern_cont(pattern, vectorize_length, pattern_flags); StriContainerInteger from_cont(from, vectorize_length); SEXP ret; STRI__PROTECT(ret = Rf_allocVector(LGLSXP, vectorize_length)); int* ret_tab = LOGICAL(ret); for (R_len_t i = pattern_cont.vectorize_init(); i != pattern_cont.vectorize_end(); i = pattern_cont.vectorize_next(i)) { STRI__CONTINUE_ON_EMPTY_OR_NA_STR_PATTERN(str_cont, pattern_cont, ret_tab[i] = NA_LOGICAL, ret_tab[i] = FALSE) if (from_cont.isNA(i)) { ret_tab[i] = NA_LOGICAL; continue; } R_len_t from_cur = from_cont.get(i); if (from_cur == 1) from_cur = 0; /* most commonly used case */ else if (from_cur >= 0) from_cur = str_cont.UChar32_to_UTF8_index_fwd(i, from_cur-1); else from_cur = 
str_cont.UChar32_to_UTF8_index_back(i, -from_cur); // now surely from_cur >= 0 && from_cur <= cur_n ret_tab[i] = (int)(str_cont.get(i).startsWith(from_cur, pattern_cont.get(i).c_str(), pattern_cont.get(i).length(), pattern_cont.isCaseInsensitive())); } STRI__UNPROTECT_ALL return ret; STRI__ERROR_HANDLER_END( ;/* do nothing special on error */ ) } /** * Detect if a string ends with a pattern match * * @param str character vector * @param pattern character vector * @param to integer vector * @return logical vector * * @version 0.3-1 (Marek Gagolewski, 2014-06-03) * * @version 0.3-1 (Marek Gagolewski, 2014-11-05) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc * * @version 0.4-1 (Marek Gagolewski, 2014-12-07) * FR #110, #23: opts_fixed arg added * * @version 0.5-1 (Marek Gagolewski, 2015-02-14) * use String8::startsWith() and endsWith() */ SEXP stri_endswith_fixed(SEXP str, SEXP pattern, SEXP to, SEXP opts_fixed) { uint32_t pattern_flags = StriContainerByteSearch::getByteSearchFlags(opts_fixed); PROTECT(str = stri_prepare_arg_string(str, "str")); PROTECT(pattern = stri_prepare_arg_string(pattern, "pattern")); PROTECT(to = stri_prepare_arg_integer(to, "to")); STRI__ERROR_HANDLER_BEGIN(3) int vectorize_length = stri__recycling_rule(true, 3, LENGTH(str), LENGTH(pattern), LENGTH(to)); StriContainerUTF8_indexable str_cont(str, vectorize_length); StriContainerByteSearch pattern_cont(pattern, vectorize_length, pattern_flags); StriContainerInteger to_cont(to, vectorize_length); SEXP ret; STRI__PROTECT(ret = Rf_allocVector(LGLSXP, vectorize_length)); int* ret_tab = LOGICAL(ret); for (R_len_t i = pattern_cont.vectorize_init(); i != pattern_cont.vectorize_end(); i = pattern_cont.vectorize_next(i)) { STRI__CONTINUE_ON_EMPTY_OR_NA_STR_PATTERN(str_cont, pattern_cont, ret_tab[i] = NA_LOGICAL, ret_tab[i] = FALSE) if (to_cont.isNA(i)) { ret_tab[i] = NA_LOGICAL; continue; } R_len_t to_cur = to_cont.get(i); if (to_cur == -1) to_cur = str_cont.get(i).length(); /* most 
commonly used case */ else if (to_cur >= 0) to_cur = str_cont.UChar32_to_UTF8_index_fwd(i, to_cur); else to_cur = str_cont.UChar32_to_UTF8_index_back(i, -to_cur-1); // now surely to_cur >= 0 && to_cur <= cur_n ret_tab[i] = (int)(str_cont.get(i).endsWith(to_cur, pattern_cont.get(i).c_str(), pattern_cont.get(i).length(), pattern_cont.isCaseInsensitive())); } STRI__UNPROTECT_ALL return ret; STRI__ERROR_HANDLER_END( ;/* do nothing special on error */ ) } stringi/src/stri_search_other_split.cpp0000644000175100001440000002175212612166246020201 0ustar hornikusers/* This file is part of the 'stringi' package for R. * Copyright (C) 2013-2015, Marek Gagolewski and Bartek Tartanus * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "stri_stringi.h" #include "stri_container_utf8.h" #include "stri_container_utf16.h" #include "stri_container_usearch.h" #include "stri_container_bytesearch.h" #include "stri_container_integer.h" #include "stri_container_logical.h" #include #include #include #include using namespace std; /** * Split a single string into text lines * * @param str character vector * * @return character vector * * @version 0.1-?? (Marek Gagolewski, 2013-08-04) * * @version 0.3-1 (Marek Gagolewski, 2014-11-05) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc */ SEXP stri_split_lines1(SEXP str) { PROTECT(str = stri_prepare_arg_string_1(str, "str")); R_len_t vectorize_length = LENGTH(str); STRI__ERROR_HANDLER_BEGIN(1) StriContainerUTF8 str_cont(str, vectorize_length); if (str_cont.isNA(0)) { STRI__UNPROTECT_ALL return str; } const char* str_cur_s = str_cont.get(0).c_str(); R_len_t str_cur_n = str_cont.get(0).length(); UChar32 c; R_len_t jlast; deque< pair > occurrences; occurrences.push_back(pair(0, 0)); for (R_len_t j=0; j < str_cur_n; /* null */) { jlast = j; U8_NEXT(str_cur_s, j, str_cur_n, c); switch (c) { case ASCII_CR: /* CR */ /* check if next is LF */ if (str_cur_s[j] == ASCII_LF) { // look ahead one byte j++; // just one byte } break; case ASCII_LF: /* LF */ break; case UCHAR_NEL: /* NEL */ break; case ASCII_VT: /* VT */ break; case ASCII_FF: /* FF */ break; case UCHAR_LS: /* LS */ break; case UCHAR_PS: /* PS */ break; default: /* 
not a newline character */ occurrences.back().second = j; continue; } occurrences.back().second = jlast; if (j < str_cur_n) occurrences.push_back(pair(j, j)); } SEXP ans; STRI__PROTECT(ans = Rf_allocVector(STRSXP, (R_len_t)occurrences.size())); deque< pair >::iterator iter = occurrences.begin(); for (R_len_t k = 0; iter != occurrences.end(); ++iter, ++k) { pair curoccur = *iter; SET_STRING_ELT(ans, k, Rf_mkCharLenCE(str_cur_s+curoccur.first, curoccur.second-curoccur.first, CE_UTF8)); } STRI__UNPROTECT_ALL return ans; STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */) } /** * Split a string into text lines * * Every non-NA element of `str` is scanned one code point at a time and cut * at each CR, LF, CR immediately followed by LF (one break), NEL, VT, FF, * LS or PS; the break characters themselves are not part of the output. * * @param str character vector * @param omit_empty logical vector; when nonzero for an element, * zero-length lines of that element are left out of the result * * @return list of character vectors, one per element after recycling * `str` against `omit_empty`; an NA element of `str` gives * stri__vector_NA_strings(1) * * @version 0.1-?? (Marek Gagolewski, 2013-08-04) * * @version 0.3-1 (Marek Gagolewski, 2014-10-30) * removed `n_max` arg, as it doesn't make sense * * @version 0.3-1 (Marek Gagolewski, 2014-11-05) * Issue #112: stri_prepare_arg* retvals were not PROTECTed from gc */ SEXP stri_split_lines(SEXP str, SEXP omit_empty) { PROTECT(str = stri_prepare_arg_string(str, "str")); // n_max = stri_prepare_arg_integer(n_max, "n_max"); PROTECT(omit_empty = stri_prepare_arg_logical(omit_empty, "omit_empty")); R_len_t vectorize_length = stri__recycling_rule(true, 2, LENGTH(str), /*LENGTH(n_max), */LENGTH(omit_empty)); STRI__ERROR_HANDLER_BEGIN(2) StriContainerUTF8 str_cont(str, vectorize_length); // StriContainerInteger n_max_cont(n_max, vectorize_length); StriContainerLogical omit_empty_cont(omit_empty, vectorize_length); SEXP ret; STRI__PROTECT(ret = Rf_allocVector(VECSXP, vectorize_length)); for (R_len_t i = str_cont.vectorize_init(); i != str_cont.vectorize_end(); i = str_cont.vectorize_next(i)) { /* an NA input element produces stri__vector_NA_strings(1) and is not scanned */ if (str_cont.isNA(i)) { SET_VECTOR_ELT(ret, i, stri__vector_NA_strings(1)); continue; } const char* str_cur_s = str_cont.get(i).c_str(); R_len_t str_cur_n = str_cont.get(i).length(); // int n_max_cur = n_max_cont.get(i); int omit_empty_cur = omit_empty_cont.get(i); 
// if (n_max_cur < 0) // n_max_cur = INT_MAX; // else if (n_max_cur == 0) { // SET_VECTOR_ELT(ret, i, Rf_allocVector(STRSXP, 0)); // continue; // } //#define STRI_INDEX_NEWLINE_CR 0 //#define STRI_INDEX_NEWLINE_LF 1 //#define STRI_INDEX_NEWLINE_CRLF 2 //#define STRI_INDEX_NEWLINE_NEL 3 //#define STRI_INDEX_NEWLINE_VT 4 //#define STRI_INDEX_NEWLINE_FF 5 //#define STRI_INDEX_NEWLINE_LS 6 //#define STRI_INDEX_NEWLINE_PS 7 //#define STRI_INDEX_NEWLINE_LAST 8 // int counts[STRI_INDEX_NEWLINE_LAST]; // for (R_len_t j=0; j > occurrences; /* NOTE(review): c, jlast and k are used below, but their declarations and the element type of this deque are not visible in this copy of the file -- confirm against upstream */ /* each entry is a [first, second) byte range of one line; begin with an empty field at offset 0 */ occurrences.push_back(pair(0, 0)); for (R_len_t j=0; j < str_cur_n /*&& k < n_max_cur*/; /* null */) { /* remember where this code point starts: a line ends just before its break character */ jlast = j; U8_NEXT(str_cur_s, j, str_cur_n, c); switch (c) { case ASCII_CR: /* CR */ // counts[STRI_INDEX_NEWLINE_CR]++; /* check if next is LF */ /* NOTE(review): if CR is the last byte then j == str_cur_n here; this read presumably relies on c_str() being NUL-terminated -- confirm */ if (str_cur_s[j] == ASCII_LF) { // look ahead one byte // counts[STRI_INDEX_NEWLINE_LF]++; // counts[STRI_INDEX_NEWLINE_CRLF]++; j++; // just one byte } break; case ASCII_LF: /* LF */ // counts[STRI_INDEX_NEWLINE_LF]++; break; case UCHAR_NEL: /* NEL */ // counts[STRI_INDEX_NEWLINE_NEL]++; break; case ASCII_VT: /* VT */ // counts[STRI_INDEX_NEWLINE_VT]++; break; case ASCII_FF: /* FF */ // counts[STRI_INDEX_NEWLINE_FF]++; break; case UCHAR_LS: /* LS */ // counts[STRI_INDEX_NEWLINE_LS]++; break; case UCHAR_PS: /* PS */ // counts[STRI_INDEX_NEWLINE_PS]++; break; default: /* not a newline character */ occurrences.back().second = j; continue; } // if here, then at newline /* when omitting empty lines, an empty current field is simply moved past the break instead of being closed */ if (omit_empty_cur && occurrences.back().second == occurrences.back().first) occurrences.back().first = occurrences.back().second = j; // don't start any new field else { occurrences.back().second = jlast; occurrences.push_back(pair(j, j)); ++k; // another field /* k is only consulted by the commented-out n_max logic */ } } // if (k == n_max_cur) // occurrences.back().second = str_cur_n; /* the last field is empty when the input was empty or ended with a line break; drop it if empty lines are omitted */ if (omit_empty_cur && occurrences.back().first == occurrences.back().second) occurrences.pop_back(); SEXP ans; STRI__PROTECT(ans = Rf_allocVector(STRSXP, (R_len_t)occurrences.size())); deque< pair 
>::iterator iter = occurrences.begin(); /* copy every recorded byte range into ans as a UTF-8 string */ for (R_len_t l = 0; iter != occurrences.end(); ++iter, ++l) { pair curoccur = *iter; SET_STRING_ELT(ans, l, Rf_mkCharLenCE(str_cur_s+curoccur.first, curoccur.second-curoccur.first, CE_UTF8)); } /* store the lines of element i in the result list, then release the protection taken for ans above */ SET_VECTOR_ELT(ret, i, ans); STRI__UNPROTECT(1); } STRI__UNPROTECT_ALL return ret; STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */) } stringi/src/stri_stats.cpp0000644000175100001440000002432312612166246015453 0ustar hornikusers/* This file is part of the 'stringi' package for R. * Copyright (C) 2013-2015, Marek Gagolewski and Bartek Tartanus * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "stri_stringi.h" #include "stri_container_utf8.h" /** * General statistics for a character vector * * @param str a character vector * @return integer vector, see R man for details * * @version 0.1-?? (Marek Gagolewski) * * @version 0.1-?? (Marek Gagolewski, 2013-06-09) * Use StriContainerUTF8 * * @version 0.1-?? (Marek Gagolewski, 2013-06-16) * make StriException-friendly * * @version 0.2-1 (Marek Gagolewski, 2014-04-01) * detect invalid UTF-8 byte streams * * @version 0.3-1 (Marek Gagolewski, 2014-11-04) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc */ SEXP stri_stats_general(SEXP str) { PROTECT(str = stri_prepare_arg_string(str, "str")); R_len_t str_length = LENGTH(str); STRI__ERROR_HANDLER_BEGIN(1) StriContainerUTF8 str_cont(str, str_length); enum { gsNumLines = 0, gsNumLinesNonEmpty = 1, gsNumChars = 2, gsNumCharsNonWhite = 3, gsAll = 4 // always == number of elements }; SEXP ret; STRI__PROTECT(ret = Rf_allocVector(INTSXP, gsAll)); int* stats = INTEGER(ret); for (int i=0; i