stringi/NAMESPACE

# Generated by roxygen2: do not edit by hand
export("%s!=%")
export("%s!==%")
export("%s$%")
export("%s*%")
export("%s+%")
export("%s<%")
export("%s<=%")
export("%s==%")
export("%s===%")
export("%s>%")
export("%s>=%")
export("%stri!=%")
export("%stri!==%")
export("%stri$%")
export("%stri*%")
export("%stri+%")
export("%stri<%")
export("%stri<=%")
export("%stri==%")
export("%stri===%")
export("%stri>%")
export("%stri>=%")
export("stri_datetime_add<-")
export("stri_sub<-")
export("stri_sub_all<-")
export("stri_subset<-")
export("stri_subset_charclass<-")
export("stri_subset_coll<-")
export("stri_subset_fixed<-")
export("stri_subset_regex<-")
export(stri_c)
export(stri_c_list)
export(stri_cmp)
export(stri_cmp_eq)
export(stri_cmp_equiv)
export(stri_cmp_ge)
export(stri_cmp_gt)
export(stri_cmp_le)
export(stri_cmp_lt)
export(stri_cmp_neq)
export(stri_cmp_nequiv)
export(stri_coll)
export(stri_compare)
export(stri_conv)
export(stri_count)
export(stri_count_boundaries)
export(stri_count_charclass)
export(stri_count_coll)
export(stri_count_fixed)
export(stri_count_regex)
export(stri_count_words)
export(stri_datetime_add)
export(stri_datetime_create)
export(stri_datetime_fields)
export(stri_datetime_format)
export(stri_datetime_fstr)
export(stri_datetime_now)
export(stri_datetime_parse)
export(stri_datetime_symbols)
export(stri_detect)
export(stri_detect_charclass)
export(stri_detect_coll)
export(stri_detect_fixed)
export(stri_detect_regex)
export(stri_dup)
export(stri_duplicated)
export(stri_duplicated_any)
export(stri_enc_detect)
export(stri_enc_detect2)
export(stri_enc_fromutf32)
export(stri_enc_get)
export(stri_enc_info)
export(stri_enc_isascii)
export(stri_enc_isutf16be)
export(stri_enc_isutf16le)
export(stri_enc_isutf32be)
export(stri_enc_isutf32le)
export(stri_enc_isutf8)
export(stri_enc_list)
export(stri_enc_mark)
export(stri_enc_set)
export(stri_enc_toascii)
export(stri_enc_tonative)
export(stri_enc_toutf32)
export(stri_enc_toutf8)
export(stri_encode)
export(stri_endswith)
export(stri_endswith_charclass)
export(stri_endswith_coll)
export(stri_endswith_fixed)
export(stri_escape_unicode)
export(stri_extract)
export(stri_extract_all)
export(stri_extract_all_boundaries)
export(stri_extract_all_charclass)
export(stri_extract_all_coll)
export(stri_extract_all_fixed)
export(stri_extract_all_regex)
export(stri_extract_all_words)
export(stri_extract_first)
export(stri_extract_first_boundaries)
export(stri_extract_first_charclass)
export(stri_extract_first_coll)
export(stri_extract_first_fixed)
export(stri_extract_first_regex)
export(stri_extract_first_words)
export(stri_extract_last)
export(stri_extract_last_boundaries)
export(stri_extract_last_charclass)
export(stri_extract_last_coll)
export(stri_extract_last_fixed)
export(stri_extract_last_regex)
export(stri_extract_last_words)
export(stri_flatten)
export(stri_info)
export(stri_isempty)
export(stri_join)
export(stri_join_list)
export(stri_length)
export(stri_list2matrix)
export(stri_locale_get)
export(stri_locale_info)
export(stri_locale_list)
export(stri_locale_set)
export(stri_locate)
export(stri_locate_all)
export(stri_locate_all_boundaries)
export(stri_locate_all_charclass)
export(stri_locate_all_coll)
export(stri_locate_all_fixed)
export(stri_locate_all_regex)
export(stri_locate_all_words)
export(stri_locate_first)
export(stri_locate_first_boundaries)
export(stri_locate_first_charclass)
export(stri_locate_first_coll)
export(stri_locate_first_fixed)
export(stri_locate_first_regex)
export(stri_locate_first_words)
export(stri_locate_last)
export(stri_locate_last_boundaries)
export(stri_locate_last_charclass)
export(stri_locate_last_coll)
export(stri_locate_last_fixed)
export(stri_locate_last_regex)
export(stri_locate_last_words)
export(stri_match)
export(stri_match_all)
export(stri_match_all_regex)
export(stri_match_first)
export(stri_match_first_regex)
export(stri_match_last)
export(stri_match_last_regex)
export(stri_na2empty)
export(stri_numbytes)
export(stri_omit_empty)
export(stri_omit_empty_na)
export(stri_omit_na)
export(stri_opts_brkiter)
export(stri_opts_collator)
export(stri_opts_fixed)
export(stri_opts_regex)
export(stri_order)
export(stri_pad)
export(stri_pad_both)
export(stri_pad_left)
export(stri_pad_right)
export(stri_paste)
export(stri_paste_list)
export(stri_printf)
export(stri_rand_lipsum)
export(stri_rand_shuffle)
export(stri_rand_strings)
export(stri_rank)
export(stri_read_lines)
export(stri_read_raw)
export(stri_remove_empty)
export(stri_remove_empty_na)
export(stri_remove_na)
export(stri_replace)
export(stri_replace_all)
export(stri_replace_all_charclass)
export(stri_replace_all_coll)
export(stri_replace_all_fixed)
export(stri_replace_all_regex)
export(stri_replace_first)
export(stri_replace_first_charclass)
export(stri_replace_first_coll)
export(stri_replace_first_fixed)
export(stri_replace_first_regex)
export(stri_replace_last)
export(stri_replace_last_charclass)
export(stri_replace_last_coll)
export(stri_replace_last_fixed)
export(stri_replace_last_regex)
export(stri_replace_na)
export(stri_replace_rstr)
export(stri_reverse)
export(stri_sort)
export(stri_sort_key)
export(stri_split)
export(stri_split_boundaries)
export(stri_split_charclass)
export(stri_split_coll)
export(stri_split_fixed)
export(stri_split_lines)
export(stri_split_lines1)
export(stri_split_regex)
export(stri_sprintf)
export(stri_startswith)
export(stri_startswith_charclass)
export(stri_startswith_coll)
export(stri_startswith_fixed)
export(stri_stats_general)
export(stri_stats_latex)
export(stri_string_format)
export(stri_sub)
export(stri_sub_all)
export(stri_sub_all_replace)
export(stri_sub_replace)
export(stri_sub_replace_all)
export(stri_subset)
export(stri_subset_charclass)
export(stri_subset_coll)
export(stri_subset_fixed)
export(stri_subset_regex)
export(stri_timezone_get)
export(stri_timezone_info)
export(stri_timezone_list)
export(stri_timezone_set)
export(stri_trans_casefold)
export(stri_trans_char)
export(stri_trans_general)
export(stri_trans_isnfc)
export(stri_trans_isnfd)
export(stri_trans_isnfkc)
export(stri_trans_isnfkc_casefold)
export(stri_trans_isnfkd)
export(stri_trans_list)
export(stri_trans_nfc)
export(stri_trans_nfd)
export(stri_trans_nfkc)
export(stri_trans_nfkc_casefold)
export(stri_trans_nfkd)
export(stri_trans_tolower)
export(stri_trans_totitle)
export(stri_trans_toupper)
export(stri_trim)
export(stri_trim_both)
export(stri_trim_left)
export(stri_trim_right)
export(stri_unescape_unicode)
export(stri_unique)
export(stri_width)
export(stri_wrap)
export(stri_write_lines)
importFrom(stats,rnorm)
importFrom(stats,runif)
importFrom(tools,md5sum)
importFrom(utils,download.file)
importFrom(utils,packageVersion)
importFrom(utils,unzip)
useDynLib(stringi, .registration = TRUE)
stringi/LICENSE

# LICENSE
*******************************************************************************
# R and C++ source code of the stringi package
```
Copyright (c) 2013-2023, Marek Gagolewski
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
3. Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING,
BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
```
# LaTeX Word Count Algorithm
`stri_stats_latex()` in `src/stri_stats.cpp` uses a modified
Kile 2.1.3 LaTeX Word Count algorithm
(source file: `Kile/src/documentinfo.cpp`,
method: `void Info::count(const QString& line, long *stat)`),
see https://kile.sourceforge.io/.
Copyright (C) 2013-2019 by the Kile Team (Holger Danielsson, Michel Ludwig,
Jeroen Wijnhout, and others).
```
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
```
# The ICU4C Library
The files in `src/icu*/` are distributed under the following license.
```
UNICODE LICENSE V3
COPYRIGHT AND PERMISSION NOTICE
Copyright © 2016-2023 Unicode, Inc.
NOTICE TO USER: Carefully read the following legal agreement. BY
DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING DATA FILES, AND/OR
SOFTWARE, YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE
TERMS AND CONDITIONS OF THIS AGREEMENT. IF YOU DO NOT AGREE, DO NOT
DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE THE DATA FILES OR SOFTWARE.
Permission is hereby granted, free of charge, to any person obtaining a
copy of data files and any associated documentation (the "Data Files") or
software and any associated documentation (the "Software") to deal in the
Data Files or Software without restriction, including without limitation
the rights to use, copy, modify, merge, publish, distribute, and/or sell
copies of the Data Files or Software, and to permit persons to whom the
Data Files or Software are furnished to do so, provided that either (a)
this copyright and permission notice appear with all copies of the Data
Files or Software, or (b) this copyright and permission notice appear in
associated Documentation.
THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF
THIRD PARTY RIGHTS.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE
BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES,
OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THE DATA
FILES OR SOFTWARE.
Except as contained in this notice, the name of a copyright holder shall
not be used in advertising or otherwise to promote the sale, use or other
dealings in these Data Files or Software without prior written
authorization of the copyright holder.
----------------------------------------------------------------------
Third-Party Software Licenses
This section contains third-party software notices and/or additional
terms for licensed third-party software components included within ICU
libraries.
----------------------------------------------------------------------
ICU License - ICU 1.8.1 to ICU 57.1
COPYRIGHT AND PERMISSION NOTICE
Copyright (c) 1995-2016 International Business Machines Corporation and others
All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, and/or sell copies of the Software, and to permit persons
to whom the Software is furnished to do so, provided that the above
copyright notice(s) and this permission notice appear in all copies of
the Software and that both the above copyright notice(s) and this
permission notice appear in supporting documentation.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT
OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
HOLDERS INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY
SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER
RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF
CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
Except as contained in this notice, the name of a copyright holder
shall not be used in advertising or otherwise to promote the sale, use
or other dealings in this Software without prior written authorization
of the copyright holder.
All trademarks and registered trademarks mentioned herein are the
property of their respective owners.
----------------------------------------------------------------------
Chinese/Japanese Word Break Dictionary Data (cjdict.txt)
# The Google Chrome software developed by Google is licensed under
# the BSD license. Other software included in this distribution is
# provided under other licenses, as set forth below.
#
# The BSD License
# http://opensource.org/licenses/bsd-license.php
# Copyright (C) 2006-2008, Google Inc.
#
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
# Redistributions in binary form must reproduce the above
# copyright notice, this list of conditions and the following
# disclaimer in the documentation and/or other materials provided with
# the distribution.
# Neither the name of Google Inc. nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
# INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#
# The word list in cjdict.txt are generated by combining three word lists
# listed below with further processing for compound word breaking. The
# frequency is generated with an iterative training against Google web
# corpora.
#
# * Libtabe (Chinese)
# - https://sourceforge.net/project/?group_id=1519
# - Its license terms and conditions are shown below.
#
# * IPADIC (Japanese)
# - http://chasen.aist-nara.ac.jp/chasen/distribution.html
# - Its license terms and conditions are shown below.
#
# ---------COPYING.libtabe ---- BEGIN--------------------
#
# /*
# * Copyright (c) 1999 TaBE Project.
# * Copyright (c) 1999 Pai-Hsiang Hsiao.
# * All rights reserved.
# *
# * Redistribution and use in source and binary forms, with or without
# * modification, are permitted provided that the following conditions
# * are met:
# *
# * . Redistributions of source code must retain the above copyright
# * notice, this list of conditions and the following disclaimer.
# * . Redistributions in binary form must reproduce the above copyright
# * notice, this list of conditions and the following disclaimer in
# * the documentation and/or other materials provided with the
# * distribution.
# * . Neither the name of the TaBE Project nor the names of its
# * contributors may be used to endorse or promote products derived
# * from this software without specific prior written permission.
# *
# * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
# * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
# * REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
# * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
# * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
# * OF THE POSSIBILITY OF SUCH DAMAGE.
# */
#
# /*
# * Copyright (c) 1999 Computer Systems and Communication Lab,
# * Institute of Information Science, Academia
# * Sinica. All rights reserved.
# *
# * Redistribution and use in source and binary forms, with or without
# * modification, are permitted provided that the following conditions
# * are met:
# *
# * . Redistributions of source code must retain the above copyright
# * notice, this list of conditions and the following disclaimer.
# * . Redistributions in binary form must reproduce the above copyright
# * notice, this list of conditions and the following disclaimer in
# * the documentation and/or other materials provided with the
# * distribution.
# * . Neither the name of the Computer Systems and Communication Lab
# * nor the names of its contributors may be used to endorse or
# * promote products derived from this software without specific
# * prior written permission.
# *
# * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
# * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
# * REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
# * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
# * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
# * OF THE POSSIBILITY OF SUCH DAMAGE.
# */
#
# Copyright 1996 Chih-Hao Tsai @ Beckman Institute,
# University of Illinois
# c-tsai4@uiuc.edu http://casper.beckman.uiuc.edu/~c-tsai4
#
# ---------------COPYING.libtabe-----END--------------------------------
#
#
# ---------------COPYING.ipadic-----BEGIN-------------------------------
#
# Copyright 2000, 2001, 2002, 2003 Nara Institute of Science
# and Technology. All Rights Reserved.
#
# Use, reproduction, and distribution of this software is permitted.
# Any copy of this software, whether in its original form or modified,
# must include both the above copyright notice and the following
# paragraphs.
#
# Nara Institute of Science and Technology (NAIST),
# the copyright holders, disclaims all warranties with regard to this
# software, including all implied warranties of merchantability and
# fitness, in no event shall NAIST be liable for
# any special, indirect or consequential damages or any damages
# whatsoever resulting from loss of use, data or profits, whether in an
# action of contract, negligence or other tortuous action, arising out
# of or in connection with the use or performance of this software.
#
# A large portion of the dictionary entries
# originate from ICOT Free Software. The following conditions for ICOT
# Free Software applies to the current dictionary as well.
#
# Each User may also freely distribute the Program, whether in its
# original form or modified, to any third party or parties, PROVIDED
# that the provisions of Section 3 ("NO WARRANTY") will ALWAYS appear
# on, or be attached to, the Program, which is distributed substantially
# in the same form as set out herein and that such intended
# distribution, if actually made, will neither violate or otherwise
# contravene any of the laws and regulations of the countries having
# jurisdiction over the User or the intended distribution itself.
#
# NO WARRANTY
#
# The program was produced on an experimental basis in the course of the
# research and development conducted during the project and is provided
# to users as so produced on an experimental basis. Accordingly, the
# program is provided without any warranty whatsoever, whether express,
# implied, statutory or otherwise. The term "warranty" used herein
# includes, but is not limited to, any warranty of the quality,
# performance, merchantability and fitness for a particular purpose of
# the program and the nonexistence of any infringement or violation of
# any right of any third party.
#
# Each user of the program will agree and understand, and be deemed to
# have agreed and understood, that there is no warranty whatsoever for
# the program and, accordingly, the entire risk arising from or
# otherwise connected with the program is assumed by the user.
#
# Therefore, neither ICOT, the copyright holder, or any other
# organization that participated in or was otherwise related to the
# development of the program and their respective officials, directors,
# officers and other employees shall be held liable for any and all
# damages, including, without limitation, general, special, incidental
# and consequential damages, arising out of or otherwise in connection
# with the use or inability to use the program or any product, material
# or result produced or otherwise obtained by using the program,
# regardless of whether they have been advised of, or otherwise had
# knowledge of, the possibility of such damages at any time during the
# project or thereafter. Each user will be deemed to have agreed to the
# foregoing by his or her commencement of use of the program. The term
# "use" as used herein includes, but is not limited to, the use,
# modification, copying and distribution of the program and the
# production of secondary products from the program.
#
# In the case where the program, whether in its original form or
# modified, was distributed or delivered to or received by a user from
# any person, organization or entity other than ICOT, unless it makes or
# grants independently of ICOT any specific warranty to the user in
# writing, such person, organization or entity, will also be exempted
# from and not be held liable to the user for any such damages as noted
# above as far as the program is concerned.
#
# ---------------COPYING.ipadic-----END----------------------------------
----------------------------------------------------------------------
Lao Word Break Dictionary Data (laodict.txt)
# Copyright (C) 2016 and later: Unicode, Inc. and others.
# License & terms of use: http://www.unicode.org/copyright.html
# Copyright (c) 2015 International Business Machines Corporation
# and others. All Rights Reserved.
#
# Project: https://github.com/rober42539/lao-dictionary
# Dictionary: https://github.com/rober42539/lao-dictionary/laodict.txt
# License: https://github.com/rober42539/lao-dictionary/LICENSE.txt
# (copied below)
#
# This file is derived from the above dictionary version of Nov 22, 2020
# ----------------------------------------------------------------------
# Copyright (C) 2013 Brian Eugene Wilson, Robert Martin Campbell.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer. Redistributions in binary
# form must reproduce the above copyright notice, this list of conditions and
# the following disclaimer in the documentation and/or other materials
# provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
# COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
# INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
# OF THE POSSIBILITY OF SUCH DAMAGE.
# --------------------------------------------------------------------------
----------------------------------------------------------------------
Burmese Word Break Dictionary Data (burmesedict.txt)
# Copyright (c) 2014 International Business Machines Corporation
# and others. All Rights Reserved.
#
# This list is part of a project hosted at:
# github.com/kanyawtech/myanmar-karen-word-lists
#
# --------------------------------------------------------------------------
# Copyright (c) 2013, LeRoy Benjamin Sharon
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met: Redistributions of source code must retain the above
# copyright notice, this list of conditions and the following
# disclaimer. Redistributions in binary form must reproduce the
# above copyright notice, this list of conditions and the following
# disclaimer in the documentation and/or other materials provided
# with the distribution.
#
# Neither the name Myanmar Karen Word Lists, nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
# INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS
# BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
# TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
# ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
# TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF
# THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
# SUCH DAMAGE.
# --------------------------------------------------------------------------
----------------------------------------------------------------------
Time Zone Database
ICU uses the public domain data and code derived from Time Zone
Database for its time zone support. The ownership of the TZ database
is explained in BCP 175: Procedure for Maintaining the Time Zone
Database section 7.
# 7. Database Ownership
#
# The TZ database itself is not an IETF Contribution or an IETF
# document. Rather it is a pre-existing and regularly updated work
# that is in the public domain, and is intended to remain in the
# public domain. Therefore, BCPs 78 [RFC5378] and 79 [RFC3979] do
# not apply to the TZ Database or contributions that individuals make
# to it. Should any claims be made and substantiated against the TZ
# Database, the organization that is providing the IANA
# Considerations defined in this RFC, under the memorandum of
# understanding with the IETF, currently ICANN, may act in accordance
# with all competent court orders. No ownership claims will be made
# by ICANN or the IETF Trust on the database or the code. Any person
# making a contribution to the database or code waives all rights to
# future claims in that contribution or in the TZ Database.
----------------------------------------------------------------------
Google double-conversion
Copyright 2006-2011, the V8 project authors. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the following
disclaimer in the documentation and/or other materials provided
with the distribution.
* Neither the name of Google Inc. nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
```
stringi/tools/AC_CXX_NAMESPACES.m4

dnl @synopsis AC_CXX_NAMESPACES
dnl
dnl If the compiler can prevent names clashes using namespaces, define
dnl HAVE_NAMESPACES.
dnl
dnl @category Cxx
dnl @author Todd Veldhuizen
dnl @author Luc Maisonobe
dnl @license AllPermissive
dnl @version 2021-10-02
dnl Marek's 2021-10-02 update: replace obsolete AC_TRY_COMPILE, AC_LANG_CPLUSPLUS
AC_DEFUN([AC_CXX_NAMESPACES],
[AC_CACHE_CHECK(whether the compiler implements namespaces,
ac_cv_cxx_namespaces,
[AC_LANG_SAVE
AC_LANG([C++])
AC_LINK_IFELSE([AC_LANG_PROGRAM([[namespace Outer { namespace Inner { int i = 0; }}]],
[[using namespace Outer::Inner; return i;]])],
ac_cv_cxx_namespaces=yes, ac_cv_cxx_namespaces=no)
AC_LANG_RESTORE
])
if test "$ac_cv_cxx_namespaces" = yes; then
AC_DEFINE(HAVE_NAMESPACES,,[define if the compiler implements namespaces])
fi
])
stringi/tools/AC_CXX_HAVE_STL.m4

dnl
dnl AC_CXX_HAVE_STL
dnl
dnl Description
dnl
dnl If the compiler supports the Standard Template Library, define HAVE_STL.
dnl
dnl Version: 1.2.1 (last modified: 2021-10-02)
dnl Author: Luc Maisonobe
dnl
dnl from http://www.gnu.org/software/ac-archive/htmldoc/index.html
dnl
dnl License:
dnl GNU General Public License
dnl [http://www.gnu.org/software/ac-archive/htmldoc/COPYING.html]
dnl with this special exception
dnl [http://www.gnu.org/software/ac-archive/htmldoc/COPYING-Exception.html].
dnl Marek's 2021-10-02 update: replace obsolete AC_TRY_COMPILE, AC_LANG_CPLUSPLUS
AC_DEFUN([AC_CXX_HAVE_STL],
[AC_CACHE_CHECK(whether the compiler supports the Standard Template Library,
ac_cv_cxx_have_stl,
[AC_REQUIRE([AC_CXX_NAMESPACES])
AC_LANG_SAVE
AC_LANG([C++])
AC_LINK_IFELSE([AC_LANG_PROGRAM([[#include <list>
#include <deque>
#ifdef HAVE_NAMESPACES
using namespace std;
#endif]],[[list<int> x; x.push_back(5);
list<int>::iterator iter = x.begin(); if (iter != x.end()) ++iter; return 0;]])],
[ac_cv_cxx_have_stl=yes], [ac_cv_cxx_have_stl=no])
AC_LANG_RESTORE
])
if test "$ac_cv_cxx_have_stl" = yes; then
AC_DEFINE(HAVE_STL,,[define if the compiler supports Standard Template Library])
fi
])
stringi/INSTALL

# Installing *stringi*
In most cases, installing *stringi* is as simple as calling:
```r
install.packages("stringi")
```
However, due to the overwhelming complexity of the ICU4C library,
upon which *stringi* is based, and the diversity of environments
it operates on, you might still experience a few issues.
Hopefully, they can be resolved with the help of this short manual.
Below we also describe some available build process tweaks.
> To get the most out of stringi, you are strongly encouraged to rely
> on our custom ICU4C 74.1 bundle by calling:
>
> ```r
> install.packages("stringi", configure.args="--disable-pkg-config")
> ```
>
> This ensures maximum portability across all platforms
> (Windows and macOS users fetch the pre-compiled binaries
> from CRAN built precisely this way).
## ICU4C
The stringi package depends on the ICU4C >= 61 library.
ICU will be built together with stringi based on the customised
ICU4C 74.1 source bundle that is shipped with the package
if we install the package from sources and one of the following is true:
* this requirement is not met (check out <https://icu.unicode.org/download>,
the `libicu-devel` package on Fedora/CentOS/OpenSUSE,
`libicu-dev` on Ubuntu/Debian, etc.), or
* `pkg-config` fails to find appropriate build settings
for ICU-based projects, or
* `R CMD INSTALL` is called with the `--configure-args='--disable-pkg-config'`
argument, or the `STRINGI_DISABLE_PKG_CONFIG` environment variable
is set to a non-zero value, or
`install.packages("stringi", configure.args="--disable-pkg-config")`
is executed.
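Once the package is installed, one way to check which ICU flavour (bundled or
system) and version a given build of *stringi* actually uses is to inspect the
output of `stri_info()`; this is only an illustrative check and the exact set
of reported fields may vary between releases:

```r
library("stringi")
stri_info()  # reports, among others, the ICU and Unicode versions in use
```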
## Customising the Build Process
Additional features and options of the `./configure` script:
* `--disable-icu-bundle`: Enforce system ICU.
* `--disable-pkg-config`: Disable `pkg-config`;
ICU4C will be compiled from sources.
* `--with-extra-cxxflags=FLAGS`: Additional C++ compiler flags.
* `--with-extra-cppflags=FLAGS`: Additional C++ preprocessor flags.
* `--with-extra-ldflags=FLAGS`: Additional linker flags.
* `--with-extra-libs=FLAGS`: Additional libraries to link against.
Some environment variables:
* `PKG_CONFIG_PATH`: An optional list of directories to search for
`pkg-config`'s `.pc` files.
* `R_HOME`: Override the R directory, e.g.,
`/usr/lib64/R`. Note that `$R_HOME/bin/R` points to the R executable.
* `CAT`: The `cat` command used to generate the list of source files to compile.
* `PKG_CONFIG`: The `pkg-config` command used to fetch the necessary compiler
flags to link to the existing `libicu` installation.
* `STRINGI_DISABLE_PKG_CONFIG`: Compile ICU from sources;
see also `--disable-pkg-config`.
* `STRINGI_DISABLE_ICU_BUNDLE`: Enforce system ICU;
see also `--disable-icu-bundle`.
* `STRINGI_CXXFLAGS`: see `--with-extra-cxxflags`.
* `STRINGI_CPPFLAGS`: see `--with-extra-cppflags`.
* `STRINGI_LDFLAGS`: see `--with-extra-ldflags`.
* `STRINGI_LIBS`: see `--with-extra-libs`.
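For instance, the following calls illustrate how these options and variables
can be combined; the specific flags below are placeholders only and should be
adjusted to the target system:

```r
# force the bundled ICU4C sources and pass extra (placeholder) compiler flags:
install.packages(
    "stringi",
    configure.args="--disable-pkg-config --with-extra-cxxflags='-O2'"
)

# equivalently, request the same via environment variables:
Sys.setenv(STRINGI_DISABLE_PKG_CONFIG="1", STRINGI_CXXFLAGS="-O2")
install.packages("stringi", type="source")
```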
## Getting Help
If you do not manage to set up a successful build, do not
hesitate to [file a bug report](https://github.com/gagolews/stringi/issues).
However, please check the list of archived (closed) issues first --
it is quite likely that a solution to your problem has already been posted.
To help diagnose your error further, please run (from the terminal)
the following commands and submit the output from `./configure`
as well as the contents of `config.log`.
```bash
cd /tmp
wget https://github.com/gagolews/stringi/archive/master.zip
unzip master.zip
cd stringi-master
./configure
```
stringi/man/about_search_boundaries.Rd

% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/search.R
\name{about_search_boundaries}
\alias{about_search_boundaries}
\alias{search_boundaries}
\alias{stringi-search-boundaries}
\title{Text Boundary Analysis in \pkg{stringi}}
\description{
Text boundary analysis is the process of locating linguistic boundaries
while formatting and handling text.
}
\details{
Examples of the boundary analysis process include:
\itemize{
\item Locating positions to word-wrap text to fit
within specific margins while displaying or printing,
see \code{\link{stri_wrap}} and \code{\link{stri_split_boundaries}}.
\item Counting characters, words, sentences, or paragraphs,
see \code{\link{stri_count_boundaries}}.
\item Making a list of the unique words in a document,
see \code{\link{stri_extract_all_words}} and then \code{\link{stri_unique}}.
\item Capitalizing the first letter of each word
or sentence, see also \code{\link{stri_trans_totitle}}.
\item Locating a particular unit of the text (for example,
finding the third word in the document),
see \code{\link{stri_locate_all_boundaries}}.
}
Generally, text boundary analysis is a locale-dependent operation.
For example, in Japanese and Chinese one does not separate words with spaces
- a line break can occur even in the middle of a word.
These languages have punctuation and diacritical
marks that cannot start or end a line, so this must also be taken into account.
\pkg{stringi} uses \pkg{ICU}'s \code{BreakIterator} to locate specific
text boundaries. Note that the \code{BreakIterator}'s behavior
may be controlled in some cases, see \code{\link{stri_opts_brkiter}}.
\itemize{
\item The \code{character} boundary iterator tries to match what a user
would think of as a ``character'' -- a basic unit of a writing system
for a language -- which may be more than just a single Unicode code point.
\item The \code{word} boundary iterator locates the boundaries
of words, for purposes such as ``Find whole words'' operations.
\item The \code{line_break} iterator locates positions that would
be appropriate to wrap lines when displaying the text.
\item The break iterator of type \code{sentence}
locates sentence boundaries.
}
For technical details on different classes of text boundaries refer
to the \pkg{ICU} User Guide, see below.
}
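\examples{
# Added illustrative examples (not from the original manual); the exact
# results depend on the ICU version and the current default locale:
stri_count_boundaries('The quick brown fox jumped over the lazy dog.',
    type='word', skip_word_none=TRUE)
stri_split_boundaries('Mr. Jones is here. So am I.', type='sentence')
}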
\references{
\emph{Boundary Analysis} -- ICU User Guide,
\url{https://unicode-org.github.io/icu/userguide/boundaryanalysis/}
}
\seealso{
The official online manual of \pkg{stringi} at \url{https://stringi.gagolewski.com/}
Gagolewski M., \pkg{stringi}: Fast and portable character string processing in R, \emph{Journal of Statistical Software} 103(2), 2022, 1-59, \doi{10.18637/jss.v103.i02}
Other locale_sensitive:
\code{\link{\%s<\%}()},
\code{\link{about_locale}},
\code{\link{about_search_coll}},
\code{\link{stri_compare}()},
\code{\link{stri_count_boundaries}()},
\code{\link{stri_duplicated}()},
\code{\link{stri_enc_detect2}()},
\code{\link{stri_extract_all_boundaries}()},
\code{\link{stri_locate_all_boundaries}()},
\code{\link{stri_opts_collator}()},
\code{\link{stri_order}()},
\code{\link{stri_rank}()},
\code{\link{stri_sort_key}()},
\code{\link{stri_sort}()},
\code{\link{stri_split_boundaries}()},
\code{\link{stri_trans_tolower}()},
\code{\link{stri_unique}()},
\code{\link{stri_wrap}()}
Other text_boundaries:
\code{\link{about_search}},
\code{\link{stri_count_boundaries}()},
\code{\link{stri_extract_all_boundaries}()},
\code{\link{stri_locate_all_boundaries}()},
\code{\link{stri_opts_brkiter}()},
\code{\link{stri_split_boundaries}()},
\code{\link{stri_split_lines}()},
\code{\link{stri_trans_tolower}()},
\code{\link{stri_wrap}()}
Other stringi_general_topics:
\code{\link{about_arguments}},
\code{\link{about_encoding}},
\code{\link{about_locale}},
\code{\link{about_search_charclass}},
\code{\link{about_search_coll}},
\code{\link{about_search_fixed}},
\code{\link{about_search_regex}},
\code{\link{about_search}},
\code{\link{about_stringi}}
}
\concept{locale_sensitive}
\concept{stringi_general_topics}
\concept{text_boundaries}
\author{
\href{https://www.gagolewski.com/}{Marek Gagolewski} and other contributors
}
stringi/man/stri_locate_boundaries.Rd

% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/search_locate_bound.R
\name{stri_locate_all_boundaries}
\alias{stri_locate_all_boundaries}
\alias{stri_locate_last_boundaries}
\alias{stri_locate_first_boundaries}
\alias{stri_locate_all_words}
\alias{stri_locate_last_words}
\alias{stri_locate_first_words}
\title{Locate Text Boundaries}
\usage{
stri_locate_all_boundaries(
str,
omit_no_match = FALSE,
get_length = FALSE,
...,
opts_brkiter = NULL
)
stri_locate_last_boundaries(str, get_length = FALSE, ..., opts_brkiter = NULL)
stri_locate_first_boundaries(str, get_length = FALSE, ..., opts_brkiter = NULL)
stri_locate_all_words(
str,
omit_no_match = FALSE,
locale = NULL,
get_length = FALSE
)
stri_locate_last_words(str, locale = NULL, get_length = FALSE)
stri_locate_first_words(str, locale = NULL, get_length = FALSE)
}
\arguments{
\item{str}{character vector or an object coercible to one}
\item{omit_no_match}{single logical value; if \code{TRUE},
a no-match will be indicated by a matrix with 0 rows;
\code{stri_locate_all_*} only}
\item{get_length}{single logical value; if \code{FALSE} (default),
generate \emph{from-to} matrices; otherwise, output
\emph{from-length} ones}
\item{...}{additional settings for \code{opts_brkiter}}
\item{opts_brkiter}{named list with \pkg{ICU} BreakIterator's settings,
see \code{\link{stri_opts_brkiter}};
\code{NULL} for default break iterator, i.e., \code{line_break}}
\item{locale}{\code{NULL} or \code{''} for text boundary analysis following
the conventions of the default locale, or a single string with
locale identifier, see \link{stringi-locale}}
}
\value{
\code{stri_locate_all_*} yields a list of \code{length(str)}
integer matrices.
\code{stri_locate_first_*} and \code{stri_locate_last_*}
return an integer matrix.
See \code{\link{stri_locate}} for more details.
}
\description{
These functions locate text boundaries
(like character, word, line, or sentence boundaries).
Use \code{stri_locate_all_*} to locate all the matches.
\code{stri_locate_first_*} and \code{stri_locate_last_*}
give the first or the last matches, respectively.
}
\details{
Vectorized over \code{str}.
For more information on text boundary analysis
performed by \pkg{ICU}'s \code{BreakIterator}, see
\link{stringi-search-boundaries}.
For \code{stri_locate_*_words},
just like in \code{\link{stri_extract_all_words}} and \code{\link{stri_count_words}},
\pkg{ICU}'s word \code{BreakIterator} is used
to locate the word boundaries, and all non-word characters
(\code{UBRK_WORD_NONE} rule status) are ignored.
This function is equivalent to a call to
\code{stri_locate_*_boundaries(str, type='word', skip_word_none=TRUE, locale=locale)}
}
\examples{
test <- 'The\u00a0above-mentioned features are very useful. Spam, spam, eggs, bacon, and spam.'
stri_locate_all_words(test)
stri_locate_all_boundaries(
'Mr. Jones and Mrs. Brown are very happy. So am I, Prof. Smith.',
type='sentence',
locale='en_US@ss=standard' # ICU >= 56 only
)
}
\seealso{
The official online manual of \pkg{stringi} at \url{https://stringi.gagolewski.com/}
Gagolewski M., \pkg{stringi}: Fast and portable character string processing in R, \emph{Journal of Statistical Software} 103(2), 2022, 1-59, \doi{10.18637/jss.v103.i02}
Other search_locate:
\code{\link{about_search}},
\code{\link{stri_locate_all}()}
Other indexing:
\code{\link{stri_locate_all}()},
\code{\link{stri_sub_all}()},
\code{\link{stri_sub}()}
Other locale_sensitive:
\code{\link{\%s<\%}()},
\code{\link{about_locale}},
\code{\link{about_search_boundaries}},
\code{\link{about_search_coll}},
\code{\link{stri_compare}()},
\code{\link{stri_count_boundaries}()},
\code{\link{stri_duplicated}()},
\code{\link{stri_enc_detect2}()},
\code{\link{stri_extract_all_boundaries}()},
\code{\link{stri_opts_collator}()},
\code{\link{stri_order}()},
\code{\link{stri_rank}()},
\code{\link{stri_sort_key}()},
\code{\link{stri_sort}()},
\code{\link{stri_split_boundaries}()},
\code{\link{stri_trans_tolower}()},
\code{\link{stri_unique}()},
\code{\link{stri_wrap}()}
Other text_boundaries:
\code{\link{about_search_boundaries}},
\code{\link{about_search}},
\code{\link{stri_count_boundaries}()},
\code{\link{stri_extract_all_boundaries}()},
\code{\link{stri_opts_brkiter}()},
\code{\link{stri_split_boundaries}()},
\code{\link{stri_split_lines}()},
\code{\link{stri_trans_tolower}()},
\code{\link{stri_wrap}()}
}
\concept{indexing}
\concept{locale_sensitive}
\concept{search_locate}
\concept{text_boundaries}
\author{
\href{https://www.gagolewski.com/}{Marek Gagolewski} and other contributors
}
stringi/man/stri_rand_strings.Rd

% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/random.R
\name{stri_rand_strings}
\alias{stri_rand_strings}
\title{Generate Random Strings}
\usage{
stri_rand_strings(n, length, pattern = "[A-Za-z0-9]")
}
\arguments{
\item{n}{single integer, number of observations}
\item{length}{integer vector, desired string lengths}
\item{pattern}{character vector specifying character classes to draw
elements from, see \link{stringi-search-charclass}}
}
\value{
Returns a character vector.
}
\description{
Generates (pseudo)random strings of desired lengths.
}
\details{
Vectorized over \code{length} and \code{pattern}.
If length of \code{length} or \code{pattern} is greater than \code{n},
then redundant elements are ignored. Otherwise,
these vectors are recycled if necessary.
This operation may result in non-Unicode-normalized
strings and may give peculiar outputs for bidirectional strings.
Sampling of code points from the set specified by \code{pattern}
is always done with replacement and each code point appears with equal
probability.
}
\examples{
stri_rand_strings(5, 10) # 5 strings of length 10
stri_rand_strings(5, sample(1:10, 5, replace=TRUE)) # 5 strings of random lengths
stri_rand_strings(10, 5, '[\\\\p{script=latin}&\\\\p{Ll}]') # small letters from the Latin script
# generate n random passwords of length in [8, 14]
# consisting of at least one digit, small and big ASCII letter:
n <- 10
stri_rand_shuffle(stri_paste(
stri_rand_strings(n, 1, '[0-9]'),
stri_rand_strings(n, 1, '[a-z]'),
stri_rand_strings(n, 1, '[A-Z]'),
stri_rand_strings(n, sample(5:11, n, replace=TRUE), '[a-zA-Z0-9]')
))
}
\seealso{
The official online manual of \pkg{stringi} at \url{https://stringi.gagolewski.com/}
Gagolewski M., \pkg{stringi}: Fast and portable character string processing in R, \emph{Journal of Statistical Software} 103(2), 2022, 1-59, \doi{10.18637/jss.v103.i02}
Other random:
\code{\link{stri_rand_lipsum}()},
\code{\link{stri_rand_shuffle}()}
}
\concept{random}
\author{
\href{https://www.gagolewski.com/}{Marek Gagolewski} and other contributors
}
stringi/man/stri_join.Rd

% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/join.R
\name{stri_join}
\alias{stri_join}
\alias{stri_c}
\alias{stri_paste}
\title{Concatenate Character Vectors}
\usage{
stri_join(..., sep = "", collapse = NULL, ignore_null = FALSE)
stri_c(..., sep = "", collapse = NULL, ignore_null = FALSE)
stri_paste(..., sep = "", collapse = NULL, ignore_null = FALSE)
}
\arguments{
\item{...}{character vectors (or objects coercible to character vectors)
whose corresponding elements are to be concatenated}
\item{sep}{a single string; separates terms}
\item{collapse}{a single string or \code{NULL}; an optional
results separator}
\item{ignore_null}{a single logical value; if \code{TRUE}, then empty
vectors provided via \code{...} are silently ignored}
}
\value{
Returns a character vector.
}
\description{
These are the \pkg{stringi}'s equivalents of the built-in
\code{\link{paste}} function.
\code{stri_c} and \code{stri_paste} are aliases for \code{stri_join}.
}
\details{
Vectorized over each atomic vector in `\code{...}`.
Unless \code{collapse} is \code{NULL}, the result will be a single string.
Otherwise, you get a character vector of length equal
to the length of the longest argument.
If any of the arguments in `\code{...}` is a vector of length 0
(not to be confused with vectors of empty strings)
and \code{ignore_null} is \code{FALSE}, then
you will get a 0-length character vector in result.
If \code{collapse} or \code{sep} has length greater than 1,
then only the first string will be used.
If there are missing values in any of the input vectors,
then the corresponding element of the result is set to \code{NA}.
Note that this behavior is different from \code{\link{paste}},
which treats missing values as ordinary strings like \code{'NA'}.
Moreover, as usual in \pkg{stringi}, the resulting strings are
always in UTF-8.
}
\examples{
stri_join(1:13, letters)
stri_join(1:13, letters, sep=',')
stri_join(1:13, letters, collapse='; ')
stri_join(1:13, letters, sep=',', collapse='; ')
stri_join(c('abc', '123', 'xyz'),'###', 1:6, sep=',')
stri_join(c('abc', '123', 'xyz'),'###', 1:6, sep=',', collapse='; ')
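# added illustration: missing values propagate (unlike base R's paste):
stri_join(c('abc', NA, 'xyz'), '!')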
}
\seealso{
The official online manual of \pkg{stringi} at \url{https://stringi.gagolewski.com/}
Gagolewski M., \pkg{stringi}: Fast and portable character string processing in R, \emph{Journal of Statistical Software} 103(2), 2022, 1-59, \doi{10.18637/jss.v103.i02}
Other join:
\code{\link{\%s+\%}()},
\code{\link{stri_dup}()},
\code{\link{stri_flatten}()},
\code{\link{stri_join_list}()}
}
\concept{join}
\author{
\href{https://www.gagolewski.com/}{Marek Gagolewski} and other contributors
}
stringi/man/stri_enc_isascii.Rd

% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/encoding_detection.R
\name{stri_enc_isascii}
\alias{stri_enc_isascii}
\title{Check If a Data Stream Is Possibly in ASCII}
\usage{
stri_enc_isascii(str)
}
\arguments{
\item{str}{character vector, a raw vector, or
a list of \code{raw} vectors}
}
\value{
Returns a logical vector.
The i-th element indicates whether the i-th string
corresponds to a valid ASCII byte sequence.
}
\description{
The function checks whether all bytes in a string are <= 127.
}
\details{
This function is independent of the way \R marks encodings in
character strings (see \link{Encoding} and \link{stringi-encoding}).
}
\examples{
stri_enc_isascii(letters[1:3])
stri_enc_isascii('\u0105\u0104')
}
\seealso{
The official online manual of \pkg{stringi} at \url{https://stringi.gagolewski.com/}
Gagolewski M., \pkg{stringi}: Fast and portable character string processing in R, \emph{Journal of Statistical Software} 103(2), 2022, 1-59, \doi{10.18637/jss.v103.i02}
Other encoding_detection:
\code{\link{about_encoding}},
\code{\link{stri_enc_detect2}()},
\code{\link{stri_enc_detect}()},
\code{\link{stri_enc_isutf16be}()},
\code{\link{stri_enc_isutf8}()}
}
\concept{encoding_detection}
\author{
\href{https://www.gagolewski.com/}{Marek Gagolewski} and other contributors
}
stringi/man/about_arguments.Rd

% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/internal_prepare_arg.R
\name{about_arguments}
\alias{about_arguments}
\alias{arguments}
\alias{stringi-arguments}
\title{Passing Arguments to Functions in \pkg{stringi}}
\description{
Below we explain how \pkg{stringi} deals with its functions' arguments.
If some function violates one of the following rules
(for a very important reason),
this is clearly indicated in its documentation (with discussion).
}
\section{Coercion of Arguments}{
When a character vector argument is expected, factors and other vectors
coercible to character vectors are silently converted with
\code{\link{as.character}}, otherwise an error is generated.
Coercion from a list which does not consist of length-1 atomic vectors
issues a warning.
When a logical, numeric, or integer vector argument is expected,
factors are converted with \code{as.*(\link{as.character}(...))},
and other coercible vectors are converted with \code{as.*},
otherwise an error is generated.
}
\section{Vectorization}{
Almost all functions are vectorized with respect to all their arguments
and the recycling rule is applied whenever necessary.
Due to this property you may,
for instance, search for one pattern in each given string,
search for each pattern in one given string,
and search for the i-th pattern within the i-th string.
We of course took great care of performance issues:
e.g., in regular expression searching, regex matchers are reused
from iteration to iteration, as long as it is possible.
Functions with some non-vectorized arguments are rare:
e.g., regular expression matcher's settings are established
once per each call.
Some functions
assume that a vector with one element is given
as an argument (like \code{collapse} in \code{\link{stri_join}}).
In such cases, if an empty vector is given, you will get an error,
and for vectors with more than one element, a warning will be
generated (only the first element will be used).
You may find details on vectorization behavior in the man pages
on each particular function of your interest.
}
\section{Handling Missing Values (\code{NA}s)}{
\pkg{stringi} handles missing values consistently.
For any vectorized operation, if at least one vector element is missing,
then the corresponding resulting value is also set to \code{NA}.
}
\section{Preserving Object Attributes}{
Generally, all our functions drop input objects' attributes
(e.g., \code{\link{names}}, \code{\link{dim}}, etc.).
This is due to deep vectorization as well as for efficiency reasons.
If the preservation of attributes is needed,
important attributes can be manually copied. Alternatively, the notation
\code{x[] <- stri_...(x, ...)} can sometimes be used too.
}
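\examples{
# Added illustrative examples (not from the original manual):
# elementwise vectorization -- the i-th pattern is sought in the i-th string:
stri_detect_regex(c('spam', 'eggs', 'bacon'), c('^s', 'g+', 'n$'))
# missing values propagate to the corresponding elements of the result:
stri_paste(c('a', NA, 'c'), 1:3)
}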
\seealso{
The official online manual of \pkg{stringi} at \url{https://stringi.gagolewski.com/}
Gagolewski M., \pkg{stringi}: Fast and portable character string processing in R, \emph{Journal of Statistical Software} 103(2), 2022, 1-59, \doi{10.18637/jss.v103.i02}
Other stringi_general_topics:
\code{\link{about_encoding}},
\code{\link{about_locale}},
\code{\link{about_search_boundaries}},
\code{\link{about_search_charclass}},
\code{\link{about_search_coll}},
\code{\link{about_search_fixed}},
\code{\link{about_search_regex}},
\code{\link{about_search}},
\code{\link{about_stringi}}
}
\concept{prepare_arg}
\concept{stringi_general_topics}
\author{
\href{https://www.gagolewski.com/}{Marek Gagolewski} and other contributors
}
stringi/man/stri_unique.Rd

% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/sort.R
\name{stri_unique}
\alias{stri_unique}
\title{Extract Unique Elements}
\usage{
stri_unique(str, ..., opts_collator = NULL)
}
\arguments{
\item{str}{a character vector}
\item{...}{additional settings for \code{opts_collator}}
\item{opts_collator}{a named list with \pkg{ICU} Collator's options,
see \code{\link{stri_opts_collator}}, \code{NULL}
for default collation options}
}
\value{
Returns a character vector.
}
\description{
This function returns a character vector like \code{str},
but with duplicate elements removed.
}
\details{
As usual in \pkg{stringi}, no attributes are copied.
Unlike \code{\link{unique}}, this function
tests for canonical equivalence of strings (and not
whether the strings are just bytewise equal). Such an operation
is locale-dependent. Hence, \code{stri_unique} is significantly
slower (but much better suited for natural language processing)
than its base R counterpart.
See also \code{\link{stri_duplicated}} for indicating non-unique elements.
}
\examples{
# normalized and non-Unicode-normalized version of the same code point:
stri_unique(c('\u0105', stri_trans_nfkd('\u0105')))
unique(c('\u0105', stri_trans_nfkd('\u0105')))
stri_unique(c('gro\u00df', 'GROSS', 'Gro\u00df', 'Gross'), strength=1)
}
\references{
\emph{Collation} - ICU User Guide,
\url{https://unicode-org.github.io/icu/userguide/collation/}
}
\seealso{
The official online manual of \pkg{stringi} at \url{https://stringi.gagolewski.com/}
Gagolewski M., \pkg{stringi}: Fast and portable character string processing in R, \emph{Journal of Statistical Software} 103(2), 2022, 1-59, \doi{10.18637/jss.v103.i02}
Other locale_sensitive:
\code{\link{\%s<\%}()},
\code{\link{about_locale}},
\code{\link{about_search_boundaries}},
\code{\link{about_search_coll}},
\code{\link{stri_compare}()},
\code{\link{stri_count_boundaries}()},
\code{\link{stri_duplicated}()},
\code{\link{stri_enc_detect2}()},
\code{\link{stri_extract_all_boundaries}()},
\code{\link{stri_locate_all_boundaries}()},
\code{\link{stri_opts_collator}()},
\code{\link{stri_order}()},
\code{\link{stri_rank}()},
\code{\link{stri_sort_key}()},
\code{\link{stri_sort}()},
\code{\link{stri_split_boundaries}()},
\code{\link{stri_trans_tolower}()},
\code{\link{stri_wrap}()}
}
\concept{locale_sensitive}
\author{
\href{https://www.gagolewski.com/}{Marek Gagolewski} and other contributors
}
stringi/man/stri_remove_empty.Rd 0000644 0001762 0000144 00000003306 14262507664 016565 0 ustar ligges users % Generated by roxygen2: do not edit by hand
% Please edit documentation in R/utils.R
\name{stri_remove_empty}
\alias{stri_remove_empty}
\alias{stri_omit_empty}
\alias{stri_remove_empty_na}
\alias{stri_omit_empty_na}
\alias{stri_remove_na}
\alias{stri_omit_na}
\title{Remove All Empty Strings from a Character Vector}
\usage{
stri_remove_empty(x, na_empty = FALSE)
stri_omit_empty(x, na_empty = FALSE)
stri_remove_empty_na(x)
stri_omit_empty_na(x)
stri_remove_na(x)
stri_omit_na(x)
}
\arguments{
\item{x}{a character vector}
\item{na_empty}{should missing values be treated as empty strings?}
}
\value{
Returns a character vector.
}
\description{
\code{stri_remove_empty} (alias \code{stri_omit_empty})
removes all empty strings from a character vector,
and, if \code{na_empty} is \code{TRUE}, also gets rid of all missing
values.
\code{stri_remove_empty_na} (alias \code{stri_omit_empty_na})
removes both empty strings and missing values.
\code{stri_remove_na} (alias \code{stri_omit_na})
returns a version of \code{x} with missing values removed.
}
\examples{
stri_remove_empty(stri_na2empty(c('a', NA, '', 'b')))
stri_remove_empty(c('a', NA, '', 'b'))
stri_remove_empty(c('a', NA, '', 'b'), TRUE)
stri_omit_empty_na(c('a', NA, '', 'b'))
}
\seealso{
The official online manual of \pkg{stringi} at \url{https://stringi.gagolewski.com/}
Gagolewski M., \pkg{stringi}: Fast and portable character string processing in R, \emph{Journal of Statistical Software} 103(2), 2022, 1-59, \doi{10.18637/jss.v103.i02}
Other utils:
\code{\link{stri_list2matrix}()},
\code{\link{stri_na2empty}()},
\code{\link{stri_replace_na}()}
}
\concept{utils}
\author{
\href{https://www.gagolewski.com/}{Marek Gagolewski} and other contributors
}
stringi/man/about_stringi.Rd 0000644 0001762 0000144 00000013727 14350705363 015664 0 ustar ligges users % Generated by roxygen2: do not edit by hand
% Please edit documentation in R/stringi-package.R
\docType{package}
\name{about_stringi}
\alias{about_stringi}
\alias{stringi}
\alias{stringi-package}
\title{Fast and Portable Character String Processing in R}
\description{
\pkg{stringi} is THE R package for fast, correct, consistent,
and convenient string/text manipulation.
It gives predictable results on every platform, in each locale,
and under any native character encoding.
\bold{Keywords}: R, text processing, character strings,
internationalization, localization, ICU, ICU4C, i18n, l10n, Unicode.
\bold{Homepage}: \url{https://stringi.gagolewski.com/}
\bold{License}: The BSD-3-clause license for the package code,
the ICU license for the accompanying ICU4C distribution,
and the UCD license for the Unicode Character Database.
See the COPYRIGHTS and LICENSE file for more details.
}
\details{
Manual pages on general topics:
\itemize{
\item \link{about_encoding} -- character encoding issues, including
information on encoding management in \pkg{stringi}, as well as
on encoding detection and conversion.
\item \link{about_locale} -- locale issues, including locale
management and specification in \pkg{stringi}, and the list of
locale-sensitive operations. In particular, see
\code{\link{stri_opts_collator}} for a description of the string
collation algorithm, which is used for string comparing, ordering,
ranking, sorting, case-folding, and searching.
\item \link{about_arguments} -- information on how \pkg{stringi}
handles the arguments passed to its functions.
}
}
\section{Facilities available}{
Refer to the following:
\itemize{
\item \link{about_search} for string searching facilities;
these include pattern searching, matching, string splitting, and so on.
The following independent search engines are provided:
\itemize{
\item \link{about_search_regex} -- with ICU (Java-like) regular expressions,
\item \link{about_search_fixed} -- fast, locale-independent, byte-wise pattern
matching,
\item \link{about_search_coll} -- locale-aware pattern matching
for natural language processing tasks,
\item \link{about_search_charclass} -- seeking elements of
particular character classes, like ``all white-spaces'' or ``all digits'',
\item \link{about_search_boundaries} -- text boundary analysis.
}
\item \code{\link{stri_datetime_format}} for date/time formatting
and parsing. Also refer to the links therein for other date/time/time zone-
related operations.
\item \code{\link{stri_stats_general}} and \code{\link{stri_stats_latex}}
for gathering some fancy statistics on a character vector's contents.
\item \code{\link{stri_join}}, \code{\link{stri_dup}}, \code{\link{\%s+\%}},
and \code{\link{stri_flatten}} for concatenation-based operations.
\item \code{\link{stri_sub}} for extracting and replacing substrings,
and \code{\link{stri_reverse}} for a joyful function
to reverse all code points in a string.
\item \code{\link{stri_length}} (among others) for determining the number
of code points in a string. See also \code{\link{stri_count_boundaries}}
for counting the number of Unicode characters
and \code{\link{stri_width}} for approximating the width of a string.
\item \code{\link{stri_trim}} (among others) for
trimming characters from the beginning and/or end of a string,
see also \link{about_search_charclass}, and \code{\link{stri_pad}}
for padding strings so that they are of the same width.
Additionally, \code{\link{stri_wrap}} wraps text into lines.
\item \code{\link{stri_trans_tolower}} (among others) for case mapping,
i.e., conversion to lower, UPPER, or Title Case,
\code{\link{stri_trans_nfc}} (among others) for Unicode normalization,
\code{\link{stri_trans_char}} for translating individual code points,
and \code{\link{stri_trans_general}} for other universal
text transforms, including transliteration.
\item \code{\link{stri_cmp}}, \code{\link{\%s<\%}}, \code{\link{stri_order}},
\code{\link{stri_sort}}, \code{\link{stri_rank}}, \code{\link{stri_unique}},
and \code{\link{stri_duplicated}} for collation-based,
locale-aware operations, see also \link{about_locale}.
\item \code{\link{stri_split_lines}} (among others)
to split a string into text lines.
\item \code{\link{stri_escape_unicode}} (among others) for escaping
some code points.
\item \code{\link{stri_rand_strings}}, \code{\link{stri_rand_shuffle}},
and \code{\link{stri_rand_lipsum}} for generating (pseudo)random strings.
\item \code{\link{stri_read_raw}},
\code{\link{stri_read_lines}}, and \code{\link{stri_write_lines}}
for reading and writing text files.
}
Note that each man page provides many further links to other
interesting facilities and topics.
}
\references{
\emph{\pkg{stringi} Package Homepage},
\url{https://stringi.gagolewski.com/}
Gagolewski M., \pkg{stringi}: Fast and portable character string
processing in R, \emph{Journal of Statistical Software} 103(2), 2022, 1-59,
\doi{10.18637/jss.v103.i02}
\emph{ICU -- International Components for Unicode},
\url{https://icu.unicode.org/}
\emph{ICU4C API Documentation},
\url{https://unicode-org.github.io/icu-docs/apidoc/dev/icu4c/}
\emph{The Unicode Consortium},
\url{https://home.unicode.org/}
\emph{UTF-8, A Transformation Format of ISO 10646} -- RFC 3629,
\url{https://www.rfc-editor.org/rfc/rfc3629}
}
\seealso{
The official online manual of \pkg{stringi} at \url{https://stringi.gagolewski.com/}
Gagolewski M., \pkg{stringi}: Fast and portable character string processing in R, \emph{Journal of Statistical Software} 103(2), 2022, 1-59, \doi{10.18637/jss.v103.i02}
Other stringi_general_topics:
\code{\link{about_arguments}},
\code{\link{about_encoding}},
\code{\link{about_locale}},
\code{\link{about_search_boundaries}},
\code{\link{about_search_charclass}},
\code{\link{about_search_coll}},
\code{\link{about_search_fixed}},
\code{\link{about_search_regex}},
\code{\link{about_search}}
}
\author{
Marek Gagolewski,
with contributions from Bartek Tartanus and many others.
ICU4C was developed by IBM, Unicode, Inc., and others.
}
\concept{stringi_general_topics}
stringi/man/stri_rank.Rd 0000644 0001762 0000144 00000005006 14365436502 015000 0 ustar ligges users % Generated by roxygen2: do not edit by hand
% Please edit documentation in R/sort.R
\name{stri_rank}
\alias{stri_rank}
\title{Ranking}
\usage{
stri_rank(str, ..., opts_collator = NULL)
}
\arguments{
\item{str}{a character vector}
\item{...}{additional settings for \code{opts_collator}}
\item{opts_collator}{a named list with \pkg{ICU} Collator's options,
see \code{\link{stri_opts_collator}}, \code{NULL}
for default collation options}
}
\value{
The result is a vector of ranks corresponding to each
string in \code{str}.
}
\description{
This function ranks each string in a character vector according to a
locale-dependent lexicographic order.
It is a portable replacement for the base \code{xtfrm} function.
}
\details{
Missing values result in missing ranks and tied observations receive
the same ranks (based on min).
For more information on \pkg{ICU}'s Collator and how to tune it up
in \pkg{stringi}, refer to \code{\link{stri_opts_collator}}.
}
\examples{
stri_rank(c('hladny', 'chladny'), locale='pl_PL')
stri_rank(c('hladny', 'chladny'), locale='sk_SK')
stri_rank("a" \%s+\% c(1, 100, 2, 101, 11, 10)) # lexicographic order
stri_rank("a" \%s+\% c(1, 100, 2, 101, 11, 10), numeric=TRUE) # OK
stri_rank("a" \%s+\% c(0.25, 0.5, 1, -1, -2, -3), numeric=TRUE) # incorrect
# Ordering a data frame with respect to two criteria:
X <- data.frame(a=c("b", NA, "b", "b", NA, "a", "a", "c"), b=runif(8))
X[order(stri_rank(X$a), X$b), ]
}
\references{
\emph{Collation} -- ICU User Guide,
\url{https://unicode-org.github.io/icu/userguide/collation/}
}
\seealso{
The official online manual of \pkg{stringi} at \url{https://stringi.gagolewski.com/}
Gagolewski M., \pkg{stringi}: Fast and portable character string processing in R, \emph{Journal of Statistical Software} 103(2), 2022, 1-59, \doi{10.18637/jss.v103.i02}
Other locale_sensitive:
\code{\link{\%s<\%}()},
\code{\link{about_locale}},
\code{\link{about_search_boundaries}},
\code{\link{about_search_coll}},
\code{\link{stri_compare}()},
\code{\link{stri_count_boundaries}()},
\code{\link{stri_duplicated}()},
\code{\link{stri_enc_detect2}()},
\code{\link{stri_extract_all_boundaries}()},
\code{\link{stri_locate_all_boundaries}()},
\code{\link{stri_opts_collator}()},
\code{\link{stri_order}()},
\code{\link{stri_sort_key}()},
\code{\link{stri_sort}()},
\code{\link{stri_split_boundaries}()},
\code{\link{stri_trans_tolower}()},
\code{\link{stri_unique}()},
\code{\link{stri_wrap}()}
}
\concept{locale_sensitive}
\author{
\href{https://www.gagolewski.com/}{Marek Gagolewski} and other contributors
}
stringi/man/stri_split_lines.Rd 0000644 0001762 0000144 00000006016 14262507664 016400 0 ustar ligges users % Generated by roxygen2: do not edit by hand
% Please edit documentation in R/search_split_bound.R
\name{stri_split_lines}
\alias{stri_split_lines}
\alias{stri_split_lines1}
\title{Split a String Into Text Lines}
\usage{
stri_split_lines(str, omit_empty = FALSE)
stri_split_lines1(str)
}
\arguments{
\item{str}{character vector (\code{stri_split_lines})
or a single string (\code{stri_split_lines1})}
\item{omit_empty}{logical vector; determines whether empty
strings should be removed from the result
[\code{stri_split_lines} only]}
}
\value{
\code{stri_split_lines} returns a list of character vectors.
If any input string is \code{NA}, then the corresponding list element
is a single \code{NA} string.
\code{stri_split_lines1(str)} is equivalent to
\code{stri_split_lines(str[1])[[1]]} (with default parameters),
therefore it returns a character vector. Moreover, if the input string
ends with a newline sequence, the last (empty) string is omitted from
the result.
}
\description{
These functions split each character string in a given vector
into text lines.
}
\details{
Vectorized over \code{str} and \code{omit_empty}.
\code{omit_empty} is applied when splitting. If set to \code{TRUE},
then empty strings will never appear in the resulting vector.
Newlines are represented with the Carriage Return
(CR, 0x0D), Line Feed (LF, 0x0A), CRLF, or Next Line (NEL, 0x85) characters,
depending on the platform.
Moreover, the Unicode Standard defines two unambiguous separator characters,
the Paragraph Separator (PS, 0x2029) and the Line Separator (LS, 0x2028).
Sometimes also the Vertical Tab (VT, 0x0B) and the Form Feed (FF, 0x0C)
are used for this purpose.
These \pkg{stringi} functions follow UTR#18 rules,
where a newline sequence
corresponds to the following regular expression:
\code{(?:\\u\{D A\}|(?!\\u\{D A\})[\\u\{A\}-\\u\{D\}\\u\{85\}\\u\{2028\}\\u\{2029\}])}.
Each match serves as a text line separator.
}
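\examples{
# Illustrative calls (not excerpted from the original manual):
stri_split_lines('ab\ncd\r\nef\u2028gh')
stri_split_lines(c('a\n\nb', NA), omit_empty=TRUE)
stri_split_lines1('first\nsecond\nthird\n')  # no trailing empty string, see Value
}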
\references{
\emph{Unicode Newline Guidelines} -- Unicode Technical Report #13,
\url{https://www.unicode.org/standard/reports/tr13/tr13-5.html}
\emph{Unicode Regular Expressions} -- Unicode Technical Standard #18,
\url{https://www.unicode.org/reports/tr18/}
}
\seealso{
The official online manual of \pkg{stringi} at \url{https://stringi.gagolewski.com/}
Gagolewski M., \pkg{stringi}: Fast and portable character string processing in R, \emph{Journal of Statistical Software} 103(2), 2022, 1-59, \doi{10.18637/jss.v103.i02}
Other search_split:
\code{\link{about_search}},
\code{\link{stri_split_boundaries}()},
\code{\link{stri_split}()}
Other text_boundaries:
\code{\link{about_search_boundaries}},
\code{\link{about_search}},
\code{\link{stri_count_boundaries}()},
\code{\link{stri_extract_all_boundaries}()},
\code{\link{stri_locate_all_boundaries}()},
\code{\link{stri_opts_brkiter}()},
\code{\link{stri_split_boundaries}()},
\code{\link{stri_trans_tolower}()},
\code{\link{stri_wrap}()}
}
\concept{search_split}
\concept{text_boundaries}
\author{
\href{https://www.gagolewski.com/}{Marek Gagolewski} and other contributors
}
stringi/man/stri_enc_toutf8.Rd 0000644 0001762 0000144 00000004765 14262507664 016142 0 ustar ligges users % Generated by roxygen2: do not edit by hand
% Please edit documentation in R/encoding_conversion.R
\name{stri_enc_toutf8}
\alias{stri_enc_toutf8}
\title{Convert Strings To UTF-8}
\usage{
stri_enc_toutf8(str, is_unknown_8bit = FALSE, validate = FALSE)
}
\arguments{
\item{str}{a character vector to be converted}
\item{is_unknown_8bit}{a single logical value, see Details}
\item{validate}{a single logical value (can be \code{NA}), see Details}
}
\value{
Returns a character vector.
}
\description{
Converts character strings with declared marked encodings
to UTF-8 strings.
}
\details{
If \code{is_unknown_8bit} is set to \code{FALSE} (the default),
then R encoding marks are used, see \code{\link{stri_enc_mark}}.
Bytes-marked strings will cause the function to fail.
If a string is in UTF-8 and has a byte order mark (BOM),
then the BOM will be silently removed from the output string.
If the default encoding is UTF-8, see \code{\link{stri_enc_get}},
then strings marked with \code{native} are -- for efficiency reasons --
returned as-is, i.e., with unchanged markings.
A similar behavior is observed when calling \code{\link{enc2utf8}}.
For \code{is_unknown_8bit=TRUE}, if a string is declared to be neither
in ASCII nor in UTF-8, then all byte codes > 127 are replaced with
the Unicode REPLACEMENT CHARACTER (\\Ufffd).
Note that the REPLACEMENT CHARACTER may be interpreted as the Unicode
missing value for single characters.
Here a \code{bytes}-marked string is assumed to use an 8-bit encoding
that extends the ASCII map.
What is more, setting \code{validate} to \code{TRUE}
or \code{NA} in both cases validates the resulting UTF-8 byte stream.
If \code{validate=TRUE}, then
in case of any incorrect byte sequences, they will be
replaced with the REPLACEMENT CHARACTER.
This option may be used when you want to fix an invalid UTF-8 byte sequence.
For \code{validate=NA}, a string containing invalid byte sequences
is replaced with a missing value.
}
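\examples{
# An illustrative sketch (not part of the original manual); on a system
# whose default encoding is UTF-8, the stray byte 0xF0 below is invalid:
x <- rawToChar(as.raw(c(0x61, 0xf0, 0x62)))
stri_enc_isutf8(x)                  # FALSE
stri_enc_toutf8(x, validate=TRUE)   # the invalid byte is replaced with U+FFFD
}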
\seealso{
The official online manual of \pkg{stringi} at \url{https://stringi.gagolewski.com/}
Gagolewski M., \pkg{stringi}: Fast and portable character string processing in R, \emph{Journal of Statistical Software} 103(2), 2022, 1-59, \doi{10.18637/jss.v103.i02}
Other encoding_conversion:
\code{\link{about_encoding}},
\code{\link{stri_enc_fromutf32}()},
\code{\link{stri_enc_toascii}()},
\code{\link{stri_enc_tonative}()},
\code{\link{stri_enc_toutf32}()},
\code{\link{stri_encode}()}
}
\concept{encoding_conversion}
\author{
\href{https://www.gagolewski.com/}{Marek Gagolewski} and other contributors
}
stringi/man/stri_opts_fixed.Rd 0000644 0001762 0000144 00000003602 14523017034 016200 0 ustar ligges users % Generated by roxygen2: do not edit by hand
% Please edit documentation in R/opts.R
\name{stri_opts_fixed}
\alias{stri_opts_fixed}
\title{Generate a List with Fixed Pattern Search Engine's Settings}
\usage{
stri_opts_fixed(case_insensitive = FALSE, overlap = FALSE)
}
\arguments{
\item{case_insensitive}{logical; enable simple case insensitive matching}
\item{overlap}{logical; enable overlapping matches' detection}
}
\value{
Returns a named list object.
}
\description{
A convenience function used to tune up the behavior of \code{stri_*_fixed}
functions, see \link{stringi-search-fixed}.
}
\details{
Case-insensitive matching uses a simple, single-code point case mapping
(via ICU's \code{u_toupper()} function).
Full case mappings should be used whenever possible because they produce
better results by working on whole strings. They also take into account
the string context and the language, see \link{stringi-search-coll}.
Searching for overlapping pattern matches is available in
\code{\link{stri_extract_all_fixed}}, \code{\link{stri_locate_all_fixed}},
and \code{\link{stri_count_fixed}} functions.
}
\examples{
stri_detect_fixed('ala', 'ALA') # case-sensitive by default
stri_detect_fixed('ala', 'ALA', opts_fixed=stri_opts_fixed(case_insensitive=TRUE))
stri_detect_fixed('ala', 'ALA', case_insensitive=TRUE) # equivalent
}
\references{
\emph{C/POSIX Migration} -- ICU User Guide,
\url{https://unicode-org.github.io/icu/userguide/icu/posix.html}
}
\seealso{
The official online manual of \pkg{stringi} at \url{https://stringi.gagolewski.com/}
Gagolewski M., \pkg{stringi}: Fast and portable character string processing in R, \emph{Journal of Statistical Software} 103(2), 2022, 1-59, \doi{10.18637/jss.v103.i02}
Other search_fixed:
\code{\link{about_search_fixed}},
\code{\link{about_search}}
}
\concept{search_fixed}
\author{
\href{https://www.gagolewski.com/}{Marek Gagolewski} and other contributors
}
stringi/man/stri_timezone_set.Rd 0000644 0001762 0000144 00000004370 14262507664 016561 0 ustar ligges users % Generated by roxygen2: do not edit by hand
% Please edit documentation in R/time_zone.R
\name{stri_timezone_get}
\alias{stri_timezone_get}
\alias{stri_timezone_set}
\title{Set or Get Default Time Zone in \pkg{stringi}}
\usage{
stri_timezone_get()
stri_timezone_set(tz)
}
\arguments{
\item{tz}{single string; time zone identifier}
}
\value{
\code{stri_timezone_set} returns a string with
the previously used time zone, invisibly.
\code{stri_timezone_get} returns a single string
with the current default time zone.
}
\description{
\code{stri_timezone_set} changes the current default time zone for all functions
in the \pkg{stringi} package, i.e., establishes the meaning of the
``\code{NULL} time zone'' argument to date/time processing functions.
\code{stri_timezone_get} gets the current default time zone.
For more information on time zone representation in \pkg{ICU}
and \pkg{stringi}, refer to \code{\link{stri_timezone_list}}.
}
\details{
Unless the default time zone has already been set using
\code{stri_timezone_set}, the default time zone is determined
by querying the OS with methods in \pkg{ICU}'s internal platform utilities.
}
\examples{
\dontrun{
oldtz <- stri_timezone_set('Europe/Warsaw')
# ... many time zone-dependent operations
stri_timezone_set(oldtz) # restore previous default time zone
}
}
\references{
\emph{TimeZone} class -- ICU API Documentation,
\url{https://unicode-org.github.io/icu-docs/apidoc/dev/icu4c/classicu_1_1TimeZone.html}
}
\seealso{
The official online manual of \pkg{stringi} at \url{https://stringi.gagolewski.com/}
Gagolewski M., \pkg{stringi}: Fast and portable character string processing in R, \emph{Journal of Statistical Software} 103(2), 2022, 1-59, \doi{10.18637/jss.v103.i02}
Other datetime:
\code{\link{stri_datetime_add}()},
\code{\link{stri_datetime_create}()},
\code{\link{stri_datetime_fields}()},
\code{\link{stri_datetime_format}()},
\code{\link{stri_datetime_fstr}()},
\code{\link{stri_datetime_now}()},
\code{\link{stri_datetime_symbols}()},
\code{\link{stri_timezone_info}()},
\code{\link{stri_timezone_list}()}
Other timezone:
\code{\link{stri_timezone_info}()},
\code{\link{stri_timezone_list}()}
}
\concept{datetime}
\concept{timezone}
\author{
\href{https://www.gagolewski.com/}{Marek Gagolewski} and other contributors
}
stringi/man/stri_escape_unicode.Rd 0000644 0001762 0000144 00000002560 14522301203 016775 0 ustar ligges users % Generated by roxygen2: do not edit by hand
% Please edit documentation in R/escape.R
\name{stri_escape_unicode}
\alias{stri_escape_unicode}
\title{Escape Unicode Code Points}
\usage{
stri_escape_unicode(str)
}
\arguments{
\item{str}{character vector}
}
\value{
Returns a character vector.
}
\description{
Generates an ASCII string where all non-printable characters
and non-ASCII characters are converted to escape sequences.
}
\details{
For non-printable and certain special (well-known,
see also the R man page \link{Quotes})
ASCII characters, the following
(also recognized in R) convention is used.
We get \code{\\a}, \code{\\b}, \code{\\t}, \code{\\n}, \code{\\v},
\code{\\f}, \code{\\r}, \code{\"}, \code{\'}, and \code{\\\\};
otherwise, either \code{\\uXXXX} (4 hex digits) or
\code{\\UXXXXXXXX} (8 hex digits) is used.
As usual in \pkg{stringi}, any input string is converted to Unicode
before the escaping is performed.
}
\examples{
stri_escape_unicode('a\u0105!')
}
\seealso{
The official online manual of \pkg{stringi} at \url{https://stringi.gagolewski.com/}
Gagolewski M., \pkg{stringi}: Fast and portable character string processing in R, \emph{Journal of Statistical Software} 103(2), 2022, 1-59, \doi{10.18637/jss.v103.i02}
Other escape:
\code{\link{stri_unescape_unicode}()}
}
\concept{escape}
\author{
\href{https://www.gagolewski.com/}{Marek Gagolewski} and other contributors
}
stringi/man/stri_enc_set.Rd 0000644 0001762 0000144 00000004412 14262507664 015471 0 ustar ligges users % Generated by roxygen2: do not edit by hand
% Please edit documentation in R/encoding_management.R
\name{stri_enc_set}
\alias{stri_enc_set}
\alias{stri_enc_get}
\title{Set or Get Default Character Encoding in \pkg{stringi}}
\usage{
stri_enc_set(enc)
stri_enc_get()
}
\arguments{
\item{enc}{single string; character encoding name,
see \code{\link{stri_enc_list}} for the list of supported encodings.}
}
\value{
\code{stri_enc_set} returns a string with
the previously used character encoding, invisibly.
\code{stri_enc_get} returns a string with current default character
encoding.
}
\description{
\code{stri_enc_set} sets the encoding used to re-encode strings
internally (i.e., by \R) declared to be in native encoding,
see \link{stringi-encoding} and \code{\link{stri_enc_mark}}.
\code{stri_enc_get} returns the currently used default encoding.
}
\details{
\code{stri_enc_get} is the same as
\code{\link{stri_enc_info}(NULL)$Name.friendly}.
Note that changing the default encoding may have undesired consequences.
Unless you are an expert user and you know what you are doing,
\code{stri_enc_set} should only be used if \pkg{ICU} fails to detect
your system's encoding correctly (while testing \pkg{stringi}
we only encountered such a situation on a very old Solaris machine).
Note that \pkg{ICU} tries to match the encoding part of the \code{LC_CTYPE}
category as given by \code{\link{Sys.getlocale}}.
If you set a default encoding that is neither a superset of ASCII,
nor an 8-bit encoding, a warning will be generated,
see \link{stringi-encoding} for discussion.
\code{stri_enc_set} has no effect if the system ICU assumes that
the default charset is always UTF-8 (i.e., where the internal
\code{U_CHARSET_IS_UTF8} is defined and set to 1), see
\code{\link{stri_info}}.
}
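\examples{
# The following two calls give the same result, see Details:
stri_enc_get()
stri_enc_info(NULL)$Name.friendly
}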
\seealso{
The official online manual of \pkg{stringi} at \url{https://stringi.gagolewski.com/}
Gagolewski M., \pkg{stringi}: Fast and portable character string processing in R, \emph{Journal of Statistical Software} 103(2), 2022, 1-59, \doi{10.18637/jss.v103.i02}
Other encoding_management:
\code{\link{about_encoding}},
\code{\link{stri_enc_info}()},
\code{\link{stri_enc_list}()},
\code{\link{stri_enc_mark}()}
}
\concept{encoding_management}
\author{
\href{https://www.gagolewski.com/}{Marek Gagolewski} and other contributors
}
stringi/man/about_search_charclass.Rd 0000644 0001762 0000144 00000031303 14350705363 017463 0 ustar ligges users % Generated by roxygen2: do not edit by hand
% Please edit documentation in R/search.R
\name{about_search_charclass}
\alias{about_search_charclass}
\alias{search_charclass}
\alias{stringi-search-charclass}
\title{Character Classes in \pkg{stringi}}
\description{
Here we describe how character classes (sets) can be specified
in the \pkg{stringi} package. These are useful for defining
search patterns (note that the \pkg{ICU} regex engine uses the same
scheme for denoting character classes) or, e.g.,
generating random code points with \code{\link{stri_rand_strings}}.
}
\details{
All \code{stri_*_charclass} functions in \pkg{stringi} perform
single-character (i.e., Unicode code point) search-based operations.
You may obtain the same results using \link{about_search_regex}.
However, these functions aim to be faster.
Character classes are defined using \pkg{ICU}'s \code{UnicodeSet}
patterns. Below we briefly summarize their syntax.
For more details refer to the bibliographic References below.
}
\section{\code{UnicodeSet} patterns}{
A \code{UnicodeSet} represents a subset of Unicode code points
(recall that \pkg{stringi} converts strings in your native encoding
to Unicode automatically). Legal code points are U+0000 to U+10FFFF,
inclusive.
Patterns either consist of series of characters bounded by
square brackets
(such patterns follow a syntax similar to that employed
by regular expression character classes)
or of Perl-like Unicode property set specifiers.
\code{[]} denotes an empty set, \code{[a]} --
a set consisting of character ``a'',
\code{[\\u0105]} -- a set with character U+0105,
and \code{[abc]} -- a set with ``a'', ``b'', and ``c''.
\code{[a-z]} denotes a set consisting of characters
``a'' through ``z'' inclusively, in Unicode code point order.
Some set-theoretic operations are available.
\code{^} denotes the complement, e.g., \code{[^a-z]} contains
all characters but ``a'' through ``z''.
Moreover, \code{[[pat1][pat2]]},
\code{[[pat1]\&[pat2]]}, and \code{[[pat1]-[pat2]]}
denote union, intersection, and asymmetric difference of sets
specified by \code{pat1} and \code{pat2}, respectively.
Note that all white-spaces are ignored unless they are quoted or back-slashed
(white spaces can be freely used for clarity, as \code{[a c d-f m]}
means the same as \code{[acd-fm]}).
\pkg{stringi} does not allow including multi-character strings
(see \code{UnicodeSet} API documentation).
Also, empty string patterns are disallowed.
Any character may be preceded by
a backslash in order to remove its special meaning.
A malformed pattern always results in an error.
Set expressions at a glance
(according to \url{https://unicode-org.github.io/icu/userguide/strings/regexp.html});
some examples:
\describe{
\item{\code{[abc]}}{Match any of the characters a, b or c.}
\item{\code{[^abc]}}{Negation -- match any character except a, b or c.}
\item{\code{[A-M]}}{Range -- match any character from A to M. The characters
to include are determined by Unicode code point ordering.}
\item{\code{[\\u0000-\\U0010ffff]}}{Range -- match all characters.}
\item{\code{[\\p{Letter}]} or \code{[\\p{General_Category=Letter}]} or \code{[\\p{L}]}}{
Characters with Unicode Category = Letter. All forms shown are equivalent.}
\item{\code{[\\P{Letter}]}}{Negated property
(Note the upper case \code{\\P}) -- match everything except Letters.}
\item{\code{[\\p{numeric_value=9}]}}{Match all numbers with a numeric value of 9.
Any Unicode Property may be used in set expressions.}
\item{\code{[\\p{Letter}&\\p{script=cyrillic}]}}{Set
intersection -- match the set of all Cyrillic letters.}
\item{\code{[\\p{Letter}-\\p{script=latin}]}}{Set difference --
match all non-Latin letters.}
\item{\code{[[a-z][A-Z][0-9]]} or \code{[a-zA-Z0-9]}}{Implicit union of
sets -- match ASCII letters and digits (the two forms are equivalent).}
\item{\code{[:script=Greek:]}}{Alternative POSIX-like syntax for properties --
equivalent to \code{\\p{script=Greek}}.}
}
}
\section{Unicode properties}{
Unicode property sets are specified with a POSIX-like syntax,
e.g., \code{[:Letter:]},
or with a (extended) Perl-style syntax, e.g., \code{\\p{L}}.
The complements of the above sets are
\code{[:^Letter:]} and \code{\\P{L}}, respectively.
The names are normalized before matching
(for example, the match is case-insensitive).
Moreover, many names have short aliases.
Among predefined Unicode properties we find, e.g.:
\itemize{
\item Unicode General Categories, e.g., \code{Lu} for uppercase letters,
\item Unicode Binary Properties, e.g., \code{WHITE_SPACE},
}
and many more (including Unicode scripts).
Each property provides access to the large and comprehensive
Unicode Character Database.
Generally, the list of properties available in \pkg{ICU}
is not well-documented. Please refer to the References section
for some links.
Please note that some classes might overlap.
However, e.g., the General Category \code{Z} (some space) and the Binary
Property \code{WHITE_SPACE} match different character sets.
}
\section{Unicode General Categories}{
The Unicode General Category property of a code point provides the most
general classification of that code point.
Each code point falls into one and only one Category.
\describe{
\item{\code{Cc}}{a C0 or C1 control code.}
\item{\code{Cf}}{a format control character.}
\item{\code{Cn}}{a reserved unassigned code point or a non-character.}
\item{\code{Co}}{a private-use character.}
\item{\code{Cs}}{a surrogate code point.}
\item{\code{Lc}}{the union of Lu, Ll, Lt.}
\item{\code{Ll}}{a lowercase letter.}
\item{\code{Lm}}{a modifier letter.}
\item{\code{Lo}}{other letters, including syllables and ideographs.}
\item{\code{Lt}}{a digraphic character, with the first part uppercase.}
\item{\code{Lu}}{an uppercase letter.}
\item{\code{Mc}}{a spacing combining mark (positive advance width).}
\item{\code{Me}}{an enclosing combining mark.}
\item{\code{Mn}}{a non-spacing combining mark (zero advance width).}
\item{\code{Nd}}{a decimal digit.}
\item{\code{Nl}}{a letter-like numeric character.}
\item{\code{No}}{a numeric character of other type.}
\item{\code{Pd}}{a dash or hyphen punctuation mark.}
\item{\code{Ps}}{an opening punctuation mark (of a pair).}
\item{\code{Pe}}{a closing punctuation mark (of a pair).}
\item{\code{Pc}}{a connecting punctuation mark, like a tie.}
\item{\code{Po}}{a punctuation mark of other type.}
\item{\code{Pi}}{an initial quotation mark.}
\item{\code{Pf}}{a final quotation mark.}
\item{\code{Sm}}{a symbol of mathematical use.}
\item{\code{Sc}}{a currency sign.}
\item{\code{Sk}}{a non-letter-like modifier symbol.}
\item{\code{So}}{a symbol of other type.}
\item{\code{Zs}}{a space character (of non-zero width).}
\item{\code{Zl}}{U+2028 LINE SEPARATOR only.}
\item{\code{Zp}}{U+2029 PARAGRAPH SEPARATOR only.}
\item{\code{C} }{the union of Cc, Cf, Cs, Co, Cn.}
\item{\code{L} }{the union of Lu, Ll, Lt, Lm, Lo.}
\item{\code{M} }{the union of Mn, Mc, Me.}
\item{\code{N} }{the union of Nd, Nl, No.}
\item{\code{P} }{the union of Pc, Pd, Ps, Pe, Pi, Pf, Po.}
\item{\code{S} }{the union of Sm, Sc, Sk, So.}
\item{\code{Z} }{the union of Zs, Zl, Zp }
}
}
\section{Unicode Binary Properties}{
Each character may have many Binary Properties at a time.
Here is a comprehensive list of supported Binary Properties:
\describe{
\item{\code{ALPHABETIC} }{alphabetic character.}
\item{\code{ASCII_HEX_DIGIT}}{a character matching the \code{[0-9A-Fa-f]} charclass.}
\item{\code{BIDI_CONTROL} }{a format control character which has specific
functions in the Bidi (bidirectional text) Algorithm.}
\item{\code{BIDI_MIRRORED} }{a character that may change display in right-to-left text.}
\item{\code{DASH} }{a kind of a dash character.}
\item{\code{DEFAULT_IGNORABLE_CODE_POINT}}{characters that are ignorable in most
text processing activities,
e.g., <2060..206F, FFF0..FFFB, E0000..E0FFF>.}
\item{\code{DEPRECATED} }{a deprecated character according
to the current Unicode standard (the usage of deprecated characters
is strongly discouraged).}
\item{\code{DIACRITIC} }{a character that linguistically modifies
the meaning of another character to which it applies.}
\item{\code{EXTENDER} }{a character that extends the value
or shape of a preceding alphabetic character,
e.g., a length and iteration mark.}
\item{\code{HEX_DIGIT} }{a character commonly
used for hexadecimal numbers,
see also \code{ASCII_HEX_DIGIT}.}
\item{\code{HYPHEN}}{a dash used to mark connections between
pieces of words, plus the Katakana middle dot.}
\item{\code{ID_CONTINUE}}{a character that can continue an identifier,
\code{ID_START}+\code{Mn}+\code{Mc}+\code{Nd}+\code{Pc}.}
\item{\code{ID_START}}{a character that can start an identifier,
\code{Lu}+\code{Ll}+\code{Lt}+\code{Lm}+\code{Lo}+\code{Nl}.}
\item{\code{IDEOGRAPHIC}}{a CJKV (Chinese-Japanese-Korean-Vietnamese)
ideograph.}
\item{\code{LOWERCASE}}{...}
\item{\code{MATH}}{...}
\item{\code{NONCHARACTER_CODE_POINT}}{...}
\item{\code{QUOTATION_MARK}}{...}
\item{\code{SOFT_DOTTED}}{a character with a ``soft dot'', like i or j,
such that an accent placed on this character causes the dot to disappear.}
\item{\code{TERMINAL_PUNCTUATION}}{a punctuation character that generally
marks the end of textual units.}
\item{\code{UPPERCASE}}{...}
\item{\code{WHITE_SPACE}}{a space character or TAB or CR or LF or ZWSP or ZWNBSP.}
\item{\code{CASE_SENSITIVE}}{...}
\item{\code{POSIX_ALNUM}}{...}
\item{\code{POSIX_BLANK}}{...}
\item{\code{POSIX_GRAPH}}{...}
\item{\code{POSIX_PRINT}}{...}
\item{\code{POSIX_XDIGIT}}{...}
\item{\code{CASED}}{...}
\item{\code{CASE_IGNORABLE}}{...}
\item{\code{CHANGES_WHEN_LOWERCASED}}{...}
\item{\code{CHANGES_WHEN_UPPERCASED}}{...}
\item{\code{CHANGES_WHEN_TITLECASED}}{...}
\item{\code{CHANGES_WHEN_CASEFOLDED}}{...}
\item{\code{CHANGES_WHEN_CASEMAPPED}}{...}
\item{\code{CHANGES_WHEN_NFKC_CASEFOLDED}}{...}
\item{\code{EMOJI}}{Since ICU 57}
\item{\code{EMOJI_PRESENTATION}}{Since ICU 57}
\item{\code{EMOJI_MODIFIER}}{Since ICU 57}
\item{\code{EMOJI_MODIFIER_BASE}}{Since ICU 57}
}
}
\section{POSIX Character Classes}{
Avoid using POSIX character classes,
e.g., \code{[:punct:]}. The ICU User Guide (see below)
states that, in general, they are not well-defined, so you may end up
with something different from what you expect.
In particular, in POSIX-like regex engines, \code{[:punct:]} stands for
the character class corresponding to the \code{ispunct()} classification
function (check out \code{man 3 ispunct} on UNIX-like systems).
According to ISO/IEC 9899:1990 (ISO C90), the \code{ispunct()} function
tests for any printing character except for space or a character
for which \code{isalnum()} is true. However, in a POSIX setting,
the details of what characters belong into which class depend
on the current locale. So the \code{[:punct:]} class does not lead
to portable code (again, in POSIX-like regex engines).
Therefore, a POSIX flavor of \code{[:punct:]} is more like
\code{[\\p{P}\\p{S}]} in \pkg{ICU}. You have been warned.
}
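\examples{
# Illustrative calls (not excerpted from the original manual):
stri_detect_charclass(c('sTRINGI', '123', ';-)'), '\\\\p{Ll}')
stri_count_charclass('stringi 0.99', '\\\\p{Nd}')
stri_extract_all_charclass('stringi; 123', '[a-z]')
}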
\references{
\emph{The Unicode Character Database} -- Unicode Standard Annex #44,
\url{https://www.unicode.org/reports/tr44/}
\emph{UnicodeSet} -- ICU User Guide,
\url{https://unicode-org.github.io/icu/userguide/strings/unicodeset.html}
\emph{Properties} -- ICU User Guide,
\url{https://unicode-org.github.io/icu/userguide/strings/properties.html}
\emph{C/POSIX Migration} -- ICU User Guide,
\url{https://unicode-org.github.io/icu/userguide/icu/posix.html}
\emph{Unicode Script Data}, \url{https://www.unicode.org/Public/UNIDATA/Scripts.txt}
\emph{icu::Unicodeset Class Reference} -- ICU4C API Documentation,
\url{https://unicode-org.github.io/icu-docs/apidoc/dev/icu4c/classicu_1_1UnicodeSet.html}
}
\seealso{
The official online manual of \pkg{stringi} at \url{https://stringi.gagolewski.com/}
Gagolewski M., \pkg{stringi}: Fast and portable character string processing in R, \emph{Journal of Statistical Software} 103(2), 2022, 1-59, \doi{10.18637/jss.v103.i02}
Other search_charclass:
\code{\link{about_search}},
\code{\link{stri_trim_both}()}
Other stringi_general_topics:
\code{\link{about_arguments}},
\code{\link{about_encoding}},
\code{\link{about_locale}},
\code{\link{about_search_boundaries}},
\code{\link{about_search_coll}},
\code{\link{about_search_fixed}},
\code{\link{about_search_regex}},
\code{\link{about_search}},
\code{\link{about_stringi}}
}
\concept{search_charclass}
\concept{stringi_general_topics}
\author{
\href{https://www.gagolewski.com/}{Marek Gagolewski} and other contributors
}
stringi/man/stri_datetime_now.Rd 0000644 0001762 0000144 00000002266 14262507664 016535 0 ustar ligges users % Generated by roxygen2: do not edit by hand
% Please edit documentation in R/time_calendar.R
\name{stri_datetime_now}
\alias{stri_datetime_now}
\title{Get Current Date and Time}
\usage{
stri_datetime_now()
}
\value{
Returns an object of class \code{\link{POSIXct}}.
}
\description{
Returns the current date and time.
}
\details{
The current date and time in \pkg{stringi} is represented as the (signed)
number of seconds since 1970-01-01 00:00:00 UTC.
UTC leap seconds are ignored.
}
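\examples{
# An illustrative call; the printed value depends on the current time,
# the default time zone, and the locale:
stri_datetime_now()
}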
\seealso{
The official online manual of \pkg{stringi} at \url{https://stringi.gagolewski.com/}
Gagolewski M., \pkg{stringi}: Fast and portable character string processing in R, \emph{Journal of Statistical Software} 103(2), 2022, 1-59, \doi{10.18637/jss.v103.i02}
Other datetime:
\code{\link{stri_datetime_add}()},
\code{\link{stri_datetime_create}()},
\code{\link{stri_datetime_fields}()},
\code{\link{stri_datetime_format}()},
\code{\link{stri_datetime_fstr}()},
\code{\link{stri_datetime_symbols}()},
\code{\link{stri_timezone_get}()},
\code{\link{stri_timezone_info}()},
\code{\link{stri_timezone_list}()}
}
\concept{datetime}
\author{
\href{https://www.gagolewski.com/}{Marek Gagolewski} and other contributors
}
stringi/man/stri_enc_info.Rd 0000644 0001762 0000144 00000004363 14262507664 015636 0 ustar ligges users % Generated by roxygen2: do not edit by hand
% Please edit documentation in R/encoding_management.R
\name{stri_enc_info}
\alias{stri_enc_info}
\title{Query a Character Encoding}
\usage{
stri_enc_info(enc = NULL)
}
\arguments{
\item{enc}{\code{NULL} or \code{''} for the default encoding,
or a single string with encoding name}
}
\value{
Returns a list with the following components:
\itemize{
\item \code{Name.friendly} -- friendly encoding name:
MIME Name or JAVA Name or \pkg{ICU} Canonical Name
(the first of provided ones is selected, see below);
\item \code{Name.ICU} -- encoding name as identified by \pkg{ICU};
\item \code{Name.*} -- other standardized encoding names,
e.g., \code{Name.UTR22}, \code{Name.IBM}, \code{Name.WINDOWS},
\code{Name.JAVA}, \code{Name.IANA}, \code{Name.MIME} (some of them
may be unavailable for all the encodings);
\item \code{ASCII.subset} -- is ASCII a subset of the given encoding?;
\item \code{Unicode.1to1} -- for 8-bit encodings only: are all characters
translated to exactly one Unicode code point and is the translation
scheme reversible?;
\item \code{CharSize.8bit} -- is this an 8-bit encoding, i.e., do we have
\code{CharSize.min == CharSize.max} and \code{CharSize.min == 1}?;
\item \code{CharSize.min} -- minimal number of bytes used
to represent a UChar (in UTF-16, this is not the same as UChar32)
\item \code{CharSize.max} -- maximal number of bytes used
to represent a UChar (in UTF-16, this is not the same as UChar32,
i.e., does not reflect the maximal code point representation size)
}
}
\description{
Gets basic information on a character encoding.
}
\details{
An error is raised if the provided encoding is unknown to \pkg{ICU}
(see \code{\link{stri_enc_list}} for more details).
}
\seealso{
The official online manual of \pkg{stringi} at \url{https://stringi.gagolewski.com/}
Gagolewski M., \pkg{stringi}: Fast and portable character string processing in R, \emph{Journal of Statistical Software} 103(2), 2022, 1-59, \doi{10.18637/jss.v103.i02}
Other encoding_management:
\code{\link{about_encoding}},
\code{\link{stri_enc_list}()},
\code{\link{stri_enc_mark}()},
\code{\link{stri_enc_set}()}
}
\concept{encoding_management}
\author{
\href{https://www.gagolewski.com/}{Marek Gagolewski} and other contributors
}
stringi/man/stri_list2matrix.Rd 0000644 0001762 0000144 00000004717 14262507664 016343 0 ustar ligges users % Generated by roxygen2: do not edit by hand
% Please edit documentation in R/utils.R
\name{stri_list2matrix}
\alias{stri_list2matrix}
\title{Convert a List to a Character Matrix}
\usage{
stri_list2matrix(
x,
byrow = FALSE,
fill = NA_character_,
n_min = 0,
by_row = byrow
)
}
\arguments{
\item{x}{a list of atomic vectors}
\item{byrow}{a single logical value; should the resulting matrix be
transposed?}
\item{fill}{a single string, see Details}
\item{n_min}{a single integer value; minimal number of rows (\code{byrow==FALSE})
or columns (otherwise) in the resulting matrix}
\item{by_row}{alias of \code{byrow}}
}
\value{
Returns a character matrix.
}
\description{
This function converts a given list of atomic vectors to
a character matrix.
}
\details{
This function is similar to the built-in \code{\link{simplify2array}}
function. However, it always returns a character matrix,
even if each element in \code{x} is of length 1
or if elements in \code{x} are not of the same lengths.
Moreover, the elements in \code{x} are always coerced to character vectors.
If \code{byrow} is \code{FALSE}, then a matrix with \code{length(x)}
columns is returned.
The number of rows is the length of the
longest vector in \code{x}, but no less than \code{n_min}. Basically, we have
\code{result[i,j] == x[[j]][i]} if \code{i <= length(x[[j]])}
and \code{result[i,j] == fill} otherwise, see Examples.
If \code{byrow} is \code{TRUE}, then the resulting matrix is
a transposition of the above-described one.
This function may be useful, e.g., in connection with \code{\link{stri_split}}
and \code{\link{stri_extract_all}}.
}
\examples{
simplify2array(list(c('a', 'b'), c('c', 'd'), c('e', 'f')))
stri_list2matrix(list(c('a', 'b'), c('c', 'd'), c('e', 'f')))
stri_list2matrix(list(c('a', 'b'), c('c', 'd'), c('e', 'f')), byrow=TRUE)
simplify2array(list('a', c('b', 'c')))
stri_list2matrix(list('a', c('b', 'c')))
stri_list2matrix(list('a', c('b', 'c')), fill='')
stri_list2matrix(list('a', c('b', 'c')), fill='', n_min=5)
}
\seealso{
The official online manual of \pkg{stringi} at \url{https://stringi.gagolewski.com/}
Gagolewski M., \pkg{stringi}: Fast and portable character string processing in R, \emph{Journal of Statistical Software} 103(2), 2022, 1-59, \doi{10.18637/jss.v103.i02}
Other utils:
\code{\link{stri_na2empty}()},
\code{\link{stri_remove_empty}()},
\code{\link{stri_replace_na}()}
}
\concept{utils}
\author{
\href{https://www.gagolewski.com/}{Marek Gagolewski} and other contributors
}
stringi/man/stri_datetime_fstr.Rd 0000644 0001762 0000144 00000004155 14262507664 016707 0 ustar ligges users % Generated by roxygen2: do not edit by hand
% Please edit documentation in R/time_format.R
\name{stri_datetime_fstr}
\alias{stri_datetime_fstr}
\title{Convert \code{strptime}-Style Format Strings}
\usage{
stri_datetime_fstr(x, ignore_special = TRUE)
}
\arguments{
\item{x}{character vector of date/time format strings}
\item{ignore_special}{if \code{FALSE}, special identifiers like
\code{'datetime_full'} or \code{'date_relative_short'}
(see \code{\link{stri_datetime_format}}) are left as-is}
}
\value{
Returns a character vector.
}
\description{
This function converts \code{\link[base]{strptime}} or
\code{\link[base]{strftime}}-style
format strings to \pkg{ICU} format strings that may be used
in \code{\link{stri_datetime_parse}} and \code{\link{stri_datetime_format}}
functions.
}
\details{
For more details on conversion specifiers please refer to
the manual page of \code{\link[base]{strptime}}. Most of the formatters
of the form \code{\%x}, where \code{x} is a letter, are supported.
Moreover, each \code{\%\%} is replaced with \code{\%}.
Warnings are given in the case of \code{\%x}, \code{\%X}, \code{\%u},
\code{\%w}, \code{\%g}, \code{\%G}, \code{\%c}, \code{\%U}, and \code{\%W}
as in such circumstances either \pkg{ICU} does not
support the functionality requested using the string format API
or there are some inconsistencies between base R and \pkg{ICU}.
}
\examples{
stri_datetime_fstr('\%Y-\%m-\%d \%H:\%M:\%S')
}
\seealso{
The official online manual of \pkg{stringi} at \url{https://stringi.gagolewski.com/}
Gagolewski M., \pkg{stringi}: Fast and portable character string processing in R, \emph{Journal of Statistical Software} 103(2), 2022, 1-59, \doi{10.18637/jss.v103.i02}
Other datetime:
\code{\link{stri_datetime_add}()},
\code{\link{stri_datetime_create}()},
\code{\link{stri_datetime_fields}()},
\code{\link{stri_datetime_format}()},
\code{\link{stri_datetime_now}()},
\code{\link{stri_datetime_symbols}()},
\code{\link{stri_timezone_get}()},
\code{\link{stri_timezone_info}()},
\code{\link{stri_timezone_list}()}
}
\concept{datetime}
\author{
\href{https://www.gagolewski.com/}{Marek Gagolewski} and other contributors
}
stringi/man/stri_read_lines.Rd 0000644 0001762 0000144 00000003205 14523030312 016132 0 ustar ligges users % Generated by roxygen2: do not edit by hand
% Please edit documentation in R/files.R
\name{stri_read_lines}
\alias{stri_read_lines}
\title{Read Text Lines from a Text File}
\usage{
stri_read_lines(con, encoding = NULL, fname = con)
}
\arguments{
\item{con}{name of the input file or a connection object
(opened in the binary mode)}
\item{encoding}{single string; input encoding;
\code{NULL} or \code{''} for the current default encoding.}
\item{fname}{[DEPRECATED] alias of \code{con}}
}
\value{
Returns a character vector, each text line is a separate string.
The output is always marked as UTF-8.
}
\description{
Reads a text file in its entirety, re-encodes it, and splits it into text lines.
}
\details{
This aims to be a substitute for the \code{\link{readLines}} function,
with the ability to re-encode the input file in a much more robust way,
and split the text into lines with \code{\link{stri_split_lines1}}
(which conforms with the Unicode guidelines for newline markers).
The function calls \code{\link{stri_read_raw}},
\code{\link{stri_encode}}, and \code{\link{stri_split_lines1}},
in this order.
Because of the way this function is currently implemented,
maximal file size cannot exceed ~0.67 GB.
}
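\examples{
# An illustrative round trip (not part of the original manual),
# using a temporary file so that nothing is left behind:
fname <- tempfile(fileext='.txt')
stri_write_lines(c('abc', 'gro\u00df'), fname)
stri_read_lines(fname, encoding='UTF-8')
unlink(fname)
}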
\seealso{
The official online manual of \pkg{stringi} at \url{https://stringi.gagolewski.com/}
Gagolewski M., \pkg{stringi}: Fast and portable character string processing in R, \emph{Journal of Statistical Software} 103(2), 2022, 1-59, \doi{10.18637/jss.v103.i02}
Other files:
\code{\link{stri_read_raw}()},
\code{\link{stri_write_lines}()}
}
\concept{files}
\author{
\href{https://www.gagolewski.com/}{Marek Gagolewski} and other contributors
}
stringi/man/stri_rand_lipsum.Rd 0000644 0001762 0000144 00000003363 14523011377 016361 0 ustar ligges users % Generated by roxygen2: do not edit by hand
% Please edit documentation in R/random.R
\name{stri_rand_lipsum}
\alias{stri_rand_lipsum}
\title{A Lorem Ipsum Generator}
\usage{
stri_rand_lipsum(n_paragraphs, start_lipsum = TRUE, nparagraphs = n_paragraphs)
}
\arguments{
\item{n_paragraphs}{single integer, number of paragraphs to generate}
\item{start_lipsum}{single logical value; should the resulting
text start with \emph{Lorem ipsum dolor sit amet}?}
\item{nparagraphs}{[DEPRECATED] alias of \code{n_paragraphs}}
}
\value{
Returns a character vector of length \code{n_paragraphs}.
}
\description{
Generates (pseudo)random \emph{lorem ipsum} text consisting
of a given number of text paragraphs.
}
\details{
\emph{Lorem ipsum} is a dummy text often used as a source
of data for string processing and displaying/lay-outing exercises.
The current implementation is very simple:
words are selected randomly from a Zipf distribution
(based on a set of ca. 190 predefined Latin words).
The number of words per sentence and sentences per paragraph
follows a discretized, truncated normal distribution.
No Markov chain modeling, just i.i.d. word selection.
}
\examples{
cat(sapply(
stri_wrap(stri_rand_lipsum(10), 80, simplify=FALSE),
stri_flatten, collapse='\n'), sep='\n\n')
cat(stri_rand_lipsum(10), sep='\n\n')
}
\seealso{
The official online manual of \pkg{stringi} at \url{https://stringi.gagolewski.com/}
Gagolewski M., \pkg{stringi}: Fast and portable character string processing in R, \emph{Journal of Statistical Software} 103(2), 2022, 1-59, \doi{10.18637/jss.v103.i02}
Other random:
\code{\link{stri_rand_shuffle}()},
\code{\link{stri_rand_strings}()}
}
\concept{random}
\author{
\href{https://www.gagolewski.com/}{Marek Gagolewski} and other contributors
}
stringi/man/stri_sort.Rd 0000644 0001762 0000144 00000005753 14365436502 015045 0 ustar ligges users % Generated by roxygen2: do not edit by hand
% Please edit documentation in R/sort.R
\name{stri_sort}
\alias{stri_sort}
\title{String Sorting}
\usage{
stri_sort(str, decreasing = FALSE, na_last = NA, ..., opts_collator = NULL)
}
\arguments{
\item{str}{a character vector}
\item{decreasing}{a single logical value; should the sort order
be nondecreasing (\code{FALSE}, default, i.e., weakly increasing)
or nonincreasing (\code{TRUE})?}
\item{na_last}{a single logical value; controls the treatment of \code{NA}s
in \code{str}. If \code{TRUE}, then missing values in \code{str} are put
at the end; if \code{FALSE}, they are put at the beginning;
if \code{NA}, then they are removed from the output}
\item{...}{additional settings for \code{opts_collator}}
\item{opts_collator}{a named list with \pkg{ICU} Collator's options,
see \code{\link{stri_opts_collator}}, \code{NULL}
for default collation options}
}
\value{
The result is a sorted version of \code{str},
i.e., a character vector.
}
\description{
This function sorts a character vector according to a locale-dependent
lexicographic order.
}
\details{
For more information on \pkg{ICU}'s Collator and how to tune it up
in \pkg{stringi}, refer to \code{\link{stri_opts_collator}}.
As usual in \pkg{stringi}, non-character inputs are coerced to strings,
see an example below for a somewhat non-intuitive behavior of lexicographic
sorting on numeric inputs.
This function uses a stable sort algorithm (\pkg{STL}'s \code{stable_sort}),
which performs up to \eqn{N*log^2(N)} element comparisons,
where \eqn{N} is the length of \code{str}.
}
\examples{
stri_sort(c('hladny', 'chladny'), locale='pl_PL')
stri_sort(c('hladny', 'chladny'), locale='sk_SK')
stri_sort(sample(LETTERS))
stri_sort(c(1, 100, 2, 101, 11, 10)) # lexicographic order
stri_sort(c(1, 100, 2, 101, 11, 10), numeric=TRUE) # OK for integers
stri_sort(c(0.25, 0.5, 1, -1, -2, -3), numeric=TRUE) # incorrect
}
\references{
\emph{Collation} - ICU User Guide,
\url{https://unicode-org.github.io/icu/userguide/collation/}
}
\seealso{
The official online manual of \pkg{stringi} at \url{https://stringi.gagolewski.com/}
Gagolewski M., \pkg{stringi}: Fast and portable character string processing in R, \emph{Journal of Statistical Software} 103(2), 2022, 1-59, \doi{10.18637/jss.v103.i02}
Other locale_sensitive:
\code{\link{\%s<\%}()},
\code{\link{about_locale}},
\code{\link{about_search_boundaries}},
\code{\link{about_search_coll}},
\code{\link{stri_compare}()},
\code{\link{stri_count_boundaries}()},
\code{\link{stri_duplicated}()},
\code{\link{stri_enc_detect2}()},
\code{\link{stri_extract_all_boundaries}()},
\code{\link{stri_locate_all_boundaries}()},
\code{\link{stri_opts_collator}()},
\code{\link{stri_order}()},
\code{\link{stri_rank}()},
\code{\link{stri_sort_key}()},
\code{\link{stri_split_boundaries}()},
\code{\link{stri_trans_tolower}()},
\code{\link{stri_unique}()},
\code{\link{stri_wrap}()}
}
\concept{locale_sensitive}
\author{
\href{https://www.gagolewski.com/}{Marek Gagolewski} and other contributors
}
stringi/man/stri_write_lines.Rd 0000644 0001762 0000144 00000002721 14523011376 016364 0 ustar ligges users % Generated by roxygen2: do not edit by hand
% Please edit documentation in R/files.R
\name{stri_write_lines}
\alias{stri_write_lines}
\title{Write Text Lines to a Text File}
\usage{
stri_write_lines(
str,
con,
encoding = "UTF-8",
sep = ifelse(.Platform$OS.type == "windows", "\\r\\n", "\\n"),
fname = con
)
}
\arguments{
\item{str}{character vector with data to write}
\item{con}{name of the output file or a connection object
(opened in the binary mode)}
\item{encoding}{output encoding, \code{NULL} or \code{''} for
the current default one}
\item{sep}{newline separator}
\item{fname}{[DEPRECATED] alias of \code{con}}
}
\value{
This function returns nothing noteworthy.
}
\description{
Writes a text file in such a way that each element of a given
character vector becomes a separate text line.
}
\details{
It is a substitute for the \R \code{\link{writeLines}} function,
with the ability to easily re-encode the output.
We suggest using the UTF-8 encoding for all text files:
thus, it is the default one for the output.
}
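\examples{
# An illustrative sketch (not part of the original manual):
fname <- tempfile(fileext='.txt')
stri_write_lines(c('spam', 'bacon'), fname)   # UTF-8 output by default
stri_read_lines(fname, encoding='UTF-8')
unlink(fname)
}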
\seealso{
The official online manual of \pkg{stringi} at \url{https://stringi.gagolewski.com/}
Gagolewski M., \pkg{stringi}: Fast and portable character string processing in R, \emph{Journal of Statistical Software} 103(2), 2022, 1-59, \doi{10.18637/jss.v103.i02}
Other files:
\code{\link{stri_read_lines}()},
\code{\link{stri_read_raw}()}
}
\concept{files}
\author{
\href{https://www.gagolewski.com/}{Marek Gagolewski} and other contributors
}
stringi/man/stri_replace.Rd 0000644 0001762 0000144 00000016450 14262507664 015471 0 ustar ligges users % Generated by roxygen2: do not edit by hand
% Please edit documentation in R/search_replace_4.R
\name{stri_replace_all}
\alias{stri_replace_all}
\alias{stri_replace_first}
\alias{stri_replace_last}
\alias{stri_replace}
\alias{stri_replace_all_charclass}
\alias{stri_replace_first_charclass}
\alias{stri_replace_last_charclass}
\alias{stri_replace_all_coll}
\alias{stri_replace_first_coll}
\alias{stri_replace_last_coll}
\alias{stri_replace_all_fixed}
\alias{stri_replace_first_fixed}
\alias{stri_replace_last_fixed}
\alias{stri_replace_all_regex}
\alias{stri_replace_first_regex}
\alias{stri_replace_last_regex}
\title{Replace Pattern Occurrences}
\usage{
stri_replace_all(str, replacement, ..., regex, fixed, coll, charclass)
stri_replace_first(str, replacement, ..., regex, fixed, coll, charclass)
stri_replace_last(str, replacement, ..., regex, fixed, coll, charclass)
stri_replace(
str,
replacement,
...,
regex,
fixed,
coll,
charclass,
mode = c("first", "all", "last")
)
stri_replace_all_charclass(
str,
pattern,
replacement,
merge = FALSE,
vectorize_all = TRUE,
vectorise_all = vectorize_all
)
stri_replace_first_charclass(str, pattern, replacement)
stri_replace_last_charclass(str, pattern, replacement)
stri_replace_all_coll(
str,
pattern,
replacement,
vectorize_all = TRUE,
vectorise_all = vectorize_all,
...,
opts_collator = NULL
)
stri_replace_first_coll(str, pattern, replacement, ..., opts_collator = NULL)
stri_replace_last_coll(str, pattern, replacement, ..., opts_collator = NULL)
stri_replace_all_fixed(
str,
pattern,
replacement,
vectorize_all = TRUE,
vectorise_all = vectorize_all,
...,
opts_fixed = NULL
)
stri_replace_first_fixed(str, pattern, replacement, ..., opts_fixed = NULL)
stri_replace_last_fixed(str, pattern, replacement, ..., opts_fixed = NULL)
stri_replace_all_regex(
str,
pattern,
replacement,
vectorize_all = TRUE,
vectorise_all = vectorize_all,
...,
opts_regex = NULL
)
stri_replace_first_regex(str, pattern, replacement, ..., opts_regex = NULL)
stri_replace_last_regex(str, pattern, replacement, ..., opts_regex = NULL)
}
\arguments{
\item{str}{character vector; strings to search in}
\item{replacement}{character vector with replacements for matched patterns}
\item{...}{supplementary arguments passed to the underlying functions,
including additional settings for \code{opts_collator}, \code{opts_regex},
\code{opts_fixed}, and so on}
\item{mode}{single string;
one of: \code{'first'} (the default), \code{'all'}, \code{'last'}}
\item{pattern, regex, fixed, coll, charclass}{character vector;
search patterns; for more details refer to \link{stringi-search}}
\item{merge}{single logical value;
should consecutive matches be merged into one string;
\code{stri_replace_all_charclass} only}
\item{vectorize_all}{single logical value;
should each occurrence of a pattern in every string
be replaced by a corresponding replacement string?;
\code{stri_replace_all_*} only}
\item{vectorise_all}{alias of \code{vectorize_all}}
\item{opts_collator, opts_fixed, opts_regex}{a named list used to tune up
the search engine's settings; see
\code{\link{stri_opts_collator}}, \code{\link{stri_opts_fixed}},
and \code{\link{stri_opts_regex}}, respectively; \code{NULL}
for the defaults}
}
\value{
All the functions return a character vector.
}
\description{
These functions replace, with the given replacement string, every/first/last
substring of the input that matches the specified \code{pattern}.
}
\details{
By default, all the functions are vectorized over
\code{str}, \code{pattern}, \code{replacement} (with recycling
of the elements in the shorter vector if necessary).
Input that is not part of any match is left unchanged;
each match is replaced in the result by the replacement string.
However, for \code{stri_replace_all*}, if \code{vectorize_all} is \code{FALSE},
then each substring matching any of the supplied \code{pattern}s
is replaced by a corresponding \code{replacement} string.
In such a case, the vectorization is over \code{str},
and - independently - over \code{pattern} and \code{replacement}.
In other words, this is equivalent to something like
\code{for (i in 1:npatterns) str <- stri_replace_all(str, pattern[i], replacement[i])}.
Note that you must set \code{length(pattern) >= length(replacement)}.
In case of \code{stri_replace_*_regex},
the replacement string may contain references to capture groups
(in round parentheses).
References are of the form \code{$n}, where \code{n} is the number
of the capture group (\code{$1} denotes the first group).
For the literal \code{$},
escape it with a backslash.
Moreover, references of the form \code{${name}} are used for named capture groups.
Note that \code{stri_replace_last_regex} searches from start to end,
but skips overlapping matches, see the example below.
\code{stri_replace}, \code{stri_replace_all}, \code{stri_replace_first},
and \code{stri_replace_last} are convenience functions; they just call
\code{stri_replace_*_*} variants, depending on the arguments used.
If you wish to remove white-spaces from the start or end
of a string, see \code{\link{stri_trim}}.
}
\examples{
stri_replace_all_charclass('aaaa', '[a]', 'b', merge=c(TRUE, FALSE))
stri_replace_all_charclass('a\nb\tc d', '\\\\p{WHITE_SPACE}', ' ')
stri_replace_all_charclass('a\nb\tc d', '\\\\p{WHITE_SPACE}', ' ', merge=TRUE)
s <- 'Lorem ipsum dolor sit amet, consectetur adipisicing elit.'
stri_replace_all_fixed(s, ' ', '#')
stri_replace_all_fixed(s, 'o', '0')
stri_replace_all_fixed(c('1', 'NULL', '3'), 'NULL', NA)
stri_replace_all_regex(s, ' .*? ', '#')
stri_replace_all_regex(s, '(el|s)it', '1234')
stri_replace_all_regex('abaca', 'a', c('!', '*'))
stri_replace_all_regex('123|456|789', '(\\\\p{N}).(\\\\p{N})', '$2-$1')
stri_replace_all_regex(c('stringi R', 'REXAMINE', '123'), '( R|R.)', ' r ')
# named capture groups are available since ICU 55
\dontrun{
stri_replace_all_regex('words 123 and numbers 456',
   '(?<numbers>[0-9]+)', '!${numbers}!')
}
# Compare the results:
stri_replace_all_fixed('The quick brown fox jumped over the lazy dog.',
c('quick', 'brown', 'fox'), c('slow', 'black', 'bear'), vectorize_all=TRUE)
stri_replace_all_fixed('The quick brown fox jumped over the lazy dog.',
c('quick', 'brown', 'fox'), c('slow', 'black', 'bear'), vectorize_all=FALSE)
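# A rough sketch of the loop equivalence described in Details
# (illustrative; assumes pattern and replacement are of equal length):
x <- 'The quick brown fox jumped over the lazy dog.'
for (i in 1:3)
    x <- stri_replace_all_fixed(x, c('quick', 'brown', 'fox')[i],
        c('slow', 'black', 'bear')[i])
x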
# Compare the results:
stri_replace_all_fixed('The quicker brown fox jumped over the lazy dog.',
c('quick', 'brown', 'fox'), c('slow', 'black', 'bear'), vectorize_all=FALSE)
stri_replace_all_regex('The quicker brown fox jumped over the lazy dog.',
'\\\\b'\%s+\%c('quick', 'brown', 'fox')\%s+\%'\\\\b', c('slow', 'black', 'bear'), vectorize_all=FALSE)
# Searching for the last occurrence:
# Note the difference - regex searches left to right, with no overlaps.
stri_replace_last_fixed("agAGA", "aga", "*", case_insensitive=TRUE)
stri_replace_last_regex("agAGA", "aga", "*", case_insensitive=TRUE)
}
\seealso{
The official online manual of \pkg{stringi} at \url{https://stringi.gagolewski.com/}
Gagolewski M., \pkg{stringi}: Fast and portable character string processing in R, \emph{Journal of Statistical Software} 103(2), 2022, 1-59, \doi{10.18637/jss.v103.i02}
Other search_replace:
\code{\link{about_search}},
\code{\link{stri_replace_rstr}()},
\code{\link{stri_trim_both}()}
}
\concept{search_replace}
\author{
\href{https://www.gagolewski.com/}{Marek Gagolewski} and other contributors
}
stringi/man/stri_trans_list.Rd 0000644 0001762 0000144 00000002117 14262507664 016233 0 ustar ligges users % Generated by roxygen2: do not edit by hand
% Please edit documentation in R/trans_transliterate.R
\name{stri_trans_list}
\alias{stri_trans_list}
\title{List Available Text Transforms and Transliterators}
\usage{
stri_trans_list()
}
\value{
Returns a character vector.
}
\description{
Returns a list of available text transform identifiers.
Each of them may be used in \code{\link{stri_trans_general}}
tasks.
}
\examples{
stri_trans_list()
}
\references{
\emph{General Transforms} -- ICU User Guide,
\url{https://unicode-org.github.io/icu/userguide/transforms/general/}
}
\seealso{
The official online manual of \pkg{stringi} at \url{https://stringi.gagolewski.com/}
Gagolewski M., \pkg{stringi}: Fast and portable character string processing in R, \emph{Journal of Statistical Software} 103(2), 2022, 1-59, \doi{10.18637/jss.v103.i02}
Other transform:
\code{\link{stri_trans_char}()},
\code{\link{stri_trans_general}()},
\code{\link{stri_trans_nfc}()},
\code{\link{stri_trans_tolower}()}
}
\concept{transform}
\author{
\href{https://www.gagolewski.com/}{Marek Gagolewski} and other contributors
}
stringi/man/stri_count.Rd 0000644 0001762 0000144 00000006146 14262507664 015207 0 ustar ligges users % Generated by roxygen2: do not edit by hand
% Please edit documentation in R/search_count_4.R
\name{stri_count}
\alias{stri_count}
\alias{stri_count_charclass}
\alias{stri_count_coll}
\alias{stri_count_fixed}
\alias{stri_count_regex}
\title{Count the Number of Pattern Occurrences}
\usage{
stri_count(str, ..., regex, fixed, coll, charclass)
stri_count_charclass(str, pattern)
stri_count_coll(str, pattern, ..., opts_collator = NULL)
stri_count_fixed(str, pattern, ..., opts_fixed = NULL)
stri_count_regex(str, pattern, ..., opts_regex = NULL)
}
\arguments{
\item{str}{character vector; strings to search in}
\item{...}{supplementary arguments passed to the underlying functions,
including additional settings for \code{opts_collator}, \code{opts_regex},
\code{opts_fixed}, and so on}
\item{pattern, regex, fixed, coll, charclass}{character vector;
search patterns; for more details refer to \link{stringi-search}}
\item{opts_collator, opts_fixed, opts_regex}{a named list used to tune up
the search engine's settings; see
\code{\link{stri_opts_collator}}, \code{\link{stri_opts_fixed}},
and \code{\link{stri_opts_regex}}, respectively; \code{NULL}
for the defaults}
}
\value{
All the functions return an integer vector.
}
\description{
These functions count the number of occurrences
of a pattern in a string.
}
\details{
Vectorized over \code{str} and \code{pattern} (with recycling
of the elements in the shorter vector if necessary). This allows one to,
for instance, search for one pattern in each given string,
search for each pattern in one given string,
and search for the i-th pattern within the i-th string.
If \code{pattern} is empty, then the result is \code{NA}
and a warning is generated.
\code{stri_count} is a convenience function.
It calls either \code{stri_count_regex},
\code{stri_count_fixed}, \code{stri_count_coll},
or \code{stri_count_charclass}, depending on the argument used.
}
\examples{
s <- 'Lorem ipsum dolor sit amet, consectetur adipisicing elit.'
stri_count(s, fixed='dolor')
stri_count(s, regex='\\\\p{L}+')
stri_count_fixed(s, ' ')
stri_count_fixed(s, 'o')
stri_count_fixed(s, 'it')
stri_count_fixed(s, letters)
stri_count_fixed('babab', 'b')
stri_count_fixed(c('stringi', '123'), 'string')
stri_count_charclass(c('stRRRingi', 'STrrrINGI', '123'),
c('\\\\p{Ll}', '\\\\p{Lu}', '\\\\p{Zs}'))
stri_count_charclass(' \t\n', '\\\\p{WHITE_SPACE}') # white space - binary property
stri_count_charclass(' \t\n', '\\\\p{Z}') # white-space - general category (note the difference)
stri_count_regex(s, '(s|el)it')
stri_count_regex(s, 'i.i')
stri_count_regex(s, '.it')
stri_count_regex('bab baab baaab', c('b.*?b', 'b.b'))
stri_count_regex(c('stringi', '123'), '^(s|1)')
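# Recycling: the i-th pattern is sought in the i-th string (see Details; illustrative):
stri_count_fixed(c('abcabc', '123123123'), c('abc', '123'))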
}
\seealso{
The official online manual of \pkg{stringi} at \url{https://stringi.gagolewski.com/}
Gagolewski M., \pkg{stringi}: Fast and portable character string processing in R, \emph{Journal of Statistical Software} 103(2), 2022, 1-59, \doi{10.18637/jss.v103.i02}
Other search_count:
\code{\link{about_search}},
\code{\link{stri_count_boundaries}()}
}
\concept{search_count}
\author{
\href{https://www.gagolewski.com/}{Marek Gagolewski} and other contributors
}
stringi/man/stri_extract.Rd 0000644 0001762 0000144 00000016203 14262507664 015524 0 ustar ligges users % Generated by roxygen2: do not edit by hand
% Please edit documentation in R/search_extract_4.R
\name{stri_extract_all}
\alias{stri_extract_all}
\alias{stri_extract_first}
\alias{stri_extract_last}
\alias{stri_extract}
\alias{stri_extract_all_charclass}
\alias{stri_extract_first_charclass}
\alias{stri_extract_last_charclass}
\alias{stri_extract_all_coll}
\alias{stri_extract_first_coll}
\alias{stri_extract_last_coll}
\alias{stri_extract_all_regex}
\alias{stri_extract_first_regex}
\alias{stri_extract_last_regex}
\alias{stri_extract_all_fixed}
\alias{stri_extract_first_fixed}
\alias{stri_extract_last_fixed}
\title{Extract Pattern Occurrences}
\usage{
stri_extract_all(str, ..., regex, fixed, coll, charclass)
stri_extract_first(str, ..., regex, fixed, coll, charclass)
stri_extract_last(str, ..., regex, fixed, coll, charclass)
stri_extract(
str,
...,
regex,
fixed,
coll,
charclass,
mode = c("first", "all", "last")
)
stri_extract_all_charclass(
str,
pattern,
merge = TRUE,
simplify = FALSE,
omit_no_match = FALSE
)
stri_extract_first_charclass(str, pattern)
stri_extract_last_charclass(str, pattern)
stri_extract_all_coll(
str,
pattern,
simplify = FALSE,
omit_no_match = FALSE,
...,
opts_collator = NULL
)
stri_extract_first_coll(str, pattern, ..., opts_collator = NULL)
stri_extract_last_coll(str, pattern, ..., opts_collator = NULL)
stri_extract_all_regex(
str,
pattern,
simplify = FALSE,
omit_no_match = FALSE,
...,
opts_regex = NULL
)
stri_extract_first_regex(str, pattern, ..., opts_regex = NULL)
stri_extract_last_regex(str, pattern, ..., opts_regex = NULL)
stri_extract_all_fixed(
str,
pattern,
simplify = FALSE,
omit_no_match = FALSE,
...,
opts_fixed = NULL
)
stri_extract_first_fixed(str, pattern, ..., opts_fixed = NULL)
stri_extract_last_fixed(str, pattern, ..., opts_fixed = NULL)
}
\arguments{
\item{str}{character vector; strings to search in}
\item{...}{supplementary arguments passed to the underlying functions,
including additional settings for \code{opts_collator}, \code{opts_regex},
and so on}
\item{mode}{single string;
one of: \code{'first'} (the default), \code{'all'}, \code{'last'}}
\item{pattern, regex, fixed, coll, charclass}{character vector;
search patterns; for more details refer to \link{stringi-search}}
\item{merge}{single logical value; indicates whether consecutive pattern
matches will be merged into one string;
\code{stri_extract_all_charclass} only}
\item{simplify}{single logical value;
if \code{TRUE} or \code{NA}, then a character matrix is returned;
otherwise (the default), a list of character vectors is given, see Value;
\code{stri_extract_all_*} only}
\item{omit_no_match}{single logical value; if \code{FALSE},
then a missing value will indicate that there was no match;
\code{stri_extract_all_*} only}
\item{opts_collator, opts_fixed, opts_regex}{a named list to tune up
the search engine's settings; see \code{\link{stri_opts_collator}},
\code{\link{stri_opts_fixed}}, and \code{\link{stri_opts_regex}},
respectively; \code{NULL} for the defaults}
}
\value{
For \code{stri_extract_all*}, if \code{simplify=FALSE} (the default), then
a list of character vectors is returned. Each list element
represents the results of a different search scenario.
If a pattern is not found and \code{omit_no_match=FALSE},
then a character vector of length 1
with a single \code{NA} value will be generated.
Otherwise, i.e., if \code{simplify} is not \code{FALSE},
then \code{\link{stri_list2matrix}} with \code{byrow=TRUE} argument
is called on the resulting object.
In such a case, the function yields a character matrix with an appropriate
number of rows (according to the length of \code{str}, \code{pattern}, etc.).
Note that \code{\link{stri_list2matrix}}'s \code{fill} argument is set
either to an empty string or \code{NA}, depending on
whether \code{simplify} is \code{TRUE} or \code{NA}, respectively.
\code{stri_extract_first*} and \code{stri_extract_last*}
return a character vector. A \code{NA} element indicates a no-match.
Note that \code{stri_extract_last_regex} searches from start to end,
but skips overlapping matches, see the example below.
}
\description{
These functions extract all substrings matching a given pattern.
\code{stri_extract_all_*} extracts all the matches.
\code{stri_extract_first_*} and \code{stri_extract_last_*}
yield the first or the last matches, respectively.
}
\details{
Vectorized over \code{str} and \code{pattern} (with recycling
of the elements in the shorter vector if necessary). This allows one to,
for instance, search for one pattern in each given string,
search for each pattern in one given string,
and search for the i-th pattern within the i-th string.
Check out \code{\link{stri_match}} for the extraction of matches
to individual regex capture groups.
\code{stri_extract}, \code{stri_extract_all}, \code{stri_extract_first},
and \code{stri_extract_last} are convenience functions.
They merely call \code{stri_extract_*_*}, depending on the arguments used.
}
\examples{
stri_extract_all('XaaaaX', regex=c('\\\\p{Ll}', '\\\\p{Ll}+', '\\\\p{Ll}{2,3}', '\\\\p{Ll}{2,3}?'))
stri_extract_all('Bartolini', coll='i')
stri_extract_all('stringi is so good!', charclass='\\\\p{Zs}') # all white-spaces
stri_extract_all_charclass(c('AbcdeFgHijK', 'abc', 'ABC'), '\\\\p{Ll}')
stri_extract_all_charclass(c('AbcdeFgHijK', 'abc', 'ABC'), '\\\\p{Ll}', merge=FALSE)
stri_extract_first_charclass('AaBbCc', '\\\\p{Ll}')
stri_extract_last_charclass('AaBbCc', '\\\\p{Ll}')
\dontrun{
# emoji support available since ICU 57
stri_extract_all_charclass(stri_enc_fromutf32(32:55200), '\\\\p{EMOJI}')
}
stri_extract_all_coll(c('AaaaaaaA', 'AAAA'), 'a')
stri_extract_first_coll(c('Yy\u00FD', 'AAA'), 'y', strength=2, locale='sk_SK')
stri_extract_last_coll(c('Yy\u00FD', 'AAA'), 'y', strength=1, locale='sk_SK')
stri_extract_all_regex('XaaaaX', c('\\\\p{Ll}', '\\\\p{Ll}+', '\\\\p{Ll}{2,3}', '\\\\p{Ll}{2,3}?'))
stri_extract_first_regex('XaaaaX', c('\\\\p{Ll}', '\\\\p{Ll}+', '\\\\p{Ll}{2,3}', '\\\\p{Ll}{2,3}?'))
stri_extract_last_regex('XaaaaX', c('\\\\p{Ll}', '\\\\p{Ll}+', '\\\\p{Ll}{2,3}', '\\\\p{Ll}{2,3}?'))
stri_list2matrix(stri_extract_all_regex('XaaaaX', c('\\\\p{Ll}', '\\\\p{Ll}+')))
stri_extract_all_regex('XaaaaX', c('\\\\p{Ll}', '\\\\p{Ll}+'), simplify=TRUE)
stri_extract_all_regex('XaaaaX', c('\\\\p{Ll}', '\\\\p{Ll}+'), simplify=NA)
stri_extract_all_fixed('abaBAba', 'Aba', case_insensitive=TRUE)
stri_extract_all_fixed('abaBAba', 'Aba', case_insensitive=TRUE, overlap=TRUE)
# Searching for the last occurrence:
# Note the difference - regex searches left to right, with no overlaps.
stri_extract_last_fixed("agAGA", "aga", case_insensitive=TRUE)
stri_extract_last_regex("agAGA", "aga", case_insensitive=TRUE)
}
\seealso{
The official online manual of \pkg{stringi} at \url{https://stringi.gagolewski.com/}
Gagolewski M., \pkg{stringi}: Fast and portable character string processing in R, \emph{Journal of Statistical Software} 103(2), 2022, 1-59, \doi{10.18637/jss.v103.i02}
Other search_extract:
\code{\link{about_search}},
\code{\link{stri_extract_all_boundaries}()},
\code{\link{stri_match_all}()}
}
\concept{search_extract}
\author{
\href{https://www.gagolewski.com/}{Marek Gagolewski} and other contributors
}
stringi/man/stri_split.Rd 0000644 0001762 0000144 00000014024 14262507664 015204 0 ustar ligges users % Generated by roxygen2: do not edit by hand
% Please edit documentation in R/search_split_4.R
\name{stri_split}
\alias{stri_split}
\alias{stri_split_fixed}
\alias{stri_split_regex}
\alias{stri_split_coll}
\alias{stri_split_charclass}
\title{Split a String By Pattern Matches}
\usage{
stri_split(str, ..., regex, fixed, coll, charclass)
stri_split_fixed(
str,
pattern,
n = -1L,
omit_empty = FALSE,
tokens_only = FALSE,
simplify = FALSE,
...,
opts_fixed = NULL
)
stri_split_regex(
str,
pattern,
n = -1L,
omit_empty = FALSE,
tokens_only = FALSE,
simplify = FALSE,
...,
opts_regex = NULL
)
stri_split_coll(
str,
pattern,
n = -1L,
omit_empty = FALSE,
tokens_only = FALSE,
simplify = FALSE,
...,
opts_collator = NULL
)
stri_split_charclass(
str,
pattern,
n = -1L,
omit_empty = FALSE,
tokens_only = FALSE,
simplify = FALSE
)
}
\arguments{
\item{str}{character vector; strings to search in}
\item{...}{supplementary arguments passed to the underlying functions,
including additional settings for \code{opts_collator}, \code{opts_regex},
\code{opts_fixed}, and so on}
\item{pattern, regex, fixed, coll, charclass}{character vector;
search patterns; for more details refer to \link{stringi-search}}
\item{n}{integer vector, maximal number of strings to return,
and, at the same time, maximal number of text boundaries to look for}
\item{omit_empty}{logical vector; determines whether empty
tokens should be removed from the result (\code{TRUE} or \code{FALSE})
or replaced with \code{NA}s (\code{NA})}
\item{tokens_only}{single logical value;
may affect the result if \code{n} is positive, see Details}
\item{simplify}{single logical value;
if \code{TRUE} or \code{NA}, then a character matrix is returned;
otherwise (the default), a list of character vectors is given, see Value}
\item{opts_collator, opts_fixed, opts_regex}{a named list used to tune up
the search engine's settings; see
\code{\link{stri_opts_collator}}, \code{\link{stri_opts_fixed}},
and \code{\link{stri_opts_regex}}, respectively; \code{NULL}
for the defaults}
}
\value{
If \code{simplify=FALSE} (the default),
then the functions return a list of character vectors.
Otherwise, \code{\link{stri_list2matrix}} with \code{byrow=TRUE}
and \code{n_min=n} arguments is called on the resulting object.
In such a case, a character matrix with an appropriate number of rows
(according to the length of \code{str}, \code{pattern}, etc.)
is returned. Note that \code{\link{stri_list2matrix}}'s \code{fill} argument
is set to an empty string and \code{NA}, for \code{simplify} equal to
\code{TRUE} and \code{NA}, respectively.
}
\description{
These functions split each element in \code{str} into substrings.
\code{pattern} defines the delimiters that separate the inputs into tokens.
The input data between the matches become the fields themselves.
}
\details{
Vectorized over \code{str}, \code{pattern}, \code{n}, and \code{omit_empty}
(with recycling of the elements in the shorter vector if necessary).
If \code{n} is negative, then all pieces are extracted.
Otherwise, if \code{tokens_only} is \code{FALSE} (which is the default),
then \code{n-1} tokens are extracted (if possible) and the \code{n}-th string
gives the remainder (see Examples).
On the other hand, if \code{tokens_only} is \code{TRUE},
then only full tokens (up to \code{n} pieces) are extracted.
\code{omit_empty} is applied during the split process: if it is set to
\code{TRUE}, then tokens of zero length are ignored. Thus, empty strings
will never appear in the resulting vector. On the other hand, if
\code{omit_empty} is \code{NA}, then empty tokens are substituted with
missing strings.
Empty search patterns are not supported. If you wish to split a
string into individual characters, use, e.g.,
\code{\link{stri_split_boundaries}(str, type='character')} for THE Unicode way.
\code{stri_split} is a convenience function. It calls either
\code{stri_split_regex}, \code{stri_split_fixed}, \code{stri_split_coll},
or \code{stri_split_charclass}, depending on the argument used.
}
\examples{
stri_split_fixed('a_b_c_d', '_')
stri_split_fixed('a_b_c__d', '_')
stri_split_fixed('a_b_c__d', '_', omit_empty=TRUE)
stri_split_fixed('a_b_c__d', '_', n=2, tokens_only=FALSE) # 'a' & remainder
stri_split_fixed('a_b_c__d', '_', n=2, tokens_only=TRUE) # 'a' & 'b' only
stri_split_fixed('a_b_c__d', '_', n=4, omit_empty=TRUE, tokens_only=TRUE)
stri_split_fixed('a_b_c__d', '_', n=4, omit_empty=FALSE, tokens_only=TRUE)
stri_split_fixed('a_b_c__d', '_', omit_empty=NA)
stri_split_fixed(c('ab_c', 'd_ef_g', 'h', ''), '_', n=1, tokens_only=TRUE, omit_empty=TRUE)
stri_split_fixed(c('ab_c', 'd_ef_g', 'h', ''), '_', n=2, tokens_only=TRUE, omit_empty=TRUE)
stri_split_fixed(c('ab_c', 'd_ef_g', 'h', ''), '_', n=3, tokens_only=TRUE, omit_empty=TRUE)
stri_list2matrix(stri_split_fixed(c('ab,c', 'd,ef,g', ',h', ''), ',', omit_empty=TRUE))
stri_split_fixed(c('ab,c', 'd,ef,g', ',h', ''), ',', omit_empty=FALSE, simplify=TRUE)
stri_split_fixed(c('ab,c', 'd,ef,g', ',h', ''), ',', omit_empty=NA, simplify=TRUE)
stri_split_fixed(c('ab,c', 'd,ef,g', ',h', ''), ',', omit_empty=TRUE, simplify=TRUE)
stri_split_fixed(c('ab,c', 'd,ef,g', ',h', ''), ',', omit_empty=NA, simplify=NA)
stri_split_regex(c('ab,c', 'd,ef , g', ', h', ''),
'\\\\p{WHITE_SPACE}*,\\\\p{WHITE_SPACE}*', omit_empty=NA, simplify=TRUE)
stri_split_charclass('Lorem ipsum dolor sit amet', '\\\\p{WHITE_SPACE}')
stri_split_charclass(' Lorem ipsum dolor', '\\\\p{WHITE_SPACE}', n=3,
omit_empty=c(FALSE, TRUE))
stri_split_regex('Lorem ipsum dolor sit amet',
'\\\\p{Z}+') # see also stri_split_charclass
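# The convenience dispatcher described in Details (illustrative):
stri_split('a,b;c', regex='[,;]')
stri_split('a,b;c', fixed=',')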
}
\seealso{
The official online manual of \pkg{stringi} at \url{https://stringi.gagolewski.com/}
Gagolewski M., \pkg{stringi}: Fast and portable character string processing in R, \emph{Journal of Statistical Software} 103(2), 2022, 1-59, \doi{10.18637/jss.v103.i02}
Other search_split:
\code{\link{about_search}},
\code{\link{stri_split_boundaries}()},
\code{\link{stri_split_lines}()}
}
\concept{search_split}
\author{
\href{https://www.gagolewski.com/}{Marek Gagolewski} and other contributors
}
stringi/man/stri_join_list.Rd 0000644 0001762 0000144 00000004251 14262507664 016044 0 ustar ligges users % Generated by roxygen2: do not edit by hand
% Please edit documentation in R/join.R
\name{stri_join_list}
\alias{stri_join_list}
\alias{stri_c_list}
\alias{stri_paste_list}
\title{Concatenate Strings in a List}
\usage{
stri_join_list(x, sep = "", collapse = NULL)
stri_c_list(x, sep = "", collapse = NULL)
stri_paste_list(x, sep = "", collapse = NULL)
}
\arguments{
\item{x}{a list consisting of character vectors}
\item{sep}{a single string; separates strings in each of the character
vectors in \code{x}}
\item{collapse}{a single string or \code{NULL}; an optional
results separator}
}
\value{
Returns a character vector.
}
\description{
These functions concatenate all the strings in each character vector
in a given list.
\code{stri_c_list} and \code{stri_paste_list} are aliases for
\code{stri_join_list}.
}
\details{
Unless \code{collapse} is \code{NULL}, the result will be a single string.
Otherwise, you get a character vector of length equal
to the length of \code{x}.
Vectors in \code{x} of length 0 are silently ignored.
If \code{collapse} or \code{sep} has length greater than 1,
then only the first string will be used.
}
\examples{
stri_join_list(
stri_extract_all_words(c('Lorem ipsum dolor sit amet.',
'Spam spam bacon sausage and spam.')),
sep=', ')
stri_join_list(
stri_extract_all_words(c('Lorem ipsum dolor sit amet.',
'Spam spam bacon sausage and spam.')),
sep=', ', collapse='. ')
stri_join_list(
stri_extract_all_regex(
c('spam spam bacon', '123 456', 'spam 789 sausage'), '\\\\p{L}+'
),
sep=',')
stri_join_list(
stri_extract_all_regex(
c('spam spam bacon', '123 456', 'spam 789 sausage'), '\\\\p{L}+',
omit_no_match=TRUE
),
sep=',', collapse='; ')
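# Zero-length vectors are silently ignored (see Details; illustrative):
stri_join_list(list(character(0), c('spam', 'bacon')), sep='+', collapse='; ')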
}
\seealso{
The official online manual of \pkg{stringi} at \url{https://stringi.gagolewski.com/}
Gagolewski M., \pkg{stringi}: Fast and portable character string processing in R, \emph{Journal of Statistical Software} 103(2), 2022, 1-59, \doi{10.18637/jss.v103.i02}
Other join:
\code{\link{\%s+\%}()},
\code{\link{stri_dup}()},
\code{\link{stri_flatten}()},
\code{\link{stri_join}()}
}
\concept{join}
\author{
\href{https://www.gagolewski.com/}{Marek Gagolewski} and other contributors
}
stringi/man/stri_compare.Rd 0000644 0001762 0000144 00000012772 14262507664 015507 0 ustar ligges users % Generated by roxygen2: do not edit by hand
% Please edit documentation in R/compare.R
\name{stri_compare}
\alias{stri_compare}
\alias{stri_cmp}
\alias{stri_cmp_eq}
\alias{stri_cmp_neq}
\alias{stri_cmp_equiv}
\alias{stri_cmp_nequiv}
\alias{stri_cmp_lt}
\alias{stri_cmp_gt}
\alias{stri_cmp_le}
\alias{stri_cmp_ge}
\title{Compare Strings with or without Collation}
\usage{
stri_compare(e1, e2, ..., opts_collator = NULL)
stri_cmp(e1, e2, ..., opts_collator = NULL)
stri_cmp_eq(e1, e2)
stri_cmp_neq(e1, e2)
stri_cmp_equiv(e1, e2, ..., opts_collator = NULL)
stri_cmp_nequiv(e1, e2, ..., opts_collator = NULL)
stri_cmp_lt(e1, e2, ..., opts_collator = NULL)
stri_cmp_gt(e1, e2, ..., opts_collator = NULL)
stri_cmp_le(e1, e2, ..., opts_collator = NULL)
stri_cmp_ge(e1, e2, ..., opts_collator = NULL)
}
\arguments{
\item{e1, e2}{character vectors or objects coercible to character vectors}
\item{...}{additional settings for \code{opts_collator}}
\item{opts_collator}{a named list with \pkg{ICU} Collator's options,
see \code{\link{stri_opts_collator}}, \code{NULL}
for the default collation options.}
}
\value{
The \code{stri_cmp} and \code{stri_compare} functions
return an integer vector representing the comparison results:
\code{-1} if \code{e1[...] < e2[...]},
\code{0} if they are canonically equivalent, and \code{1} if greater.
All the other functions return a logical vector that indicates
whether a given relation holds between two corresponding elements
in \code{e1} and \code{e2}.
}
\description{
These functions may be used to determine if two strings
are equal, canonically equivalent (this is performed in a much more clever
fashion than when testing for equality), or to check whether they are in
a specific lexicographic order.
}
\details{
All the functions listed here are vectorized over \code{e1} and \code{e2}.
\code{stri_cmp_eq} tests whether two corresponding strings
consist of exactly the same code points, while \code{stri_cmp_neq} allows
to check whether there is any difference between them. These are
locale-independent operations: for natural language processing,
where the notion of canonical equivalence is more valid, this might
not be exactly what you are looking for, see Examples.
Please note that \pkg{stringi} always silently removes UTF-8
BOMs from input strings, therefore, e.g., \code{stri_cmp_eq} does not take
BOMs into account while comparing strings.
\code{stri_cmp_equiv} tests for canonical equivalence of two strings
and is locale-dependent. Additionally, the \pkg{ICU}'s Collator may be
tuned up so that, e.g., the comparison is case-insensitive.
To test whether two strings are not canonically equivalent,
call \code{stri_cmp_nequiv}.
\code{stri_cmp_le} tests whether
the elements in the first vector are less than or equal to
the corresponding elements in the second vector,
\code{stri_cmp_ge} tests whether they are greater or equal,
\code{stri_cmp_lt} if less, and \code{stri_cmp_gt} if greater,
see also, e.g., \code{\link{\%s<\%}}.
\code{stri_compare} is an alias to \code{stri_cmp}. They both
perform exactly the same locale-dependent operation.
Both functions provide a C library's \code{strcmp()} look-and-feel,
see Value for details.
For more information on \pkg{ICU}'s Collator and how to tune its settings
refer to \code{\link{stri_opts_collator}}.
Note that different locale settings may lead to different results
(see the examples below).
}
\examples{
# in Polish, ch < h:
stri_cmp_lt('hladny', 'chladny', locale='pl_PL')
# in Slovak, ch > h:
stri_cmp_lt('hladny', 'chladny', locale='sk_SK')
# < or > (depends on locale):
stri_cmp('hladny', 'chladny')
# ignore case differences:
stri_cmp_equiv('hladny', 'HLADNY', strength=2)
# also ignore diacritical differences:
stri_cmp_equiv('hladn\u00FD', 'hladny', strength=1, locale='sk_SK')
marios <- c('Mario', 'mario', 'M\\\\u00e1rio', 'm\\\\u00e1rio')
stri_cmp_equiv(marios, 'mario', case_level=TRUE, strength=2L)
stri_cmp_equiv(marios, 'mario', case_level=TRUE, strength=1L)
stri_cmp_equiv(marios, 'mario', strength=1L)
stri_cmp_equiv(marios, 'mario', strength=2L)
# non-Unicode-normalized vs normalized string:
stri_cmp_equiv(stri_trans_nfkd('\u0105'), '\u105')
# note the difference:
stri_cmp_eq(stri_trans_nfkd('\u0105'), '\u105')
# ligatures:
stri_cmp_equiv('\ufb00', 'ff', strength=2)
# phonebook collation
stri_cmp_equiv('G\u00e4rtner', 'Gaertner', locale='de_DE@collation=phonebook', strength=1L)
stri_cmp_equiv('G\u00e4rtner', 'Gaertner', locale='de_DE', strength=1L)
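# strcmp()-like return values described in Value (illustrative):
stri_cmp(c('a', 'b', 'c'), 'b')  # -1, 0, 1 under the default collation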
}
\references{
\emph{Collation} -- ICU User Guide,
\url{https://unicode-org.github.io/icu/userguide/collation/}
}
\seealso{
The official online manual of \pkg{stringi} at \url{https://stringi.gagolewski.com/}
Gagolewski M., \pkg{stringi}: Fast and portable character string processing in R, \emph{Journal of Statistical Software} 103(2), 2022, 1-59, \doi{10.18637/jss.v103.i02}
Other locale_sensitive:
\code{\link{\%s<\%}()},
\code{\link{about_locale}},
\code{\link{about_search_boundaries}},
\code{\link{about_search_coll}},
\code{\link{stri_count_boundaries}()},
\code{\link{stri_duplicated}()},
\code{\link{stri_enc_detect2}()},
\code{\link{stri_extract_all_boundaries}()},
\code{\link{stri_locate_all_boundaries}()},
\code{\link{stri_opts_collator}()},
\code{\link{stri_order}()},
\code{\link{stri_rank}()},
\code{\link{stri_sort_key}()},
\code{\link{stri_sort}()},
\code{\link{stri_split_boundaries}()},
\code{\link{stri_trans_tolower}()},
\code{\link{stri_unique}()},
\code{\link{stri_wrap}()}
}
\concept{locale_sensitive}
\author{
\href{https://www.gagolewski.com/}{Marek Gagolewski} and other contributors
}
stringi/man/stri_datetime_symbols.Rd 0000644 0001762 0000144 00000005631 14262507664 017421 0 ustar ligges users % Generated by roxygen2: do not edit by hand
% Please edit documentation in R/time_symbols.R
\name{stri_datetime_symbols}
\alias{stri_datetime_symbols}
\title{List Localizable Date-Time Formatting Data}
\usage{
stri_datetime_symbols(locale = NULL, context = "standalone", width = "wide")
}
\arguments{
\item{locale}{\code{NULL} or \code{''} for default locale,
or a single string with locale identifier}
\item{context}{single string; one of: \code{'format'}, \code{'standalone'}}
\item{width}{single string; one of: \code{'abbreviated'}, \code{'wide'}, \code{'narrow'}}
}
\value{
Returns a list with the following named components:
\enumerate{
\item \code{Month} - month names,
\item \code{Weekday} - weekday names,
\item \code{Quarter} - quarter names,
\item \code{AmPm} - AM/PM names,
\item \code{Era} - era names.
}
}
\description{
Returns a list of all localizable date-time formatting data,
including month and weekday names, localized AM/PM strings, etc.
}
\details{
\code{context} selects the date formatting context,
and \code{width} selects the date formatting width.
}
\examples{
stri_datetime_symbols() # uses the Gregorian calendar in most locales
stri_datetime_symbols('@calendar=hebrew')
stri_datetime_symbols('he_IL@calendar=hebrew')
stri_datetime_symbols('@calendar=islamic')
stri_datetime_symbols('@calendar=persian')
stri_datetime_symbols('@calendar=indian')
stri_datetime_symbols('@calendar=coptic')
stri_datetime_symbols('@calendar=japanese')
stri_datetime_symbols('ja_JP_TRADITIONAL') # uses the Japanese calendar by default
stri_datetime_symbols('th_TH_TRADITIONAL') # uses the Buddhist calendar
stri_datetime_symbols('pl_PL', context='format')
stri_datetime_symbols('pl_PL', context='standalone')
stri_datetime_symbols(width='wide')
stri_datetime_symbols(width='abbreviated')
stri_datetime_symbols(width='narrow')
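# Accessing single components of the returned list (see Value; illustrative):
stri_datetime_symbols('en_US')$Month
stri_datetime_symbols('pl_PL', width='abbreviated')$Weekday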
}
\references{
\emph{Calendar} - ICU User Guide,
\url{https://unicode-org.github.io/icu/userguide/datetime/calendar/}
\emph{DateFormatSymbols} class -- ICU API Documentation,
\url{https://unicode-org.github.io/icu-docs/apidoc/dev/icu4c/classicu_1_1DateFormatSymbols.html}
\emph{Formatting Dates and Times} -- ICU User Guide,
\url{https://unicode-org.github.io/icu/userguide/format_parse/datetime/}
}
\seealso{
The official online manual of \pkg{stringi} at \url{https://stringi.gagolewski.com/}
Gagolewski M., \pkg{stringi}: Fast and portable character string processing in R, \emph{Journal of Statistical Software} 103(2), 2022, 1-59, \doi{10.18637/jss.v103.i02}
Other datetime:
\code{\link{stri_datetime_add}()},
\code{\link{stri_datetime_create}()},
\code{\link{stri_datetime_fields}()},
\code{\link{stri_datetime_format}()},
\code{\link{stri_datetime_fstr}()},
\code{\link{stri_datetime_now}()},
\code{\link{stri_timezone_get}()},
\code{\link{stri_timezone_info}()},
\code{\link{stri_timezone_list}()}
}
\concept{datetime}
\author{
\href{https://www.gagolewski.com/}{Marek Gagolewski} and other contributors
}
stringi/man/stri_reverse.Rd 0000644 0001762 0000144 00000002111 14262507664 015516 0 ustar ligges users % Generated by roxygen2: do not edit by hand
% Please edit documentation in R/reverse.R
\name{stri_reverse}
\alias{stri_reverse}
\title{Reverse Each String}
\usage{
stri_reverse(str)
}
\arguments{
\item{str}{character vector}
}
\value{
Returns a character vector.
}
\description{
Reverses the order of the code points in every string.
}
\details{
Note that this operation may result in non-Unicode-normalized
strings and may give peculiar outputs for bidirectional strings.
See also \code{\link{stri_rand_shuffle}} for a random permutation
of code points.
}
\examples{
stri_reverse(c('123', 'abc d e f'))
stri_reverse('ZXY (\u0105\u0104123$^).')
stri_reverse(stri_trans_nfd('\u0105')) == stri_trans_nfd('\u0105') # A, ogonek -> agonek, A
}
\author{
\href{https://www.gagolewski.com/}{Marek Gagolewski} and other contributors
}
\seealso{
The official online manual of \pkg{stringi} at \url{https://stringi.gagolewski.com/}
Gagolewski M., \pkg{stringi}: Fast and portable character string processing in R, \emph{Journal of Statistical Software} 103(2), 2022, 1-59, \doi{10.18637/jss.v103.i02}
}
stringi/man/stri_datetime_format.Rd 0000644 0001762 0000144 00000024060 14522615467 017216 0 ustar ligges users % Generated by roxygen2: do not edit by hand
% Please edit documentation in R/time_format.R
\name{stri_datetime_format}
\alias{stri_datetime_format}
\alias{stri_datetime_parse}
\title{Date and Time Formatting and Parsing}
\usage{
stri_datetime_format(
time,
format = "uuuu-MM-dd HH:mm:ss",
tz = NULL,
locale = NULL
)
stri_datetime_parse(
str,
format = "uuuu-MM-dd HH:mm:ss",
lenient = FALSE,
tz = NULL,
locale = NULL
)
}
\arguments{
\item{time}{an object of class \code{\link{POSIXct}} with date-time data
to be formatted
(\code{as.POSIXct} will be called on character vectors
and objects of class \code{POSIXlt}, \code{Date}, and \code{factor})}
\item{format}{character vector, see Details; see also \code{\link{stri_datetime_fstr}}}
\item{tz}{\code{NULL} or \code{''} for the default time zone
or a single string with a timezone identifier,
see \code{\link{stri_timezone_get}} and \code{\link{stri_timezone_list}}}
\item{locale}{\code{NULL} or \code{''} for the default locale,
or a single string with locale identifier; a non-Gregorian calendar
may be specified by setting the \code{@calendar=name} keyword}
\item{str}{character vector with strings to be parsed}
\item{lenient}{single logical value; should date/time parsing be lenient?}
}
\value{
\code{stri_datetime_format} returns a character vector.
\code{stri_datetime_parse} returns an object of class \code{\link{POSIXct}}.
}
\description{
These functions convert a given date/time object
to a character vector, or vice versa.
}
\details{
Vectorized over \code{format} and \code{time} or \code{str}.
When parsing strings, unspecified date-time fields
(e.g., seconds where only hours and minutes are given)
are based on today's midnight in the local time zone
(for compatibility with \code{\link[base]{strptime}}).
By default, \code{stri_datetime_format} (for compatibility
with the \code{\link[base]{strftime}} function)
formats a date/time object using the current default time zone.
\code{format} may be one of \code{DT_STYLE} or \code{DT_relative_STYLE},
where \code{DT} is equal to \code{date}, \code{time}, or \code{datetime},
and \code{STYLE} is equal to \code{full}, \code{long}, \code{medium},
or \code{short}. This gives a locale-dependent date and/or time format.
Note that \pkg{ICU} does not currently support \code{relative}
\code{time} formats; this flag is thus ignored in such a context.
Otherwise, \code{format} is a pattern:
a string where specific sequences of characters are replaced
with date/time data from a calendar when formatting or used
to generate data for a calendar when parsing.
For example, \code{y} stands for 'year'. Characters
may be used multiple times:
\code{yy} might produce \code{99}, whereas \code{yyyy} yields \code{1999}.
For most numerical fields, the number of characters specifies
the field width. For example, if \code{h} is the hour, \code{h} might
produce \code{5}, but \code{hh} yields \code{05}.
For some characters, the count specifies whether an abbreviated
or full form should be used.
Two single quotes represent a literal single quote, either
inside or outside single quotes. Text within single quotes
is not interpreted in any way (except for two adjacent single quotes).
Otherwise, all ASCII letters from \code{a} to \code{z} and
\code{A} to \code{Z} are reserved as syntax characters, and require quoting
if they are to represent literal characters. In addition, certain
ASCII punctuation characters may become available in the future
(e.g., \code{:} being interpreted as the time separator and \code{/}
as a date separator, and replaced by respective
locale-sensitive characters in display).
\tabular{llll}{
\bold{Symbol} \tab \bold{Meaning} \tab \bold{Example(s)} \tab \bold{Output} \cr
G \tab era designator \tab G, GG, or GGG \tab AD \cr
\tab \tab GGGG \tab Anno Domini \cr
\tab \tab GGGGG \tab A \cr
y \tab year \tab yy \tab 96 \cr
\tab \tab y or yyyy \tab 1996 \cr
u \tab extended year \tab u \tab 4601 \cr
U \tab cyclic year name, as in Chinese lunar calendar \tab U \tab \cr
r \tab related Gregorian year \tab r \tab 1996 \cr
Q \tab quarter \tab Q or QQ \tab 02 \cr
\tab \tab QQQ \tab Q2 \cr
\tab \tab QQQQ \tab 2nd quarter \cr
\tab \tab QQQQQ \tab 2 \cr
q \tab Stand Alone quarter \tab q or qq \tab 02 \cr
\tab \tab qqq \tab Q2 \cr
\tab \tab qqqq \tab 2nd quarter \cr
\tab \tab qqqqq \tab 2 \cr
M \tab month in year \tab M or MM \tab 09 \cr
\tab \tab MMM \tab Sep \cr
\tab \tab MMMM \tab September \cr
\tab \tab MMMMM \tab S \cr
L \tab Stand Alone month in year \tab L or LL \tab 09 \cr
\tab \tab LLL \tab Sep \cr
\tab \tab LLLL \tab September \cr
\tab \tab LLLLL \tab S \cr
w \tab week of year \tab w or ww \tab 27 \cr
W \tab week of month \tab W \tab 2 \cr
d \tab day in month \tab d \tab 2 \cr
\tab \tab dd \tab 02 \cr
D \tab day of year \tab D \tab 189 \cr
F \tab day of week in month \tab F \tab 2 (2nd Wed in July) \cr
g \tab modified Julian day \tab g \tab 2451334 \cr
E \tab day of week \tab E, EE, or EEE \tab Tue \cr
\tab \tab EEEE \tab Tuesday \cr
\tab \tab EEEEE \tab T \cr
\tab \tab EEEEEE \tab Tu \cr
e \tab local day of week \tab e or ee \tab 2 \cr
\tab (example: if Monday is 1st day, Tuesday is 2nd) \tab eee \tab Tue \cr
\tab \tab eeee \tab Tuesday \cr
\tab \tab eeeee \tab T \cr
\tab \tab eeeeee \tab Tu \cr
c \tab Stand Alone local day of week \tab c or cc \tab 2 \cr
\tab \tab ccc \tab Tue \cr
\tab \tab cccc \tab Tuesday \cr
\tab \tab ccccc \tab T \cr
\tab \tab cccccc \tab Tu \cr
a \tab am/pm marker \tab a \tab pm \cr
h \tab hour in am/pm (1~12) \tab h \tab 7 \cr
\tab \tab hh \tab 07 \cr
H \tab hour in day (0~23) \tab H \tab 0 \cr
\tab \tab HH \tab 00 \cr
k \tab hour in day (1~24) \tab k \tab 24 \cr
\tab \tab kk \tab 24 \cr
K \tab hour in am/pm (0~11) \tab K \tab 0 \cr
\tab \tab KK \tab 00 \cr
m \tab minute in hour \tab m \tab 4 \cr
\tab \tab mm \tab 04 \cr
s \tab second in minute \tab s \tab 5 \cr
\tab \tab ss \tab 05 \cr
S \tab fractional second - truncates (like other time fields) \tab S \tab 2 \cr
\tab to the count of letters when formatting. Appends \tab SS \tab 23 \cr
\tab zeros if more than 3 letters specified. Truncates at \tab SSS \tab 235 \cr
\tab three significant digits when parsing. \tab SSSS \tab 2350 \cr
A \tab milliseconds in day \tab A \tab 61201235 \cr
z \tab Time Zone: specific non-location \tab z, zz, or zzz \tab PDT \cr
\tab \tab zzzz \tab Pacific Daylight Time \cr
Z \tab Time Zone: ISO8601 basic hms? / RFC 822 \tab Z, ZZ, or ZZZ \tab -0800 \cr
\tab Time Zone: long localized GMT (=OOOO) \tab ZZZZ \tab GMT-08:00 \cr
\tab Time Zone: ISO8601 extended hms? (=XXXXX) \tab ZZZZZ \tab -08:00, -07:52:58, Z \cr
O \tab Time Zone: short localized GMT \tab O \tab GMT-8 \cr
\tab Time Zone: long localized GMT (=ZZZZ) \tab OOOO \tab GMT-08:00 \cr
v \tab Time Zone: generic non-location \tab v \tab PT \cr
\tab (falls back first to VVVV) \tab vvvv \tab Pacific Time or Los Angeles Time \cr
V \tab Time Zone: short time zone ID \tab V \tab uslax \cr
\tab Time Zone: long time zone ID \tab VV \tab America/Los_Angeles \cr
\tab Time Zone: time zone exemplar city \tab VVV \tab Los Angeles \cr
\tab Time Zone: generic location (falls back to OOOO) \tab VVVV \tab Los Angeles Time \cr
X \tab Time Zone: ISO8601 basic hm?, with Z for 0 \tab X \tab -08, +0530, Z \cr
\tab Time Zone: ISO8601 basic hm, with Z \tab XX \tab -0800, Z \cr
\tab Time Zone: ISO8601 extended hm, with Z \tab XXX \tab -08:00, Z \cr
\tab Time Zone: ISO8601 basic hms?, with Z \tab XXXX \tab -0800, -075258, Z \cr
\tab Time Zone: ISO8601 extended hms?, with Z \tab XXXXX \tab -08:00, -07:52:58, Z \cr
x \tab Time Zone: ISO8601 basic hm?, without Z for 0 \tab x \tab -08, +0530 \cr
\tab Time Zone: ISO8601 basic hm, without Z \tab xx \tab -0800 \cr
\tab Time Zone: ISO8601 extended hm, without Z \tab xxx \tab -08:00 \cr
\tab Time Zone: ISO8601 basic hms?, without Z \tab xxxx \tab -0800, -075258 \cr
\tab Time Zone: ISO8601 extended hms?, without Z \tab xxxxx \tab -08:00, -07:52:58 \cr
' \tab escape for text \tab ' \tab (nothing) \cr
' ' \tab two single quotes produce one \tab ' ' \tab '
}
Note that any characters in the pattern that are not in the ranges
of \code{[a-z]} and \code{[A-Z]} will be treated as quoted text.
For instance, characters like \code{:}, \code{.}, \code{ } (a space),
\code{#} and \code{@} will appear in the resulting time text
even if they are not enclosed within single quotes. The single quote is used
to ``escape'' the letters. Two single quotes in a row,
inside or outside a quoted sequence, represent a ``real'' single quote.
A few examples:
\tabular{ll}{
\bold{Example Pattern} \tab \bold{Result} \cr
yyyy.MM.dd 'at' HH:mm:ss zzz \tab 2015.12.31 at 23:59:59 GMT+1 \cr
EEE, MMM d, ''yy \tab czw., gru 31, '15 \cr
h:mm a \tab 11:59 PM \cr
hh 'o''clock' a, zzzz \tab 11 o'clock PM, GMT+01:00 \cr
K:mm a, z \tab 11:59 PM, GMT+1 \cr
yyyyy.MMMM.dd GGG hh:mm aaa \tab 2015.grudnia.31 n.e. 11:59 PM \cr
uuuu-MM-dd'T'HH:mm:ssZ \tab 2015-12-31T23:59:59+0100 (the ISO 8601 guideline) \cr
}
}
\examples{
x <- c('2015-02-28', '2015-02-29')
stri_datetime_parse(x, 'yyyy-MM-dd')
stri_datetime_parse(x, 'yyyy-MM-dd', lenient=TRUE)
stri_datetime_parse(x \%s+\% " 17:13", "yyyy-MM-dd HH:mm")
stri_datetime_parse('19 lipca 2015', 'date_long', locale='pl_PL')
stri_datetime_format(stri_datetime_now(), 'datetime_relative_medium')
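# A custom pattern and a locale-dependent style, based on the tables above (illustrative):
x <- stri_datetime_create(2015, 12, 31, 23, 59, 59)
stri_datetime_format(x, "yyyy.MM.dd 'at' HH:mm:ss")
stri_datetime_format(x, 'date_full', locale='en_US')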
}
\references{
\emph{Formatting Dates and Times} -- ICU User Guide,
\url{https://unicode-org.github.io/icu/userguide/format_parse/datetime/}
}
\seealso{
The official online manual of \pkg{stringi} at \url{https://stringi.gagolewski.com/}
Gagolewski M., \pkg{stringi}: Fast and portable character string processing in R, \emph{Journal of Statistical Software} 103(2), 2022, 1-59, \doi{10.18637/jss.v103.i02}
Other datetime:
\code{\link{stri_datetime_add}()},
\code{\link{stri_datetime_create}()},
\code{\link{stri_datetime_fields}()},
\code{\link{stri_datetime_fstr}()},
\code{\link{stri_datetime_now}()},
\code{\link{stri_datetime_symbols}()},
\code{\link{stri_timezone_get}()},
\code{\link{stri_timezone_info}()},
\code{\link{stri_timezone_list}()}
}
\concept{datetime}
\author{
\href{https://www.gagolewski.com/}{Marek Gagolewski} and other contributors
}
stringi/man/stri_detect.Rd 0000644 0001762 0000144 00000010345 14262507664 015323 0 ustar ligges users % Generated by roxygen2: do not edit by hand
% Please edit documentation in R/search_detect_4.R
\name{stri_detect}
\alias{stri_detect}
\alias{stri_detect_fixed}
\alias{stri_detect_charclass}
\alias{stri_detect_coll}
\alias{stri_detect_regex}
\title{Detect Pattern Occurrences}
\usage{
stri_detect(str, ..., regex, fixed, coll, charclass)
stri_detect_fixed(
str,
pattern,
negate = FALSE,
max_count = -1,
...,
opts_fixed = NULL
)
stri_detect_charclass(str, pattern, negate = FALSE, max_count = -1)
stri_detect_coll(
str,
pattern,
negate = FALSE,
max_count = -1,
...,
opts_collator = NULL
)
stri_detect_regex(
str,
pattern,
negate = FALSE,
max_count = -1,
...,
opts_regex = NULL
)
}
\arguments{
\item{str}{character vector; strings to search in}
\item{...}{supplementary arguments passed to the underlying functions,
including additional settings for \code{opts_collator}, \code{opts_regex},
\code{opts_fixed}, and so on}
\item{pattern, regex, fixed, coll, charclass}{character vector;
search patterns; for more details refer to \link{stringi-search}}
\item{negate}{single logical value; whether a no-match to a pattern
is rather of interest}
\item{max_count}{single integer; allows to stop searching once a given
number of occurrences is detected; \code{-1} (the default) inspects all
elements}
\item{opts_collator, opts_fixed, opts_regex}{a named list used to tune up
the search engine's settings; see
\code{\link{stri_opts_collator}}, \code{\link{stri_opts_fixed}},
and \code{\link{stri_opts_regex}}, respectively; \code{NULL}
for the defaults}
}
\value{
Each function returns a logical vector.
}
\description{
These functions determine, for each string in \code{str},
if there is at least one match to a corresponding \code{pattern}.
}
\details{
Vectorized over \code{str} and \code{pattern} (with recycling
of the elements in the shorter vector if necessary). This allows one to,
for instance, search for one pattern in each given string,
search for each pattern in one given string,
and search for the i-th pattern within the i-th string.
If \code{pattern} is empty, then the result is \code{NA}
and a warning is generated.
\code{stri_detect} is a convenience function.
It calls either \code{stri_detect_regex},
\code{stri_detect_fixed}, \code{stri_detect_coll},
or \code{stri_detect_charclass}, depending on the argument used.
See also \code{\link{stri_startswith}} and \code{\link{stri_endswith}}
for testing whether a string starts or ends with a match to a given pattern.
Moreover, see \code{\link{stri_subset}} for a character vector subsetting.
If \code{max_count} is negative, then all strings are examined.
Otherwise, searching terminates
once \code{max_count} matches (or, if \code{negate} is \code{TRUE},
no-matches) are detected. The uninspected cases are marked
as missing in the return vector. Be aware that, unless \code{pattern} is a
singleton, the elements in \code{str} might be inspected in a
non-consecutive order.
}
\examples{
stri_detect_fixed(c('stringi R', 'R STRINGI', '123'), c('i', 'R', '0'))
stri_detect_fixed(c('stringi R', 'R STRINGI', '123'), 'R')
stri_detect_charclass(c('stRRRingi','R STRINGI', '123'),
c('\\\\p{Ll}', '\\\\p{Lu}', '\\\\p{Zs}'))
stri_detect_regex(c('stringi R', 'R STRINGI', '123'), 'R.')
stri_detect_regex(c('stringi R', 'R STRINGI', '123'), '[[:alpha:]]*?')
stri_detect_regex(c('stringi R', 'R STRINGI', '123'), '[a-zC1]')
stri_detect_regex(c('stringi R', 'R STRINGI', '123'), '( R|RE)')
stri_detect_regex('stringi', 'STRING.', case_insensitive=TRUE)
stri_detect_regex(c('abc', 'def', '123', 'ghi', '456', '789', 'jkl'),
'^[0-9]+$', max_count=1)
stri_detect_regex(c('abc', 'def', '123', 'ghi', '456', '789', 'jkl'),
'^[0-9]+$', max_count=2)
stri_detect_regex(c('abc', 'def', '123', 'ghi', '456', '789', 'jkl'),
'^[0-9]+$', negate=TRUE, max_count=3)
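# negate alone, without limiting the number of inspected elements (illustrative):
stri_detect_fixed(c('stringi R', 'R STRINGI', '123'), 'R', negate=TRUE)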
}
\seealso{
The official online manual of \pkg{stringi} at \url{https://stringi.gagolewski.com/}
Gagolewski M., \pkg{stringi}: Fast and portable character string processing in R, \emph{Journal of Statistical Software} 103(2), 2022, 1-59, \doi{10.18637/jss.v103.i02}
Other search_detect:
\code{\link{about_search}},
\code{\link{stri_startswith}()}
}
\concept{search_detect}
\author{
\href{https://www.gagolewski.com/}{Marek Gagolewski} and other contributors
}
stringi/man/stri_sprintf.Rd 0000644 0001762 0000144 00000012601 14262507664 015535 0 ustar ligges users % Generated by roxygen2: do not edit by hand
% Please edit documentation in R/sprintf.R
\name{stri_sprintf}
\alias{stri_sprintf}
\alias{stri_string_format}
\alias{stri_printf}
\title{Format Strings}
\usage{
stri_sprintf(
format,
...,
na_string = NA_character_,
inf_string = "Inf",
nan_string = "NaN",
use_length = FALSE
)
stri_string_format(
format,
...,
na_string = NA_character_,
inf_string = "Inf",
nan_string = "NaN",
use_length = FALSE
)
stri_printf(
format,
...,
file = "",
sep = "\\n",
append = FALSE,
na_string = "NA",
inf_string = "Inf",
nan_string = "NaN",
use_length = FALSE
)
}
\arguments{
\item{format}{character vector of format strings}
\item{...}{vectors (coercible to integer, real, or character)}
\item{na_string}{single string to represent missing values;
if \code{NA}, missing values in \code{...}
result in the corresponding outputs being missing too;
use \code{"NA"} for compatibility with base R}
\item{inf_string}{single string to represent the (unsigned) infinity (\code{NA} allowed)}
\item{nan_string}{single string to represent the not-a-number (\code{NA} allowed)}
\item{use_length}{single logical value; should the number of code
points be used when applying modifiers such as \code{\%20s}
instead of the total code point width?}
\item{file}{see \code{\link[base]{cat}}}
\item{sep}{see \code{\link[base]{cat}}}
\item{append}{see \code{\link[base]{cat}}}
}
\value{
\code{stri_printf} is used for its side effect, which is printing
text on the standard output or other connection/file. Hence, it returns
\code{invisible(NULL)}.
The other functions return a character vector.
}
\description{
\code{stri_sprintf} (synonym: \code{stri_string_format})
is a Unicode-aware replacement for and enhancement of
the built-in \code{\link[base]{sprintf}} function.
Moreover, \code{stri_printf} prints formatted strings.
}
\details{
Vectorized over \code{format} and all vectors passed via \code{...}.
Unicode code points may have various widths when
printed on the console (compare \code{\link{stri_width}}).
These functions, by default (see the \code{use_length} argument), take this
into account.
These functions are not locale sensitive. For instance, numbers are
always formatted in the "POSIX" style, e.g., \code{-123456.789}
(no thousands separator, dot as a fractional separator).
Such a feature might be added at a later date, though.
All arguments passed via \code{...} are evaluated. If some of them
are unused, a warning is generated. Too few arguments result in an error.
Note that \code{stri_printf} treats missing values in \code{...}
as \code{"NA"} strings by default.
All format specifiers supported by \code{\link[base]{sprintf}} are
also available here. For the formatting of integers and floating-point
values, currently the system \code{std::snprintf()} is called, but
this may change in the future. Format specifiers are normalized
and necessary sanity checks are performed.
Supported conversion specifiers: \code{dioxX} (integers),
\code{feEgGaA} (floats), and \code{s} (character strings).
Supported flags: \code{-} (left-align),
\code{+} (force output sign or blank when \code{NaN} or \code{NA}; numeric only),
\code{ } (a space: output minus or space for a sign; numeric only),
\code{0} (pad with 0s; numeric only),
\code{#} (alternative output of some numerics).
}
\examples{
stri_printf("\%4s=\%.3f", c("e", "e\u00b2", "\u03c0", "\u03c0\u00b2"),
c(exp(1), exp(2), pi, pi^2))
x <- c(
"xxabcd",
"xx\u0105\u0106\u0107\u0108",
stri_paste(
"\u200b\u200b\u200b\u200b",
"\U0001F3F4\U000E0067\U000E0062\U000E0073\U000E0063\U000E0074\U000E007F",
"abcd"
))
stri_printf("[\%10s]", x) # minimum width = 10
stri_printf("[\%-10.3s]", x) # output of max width = 3, but pad to width of 10
stri_printf("[\%10s]", x, use_length=TRUE) # minimum number of Unicode code points = 10
# vectorization wrt all arguments:
p <- runif(10)
stri_sprintf(ifelse(p > 0.5, "P(Y=1)=\%1$.2f", "P(Y=0)=\%2$.2f"), p, 1-p)
# using a "preformatted" logical vector:
x <- c(TRUE, FALSE, FALSE, NA, TRUE, FALSE)
stri_sprintf("\%s) \%s", letters[seq_along(x)], c("\u2718", "\u2713")[x+1])
# custom NA/Inf/NaN strings:
stri_printf("\%+10.3f", c(-Inf, -0, 0, Inf, NaN, NA_real_),
na_string="", nan_string="\U0001F4A9", inf_string="\u221E")
stri_sprintf("UNIX time \%1$f is \%1$s.", Sys.time())
# the following do not work in sprintf()
stri_sprintf("\%1$#- *2$.*3$f", 1.23456, 10, 3) # two asterisks
stri_sprintf(c("\%s", "\%f"), pi) # re-coercion needed
stri_sprintf("\%1$s is \%1$f UNIX time.", Sys.time()) # re-coercion needed
stri_sprintf(c("\%d", "\%s"), factor(11:12)) # re-coercion needed
stri_sprintf(c("\%s", "\%d"), factor(11:12)) # re-coercion needed
}
\references{
\code{printf} in \code{glibc},
\url{https://man.archlinux.org/man/printf.3}
\code{printf} format strings -- Wikipedia,
\url{https://en.wikipedia.org/wiki/Printf_format_string}
}
\seealso{
The official online manual of \pkg{stringi} at \url{https://stringi.gagolewski.com/}
Gagolewski M., \pkg{stringi}: Fast and portable character string processing in R, \emph{Journal of Statistical Software} 103(2), 2022, 1-59, \doi{10.18637/jss.v103.i02}
Other length:
\code{\link{\%s$\%}()},
\code{\link{stri_isempty}()},
\code{\link{stri_length}()},
\code{\link{stri_numbytes}()},
\code{\link{stri_pad_both}()},
\code{\link{stri_width}()}
}
\concept{length}
\author{
\href{https://www.gagolewski.com/}{Marek Gagolewski} and other contributors
}
stringi/man/stri_duplicated.Rd 0000644 0001762 0000144 00000007042 14262507664 016171 0 ustar ligges users % Generated by roxygen2: do not edit by hand
% Please edit documentation in R/sort.R
\name{stri_duplicated}
\alias{stri_duplicated}
\alias{stri_duplicated_any}
\title{Determine Duplicated Elements}
\usage{
stri_duplicated(
str,
from_last = FALSE,
fromLast = from_last,
...,
opts_collator = NULL
)
stri_duplicated_any(
str,
from_last = FALSE,
fromLast = from_last,
...,
opts_collator = NULL
)
}
\arguments{
\item{str}{a character vector}
\item{from_last}{a single logical value;
indicates whether search should be performed from the last to the
first string}
\item{fromLast}{[DEPRECATED] alias of \code{from_last}}
\item{...}{additional settings for \code{opts_collator}}
\item{opts_collator}{a named list with \pkg{ICU} Collator's options,
see \code{\link{stri_opts_collator}}, \code{NULL}
for default collation options}
}
\value{
\code{stri_duplicated()} returns a logical vector of the same length
as \code{str}. Each of its elements indicates whether a canonically
equivalent string was already found in \code{str}.
\code{stri_duplicated_any()} returns a single non-negative integer.
A value of 0 means that all the elements in \code{str} are unique.
Otherwise, it gives the index of the first non-unique element.
}
\description{
\code{stri_duplicated()} determines which strings in a character vector
are duplicates of other elements.
\code{stri_duplicated_any()} determines if there are any duplicated
strings in a character vector.
}
\details{
Missing values are regarded as equal.
Unlike \code{\link{duplicated}} and \code{\link{anyDuplicated}},
these functions test for canonical equivalence of strings
(and not whether the strings are merely bytewise equal).
Such operations are locale-dependent.
Hence, \code{stri_duplicated} and \code{stri_duplicated_any}
are significantly slower (but much better suited for natural language
processing) than their base R counterparts.
See also \code{\link{stri_unique}} for extracting unique elements.
}
\examples{
# In the following examples, we have 3 duplicated values,
# 'a' - 2 times, NA - 1 time
stri_duplicated(c('a', 'b', 'a', NA, 'a', NA))
stri_duplicated(c('a', 'b', 'a', NA, 'a', NA), from_last=TRUE)
stri_duplicated_any(c('a', 'b', 'a', NA, 'a', NA))
# compare the results:
stri_duplicated(c('\u0105', stri_trans_nfkd('\u0105')))
duplicated(c('\u0105', stri_trans_nfkd('\u0105')))
stri_duplicated(c('gro\u00df', 'GROSS', 'Gro\u00df', 'Gross'), strength=1)
duplicated(c('gro\u00df', 'GROSS', 'Gro\u00df', 'Gross'))
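# stri_duplicated_any() gives the index of the first duplicate, or 0 if
# all elements are unique (a small sketch):
stri_duplicated_any(c('one', 'two', 'three'))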
}
\references{
\emph{Collation} - ICU User Guide,
\url{https://unicode-org.github.io/icu/userguide/collation/}
}
\seealso{
The official online manual of \pkg{stringi} at \url{https://stringi.gagolewski.com/}
Gagolewski M., \pkg{stringi}: Fast and portable character string processing in R, \emph{Journal of Statistical Software} 103(2), 2022, 1-59, \doi{10.18637/jss.v103.i02}
Other locale_sensitive:
\code{\link{\%s<\%}()},
\code{\link{about_locale}},
\code{\link{about_search_boundaries}},
\code{\link{about_search_coll}},
\code{\link{stri_compare}()},
\code{\link{stri_count_boundaries}()},
\code{\link{stri_enc_detect2}()},
\code{\link{stri_extract_all_boundaries}()},
\code{\link{stri_locate_all_boundaries}()},
\code{\link{stri_opts_collator}()},
\code{\link{stri_order}()},
\code{\link{stri_rank}()},
\code{\link{stri_sort_key}()},
\code{\link{stri_sort}()},
\code{\link{stri_split_boundaries}()},
\code{\link{stri_trans_tolower}()},
\code{\link{stri_unique}()},
\code{\link{stri_wrap}()}
}
\concept{locale_sensitive}
\author{
\href{https://www.gagolewski.com/}{Marek Gagolewski} and other contributors
}
stringi/man/stri_enc_detect.Rd 0000644 0001762 0000144 00000011364 14262507664 016152 0 ustar ligges users % Generated by roxygen2: do not edit by hand
% Please edit documentation in R/encoding_detection.R
\name{stri_enc_detect}
\alias{stri_enc_detect}
\title{Detect Character Set and Language}
\usage{
stri_enc_detect(str, filter_angle_brackets = FALSE)
}
\arguments{
\item{str}{character vector, a raw vector, or
a list of \code{raw} vectors}
\item{filter_angle_brackets}{logical; if filtering is enabled,
text within angle brackets ('<' and '>') is removed before detection,
which strips most HTML or XML markup.}
}
\value{
Returns a list of length equal to the length of \code{str}.
Each list element is a data frame with the following three named vectors
representing all the guesses:
\itemize{
\item \code{Encoding} -- string; guessed encodings; \code{NA} on failure,
\item \code{Language} -- string; guessed languages; \code{NA} if the language could
not be determined (e.g., in case of UTF-8),
\item \code{Confidence} -- numeric in [0,1]; the higher the value,
the more confidence there is in the match; \code{NA} on failure.
}
The guesses are ordered by decreasing confidence.
}
\description{
This function uses the \pkg{ICU} engine to determine the character set,
or encoding, of character data in an unknown format.
}
\details{
Vectorized over \code{str} and \code{filter_angle_brackets}.
For a character vector input, merging all text lines
via \code{\link{stri_flatten}(str, collapse='\n')}
might be needed if \code{str} has been obtained via a call to
\code{readLines} and in fact represents an image of a single text file.
This is, at best, an imprecise operation using statistics and heuristics.
Because of this, detection works best if you supply at least a few hundred
bytes of character data that is mostly in a single language.
However, because the detection only looks at a limited amount of the input
data, some of the returned character sets may fail to handle all of the
input data. Note that in some cases,
the language can be determined along with the encoding.
Several different techniques are used for character set detection.
For multi-byte encodings, the sequence of bytes is checked for legible
patterns. The detected characters are also checked against a list of
frequently used characters in that encoding. For single byte encodings,
the data is checked against a list of the most commonly occurring three
letter groups for each language that can be written using that encoding.
The detection process can be configured to optionally ignore
HTML or XML style markup (using \pkg{ICU}'s internal facilities),
which can interfere with the detection
process by changing the statistics.
This function should most often be used for byte-marked input strings,
especially after loading them from text files and before the main
conversion with \code{\link{stri_encode}}.
The input encoding is of course not taken into account here, even
if marked.
The following table shows all the encodings that can be detected:
\tabular{ll}{
\strong{Character_Set} \tab \strong{Languages}\cr
UTF-8 \tab -- \cr
UTF-16BE \tab -- \cr
UTF-16LE \tab -- \cr
UTF-32BE \tab -- \cr
UTF-32LE \tab -- \cr
Shift_JIS \tab Japanese \cr
ISO-2022-JP \tab Japanese \cr
ISO-2022-CN \tab Simplified Chinese \cr
ISO-2022-KR \tab Korean \cr
GB18030 \tab Chinese \cr
Big5 \tab Traditional Chinese \cr
EUC-JP \tab Japanese \cr
EUC-KR \tab Korean \cr
ISO-8859-1 \tab Danish, Dutch, English, French, German, Italian, Norwegian, Portuguese, Swedish \cr
ISO-8859-2 \tab Czech, Hungarian, Polish, Romanian \cr
ISO-8859-5 \tab Russian \cr
ISO-8859-6 \tab Arabic \cr
ISO-8859-7 \tab Greek \cr
ISO-8859-8 \tab Hebrew \cr
ISO-8859-9 \tab Turkish \cr
windows-1250 \tab Czech, Hungarian, Polish, Romanian \cr
windows-1251 \tab Russian \cr
windows-1252 \tab Danish, Dutch, English, French, German, Italian, Norwegian, Portuguese, Swedish \cr
windows-1253 \tab Greek \cr
windows-1254 \tab Turkish \cr
windows-1255 \tab Hebrew \cr
windows-1256 \tab Arabic \cr
KOI8-R \tab Russian \cr
IBM420 \tab Arabic \cr
IBM424 \tab Hebrew \cr
}
}
\examples{
## Not run:
## f <- rawToChar(readBin('test.txt', 'raw', 100000))
## stri_enc_detect(f)
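# a runnable sketch: encode some Polish text to ISO-8859-2 and detect it
# (guesses for such short inputs may be imprecise)
x <- stri_encode('za\u017c\u00f3\u0142\u0107 g\u0119\u015bl\u0105 ja\u017a\u0144',
    'UTF-8', 'ISO-8859-2', to_raw=TRUE)[[1]]
stri_enc_detect(x)[[1]]  # guesses are ordered by decreasing Confidence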
}
\references{
\emph{Character Set Detection} -- ICU User Guide,
\url{https://unicode-org.github.io/icu/userguide/conversion/detection.html}
}
\seealso{
The official online manual of \pkg{stringi} at \url{https://stringi.gagolewski.com/}
Gagolewski M., \pkg{stringi}: Fast and portable character string processing in R, \emph{Journal of Statistical Software} 103(2), 2022, 1-59, \doi{10.18637/jss.v103.i02}
Other encoding_detection:
\code{\link{about_encoding}},
\code{\link{stri_enc_detect2}()},
\code{\link{stri_enc_isascii}()},
\code{\link{stri_enc_isutf16be}()},
\code{\link{stri_enc_isutf8}()}
}
\concept{encoding_detection}
\author{
\href{https://www.gagolewski.com/}{Marek Gagolewski} and other contributors
}
stringi/man/stri_count_boundaries.Rd 0000644 0001762 0000144 00000007473 14262507664 017426 0 ustar ligges users % Generated by roxygen2: do not edit by hand
% Please edit documentation in R/search_count_bound.R
\name{stri_count_boundaries}
\alias{stri_count_boundaries}
\alias{stri_count_words}
\title{Count the Number of Text Boundaries}
\usage{
stri_count_boundaries(str, ..., opts_brkiter = NULL)
stri_count_words(str, locale = NULL)
}
\arguments{
\item{str}{character vector or an object coercible to one}
\item{...}{additional settings for \code{opts_brkiter}}
\item{opts_brkiter}{a named list with \pkg{ICU} BreakIterator's settings,
see \code{\link{stri_opts_brkiter}};
\code{NULL} for the default break iterator, i.e., \code{line_break}}
\item{locale}{\code{NULL} or \code{''} for text boundary analysis following
the conventions of the default locale, or a single string with
locale identifier, see \link{stringi-locale}}
}
\value{
Both functions return an integer vector.
}
\description{
These functions determine the number of text boundaries
(like character, word, line, or sentence boundaries) in a string.
}
\details{
Vectorized over \code{str}.
For more information on text boundary analysis
performed by \pkg{ICU}'s \code{BreakIterator}, see
\link{stringi-search-boundaries}.
In case of \code{stri_count_words},
just like in \code{\link{stri_extract_all_words}} and
\code{\link{stri_locate_all_words}},
\pkg{ICU}'s word \code{BreakIterator} iterator is used
to locate the word boundaries, and all non-word characters
(\code{UBRK_WORD_NONE} rule status) are ignored.
This function is equivalent to a call to
\code{\link{stri_count_boundaries}(str, type='word', skip_word_none=TRUE, locale=locale)}.
Note that a \code{BreakIterator} of type \code{character}
may be used to count the number of \emph{Unicode characters} in a string.
The \code{\link{stri_length}} function,
which aims to count the number of \emph{Unicode code points},
might report different results.
Moreover, a \code{BreakIterator} of type \code{sentence}
may be used to count the number of sentences in a text piece.
}
\examples{
test <- 'The\u00a0above-mentioned features are very useful. Spam, spam, eggs, bacon, and spam.'
stri_count_boundaries(test, type='word')
stri_count_boundaries(test, type='sentence')
stri_count_boundaries(test, type='character')
stri_count_words(test)
test2 <- stri_trans_nfkd('\u03c0\u0153\u0119\u00a9\u00df\u2190\u2193\u2192')
stri_count_boundaries(test2, type='character')
stri_length(test2)
stri_numbytes(test2)
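# the equivalence for stri_count_words() stated in Details (a sketch):
stri_count_boundaries(test, type='word', skip_word_none=TRUE)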
}
\seealso{
The official online manual of \pkg{stringi} at \url{https://stringi.gagolewski.com/}
Gagolewski M., \pkg{stringi}: Fast and portable character string processing in R, \emph{Journal of Statistical Software} 103(2), 2022, 1-59, \doi{10.18637/jss.v103.i02}
Other search_count:
\code{\link{about_search}},
\code{\link{stri_count}()}
Other locale_sensitive:
\code{\link{\%s<\%}()},
\code{\link{about_locale}},
\code{\link{about_search_boundaries}},
\code{\link{about_search_coll}},
\code{\link{stri_compare}()},
\code{\link{stri_duplicated}()},
\code{\link{stri_enc_detect2}()},
\code{\link{stri_extract_all_boundaries}()},
\code{\link{stri_locate_all_boundaries}()},
\code{\link{stri_opts_collator}()},
\code{\link{stri_order}()},
\code{\link{stri_rank}()},
\code{\link{stri_sort_key}()},
\code{\link{stri_sort}()},
\code{\link{stri_split_boundaries}()},
\code{\link{stri_trans_tolower}()},
\code{\link{stri_unique}()},
\code{\link{stri_wrap}()}
Other text_boundaries:
\code{\link{about_search_boundaries}},
\code{\link{about_search}},
\code{\link{stri_extract_all_boundaries}()},
\code{\link{stri_locate_all_boundaries}()},
\code{\link{stri_opts_brkiter}()},
\code{\link{stri_split_boundaries}()},
\code{\link{stri_split_lines}()},
\code{\link{stri_trans_tolower}()},
\code{\link{stri_wrap}()}
}
\concept{locale_sensitive}
\concept{search_count}
\concept{text_boundaries}
\author{
\href{https://www.gagolewski.com/}{Marek Gagolewski} and other contributors
}
stringi/man/stri_locate.Rd 0000644 0001762 0000144 00000016554 14262507664 015332 0 ustar ligges users % Generated by roxygen2: do not edit by hand
% Please edit documentation in R/search_locate_4.R
\name{stri_locate_all}
\alias{stri_locate_all}
\alias{stri_locate_first}
\alias{stri_locate_last}
\alias{stri_locate}
\alias{stri_locate_all_charclass}
\alias{stri_locate_first_charclass}
\alias{stri_locate_last_charclass}
\alias{stri_locate_all_coll}
\alias{stri_locate_first_coll}
\alias{stri_locate_last_coll}
\alias{stri_locate_all_regex}
\alias{stri_locate_first_regex}
\alias{stri_locate_last_regex}
\alias{stri_locate_all_fixed}
\alias{stri_locate_first_fixed}
\alias{stri_locate_last_fixed}
\title{Locate Pattern Occurrences}
\usage{
stri_locate_all(str, ..., regex, fixed, coll, charclass)
stri_locate_first(str, ..., regex, fixed, coll, charclass)
stri_locate_last(str, ..., regex, fixed, coll, charclass)
stri_locate(
str,
...,
regex,
fixed,
coll,
charclass,
mode = c("first", "all", "last")
)
stri_locate_all_charclass(
str,
pattern,
merge = TRUE,
omit_no_match = FALSE,
get_length = FALSE
)
stri_locate_first_charclass(str, pattern, get_length = FALSE)
stri_locate_last_charclass(str, pattern, get_length = FALSE)
stri_locate_all_coll(
str,
pattern,
omit_no_match = FALSE,
get_length = FALSE,
...,
opts_collator = NULL
)
stri_locate_first_coll(
str,
pattern,
get_length = FALSE,
...,
opts_collator = NULL
)
stri_locate_last_coll(
str,
pattern,
get_length = FALSE,
...,
opts_collator = NULL
)
stri_locate_all_regex(
str,
pattern,
omit_no_match = FALSE,
capture_groups = FALSE,
get_length = FALSE,
...,
opts_regex = NULL
)
stri_locate_first_regex(
str,
pattern,
capture_groups = FALSE,
get_length = FALSE,
...,
opts_regex = NULL
)
stri_locate_last_regex(
str,
pattern,
capture_groups = FALSE,
get_length = FALSE,
...,
opts_regex = NULL
)
stri_locate_all_fixed(
str,
pattern,
omit_no_match = FALSE,
get_length = FALSE,
...,
opts_fixed = NULL
)
stri_locate_first_fixed(
str,
pattern,
get_length = FALSE,
...,
opts_fixed = NULL
)
stri_locate_last_fixed(
str,
pattern,
get_length = FALSE,
...,
opts_fixed = NULL
)
}
\arguments{
\item{str}{character vector; strings to search in}
\item{...}{supplementary arguments passed to the underlying functions,
including additional settings for \code{opts_collator},
\code{opts_regex}, \code{opts_fixed}, and so on}
\item{mode}{single string;
one of: \code{'first'} (the default), \code{'all'}, \code{'last'}}
\item{pattern, regex, fixed, coll, charclass}{character vector;
search patterns; for more details refer to \link{stringi-search}}
\item{merge}{single logical value;
indicates whether consecutive sequences of indexes in the resulting
matrix should be merged; \code{stri_locate_all_charclass} only}
\item{omit_no_match}{single logical value; if \code{TRUE},
a no-match will be indicated by a matrix with 0 rows;
\code{stri_locate_all_*} only}
\item{get_length}{single logical value; if \code{FALSE} (default),
generate \emph{from-to} matrices; otherwise, output
\emph{from-length} ones}
\item{opts_collator, opts_fixed, opts_regex}{named list used to tune up
the selected search engine's settings; see
\code{\link{stri_opts_collator}}, \code{\link{stri_opts_fixed}},
and \code{\link{stri_opts_regex}}, respectively; \code{NULL}
for the defaults}
\item{capture_groups}{single logical value;
whether positions of matches to parenthesized subexpressions
should be returned too (as \code{capture_groups} attribute);
\code{stri_locate_*_regex} only}
}
\value{
For \code{stri_locate_all_*},
a list of integer matrices is returned. Each list element
represents the results of a separate search scenario.
The first column gives the start positions
of the matches, and the second column gives the end positions.
Moreover, two \code{NA}s in a row denote \code{NA} arguments
or a no-match (the latter only if \code{omit_no_match} is \code{FALSE}).
\code{stri_locate_first_*} and \code{stri_locate_last_*}
return an integer matrix with
two columns, giving the start and end positions of the first
or the last matches, respectively, and two \code{NA}s if and
only if they are not found.
For \code{stri_locate_*_regex}, if the match is of zero length,
\code{end} will be one character less than \code{start}.
Note that \code{stri_locate_last_regex} searches from start to end,
but skips overlapping matches, see the example below.
Setting \code{get_length=TRUE} results in the 2nd column representing
the length of the match instead of the end position. In this case,
negative length denotes a no-match.
If \code{capture_groups=TRUE}, then the outputs are equipped with the
\code{capture_groups} attribute, which is a list of matrices
giving the start-end positions of matches to parenthesized subexpressions.
Similarly to \code{stri_match_regex}, capture group names are extracted
unless looking for first/last occurrences of many different patterns.
}
\description{
These functions find the indexes (positions) where
there is a match to some pattern.
The functions \code{stri_locate_all_*} locate all the matches.
\code{stri_locate_first_*} and \code{stri_locate_last_*}
give the first and the last matches, respectively.
}
\details{
Vectorized over \code{str} and \code{pattern} (with recycling
of the elements in the shorter vector if necessary). This allows you,
for instance, to search for one pattern in each string,
for each pattern in one string,
or for the i-th pattern within the i-th string.
The matches may be extracted by calling
\code{\link{stri_sub}} or \code{\link{stri_sub_all}}.
Alternatively, you may call \code{\link{stri_extract}} directly.
\code{stri_locate}, \code{stri_locate_all}, \code{stri_locate_first},
and \code{stri_locate_last} are convenience functions.
They just call \code{stri_locate_*_*}, depending on the arguments used.
}
\examples{
stri_locate_all('stringi', fixed='i')
stri_locate_first_coll('hladn\u00FD', 'HLADNY', strength=1, locale='sk_SK')
stri_locate_all_regex(
c('breakfast=eggs;lunch=pizza', 'breakfast=spam', 'no food here'),
'(?<key>\\\\w+)=(?<value>\\\\w+)',  # group names 'key' and 'value' are illustrative
capture_groups=TRUE
) # named capture groups
stri_locate_all_fixed("abababa", "ABA", case_insensitive=TRUE, overlap=TRUE)
stri_locate_first_fixed("ababa", "aba")
stri_locate_last_fixed("ababa", "aba") # starts from end
stri_locate_last_regex("ababa", "aba") # no overlaps, from left to right
x <- c("yes yes", "no", NA)
stri_locate_all_fixed(x, "yes")
stri_locate_all_fixed(x, "yes", omit_no_match=TRUE)
stri_locate_all_fixed(x, "yes", get_length=TRUE)
stri_locate_all_fixed(x, "yes", get_length=TRUE, omit_no_match=TRUE)
stri_locate_first_fixed(x, "yes")
stri_locate_first_fixed(x, "yes", get_length=TRUE)
# Use regex positive-lookahead to locate overlapping pattern matches:
stri_locate_all_regex('ACAGAGACTTTAGATAGAGAAGA', '(?=AGA)')
# note that start > end here (match of length zero)
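# located matches can be passed to stri_sub_all() for extraction (a sketch):
stri_sub_all(x, stri_locate_all_fixed(x, "yes", omit_no_match=TRUE))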
}
\seealso{
The official online manual of \pkg{stringi} at \url{https://stringi.gagolewski.com/}
Gagolewski M., \pkg{stringi}: Fast and portable character string processing in R, \emph{Journal of Statistical Software} 103(2), 2022, 1-59, \doi{10.18637/jss.v103.i02}
Other search_locate:
\code{\link{about_search}},
\code{\link{stri_locate_all_boundaries}()}
Other indexing:
\code{\link{stri_locate_all_boundaries}()},
\code{\link{stri_sub_all}()},
\code{\link{stri_sub}()}
}
\concept{indexing}
\concept{search_locate}
\author{
\href{https://www.gagolewski.com/}{Marek Gagolewski} and other contributors
}
stringi/man/stri_trans_general.Rd 0000644 0001762 0000144 00000010131 14453700767 016671 0 ustar ligges users % Generated by roxygen2: do not edit by hand
% Please edit documentation in R/trans_transliterate.R
\name{stri_trans_general}
\alias{stri_trans_general}
\title{General Text Transforms, Including Transliteration}
\usage{
stri_trans_general(str, id, rules = FALSE, forward = TRUE)
}
\arguments{
\item{str}{character vector}
\item{id}{a single string with transform identifier,
see \code{\link{stri_trans_list}}, or custom transliteration rules}
\item{rules}{if \code{TRUE}, treat \code{id} as a string with
semicolon-separated transliteration rules (see the \pkg{ICU} manual);}
\item{forward}{transliteration direction (\code{TRUE} for forward,
\code{FALSE} for reverse)}
}
\value{
Returns a character vector.
}
\description{
\pkg{ICU} General transforms provide different ways
for processing Unicode text. They are useful in handling a variety
of different tasks, including:
\itemize{
\item locale-independent upper case, lower case, title case,
full/halfwidth conversions,
\item normalization,
\item hex and character name conversions,
\item script to script conversion/transliteration.
}
}
\details{
\pkg{ICU} Transforms were mainly designed to transliterate characters
from one script to another (for example, from Greek to Latin,
or Japanese Katakana to Latin).
However, these services are also capable of handling a much
broader range of tasks.
In particular, the Transforms include prebuilt transformations
for case conversions, for normalization conversions, for the removal
of given characters, and also for a variety of language and script
transliterations. Transforms can be chained together to perform
a series of operations and each step of the process can use a
UnicodeSet to restrict the characters that are affected.
To get the list of available transforms,
call \code{\link{stri_trans_list}}.
Note that transliterators are often combined in sequence
to achieve a desired transformation.
This is analogous to the composition of mathematical functions.
For example, given a script that converts lowercase ASCII characters
from Latin script to Katakana script, it is convenient to first
(1) separate input base characters and accents, and then (2)
convert uppercase to lowercase.
To achieve this, a compound transform can be specified as follows:
\code{NFKD; Lower; Latin-Katakana;} (with the default \code{rules=FALSE}).
Custom rule-based transliteration is also supported, see the \pkg{ICU}
manual and below for some examples.
Transliteration is not dependent on the current locale.
}
\examples{
stri_trans_general('gro\u00df', 'latin-ascii')
stri_trans_general('stringi', 'latin-greek')
stri_trans_general('stringi', 'latin-cyrillic')
stri_trans_general('stringi', 'upper') # see stri_trans_toupper
stri_trans_general('\u0104', 'nfd; lower') # compound id; see stri_trans_nfd
stri_trans_general('Marek G\u0105golewski', 'pl-pl_FONIPA')
stri_trans_general('\u2620', 'any-name') # character name
stri_trans_general('\\\\N{latin small letter a}', 'name-any') # decode name
stri_trans_general('\u2620', 'hex/c') # to hex
stri_trans_general("\u201C\u2026\u201D \u0105\u015B\u0107\u017C",
"NFKD; NFC; [^\\\\p{L}] latin-ascii")
x <- "\uC885\uB85C\uAD6C \uC0AC\uC9C1\uB3D9"
stringi::stri_trans_general(x, "Hangul-Latin")
# Deviate from the ICU rules of romanisation of Korean,
# see https://en.wikipedia.org/wiki/Romanization_of_Korean
id <- "
:: NFD;
\u11A8 > k;
\u11AE > t;
\u11B8 > p;
\u1105 > r;
:: Hangul-Latin;
"
stringi::stri_trans_general(x, id, rules=TRUE)
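# forward=FALSE applies a transform in the reverse direction
# (a sketch; round trips are not guaranteed to be exact):
stri_trans_general(stri_trans_general('stringi', 'latin-greek'),
    'latin-greek', forward=FALSE)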
}
\references{
\emph{General Transforms} -- ICU User Guide,
\url{https://unicode-org.github.io/icu/userguide/transforms/general/}
}
\seealso{
The official online manual of \pkg{stringi} at \url{https://stringi.gagolewski.com/}
Gagolewski M., \pkg{stringi}: Fast and portable character string processing in R, \emph{Journal of Statistical Software} 103(2), 2022, 1-59, \doi{10.18637/jss.v103.i02}
Other transform:
\code{\link{stri_trans_char}()},
\code{\link{stri_trans_list}()},
\code{\link{stri_trans_nfc}()},
\code{\link{stri_trans_tolower}()}
}
\concept{transform}
\author{
\href{https://www.gagolewski.com/}{Marek Gagolewski} and other contributors
}
stringi/man/about_search.Rd 0000644 0001762 0000144 00000011313 14262507664 015445 0 ustar ligges users % Generated by roxygen2: do not edit by hand
% Please edit documentation in R/search.R
\name{about_search}
\alias{about_search}
\alias{search}
\alias{stringi-search}
\title{String Searching}
\description{
This man page explains how to perform string search-based
operations in \pkg{stringi}.
}
\details{
The following independent string searching engines are available
in \pkg{stringi}.
\itemize{
\item \code{stri_*_regex} -- \pkg{ICU}'s regular expressions (regexes),
see \link{about_search_regex},
\item \code{stri_*_fixed} -- locale-independent byte-wise pattern matching,
see \link{about_search_fixed},
\item \code{stri_*_coll} -- \pkg{ICU}'s \code{StringSearch},
locale-sensitive, Collator-based pattern search,
useful for natural language processing tasks,
see \link{about_search_coll},
\item \code{stri_*_charclass} -- character classes search,
e.g., Unicode General Categories or Binary Properties,
see \link{about_search_charclass},
\item \code{stri_*_boundaries} -- text boundary analysis,
see \link{about_search_boundaries}
}
Each search engine is able to perform many search-based operations.
These may include:
\itemize{
\item \code{stri_detect_*} - detect if a pattern occurs in a string,
see, e.g., \code{\link{stri_detect}},
\item \code{stri_count_*} - count the number of pattern occurrences,
see, e.g., \code{\link{stri_count}},
\item \code{stri_locate_*} - locate all, first, or last occurrences
of a pattern, see, e.g., \code{\link{stri_locate}},
\item \code{stri_extract_*} - extract all, first, or last occurrences
of a pattern, see, e.g., \code{\link{stri_extract}}
and, in case of regexes, \code{\link{stri_match}},
\item \code{stri_replace_*} - replace all, first, or last occurrences
of a pattern, see, e.g., \code{\link{stri_replace}}
and also \code{\link{stri_trim}},
\item \code{stri_split_*} - split a string into chunks indicated
by occurrences of a pattern,
see, e.g., \code{\link{stri_split}},
\item \code{stri_startswith_*} and \code{stri_endswith_*} detect
if a string starts or ends with a pattern match, see,
e.g., \code{\link{stri_startswith}},
\item \code{stri_subset_*} - return a subset of a character vector
with strings that match a given pattern, see, e.g., \code{\link{stri_subset}}.
}
}
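\examples{
# one operation (detect) performed with different search engines (a sketch):
x <- 'Spam, spam, bacon, and spam'
stri_detect_fixed(x, 'spam')             # byte-wise pattern matching
stri_detect_regex(x, '^Spam')            # regular expressions
stri_detect_coll(x, 'SPAM', strength=1)  # collator-based, here case-insensitive
stri_detect_charclass(x, '[a-z]')        # character classes
}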
\seealso{
The official online manual of \pkg{stringi} at \url{https://stringi.gagolewski.com/}
Gagolewski M., \pkg{stringi}: Fast and portable character string processing in R, \emph{Journal of Statistical Software} 103(2), 2022, 1-59, \doi{10.18637/jss.v103.i02}
Other text_boundaries:
\code{\link{about_search_boundaries}},
\code{\link{stri_count_boundaries}()},
\code{\link{stri_extract_all_boundaries}()},
\code{\link{stri_locate_all_boundaries}()},
\code{\link{stri_opts_brkiter}()},
\code{\link{stri_split_boundaries}()},
\code{\link{stri_split_lines}()},
\code{\link{stri_trans_tolower}()},
\code{\link{stri_wrap}()}
Other search_regex:
\code{\link{about_search_regex}},
\code{\link{stri_opts_regex}()}
Other search_fixed:
\code{\link{about_search_fixed}},
\code{\link{stri_opts_fixed}()}
Other search_coll:
\code{\link{about_search_coll}},
\code{\link{stri_opts_collator}()}
Other search_charclass:
\code{\link{about_search_charclass}},
\code{\link{stri_trim_both}()}
Other search_detect:
\code{\link{stri_detect}()},
\code{\link{stri_startswith}()}
Other search_count:
\code{\link{stri_count_boundaries}()},
\code{\link{stri_count}()}
Other search_locate:
\code{\link{stri_locate_all_boundaries}()},
\code{\link{stri_locate_all}()}
Other search_replace:
\code{\link{stri_replace_all}()},
\code{\link{stri_replace_rstr}()},
\code{\link{stri_trim_both}()}
Other search_split:
\code{\link{stri_split_boundaries}()},
\code{\link{stri_split_lines}()},
\code{\link{stri_split}()}
Other search_subset:
\code{\link{stri_subset}()}
Other search_extract:
\code{\link{stri_extract_all_boundaries}()},
\code{\link{stri_extract_all}()},
\code{\link{stri_match_all}()}
Other stringi_general_topics:
\code{\link{about_arguments}},
\code{\link{about_encoding}},
\code{\link{about_locale}},
\code{\link{about_search_boundaries}},
\code{\link{about_search_charclass}},
\code{\link{about_search_coll}},
\code{\link{about_search_fixed}},
\code{\link{about_search_regex}},
\code{\link{about_stringi}}
}
\concept{search_charclass}
\concept{search_coll}
\concept{search_count}
\concept{search_detect}
\concept{search_extract}
\concept{search_fixed}
\concept{search_in}
\concept{search_locate}
\concept{search_regex}
\concept{search_replace}
\concept{search_split}
\concept{search_subset}
\concept{stringi_general_topics}
\concept{text_boundaries}
\author{
\href{https://www.gagolewski.com/}{Marek Gagolewski} and other contributors
}
stringi/man/stri_enc_fromutf32.Rd 0000644 0001762 0000144 00000003513 14262507664 016526 0 ustar ligges users % Generated by roxygen2: do not edit by hand
% Please edit documentation in R/encoding_conversion.R
\name{stri_enc_fromutf32}
\alias{stri_enc_fromutf32}
\title{Convert From UTF-32}
\usage{
stri_enc_fromutf32(vec)
}
\arguments{
\item{vec}{a list of integer vectors (or objects coercible to such vectors)
or \code{NULL}s. For convenience, a single integer vector can also
be given.}
}
\value{
Returns a character vector (in UTF-8).
\code{NULL}s in the input list are converted to \code{NA_character_}.
}
\description{
This function converts integer vectors,
representing sequences of UTF-32 code points, to UTF-8 strings.
}
\details{
UTF-32 is a 32-bit encoding where each Unicode code point
corresponds to exactly one integer value.
This function is a vectorized version of
\code{\link{intToUtf8}}. As usual in \pkg{stringi},
it returns character strings in UTF-8.
See \code{\link{stri_enc_toutf32}} for a dual operation.
If an ill-defined code point is given, a warning is generated
and the corresponding string is set to \code{NA}.
Note that \code{0}s are not allowed in \code{vec}, as they are used
internally to mark the end of a string (in the C API).
See also \code{\link{stri_encode}} for decoding arbitrary byte sequences
from any given encoding.
}
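\examples{
# a minimal sketch: integer UTF-32 code points to UTF-8 strings and back
stri_enc_fromutf32(list(c(115L, 116L, 114L, 105L), NULL, 322L))
stri_enc_fromutf32(stri_enc_toutf32(c('abc', 'string\u0105')))
}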
\seealso{
The official online manual of \pkg{stringi} at \url{https://stringi.gagolewski.com/}
Gagolewski M., \pkg{stringi}: Fast and portable character string processing in R, \emph{Journal of Statistical Software} 103(2), 2022, 1-59, \doi{10.18637/jss.v103.i02}
Other encoding_conversion:
\code{\link{about_encoding}},
\code{\link{stri_enc_toascii}()},
\code{\link{stri_enc_tonative}()},
\code{\link{stri_enc_toutf32}()},
\code{\link{stri_enc_toutf8}()},
\code{\link{stri_encode}()}
}
\concept{encoding_conversion}
\author{
\href{https://www.gagolewski.com/}{Marek Gagolewski} and other contributors
}
stringi/man/stri_datetime_add.Rd 0000644 0001762 0000144 00000005260 14262507664 016457 0 ustar ligges users % Generated by roxygen2: do not edit by hand
% Please edit documentation in R/time_calendar.R
\name{stri_datetime_add}
\alias{stri_datetime_add}
\alias{stri_datetime_add<-}
\title{Date and Time Arithmetic}
\usage{
stri_datetime_add(
time,
value = 1L,
units = "seconds",
tz = NULL,
locale = NULL
)
stri_datetime_add(time, units = "seconds", tz = NULL, locale = NULL) <- value
}
\arguments{
\item{time}{an object of class \code{\link{POSIXct}}
(\code{as.POSIXct} will be called on character vectors
and objects of class \code{POSIXlt}, \code{Date}, and \code{factor})}
\item{value}{integer vector; signed number of units to add to \code{time}}
\item{units}{single string; one of \code{'years'}, \code{'months'},
\code{'weeks'}, \code{'days'}, \code{'hours'}, \code{'minutes'},
\code{'seconds'}, or \code{'milliseconds'}}
\item{tz}{\code{NULL} or \code{''} for the default time zone
or a single string with a timezone identifier,}
\item{locale}{\code{NULL} or \code{''} for default locale,
or a single string with locale identifier; a non-Gregorian calendar
may be specified by setting the \code{@calendar=name} keyword}
}
\value{
Both functions return an object of class \code{\link{POSIXct}}.
The replacement version of \code{stri_datetime_add} modifies
the state of the \code{time} object.
}
\description{
Modifies a date-time object by adding a specific amount of time units.
}
\details{
Vectorized over \code{time} and \code{value}.
Note that, e.g., January 31 + 1 month = February 28 or 29.
}
\examples{
x <- stri_datetime_now()
stri_datetime_add(x, units='months') <- 2
print(x)
stri_datetime_add(x, -2, units='months')
stri_datetime_add(stri_datetime_create(2014, 4, 20), 1, units='years')
stri_datetime_add(stri_datetime_create(2014, 4, 20), 1, units='years', locale='@calendar=hebrew')
stri_datetime_add(stri_datetime_create(2016, 1, 31), 1, units='months')
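# vectorization over 'value' (a sketch):
stri_datetime_add(stri_datetime_now(), c(1, 7, 30), units='days')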
}
\references{
\emph{Calendar Classes} - ICU User Guide,
\url{https://unicode-org.github.io/icu/userguide/datetime/calendar/}
}
\seealso{
The official online manual of \pkg{stringi} at \url{https://stringi.gagolewski.com/}
Gagolewski M., \pkg{stringi}: Fast and portable character string processing in R, \emph{Journal of Statistical Software} 103(2), 2022, 1-59, \doi{10.18637/jss.v103.i02}
Other datetime:
\code{\link{stri_datetime_create}()},
\code{\link{stri_datetime_fields}()},
\code{\link{stri_datetime_format}()},
\code{\link{stri_datetime_fstr}()},
\code{\link{stri_datetime_now}()},
\code{\link{stri_datetime_symbols}()},
\code{\link{stri_timezone_get}()},
\code{\link{stri_timezone_info}()},
\code{\link{stri_timezone_list}()}
}
\concept{datetime}
\author{
\href{https://www.gagolewski.com/}{Marek Gagolewski} and other contributors
}
stringi/man/stri_stats_latex.Rd 0000644 0001762 0000144 00000003475 14262507664 016414 0 ustar ligges users % Generated by roxygen2: do not edit by hand
% Please edit documentation in R/stats.R
\name{stri_stats_latex}
\alias{stri_stats_latex}
\title{Statistics for a Character Vector Containing LaTeX Commands}
\usage{
stri_stats_latex(str)
}
\arguments{
\item{str}{character vector to be aggregated}
}
\value{
Returns an integer vector with the following named elements:
\enumerate{
\item \code{CharsWord} - number of word characters;
\item \code{CharsCmdEnvir} - command and environment characters;
\item \code{CharsWhite} - LaTeX white spaces, including \{ and \} in some contexts;
\item \code{Words} - number of words;
\item \code{Cmds} - number of commands;
\item \code{Envirs} - number of environments;
\item ... (Other stuff that may appear in future releases of \pkg{stringi}).
}
}
\description{
This function gives LaTeX-oriented statistics for a character vector,
e.g., obtained by loading a text file with the
\code{\link{readLines}} function, where each text line
is represented by a separate string.
}
\details{
We use a slightly modified LaTeX Word Count algorithm implemented in
Kile 2.1.3, see
\url{https://kile.sourceforge.io/team.php} for the original contributors.
}
\examples{
s <- c('Lorem \\\\textbf{ipsum} dolor sit \\\\textit{amet}, consectetur adipisicing elit.',
'\\\\begin{small}Proin nibh augue,\\\\end{small} suscipit a, scelerisque sed, lacinia in, mi.',
'')
stri_stats_latex(s)
}
\seealso{
The official online manual of \pkg{stringi} at \url{https://stringi.gagolewski.com/}
Gagolewski M., \pkg{stringi}: Fast and portable character string processing in R, \emph{Journal of Statistical Software} 103(2), 2022, 1-59, \doi{10.18637/jss.v103.i02}
Other stats:
\code{\link{stri_stats_general}()}
}
\concept{stats}
\author{
\href{https://www.gagolewski.com/}{Marek Gagolewski} and other contributors
}
stringi/man/stri_numbytes.Rd 0000644 0001762 0000144 00000003636 14262507664 015726 0 ustar ligges users % Generated by roxygen2: do not edit by hand
% Please edit documentation in R/length.R
\name{stri_numbytes}
\alias{stri_numbytes}
\title{Count the Number of Bytes}
\usage{
stri_numbytes(str)
}
\arguments{
\item{str}{character vector or an object coercible to one}
}
\value{
Returns an integer vector of the same length as \code{str}.
}
\description{
Counts the number of bytes needed to store
each string in the computer's memory.
}
\details{
Often, this is not the function you would normally use
in your string processing activities. See \code{\link{stri_length}} instead.
For 8-bit encoded strings, this is the same as \code{\link{stri_length}}.
For UTF-8 strings, the returned values may be greater
than the number of code points, as UTF-8 is not a fixed-byte encoding:
one code point may be encoded by 1-4 bytes
(according to the current Unicode standard).
Missing values are handled properly.
The strings do not need to be re-encoded to perform this operation.
The returned values do not include the trailing NUL bytes,
which are used internally to mark the end of string data (in C).
}
\examples{
stri_numbytes(letters)
stri_numbytes(c('abc', '123', '\u0105\u0104'))
\dontrun{
# this used to fail on Windows, where there was no native support
# for 4-byte Unicode characters; see, however, stri_unescape_unicode():
stri_numbytes('\U001F600') # compare stri_length('\U001F600')
}
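# ASCII characters occupy 1 byte each in UTF-8; other code points may
# need more (a sketch):
stri_numbytes(c('a', '\u0105', '\u20ac'))  # 1, 2, and 3 bytes, respectively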
}
\seealso{
The official online manual of \pkg{stringi} at \url{https://stringi.gagolewski.com/}
Gagolewski M., \pkg{stringi}: Fast and portable character string processing in R, \emph{Journal of Statistical Software} 103(2), 2022, 1-59, \doi{10.18637/jss.v103.i02}
Other length:
\code{\link{\%s$\%}()},
\code{\link{stri_isempty}()},
\code{\link{stri_length}()},
\code{\link{stri_pad_both}()},
\code{\link{stri_sprintf}()},
\code{\link{stri_width}()}
}
\concept{length}
\author{
\href{https://www.gagolewski.com/}{Marek Gagolewski} and other contributors
}
stringi/man/stri_datetime_create.Rd 0000644 0001762 0000144 00000004733 14522620415 017163 0 ustar ligges users % Generated by roxygen2: do not edit by hand
% Please edit documentation in R/time_calendar.R
\name{stri_datetime_create}
\alias{stri_datetime_create}
\title{Create a Date-Time Object}
\usage{
stri_datetime_create(
year = NULL,
month = NULL,
day = NULL,
hour = 0L,
minute = 0L,
second = 0,
lenient = FALSE,
tz = NULL,
locale = NULL
)
}
\arguments{
\item{year}{integer vector; 0 is 1BCE, -1 is 2BCE, etc.;
\code{NULL} for the current year}
\item{month}{integer vector; months are 1-based;
\code{NULL} for the current month}
\item{day}{integer vector;
\code{NULL} for the current day}
\item{hour}{integer vector;
\code{NULL} for the current hour}
\item{minute}{integer vector;
\code{NULL} for the current minute}
\item{second}{numeric vector; fractional seconds are allowed;
\code{NULL} for the current seconds (without milliseconds)}
\item{lenient}{single logical value; should the operation be lenient?}
\item{tz}{\code{NULL} or \code{''} for the default time zone or
a single string with time zone identifier, see \code{\link{stri_timezone_list}}}
\item{locale}{\code{NULL} or \code{''} for default locale,
or a single string with locale identifier; a non-Gregorian calendar
may be specified by setting \code{@calendar=name} keyword}
}
\value{
Returns an object of class \code{\link{POSIXct}}.
}
\description{
Constructs date-time objects from numeric representations.
}
\details{
Vectorized over \code{year}, \code{month}, \code{day}, \code{hour},
\code{minute}, and \code{second}.
}
\examples{
stri_datetime_create(2015, 12, 31, 23, 59, 59.999)
stri_datetime_create(5775, 8, 1, locale='@calendar=hebrew') # 1 Nisan 5775 -> 2015-03-21
stri_datetime_create(2015, 02, 29)
stri_datetime_create(2015, 02, 29, lenient=TRUE)
stri_datetime_create(hour=15, minute=59)
}
\seealso{
The official online manual of \pkg{stringi} at \url{https://stringi.gagolewski.com/}
Gagolewski M., \pkg{stringi}: Fast and portable character string processing in R, \emph{Journal of Statistical Software} 103(2), 2022, 1-59, \doi{10.18637/jss.v103.i02}
Other datetime:
\code{\link{stri_datetime_add}()},
\code{\link{stri_datetime_fields}()},
\code{\link{stri_datetime_format}()},
\code{\link{stri_datetime_fstr}()},
\code{\link{stri_datetime_now}()},
\code{\link{stri_datetime_symbols}()},
\code{\link{stri_timezone_get}()},
\code{\link{stri_timezone_info}()},
\code{\link{stri_timezone_list}()}
}
\concept{datetime}
\author{
\href{https://www.gagolewski.com/}{Marek Gagolewski} and other contributors
}
stringi/man/stri_sub_all.Rd 0000644 0001762 0000144 00000010546 14262507664 015477 0 ustar ligges users % Generated by roxygen2: do not edit by hand
% Please edit documentation in R/sub.R
\name{stri_sub_all}
\alias{stri_sub_all}
\alias{stri_sub_all<-}
\alias{stri_sub_replace_all}
\alias{stri_sub_all_replace}
\title{Extract or Replace Multiple Substrings}
\usage{
stri_sub_all(
str,
from = list(1L),
to = list(-1L),
length,
use_matrix = TRUE,
ignore_negative_length = TRUE
)
stri_sub_all(
str,
from = list(1L),
to = list(-1L),
length,
omit_na = FALSE,
use_matrix = TRUE
) <- value
stri_sub_replace_all(..., replacement, value = replacement)
stri_sub_all_replace(..., replacement, value = replacement)
}
\arguments{
\item{str}{character vector}
\item{from}{list of integer vectors giving the start indexes; alternatively,
if \code{use_matrix=TRUE}, a list of two-column matrices of type
\code{cbind(from, to)}
(unnamed columns or the 2nd column named other than \code{length})
or \code{cbind(from, length=length)} (2nd column named \code{length})}
\item{to}{list of integer vectors giving the end indexes}
\item{length}{list of integer vectors giving the substring lengths}
\item{use_matrix}{single logical value; see \code{from}}
\item{ignore_negative_length}{single logical value; whether
negative lengths should be ignored or result in missing values}
\item{omit_na}{single logical value; indicates whether missing values
in any of the indexes or in \code{value} leave the part of the
corresponding input string
unchanged [replacement function only]}
\item{value}{a list of character vectors defining the replacement strings
[replacement function only]}
\item{...}{arguments to be passed to \code{stri_sub_all<-}}
\item{replacement}{alias of \code{value} [wherever applicable]}
}
\value{
\code{stri_sub_all} returns a list of character vectors.
Its replacement versions modify the input 'in-place'.
}
\description{
\code{stri_sub_all} extracts multiple substrings from each string.
Its replacement version substitutes (in-place) multiple substrings with the
corresponding replacement strings.
\code{stri_sub_replace_all} (alias \code{stri_sub_all_replace})
is its forward pipe operator-friendly variant, returning
a copy of the input vector.
For extracting/replacing single substrings from/within each string, see
\code{\link{stri_sub}}.
}
\details{
Vectorized over \code{str}, [\code{value}], \code{from} and
(\code{to} or \code{length}). Just like in \code{\link{stri_sub}}, parameters
\code{to} and \code{length} are mutually exclusive.
In one of the simplest scenarios, \code{stri_sub_all(str, from, to)},
the i-th element of the resulting list
is generated like \code{stri_sub(str[i], from[[i]], to[[i]])}.
As usual, if one of the inputs is shorter than the others,
the recycling rule is applied.
If any of \code{from}, \code{to}, \code{length},
or \code{value} is not a list,
it is wrapped into a list.
If \code{from} consists of a two-column matrix, then these two columns are
used as \code{from} and \code{to}, respectively,
unless the second column is named \code{length}.
Such types of index matrices are generated by
\code{\link{stri_locate_all}}.
If extraction or replacement based on \code{\link{stri_locate_first}}
or \code{\link{stri_locate_last}} is needed, see \code{\link{stri_sub}}.
In the replacement function, the index ranges must be sorted
with respect to \code{from} and must be mutually disjoint.
A negative \code{length} does not result in any alteration of the
corresponding input string in the replacement function.
In \code{stri_sub_all}, on the other hand, it makes the corresponding
chunk be ignored; see \code{ignore_negative_length}, though.
}
\examples{
x <- c('12 3456 789', 'abc', '', NA, '667')
stri_sub_all(x, stri_locate_all_regex(x, '[0-9]+')) # see stri_extract_all
stri_sub_all(x, stri_locate_all_regex(x, '[0-9]+', omit_no_match=TRUE))
stri_sub_all(x, stri_locate_all_regex(x, '[0-9]+', omit_no_match=TRUE)) <- '***'
print(x)
stri_sub_replace_all('a b c', c(1, 3, 5), c(1, 3, 5), replacement=c('A', 'B', 'C'))
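# from/length may also be given as lists directly (a small sketch):
stri_sub_all(c('abcde', 'xyz'), from=list(c(1, 4), 2), length=list(c(2, 2), 1))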
}
\seealso{
The official online manual of \pkg{stringi} at \url{https://stringi.gagolewski.com/}
Gagolewski M., \pkg{stringi}: Fast and portable character string processing in R, \emph{Journal of Statistical Software} 103(2), 2022, 1-59, \doi{10.18637/jss.v103.i02}
Other indexing:
\code{\link{stri_locate_all_boundaries}()},
\code{\link{stri_locate_all}()},
\code{\link{stri_sub}()}
}
\concept{indexing}
\author{
\href{https://www.gagolewski.com/}{Marek Gagolewski} and other contributors
}
stringi/man/stri_flatten.Rd 0000644 0001762 0000144 00000003632 14262507664 015511 0 ustar ligges users % Generated by roxygen2: do not edit by hand
% Please edit documentation in R/join.R
\name{stri_flatten}
\alias{stri_flatten}
\title{Flatten a String}
\usage{
stri_flatten(str, collapse = "", na_empty = FALSE, omit_empty = FALSE)
}
\arguments{
\item{str}{a vector of strings to be coerced to character}
\item{collapse}{a single string denoting the separator}
\item{na_empty}{single logical value; should missing values
in \code{str} be treated as empty strings (\code{TRUE})
or be omitted altogether (\code{NA})?}
\item{omit_empty}{single logical value; should empty strings
in \code{str} be omitted?}
}
\value{
Returns a single string, i.e., a character
vector of length 1.
}
\description{
Joins the elements of a character vector into one string.
}
\details{
The \code{stri_flatten(str, collapse='XXX')} call
is equivalent to \code{\link{paste}(str, collapse='XXX', sep='')}.
If you wish to use some more fancy (e.g., differing)
separators between flattened strings,
call \code{\link{stri_join}(str, separators, collapse='')}.
If \code{str} is not empty, then a single string is returned.
If \code{collapse} has length > 1, then only the first string
will be used.
}
\examples{
stri_flatten(LETTERS)
stri_flatten(LETTERS, collapse=',')
stri_flatten(stri_dup(letters[1:6], 1:3))
stri_flatten(c(NA, '', 'A', '', 'B', NA, 'C'), collapse=',', na_empty=TRUE, omit_empty=TRUE)
stri_flatten(c(NA, '', 'A', '', 'B', NA, 'C'), collapse=',', na_empty=NA)
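# differing separators between the flattened strings, as mentioned in
# Details (a sketch):
stri_join(LETTERS[1:4], c('-', '-', '/', ''), collapse='')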
}
\seealso{
The official online manual of \pkg{stringi} at \url{https://stringi.gagolewski.com/}
Gagolewski M., \pkg{stringi}: Fast and portable character string processing in R, \emph{Journal of Statistical Software} 103(2), 2022, 1-59, \doi{10.18637/jss.v103.i02}
Other join:
\code{\link{\%s+\%}()},
\code{\link{stri_dup}()},
\code{\link{stri_join_list}()},
\code{\link{stri_join}()}
}
\concept{join}
\author{
\href{https://www.gagolewski.com/}{Marek Gagolewski} and other contributors
}
stringi/man/about_locale.Rd 0000644 0001762 0000144 00000013334 14523017450 015431 0 ustar ligges users % Generated by roxygen2: do not edit by hand
% Please edit documentation in R/locale.R
\name{about_locale}
\alias{about_locale}
\alias{locale}
\alias{stringi-locale}
\title{Locales and \pkg{stringi}}
\description{
In this section we explain how we specify locales in \pkg{stringi}.
Locale is a fundamental concept in \pkg{ICU}.
It identifies a specific user community, i.e., a group of users
who have similar culture and language expectations
for human-computer interaction.
}
\details{
Because a locale is just an identifier of a region,
no validity check is performed when you specify a Locale.
\pkg{ICU} is implemented as a set of services.
If you want to verify whether particular resources are available
in the locale you asked for, you must query those resources.
Note: when you ask for a resource for a particular locale, you get back
the best available match, not necessarily precisely the one you requested.
}
\section{Locale Identifiers}{
\pkg{ICU} services are parametrized by locale,
to deliver culturally correct results.
Locales are identified by character strings
of the form \code{Language} code,
\code{Language_Country} code, or \code{Language_Country_Variant}
code, e.g., 'en_US'.
The two-letter \code{Language} code uses the ISO-639-1 standard,
e.g., 'en' stands for English, 'pl' -- Polish, 'fr' -- French,
and 'de' for German.
\code{Country} is a two-letter code following the ISO-3166 standard.
This is to reflect different language conventions within the same language,
for example in US-English ('en_US') and Australian-English ('en_AU').
Differences may also appear in language conventions used within
the same country. For example, the Euro currency may be used in several European
countries while the individual country's currency is still in circulation.
In such a case, \pkg{ICU} \code{Variant} '_EURO' could be used for selecting
locales that support the Euro currency.
The final (optional) element of a locale is a list of
keywords together with their values. Keywords must be unique.
Their order is not significant. Unknown keywords are ignored.
The handling of keywords depends on the specific services that
utilize them. Currently, the following keywords are recognized:
\code{calendar}, \code{collation}, \code{currency}, and \code{numbers},
e.g., \code{fr@collation=phonebook;}\code{calendar=islamic-civil} is a valid
French locale specifier together with keyword arguments. For
more information, refer to the ICU user guide.
For a list of locales that are recognized by \pkg{ICU},
call \code{\link{stri_locale_list}}.
Note that in \pkg{stringi}, 'C' is a synonym of 'en_US_POSIX'.
}
\section{A Note on Default Locales}{
Each locale-sensitive function in \pkg{stringi}
selects the current default locale if an empty string or \code{NULL}
is provided as its \code{locale} argument. Default locales are available
to all the functions; initially, the system locale on that platform is used,
but it may be changed by calling \code{\link{stri_locale_set}}.
Your program should avoid changing the default locale.
All locale-sensitive functions may request
any desired locale per-call (by specifying the \code{locale} argument),
i.e., without referencing to the default locale.
During many tests, however, we did not observe any improper
behavior of \pkg{stringi} while using a modified default locale.
}
\section{Locale-Sensitive Functions in \pkg{stringi}}{
One of many examples of locale-dependent services is the Collator, which
performs a locale-aware string comparison. It is used for string comparing,
ordering, sorting, and searching. See \code{\link{stri_opts_collator}}
for the description on how to tune its settings, and its \code{locale}
argument in particular.
When choosing a resource bundle that is not available in the explicitly
requested locale (but not when using the default locale)
nor in its more general variants (e.g., `es_ES` vs `es`),
a warning is emitted.
Other locale-sensitive functions include, e.g.,
\code{\link{stri_trans_tolower}} (that does character case mapping).
}
\references{
\emph{Locale} -- ICU User Guide,
\url{https://unicode-org.github.io/icu/userguide/locale/}
\emph{ISO 639: Language Codes},
\url{https://www.iso.org/iso-639-language-codes.html}
\emph{ISO 3166: Country Codes},
\url{https://www.iso.org/iso-3166-country-codes.html}
}
\seealso{
The official online manual of \pkg{stringi} at \url{https://stringi.gagolewski.com/}
Gagolewski M., \pkg{stringi}: Fast and portable character string processing in R, \emph{Journal of Statistical Software} 103(2), 2022, 1-59, \doi{10.18637/jss.v103.i02}
Other locale_management:
\code{\link{stri_locale_info}()},
\code{\link{stri_locale_list}()},
\code{\link{stri_locale_set}()}
Other locale_sensitive:
\code{\link{\%s<\%}()},
\code{\link{about_search_boundaries}},
\code{\link{about_search_coll}},
\code{\link{stri_compare}()},
\code{\link{stri_count_boundaries}()},
\code{\link{stri_duplicated}()},
\code{\link{stri_enc_detect2}()},
\code{\link{stri_extract_all_boundaries}()},
\code{\link{stri_locate_all_boundaries}()},
\code{\link{stri_opts_collator}()},
\code{\link{stri_order}()},
\code{\link{stri_rank}()},
\code{\link{stri_sort_key}()},
\code{\link{stri_sort}()},
\code{\link{stri_split_boundaries}()},
\code{\link{stri_trans_tolower}()},
\code{\link{stri_unique}()},
\code{\link{stri_wrap}()}
Other stringi_general_topics:
\code{\link{about_arguments}},
\code{\link{about_encoding}},
\code{\link{about_search_boundaries}},
\code{\link{about_search_charclass}},
\code{\link{about_search_coll}},
\code{\link{about_search_fixed}},
\code{\link{about_search_regex}},
\code{\link{about_search}},
\code{\link{about_stringi}}
}
\concept{locale_management}
\concept{locale_sensitive}
\concept{stringi_general_topics}
\author{
\href{https://www.gagolewski.com/}{Marek Gagolewski} and other contributors
}
stringi/man/stri_opts_brkiter.Rd 0000644 0001762 0000144 00000007036 14523017034 016550 0 ustar ligges users % Generated by roxygen2: do not edit by hand
% Please edit documentation in R/opts.R
\name{stri_opts_brkiter}
\alias{stri_opts_brkiter}
\title{Generate a List with BreakIterator Settings}
\usage{
stri_opts_brkiter(
type,
locale,
skip_word_none,
skip_word_number,
skip_word_letter,
skip_word_kana,
skip_word_ideo,
skip_line_soft,
skip_line_hard,
skip_sentence_term,
skip_sentence_sep
)
}
\arguments{
\item{type}{single string; either the break iterator type, one of \code{character},
\code{line_break}, \code{sentence}, \code{word},
or a custom set of ICU break iteration rules;
see \link{stringi-search-boundaries}}
\item{locale}{single string, \code{NULL} or \code{''} for default locale}
\item{skip_word_none}{logical; perform no action for 'words' that
do not fit into any other categories}
\item{skip_word_number}{logical; perform no action for words that
appear to be numbers}
\item{skip_word_letter}{logical; perform no action for words that
contain letters, excluding hiragana, katakana, or ideographic characters}
\item{skip_word_kana}{logical; perform no action for words
containing kana characters}
\item{skip_word_ideo}{logical; perform no action for words
containing ideographic characters}
\item{skip_line_soft}{logical; perform no action for soft line breaks,
i.e., positions where a line break is acceptable but not required}
\item{skip_line_hard}{logical; perform no action for hard,
or mandatory line breaks}
\item{skip_sentence_term}{logical; perform no action for sentences
ending with a sentence terminator ('\code{.}', '\code{,}', '\code{?}',
'\code{!}'), possibly followed by a hard separator
(\code{CR}, \code{LF}, \code{PS}, etc.)}
\item{skip_sentence_sep}{logical; perform no action for sentences
that do not contain an ending sentence terminator, but are ended
by a hard separator or end of input}
}
\value{
Returns a named list object.
Omitted \code{skip_*} values act as if they had been set to \code{FALSE}.
}
\description{
A convenience function to tune the \pkg{ICU} \code{BreakIterator}'s behavior
in some text boundary analysis functions, see
\link{stringi-search-boundaries}.
}
\details{
The \code{skip_*} family of settings may be used to prevent performing
any special actions on particular types of text boundaries, e.g.,
in case of the \code{\link{stri_locate_all_boundaries}} and
\code{\link{stri_split_boundaries}} functions.
Note that custom break iterator rules (advanced users only)
should be specified as a single string.
For a detailed description of the syntax of RBBI rules, please refer
to the ICU User Guide on Boundary Analysis.
}
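\examples{
# a minimal sketch: count words, skipping numbers and non-word boundaries
stri_count_boundaries('42 apples and 7 oranges',
    opts_brkiter=stri_opts_brkiter(type='word',
        skip_word_none=TRUE, skip_word_number=TRUE))
}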
\references{
\emph{\code{ubrk.h} File Reference} -- ICU4C API Documentation,
\url{https://unicode-org.github.io/icu-docs/apidoc/dev/icu4c/ubrk_8h.html}
\emph{Boundary Analysis} -- ICU User Guide,
\url{https://unicode-org.github.io/icu/userguide/boundaryanalysis/}
}
\seealso{
The official online manual of \pkg{stringi} at \url{https://stringi.gagolewski.com/}
Gagolewski M., \pkg{stringi}: Fast and portable character string processing in R, \emph{Journal of Statistical Software} 103(2), 2022, 1-59, \doi{10.18637/jss.v103.i02}
Other text_boundaries:
\code{\link{about_search_boundaries}},
\code{\link{about_search}},
\code{\link{stri_count_boundaries}()},
\code{\link{stri_extract_all_boundaries}()},
\code{\link{stri_locate_all_boundaries}()},
\code{\link{stri_split_boundaries}()},
\code{\link{stri_split_lines}()},
\code{\link{stri_trans_tolower}()},
\code{\link{stri_wrap}()}
}
\concept{text_boundaries}
\author{
\href{https://www.gagolewski.com/}{Marek Gagolewski} and other contributors
}
stringi/man/stri_subset.Rd 0000644 0001762 0000144 00000007712 14262507664 015364 0 ustar ligges users % Generated by roxygen2: do not edit by hand
% Please edit documentation in R/search_subset_4.R
\name{stri_subset}
\alias{stri_subset}
\alias{stri_subset<-}
\alias{stri_subset_fixed}
\alias{stri_subset_fixed<-}
\alias{stri_subset_charclass}
\alias{stri_subset_charclass<-}
\alias{stri_subset_coll}
\alias{stri_subset_coll<-}
\alias{stri_subset_regex}
\alias{stri_subset_regex<-}
\title{Select Elements that Match a Given Pattern}
\usage{
stri_subset(str, ..., regex, fixed, coll, charclass)
stri_subset(str, ..., regex, fixed, coll, charclass) <- value
stri_subset_fixed(
str,
pattern,
omit_na = FALSE,
negate = FALSE,
...,
opts_fixed = NULL
)
stri_subset_fixed(str, pattern, negate=FALSE, ..., opts_fixed=NULL) <- value
stri_subset_charclass(str, pattern, omit_na = FALSE, negate = FALSE)
stri_subset_charclass(str, pattern, negate=FALSE) <- value
stri_subset_coll(
str,
pattern,
omit_na = FALSE,
negate = FALSE,
...,
opts_collator = NULL
)
stri_subset_coll(str, pattern, negate=FALSE, ..., opts_collator=NULL) <- value
stri_subset_regex(
str,
pattern,
omit_na = FALSE,
negate = FALSE,
...,
opts_regex = NULL
)
stri_subset_regex(str, pattern, negate=FALSE, ..., opts_regex=NULL) <- value
}
\arguments{
\item{str}{character vector; strings to search within}
\item{...}{supplementary arguments passed to the underlying functions,
including additional settings for \code{opts_collator}, \code{opts_regex},
\code{opts_fixed}, and so on}
\item{value}{non-empty character vector of replacement strings;
replacement function only}
\item{pattern, regex, fixed, coll, charclass}{character vector;
search patterns (no more than the length of \code{str});
for more details refer to \link{stringi-search}}
\item{omit_na}{single logical value; should missing values be excluded
from the result?}
\item{negate}{single logical value; whether elements that do not match the pattern are of interest instead}
\item{opts_collator, opts_fixed, opts_regex}{a named list used to tune up
the search engine's settings; see
\code{\link{stri_opts_collator}}, \code{\link{stri_opts_fixed}},
and \code{\link{stri_opts_regex}}, respectively; \code{NULL}
for the defaults}
}
\value{
The \code{stri_subset_*} functions return a character vector.
As usual, the output encoding is UTF-8.
The \code{stri_subset_*<-} functions modify \code{str} 'in-place'.
}
\description{
These functions return or modify a sub-vector where there is a match to
a given pattern. In other words, they
are roughly equivalent (but faster and easier to use) to a call to
\code{str[\link{stri_detect}(str, ...)]} or
\code{str[\link{stri_detect}(str, ...)] <- value}.
}
\details{
Vectorized over \code{str} as well as partially over \code{pattern}
and \code{value},
with recycling of the elements in the shorter vector if necessary.
As the aim here is to subset \code{str}, \code{pattern}
cannot be longer than the former. Moreover, if the number of
items to replace is not a multiple of the length of \code{value},
a warning is emitted and the unused elements are ignored.
Hence, the length of the output will be the same as the length of \code{str}.
\code{stri_subset} and \code{stri_subset<-} are convenience functions.
They call either \code{stri_subset_regex},
\code{stri_subset_fixed}, \code{stri_subset_coll},
or \code{stri_subset_charclass},
depending on the argument used.
}
\examples{
stri_subset_regex(c('stringi R', '123', 'ID456', ''), '^[0-9]+$')
x <- c('stringi R', '123', 'ID456', '')
`stri_subset_regex<-`(x, '[0-9]+$', negate=TRUE, value=NA) # returns a copy
stri_subset_regex(x, '[0-9]+$') <- NA # modifies `x` in-place
print(x)
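# omit_na and negate, as described above (a brief illustrative sketch):
stri_subset_fixed(c('spam and eggs', NA, 'bacon'), 'spam', omit_na=TRUE)
stri_subset_fixed(c('spam and eggs', NA, 'bacon'), 'spam', negate=TRUE)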
}
\seealso{
The official online manual of \pkg{stringi} at \url{https://stringi.gagolewski.com/}
Gagolewski M., \pkg{stringi}: Fast and portable character string processing in R, \emph{Journal of Statistical Software} 103(2), 2022, 1-59, \doi{10.18637/jss.v103.i02}
Other search_subset:
\code{\link{about_search}}
}
\concept{search_subset}
\author{
\href{https://www.gagolewski.com/}{Marek Gagolewski} and other contributors
}
stringi/man/stri_order.Rd 0000644 0001762 0000144 00000006210 14365436502 015156 0 ustar ligges users % Generated by roxygen2: do not edit by hand
% Please edit documentation in R/sort.R
\name{stri_order}
\alias{stri_order}
\title{Ordering Permutation}
\usage{
stri_order(str, decreasing = FALSE, na_last = TRUE, ..., opts_collator = NULL)
}
\arguments{
\item{str}{a character vector}
\item{decreasing}{a single logical value; should the sort order
be nondecreasing (\code{FALSE}, default)
or nonincreasing (\code{TRUE})?}
\item{na_last}{a single logical value; controls the treatment of \code{NA}s
in \code{str}. If \code{TRUE}, then missing values in \code{str} are put
at the end; if \code{FALSE}, they are put at the beginning;
if \code{NA}, then they are removed from the output}
\item{...}{additional settings for \code{opts_collator}}
\item{opts_collator}{a named list with \pkg{ICU} Collator's options,
see \code{\link{stri_opts_collator}}, \code{NULL}
for default collation options}
}
\value{
The function yields an integer vector that gives the sort order.
}
\description{
This function finds a permutation which rearranges the
strings in a given character vector into the ascending or descending
locale-dependent lexicographic order.
}
\details{
For more information on \pkg{ICU}'s Collator and how to tune it up
in \pkg{stringi}, refer to \code{\link{stri_opts_collator}}.
As usual in \pkg{stringi}, non-character inputs are coerced to strings,
see an example below for a somewhat non-intuitive behavior of lexicographic
sorting on numeric inputs.
This function uses a stable sort algorithm (\pkg{STL}'s \code{stable_sort}),
which performs up to \eqn{N*log^2(N)} element comparisons,
where \eqn{N} is the length of \code{str}.
For ordering with regards to multiple criteria (such as sorting
data frames by more than 1 column), see \code{\link{stri_rank}}.
}
\examples{
stri_order(c('hladny', 'chladny'), locale='pl_PL')
stri_order(c('hladny', 'chladny'), locale='sk_SK')
stri_order(c(1, 100, 2, 101, 11, 10)) # lexicographic order
stri_order(c(1, 100, 2, 101, 11, 10), numeric=TRUE) # OK for integers
stri_order(c(0.25, 0.5, 1, -1, -2, -3), numeric=TRUE) # incorrect
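# a brief sketch: use the ordering permutation to rearrange the vector itself
# (cf. stri_sort):
y <- c('spam', 'bacon', 'eggs')
y[stri_order(y)]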
}
\references{
\emph{Collation} - ICU User Guide,
\url{https://unicode-org.github.io/icu/userguide/collation/}
}
\seealso{
The official online manual of \pkg{stringi} at \url{https://stringi.gagolewski.com/}
Gagolewski M., \pkg{stringi}: Fast and portable character string processing in R, \emph{Journal of Statistical Software} 103(2), 2022, 1-59, \doi{10.18637/jss.v103.i02}
Other locale_sensitive:
\code{\link{\%s<\%}()},
\code{\link{about_locale}},
\code{\link{about_search_boundaries}},
\code{\link{about_search_coll}},
\code{\link{stri_compare}()},
\code{\link{stri_count_boundaries}()},
\code{\link{stri_duplicated}()},
\code{\link{stri_enc_detect2}()},
\code{\link{stri_extract_all_boundaries}()},
\code{\link{stri_locate_all_boundaries}()},
\code{\link{stri_opts_collator}()},
\code{\link{stri_rank}()},
\code{\link{stri_sort_key}()},
\code{\link{stri_sort}()},
\code{\link{stri_split_boundaries}()},
\code{\link{stri_trans_tolower}()},
\code{\link{stri_unique}()},
\code{\link{stri_wrap}()}
}
\concept{locale_sensitive}
\author{
\href{https://www.gagolewski.com/}{Marek Gagolewski} and other contributors
}
stringi/man/stri_trans_casemap.Rd 0000644 0001762 0000144 00000010606 14262507664 016673 0 ustar ligges users % Generated by roxygen2: do not edit by hand
% Please edit documentation in R/trans_casemap.R
\name{stri_trans_tolower}
\alias{stri_trans_tolower}
\alias{stri_trans_toupper}
\alias{stri_trans_casefold}
\alias{stri_trans_totitle}
\title{Transform Strings with Case Mapping or Folding}
\usage{
stri_trans_tolower(str, locale = NULL)
stri_trans_toupper(str, locale = NULL)
stri_trans_casefold(str)
stri_trans_totitle(str, ..., opts_brkiter = NULL)
}
\arguments{
\item{str}{character vector}
\item{locale}{\code{NULL} or \code{''} for case mapping following
the conventions of the default locale, or a single string with
locale identifier, see \link{stringi-locale}.}
\item{...}{additional settings for \code{opts_brkiter}}
\item{opts_brkiter}{a named list with \pkg{ICU} BreakIterator's settings,
see \code{\link{stri_opts_brkiter}};
\code{NULL} for default break iterator, i.e., \code{word};
\code{stri_trans_totitle} only}
}
\value{
Each function returns a character vector.
}
\description{
These functions transform strings either to lower case,
UPPER CASE, or Title Case or perform case folding.
}
\details{
Vectorized over \code{str}.
\pkg{ICU} implements full Unicode string case mappings. It is
worth noting that, generally, case mapping:
\itemize{
\item can change the number of code points and/or code units
of a string,
\item is language-sensitive (results may differ depending on the locale), and
\item is context-sensitive (a character in the input string may map
differently depending on surrounding characters).
}
With \code{stri_trans_totitle}, if \code{word} \code{BreakIterator}
is used (the default), then the first letter of each word will be capitalized
and the rest will be transformed to lower case.
With the break iterator of type \code{sentence}, only the first letter
of each sentence will be capitalized.
Note that according to the \pkg{ICU} User Guide,
the string \code{'one. two. three.'} consists of one sentence.
Case folding, on the other hand, is locale-independent.
Its purpose is to make two pieces of text that differ only in case identical.
This may come in handy when comparing strings.
For more general (but not locale-dependent)
text transforms refer to \code{\link{stri_trans_general}}.
}
\examples{
stri_trans_toupper('\u00DF', 'de_DE') # small German Eszett / scharfes S
stri_cmp_eq(stri_trans_toupper('i', 'en_US'), stri_trans_toupper('i', 'tr_TR'))
stri_trans_toupper(c('abc', '123', '\u0105\u0104'))
stri_trans_tolower(c('AbC', '123', '\u0105\u0104'))
stri_trans_totitle(c('AbC', '123', '\u0105\u0104'))
stri_trans_casefold(c('AbC', '123', '\u0105\u0104'))
stri_trans_totitle('stringi is a FREE R pAcKaGe. WItH NO StrinGS attached.') # word boundary
stri_trans_totitle('stringi is a FREE R pAcKaGe. WItH NO StrinGS attached.', type='sentence')
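# a sketch: the same sentence-wise title casing, passing the options list
# explicitly via opts_brkiter (see the Arguments section):
stri_trans_totitle('stringi is a FREE R pAcKaGe. WItH NO StrinGS attached.',
    opts_brkiter=stri_opts_brkiter(type='sentence'))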
}
\references{
\emph{Case Mappings} -- ICU User Guide,
\url{https://unicode-org.github.io/icu/userguide/transforms/casemappings.html}
}
\seealso{
The official online manual of \pkg{stringi} at \url{https://stringi.gagolewski.com/}
Gagolewski M., \pkg{stringi}: Fast and portable character string processing in R, \emph{Journal of Statistical Software} 103(2), 2022, 1-59, \doi{10.18637/jss.v103.i02}
Other locale_sensitive:
\code{\link{\%s<\%}()},
\code{\link{about_locale}},
\code{\link{about_search_boundaries}},
\code{\link{about_search_coll}},
\code{\link{stri_compare}()},
\code{\link{stri_count_boundaries}()},
\code{\link{stri_duplicated}()},
\code{\link{stri_enc_detect2}()},
\code{\link{stri_extract_all_boundaries}()},
\code{\link{stri_locate_all_boundaries}()},
\code{\link{stri_opts_collator}()},
\code{\link{stri_order}()},
\code{\link{stri_rank}()},
\code{\link{stri_sort_key}()},
\code{\link{stri_sort}()},
\code{\link{stri_split_boundaries}()},
\code{\link{stri_unique}()},
\code{\link{stri_wrap}()}
Other transform:
\code{\link{stri_trans_char}()},
\code{\link{stri_trans_general}()},
\code{\link{stri_trans_list}()},
\code{\link{stri_trans_nfc}()}
Other text_boundaries:
\code{\link{about_search_boundaries}},
\code{\link{about_search}},
\code{\link{stri_count_boundaries}()},
\code{\link{stri_extract_all_boundaries}()},
\code{\link{stri_locate_all_boundaries}()},
\code{\link{stri_opts_brkiter}()},
\code{\link{stri_split_boundaries}()},
\code{\link{stri_split_lines}()},
\code{\link{stri_wrap}()}
}
\concept{locale_sensitive}
\concept{text_boundaries}
\concept{transform}
\author{
\href{https://www.gagolewski.com/}{Marek Gagolewski} and other contributors
}
stringi/man/stri_enc_toascii.Rd 0000644 0001762 0000144 00000003124 14262507664 016330 0 ustar ligges users % Generated by roxygen2: do not edit by hand
% Please edit documentation in R/encoding_conversion.R
\name{stri_enc_toascii}
\alias{stri_enc_toascii}
\title{Convert To ASCII}
\usage{
stri_enc_toascii(str)
}
\arguments{
\item{str}{a character vector to be converted}
}
\value{
Returns a character vector.
}
\description{
This function converts input strings to ASCII,
i.e., to character strings consisting of bytes not greater than 127.
}
\details{
All code points greater than 127 are replaced with the ASCII SUBSTITUTE
CHARACTER (0x1A).
\R encoding declarations are always used to determine
which encoding is assumed for each input, see \code{\link{stri_enc_mark}}.
If ill-formed byte sequences are found in UTF-8 byte
streams, a warning is generated.
A \code{bytes}-marked string is assumed to be in an 8-bit encoding
extending the ASCII map (a common assumption in \R itself).
Note that the SUBSTITUTE CHARACTER (\code{\\x1a == \\032}) may be interpreted
as the ASCII missing value for single characters.
}
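\examples{
# a brief illustration; per the Details, non-ASCII code points are replaced
# with the SUBSTITUTE character:
stri_enc_toascii(c('spam', '123', '\u0105\u0104'))
}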
\seealso{
The official online manual of \pkg{stringi} at \url{https://stringi.gagolewski.com/}
Gagolewski M., \pkg{stringi}: Fast and portable character string processing in R, \emph{Journal of Statistical Software} 103(2), 2022, 1-59, \doi{10.18637/jss.v103.i02}
Other encoding_conversion:
\code{\link{about_encoding}},
\code{\link{stri_enc_fromutf32}()},
\code{\link{stri_enc_tonative}()},
\code{\link{stri_enc_toutf32}()},
\code{\link{stri_enc_toutf8}()},
\code{\link{stri_encode}()}
}
\concept{encoding_conversion}
\author{
\href{https://www.gagolewski.com/}{Marek Gagolewski} and other contributors
}
stringi/man/stri_opts_regex.Rd 0000644 0001762 0000144 00000010050 14523017034 016206 0 ustar ligges users % Generated by roxygen2: do not edit by hand
% Please edit documentation in R/opts.R
\name{stri_opts_regex}
\alias{stri_opts_regex}
\title{Generate a List with Regex Matcher Settings}
\usage{
stri_opts_regex(
case_insensitive,
comments,
dotall,
dot_all = dotall,
literal,
multiline,
multi_line = multiline,
unix_lines,
uword,
error_on_unknown_escapes,
time_limit = 0L,
stack_limit = 0L
)
}
\arguments{
\item{case_insensitive}{logical; enables case insensitive matching [regex flag \code{(?i)}]}
\item{comments}{logical; allows white space and comments within patterns [regex flag \code{(?x)}]}
\item{dotall}{logical; if set, `\code{.}` matches line terminators,
otherwise matching of `\code{.}` stops at a line end [regex flag \code{(?s)}]}
\item{dot_all}{alias of \code{dotall}}
\item{literal}{logical; if set, treat the entire pattern as a literal string:
metacharacters or escape sequences in the input sequence will be given no special meaning;
note that in most cases you would rather use the \link{stringi-search-fixed}
facilities in this case}
\item{multiline}{logical; controls the behavior of `\code{$}` and `\code{^}`.
If set, recognize line terminators within a string, otherwise,
match only at start and end of input string [regex flag \code{(?m)}]}
\item{multi_line}{alias of \code{multiline}}
\item{unix_lines}{logical; Unix-only line endings;
when enabled, only \code{U+000a} is recognized as a
line ending by `\code{.}`, `\code{$}`, and `\code{^}`.}
\item{uword}{logical; Unicode word boundaries;
if set, uses the Unicode TR 29 definition of word boundaries;
warning: Unicode word boundaries are quite different from traditional
regex word boundaries. [regex flag \code{(?w)}]
See \url{https://unicode.org/reports/tr29/#Word_Boundaries}}
\item{error_on_unknown_escapes}{logical;
whether to generate an error on unrecognized backslash escapes;
if set, fail with an error on patterns that contain backslash-escaped ASCII
letters without a known special meaning;
otherwise, these escaped letters represent themselves}
\item{time_limit}{integer; processing time limit, in ~milliseconds (but not precisely so,
depends on the CPU speed), for match operations;
setting a limit is desirable if poorly written regexes are expected on input;
0 for no limit}
\item{stack_limit}{integer; maximal size, in bytes, of the heap storage available
for the match backtracking stack; setting a limit is desirable if poorly
written regexes are expected on input; 0 for no limit}
}
\value{
Returns a named list object; missing settings are left with default values.
}
\description{
A convenience function to tune the \pkg{ICU} regular expressions
matcher's behavior, e.g., in \code{\link{stri_count_regex}}
and other \link{stringi-search-regex} functions.
}
\details{
Note that some regex settings may be changed using ICU regex flags
inside regexes. For example, \code{'(?i)pattern'} performs
a case-insensitive match of a given pattern,
see the \pkg{ICU} User Guide entry on Regular Expressions
in the References section or \link{stringi-search-regex}.
}
\examples{
stri_detect_regex('ala', 'ALA') # case-sensitive by default
stri_detect_regex('ala', 'ALA', opts_regex=stri_opts_regex(case_insensitive=TRUE))
stri_detect_regex('ala', 'ALA', case_insensitive=TRUE) # equivalent
stri_detect_regex('ala', '(?i)ALA') # equivalent
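# a short sketch of the `literal` flag (metacharacters lose their special meaning):
stri_detect_regex('spam...', 'spam...', literal=TRUE)
stri_detect_regex('spamABC', 'spam...', literal=TRUE)  # no longer a match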
}
\references{
\emph{\code{enum URegexpFlag}: Constants for Regular Expression Match Modes}
-- ICU4C API Documentation,
\url{https://unicode-org.github.io/icu-docs/apidoc/dev/icu4c/uregex_8h.html}
\emph{Regular Expressions} -- ICU User Guide,
\url{https://unicode-org.github.io/icu/userguide/strings/regexp.html}
}
\seealso{
The official online manual of \pkg{stringi} at \url{https://stringi.gagolewski.com/}
Gagolewski M., \pkg{stringi}: Fast and portable character string processing in R, \emph{Journal of Statistical Software} 103(2), 2022, 1-59, \doi{10.18637/jss.v103.i02}
Other search_regex:
\code{\link{about_search_regex}},
\code{\link{about_search}}
}
\concept{search_regex}
\author{
\href{https://www.gagolewski.com/}{Marek Gagolewski} and other contributors
}
stringi/man/stri_isempty.Rd 0000644 0001762 0000144 00000002243 14262507664 015543 0 ustar ligges users % Generated by roxygen2: do not edit by hand
% Please edit documentation in R/length.R
\name{stri_isempty}
\alias{stri_isempty}
\title{Determine if a String is of Length Zero}
\usage{
stri_isempty(str)
}
\arguments{
\item{str}{character vector or an object coercible to one}
}
\value{
Returns a logical vector of the same length as \code{str}.
}
\description{
This is the fastest way to find out
whether the elements of a character vector are empty strings.
}
\details{
Missing values are handled properly.
}
\examples{
stri_isempty(letters[1:3])
stri_isempty(c(',', '', 'abc', '123', '\u0105\u0104'))
stri_isempty(character(1))
}
\seealso{
The official online manual of \pkg{stringi} at \url{https://stringi.gagolewski.com/}
Gagolewski M., \pkg{stringi}: Fast and portable character string processing in R, \emph{Journal of Statistical Software} 103(2), 2022, 1-59, \doi{10.18637/jss.v103.i02}
Other length:
\code{\link{\%s$\%}()},
\code{\link{stri_length}()},
\code{\link{stri_numbytes}()},
\code{\link{stri_pad_both}()},
\code{\link{stri_sprintf}()},
\code{\link{stri_width}()}
}
\concept{length}
\author{
\href{https://www.gagolewski.com/}{Marek Gagolewski} and other contributors
}
stringi/man/stri_locale_info.Rd 0000644 0001762 0000144 00000003335 14262507664 016326 0 ustar ligges users % Generated by roxygen2: do not edit by hand
% Please edit documentation in R/locale_management.R
\name{stri_locale_info}
\alias{stri_locale_info}
\title{Query Given Locale}
\usage{
stri_locale_info(locale = NULL)
}
\arguments{
\item{locale}{\code{NULL} or \code{''} for default locale,
or a single string with locale identifier.}
}
\value{
Returns a list with the following named character strings:
\code{Language}, \code{Country}, \code{Variant}, and
\code{Name}, the last being their underscore-separated combination.
}
\description{
Provides some basic information on a given locale identifier.
}
\details{
With this function you may obtain some basic information
on any provided locale identifier,
even if it is unsupported by \pkg{ICU} or if you pass a malformed locale
identifier (e.g., one that is not of the form Language_Country).
See \link{stringi-locale} for discussion.
This function does not do anything really complicated. In many
cases it is similar to a call to
\code{\link{as.list}(\link{stri_split_fixed}(locale, '_', 3L)[[1]])},
with \code{locale} case mapped.
It may be used, however, to get insight on how ICU understands a given
locale identifier.
}
\examples{
stri_locale_info('pl_PL')
stri_locale_info('Pl_pL') # the same result
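# also works for identifiers unknown to ICU (see the Details):
stri_locale_info('xx_XX')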
}
\seealso{
The official online manual of \pkg{stringi} at \url{https://stringi.gagolewski.com/}
Gagolewski M., \pkg{stringi}: Fast and portable character string processing in R, \emph{Journal of Statistical Software} 103(2), 2022, 1-59, \doi{10.18637/jss.v103.i02}
Other locale_management:
\code{\link{about_locale}},
\code{\link{stri_locale_list}()},
\code{\link{stri_locale_set}()}
}
\concept{locale_management}
\author{
\href{https://www.gagolewski.com/}{Marek Gagolewski} and other contributors
}
stringi/man/stri_sub.Rd 0000644 0001762 0000144 00000012224 14262507664 014642 0 ustar ligges users % Generated by roxygen2: do not edit by hand
% Please edit documentation in R/sub.R
\name{stri_sub}
\alias{stri_sub}
\alias{stri_sub<-}
\alias{stri_sub_replace}
\title{Extract a Substring From or Replace a Substring In a Character Vector}
\usage{
stri_sub(
str,
from = 1L,
to = -1L,
length,
use_matrix = TRUE,
ignore_negative_length = FALSE
)
stri_sub(str, from = 1L, to = -1L, length, omit_na = FALSE, use_matrix = TRUE) <- value
stri_sub_replace(..., replacement, value = replacement)
}
\arguments{
\item{str}{character vector}
\item{from}{integer vector giving the start indexes; alternatively,
if \code{use_matrix=TRUE},
a two-column matrix of type \code{cbind(from, to)}
(unnamed columns or the 2nd column named other than \code{length})
or \code{cbind(from, length=length)} (2nd column named \code{length})}
\item{to}{integer vector giving the end indexes; mutually exclusive with
\code{length} and \code{from} being a matrix}
\item{length}{integer vector giving the substring lengths;
mutually exclusive with \code{to} and \code{from} being a matrix}
\item{use_matrix}{single logical value; see \code{from}}
\item{ignore_negative_length}{single logical value; whether
negative lengths should be ignored or result in missing values}
\item{omit_na}{single logical value; indicates whether missing values
in any of the indexes or in \code{value} leave the corresponding input string
unchanged [replacement function only]}
\item{value}{a character vector defining the replacement strings
[replacement function only]}
\item{...}{arguments to be passed to \code{stri_sub<-}}
\item{replacement}{alias of \code{value} [wherever applicable]}
}
\value{
\code{stri_sub} and \code{stri_sub_replace} return a character vector.
\code{stri_sub<-} changes the \code{str} object 'in-place'.
}
\description{
\code{stri_sub} extracts particular substrings at code point-based
index ranges provided. Its replacement version allows substituting
(in place) parts of
a string with given replacement strings. \code{stri_sub_replace}
is its forward pipe operator-friendly variant that returns
a copy of the input vector.
For extracting/replacing multiple substrings from/within each string, see
\code{\link{stri_sub_all}}.
}
\details{
Vectorized over \code{str}, [\code{value}], \code{from} and
(\code{to} or \code{length}). Parameters
\code{to} and \code{length} are mutually exclusive.
Indexes are 1-based, i.e., the start of a string is at index 1.
For negative indexes in \code{from} or \code{to},
counting starts at the end of the string.
For instance, index -1 denotes the last code point in the string.
Non-positive \code{length} gives an empty string.
Argument \code{from} gives the start of a substring to extract.
Argument \code{to} defines the last index of a substring, inclusive.
Alternatively, its \code{length} may be provided.
If \code{from} is a two-column matrix, then these two columns are
used as \code{from} and \code{to}, respectively,
unless the second column is named \code{length}.
In such a case anything passed
explicitly as \code{to} or \code{length} is ignored.
Such types of index matrices are generated by \code{\link{stri_locate_first}}
and \code{\link{stri_locate_last}}. If extraction based on
\code{\link{stri_locate_all}} is needed, see
\code{\link{stri_sub_all}}.
In \code{stri_sub}, out-of-bound indexes are silently
corrected. If \code{from} > \code{to}, then an empty string is returned.
By default, negative \code{length} results in the corresponding output element being
\code{NA} (but see \code{ignore_negative_length}).
In \code{stri_sub<-}, some configurations of indexes may work as
substring 'injection' at the front, back, or in middle.
Negative \code{length} does not alter the corresponding input string.
If both \code{to} and \code{length} are provided,
\code{length} has priority over \code{to}.
Note that for some Unicode strings, the extracted substrings might not
be well-formed, especially if input strings are not normalized
(see \code{\link{stri_trans_nfc}}),
include byte order marks, Bidirectional text marks, and so on.
Handle with care.
}
\examples{
s <- c("spam, spam, bacon, and spam", "eggs and spam")
stri_sub(s, from=-4)
stri_sub(s, from=1, length=c(10, 4))
(stri_sub(s, 1, 4) <- 'stringi')
x <- c('12 3456 789', 'abc', '', NA, '667')
stri_sub(x, stri_locate_first_regex(x, '[0-9]+')) # see stri_extract_first
stri_sub(x, stri_locate_last_regex(x, '[0-9]+')) # see stri_extract_last
stri_sub_replace(x, stri_locate_first_regex(x, '[0-9]+'),
omit_na=TRUE, replacement='***') # see stri_replace_first
stri_sub_replace(x, stri_locate_last_regex(x, '[0-9]+'),
omit_na=TRUE, replacement='***') # see stri_replace_last
\dontrun{x |> stri_sub_replace(1, 5, replacement='new_substring')}
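# a brief sketch of the two-column `from` matrix interface described in the
# Details (second column named `length`):
stri_sub(s, from=cbind(1, length=4))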
}
\seealso{
The official online manual of \pkg{stringi} at \url{https://stringi.gagolewski.com/}
Gagolewski M., \pkg{stringi}: Fast and portable character string processing in R, \emph{Journal of Statistical Software} 103(2), 2022, 1-59, \doi{10.18637/jss.v103.i02}
Other indexing:
\code{\link{stri_locate_all_boundaries}()},
\code{\link{stri_locate_all}()},
\code{\link{stri_sub_all}()}
}
\concept{indexing}
\author{
\href{https://www.gagolewski.com/}{Marek Gagolewski} and other contributors
}
stringi/man/stri_dup.Rd 0000644 0001762 0000144 00000002506 14262507664 014643 0 ustar ligges users % Generated by roxygen2: do not edit by hand
% Please edit documentation in R/join.R
\name{stri_dup}
\alias{stri_dup}
\alias{operator_multiply}
\alias{oper_multiply}
\alias{\%s*\%}
\alias{\%stri*\%}
\title{Duplicate Strings}
\usage{
stri_dup(str, times)
e1 \%s*\% e2
e1 \%stri*\% e2
}
\arguments{
\item{str, e1}{a character vector of strings to be duplicated}
\item{times, e2}{an integer vector with the numbers of times to duplicate each string}
}
\value{
Returns a character vector.
}
\description{
Duplicates each \code{str} (\code{e1}) string \code{times} (\code{e2}) times
and concatenates the results.
}
\details{
Vectorized over all arguments.
\code{e1 \%s*\% e2} and \code{e1 \%stri*\% e2} are synonyms
for \code{stri_dup(e1, e2)}
}
\examples{
stri_dup('a', 1:5)
stri_dup(c('a', NA, 'ba'), 4)
stri_dup(c('abc', 'pqrst'), c(4, 2))
"a" \%s*\% 5
}
\seealso{
The official online manual of \pkg{stringi} at \url{https://stringi.gagolewski.com/}
Gagolewski M., \pkg{stringi}: Fast and portable character string processing in R, \emph{Journal of Statistical Software} 103(2), 2022, 1-59, \doi{10.18637/jss.v103.i02}
Other join:
\code{\link{\%s+\%}()},
\code{\link{stri_flatten}()},
\code{\link{stri_join_list}()},
\code{\link{stri_join}()}
}
\concept{join}
\author{
\href{https://www.gagolewski.com/}{Marek Gagolewski} and other contributors
}
stringi/man/stri_trim.Rd 0000644 0001762 0000144 00000005417 14262507664 015032 0 ustar ligges users % Generated by roxygen2: do not edit by hand
% Please edit documentation in R/trim.R
\name{stri_trim_both}
\alias{stri_trim_both}
\alias{stri_trim}
\alias{stri_trim_left}
\alias{stri_trim_right}
\title{Trim Characters from the Left and/or Right Side of a String}
\usage{
stri_trim_both(str, pattern = "\\\\P{Wspace}", negate = FALSE)
stri_trim_left(str, pattern = "\\\\P{Wspace}", negate = FALSE)
stri_trim_right(str, pattern = "\\\\P{Wspace}", negate = FALSE)
stri_trim(
str,
side = c("both", "left", "right"),
pattern = "\\\\P{Wspace}",
negate = FALSE
)
}
\arguments{
\item{str}{a character vector of strings to be trimmed}
\item{pattern}{a single pattern, specifying the class of characters
(see \link{stringi-search-charclass})
to be preserved (if \code{negate} is \code{FALSE}; default)
or trimmed (otherwise)}
\item{negate}{either \code{TRUE} or \code{FALSE}; see \code{pattern}}
\item{side}{character [\code{stri_trim} only]; defaults to \code{'both'}}
}
\value{
All functions return a character vector.
}
\description{
These functions may be used, e.g., to remove unnecessary
white-spaces from strings. Trimming ends at the first or
starts at the last \code{pattern} match.
}
\details{
Vectorized over \code{str} and \code{pattern}.
\code{stri_trim} is a convenience wrapper over \code{stri_trim_left}
and \code{stri_trim_right}.
Contrary to many other string processing libraries,
our trimming functions are universal. The class of characters
to be retained or trimmed can be adjusted.
For replacing pattern matches with
an arbitrary replacement string, see \code{\link{stri_replace}}.
Trimming can also be used where you would normally rely on
regular expressions. For instance, you may get
\code{'23.5'} out of \code{'total of 23.5 bitcoins'}.
For trimming white-spaces, please note the difference
between Unicode binary property `\code{\\p\{Wspace\}}` (more universal)
and general character category `\code{\\p\{Z\}}`,
see \link{stringi-search-charclass}.
}
\examples{
stri_trim_left(' aaa')
stri_trim_right('r-project.org/', '\\\\P{P}')
stri_trim_both(' Total of 23.5 bitcoins. ', '\\\\p{N}')
stri_trim_both(' Total of 23.5 bitcoins. ', '\\\\P{N}', negate=TRUE)
}
\seealso{
The official online manual of \pkg{stringi} at \url{https://stringi.gagolewski.com/}
Gagolewski M., \pkg{stringi}: Fast and portable character string processing in R, \emph{Journal of Statistical Software} 103(2), 2022, 1-59, \doi{10.18637/jss.v103.i02}
Other search_replace:
\code{\link{about_search}},
\code{\link{stri_replace_all}()},
\code{\link{stri_replace_rstr}()}
Other search_charclass:
\code{\link{about_search_charclass}},
\code{\link{about_search}}
}
\concept{search_charclass}
\concept{search_replace}
\author{
\href{https://www.gagolewski.com/}{Marek Gagolewski} and other contributors
}
stringi/man/stri_encode.Rd 0000644 0001762 0000144 00000007653 14262507664 015320 0 ustar ligges users % Generated by roxygen2: do not edit by hand
% Please edit documentation in R/encoding_conversion.R
\name{stri_encode}
\alias{stri_encode}
\alias{stri_conv}
\title{Convert Strings Between Given Encodings}
\usage{
stri_encode(str, from = NULL, to = NULL, to_raw = FALSE)
stri_conv(str, from = NULL, to = NULL, to_raw = FALSE)
}
\arguments{
\item{str}{a character vector, a raw vector, or
a list of \code{raw} vectors to be converted}
\item{from}{input encoding:
\code{NULL} or \code{''} for the default encoding
or internal encoding marks' usage (see Details);
otherwise, a single string with encoding name,
see \code{\link{stri_enc_list}}}
\item{to}{target encoding:
\code{NULL} or \code{''} for default encoding
(see \code{\link{stri_enc_get}}),
or a single string with encoding name}
\item{to_raw}{a single logical value; indicates whether a list of raw vectors
rather than a character vector should be returned}
}
\value{
If \code{to_raw} is \code{FALSE},
then a character vector with encoded strings (and appropriate
encoding marks) is returned.
Otherwise, a list of vectors of type raw is produced.
}
\description{
These functions convert strings between encodings.
They aim to serve as a more portable and faster replacement
for \R's own \code{\link{iconv}}.
}
\details{
\code{stri_conv} is an alias for \code{stri_encode}.
Refer to \code{\link{stri_enc_list}} for the list
of supported encodings and \link{stringi-encoding}
for a general discussion.
If \code{from} is either missing, \code{''}, or \code{NULL},
and if \code{str} is a character vector
then the marked encodings are used
(see \code{\link{stri_enc_mark}}) -- in such a case \code{bytes}-declared
strings are disallowed.
Otherwise, i.e., if \code{str} is a \code{raw}-type vector
or a list of raw vectors,
we assume that the input encoding is the current default encoding
as given by \code{\link{stri_enc_get}}.
However, if \code{from} is given explicitly,
the internal encoding declarations are always ignored.
For \code{to_raw=FALSE}, the output
strings always have the encodings marked according to the target converter
used (as specified by \code{to}) and the current default Encoding
(\code{ASCII}, \code{latin1}, \code{UTF-8}, \code{native},
or \code{bytes} in all other cases).
Note that some issues might occur if \code{to} indicates, e.g.,
UTF-16 or UTF-32, as the output strings may have embedded NULs.
In such cases, please use \code{to_raw=TRUE} and consider
specifying a byte order marker (BOM) for portability reasons
(e.g., set \code{UTF-16} or \code{UTF-32} which automatically
adds the BOMs).
Note that \code{stri_encode(as.raw(data), 'encodingname')}
is a clever substitute for \code{\link{rawToChar}}.
In the current version of \pkg{stringi}, if an incorrect code point is found
on input, it is replaced with the default (for that target encoding)
'missing/erroneous' character (with a warning), e.g.,
the SUBSTITUTE character (U+001A) or the REPLACEMENT one (U+FFFD).
Occurrences thereof can be located in the output string to diagnose
the problematic sequences, e.g., by calling:
\code{stri_locate_all_regex(converted_string, '[\\ufffd\\u001a]')}.
Because of the way this function is currently implemented,
the maximal size of a single string to be converted cannot exceed ~0.67 GB.
}
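\examples{
# a minimal sketch of the conversions discussed above:
stri_encode(charToRaw('spam'), 'ASCII')        # cf. rawToChar
stri_encode('spam', to='UTF-16', to_raw=TRUE)  # a list with one raw vector (with a BOM)
}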
\references{
\emph{Conversion} -- ICU User Guide,
\url{https://unicode-org.github.io/icu/userguide/conversion/}
}
\seealso{
The official online manual of \pkg{stringi} at \url{https://stringi.gagolewski.com/}
Gagolewski M., \pkg{stringi}: Fast and portable character string processing in R, \emph{Journal of Statistical Software} 103(2), 2022, 1-59, \doi{10.18637/jss.v103.i02}
Other encoding_conversion:
\code{\link{about_encoding}},
\code{\link{stri_enc_fromutf32}()},
\code{\link{stri_enc_toascii}()},
\code{\link{stri_enc_tonative}()},
\code{\link{stri_enc_toutf32}()},
\code{\link{stri_enc_toutf8}()}
}
\concept{encoding_conversion}
\author{
\href{https://www.gagolewski.com/}{Marek Gagolewski} and other contributors
}
stringi/man/stri_split_boundaries.Rd 0000644 0001762 0000144 00000010641 14262507664 017420 0 ustar ligges users % Generated by roxygen2: do not edit by hand
% Please edit documentation in R/search_split_bound.R
\name{stri_split_boundaries}
\alias{stri_split_boundaries}
\title{Split a String at Text Boundaries}
\usage{
stri_split_boundaries(
str,
n = -1L,
tokens_only = FALSE,
simplify = FALSE,
...,
opts_brkiter = NULL
)
}
\arguments{
\item{str}{character vector or an object coercible to one}
\item{n}{integer vector, maximal number of strings to return}
\item{tokens_only}{single logical value; may affect the result if \code{n}
is positive, see Details}
\item{simplify}{single logical value; if \code{TRUE} or \code{NA},
then a character matrix is returned; otherwise (the default), a list of
character vectors is given, see Value}
\item{...}{additional settings for \code{opts_brkiter}}
\item{opts_brkiter}{a named list with \pkg{ICU} BreakIterator's settings,
see \code{\link{stri_opts_brkiter}}; \code{NULL} for the
default break iterator, i.e., \code{line_break}}
}
\value{
If \code{simplify=FALSE} (the default),
then the functions return a list of character vectors.
Otherwise, \code{\link{stri_list2matrix}} with \code{byrow=TRUE}
and \code{n_min=n} arguments is called on the resulting object.
In such a case, a character matrix with \code{length(str)} rows
is returned. Note that \code{\link{stri_list2matrix}}'s \code{fill}
argument is set to an empty string and \code{NA},
for \code{simplify} equal to \code{TRUE} and \code{NA}, respectively.
}
\description{
This function locates text boundaries
(like character, word, line, or sentence boundaries)
and splits strings at the indicated positions.
}
\details{
Vectorized over \code{str} and \code{n}.
If \code{n} is negative (the default), then all text pieces are extracted.
Otherwise, if \code{tokens_only} is \code{FALSE} (which is the default),
then \code{n-1} tokens are extracted (if possible) and the \code{n}-th string
gives the (non-split) remainder (see Examples).
On the other hand, if \code{tokens_only} is \code{TRUE},
then only full tokens (up to \code{n} pieces) are extracted.
For more information on text boundary analysis
performed by \pkg{ICU}'s \code{BreakIterator}, see
\link{stringi-search-boundaries}.
}
\examples{
test <- 'The\u00a0above-mentioned features are very useful. ' \%s+\%
'Spam, spam, eggs, bacon, and spam. 123 456 789'
stri_split_boundaries(test, type='line')
stri_split_boundaries(test, type='word')
stri_split_boundaries(test, type='word', skip_word_none=TRUE)
stri_split_boundaries(test, type='word', skip_word_none=TRUE, skip_word_letter=TRUE)
stri_split_boundaries(test, type='word', skip_word_none=TRUE, skip_word_number=TRUE)
stri_split_boundaries(test, type='sentence')
stri_split_boundaries(test, type='sentence', skip_sentence_sep=TRUE)
stri_split_boundaries(test, type='character')
# a filtered break iterator with the new ICU:
stri_split_boundaries('Mr. Jones and Mrs. Brown are very happy.
So am I, Prof. Smith.', type='sentence', locale='en_US@ss=standard') # ICU >= 56 only
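# the `n` and `tokens_only` arguments described in the Details (a brief sketch):
stri_split_boundaries('One. Two. Three.', type='sentence', n=2)
stri_split_boundaries('One. Two. Three.', type='sentence', n=2, tokens_only=TRUE)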
}
\seealso{
The official online manual of \pkg{stringi} at \url{https://stringi.gagolewski.com/}
Gagolewski M., \pkg{stringi}: Fast and portable character string processing in R, \emph{Journal of Statistical Software} 103(2), 2022, 1-59, \doi{10.18637/jss.v103.i02}
Other search_split:
\code{\link{about_search}},
\code{\link{stri_split_lines}()},
\code{\link{stri_split}()}
Other locale_sensitive:
\code{\link{\%s<\%}()},
\code{\link{about_locale}},
\code{\link{about_search_boundaries}},
\code{\link{about_search_coll}},
\code{\link{stri_compare}()},
\code{\link{stri_count_boundaries}()},
\code{\link{stri_duplicated}()},
\code{\link{stri_enc_detect2}()},
\code{\link{stri_extract_all_boundaries}()},
\code{\link{stri_locate_all_boundaries}()},
\code{\link{stri_opts_collator}()},
\code{\link{stri_order}()},
\code{\link{stri_rank}()},
\code{\link{stri_sort_key}()},
\code{\link{stri_sort}()},
\code{\link{stri_trans_tolower}()},
\code{\link{stri_unique}()},
\code{\link{stri_wrap}()}
Other text_boundaries:
\code{\link{about_search_boundaries}},
\code{\link{about_search}},
\code{\link{stri_count_boundaries}()},
\code{\link{stri_extract_all_boundaries}()},
\code{\link{stri_locate_all_boundaries}()},
\code{\link{stri_opts_brkiter}()},
\code{\link{stri_split_lines}()},
\code{\link{stri_trans_tolower}()},
\code{\link{stri_wrap}()}
}
\concept{locale_sensitive}
\concept{search_split}
\concept{text_boundaries}
\author{
\href{https://www.gagolewski.com/}{Marek Gagolewski} and other contributors
}
stringi/man/stri_enc_tonative.Rd 0000644 0001762 0000144 00000002606 14262507664 016532 0 ustar ligges users % Generated by roxygen2: do not edit by hand
% Please edit documentation in R/encoding_conversion.R
\name{stri_enc_tonative}
\alias{stri_enc_tonative}
\title{Convert Strings To Native Encoding}
\usage{
stri_enc_tonative(str)
}
\arguments{
\item{str}{a character vector to be converted}
}
\value{
Returns a character vector.
}
\description{
Converts character strings with declared encodings
to the current native encoding.
}
\details{
This function just calls \code{\link{stri_encode}(str, NULL, NULL)}.
The current native encoding can be read with \code{\link{stri_enc_get}}.
Character strings declared to be in \code{bytes} encoding will fail here.
Note that if working in a UTF-8 environment,
resulting strings will be marked with \code{UTF-8}
and not \code{native}, see \code{\link{stri_enc_mark}}.
}
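\examples{
# per the Details, this is equivalent to stri_encode(str, NULL, NULL);
# the resulting declared encodings depend on the platform's native encoding:
stri_enc_mark(stri_enc_tonative(c('spam', '\u0105')))
}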
\seealso{
The official online manual of \pkg{stringi} at \url{https://stringi.gagolewski.com/}
Gagolewski M., \pkg{stringi}: Fast and portable character string processing in R, \emph{Journal of Statistical Software} 103(2), 2022, 1-59, \doi{10.18637/jss.v103.i02}
Other encoding_conversion:
\code{\link{about_encoding}},
\code{\link{stri_enc_fromutf32}()},
\code{\link{stri_enc_toascii}()},
\code{\link{stri_enc_toutf32}()},
\code{\link{stri_enc_toutf8}()},
\code{\link{stri_encode}()}
}
\concept{encoding_conversion}
\author{
\href{https://www.gagolewski.com/}{Marek Gagolewski} and other contributors
}
stringi/man/stri_replace_rstr.Rd 0000644 0001762 0000144 00000001766 14262507664 016547 0 ustar ligges users % Generated by roxygen2: do not edit by hand
% Please edit documentation in R/search_replace_4.R
\name{stri_replace_rstr}
\alias{stri_replace_rstr}
\title{Convert gsub-Style Replacement Strings}
\usage{
stri_replace_rstr(x)
}
\arguments{
\item{x}{character vector}
}
\value{
Returns a character vector.
}
\description{
Converts \code{\link[base]{gsub}}-style replacement strings
to those which can be used in \code{\link{stri_replace}}.
In particular, \code{$} becomes \code{\\$} and \code{\\1} becomes \code{$1}.
}
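\examples{
# a minimal sketch of the conversion described above
# (gsub-style back-references and dollar signs):
stri_replace_rstr('\\\\1 costs $100')
stri_replace_all_regex('breakfast=eggs', '(\\\\w+)=(\\\\w+)',
    stri_replace_rstr('\\\\2 for \\\\1'))
}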
\seealso{
The official online manual of \pkg{stringi} at \url{https://stringi.gagolewski.com/}
Gagolewski M., \pkg{stringi}: Fast and portable character string processing in R, \emph{Journal of Statistical Software} 103(2), 2022, 1-59, \doi{10.18637/jss.v103.i02}
Other search_replace:
\code{\link{about_search}},
\code{\link{stri_replace_all}()},
\code{\link{stri_trim_both}()}
}
\concept{search_replace}
\author{
\href{https://www.gagolewski.com/}{Marek Gagolewski} and other contributors
}
stringi/man/about_search_regex.Rd 0000644 0001762 0000144 00000030067 14350705363 016640 0 ustar ligges users % Generated by roxygen2: do not edit by hand
% Please edit documentation in R/search.R
\name{about_search_regex}
\alias{about_search_regex}
\alias{search_regex}
\alias{stringi-search-regex}
\title{Regular Expressions in \pkg{stringi}}
\description{
A regular expression is a pattern describing, possibly in a very
abstract way, a text fragment.
With so many regex functions in \pkg{stringi},
regular expressions are a very powerful tool
for performing tasks such as string searching, substring extraction,
and string splitting.
}
\details{
All \code{stri_*_regex} functions in \pkg{stringi} use
the \pkg{ICU} regex engine. Its settings may be tuned up (for example
to perform case-insensitive search) via the
\code{\link{stri_opts_regex}} function.
Regular expression patterns in \pkg{ICU} are quite similar in form and
behavior to Perl's regexes. Their implementation is loosely inspired
by JDK 1.4 \code{java.util.regex}.
\pkg{ICU} Regular Expressions conform to the Unicode Technical Standard #18
(see References section) and its features are summarized in
the ICU User Guide (see below). A good general introduction
to regexes is (Friedl, 2002).
Some general topics are also covered in the \R manual, see \link{regex}.
}
\section{\pkg{ICU} Regex Operators at a Glance}{
Here is a list of operators provided by the
ICU User Guide on regexes.
\describe{
\item{\code{|}}{Alternation. \code{A|B} matches either A or B.}
\item{\code{*}}{Match 0 or more times. Match as many times as possible.}
\item{\code{+}}{Match 1 or more times. Match as many times as possible.}
\item{\code{?}}{Match zero or one times. Prefer one.}
\item{\code{{n}} }{Match exactly n times.}
\item{\code{{n,}} }{Match at least n times. Match as many times as possible.}
\item{\code{{n,m}} }{Match between n and m times.
Match as many times as possible, but not more than m.}
\item{\code{*?}}{Match 0 or more times. Match as few times as possible.}
\item{\code{+?}}{Match 1 or more times. Match as few times as possible.}
\item{\code{??}}{Match zero or one times. Prefer zero.}
\item{\code{{n}?}}{Match exactly n times.}
\item{\code{{n,}?}}{Match at least n times, but no more than required
for an overall pattern match.}
\item{\code{{n,m}?}}{Match between n and m times. Match as few times
as possible, but not less than n.}
\item{\code{*+}}{Match 0 or more times. Match as many times as possible
when first encountered, do not retry with fewer even if overall match fails
(Possessive Match).}
\item{\code{++}}{Match 1 or more times. Possessive match.}
\item{\code{?+}}{Match zero or one times. Possessive match.}
\item{\code{{n}+}}{Match exactly n times.}
\item{\code{{n,}+}}{Match at least n times. Possessive Match.}
\item{\code{{n,m}+}}{Match between n and m times. Possessive Match.}
\item{\code{(...)}}{Capturing parentheses. Range of input that matched
the parenthesized sub-expression is available after the match,
see \code{\link{stri_match}}.}
\item{\code{(?:...)}}{Non-capturing parentheses. Groups the included pattern,
but does not provide capturing of matching text. Somewhat more efficient
than capturing parentheses.}
\item{\code{(?>...)}}{Atomic-match parentheses. The first match of the
parenthesized sub-expression is the only one tried; if it does not lead to
an overall pattern match, back up the search for a match to a position
before the \code{(?>}.}
\item{\code{(?#...)}}{Free-format comment \code{(?# comment )}.}
\item{\code{(?=...)}}{Look-ahead assertion. True if the parenthesized
pattern matches at the current input position, but does not advance
the input position.}
\item{\code{(?!...)}}{Negative look-ahead assertion. True if the
parenthesized pattern does not match at the current input position.
Does not advance the input position.}
\item{\code{(?<=...)}}{Look-behind assertion. True if the parenthesized
pattern matches text preceding the current input position, with the last
character of the match being the input character just before the current
position. Does not alter the input position. The length of possible strings
matched by the look-behind pattern must not be unbounded (no \code{*}
or \code{+} operators.)}
\item{\code{(?<name>...)}}{Named capture group, where \code{name}
(enclosed within the angle brackets)
is a sequence like \code{[A-Za-z][A-Za-z0-9]*}}
\item{\code{(?ismwx-ismwx:...)}}{Flag settings. Evaluate the parenthesized
expression with the specified flags enabled or \code{-}disabled,
see also \code{\link{stri_opts_regex}}.}
\item{\code{(?ismwx-ismwx)}}{Flag settings. Change the flag settings.
Changes apply to the portion of the pattern following the setting.
For example, \code{(?i)} changes to a case insensitive match,
see also \code{\link{stri_opts_regex}}.}
}
}
\section{\pkg{ICU} Regex Meta-characters at a Glance}{
Here is a list of meta-characters provided by the
ICU User Guide on regexes.
\describe{
\item{\code{\\a}}{Match a BELL, \code{\\u0007}.}
\item{\code{\\A}}{Match at the beginning of the input. Differs from \code{^}.
in that \code{\\A} will not match after a new line within the input.}
\item{\code{\\b}}{Match if the current position is a word boundary.
Boundaries occur at the transitions between word (\code{\\w}) and non-word
(\code{\\W}) characters, with combining marks ignored. For better word
boundaries, see \pkg{ICU} Boundary Analysis, e.g., \code{\link{stri_extract_all_words}}.}
\item{\code{\\B}}{Match if the current position is not a word boundary.}
\item{\code{\\cX}}{Match a control-\code{X} character.}
\item{\code{\\d}}{Match any character with the Unicode General Category of
\code{Nd} (Number, Decimal Digit.).}
\item{\code{\\D}}{Match any character that is not a decimal digit.}
\item{\code{\\e}}{Match an ESCAPE, \code{\\u001B}.}
\item{\code{\\E}}{Terminates a \code{\\Q} ... \code{\\E} quoted sequence.}
\item{\code{\\f}}{Match a FORM FEED, \code{\\u000C}.}
\item{\code{\\G}}{Match if the current position is at the end of the
previous match.}
\item{\code{\\h}}{Match a Horizontal White Space character.
They are characters with Unicode General Category of Space_Separator plus
the ASCII tab, \code{\\u0009}. [Since ICU 55]}
\item{\code{\\H}}{Match a non-Horizontal White Space character.
[Since ICU 55]}
\item{\code{\\k<name>}}{Named Capture Back Reference. [Since ICU 55]}
\item{\code{\\n}}{Match a LINE FEED, \code{\\u000A}.}
\item{\code{\\N{UNICODE CHARACTER NAME}} }{Match the named character.}
\item{\code{\\p{UNICODE PROPERTY NAME}} }{Match any character with the
specified Unicode Property.}
\item{\code{\\P{UNICODE PROPERTY NAME}} }{Match any character not having
the specified Unicode Property.}
\item{\code{\\Q}}{Quotes all following characters until \code{\\E}.}
\item{\code{\\r}}{Match a CARRIAGE RETURN, \code{\\u000D}.}
\item{\code{\\s}}{Match a white space character. White space is defined
as \code{[\\t\\n\\f\\r\\p{Z}]}.}
\item{\code{\\S}}{Match a non-white space character.}
\item{\code{\\t}}{Match a HORIZONTAL TABULATION, \code{\\u0009}.}
\item{\code{\\uhhhh}}{Match the character with the hex value \code{hhhh}.}
\item{\code{\\Uhhhhhhhh}}{Match the character with the hex value \code{hhhhhhhh}.
Exactly eight hex digits must be provided, even though the largest
Unicode code point is \code{\\U0010ffff}.}
\item{\code{\\w}}{Match a word character. Word characters are
\code{[\\p{Alphabetic}\\p{Mark}\\p{Decimal_Number}\\p{Connector_Punctuation}\\u200c\\u200d]}.}
\item{\code{\\W}}{Match a non-word character.}
\item{\code{\\x{hhhh}} }{Match the character with hex value hhhh.
From one to six hex digits may be supplied.}
\item{\code{\\xhh}}{Match the character with two digit hex value hh }
\item{\code{\\X}}{Match a Grapheme Cluster.}
\item{\code{\\Z}}{Match if the current position is at the end of input,
but before the final line terminator, if one exists.}
\item{\code{\\z}}{Match if the current position is at the end of input.}
\item{\code{\\n}}{Back Reference. Match whatever the nth capturing
group matched. n must be a number > 1 and < total number of capture
groups in the pattern.}
\item{\code{\\0ooo}}{Match an Octal character. \code{'ooo'} is from one to three
octal digits. 0377 is the largest allowed Octal character. The leading
zero is required; it distinguishes Octal constants from back references.}
\item{\code{[pattern]}}{Match any one character from the set.}
\item{\code{.}}{Match any character except for - by default - newline, compare \code{\link{stri_opts_regex}}.}
\item{\code{^}}{Match at the beginning of a line.}
\item{\code{$}}{Match at the end of a line.}
\item{\code{\\}}{[outside of sets] Quotes the following character.
Characters that must be quoted to be treated as literals are
\code{* ? + [ ( ) { } ^ $ | \\ .}.}
\item{\code{\\}}{[inside sets] Quotes the following character.
Characters that must be quoted to be treated as literals are
\code{[ ] \\}; Characters that may need to be quoted, depending
on the context are \code{- &}.}
}
}
\section{Character Classes}{
The syntax is similar, but not 100\% compatible with the one
described in \link{about_search_charclass}. In particular,
whitespaces are not ignored and set-theoretic operations are
denoted slightly differently. However, other than this
\link{about_search_charclass} is a good reference
on the capabilities offered.
The ICU User Guide on regexes lists what follows.
\describe{
\item{\code{[abc]}}{Match any of the characters a, b, or c}
\item{\code{[^abc]}}{Negation -- match any character except a, b, or c}
\item{\code{[A-M]}}{Range -- match any character from A to M (based on Unicode code point ordering)}
\item{\code{[\\p{L}]}, \code{[\\p{Letter}]}, \code{[\\p{General_Category=Letter}]}, \code{[:letter:]}}{Characters with Unicode Category = Letter (4 equivalent forms)}
\item{\code{[\\P{Letter}]}}{Negated property -- match everything except Letters}
\item{\code{[\\p{numeric_value=9}]}}{Match all numbers with a numeric value of 9}
\item{\code{[\\p{Letter}&&\\p{script=cyrillic}]}}{Intersection; match the set of all Cyrillic letters}
\item{\code{[\\p{Letter}--\\p{script=latin}]}}{Set difference; match all non-Latin letters}
\item{\code{[[a-z][A-Z][0-9]]}, \code{[a-zA-Z0-9]}}{Union; match ASCII letters and digits (2 equivalent forms)}
}
}
\section{Regex Functions in \pkg{stringi}}{
Note that if a given regex \code{pattern} is empty,
then all the functions in \pkg{stringi} return \code{NA}
and generate a warning.
On a syntax error, a quite informative failure message is shown.
If you wish to search for a fixed pattern,
refer to \link{about_search_coll} or \link{about_search_fixed}.
They allow one to perform a locale-aware text lookup
or a very fast exact-byte search, respectively.
}
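\examples{
# a few illustrative calls (a brief sketch; see the individual functions' manuals):
stri_detect_regex('spam and eggs', 'spam|bacon')        # alternation
stri_extract_first_regex('spam123eggs', '[0-9]+')       # set and quantifier
stri_match_first_regex('breakfast=eggs', '(\\\\w+)=(\\\\w+)')  # capture groups
}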
\references{
\emph{Regular expressions} -- ICU User Guide,
\url{https://unicode-org.github.io/icu/userguide/strings/regexp.html}
J.E.F. Friedl, \emph{Mastering Regular Expressions}, O'Reilly, 2002
\emph{Unicode Regular Expressions} -- Unicode Technical Standard #18,
\url{https://www.unicode.org/reports/tr18/}
\emph{Unicode Regular Expressions} -- Regex tutorial,
\url{https://www.regular-expressions.info/unicode.html}
}
\seealso{
The official online manual of \pkg{stringi} at \url{https://stringi.gagolewski.com/}
Gagolewski M., \pkg{stringi}: Fast and portable character string processing in R, \emph{Journal of Statistical Software} 103(2), 2022, 1-59, \doi{10.18637/jss.v103.i02}
Other search_regex:
\code{\link{about_search}},
\code{\link{stri_opts_regex}()}
Other stringi_general_topics:
\code{\link{about_arguments}},
\code{\link{about_encoding}},
\code{\link{about_locale}},
\code{\link{about_search_boundaries}},
\code{\link{about_search_charclass}},
\code{\link{about_search_coll}},
\code{\link{about_search_fixed}},
\code{\link{about_search}},
\code{\link{about_stringi}}
}
\concept{search_regex}
\concept{stringi_general_topics}
\author{
\href{https://www.gagolewski.com/}{Marek Gagolewski} and other contributors
}
stringi/man/stri_enc_mark.Rd 0000644 0001762 0000144 00000004273 14262507664 015635 0 ustar ligges users % Generated by roxygen2: do not edit by hand
% Please edit documentation in R/encoding_management.R
\name{stri_enc_mark}
\alias{stri_enc_mark}
\title{Get Declared Encodings of Each String}
\usage{
stri_enc_mark(str)
}
\arguments{
\item{str}{character vector
or an object coercible to a character vector}
}
\value{
Returns a character vector of the same length as \code{str}.
Unlike in the \code{\link{Encoding}} function, here the possible encodings are:
\code{ASCII}, \code{latin1}, \code{bytes}, \code{native},
and \code{UTF-8}. Additionally, missing values are handled properly.
This gives exactly the same data that is used by
all the functions in \pkg{stringi} to re-encode their inputs.
}
\description{
Reads declared encodings for each string in a character vector
as seen by \pkg{stringi}.
}
\details{
According to \code{\link{Encoding}},
\R has a simple encoding marking mechanism:
strings can be declared to be in \code{latin1},
\code{UTF-8} or \code{bytes}.
Moreover, we may check (via the R/C API) whether
a string is in ASCII (\R assumes that this holds if and only if
all bytes in a string are not greater than 127,
so there is an implicit assumption that your platform uses
an encoding that extends ASCII)
or in the system's default (a.k.a. \code{unknown} in \code{\link{Encoding}})
encoding.
Intuitively, the default encoding should be equivalent to
the one you use on \code{stdin} (e.g., your 'keyboard').
In \pkg{stringi} we assume that such an encoding
is equivalent to the one returned by \code{\link{stri_enc_get}}.
It is automatically detected by \pkg{ICU}
to match -- by default -- the encoding part of the \code{LC_CTYPE} category
as given by \code{\link{Sys.getlocale}}.
}
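\examples{
# a small sketch; the reported marks may depend on how the strings were created:
stri_enc_mark(c('spam', '\u0105\u0104', NA))
}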
\seealso{
The official online manual of \pkg{stringi} at \url{https://stringi.gagolewski.com/}
Gagolewski M., \pkg{stringi}: Fast and portable character string processing in R, \emph{Journal of Statistical Software} 103(2), 2022, 1-59, \doi{10.18637/jss.v103.i02}
Other encoding_management:
\code{\link{about_encoding}},
\code{\link{stri_enc_info}()},
\code{\link{stri_enc_list}()},
\code{\link{stri_enc_set}()}
}
\concept{encoding_management}
\author{
\href{https://www.gagolewski.com/}{Marek Gagolewski} and other contributors
}
stringi/man/stri_match.Rd 0000644 0001762 0000144 00000011567 14262507664 015156 0 ustar ligges users % Generated by roxygen2: do not edit by hand
% Please edit documentation in R/search_match_4.R
\name{stri_match_all}
\alias{stri_match_all}
\alias{stri_match_first}
\alias{stri_match_last}
\alias{stri_match}
\alias{stri_match_all_regex}
\alias{stri_match_first_regex}
\alias{stri_match_last_regex}
\title{Extract Regex Pattern Matches, Together with Capture Groups}
\usage{
stri_match_all(str, ..., regex)
stri_match_first(str, ..., regex)
stri_match_last(str, ..., regex)
stri_match(str, ..., regex, mode = c("first", "all", "last"))
stri_match_all_regex(
str,
pattern,
omit_no_match = FALSE,
cg_missing = NA_character_,
...,
opts_regex = NULL
)
stri_match_first_regex(
str,
pattern,
cg_missing = NA_character_,
...,
opts_regex = NULL
)
stri_match_last_regex(
str,
pattern,
cg_missing = NA_character_,
...,
opts_regex = NULL
)
}
\arguments{
\item{str}{character vector; strings to search in}
\item{...}{supplementary arguments passed to the underlying functions,
including additional settings for \code{opts_regex}}
\item{mode}{single string;
one of: \code{'first'} (the default), \code{'all'}, \code{'last'}}
\item{pattern, regex}{character vector;
search patterns; for more details refer to \link{stringi-search}}
\item{omit_no_match}{single logical value; if \code{FALSE},
then a row with missing values will indicate that there was no match;
\code{stri_match_all_*} only}
\item{cg_missing}{single string to be used if a capture group match
is unavailable}
\item{opts_regex}{a named list with \pkg{ICU} Regex settings,
see \code{\link{stri_opts_regex}}; \code{NULL}
for default settings}
}
\value{
For \code{stri_match_all*},
a list of character matrices is returned. Each list element
represents the results of a different search scenario.
For \code{stri_match_first*} and \code{stri_match_last*}
a character matrix is returned.
Each row corresponds to a different search result.
The first matrix column gives the whole match. The second one corresponds to
the first capture group, the third -- the second capture group, and so on.
If regular expressions feature a named capture group,
the matrix columns will be named accordingly.
However, for \code{stri_match_first*} and \code{stri_match_last*}
this will only be the case if there is a single pattern.
}
\description{
These functions extract substrings in \code{str} that
match a given regex \code{pattern}. Additionally, they extract matches
to every \emph{capture group}, i.e., to all the sub-patterns given
in round parentheses.
}
\details{
Vectorized over \code{str} and \code{pattern} (with recycling
of the elements in the shorter vector if necessary). This allows one to,
for instance, search for one pattern in each given string,
search for each pattern in one given string,
and search for the i-th pattern within the i-th string.
If no pattern match is detected and \code{omit_no_match=FALSE},
then \code{NA}s are included in the resulting matrix (matrices), see Examples.
\code{stri_match}, \code{stri_match_all}, \code{stri_match_first},
and \code{stri_match_last} are convenience functions.
They merely call \code{stri_match_*_regex} and are
provided for consistency with other string searching functions' wrappers,
see, among others, \code{\link{stri_extract}}.
}
\examples{
stri_match_all_regex('breakfast=eggs, lunch=pizza, dessert=icecream',
'(\\\\w+)=(\\\\w+)')
stri_match_all_regex(c('breakfast=eggs', 'lunch=pizza', 'no food here'),
'(\\\\w+)=(\\\\w+)')
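# With omit_no_match=TRUE (see Arguments), the all-NA rows for non-matching
# strings are dropped, leaving zero-row matrices instead:
stri_match_all_regex(c('breakfast=eggs', 'no food here'),
    '(\\\\w+)=(\\\\w+)', omit_no_match=TRUE)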
stri_match_all_regex(c('breakfast=eggs;lunch=pizza',
'breakfast=bacon;lunch=spaghetti', 'no food here'),
'(\\\\w+)=(\\\\w+)')
stri_match_all_regex(c('breakfast=eggs;lunch=pizza',
'breakfast=bacon;lunch=spaghetti', 'no food here'),
'(?<key>\\\\w+)=(?<value>\\\\w+)') # named capture groups
stri_match_first_regex(c('breakfast=eggs;lunch=pizza',
'breakfast=bacon;lunch=spaghetti', 'no food here'),
'(\\\\w+)=(\\\\w+)')
stri_match_last_regex(c('breakfast=eggs;lunch=pizza',
'breakfast=bacon;lunch=spaghetti', 'no food here'),
'(\\\\w+)=(\\\\w+)')
stri_match_first_regex(c('abcd', ':abcd', ':abcd:'), '^(:)?([^:]*)(:)?$')
stri_match_first_regex(c('abcd', ':abcd', ':abcd:'), '^(:)?([^:]*)(:)?$', cg_missing='')
# Match all patterns of the form XYX, including overlapping matches:
stri_match_all_regex('ACAGAGACTTTAGATAGAGAAGA', '(?=(([ACGT])[ACGT]\\\\2))')[[1]][,2]
# Compare the above to:
stri_extract_all_regex('ACAGAGACTTTAGATAGAGAAGA', '([ACGT])[ACGT]\\\\1')
}
\seealso{
The official online manual of \pkg{stringi} at \url{https://stringi.gagolewski.com/}
Gagolewski M., \pkg{stringi}: Fast and portable character string processing in R, \emph{Journal of Statistical Software} 103(2), 2022, 1-59, \doi{10.18637/jss.v103.i02}
Other search_extract:
\code{\link{about_search}},
\code{\link{stri_extract_all_boundaries}()},
\code{\link{stri_extract_all}()}
}
\concept{search_extract}
\author{
\href{https://www.gagolewski.com/}{Marek Gagolewski} and other contributors
}
stringi/man/stri_trans_nf.Rd 0000644 0001762 0000144 00000007265 14350705363 015666 0 ustar ligges users % Generated by roxygen2: do not edit by hand
% Please edit documentation in R/trans_normalization.R
\name{stri_trans_nfc}
\alias{stri_trans_nfc}
\alias{stri_trans_nfd}
\alias{stri_trans_nfkd}
\alias{stri_trans_nfkc}
\alias{stri_trans_nfkc_casefold}
\alias{stri_trans_isnfc}
\alias{stri_trans_isnfd}
\alias{stri_trans_isnfkd}
\alias{stri_trans_isnfkc}
\alias{stri_trans_isnfkc_casefold}
\title{Perform or Check For Unicode Normalization}
\usage{
stri_trans_nfc(str)
stri_trans_nfd(str)
stri_trans_nfkd(str)
stri_trans_nfkc(str)
stri_trans_nfkc_casefold(str)
stri_trans_isnfc(str)
stri_trans_isnfd(str)
stri_trans_isnfkd(str)
stri_trans_isnfkc(str)
stri_trans_isnfkc_casefold(str)
}
\arguments{
\item{str}{character vector to be normalized or checked}
}
\value{
The \code{stri_trans_nf*} functions return a character vector
of the same length as input (the output is always in UTF-8).
\code{stri_trans_isnf*} return a logical vector.
}
\description{
These functions convert strings to NFC, NFKC, NFD, NFKD, or NFKC_Casefold
Unicode Normalization Form or check whether strings are normalized.
}
\details{
Unicode Normalization Forms are formally defined normalizations of Unicode
strings which, e.g., make it possible to determine whether any two
strings are equivalent.
Essentially, the Unicode Normalization Algorithm puts all combining
marks in a specified order, and uses rules for decomposition
and composition to transform each string into one of the
Unicode Normalization Forms.
The following Normalization Forms (NFs) are supported:
\itemize{
\item NFC (Canonical Decomposition, followed by Canonical Composition),
\item NFD (Canonical Decomposition),
\item NFKC (Compatibility Decomposition, followed by Canonical Composition),
\item NFKD (Compatibility Decomposition),
\item NFKC_Casefold (combination of NFKC, case folding, and removing ignorable
characters which was introduced with Unicode 5.2).
}
Note that many W3C Specifications recommend using NFC for all content,
because this form avoids potential interoperability problems arising
from the use of canonically equivalent, yet different,
character sequences in document formats on the Web.
Thus, you will rarely need to use these functions in typical
string processing activities. Most often, you may assume
that a string is already in NFC, see RFC5198.
As usual in \pkg{stringi},
if the input character vector is in the native encoding,
it will be automatically converted to UTF-8.
For more general text transforms refer to \code{\link{stri_trans_general}}.
}
\examples{
stri_trans_nfd('\u0105') # a with ogonek -> a, ogonek
stri_trans_nfkc('\ufdfa') # 1 codepoint -> 18 codepoints
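# the stri_trans_isnf* functions only test whether a string is already
# in the corresponding normalization form:
stri_trans_isnfc(c('\u0105', stri_trans_nfd('\u0105')))
stri_trans_isnfd(c('\u0105', stri_trans_nfd('\u0105')))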
}
\references{
\emph{Unicode Normalization Forms} -- Unicode Standard Annex #15,
\url{https://unicode.org/reports/tr15/}
\emph{Unicode Format for Network Interchange}
-- RFC5198, \url{https://www.rfc-editor.org/rfc/rfc5198}
\emph{Character Model for the World Wide Web 1.0: Normalization}
-- W3C Working Draft, \url{https://www.w3.org/TR/charmod-norm/}
\emph{Normalization} -- ICU User Guide,
\url{https://unicode-org.github.io/icu/userguide/transforms/normalization/}
(technical details)
\emph{Unicode Equivalence} -- Wikipedia,
\url{https://en.wikipedia.org/wiki/Unicode_equivalence}
}
\seealso{
The official online manual of \pkg{stringi} at \url{https://stringi.gagolewski.com/}
Gagolewski M., \pkg{stringi}: Fast and portable character string processing in R, \emph{Journal of Statistical Software} 103(2), 2022, 1-59, \doi{10.18637/jss.v103.i02}
Other transform:
\code{\link{stri_trans_char}()},
\code{\link{stri_trans_general}()},
\code{\link{stri_trans_list}()},
\code{\link{stri_trans_tolower}()}
}
\concept{transform}
\author{
\href{https://www.gagolewski.com/}{Marek Gagolewski} and other contributors
}
stringi/man/stri_enc_isutf16.Rd 0000644 0001762 0000144 00000003144 14262507664 016200 0 ustar ligges users % Generated by roxygen2: do not edit by hand
% Please edit documentation in R/encoding_detection.R
\name{stri_enc_isutf16be}
\alias{stri_enc_isutf16be}
\alias{stri_enc_isutf16le}
\alias{stri_enc_isutf32be}
\alias{stri_enc_isutf32le}
\title{Check If a Data Stream Is Possibly in UTF-16 or UTF-32}
\usage{
stri_enc_isutf16be(str)
stri_enc_isutf16le(str)
stri_enc_isutf32be(str)
stri_enc_isutf32le(str)
}
\arguments{
\item{str}{character vector, a raw vector, or
a list of \code{raw} vectors}
}
\value{
Returns a logical vector.
}
\description{
These functions detect whether a given byte stream is
valid UTF-16LE, UTF-16BE, UTF-32LE, or UTF-32BE.
}
\details{
These functions are independent of the way \R marks encodings in
character strings (see \link{Encoding} and \link{stringi-encoding}).
Most often, these functions act on raw vectors.
A result of \code{FALSE} means that a string is surely not valid UTF-16
or UTF-32. However, false positives are possible.
Also note that a data stream may sometimes be classified
as both valid UTF-16LE and valid UTF-16BE.
}
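\examples{
# An illustration (not part of the original manual); stri_encode with
# to_raw=TRUE yields a list of raw vectors:
x <- stri_encode('stringi', to='UTF-16LE', to_raw=TRUE)[[1]]
stri_enc_isutf16le(x)
stri_enc_isutf16be(x)  # false positives are possible
}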
\seealso{
The official online manual of \pkg{stringi} at \url{https://stringi.gagolewski.com/}
Gagolewski M., \pkg{stringi}: Fast and portable character string processing in R, \emph{Journal of Statistical Software} 103(2), 2022, 1-59, \doi{10.18637/jss.v103.i02}
Other encoding_detection:
\code{\link{about_encoding}},
\code{\link{stri_enc_detect2}()},
\code{\link{stri_enc_detect}()},
\code{\link{stri_enc_isascii}()},
\code{\link{stri_enc_isutf8}()}
}
\concept{encoding_detection}
\author{
\href{https://www.gagolewski.com/}{Marek Gagolewski} and other contributors
}
stringi/man/stri_sort_key.Rd 0000644 0001762 0000144 00000004745 14262507664 015721 0 ustar ligges users % Generated by roxygen2: do not edit by hand
% Please edit documentation in R/sort.R
\name{stri_sort_key}
\alias{stri_sort_key}
\title{Sort Keys}
\usage{
stri_sort_key(str, ..., opts_collator = NULL)
}
\arguments{
\item{str}{a character vector}
\item{...}{additional settings for \code{opts_collator}}
\item{opts_collator}{a named list with \pkg{ICU} Collator's options,
see \code{\link{stri_opts_collator}}, \code{NULL}
for default collation options}
}
\value{
The result is a character vector with the same length as \code{str} that
contains the sort keys. The output is marked as \code{bytes}-encoded.
}
\description{
This function computes a locale-dependent sort key, which is an alternative
character representation of the string that, when ordered in the C locale
(which orders using the underlying bytes directly), will give an equivalent
ordering to the original string. It is useful for enhancing algorithms
that sort only in the C locale (e.g., the \code{strcmp} function in libc)
with the ability to be locale-aware.
}
\details{
For more information on \pkg{ICU}'s Collator and how to tune it up
in \pkg{stringi}, refer to \code{\link{stri_opts_collator}}.
See also \code{\link{stri_rank}} for ranking strings with a single character
vector, i.e., generating relative sort keys.
}
\examples{
stri_sort_key(c('hladny', 'chladny'), locale='pl_PL')
stri_sort_key(c('hladny', 'chladny'), locale='sk_SK')
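# stri_rank (see Details) yields relative rankings directly,
# without materializing the sort keys:
stri_rank(c('hladny', 'chladny'), locale='sk_SK')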
}
\references{
\emph{Collation} - ICU User Guide,
\url{https://unicode-org.github.io/icu/userguide/collation/}
}
\seealso{
The official online manual of \pkg{stringi} at \url{https://stringi.gagolewski.com/}
Gagolewski M., \pkg{stringi}: Fast and portable character string processing in R, \emph{Journal of Statistical Software} 103(2), 2022, 1-59, \doi{10.18637/jss.v103.i02}
Other locale_sensitive:
\code{\link{\%s<\%}()},
\code{\link{about_locale}},
\code{\link{about_search_boundaries}},
\code{\link{about_search_coll}},
\code{\link{stri_compare}()},
\code{\link{stri_count_boundaries}()},
\code{\link{stri_duplicated}()},
\code{\link{stri_enc_detect2}()},
\code{\link{stri_extract_all_boundaries}()},
\code{\link{stri_locate_all_boundaries}()},
\code{\link{stri_opts_collator}()},
\code{\link{stri_order}()},
\code{\link{stri_rank}()},
\code{\link{stri_sort}()},
\code{\link{stri_split_boundaries}()},
\code{\link{stri_trans_tolower}()},
\code{\link{stri_unique}()},
\code{\link{stri_wrap}()}
}
\concept{locale_sensitive}
\author{
\href{https://www.gagolewski.com/}{Marek Gagolewski} and other contributors
}
stringi/man/operator_dollar.Rd 0000644 0001762 0000144 00000003404 14262507664 016200 0 ustar ligges users % Generated by roxygen2: do not edit by hand
% Please edit documentation in R/sprintf.R
\name{\%s$\%}
\alias{\%s$\%}
\alias{operator_dollar}
\alias{oper_dollar}
\alias{\%stri$\%}
\title{C-Style Formatting with \code{\link{stri_sprintf}} as a Binary Operator}
\usage{
e1 \%s$\% e2
e1 \%stri$\% e2
}
\arguments{
\item{e1}{format strings, see \code{\link{stri_sprintf}} for syntax}
\item{e2}{a list of atomic vectors to be passed to \code{\link{stri_sprintf}}
or a single atomic vector}
}
\value{
Returns a character vector.
}
\description{
Provides access to \code{\link{stri_sprintf}} in the form of a binary
operator, in a way similar to Python's \code{\%} overloaded for strings.
Missing values and empty vectors are propagated as usual.
}
\details{
Vectorized over \code{e1} and \code{e2}.
\code{e1 \%s$\% atomic_vector} is equivalent to
\code{e1 \%s$\% list(atomic_vector)}.
}
\examples{
"value='\%d'" \%s$\% 3
"value='\%d'" \%s$\% 1:3
"\%s='\%d'" \%s$\% list("value", 3)
"\%s='\%d'" \%s$\% list("value", 1:3)
"\%s='\%d'" \%s$\% list(c("a", "b", "c"), 1)
"\%s='\%d'" \%s$\% list(c("a", "b", "c"), 1:3)
x <- c("abcd", "\u00DF\u00B5\U0001F970", "abcdef")
cat("[\%6s]" \%s$\% x, sep="\n") # width used, not the number of bytes
}
\seealso{
The official online manual of \pkg{stringi} at \url{https://stringi.gagolewski.com/}
Gagolewski M., \pkg{stringi}: Fast and portable character string processing in R, \emph{Journal of Statistical Software} 103(2), 2022, 1-59, \doi{10.18637/jss.v103.i02}
Other length:
\code{\link{stri_isempty}()},
\code{\link{stri_length}()},
\code{\link{stri_numbytes}()},
\code{\link{stri_pad_both}()},
\code{\link{stri_sprintf}()},
\code{\link{stri_width}()}
}
\concept{length}
\author{
\href{https://www.gagolewski.com/}{Marek Gagolewski} and other contributors
}
stringi/man/stri_read_raw.Rd 0000644 0001762 0000144 00000002226 14523011376 015624 0 ustar ligges users % Generated by roxygen2: do not edit by hand
% Please edit documentation in R/files.R
\name{stri_read_raw}
\alias{stri_read_raw}
\title{Read Text File as Raw}
\usage{
stri_read_raw(con, fname = con)
}
\arguments{
\item{con}{name of the input file or a connection object
(opened in the binary mode)}
\item{fname}{[DEPRECATED] alias of \code{con}}
}
\value{
Returns a vector of type \code{raw}.
}
\description{
Reads a text file as-is, with no conversion or text line splitting.
}
\details{
Once a text file is read into memory,
encoding detection (see \code{\link{stri_enc_detect}}),
conversion (see \code{\link{stri_encode}}), and/or
splitting of text into lines (see \code{\link{stri_split_lines1}})
can be performed.
}
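\examples{
# A sketch of the workflow outlined in Details
# (the temporary file is created only for illustration):
fpath <- tempfile()
stri_write_lines(c('aleph', 'beth'), fpath)
x <- stri_read_raw(fpath)
stri_enc_detect(x)
stri_split_lines1(stri_encode(x, from='UTF-8', to='UTF-8'))
unlink(fpath)
}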
\seealso{
The official online manual of \pkg{stringi} at \url{https://stringi.gagolewski.com/}
Gagolewski M., \pkg{stringi}: Fast and portable character string processing in R, \emph{Journal of Statistical Software} 103(2), 2022, 1-59, \doi{10.18637/jss.v103.i02}
Other files:
\code{\link{stri_read_lines}()},
\code{\link{stri_write_lines}()}
}
\concept{files}
\author{
\href{https://www.gagolewski.com/}{Marek Gagolewski} and other contributors
}
stringi/man/stri_extract_boundaries.Rd 0000644 0001762 0000144 00000010612 14262507664 017735 0 ustar ligges users % Generated by roxygen2: do not edit by hand
% Please edit documentation in R/search_extract_bound.R
\name{stri_extract_all_boundaries}
\alias{stri_extract_all_boundaries}
\alias{stri_extract_last_boundaries}
\alias{stri_extract_first_boundaries}
\alias{stri_extract_all_words}
\alias{stri_extract_first_words}
\alias{stri_extract_last_words}
\title{Extract Data Between Text Boundaries}
\usage{
stri_extract_all_boundaries(
str,
simplify = FALSE,
omit_no_match = FALSE,
...,
opts_brkiter = NULL
)
stri_extract_last_boundaries(str, ..., opts_brkiter = NULL)
stri_extract_first_boundaries(str, ..., opts_brkiter = NULL)
stri_extract_all_words(
str,
simplify = FALSE,
omit_no_match = FALSE,
locale = NULL
)
stri_extract_first_words(str, locale = NULL)
stri_extract_last_words(str, locale = NULL)
}
\arguments{
\item{str}{character vector or an object coercible to}
\item{simplify}{single logical value;
if \code{TRUE} or \code{NA}, then a character matrix is returned;
otherwise (the default), a list of character vectors is given, see Value}
\item{omit_no_match}{single logical value; if \code{FALSE},
then a missing value will indicate that there are no words}
\item{...}{additional settings for \code{opts_brkiter}}
\item{opts_brkiter}{a named list with \pkg{ICU} BreakIterator's settings,
see \code{\link{stri_opts_brkiter}};
\code{NULL} for the default break iterator, i.e., \code{line_break}}
\item{locale}{\code{NULL} or \code{''} for text boundary analysis following
the conventions of the default locale, or a single string with
locale identifier, see \link{stringi-locale}}
}
\value{
For \code{stri_extract_all_*},
if \code{simplify=FALSE} (the default), then a
list of character vectors is returned. Each string consists of
a separate word. In case of \code{omit_no_match=FALSE} and
if there are no words or if a string is missing,
a single \code{NA} is provided on output.
Otherwise, \code{\link{stri_list2matrix}} with \code{byrow=TRUE} argument
is called on the resulting object.
In such a case, a character matrix with \code{length(str)} rows
is returned. Note that \code{\link{stri_list2matrix}}'s \code{fill} argument
is set to an empty string and \code{NA},
for \code{simplify} \code{TRUE} and \code{NA}, respectively.
For \code{stri_extract_first_*} and \code{stri_extract_last_*},
a character vector is returned.
A \code{NA} element indicates a no-match.
}
\description{
These functions extract data between text boundaries.
}
\details{
Vectorized over \code{str}.
For more information on text boundary analysis
performed by \pkg{ICU}'s \code{BreakIterator}, see
\link{stringi-search-boundaries}.
In case of \code{stri_extract_*_words},
just like in \code{\link{stri_count_words}},
\pkg{ICU}'s word \code{BreakIterator} is used
to locate the word boundaries, and all non-word characters
(\code{UBRK_WORD_NONE} rule status) are ignored.
}
\examples{
stri_extract_all_words('stringi: THE string processing package 123.48...')
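# other boundary types can be selected via `...'
# (forwarded to stri_opts_brkiter), e.g.:
stri_extract_all_boundaries('One sentence. And another one.', type='sentence')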
}
\seealso{
The official online manual of \pkg{stringi} at \url{https://stringi.gagolewski.com/}
Gagolewski M., \pkg{stringi}: Fast and portable character string processing in R, \emph{Journal of Statistical Software} 103(2), 2022, 1-59, \doi{10.18637/jss.v103.i02}
Other search_extract:
\code{\link{about_search}},
\code{\link{stri_extract_all}()},
\code{\link{stri_match_all}()}
Other locale_sensitive:
\code{\link{\%s<\%}()},
\code{\link{about_locale}},
\code{\link{about_search_boundaries}},
\code{\link{about_search_coll}},
\code{\link{stri_compare}()},
\code{\link{stri_count_boundaries}()},
\code{\link{stri_duplicated}()},
\code{\link{stri_enc_detect2}()},
\code{\link{stri_locate_all_boundaries}()},
\code{\link{stri_opts_collator}()},
\code{\link{stri_order}()},
\code{\link{stri_rank}()},
\code{\link{stri_sort_key}()},
\code{\link{stri_sort}()},
\code{\link{stri_split_boundaries}()},
\code{\link{stri_trans_tolower}()},
\code{\link{stri_unique}()},
\code{\link{stri_wrap}()}
Other text_boundaries:
\code{\link{about_search_boundaries}},
\code{\link{about_search}},
\code{\link{stri_count_boundaries}()},
\code{\link{stri_locate_all_boundaries}()},
\code{\link{stri_opts_brkiter}()},
\code{\link{stri_split_boundaries}()},
\code{\link{stri_split_lines}()},
\code{\link{stri_trans_tolower}()},
\code{\link{stri_wrap}()}
}
\concept{locale_sensitive}
\concept{search_extract}
\concept{text_boundaries}
\author{
\href{https://www.gagolewski.com/}{Marek Gagolewski} and other contributors
}
stringi/man/stri_stats_general.Rd 0000644 0001762 0000144 00000003636 14262507664 016713 0 ustar ligges users % Generated by roxygen2: do not edit by hand
% Please edit documentation in R/stats.R
\name{stri_stats_general}
\alias{stri_stats_general}
\title{General Statistics for a Character Vector}
\usage{
stri_stats_general(str)
}
\arguments{
\item{str}{character vector to be aggregated}
}
\value{
Returns an integer vector with the following named elements:
\enumerate{
\item \code{Lines} - number of lines (number of
non-missing strings in the vector);
\item \code{LinesNEmpty} - number of lines with at least
one non-\code{WHITE_SPACE} character;
\item \code{Chars} - total number of Unicode code points detected;
\item \code{CharsNWhite} - number of Unicode code points
that are not \code{WHITE_SPACE}s;
\item ... (Other stuff that may appear in future releases of \pkg{stringi}).
}
}
\description{
This function gives general statistics for a character vector,
e.g., obtained by loading a text file with the
\code{\link{readLines}} or \code{\link{stri_read_lines}} function,
where each text line is represented by a separate string.
}
\details{
None of the strings may contain \code{\\r} or \code{\\n} characters,
otherwise you will get an error.
Below, by `white space' we mean the Unicode binary property
\code{WHITE_SPACE}, see \link{stringi-search-charclass}.
}
\examples{
s <- c('Lorem ipsum dolor sit amet, consectetur adipisicing elit.',
'nibh augue, suscipit a, scelerisque sed, lacinia in, mi.',
'Cras vel lorem. Etiam pellentesque aliquet tellus.',
'')
stri_stats_general(s)
}
\seealso{
The official online manual of \pkg{stringi} at \url{https://stringi.gagolewski.com/}
Gagolewski M., \pkg{stringi}: Fast and portable character string processing in R, \emph{Journal of Statistical Software} 103(2), 2022, 1-59, \doi{10.18637/jss.v103.i02}
Other stats:
\code{\link{stri_stats_latex}()}
}
\concept{stats}
\author{
\href{https://www.gagolewski.com/}{Marek Gagolewski} and other contributors
}
stringi/man/stri_pad.Rd 0000644 0001762 0000144 00000006104 14262507664 014615 0 ustar ligges users % Generated by roxygen2: do not edit by hand
% Please edit documentation in R/pad.R
\name{stri_pad_both}
\alias{stri_pad_both}
\alias{stri_pad_left}
\alias{stri_pad_right}
\alias{stri_pad}
\title{Pad (Center/Left/Right Align) a String}
\usage{
stri_pad_both(
str,
width = floor(0.9 * getOption("width")),
pad = " ",
use_length = FALSE
)
stri_pad_left(
str,
width = floor(0.9 * getOption("width")),
pad = " ",
use_length = FALSE
)
stri_pad_right(
str,
width = floor(0.9 * getOption("width")),
pad = " ",
use_length = FALSE
)
stri_pad(
str,
width = floor(0.9 * getOption("width")),
side = c("left", "right", "both"),
pad = " ",
use_length = FALSE
)
}
\arguments{
\item{str}{character vector}
\item{width}{integer vector giving minimal output string lengths}
\item{pad}{character vector giving padding code points}
\item{use_length}{single logical value; should the number of code
points be used instead of the total code point width
(see \code{\link{stri_width}})?}
\item{side}{[\code{stri_pad} only] single character string;
sides on which padding character is added
(\code{left} (default), \code{right}, or \code{both})}
}
\value{
These functions return a character vector.
}
\description{
Add multiple \code{pad} characters at the given \code{side}(s) of each string
so that each output string is of total width of at least \code{width}.
These functions may be used to center or left/right-align each string.
}
\details{
Vectorized over \code{str}, \code{width}, and \code{pad}.
Each string in \code{pad} should consist of code points of total width
equal to 1 or, if \code{use_length} is \code{TRUE}, exactly one code point.
\code{stri_pad} is a convenience function, which dispatches
to \code{stri_pad_*}.
Note that Unicode code points may have various widths when
printed on the console and that, by default, the function takes that
into account. By changing the state of the \code{use_length}
argument, this function starts acting like each code point
was of width 1. This feature is best suited to
text in the Latin script.
See \code{\link{stri_trim_left}} (among others) for reverse operation.
Also check out \code{\link{stri_wrap}} for line wrapping.
}
\examples{
stri_pad_left('stringi', 10, pad='#')
stri_pad_both('stringi', 8:12, pad='*')
# center on screen:
cat(stri_pad_both(c('the', 'string', 'processing', 'package'),
getOption('width')*0.9), sep='\n')
cat(stri_pad_both(c('\ud6c8\ubbfc\uc815\uc74c', # takes width into account
stri_trans_nfkd('\ud6c8\ubbfc\uc815\uc74c'), 'abcd'),
width=10), sep='\n')
}
\seealso{
The official online manual of \pkg{stringi} at \url{https://stringi.gagolewski.com/}
Gagolewski M., \pkg{stringi}: Fast and portable character string processing in R, \emph{Journal of Statistical Software} 103(2), 2022, 1-59, \doi{10.18637/jss.v103.i02}
Other length:
\code{\link{\%s$\%}()},
\code{\link{stri_isempty}()},
\code{\link{stri_length}()},
\code{\link{stri_numbytes}()},
\code{\link{stri_sprintf}()},
\code{\link{stri_width}()}
}
\concept{length}
\author{
\href{https://www.gagolewski.com/}{Marek Gagolewski} and other contributors
}
stringi/man/stri_timezone_list.Rd 0000644 0001762 0000144 00000007107 14262507664 016742 0 ustar ligges users % Generated by roxygen2: do not edit by hand
% Please edit documentation in R/time_zone.R
\name{stri_timezone_list}
\alias{stri_timezone_list}
\title{List Available Time Zone Identifiers}
\usage{
stri_timezone_list(region = NA_character_, offset = NA_integer_)
}
\arguments{
\item{region}{single string;
an ISO 3166 two-letter country code or a UN M.49 three-digit area code;
\code{NA} for all regions}
\item{offset}{single numeric value;
a given raw offset from GMT, in hours;
\code{NA} for all offsets}
}
\value{
Returns a character vector.
}
\description{
Returns a list of available time zone identifiers.
}
\details{
If \code{offset} and \code{region} are \code{NA} (the default), then
all time zones are returned. Otherwise,
only time zone identifiers with a given raw offset from GMT
and/or time zones corresponding to a given region are provided.
Note that the effect of daylight savings time is ignored.
A time zone represents an offset applied to the Greenwich Mean Time (GMT)
to obtain local time (Universal Coordinated Time, or UTC, is similar,
but not precisely identical, to GMT; in \pkg{ICU} the two terms
are used interchangeably since \pkg{ICU} does not concern itself with
either leap seconds or historical behavior).
The offset might vary throughout the year, if daylight savings time (DST)
is used, or might be the same all year long.
Typically, regions closer to the equator do not use DST.
If DST is in use, then specific rules define the point where
the offset changes and the amount by which it changes.
If DST is observed, then three additional bits of information are needed:
\enumerate{
\item The precise date and time during the year when DST begins.
In the first half of the year it is in the northern hemisphere,
and in the second half of the year it is in the southern hemisphere.
\item The precise date and time during the year when DST ends.
In the first half of the year it is in the southern hemisphere,
and in the second half of the year it is in the northern hemisphere.
\item The amount by which the GMT offset changes when DST is in effect.
This is almost always one hour.
}
}
\examples{
stri_timezone_list()
stri_timezone_list(offset=1)
stri_timezone_list(offset=5.5)
stri_timezone_list(offset=5.75)
stri_timezone_list(region='PL')
stri_timezone_list(region='US', offset=-10)
# Fetch information on all time zones
do.call(rbind.data.frame,
lapply(stri_timezone_list(), function(tz) stri_timezone_info(tz)))
}
\references{
\emph{TimeZone} class -- ICU API Documentation,
\url{https://unicode-org.github.io/icu-docs/apidoc/dev/icu4c/classicu_1_1TimeZone.html}
\emph{ICU TimeZone classes} -- ICU User Guide,
\url{https://unicode-org.github.io/icu/userguide/datetime/timezone/}
\emph{Date/Time Services} -- ICU User Guide,
\url{https://unicode-org.github.io/icu/userguide/datetime/}
}
\seealso{
The official online manual of \pkg{stringi} at \url{https://stringi.gagolewski.com/}
Gagolewski M., \pkg{stringi}: Fast and portable character string processing in R, \emph{Journal of Statistical Software} 103(2), 2022, 1-59, \doi{10.18637/jss.v103.i02}
Other datetime:
\code{\link{stri_datetime_add}()},
\code{\link{stri_datetime_create}()},
\code{\link{stri_datetime_fields}()},
\code{\link{stri_datetime_format}()},
\code{\link{stri_datetime_fstr}()},
\code{\link{stri_datetime_now}()},
\code{\link{stri_datetime_symbols}()},
\code{\link{stri_timezone_get}()},
\code{\link{stri_timezone_info}()}
Other timezone:
\code{\link{stri_timezone_get}()},
\code{\link{stri_timezone_info}()}
}
\concept{datetime}
\concept{timezone}
\author{
\href{https://www.gagolewski.com/}{Marek Gagolewski} and other contributors
}
stringi/man/stri_info.Rd 0000644 0001762 0000144 00000003356 14262507664 015012 0 ustar ligges users % Generated by roxygen2: do not edit by hand
% Please edit documentation in R/ICU_settings.R
\name{stri_info}
\alias{stri_info}
\title{Query Default Settings for \pkg{stringi}}
\usage{
stri_info(short = FALSE)
}
\arguments{
\item{short}{logical; whether or not the results should be given
in a concise form; defaults to \code{FALSE}}
}
\value{
If \code{short} is \code{TRUE}, then a single string providing
information on the default character encoding, locale, and Unicode
as well as \pkg{ICU} version is returned.
Otherwise, a list with the following components is returned:
\itemize{
\item \code{Unicode.version} -- version of Unicode supported
by the \pkg{ICU} library;
\item \code{ICU.version} -- \pkg{ICU} library version used;
\item \code{Locale} -- contains information on default locale,
as returned by \code{\link{stri_locale_info}};
\item \code{Charset.internal} -- fixed at \code{c('UTF-8', 'UTF-16')};
\item \code{Charset.native} -- information on the default encoding,
as returned by \code{\link{stri_enc_info}};
\item \code{ICU.system} -- logical; \code{TRUE} indicates that
the system \pkg{ICU} libs are used, otherwise \pkg{ICU} was built together
with \pkg{stringi};
\item \code{ICU.UTF8} -- logical; \code{TRUE} if the internal
\code{U_CHARSET_IS_UTF8} flag is defined and set.
}
}
\description{
Gives the current default settings used by the \pkg{ICU} library.
}
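\examples{
stri_info()
stri_info(short=TRUE)  # a one-string summary
}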
\concept{encoding}
\concept{locale}
\author{
\href{https://www.gagolewski.com/}{Marek Gagolewski} and other contributors
}
\seealso{
The official online manual of \pkg{stringi} at \url{https://stringi.gagolewski.com/}
Gagolewski M., \pkg{stringi}: Fast and portable character string processing in R, \emph{Journal of Statistical Software} 103(2), 2022, 1-59, \doi{10.18637/jss.v103.i02}
}
stringi/man/stri_locale_list.Rd 0000644 0001762 0000144 00000002151 14262507664 016341 0 ustar ligges users % Generated by roxygen2: do not edit by hand
% Please edit documentation in R/locale_management.R
\name{stri_locale_list}
\alias{stri_locale_list}
\title{List Available Locales}
\usage{
stri_locale_list()
}
\value{
Returns a character vector with locale identifiers
that are known to \pkg{ICU}.
}
\description{
Creates a character vector with all available locale identifiers.
}
\details{
Note that some of the services may be unavailable in some locales.
Querying for locale-specific services is always performed
during the resource request.
See \link{stringi-locale} for more information.
}
\examples{
stri_locale_list()
}
\seealso{
The official online manual of \pkg{stringi} at \url{https://stringi.gagolewski.com/}
Gagolewski M., \pkg{stringi}: Fast and portable character string processing in R, \emph{Journal of Statistical Software} 103(2), 2022, 1-59, \doi{10.18637/jss.v103.i02}
Other locale_management:
\code{\link{about_locale}},
\code{\link{stri_locale_info}()},
\code{\link{stri_locale_set}()}
}
\concept{locale_management}
\author{
\href{https://www.gagolewski.com/}{Marek Gagolewski} and other contributors
}
stringi/man/stri_locale_set.Rd 0000644 0001762 0000144 00000003667 14262507664 016176 0 ustar ligges users % Generated by roxygen2: do not edit by hand
% Please edit documentation in R/locale_management.R
\name{stri_locale_set}
\alias{stri_locale_set}
\alias{stri_locale_get}
\title{Set or Get Default Locale in \pkg{stringi}}
\usage{
stri_locale_set(locale)
stri_locale_get()
}
\arguments{
\item{locale}{single string of the form \code{Language},
\code{Language_Country}, or \code{Language_Country_Variant}, e.g.,
\code{'en_US'}, see \code{\link{stri_locale_list}}.}
}
\value{
\code{stri_locale_set} returns a string with
previously used locale, invisibly.
\code{stri_locale_get} returns a string of the form \code{Language},
\code{Language_Country}, or \code{Language_Country_Variant},
e.g., \code{'en_US'}.
}
\description{
\code{stri_locale_set} changes the default locale for all the functions
in the \pkg{stringi} package,
i.e., establishes the meaning of the ``\code{NULL} locale'' argument
of locale-sensitive functions.
\code{stri_locale_get}
gives the current default locale.
}
\details{
See \link{stringi-locale} for more information on the effect of
changing the default locale.
\code{stri_locale_get} is the same as \code{\link{stri_locale_info}(NULL)$Name}.
}
\examples{
\dontrun{
oldloc <- stri_locale_set('pt_BR')
# ... some locale-dependent operations
# ... note that you may always modify a locale per-call
# ... changing the default locale is convenient if you perform
# ... many operations
stri_locale_set(oldloc) # restore the previous default locale
}
}
\seealso{
The official online manual of \pkg{stringi} at \url{https://stringi.gagolewski.com/}
Gagolewski M., \pkg{stringi}: Fast and portable character string processing in R, \emph{Journal of Statistical Software} 103(2), 2022, 1-59, \doi{10.18637/jss.v103.i02}
Other locale_management:
\code{\link{about_locale}},
\code{\link{stri_locale_info}()},
\code{\link{stri_locale_list}()}
}
\concept{locale_management}
\author{
\href{https://www.gagolewski.com/}{Marek Gagolewski} and other contributors
}
stringi/man/stri_na2empty.Rd 0000644 0001762 0000144 00000001661 14262507664 015613 0 ustar ligges users % Generated by roxygen2: do not edit by hand
% Please edit documentation in R/utils.R
\name{stri_na2empty}
\alias{stri_na2empty}
\title{Replace NAs with Empty Strings}
\usage{
stri_na2empty(x)
}
\arguments{
\item{x}{a character vector}
}
\value{
Returns a character vector.
}
\description{
This function replaces all missing values with empty strings.
See \code{\link{stri_replace_na}} for a generalization.
}
\examples{
stri_na2empty(c('a', NA, '', 'b'))
}
\seealso{
The official online manual of \pkg{stringi} at \url{https://stringi.gagolewski.com/}
Gagolewski M., \pkg{stringi}: Fast and portable character string processing in R, \emph{Journal of Statistical Software} 103(2), 2022, 1-59, \doi{10.18637/jss.v103.i02}
Other utils:
\code{\link{stri_list2matrix}()},
\code{\link{stri_remove_empty}()},
\code{\link{stri_replace_na}()}
}
\concept{utils}
\author{
\href{https://www.gagolewski.com/}{Marek Gagolewski} and other contributors
}
stringi/man/stri_enc_isutf8.Rd 0000644 0001762 0000144 00000003573 14262507664 016127 0 ustar ligges users % Generated by roxygen2: do not edit by hand
% Please edit documentation in R/encoding_detection.R
\name{stri_enc_isutf8}
\alias{stri_enc_isutf8}
\title{Check If a Data Stream Is Possibly in UTF-8}
\usage{
stri_enc_isutf8(str)
}
\arguments{
\item{str}{character vector, a raw vector, or
a list of \code{raw} vectors}
}
\value{
Returns a logical vector.
Its i-th element indicates whether the i-th string
corresponds to a valid UTF-8 byte sequence.
}
\description{
This function checks whether the given sequences of bytes form
proper UTF-8 strings.
}
\details{
\code{FALSE} means that a string is certainly not valid UTF-8.
However, false positives are possible. For instance,
the byte sequence \code{(c4, 85)} represents 'a with ogonek' in UTF-8
as well as 'A umlaut' followed by 'ellipsis' in WINDOWS-1250.
Also note that UTF-8, as well as most 8-bit encodings, extends ASCII
(hence, a \code{TRUE} returned by \code{\link{stri_enc_isascii}} implies
a \code{TRUE} from \code{\link{stri_enc_isutf8}}).
However, the longer the sequence,
the greater the possibility that the result
is indeed in UTF-8 -- this is because not all sequences of bytes
are valid UTF-8.
This function is independent of the way \R marks encodings in
character strings (see \link{Encoding} and \link{stringi-encoding}).
}
\examples{
stri_enc_isutf8(letters[1:3])
stri_enc_isutf8('\u0105\u0104')
stri_enc_isutf8('\u1234\u0222')
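# the ambiguous byte sequence mentioned in Details, given as a raw vector:
stri_enc_isutf8(as.raw(c(0xc4, 0x85)))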
}
\seealso{
The official online manual of \pkg{stringi} at \url{https://stringi.gagolewski.com/}
Gagolewski M., \pkg{stringi}: Fast and portable character string processing in R, \emph{Journal of Statistical Software} 103(2), 2022, 1-59, \doi{10.18637/jss.v103.i02}
Other encoding_detection:
\code{\link{about_encoding}},
\code{\link{stri_enc_detect2}()},
\code{\link{stri_enc_detect}()},
\code{\link{stri_enc_isascii}()},
\code{\link{stri_enc_isutf16be}()}
}
\concept{encoding_detection}
\author{
\href{https://www.gagolewski.com/}{Marek Gagolewski} and other contributors
}
stringi/man/stri_length.Rd 0000644 0001762 0000144 00000003774 14262507664 015344 0 ustar ligges users % Generated by roxygen2: do not edit by hand
% Please edit documentation in R/length.R
\name{stri_length}
\alias{stri_length}
\title{Count the Number of Code Points}
\usage{
stri_length(str)
}
\arguments{
\item{str}{character vector or an object coercible to}
}
\value{
Returns an integer vector of the same length as \code{str}.
}
\description{
This function returns the number of code points
in each string.
}
\details{
Note that the number of code points is
not the same as the `width` of the string when
printed on the console.
If a given string is in UTF-8 and has not been properly normalized
(e.g., by \code{\link{stri_trans_nfc}}), the returned counts may sometimes be
misleading. See \code{\link{stri_count_boundaries}} for a method to count
\emph{Unicode characters}. Moreover, if an incorrect UTF-8 byte sequence
is detected, then a warning is generated and the corresponding output element
is set to \code{NA}, see also \code{\link{stri_enc_toutf8}} for a method
to deal with such cases.
Missing values are handled properly.
For `byte` encodings we get, as usual, an error.
}
\examples{
stri_length(LETTERS)
stri_length(c('abc', '123', '\u0105\u0104'))
stri_length('\u0105') # length is one, but...
stri_numbytes('\u0105') # 2 bytes are used
stri_numbytes(stri_trans_nfkd('\u0105')) # 3 bytes here but...
stri_length(stri_trans_nfkd('\u0105')) # ...two code points (!)
stri_count_boundaries(stri_trans_nfkd('\u0105'), type='character') # ...and one Unicode character
}
\seealso{
The official online manual of \pkg{stringi} at \url{https://stringi.gagolewski.com/}
Gagolewski M., \pkg{stringi}: Fast and portable character string processing in R, \emph{Journal of Statistical Software} 103(2), 2022, 1-59, \doi{10.18637/jss.v103.i02}
Other length:
\code{\link{\%s$\%}()},
\code{\link{stri_isempty}()},
\code{\link{stri_numbytes}()},
\code{\link{stri_pad_both}()},
\code{\link{stri_sprintf}()},
\code{\link{stri_width}()}
}
\concept{length}
\author{
\href{https://www.gagolewski.com/}{Marek Gagolewski} and other contributors
}
stringi/man/operator_compare.Rd 0000644 0001762 0000144 00000005110 14262507664 016345 0 ustar ligges users % Generated by roxygen2: do not edit by hand
% Please edit documentation in R/compare.R
\name{\%s<\%}
\alias{\%s<\%}
\alias{operator_compare}
\alias{oper_comparison}
\alias{oper_compare}
\alias{\%s<=\%}
\alias{\%s>\%}
\alias{\%s>=\%}
\alias{\%s==\%}
\alias{\%s!=\%}
\alias{\%s===\%}
\alias{\%s!==\%}
\alias{\%stri<\%}
\alias{\%stri<=\%}
\alias{\%stri>\%}
\alias{\%stri>=\%}
\alias{\%stri==\%}
\alias{\%stri!=\%}
\alias{\%stri===\%}
\alias{\%stri!==\%}
\title{Compare Strings with or without Collation}
\usage{
e1 \%s<\% e2
e1 \%s<=\% e2
e1 \%s>\% e2
e1 \%s>=\% e2
e1 \%s==\% e2
e1 \%s!=\% e2
e1 \%s===\% e2
e1 \%s!==\% e2
e1 \%stri<\% e2
e1 \%stri<=\% e2
e1 \%stri>\% e2
e1 \%stri>=\% e2
e1 \%stri==\% e2
e1 \%stri!=\% e2
e1 \%stri===\% e2
e1 \%stri!==\% e2
}
\arguments{
\item{e1, e2}{character vectors or objects coercible to character vectors}
}
\value{
All the functions return a logical vector
indicating the result of a pairwise comparison.
As usual, the elements of shorter vectors are recycled if necessary.
}
\description{
Relational operators for comparing corresponding strings in
two character vectors, with a typical R look-and-feel.
}
\details{
These functions call \code{\link{stri_cmp_le}} or its
friends, using the default collator options.
As a consequence, they are vectorized over \code{e1} and \code{e2}.
\code{\%stri==\%} tests for canonical equivalence of strings
(see \code{\link{stri_cmp_equiv}}) and is a locale-dependent operation.
\code{\%stri===\%} performs a locale-independent,
code point-based comparison.
}
\examples{
'a' \%stri<\% 'b'
c('a', 'b', 'c') \%stri>=\% 'b'
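# canonical equivalence vs code point-based comparison (see Details):
'\u0105' \%s==\% stri_trans_nfd('\u0105')   # TRUE: canonically equivalent
'\u0105' \%s===\% stri_trans_nfd('\u0105')  # FALSE: different code point sequences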
}
\seealso{
The official online manual of \pkg{stringi} at \url{https://stringi.gagolewski.com/}
Gagolewski M., \pkg{stringi}: Fast and portable character string processing in R, \emph{Journal of Statistical Software} 103(2), 2022, 1-59, \doi{10.18637/jss.v103.i02}
Other locale_sensitive:
\code{\link{about_locale}},
\code{\link{about_search_boundaries}},
\code{\link{about_search_coll}},
\code{\link{stri_compare}()},
\code{\link{stri_count_boundaries}()},
\code{\link{stri_duplicated}()},
\code{\link{stri_enc_detect2}()},
\code{\link{stri_extract_all_boundaries}()},
\code{\link{stri_locate_all_boundaries}()},
\code{\link{stri_opts_collator}()},
\code{\link{stri_order}()},
\code{\link{stri_rank}()},
\code{\link{stri_sort_key}()},
\code{\link{stri_sort}()},
\code{\link{stri_split_boundaries}()},
\code{\link{stri_trans_tolower}()},
\code{\link{stri_unique}()},
\code{\link{stri_wrap}()}
}
\concept{locale_sensitive}
\author{
\href{https://www.gagolewski.com/}{Marek Gagolewski} and other contributors
}
stringi/man/stri_datetime_fields.Rd 0000644 0001762 0000144 00000005044 14262507664 017175 0 ustar ligges users % Generated by roxygen2: do not edit by hand
% Please edit documentation in R/time_calendar.R
\name{stri_datetime_fields}
\alias{stri_datetime_fields}
\title{Get Values for Date and Time Fields}
\usage{
stri_datetime_fields(time, tz = attr(time, "tzone"), locale = NULL)
}
\arguments{
\item{time}{an object of class \code{\link{POSIXct}}
(\code{as.POSIXct} will be called on character vectors
and objects of class \code{POSIXlt}, \code{Date}, and \code{factor})}
\item{tz}{\code{NULL} or \code{''} for the default time zone or
a single string with time zone identifier, see \code{\link{stri_timezone_list}}}
\item{locale}{\code{NULL} or \code{''} for the current default locale,
or a single string with a locale identifier; a non-Gregorian calendar
may be specified by setting the \code{@calendar=name} keyword}
}
\value{
Returns a data frame with the following columns:
\enumerate{
\item Year (0 is 1BC, -1 is 2BC, etc.)
\item Month (1-based, i.e., 1 stands for the first month, e.g., January;
note that the number of months depends on the selected calendar,
see \code{\link{stri_datetime_symbols}})
\item Day
\item Hour (24-h clock)
\item Minute
\item Second
\item Millisecond
\item WeekOfYear (this is locale-dependent)
\item WeekOfMonth (this is locale-dependent)
\item DayOfYear
\item DayOfWeek (1-based, 1 denotes Sunday; see \code{\link{stri_datetime_symbols}})
\item Hour12 (12-h clock)
\item AmPm (see \code{\link{stri_datetime_symbols}})
\item Era (see \code{\link{stri_datetime_symbols}})
}
}
\description{
Computes and returns values for all date and time fields.
}
\details{
Vectorized over \code{time}.
}
\examples{
stri_datetime_fields(stri_datetime_now())
stri_datetime_fields(stri_datetime_now(), locale='@calendar=hebrew')
stri_datetime_symbols(locale='@calendar=hebrew')$Month[
stri_datetime_fields(stri_datetime_now(), locale='@calendar=hebrew')$Month
]
}
\seealso{
The official online manual of \pkg{stringi} at \url{https://stringi.gagolewski.com/}
Gagolewski M., \pkg{stringi}: Fast and portable character string processing in R, \emph{Journal of Statistical Software} 103(2), 2022, 1-59, \doi{10.18637/jss.v103.i02}
Other datetime:
\code{\link{stri_datetime_add}()},
\code{\link{stri_datetime_create}()},
\code{\link{stri_datetime_format}()},
\code{\link{stri_datetime_fstr}()},
\code{\link{stri_datetime_now}()},
\code{\link{stri_datetime_symbols}()},
\code{\link{stri_timezone_get}()},
\code{\link{stri_timezone_info}()},
\code{\link{stri_timezone_list}()}
}
\concept{datetime}
\author{
\href{https://www.gagolewski.com/}{Marek Gagolewski} and other contributors
}
stringi/man/about_search_coll.Rd 0000644 0001762 0000144 00000005671 14350705363 016462 0 ustar ligges users % Generated by roxygen2: do not edit by hand
% Please edit documentation in R/search.R
\name{about_search_coll}
\alias{about_search_coll}
\alias{search_coll}
\alias{stringi-search-coll}
\title{Locale-Sensitive Text Searching in \pkg{stringi}}
\description{
String searching facilities described here
provide a way to locate a specific piece of
text. Interestingly, locale-sensitive searching, especially
on a non-English text, is a much more complex process
than it seems at first glance.
}
\section{Locale-Aware String Search Engine}{
All \code{stri_*_coll} functions in \pkg{stringi} use
\pkg{ICU}'s \code{StringSearch} engine,
which implements a locale-sensitive string search algorithm.
The matches are defined by using the notion of ``canonical equivalence''
between strings.
Tuning the Collator's parameters allows you to perform correct matching
that properly takes into account accented letters, conjoined letters,
ignorable punctuation and letter case.
For more information on \pkg{ICU}'s Collator and the search engine
and how to tune it up
in \pkg{stringi}, refer to \code{\link{stri_opts_collator}}.
Please note that \pkg{ICU}'s \code{StringSearch}-based functions
are often much slower than those that perform fixed pattern searches.
}
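\examples{
# An illustration (not part of the original manual): with primary collation
# strength (an option of stri_opts_collator), accents and case are ignored:
stri_detect_coll('Caf\u00e9 Rouge', 'cafe', strength=1, locale='en_US')
stri_detect_fixed('Caf\u00e9 Rouge', 'cafe')  # a bytewise match fails here
}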
\references{
\emph{ICU String Search Service} -- ICU User Guide,
\url{https://unicode-org.github.io/icu/userguide/collation/string-search.html}
L. Werner, \emph{Efficient Text Searching in Java}, 1999,
\url{https://icu-project.org/docs/papers/efficient_text_searching_in_java.html}
}
\seealso{
The official online manual of \pkg{stringi} at \url{https://stringi.gagolewski.com/}
Gagolewski M., \pkg{stringi}: Fast and portable character string processing in R, \emph{Journal of Statistical Software} 103(2), 2022, 1-59, \doi{10.18637/jss.v103.i02}
Other search_coll:
\code{\link{about_search}},
\code{\link{stri_opts_collator}()}
Other locale_sensitive:
\code{\link{\%s<\%}()},
\code{\link{about_locale}},
\code{\link{about_search_boundaries}},
\code{\link{stri_compare}()},
\code{\link{stri_count_boundaries}()},
\code{\link{stri_duplicated}()},
\code{\link{stri_enc_detect2}()},
\code{\link{stri_extract_all_boundaries}()},
\code{\link{stri_locate_all_boundaries}()},
\code{\link{stri_opts_collator}()},
\code{\link{stri_order}()},
\code{\link{stri_rank}()},
\code{\link{stri_sort_key}()},
\code{\link{stri_sort}()},
\code{\link{stri_split_boundaries}()},
\code{\link{stri_trans_tolower}()},
\code{\link{stri_unique}()},
\code{\link{stri_wrap}()}
Other stringi_general_topics:
\code{\link{about_arguments}},
\code{\link{about_encoding}},
\code{\link{about_locale}},
\code{\link{about_search_boundaries}},
\code{\link{about_search_charclass}},
\code{\link{about_search_fixed}},
\code{\link{about_search_regex}},
\code{\link{about_search}},
\code{\link{about_stringi}}
}
\concept{locale_sensitive}
\concept{search_coll}
\concept{stringi_general_topics}
\author{
\href{https://www.gagolewski.com/}{Marek Gagolewski} and other contributors
}
stringi/man/stri_wrap.Rd 0000644 0001762 0000144 00000013703 14262507664 015025 0 ustar ligges users % Generated by roxygen2: do not edit by hand
% Please edit documentation in R/wrap.R
\name{stri_wrap}
\alias{stri_wrap}
\title{Word Wrap Text to Format Paragraphs}
\usage{
stri_wrap(
str,
width = floor(0.9 * getOption("width")),
cost_exponent = 2,
simplify = TRUE,
normalize = TRUE,
normalise = normalize,
indent = 0,
exdent = 0,
prefix = "",
initial = prefix,
whitespace_only = FALSE,
use_length = FALSE,
locale = NULL
)
}
\arguments{
\item{str}{character vector of strings to reformat}
\item{width}{single integer giving the suggested
maximal total width/number of code points per line}
\item{cost_exponent}{single numeric value, values not greater than zero
will select a greedy word-wrapping algorithm; otherwise
this value denotes the exponent in the cost function
of a (more aesthetic) dynamic programming-based algorithm
(values in [2, 3] are recommended)}
\item{simplify}{single logical value, see Value}
\item{normalize}{single logical value, see Details}
\item{normalise}{alias of \code{normalize}}
\item{indent}{single non-negative integer; gives the indentation of the
first line in each paragraph}
\item{exdent}{single non-negative integer; specifies the indentation
of subsequent lines in paragraphs}
\item{prefix, initial}{single strings; \code{prefix} is used as prefix for each
line except the first, for which \code{initial} is utilized}
\item{whitespace_only}{single logical value; allow breaks only at white-spaces?
if \code{FALSE}, \pkg{ICU}'s line break iterator is used to split text
into words, which is suitable for natural language processing}
\item{use_length}{single logical value; should the number of code
points be used instead of the total code point width (see \code{\link{stri_width}})?}
\item{locale}{\code{NULL} or \code{''} for text boundary analysis following
the conventions of the default locale, or a single string with
locale identifier, see \link{stringi-locale}}
}
\value{
If \code{simplify} is \code{TRUE}, then a character vector is returned.
Otherwise, you will get a list of \code{length(str)} character vectors.
}
\description{
This function breaks text paragraphs into lines,
of total width (if it is possible) at most given \code{width}.
}
\details{
Vectorized over \code{str}.
If \code{whitespace_only} is \code{FALSE},
then \pkg{ICU}'s line-\code{BreakIterator} is used to determine
text boundaries where a line break is possible.
This is a locale-dependent operation.
Otherwise, the breaks are only at white-spaces.
Note that Unicode code points may have various widths when
printed on the console and that this function, by default, takes that
into account. By changing the state of the \code{use_length}
argument, this function starts to act as if each code point
was of width 1.
If \code{normalize} is \code{FALSE},
then multiple white spaces between the word boundaries are
preserved within each wrapped line.
In such a case, none of the strings can contain \code{\\r}, \code{\\n},
or other new line characters, otherwise you will get an error.
You should split the input text into lines
or, for example, substitute line breaks with spaces
before applying this function.
If \code{normalize} is \code{TRUE}, then
all consecutive white space (ASCII space, horizontal TAB, CR, LF)
sequences are replaced with single ASCII spaces
before actual string wrapping. Moreover, \code{\link{stri_split_lines}}
and \code{\link{stri_trans_nfc}} is called on the input character vector.
This is for compatibility with \code{\link{strwrap}}.
The greedy algorithm (for \code{cost_exponent} being non-positive)
provides a very simple way of wrapping words:
it always puts as many words in each line as possible.
This method -- contrary to the dynamic algorithm -- does not minimize
the number of spaces left at the end of every line.
The dynamic algorithm (a.k.a. Knuth's word wrapping algorithm)
is more complex, but it returns text wrapped
in a more aesthetic way. This method minimizes the squared
(by default, see \code{cost_exponent}) number of spaces (raggedness)
at the end of each line, so that the text is arranged more evenly.
Note that the cost of printing the last line is always zero.
}
\examples{
s <- stri_paste(
'Lorem ipsum dolor sit amet, consectetur adipisicing elit. Proin ',
'nibh augue, suscipit a, scelerisque sed, lacinia in, mi. Cras vel ',
'lorem. Etiam pellentesque aliquet tellus.')
cat(stri_wrap(s, 20, 0.0), sep='\n') # greedy
cat(stri_wrap(s, 20, 2.0), sep='\n') # dynamic
cat(stri_pad(stri_wrap(s), side='both'), sep='\n')
}
\references{
D.E. Knuth, M.F. Plass,
Breaking paragraphs into lines, \emph{Software: Practice and Experience} 11(11),
1981, pp. 1119--1184.
}
\seealso{
The official online manual of \pkg{stringi} at \url{https://stringi.gagolewski.com/}
Gagolewski M., \pkg{stringi}: Fast and portable character string processing in R, \emph{Journal of Statistical Software} 103(2), 2022, 1-59, \doi{10.18637/jss.v103.i02}
Other locale_sensitive:
\code{\link{\%s<\%}()},
\code{\link{about_locale}},
\code{\link{about_search_boundaries}},
\code{\link{about_search_coll}},
\code{\link{stri_compare}()},
\code{\link{stri_count_boundaries}()},
\code{\link{stri_duplicated}()},
\code{\link{stri_enc_detect2}()},
\code{\link{stri_extract_all_boundaries}()},
\code{\link{stri_locate_all_boundaries}()},
\code{\link{stri_opts_collator}()},
\code{\link{stri_order}()},
\code{\link{stri_rank}()},
\code{\link{stri_sort_key}()},
\code{\link{stri_sort}()},
\code{\link{stri_split_boundaries}()},
\code{\link{stri_trans_tolower}()},
\code{\link{stri_unique}()}
Other text_boundaries:
\code{\link{about_search_boundaries}},
\code{\link{about_search}},
\code{\link{stri_count_boundaries}()},
\code{\link{stri_extract_all_boundaries}()},
\code{\link{stri_locate_all_boundaries}()},
\code{\link{stri_opts_brkiter}()},
\code{\link{stri_split_boundaries}()},
\code{\link{stri_split_lines}()},
\code{\link{stri_trans_tolower}()}
}
\concept{locale_sensitive}
\concept{text_boundaries}
\author{
\href{https://www.gagolewski.com/}{Marek Gagolewski} and other contributors
}
stringi/man/about_search_fixed.Rd 0000644 0001762 0000144 00000003700 14262507664 016625 0 ustar ligges users % Generated by roxygen2: do not edit by hand
% Please edit documentation in R/search.R
\name{about_search_fixed}
\alias{about_search_fixed}
\alias{search_fixed}
\alias{stringi-search-fixed}
\title{Locale-Insensitive Fixed Pattern Matching in \pkg{stringi}}
\description{
String searching facilities described here
provide a way to locate a specific sequence of bytes in a string.
The search engine's settings may be tuned up (for example
to perform case-insensitive search) via a call to the
\code{\link{stri_opts_fixed}} function.
}
\section{Byte Compare}{
The fast Knuth-Morris-Pratt search algorithm, with worst time complexity of
O(n+p) (\code{n == length(str)}, \code{p == length(pattern)})
is implemented (with some tweaks for very short search patterns).
Be aware that, for natural language processing,
fixed pattern searching might not be what
you actually require. This is because a bitwise match will
not give correct results in cases of:
\enumerate{
\item accented letters;
\item conjoined letters;
\item ignorable punctuation;
\item ignorable case,
}
see also \link{about_search_coll}.
Note that the conversion of input data
to Unicode is done as usual.
}
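\examples{
# An illustration (not part of the original manual): simple case-insensitive
# matching can be requested via an option of stri_opts_fixed:
stri_detect_fixed('STRINGI', 'stringi')
stri_detect_fixed('STRINGI', 'stringi', case_insensitive=TRUE)
}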
\seealso{
The official online manual of \pkg{stringi} at \url{https://stringi.gagolewski.com/}
Gagolewski M., \pkg{stringi}: Fast and portable character string processing in R, \emph{Journal of Statistical Software} 103(2), 2022, 1-59, \doi{10.18637/jss.v103.i02}
Other search_fixed:
\code{\link{about_search}},
\code{\link{stri_opts_fixed}()}
Other stringi_general_topics:
\code{\link{about_arguments}},
\code{\link{about_encoding}},
\code{\link{about_locale}},
\code{\link{about_search_boundaries}},
\code{\link{about_search_charclass}},
\code{\link{about_search_coll}},
\code{\link{about_search_regex}},
\code{\link{about_search}},
\code{\link{about_stringi}}
}
\concept{search_fixed}
\concept{stringi_general_topics}
\author{
\href{https://www.gagolewski.com/}{Marek Gagolewski} and other contributors
}
stringi/man/stri_trans_char.Rd 0000644 0001762 0000144 00000003245 14262507664 016200 0 ustar ligges users % Generated by roxygen2: do not edit by hand
% Please edit documentation in R/trans_other.R
\name{stri_trans_char}
\alias{stri_trans_char}
\title{Translate Characters}
\usage{
stri_trans_char(str, pattern, replacement)
}
\arguments{
\item{str}{character vector}
\item{pattern}{a single character string providing code points to be translated}
\item{replacement}{a single character string giving translated code points}
}
\value{
Returns a character vector.
}
\description{
Translates Unicode code points in each input string.
}
\details{
Vectorized over \code{str} and with respect to each code point
in \code{pattern} and \code{replacement}.
If \code{pattern} and \code{replacement} consist of a different number
of code points, then the extra code points in the longer of the two
are ignored, with a warning.
If code points in a given \code{pattern} are not unique, the
last corresponding replacement code point is used.
Time complexity for each string in \code{str} is
O(\code{stri_length(str)*stri_length(pattern)}).
}
\examples{
stri_trans_char('id.123', '.', '_')
stri_trans_char('babaab', 'ab', '01')
stri_trans_char('GCUACGGAGCUUCGGAGCUAG', 'ACGT', 'TGCA')
}
\seealso{
The official online manual of \pkg{stringi} at \url{https://stringi.gagolewski.com/}
Gagolewski M., \pkg{stringi}: Fast and portable character string processing in R, \emph{Journal of Statistical Software} 103(2), 2022, 1-59, \doi{10.18637/jss.v103.i02}
Other transform:
\code{\link{stri_trans_general}()},
\code{\link{stri_trans_list}()},
\code{\link{stri_trans_nfc}()},
\code{\link{stri_trans_tolower}()}
}
\concept{transform}
\author{
\href{https://www.gagolewski.com/}{Marek Gagolewski} and other contributors
}
stringi/man/stri_rand_shuffle.Rd 0000644 0001762 0000144 00000002327 14262507664 016514 0 ustar ligges users % Generated by roxygen2: do not edit by hand
% Please edit documentation in R/random.R
\name{stri_rand_shuffle}
\alias{stri_rand_shuffle}
\title{Randomly Shuffle Code Points in Each String}
\usage{
stri_rand_shuffle(str)
}
\arguments{
\item{str}{character vector}
}
\value{
Returns a character vector.
}
\description{
Generates a (pseudo)random permutation of the code points
in each string.
}
\details{
This operation may result in non-Unicode-normalized
strings and may give peculiar outputs in case of bidirectional strings.
See also \code{\link{stri_reverse}} for reversing the order of code points.
}
\examples{
stri_rand_shuffle(c('abcdefghi', '0123456789'))
# you can do better than this with stri_rand_strings:
stri_rand_shuffle(rep(stri_paste(letters, collapse=''), 10))
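# a sketch of the alternative mentioned above (note that stri_rand_strings
# samples code points with replacement, unlike shuffling):
stri_rand_strings(10, 26, '[a-z]')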
}
\seealso{
The official online manual of \pkg{stringi} at \url{https://stringi.gagolewski.com/}
Gagolewski M., \pkg{stringi}: Fast and portable character string processing in R, \emph{Journal of Statistical Software} 103(2), 2022, 1-59, \doi{10.18637/jss.v103.i02}
Other random:
\code{\link{stri_rand_lipsum}()},
\code{\link{stri_rand_strings}()}
}
\concept{random}
\author{
\href{https://www.gagolewski.com/}{Marek Gagolewski} and other contributors
}
stringi/man/stri_enc_list.Rd 0000644 0001762 0000144 00000003304 14262507664 015650 0 ustar ligges users % Generated by roxygen2: do not edit by hand
% Please edit documentation in R/encoding_management.R
\name{stri_enc_list}
\alias{stri_enc_list}
\title{List Known Character Encodings}
\usage{
stri_enc_list(simplify = TRUE)
}
\arguments{
\item{simplify}{single logical value; return a character vector or a
list of character vectors?}
}
\value{
If \code{simplify} is \code{FALSE}, a list of
character vectors is returned. Each list element represents a unique
character encoding. The \code{name} attribute gives the \pkg{ICU} Canonical
Name of an encoding family. The elements (character vectors) are
its aliases.
If \code{simplify} is \code{TRUE} (the default), then the resulting list
is coerced to a character vector, sorted, and returned with
duplicate entries removed.
}
\description{
Gives the list of encodings that are supported by \pkg{ICU}.
}
\details{
Apart from the encoding identifiers and their aliases listed here,
some other specifiers might additionally be accepted.
This is because \pkg{ICU} tries to normalize
converter names. For instance, \code{'UTF8'} is also valid;
see \link{stringi-encoding} for more information.
}
\examples{
stri_enc_list()
stri_enc_list(FALSE)
}
\seealso{
The official online manual of \pkg{stringi} at \url{https://stringi.gagolewski.com/}
Gagolewski M., \pkg{stringi}: Fast and portable character string processing in R, \emph{Journal of Statistical Software} 103(2), 2022, 1-59, \doi{10.18637/jss.v103.i02}
Other encoding_management:
\code{\link{about_encoding}},
\code{\link{stri_enc_info}()},
\code{\link{stri_enc_mark}()},
\code{\link{stri_enc_set}()}
}
\concept{encoding_management}
\author{
\href{https://www.gagolewski.com/}{Marek Gagolewski} and other contributors
}
stringi/man/stri_enc_toutf32.Rd 0000644 0001762 0000144 00000003275 14262507664 016212 0 ustar ligges users % Generated by roxygen2: do not edit by hand
% Please edit documentation in R/encoding_conversion.R
\name{stri_enc_toutf32}
\alias{stri_enc_toutf32}
\title{Convert Strings To UTF-32}
\usage{
stri_enc_toutf32(str)
}
\arguments{
\item{str}{a character vector (or an object coercible to)
to be converted}
}
\value{
Returns a list of integer vectors.
Missing values are converted to \code{NULL}s.
}
\description{
UTF-32 is a 32-bit encoding where each Unicode code point
corresponds to exactly one integer value.
This function converts a character vector to a list
of integer vectors so that, e.g.,
individual code points may be easily accessed, changed, etc.
}
\details{
See \code{\link{stri_enc_fromutf32}} for the inverse operation.
This function is roughly equivalent to a vectorized call
to \code{\link{utf8ToInt}(enc2utf8(str))}.
If you want a list of raw vectors on output,
use \code{\link{stri_encode}}.
Unlike \code{utf8ToInt}, if an ill-formed UTF-8 byte sequence is detected,
the corresponding list element is set to \code{NULL} and a warning is generated.
To deal with such issues, use, e.g., \code{\link{stri_enc_toutf8}}.
}
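\examples{
# a minimal sketch of the rough equivalence mentioned in Details:
stri_enc_toutf32(c('abc', 'za\u017c\u00f3\u0142\u0107'))
lapply(enc2utf8(c('abc', 'za\u017c\u00f3\u0142\u0107')), utf8ToInt)
}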
\seealso{
The official online manual of \pkg{stringi} at \url{https://stringi.gagolewski.com/}
Gagolewski M., \pkg{stringi}: Fast and portable character string processing in R, \emph{Journal of Statistical Software} 103(2), 2022, 1-59, \doi{10.18637/jss.v103.i02}
Other encoding_conversion:
\code{\link{about_encoding}},
\code{\link{stri_enc_fromutf32}()},
\code{\link{stri_enc_toascii}()},
\code{\link{stri_enc_tonative}()},
\code{\link{stri_enc_toutf8}()},
\code{\link{stri_encode}()}
}
\concept{encoding_conversion}
\author{
\href{https://www.gagolewski.com/}{Marek Gagolewski} and other contributors
}
stringi/man/operator_add.Rd 0000644 0001762 0000144 00000002752 14262507664 015460 0 ustar ligges users % Generated by roxygen2: do not edit by hand
% Please edit documentation in R/join.R
\name{\%s+\%}
\alias{\%s+\%}
\alias{oper_plus}
\alias{operator_add}
\alias{operator_plus}
\alias{\%stri+\%}
\title{Concatenate Two Character Vectors}
\usage{
e1 \%s+\% e2
e1 \%stri+\% e2
}
\arguments{
\item{e1}{a character vector or an object coercible to a character vector}
\item{e2}{a character vector or an object coercible to a character vector}
}
\value{
Returns a character vector.
}
\description{
Binary operators for joining (concatenating) two character vectors,
with a typical R look-and-feel.
}
\details{
Vectorized over \code{e1} and \code{e2}.
These operators act like a call to \code{\link{stri_join}(e1, e2, sep='')}.
However, note that joining 3 vectors, e.g., \code{e1 \%s+\% e2 \%s+\% e3}
is slower than \code{\link{stri_join}(e1, e2, e3, sep='')},
because it creates a new (temporary) result vector each time
the operator is applied.
}
\examples{
c('abc', '123', 'xy') \%s+\% letters[1:6]
'ID_' \%s+\% 1:5
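# equivalent to the stri_join() call mentioned in Details:
stri_join('ID_', 1:5, sep='')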
}
\seealso{
The official online manual of \pkg{stringi} at \url{https://stringi.gagolewski.com/}
Gagolewski M., \pkg{stringi}: Fast and portable character string processing in R, \emph{Journal of Statistical Software} 103(2), 2022, 1-59, \doi{10.18637/jss.v103.i02}
Other join:
\code{\link{stri_dup}()},
\code{\link{stri_flatten}()},
\code{\link{stri_join_list}()},
\code{\link{stri_join}()}
}
\concept{join}
\author{
\href{https://www.gagolewski.com/}{Marek Gagolewski} and other contributors
}
stringi/man/stri_replace_na.Rd 0000644 0001762 0000144 00000002627 14262507664 016150 0 ustar ligges users % Generated by roxygen2: do not edit by hand
% Please edit documentation in R/utils.R
\name{stri_replace_na}
\alias{stri_replace_na}
\title{Replace Missing Values in a Character Vector}
\usage{
stri_replace_na(str, replacement = "NA")
}
\arguments{
\item{str}{character vector or an object coercible to}
\item{replacement}{single string}
}
\value{
Returns a character vector.
}
\description{
This function gives a convenient way to replace each missing (\code{NA})
value with a given string.
}
\details{
This function is roughly equivalent to
\code{str2 <- stri_enc_toutf8(str);
str2[is.na(str2)] <- stri_enc_toutf8(replacement);
str2}.
It may be used, e.g., wherever the 'plain R' \code{NA} handling is
desired, see Examples.
}
\examples{
x <- c('test', NA)
stri_paste(x, 1:2) # 'test1' NA
paste(x, 1:2) # 'test 1' 'NA 2'
stri_paste(stri_replace_na(x), 1:2, sep=' ') # 'test 1' 'NA 2'
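# an illustrative sketch with a custom replacement string:
stri_replace_na(x, replacement='<missing>')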
}
\seealso{
The official online manual of \pkg{stringi} at \url{https://stringi.gagolewski.com/}
Gagolewski M., \pkg{stringi}: Fast and portable character string processing in R, \emph{Journal of Statistical Software} 103(2), 2022, 1-59, \doi{10.18637/jss.v103.i02}
Other utils:
\code{\link{stri_list2matrix}()},
\code{\link{stri_na2empty}()},
\code{\link{stri_remove_empty}()}
}
\concept{utils}
\author{
\href{https://www.gagolewski.com/}{Marek Gagolewski} and other contributors
}
stringi/man/stri_unescape_unicode.Rd 0000644 0001762 0000144 00000003040 14522301203 017332 0 ustar ligges users % Generated by roxygen2: do not edit by hand
% Please edit documentation in R/escape.R
\name{stri_unescape_unicode}
\alias{stri_unescape_unicode}
\title{Un-escape All Escape Sequences}
\usage{
stri_unescape_unicode(str)
}
\arguments{
\item{str}{character vector}
}
\value{
Returns a character vector.
If an escape sequence is ill-formed,
the result will be \code{NA} and a warning will be given.
}
\description{
Un-escapes all known escape sequences.
}
\details{
Uses \pkg{ICU}'s facilities to un-escape Unicode character sequences.
The following escape sequences are recognized:
\code{\\a}, \code{\\b}, \code{\\t}, \code{\\n}, \code{\\v}, \code{\\?},
\code{\\e}, \code{\\f}, \code{\\r}, \code{\"}, \code{\'}, \code{\\\\},
\code{\\uXXXX} (4 hex digits),
\code{\\UXXXXXXXX} (8 hex digits),
\code{\\xXX} (1-2 hex digits),
\code{\\ooo} (1-3 octal digits),
\code{\\cX} (control-X; X is masked with 0x1F).
For \code{\\xXX} and \code{\\ooo}, beware of non-valid UTF-8 byte sequences.
Note that some versions of R on Windows cannot handle
characters defined with \code{\\UXXXXXXXX}.
}
\examples{
stri_unescape_unicode('a\\\\u0105!\\\\u0032\\\\n')
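# illustrative round trip with stri_escape_unicode:
stri_unescape_unicode(stri_escape_unicode('za\u017c\u00f3\u0142\u0107'))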
}
\seealso{
The official online manual of \pkg{stringi} at \url{https://stringi.gagolewski.com/}
Gagolewski M., \pkg{stringi}: Fast and portable character string processing in R, \emph{Journal of Statistical Software} 103(2), 2022, 1-59, \doi{10.18637/jss.v103.i02}
Other escape:
\code{\link{stri_escape_unicode}()}
}
\concept{escape}
\author{
\href{https://www.gagolewski.com/}{Marek Gagolewski} and other contributors
}
stringi/man/about_encoding.Rd 0000644 0001762 0000144 00000026072 14262507664 015776 0 ustar ligges users % Generated by roxygen2: do not edit by hand
% Please edit documentation in R/encoding.R
\name{about_encoding}
\alias{about_encoding}
\alias{stringi-encoding}
\alias{encoding}
\title{Character Encodings and \pkg{stringi}}
\description{
This manual page explains how \pkg{stringi} deals with character
strings in various encodings.
In particular we should note that:
\itemize{
\item \R lets strings in ASCII, UTF-8, and your platform's
native encoding coexist. A character vector printed on the console
by calling \code{\link{print}} or \code{\link{cat}} is
silently re-encoded to the native encoding.
\item Functions in \pkg{stringi} process each string internally in
Unicode, the most universal character encoding ever.
Even if a string is given in the native encoding, i.e., your platform's
default one, it will be converted to Unicode (precisely: UTF-8 or UTF-16).
\item Most \pkg{stringi} functions always return UTF-8 encoded strings,
regardless of the input encoding. What is more, the functions have been
optimized for UTF-8/ASCII input (they offer competitive, if not better,
performance, especially when performing more complex operations like
string comparison, sorting, and even concatenation). Thus, it is
best to rely solely on cascading calls to \pkg{stringi} operations.
}
}
\details{
Quoting the ICU User Guide,
'Hundreds of encodings have been developed over the years, each for small
groups of languages and for special purposes. As a result,
the interpretation of text, input, sorting, display, and storage
depends on the knowledge of all the different types of character sets
and their encodings. Programs have been written to handle either
one single encoding at a time and switch between them, or to convert
between external and internal encodings.'
'Unicode provides a single character set that covers the major
languages of the world, and a small number of machine-friendly encoding
forms and schemes to fit the needs of existing applications and protocols.
It is designed for best interoperability with both ASCII and ISO-8859-1
(the most widely used character sets) to make it easier for Unicode to be
used in almost all applications and protocols' (see the ICU User Guide).
The Unicode Standard determines the way to map any possible character
to a numeric value -- a so-called code point. Such code points, however,
have to be stored somehow in the computer's memory.
The Unicode Standard encodes characters in the range U+0000..U+10FFFF,
which amounts to a 21-bit code space. Depending on the encoding
form (UTF-8, UTF-16, or UTF-32), each character will
then be represented either as a sequence of one to four 8-bit bytes,
one or two 16-bit code units, or a single 32-bit integer
(compare the ICU FAQ).
Unicode can be thought of as a superset of the spectrum of characters
supported by any given code page.
}
\section{UTF-8 and UTF-16}{
For portability reasons, the UTF-8 encoding is the most natural choice
for representing Unicode character strings in \R. UTF-8 has ASCII as its
subset (code points 1--127 represent the same characters in both of them).
Code points larger than 127 are represented by multi-byte sequences
(from 2 to 4 bytes; note that not all sequences of bytes
are valid UTF-8, compare \code{\link{stri_enc_isutf8}}).
Most of the computations in \pkg{stringi} are performed internally
using either UTF-8 or UTF-16 encodings (this depends on type of service
you request: some \pkg{ICU} services are designed only to work with UTF-16).
Thanks to this choice, with \pkg{stringi} you get the same result on
every platform, which is -- unfortunately -- not the case with base \R's
functions (for instance, it is known that performing a regular expression
search under Linux on some texts may give a different result
from that obtained under Windows). We really had portability in mind
while developing our package!
We have observed that \R correctly handles UTF-8 strings regardless of your
platform's native encoding (see below). Therefore, we decided that most
functions in \pkg{stringi} will output their results in UTF-8
-- this speeds up computations on cascading calls to our functions:
the strings do not have to be re-encoded each time.
Note that some Unicode characters may have an ambiguous representation.
For example, ``a with ogonek'' (one code point) and ``a''+``ogonek''
(two code points) are semantically the same. \pkg{stringi} provides functions
to normalize character sequences, see \code{\link{stri_trans_nfc}}
for discussion. However, denormalized strings
appear very rarely in typical string processing activities.
Additionally, do note that \pkg{stringi} silently removes byte order marks
(BOMs -- they may incidentally appear in a string read from a text file)
from UTF-8-encoded strings, see \code{\link{stri_enc_toutf8}}.
}
\section{Character Encodings in \R}{
Data in memory are just bytes (small integer
values) -- an en\emph{coding} is a way to represent characters with such
numbers; it is a semantic 'key' to understanding a given byte sequence.
For example, in ISO-8859-2 (Central European), the value 177 represents
Polish ``a with ogonek'', and in ISO-8859-1 (Western European),
the same value denotes the ``plus-minus'' sign. Thus, a character encoding
is a translation scheme: to exchange text data with \R, we need to know
how it represents strings.
Overall, \R has a very simple encoding marking mechanism,
see \code{\link{stri_enc_mark}}. There is an implicit assumption
that your platform's default (native) encoding always extends
ASCII -- \pkg{stringi} checks that whenever your native encoding
is detected automatically on \pkg{ICU}'s initialization and each time
you change it manually by calling \code{\link{stri_enc_set}}.
Character strings in \R (internally) can be declared to be in:
\itemize{
\item \code{UTF-8};
\item \code{latin1}, i.e., either ISO-8859-1 (Western European on
Linux, OS X, and other Unixes) or WINDOWS-1252 (Windows);
\item \code{bytes} -- for strings that
should be manipulated as sequences of bytes.
}
Moreover, there are two other cases:
\itemize{
\item ASCII -- for strings consisting only of byte codes
not greater than 127;
\item \code{native} (a.k.a. \code{unknown} in \code{\link{Encoding}};
quite a misleading name: no explicit encoding mark) -- for
strings that are assumed to be in your platform's native (default) encoding.
This can represent UTF-8 if you are an OS X user,
or some 8-bit Windows code page, for example.
The native encoding used by \R may be determined by examining
the LC_CTYPE category, see \code{\link{Sys.getlocale}}.
}
Intuitively, ``native'' strings result from reading
a string from stdin (e.g., keyboard input). This makes sense: your operating
system works in some encoding and provides \R with some data.
Each time a \pkg{stringi} function encounters a string declared
in native encoding, it assumes that the input data should be translated
from the default encoding, i.e., the one returned by \code{\link{stri_enc_get}}
(unless you know what you are doing, the default encoding should only be
changed if the automatic encoding detection process fails on \pkg{stringi}
load).
Functions which allow \code{'bytes'} encoding markings are very rare in
\pkg{stringi}, and were carefully selected. These are:
\code{\link{stri_enc_toutf8}} (with argument \code{is_unknown_8bit=TRUE}),
\code{\link{stri_enc_toascii}}, and \code{\link{stri_encode}}.
Finally, note that \R lets strings in ASCII, UTF-8, and your platform's
native encoding coexist. A character vector printed with
\code{\link{print}}, \code{\link{cat}}, etc., is silently re-encoded
so that it can be properly shown, e.g., on the console.
}
\section{Encoding Conversion}{
Apart from automatic conversion from the native encoding,
you may re-encode a string manually, for example
when you read it from a file created on a different platform.
Call \code{\link{stri_enc_list}} for the list of
encodings supported by \pkg{ICU}.
Note that converter names are case-insensitive
and \pkg{ICU} tries to normalize the encoding specifiers.
Leading zeroes are ignored in sequences of digits (if further digits follow),
and all non-alphanumeric characters are ignored. Thus the strings
'UTF-8', 'utf_8', 'u*Tf08' and 'Utf 8' are equivalent.
The \code{\link{stri_encode}} function
allows you to convert between any given encodings
(in some cases you will obtain \code{bytes}-marked
strings, or even lists of raw vectors, e.g., for UTF-16).
There are also some useful, more specialized functions,
like \code{\link{stri_enc_toutf32}} (converts a character vector to a list
of integers, where one code point is exactly one numeric value)
or \code{\link{stri_enc_toascii}} (substitutes all non-ASCII
bytes with the SUBSTITUTE CHARACTER,
which plays a role similar to that of \R's \code{NA} value).
There are also some routines for automated encoding detection,
see, e.g., \code{\link{stri_enc_detect}}.
}
\section{Encoding Detection}{
Given a text file, one has to know how to interpret (encode)
raw data in order to obtain meaningful information.
Encoding detection is always an imprecise operation and
needs a considerable amount of data. However, in the case of some
encodings (like UTF-8, ASCII, or UTF-32) a ``false positive'' byte
sequence is quite rare (statistically speaking).
Check out \code{\link{stri_enc_detect}} (among others) for a useful
function in this category.
}
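\examples{
# a few illustrative sketches of the facilities discussed above:
stri_enc_mark(c('abc', 'za\u017c\u00f3\u0142\u0107'))  # encoding declarations
stri_enc_toutf32('abc')                                # code points as integers
stri_enc_toascii('za\u017c\u00f3\u0142\u0107')         # non-ASCII -> SUBSTITUTE CHARACTER
}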
\references{
\emph{Unicode Basics} -- ICU User Guide,
\url{https://unicode-org.github.io/icu/userguide/icu/unicode.html}
\emph{Conversion} -- ICU User Guide,
\url{https://unicode-org.github.io/icu/userguide/conversion/}
\emph{Converters} -- ICU User Guide,
\url{https://unicode-org.github.io/icu/userguide/conversion/converters.html}
(technical details)
\emph{UTF-8, UTF-16, UTF-32 & BOM} -- ICU FAQ,
\url{https://www.unicode.org/faq/utf_bom.html}
}
\seealso{
The official online manual of \pkg{stringi} at \url{https://stringi.gagolewski.com/}
Gagolewski M., \pkg{stringi}: Fast and portable character string processing in R, \emph{Journal of Statistical Software} 103(2), 2022, 1-59, \doi{10.18637/jss.v103.i02}
Other stringi_general_topics:
\code{\link{about_arguments}},
\code{\link{about_locale}},
\code{\link{about_search_boundaries}},
\code{\link{about_search_charclass}},
\code{\link{about_search_coll}},
\code{\link{about_search_fixed}},
\code{\link{about_search_regex}},
\code{\link{about_search}},
\code{\link{about_stringi}}
Other encoding_management:
\code{\link{stri_enc_info}()},
\code{\link{stri_enc_list}()},
\code{\link{stri_enc_mark}()},
\code{\link{stri_enc_set}()}
Other encoding_detection:
\code{\link{stri_enc_detect2}()},
\code{\link{stri_enc_detect}()},
\code{\link{stri_enc_isascii}()},
\code{\link{stri_enc_isutf16be}()},
\code{\link{stri_enc_isutf8}()}
Other encoding_conversion:
\code{\link{stri_enc_fromutf32}()},
\code{\link{stri_enc_toascii}()},
\code{\link{stri_enc_tonative}()},
\code{\link{stri_enc_toutf32}()},
\code{\link{stri_enc_toutf8}()},
\code{\link{stri_encode}()}
}
\concept{encoding_conversion}
\concept{encoding_detection}
\concept{encoding_management}
\concept{stringi_general_topics}
\author{
\href{https://www.gagolewski.com/}{Marek Gagolewski} and other contributors
}
stringi/man/stri_width.Rd 0000644 0001762 0000144 00000005164 14266677210 015174 0 ustar ligges users % Generated by roxygen2: do not edit by hand
% Please edit documentation in R/length.R
\name{stri_width}
\alias{stri_width}
\title{Determine the Width of Code Points}
\usage{
stri_width(str)
}
\arguments{
\item{str}{character vector or an object coercible to}
}
\value{
Returns an integer vector of the same length as \code{str}.
}
\description{
Approximates the number of text columns the \code{cat} function
might use to print a string using a mono-spaced font.
}
\details{
The Unicode standard does not formalize the notion of a character
width. Roughly based on \url{http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c},
\url{https://github.com/nodejs/node/blob/master/src/node_i18n.cc},
and UAX #11 we proceed as follows.
The following code points are of width 0:
\itemize{
\item code points with general category (see \link{stringi-search-charclass})
\code{Me}, \code{Mn}, and \code{Cf},
\item \code{C0} and \code{C1} control codes (general category \code{Cc})
- for compatibility with the \code{\link{nchar}} function,
\item Hangul Jamo medial vowels and final consonants
(code points with enumerable property \code{UCHAR_HANGUL_SYLLABLE_TYPE}
equal to \code{U_HST_VOWEL_JAMO} or \code{U_HST_TRAILING_JAMO};
note that applying the NFC normalization with \code{\link{stri_trans_nfc}}
is encouraged),
\item ZERO WIDTH SPACE (U+200B),
}
Characters with the \code{UCHAR_EAST_ASIAN_WIDTH} enumerable property
equal to \code{U_EA_FULLWIDTH} or \code{U_EA_WIDE} are
of width 2.
Most emojis and characters with general category So (other symbols)
are of width 2.
SOFT HYPHEN (U+00AD) (for compatibility with \code{\link{nchar}}),
as well as all other code points, are of width 1.
}
\examples{
stri_width(LETTERS[1:5])
stri_width(stri_trans_nfkd('\u0105'))
stri_width(stri_trans_nfkd('\U0001F606'))
stri_width( # Full-width equivalents of ASCII characters:
stri_enc_fromutf32(as.list(c(0x3000, 0xFF01:0xFF5E)))
)
stri_width(stri_trans_nfkd('\ubc1f')) # includes Hangul Jamo medial vowels and final consonants
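stri_width('a\u0301')  # combining marks (category Mn) are zero-width, see above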
}
\references{
\emph{East Asian Width} -- Unicode Standard Annex #11,
\url{https://www.unicode.org/reports/tr11/}
}
\seealso{
The official online manual of \pkg{stringi} at \url{https://stringi.gagolewski.com/}
Gagolewski M., \pkg{stringi}: Fast and portable character string processing in R, \emph{Journal of Statistical Software} 103(2), 2022, 1-59, \doi{10.18637/jss.v103.i02}
Other length:
\code{\link{\%s$\%}()},
\code{\link{stri_isempty}()},
\code{\link{stri_length}()},
\code{\link{stri_numbytes}()},
\code{\link{stri_pad_both}()},
\code{\link{stri_sprintf}()}
}
\concept{length}
\author{
\href{https://www.gagolewski.com/}{Marek Gagolewski} and other contributors
}
stringi/man/stri_enc_detect2.Rd 0000644 0001762 0000144 00000006325 14523017034 016220 0 ustar ligges users % Generated by roxygen2: do not edit by hand
% Please edit documentation in R/encoding_detection.R
\name{stri_enc_detect2}
\alias{stri_enc_detect2}
\title{[DEPRECATED] Detect Locale-Sensitive Character Encoding}
\usage{
stri_enc_detect2(str, locale = NULL)
}
\arguments{
\item{str}{character vector, a raw vector, or
a list of \code{raw} vectors}
\item{locale}{\code{NULL} or \code{''} for the default locale,
or a single string with locale identifier.}
}
\value{
Just like \code{\link{stri_enc_detect}},
this function returns a list of length equal to the length of \code{str}.
Each list element is a data frame with the following three named components:
\itemize{
\item \code{Encoding} -- string; guessed encodings; \code{NA} on failure
(if and only if \code{encodings} is empty),
\item \code{Language} -- always \code{NA},
\item \code{Confidence} -- numeric in [0,1]; the higher the value,
the more confidence there is in the match; \code{NA} on failure.
}
The guesses are ordered by decreasing confidence.
}
\description{
This function tries to detect character encoding
in case the language of text is known.
}
\details{
Vectorized over \code{str}.
First, it is checked whether the text is valid
UTF-32BE, UTF-32LE, UTF-16BE, UTF-16LE, UTF-8
(as in \code{\link{stri_enc_detect}},
this is roughly inspired by \pkg{ICU}'s \code{i18n/csrucode.cpp}), or ASCII.
If \code{locale} is not \code{NA} and the above fails,
the text is checked for the number of occurrences
of language-specific code points (data provided by the \pkg{ICU} library)
converted to all possible 8-bit encodings
that fully cover the indicated language.
The encoding is selected based on the greatest number of total
byte hits.
The guess is of course imprecise,
as it is obtained using statistics and heuristics.
Because of this, detection works best if you supply at least a few hundred
bytes of character data that is in a single language.
If you have no initial guess on the language and encoding, try with
\code{\link{stri_enc_detect}} (uses \pkg{ICU} facilities).
}
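\examples{
# an illustrative sketch (this function is deprecated, prefer stri_enc_detect):
x <- stri_encode('za\u017c\u00f3\u0142\u0107 g\u0119\u015bl\u0105 ja\u017a\u0144', to='ISO-8859-2')
stri_enc_detect2(x, locale='pl_PL')
}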
\seealso{
The official online manual of \pkg{stringi} at \url{https://stringi.gagolewski.com/}
Gagolewski M., \pkg{stringi}: Fast and portable character string processing in R, \emph{Journal of Statistical Software} 103(2), 2022, 1-59, \doi{10.18637/jss.v103.i02}
Other locale_sensitive:
\code{\link{\%s<\%}()},
\code{\link{about_locale}},
\code{\link{about_search_boundaries}},
\code{\link{about_search_coll}},
\code{\link{stri_compare}()},
\code{\link{stri_count_boundaries}()},
\code{\link{stri_duplicated}()},
\code{\link{stri_extract_all_boundaries}()},
\code{\link{stri_locate_all_boundaries}()},
\code{\link{stri_opts_collator}()},
\code{\link{stri_order}()},
\code{\link{stri_rank}()},
\code{\link{stri_sort_key}()},
\code{\link{stri_sort}()},
\code{\link{stri_split_boundaries}()},
\code{\link{stri_trans_tolower}()},
\code{\link{stri_unique}()},
\code{\link{stri_wrap}()}
Other encoding_detection:
\code{\link{about_encoding}},
\code{\link{stri_enc_detect}()},
\code{\link{stri_enc_isascii}()},
\code{\link{stri_enc_isutf16be}()},
\code{\link{stri_enc_isutf8}()}
}
\concept{encoding_detection}
\concept{locale_sensitive}
\author{
\href{https://www.gagolewski.com/}{Marek Gagolewski} and other contributors
}
stringi/man/stri_startsendswith.Rd 0000644 0001762 0000144 00000010323 14262507664 017135 0 ustar ligges users % Generated by roxygen2: do not edit by hand
% Please edit documentation in R/search_startsendswith_4.R
\name{stri_startswith}
\alias{stri_startswith}
\alias{stri_endswith}
\alias{stri_startswith_fixed}
\alias{stri_endswith_fixed}
\alias{stri_startswith_charclass}
\alias{stri_endswith_charclass}
\alias{stri_startswith_coll}
\alias{stri_endswith_coll}
\title{Determine if the Start or End of a String Matches a Pattern}
\usage{
stri_startswith(str, ..., fixed, coll, charclass)
stri_endswith(str, ..., fixed, coll, charclass)
stri_startswith_fixed(
str,
pattern,
from = 1L,
negate = FALSE,
...,
opts_fixed = NULL
)
stri_endswith_fixed(
str,
pattern,
to = -1L,
negate = FALSE,
...,
opts_fixed = NULL
)
stri_startswith_charclass(str, pattern, from = 1L, negate = FALSE)
stri_endswith_charclass(str, pattern, to = -1L, negate = FALSE)
stri_startswith_coll(
str,
pattern,
from = 1L,
negate = FALSE,
...,
opts_collator = NULL
)
stri_endswith_coll(
str,
pattern,
to = -1L,
negate = FALSE,
...,
opts_collator = NULL
)
}
\arguments{
\item{str}{character vector}
\item{...}{supplementary arguments passed to the underlying functions,
including additional settings for \code{opts_collator}, \code{opts_fixed},
and so on.}
\item{pattern, fixed, coll, charclass}{character vector defining search patterns;
for more details refer to \link{stringi-search}}
\item{from}{integer vector}
\item{negate}{single logical value; whether a no-match to a pattern
is rather of interest}
\item{to}{integer vector}
\item{opts_collator, opts_fixed}{a named list used to tune up
the search engine's settings; see \code{\link{stri_opts_collator}}
and \code{\link{stri_opts_fixed}}, respectively; \code{NULL}
for the defaults}
}
\value{
Each function returns a logical vector.
}
\description{
These functions check if a string starts or ends with a match
to a given pattern. Also, it is possible to check if there is a match
at a specific position.
}
\details{
Vectorized over \code{str}, \code{pattern},
and \code{from} or \code{to} (with recycling
of the elements in the shorter vector if necessary).
If \code{pattern} is empty, then the result is \code{NA}
and a warning is generated.
Argument \code{from} controls the position in \code{str} at which
a match to \code{pattern} must start.
\code{to} gives the position at which the match must end.
Indexes given by \code{from} or \code{to} are of course 1-based,
i.e., an index 1 denotes the first character
in a string. This gives a typical R look-and-feel.
For negative indexes in \code{from} or \code{to}, counting starts
at the end of the string. For instance, index -1 denotes the last code point
in the string.
If you wish to test for a pattern match at an arbitrary
position in \code{str}, use \code{\link{stri_detect}}.
\code{stri_startswith} and \code{stri_endswith} are convenience functions.
They call either \code{stri_*_fixed}, \code{stri_*_coll},
or \code{stri_*_charclass}, depending on the argument used.
Relying on these underlying functions directly will make your code run
slightly faster.
Note that testing for a pattern match at the start or end of a string
has not been implemented separately for regex patterns.
For that you may use the '\code{^}' and '\code{$}' meta-characters,
see \link{stringi-search-regex}.
}
\examples{
stri_startswith_charclass(' trim me! ', '\\\\p{WSpace}')
stri_startswith_fixed(c('a1', 'a2', 'b3', 'a4', 'c5'), 'a')
stri_detect_regex(c('a1', 'a2', 'b3', 'a4', 'c5'), '^a')
stri_startswith_fixed('ababa', 'ba')
stri_startswith_fixed('ababa', 'ba', from=2)
stri_startswith_coll(c('a1', 'A2', 'b3', 'A4', 'C5'), 'a', strength=1)
pat <- stri_paste('\u0635\u0644\u0649 \u0627\u0644\u0644\u0647 ',
'\u0639\u0644\u064a\u0647 \u0648\u0633\u0644\u0645XYZ')
stri_endswith_coll('\ufdfa\ufdfa\ufdfaXYZ', pat, strength=1)
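# negative indexes count from the end of the string (see Details); here the
# match would have to end exactly at the second-to-last code point:
stri_endswith_fixed('ababa', 'ba', to=-2)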
}
\seealso{
The official online manual of \pkg{stringi} at \url{https://stringi.gagolewski.com/}
Gagolewski M., \pkg{stringi}: Fast and portable character string processing in R, \emph{Journal of Statistical Software} 103(2), 2022, 1-59, \doi{10.18637/jss.v103.i02}
Other search_detect:
\code{\link{about_search}},
\code{\link{stri_detect}()}
}
\concept{search_detect}
\author{
\href{https://www.gagolewski.com/}{Marek Gagolewski} and other contributors
}
stringi/man/stri_timezone_info.Rd 0000644 0001762 0000144 00000005037 14262507664 016722 0 ustar ligges users % Generated by roxygen2: do not edit by hand
% Please edit documentation in R/time_zone.R
\name{stri_timezone_info}
\alias{stri_timezone_info}
\title{Query a Given Time Zone}
\usage{
stri_timezone_info(tz = NULL, locale = NULL, display_type = "long")
}
\arguments{
\item{tz}{\code{NULL} or \code{''} for default time zone,
or a single string with time zone ID otherwise}
\item{locale}{\code{NULL} or \code{''} for default locale,
or a single string with locale identifier}
\item{display_type}{single string;
one of \code{'short'}, \code{'long'}, \code{'generic_short'},
\code{'generic_long'}, \code{'gmt_short'}, \code{'gmt_long'},
\code{'common'}, \code{'generic_location'}}
}
\value{
Returns a list with the following named components:
\enumerate{
\item \code{ID} (time zone identifier),
\item \code{Name} (localized human-readable time zone name),
\item \code{Name.Daylight} (localized human-readable time zone
name when DST is used, if available),
\item \code{Name.Windows} (Windows time zone ID, if available),
\item \code{RawOffset} (raw GMT offset, in hours, before taking
daylight savings into account), and
\item \code{UsesDaylightTime} (states whether a time zone uses
daylight savings time in the current Gregorian calendar year).
}
}
\description{
Provides some basic information on a given time zone identifier.
}
\details{
Used to fetch basic information
on any supported time zone.
For more information on time zone representation in \pkg{ICU},
see \code{\link{stri_timezone_list}}.
}
\examples{
stri_timezone_info()
stri_timezone_info(locale='sk_SK')
sapply(c('short', 'long', 'generic_short', 'generic_long',
'gmt_short', 'gmt_long', 'common', 'generic_location'),
function(e) stri_timezone_info('Europe/London', display_type=e))
}
\seealso{
The official online manual of \pkg{stringi} at \url{https://stringi.gagolewski.com/}
Gagolewski M., \pkg{stringi}: Fast and portable character string processing in R, \emph{Journal of Statistical Software} 103(2), 2022, 1-59, \doi{10.18637/jss.v103.i02}
Other datetime:
\code{\link{stri_datetime_add}()},
\code{\link{stri_datetime_create}()},
\code{\link{stri_datetime_fields}()},
\code{\link{stri_datetime_format}()},
\code{\link{stri_datetime_fstr}()},
\code{\link{stri_datetime_now}()},
\code{\link{stri_datetime_symbols}()},
\code{\link{stri_timezone_get}()},
\code{\link{stri_timezone_list}()}
Other timezone:
\code{\link{stri_timezone_get}()},
\code{\link{stri_timezone_list}()}
}
\concept{datetime}
\concept{timezone}
\author{
\href{https://www.gagolewski.com/}{Marek Gagolewski} and other contributors
}
stringi/man/stri_opts_collator.Rd 0000644 0001762 0000144 00000011231 14523017034 016715 0 ustar ligges users % Generated by roxygen2: do not edit by hand
% Please edit documentation in R/opts.R
\name{stri_opts_collator}
\alias{stri_opts_collator}
\alias{stri_coll}
\title{Generate a List with Collator Settings}
\usage{
stri_opts_collator(
locale = NULL,
strength = 3L,
alternate_shifted = FALSE,
french = FALSE,
uppercase_first = NA,
case_level = FALSE,
normalization = FALSE,
normalisation = normalization,
numeric = FALSE
)
stri_coll(
locale = NULL,
strength = 3L,
alternate_shifted = FALSE,
french = FALSE,
uppercase_first = NA,
case_level = FALSE,
normalization = FALSE,
normalisation = normalization,
numeric = FALSE
)
}
\arguments{
\item{locale}{single string, \code{NULL} or
\code{''} for default locale}
\item{strength}{single integer in \{1,2,3,4\}, which defines collation strength;
\code{1} for the most permissive collation rules, \code{4} for the strictest
ones}
\item{alternate_shifted}{single logical value; \code{FALSE}
treats all the code points with non-ignorable primary weights in the same way,
\code{TRUE} causes code points with primary weights that are equal or below
the variable top value to be ignored on primary level and moved to the quaternary level}
\item{french}{single logical value; used in Canadian French;
\code{TRUE} results in secondary weights being considered backwards}
\item{uppercase_first}{single logical value; \code{NA}
orders upper and lower case letters in accordance to their tertiary weights,
\code{TRUE} forces upper case letters to sort before lower case letters,
\code{FALSE} does the opposite}
\item{case_level}{single logical value;
controls whether an extra case level (positioned before the third level) is generated or not}
\item{normalization}{single logical value; if \code{TRUE},
then incremental check is performed to see whether the input data is in
the FCD form. If the data is not in the FCD form, incremental NFD
normalization is performed}
\item{normalisation}{alias of \code{normalization}}
\item{numeric}{single logical value;
when turned on, this attribute generates a collation key for
the numeric value of substrings of digits;
this is a way to get '100' to sort AFTER '2';
note that negative or non-integer numbers will not be ordered properly}
}
\value{
Returns a named list object; missing settings are left with default values.
}
\description{
A convenience function to tune the \pkg{ICU} Collator's behavior,
e.g., in \code{\link{stri_compare}}, \code{\link{stri_order}},
\code{\link{stri_unique}}, \code{\link{stri_duplicated}},
as well as \code{\link{stri_detect_coll}}
and other \link{stringi-search-coll} functions.
}
\details{
\pkg{ICU}'s \emph{collator} performs locale-aware, natural-language-like
string comparison.
This is a more reliable way of establishing relationships between
strings than the one provided by base \R, and definitely
one that is more sophisticated and appropriate than ordinary byte-wise
comparison.
}
\examples{
stri_cmp('number100', 'number2')
stri_cmp('number100', 'number2', opts_collator=stri_opts_collator(numeric=TRUE))
stri_cmp('number100', 'number2', numeric=TRUE) # equivalent
stri_cmp('above mentioned', 'above-mentioned')
stri_cmp('above mentioned', 'above-mentioned', alternate_shifted=TRUE)
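# illustrative sketch: case differences are tertiary, hence ignored at strength 1
stri_cmp_equiv('hladny', 'HLADNY', strength=1)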
}
\references{
\emph{Collation} -- ICU User Guide,
\url{https://unicode-org.github.io/icu/userguide/collation/}
\emph{ICU Collation Service Architecture} -- ICU User Guide,
\url{https://unicode-org.github.io/icu/userguide/collation/architecture.html}
\emph{\code{icu::Collator} Class Reference} -- ICU4C API Documentation,
\url{https://unicode-org.github.io/icu-docs/apidoc/dev/icu4c/classicu_1_1Collator.html}
}
\seealso{
The official online manual of \pkg{stringi} at \url{https://stringi.gagolewski.com/}
Gagolewski M., \pkg{stringi}: Fast and portable character string processing in R, \emph{Journal of Statistical Software} 103(2), 2022, 1-59, \doi{10.18637/jss.v103.i02}
Other locale_sensitive:
\code{\link{\%s<\%}()},
\code{\link{about_locale}},
\code{\link{about_search_boundaries}},
\code{\link{about_search_coll}},
\code{\link{stri_compare}()},
\code{\link{stri_count_boundaries}()},
\code{\link{stri_duplicated}()},
\code{\link{stri_enc_detect2}()},
\code{\link{stri_extract_all_boundaries}()},
\code{\link{stri_locate_all_boundaries}()},
\code{\link{stri_order}()},
\code{\link{stri_rank}()},
\code{\link{stri_sort_key}()},
\code{\link{stri_sort}()},
\code{\link{stri_split_boundaries}()},
\code{\link{stri_trans_tolower}()},
\code{\link{stri_unique}()},
\code{\link{stri_wrap}()}
Other search_coll:
\code{\link{about_search_coll}},
\code{\link{about_search}}
}
\concept{locale_sensitive}
\concept{search_coll}
\author{
\href{https://www.gagolewski.com/}{Marek Gagolewski} and other contributors
}
stringi/DESCRIPTION 0000644 0001762 0000144 00000003057 14535711033 013446 0 ustar ligges users Package: stringi
Version: 1.8.3
Date: 2023-12-10
Title: Fast and Portable Character String Processing Facilities
Description: A collection of character string/text/natural language
processing tools for pattern searching (e.g., with 'Java'-like regular
expressions or the 'Unicode' collation algorithm), random string generation,
case mapping, string transliteration, concatenation, sorting, padding,
wrapping, Unicode normalisation, date-time formatting and parsing,
and many more. They are fast, consistent, convenient, and -
thanks to 'ICU' (International Components for Unicode) -
portable across all locales and platforms. Documentation about 'stringi' is
provided via its website at <https://stringi.gagolewski.com/> and
the paper by Gagolewski (2022, <doi:10.18637/jss.v103.i02>).
URL: https://stringi.gagolewski.com/,
https://github.com/gagolews/stringi, https://icu.unicode.org/
BugReports: https://github.com/gagolews/stringi/issues
SystemRequirements: ICU4C (>= 61, optional)
Type: Package
Depends: R (>= 3.4)
Imports: tools, utils, stats
Biarch: TRUE
License: file LICENSE
Author: Marek Gagolewski [aut, cre, cph] (),
Bartek Tartanus [ctb], and others (stringi source code);
Unicode, Inc. and others (ICU4C source code, Unicode Character Database)
Maintainer: Marek Gagolewski
RoxygenNote: 7.2.3
Encoding: UTF-8
NeedsCompilation: yes
Packaged: 2023-12-10 07:26:05 UTC; gagolews
License_is_FOSS: yes
Repository: CRAN
Date/Publication: 2023-12-11 22:50:03 UTC
stringi/build/ 0000755 0001762 0000144 00000000000 14535264015 013035 5 ustar ligges users stringi/build/partial.rdb 0000644 0001762 0000144 00000000075 14535264015 015164 0 ustar ligges users b```b`afd`b1 H020piּb C " {7 stringi/configure.ac 0000644 0001762 0000144 00000070123 14531751561 014232 0 ustar ligges users # kate: hl bash
# autoconf/configure script for stringi
# Copyright (c) 2013-2023, Marek Gagolewski
##### COMPATIBILITY NOTES/CHANGELOG ############################################
# 2014-04-10 R 3.1 CXX1X
# 2017-04-21 R 3.4 CXX11, CXX14, CXX17
# 2020-04-24 R 4.0 A C++ compiler must conform to C++11
# 2015-04-01 ICU 55.1 was supported on old Solaris boxes (until stringi-1.8.1)
# 2017-11-01 ICU 60.1 requires full C++11 support
# 2018-03-26 ICU 61.1 has U_CHARSET_IS_UTF8=1 set by default and has no C files
#            (minimal requirement since stringi-1.8.1)
# 2021-04-07 ICU 69.1 (bundled with stringi 1.6.1)
# 2023-10-31 ICU 74.1 (bundled with stringi 1.8.1,
#            together with icudata; compressed with xz and included in the package
#            source tarball as suggested by BR)
# 2024-04-xx ICU 75 will require C++17 (!)
# TODO: The standard way to use external libraries on macOS and Windows
# TODO: in R is via the corresponding toolchain. For macOS, it is built via
# TODO: recipes maintained by Simon Urbanek:
# TODO: https://github.com/R-macos/recipes/blob/master/recipes/icu
# TODO: For Windows, in the new toolchain, it is built by MXE
# TODO: (with changes for R maintained by Tomas Kalibera):
# TODO: https://github.com/mxe/mxe/blob/master/src/icu4c.mk
# TODO: https://svn.r-project.org/R-dev-web/trunk/WindowsBuilds/winutf8/ucrt3/toolchain_libs/mxe/src/icu4c.mk
# TODO: Contribute updates to ICU via these channels
##### INIT ####################################################################
AC_INIT([stringi], [1.8], [https://stringi.gagolewski.com/])
ICU_VERSION_NEEDED="61" # minimal version of ICU4C required to build stringi
ICU_BUNDLE_VERSION="74" # version of the ICU4C bundle shipped with stringi
ICUDT_DIR="icu${ICU_BUNDLE_VERSION}/data"
m4_ifndef([AC_CONFIG_MACRO_DIRS], [
m4_defun([_AM_CONFIG_MACRO_DIRS], [])
m4_defun([AC_CONFIG_MACRO_DIRS], [_AM_CONFIG_MACRO_DIRS($@)])
])
m4_include([tools/AC_CXX_HAVE_STL.m4])
m4_include([tools/AC_CXX_NAMESPACES.m4])
##### OPTIONS #################################################################
# shell variables/arguments accepted:
# R_HOME - R's home dir
AC_ARG_VAR(R_HOME, [Override the R directory,
e.g., /usr/lib64/R, where /usr/lib64 is in the search path.
Note that $R_HOME/bin/R should point to the R executable.])
AC_ARG_VAR([CAT], [The 'cat' command used for generating the list
of source files to compile.])
AC_ARG_VAR([PKG_CONFIG], [The 'pkg-config' command to determine
an appropriate build configuration for the system ICU.])
AC_ARG_VAR(PKG_CONFIG_PATH, [An optional list of directories to search for
pkg-config's .pc files.])
#AC_ARG_VAR([CC], [(ignored)])
#AC_ARG_VAR([CFLAGS], [(ignored)])
#AC_ARG_VAR([CPP], [(ignored)])
AC_ARG_VAR([CPPFLAGS], [(ignored)])
AC_ARG_VAR([CXX], [(ignored)])
AC_ARG_VAR([CXXFLAGS], [(ignored)])
# AC_ARG_VAR([CXXCPP], [(ignored)])
AC_ARG_VAR([LDFLAGS], [(ignored)])
AC_ARG_VAR([LIBS], [(ignored)])
AC_ARG_ENABLE([icu_bundle],
AS_HELP_STRING([--disable-icu-bundle],
[Enforce system ICU.]))
AC_ARG_VAR([STRINGI_DISABLE_ICU_BUNDLE],
[Enforce system ICU; see also --disable-icu-bundle.])
if test "x$enable_icu_bundle" != "xno" -a -z "${STRINGI_DISABLE_ICU_BUNDLE}"; then
enable_icu_bundle="yes"
else
enable_icu_bundle="no"
fi
AC_ARG_ENABLE([pkg_config],
AS_HELP_STRING([--disable-pkg-config],
[Disable locating the system ICU with 'pkg-config'; ICU4C will be
compiled from sources (strongly recommended for portability
across all the platforms).]))
AC_ARG_VAR([STRINGI_DISABLE_PKG_CONFIG],
[Enforce our ICU source bundle; see also --disable-pkg-config.])
if test "x$enable_pkg_config" != "xno" -a -z "${STRINGI_DISABLE_PKG_CONFIG}"; then
enable_pkg_config="yes"
else
enable_pkg_config="no"
fi
AC_ARG_ENABLE([gcc_debug],
AS_HELP_STRING([--enable-gcc-debug],
[Enable -UNDEBUG when compiling stringi (for developers of stringi)]))
if test "x$enable_gcc_debug" = "xyes"; then
enable_gcc_debug="yes"
else
enable_gcc_debug="no"
fi
AC_ARG_ENABLE([gcc_pedantic],
AS_HELP_STRING([--enable-gcc-pedantic],
[Enable -Wall -Wextra -pedantic when compiling
stringi with gcc/clang (for developers of stringi)]))
if test "x$enable_gcc_pedantic" = "xyes"; then
enable_gcc_pedantic="yes"
else
enable_gcc_pedantic="no"
fi
#AC_ARG_WITH([extra_cflags],
# AS_HELP_STRING([--with-extra-cflags=FLAGS],
# [Additional C compiler flags; see also the STRINGI_CFLAGS environment variable]))
AC_ARG_WITH([extra_cppflags],
AS_HELP_STRING([--with-extra-cppflags=FLAGS],
[Additional C/C++ preprocessor flags; see also the STRINGI_CPPFLAGS environment variable]))
AC_ARG_WITH([extra_cxxflags],
AS_HELP_STRING([--with-extra-cxxflags=FLAGS],
[Additional C++ compiler flags; see also the STRINGI_CXXFLAGS environment variable]))
AC_ARG_WITH([extra_ldflags],
AS_HELP_STRING([--with-extra-ldflags=FLAGS],
[Additional linker flags; see also the STRINGI_LDFLAGS environment variable]))
AC_ARG_WITH([extra_libs],
AS_HELP_STRING([--with-extra-libs=FLAGS],
[Additional libraries to link against; see also the STRINGI_LIBS environment variable]))
#AC_ARG_VAR([STRINGI_CFLAGS],
# [Additional C compiler flags; see also --with-extra-cflags.])
AC_ARG_VAR([STRINGI_CPPFLAGS],
[Additional C/C++ preprocessor flags; see also --with-extra-cppflags.])
AC_ARG_VAR([STRINGI_CXXFLAGS],
[Additional C++ compiler flags; see also --with-extra-cxxflags.])
AC_ARG_VAR([STRINGI_LDFLAGS],
[Additional linker flags; see also --with-extra-ldflags.])
AC_ARG_VAR([STRINGI_LIBS],
[Additional libraries to link against; see also --with-extra-libs.])
with_extra_cppflags="${with_extra_cppflags} ${STRINGI_CPPFLAGS}"
with_extra_cxxflags="${with_extra_cxxflags} ${STRINGI_CXXFLAGS}"
with_extra_ldflags="${with_extra_ldflags} ${STRINGI_LDFLAGS}"
with_extra_libs="${with_extra_libs} ${STRINGI_LIBS}"
#####
MSG_CONFIG_FAIL="
*** *********************************************************************
*** 'stringi' cannot be built with the current settings in place.
*** See the INSTALL file for the solutions to the most common problems.
*** Moreover, explore the list of open and closed issues at
*** https://github.com/gagolews/stringi/issues
*** *********************************************************************
"
##### CHECK FOR R #############################################################
# this is inspired by the "Writing R Extensions" manual:
# determine R_HOME directory...
AC_MSG_CHECKING([for R_HOME])
if test -z "${R_HOME}"; then
R_HOME=`R RHOME` # set R_HOME if it has not been set already
fi
if test -z "${R_HOME}"; then
AC_MSG_RESULT(no)
echo "*** Could not determine R_HOME. Is R installed?"
exit 1
fi
AC_MSG_RESULT($R_HOME)
AC_SUBST(R_HOME)
# ...and then R_PATH
AC_MSG_CHECKING([for R])
R_PATH="${R_HOME}/bin/R" # see "Writing R Extensions"
if test ! -e "${R_PATH}"; then
# if it was determined by calling `R RHOME`, then this is likely a bug
AC_MSG_RESULT(no)
echo "*** Could not find R at R_HOME/bin/R, i.e., ${R_HOME}/bin/R"
exit 1
fi
AC_MSG_RESULT($R_PATH)
AC_MSG_CHECKING([for endianness])
"${R_PATH}" --vanilla --slave -e "if (.Platform\$endian!=\"little\") q(\"no\", 1, FALSE)"
if test $? -ne 0; then
ICUDT_ENDIANNESS="big"
AC_MSG_RESULT(big)
else
ICUDT_ENDIANNESS="little"
AC_MSG_RESULT(little)
fi
##### CHECK FOR CAT ############################################################
# Check for 'cat' and get full path.
AC_PATH_PROG([CAT],[cat],[])
if test "x$CAT" = "x"; then
echo "*** The 'cat' command cannot be found."
echo "*** Set the environment variable 'CAT' appropriately."
exit 1
fi
###### SETUP COMPILER FLAGS FOR TESTING ########################################
if test "x$enable_gcc_debug" = "xyes"; then
# -fsanitize=address -fno-omit-frame-pointer
with_extra_cppflags="${with_extra_cppflags} -DDEBUG -UNDEBUG"
else
with_extra_cppflags="${with_extra_cppflags} -UDEBUG -DNDEBUG"
fi
if test "x$enable_gcc_pedantic" = "xyes"; then
#with_extra_cflags="${with_extra_cflags} -Wall -Wextra -Wformat=2 -pedantic"
with_extra_cxxflags="${with_extra_cxxflags} -Wall -Wextra -Wformat=2 -pedantic"
fi
################################################################################
### AUXILIARY FUNCTIONS ########################################################
enable_cxx11() {
echo "*** Trying with C++11 compiler disabled."
force_cxx11="yes"
# unset cached C++ compiler features:
unset ac_cv_cxx_compiler_gnu
unset ac_cv_c_compiler_gnu
unset ac_cv_prog_cxx_g
unset ac_cv_prog_cc_g
unset ac_cv_prog_cc_c89
unset ac_cv_prog_CPP
unset ac_cv_prog_CXX
unset ac_cv_prog_CC
unset ac_cv_header_stdc
unset ac_cv_prog_ac_ct_CXX
unset ac_cv_prog_ac_ct_CC
unset ac_cv_cxx_namespaces
unset ac_cv_cxx_have_stl
unset ac_cv_objext
return 0
}
# Basic C++ compiler checks: are long long type and stl data structures
# available? These are absolutely necessary to compile ICU>=55.
#
# If this is the case, return 0
check_cpp() {
AC_LANG(C++)
AC_PROG_CXX
AC_MSG_CHECKING([whether the C++ compiler supports the 'long long' type])
AC_LINK_IFELSE([AC_LANG_SOURCE([
#include <iostream>
using namespace std;
int main() {
long long x = 1;
cout << x << endl;
cout << 1LL << 1ULL << endl;
cout << 9223372036854775807LL << 18446744073709551615ULL << endl;
return 0;
}
])],
[
AC_MSG_RESULT([yes])
],[
AC_MSG_RESULT([no])
return 1
])
AC_CXX_HAVE_STL
if test "$ac_cv_cxx_have_stl" = not; then
return 1
fi
AC_MSG_CHECKING([whether std::map is available])
AC_LINK_IFELSE([AC_LANG_SOURCE([
#include <iostream>
#include <map>